From 67fbc9a95224e2a358393297c4f0f3cd6e6d11cb Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Fri, 16 May 2025 14:37:32 -0700 Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20ch?= =?UTF-8?q?anges=20to=20main=20this=20commit=20is=20based=20on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.6-beta.1 [skip ci] --- .github/new-prs-labeler.yml | 1 + bolt/include/bolt/Profile/Heatmap.h | 3 + bolt/lib/Core/BinaryFunction.cpp | 2 +- bolt/lib/Core/DIEBuilder.cpp | 8 +- bolt/lib/Passes/AsmDump.cpp | 2 +- bolt/lib/Passes/BinaryPasses.cpp | 2 +- bolt/lib/Profile/DataAggregator.cpp | 8 + bolt/lib/Profile/Heatmap.cpp | 15 +- bolt/lib/Rewrite/RewriteInstance.cpp | 5 +- bolt/test/X86/callcont-fallthru.s | 6 +- bolt/test/X86/heatmap-preagg.test | 4 + clang-tools-extra/clang-doc/HTMLGenerator.cpp | 4 +- .../clang-tidy/llvm/HeaderGuardCheck.cpp | 6 +- .../readability/IdentifierNamingCheck.cpp | 4 +- .../readability/MagicNumbersCheck.cpp | 9 +- clang-tools-extra/clangd/Diagnostics.cpp | 2 +- clang-tools-extra/clangd/FindTarget.cpp | 2 +- clang-tools-extra/clangd/index/FileIndex.cpp | 3 +- .../clangd/index/dex/dexp/Dexp.cpp | 2 +- .../modularize/ModularizeUtilities.cpp | 2 +- .../modularize/ModuleAssistant.cpp | 6 +- .../modularize/PreprocessorTracker.cpp | 2 +- .../pp-trace/PPCallbacksTracker.cpp | 4 +- clang/bindings/python/clang/cindex.py | 302 +- .../python/tests/cindex/test_cursor.py | 14 + .../bindings/python/tests/cindex/test_lib.py | 31 + clang/docs/CMakeLists.txt | 28 + clang/docs/DebuggingCoroutines.rst | 2 +- clang/docs/InternalsManual.rst | 2 +- clang/docs/LanguageExtensions.rst | 8 +- clang/docs/Modules.rst | 4 +- clang/docs/PointerAuthentication.rst | 2 +- clang/docs/RealtimeSanitizer.rst | 4 +- clang/docs/ReleaseNotes.rst | 70 +- clang/docs/StandardCPlusPlusModules.rst | 2 +- clang/docs/UsersManual.rst | 11 +- clang/docs/analyzer/checkers.rst | 4 +- clang/docs/analyzer/user-docs.rst | 
1 + .../analyzer/user-docs/CommandLineUsage.rst | 2 + .../docs/analyzer/user-docs/Installation.rst | 2 +- clang/docs/analyzer/user-docs/Options.rst.in | 114 + .../tools/generate_analyzer_options_docs.py | 293 + clang/include/clang/AST/ASTConcept.h | 4 +- clang/include/clang/AST/ASTDiagnostic.h | 3 + clang/include/clang/AST/Decl.h | 30 +- clang/include/clang/AST/DeclCXX.h | 21 +- clang/include/clang/AST/DeclTemplate.h | 13 +- clang/include/clang/AST/Expr.h | 6 +- clang/include/clang/AST/OpenACCClause.h | 85 +- clang/include/clang/AST/OpenMPClause.h | 61 +- clang/include/clang/AST/PropertiesBase.td | 3 +- clang/include/clang/AST/Type.h | 1 + .../clang/Analysis/Analyses/ThreadSafetyTIL.h | 4 +- clang/include/clang/Basic/Attr.td | 14 +- clang/include/clang/Basic/AttrDocs.td | 1 + clang/include/clang/Basic/Builtins.td | 6 + clang/include/clang/Basic/BuiltinsNVPTX.td | 6 + clang/include/clang/Basic/BuiltinsRISCV.td | 6 + .../clang/Basic/BuiltinsWebAssembly.def | 1 + .../clang/Basic/DiagnosticCategories.h | 2 +- .../clang/Basic/DiagnosticDriverKinds.td | 4 + clang/include/clang/Basic/DiagnosticGroups.td | 48 + .../clang/Basic/DiagnosticSemaKinds.td | 48 +- clang/include/clang/Basic/JsonSupport.h | 2 +- clang/include/clang/Basic/Module.h | 2 +- clang/include/clang/Basic/TargetInfo.h | 4 +- clang/include/clang/Basic/arm_neon.td | 104 +- clang/include/clang/CIR/Dialect/IR/CIROps.td | 50 +- .../OpenACC/CIROpenACCTypeInterfaces.h | 35 + .../OpenACC/RegisterOpenACCExtensions.h | 22 + .../clang/Frontend/CommandLineSourceLoc.h | 11 +- clang/include/clang/Parse/Parser.h | 10910 +- clang/include/clang/Sema/Overload.h | 17 +- clang/include/clang/Sema/ScopeInfo.h | 2 +- clang/include/clang/Sema/Sema.h | 1 + clang/include/clang/Sema/SemaHLSL.h | 7 + clang/include/clang/Sema/SemaWasm.h | 1 + .../StaticAnalyzer/Core/AnalyzerOptions.def | 3 + .../StaticAnalyzer/Core/AnalyzerOptions.h | 9 +- .../Core/PathSensitive/ExplodedGraph.h | 55 +- .../Core/PathSensitive/ExprEngine.h | 4 +- 
.../Core/PathSensitive/SMTConstraintManager.h | 2 + .../Core/PathSensitive/SymbolManager.h | 36 +- clang/lib/AST/ASTConcept.cpp | 10 +- clang/lib/AST/ASTDiagnostic.cpp | 30 + clang/lib/AST/ByteCode/Compiler.cpp | 24 +- clang/lib/AST/ByteCode/Compiler.h | 1 + clang/lib/AST/ByteCode/Context.cpp | 1 + clang/lib/AST/ByteCode/Interp.cpp | 4 + clang/lib/AST/ByteCode/Interp.h | 12 +- clang/lib/AST/ByteCode/Opcodes.td | 6 +- clang/lib/AST/ByteCode/Pointer.h | 8 + clang/lib/AST/Decl.cpp | 24 +- clang/lib/AST/DeclCXX.cpp | 10 +- clang/lib/AST/DeclTemplate.cpp | 9 +- clang/lib/AST/ExprConstant.cpp | 25 +- clang/lib/AST/OpenACCClause.cpp | 8 +- clang/lib/AST/StmtProfile.cpp | 5 +- clang/lib/AST/Type.cpp | 19 + clang/lib/Basic/DiagnosticIDs.cpp | 17 +- clang/lib/Basic/Targets/AArch64.cpp | 9 +- clang/lib/Basic/Targets/AArch64.h | 4 +- clang/lib/Basic/Targets/AMDGPU.cpp | 4 +- clang/lib/Basic/Targets/PPC.cpp | 16 +- clang/lib/Basic/Targets/RISCV.cpp | 3 +- clang/lib/Basic/Targets/RISCV.h | 4 +- clang/lib/Basic/Targets/SystemZ.cpp | 4 +- clang/lib/CIR/CodeGen/CIRGenCall.cpp | 77 +- clang/lib/CIR/CodeGen/CIRGenDecl.cpp | 3 +- clang/lib/CIR/CodeGen/CIRGenFunctionInfo.h | 18 +- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 17 +- clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h | 4 +- clang/lib/CIR/CodeGen/CIRGenTypes.cpp | 14 + clang/lib/CIR/CodeGen/CIRGenTypes.h | 32 +- clang/lib/CIR/CodeGen/CIRGenerator.cpp | 7 + clang/lib/CIR/CodeGen/CMakeLists.txt | 1 + clang/lib/CIR/Dialect/CMakeLists.txt | 1 + clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 96 + .../OpenACC/CIROpenACCTypeInterfaces.cpp | 65 + clang/lib/CIR/Dialect/OpenACC/CMakeLists.txt | 12 + .../OpenACC/RegisterOpenACCExtensions.cpp | 27 + .../Dialect/Transforms/CIRCanonicalize.cpp | 17 +- .../lib/CIR/Dialect/Transforms/FlattenCFG.cpp | 236 +- .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 8 +- clang/lib/CodeGen/CGBlocks.cpp | 2 +- clang/lib/CodeGen/CGBuiltin.cpp | 428 +- clang/lib/CodeGen/CGCall.cpp | 20 +- clang/lib/CodeGen/CGClass.cpp | 
53 +- clang/lib/CodeGen/CGExpr.cpp | 176 +- clang/lib/CodeGen/CGExprScalar.cpp | 23 +- clang/lib/CodeGen/CGHLSLBuiltins.cpp | 15 + clang/lib/CodeGen/CGHLSLRuntime.cpp | 71 +- clang/lib/CodeGen/CGHLSLRuntime.h | 2 + clang/lib/CodeGen/CGObjCGNU.cpp | 4 +- clang/lib/CodeGen/CodeGenFunction.cpp | 10 +- clang/lib/CodeGen/CodeGenFunction.h | 25 +- clang/lib/CodeGen/CodeGenModule.cpp | 2 +- clang/lib/CodeGen/CodeGenTypes.cpp | 3 - clang/lib/CodeGen/ItaniumCXXABI.cpp | 7 +- clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 43 +- clang/lib/CodeGen/TargetBuiltins/RISCV.cpp | 6 + .../CodeGen/TargetBuiltins/WebAssembly.cpp | 5 + clang/lib/Driver/Driver.cpp | 2 +- clang/lib/Driver/OffloadBundler.cpp | 3 +- clang/lib/Driver/SanitizerArgs.cpp | 21 +- clang/lib/Driver/ToolChains/AMDGPU.cpp | 2 +- clang/lib/Driver/ToolChains/Haiku.cpp | 2 + clang/lib/Format/Format.cpp | 5 +- clang/lib/Format/FormatToken.cpp | 3 +- clang/lib/Frontend/CompilerInvocation.cpp | 6 +- clang/lib/Frontend/DiagnosticRenderer.cpp | 3 +- .../lib/Frontend/VerifyDiagnosticConsumer.cpp | 2 +- clang/lib/Headers/__clang_hip_cmath.h | 17 +- clang/lib/Headers/opencl-c-base.h | 9 + clang/lib/Lex/PPDirectives.cpp | 3 +- clang/lib/Lex/Preprocessor.cpp | 2 + clang/lib/Parse/ParseCXXInlineMethods.cpp | 52 - clang/lib/Parse/ParseDecl.cpp | 573 +- clang/lib/Parse/ParseDeclCXX.cpp | 419 - clang/lib/Parse/ParseExpr.cpp | 543 - clang/lib/Parse/ParseExprCXX.cpp | 526 - clang/lib/Parse/ParseInit.cpp | 61 - clang/lib/Parse/ParseObjc.cpp | 454 +- clang/lib/Parse/ParseOpenACC.cpp | 57 - clang/lib/Parse/ParseOpenMP.cpp | 363 +- clang/lib/Parse/ParsePragma.cpp | 10 - clang/lib/Parse/ParseStmt.cpp | 240 - clang/lib/Parse/ParseStmtAsm.cpp | 58 - clang/lib/Parse/ParseTemplate.cpp | 233 - clang/lib/Parse/ParseTentative.cpp | 396 +- clang/lib/Parse/Parser.cpp | 205 +- clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp | 20 + clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h | 3 +- clang/lib/Sema/HLSLExternalSemaSource.cpp | 3 +- clang/lib/Sema/SemaARM.cpp 
| 2 +- clang/lib/Sema/SemaChecking.cpp | 69 +- clang/lib/Sema/SemaConcept.cpp | 3 +- clang/lib/Sema/SemaDecl.cpp | 32 +- clang/lib/Sema/SemaDeclCXX.cpp | 18 +- clang/lib/Sema/SemaExpr.cpp | 87 + clang/lib/Sema/SemaExprMember.cpp | 2 +- clang/lib/Sema/SemaHLSL.cpp | 107 +- clang/lib/Sema/SemaInit.cpp | 40 +- clang/lib/Sema/SemaOpenACCClause.cpp | 20 + clang/lib/Sema/SemaOverload.cpp | 4 + clang/lib/Sema/SemaTemplate.cpp | 55 +- clang/lib/Sema/SemaTemplateDeduction.cpp | 11 +- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 3 +- clang/lib/Sema/SemaWasm.cpp | 19 +- clang/lib/Serialization/ASTWriter.cpp | 65 +- .../Checkers/AnalyzerStatsChecker.cpp | 4 +- clang/lib/StaticAnalyzer/Core/BugReporter.cpp | 7 +- clang/lib/StaticAnalyzer/Core/CoreEngine.cpp | 15 +- .../lib/StaticAnalyzer/Core/ExplodedGraph.cpp | 21 +- clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 2 +- .../lib/StaticAnalyzer/Core/SymbolManager.cpp | 43 + clang/test/AST/ByteCode/builtin-bit-cast.cpp | 10 + clang/test/AST/ByteCode/cxx20.cpp | 18 + clang/test/AST/ByteCode/lifetimes26.cpp | 49 + .../test/AST/HLSL/ByteAddressBuffers-AST.hlsl | 22 + .../test/AST/HLSL/StructuredBuffers-AST.hlsl | 22 + clang/test/AST/HLSL/TypedBuffers-AST.hlsl | 22 + .../AST/HLSL/ast-dump-comment-cbuffer.hlsl | 1 + clang/test/AST/HLSL/packoffset.hlsl | 1 + clang/test/AST/HLSL/pch_hlsl_buffer.hlsl | 2 + clang/test/Analysis/ftime-trace-no-init.cpp | 5 + .../generate_analyzer_options_docs.test | 14 + clang/test/CIR/CodeGen/switch_flat_op.cpp | 77 + clang/test/CIR/CodeGenOpenACC/combined.cpp | 63 + clang/test/CIR/CodeGenOpenACC/data.c | 16 +- clang/test/CIR/CodeGenOpenACC/kernels.c | 16 +- .../openacc-not-implemented.cpp | 4 +- clang/test/CIR/CodeGenOpenACC/parallel.c | 16 +- clang/test/CIR/CodeGenOpenACC/serial.c | 16 +- clang/test/CIR/CodeGenOpenACC/wait.c | 6 +- clang/test/CIR/IR/switch-flat.cir | 68 + clang/test/CIR/Transforms/switch.cir | 318 + clang/test/CodeCompletion/source-loc-zero.cpp | 11 + 
.../CodeGen/AArch64/cpu-supports-target.c | 2 +- clang/test/CodeGen/AArch64/fp8-init-list.c | 14 +- .../fp8-intrinsics/acle_neon_fp8_untyped.c | 1158 + .../fp8-intrinsics/acle_sve2_fp8_fdot.c | 8 +- .../fp8-intrinsics/acle_sve2_fp8_fmla.c | 24 +- .../AArch64/struct-coerce-using-ptr.cpp | 622 + clang/test/CodeGen/AArch64/targetattr.c | 36 +- .../attr-riscv-rvv-vector-bits-less-8-call.c | 104 +- .../attr-riscv-rvv-vector-bits-less-8-cast.c | 56 +- .../attr-rvv-vector-bits-bitcast-less-8.c | 32 +- .../CodeGen/RISCV/attr-rvv-vector-bits-cast.c | 18 +- .../RISCV/attr-rvv-vector-bits-codegen.c | 37 +- .../RISCV/attr-rvv-vector-bits-globals.c | 16 +- clang/test/CodeGen/RISCV/riscv-zihintpause.c | 14 + clang/test/CodeGen/X86/avx512-error.c | 1 + clang/test/CodeGen/arm-mfp8.c | 44 +- .../CodeGen/attr-counted-by-for-pointers.c | 473 + clang/test/CodeGen/attr-counted-by.c | 198 + clang/test/CodeGen/builtins-arm64.c | 2 +- clang/test/CodeGen/builtins-nvptx.c | 20 + clang/test/CodeGen/builtins-wasm.c | 8 +- clang/test/CodeGen/dllexport.c | 2 + clang/test/CodeGen/dllimport.c | 3 + clang/test/CodeGen/dso-local-executable.c | 3 + clang/test/CodeGen/target-avx-abi-diag.c | 2 + clang/test/CodeGenCXX/dllexport-members.cpp | 70 +- .../test/CodeGenCXX/dllexport-missing-key.cpp | 1 + clang/test/CodeGenCXX/dllexport.cpp | 30 +- clang/test/CodeGenCXX/dllimport-members.cpp | 104 +- .../test/CodeGenCXX/dllimport-missing-key.cpp | 1 + clang/test/CodeGenCXX/dllimport-rtti.cpp | 7 +- clang/test/CodeGenCXX/dllimport.cpp | 30 +- .../test/CodeGenCXX/dso-local-executable.cpp | 1 + .../CodeGenCXX/mingw-template-dllexport.cpp | 3 + clang/test/CodeGenCXX/rtti-mingw64.cpp | 2 + clang/test/CodeGenCXX/virt-dtor-key.cpp | 8 +- .../CodeGenCXX/vtable-key-function-ios.cpp | 2 + .../GlobalConstructorFunction.hlsl | 4 + .../CodeGenHLSL/GlobalConstructorLib.hlsl | 2 +- clang/test/CodeGenHLSL/RootSignature.hlsl | 31 + .../ByteAddressBuffers-constructors.hlsl | 75 +- .../builtins/RWBuffer-constructor.hlsl | 83 
+- .../StructuredBuffers-constructors.hlsl | 82 +- clang/test/CodeGenHLSL/cbuffer.hlsl | 60 +- .../CodeGenHLSL/convergence/global_array.hlsl | 16 + clang/test/CodeGenHLSL/static-local-ctor.hlsl | 2 +- clang/test/Driver/fsanitize.c | 3 + clang/test/Driver/haiku.c | 1 + .../ppc-mrop-protection-support-check.c | 16 +- .../Driver/print-supported-extensions-riscv.c | 2 + clang/test/Driver/riscv-arch.c | 26 +- .../__clang_hip_cmath-return_types.hip | 1023 + clang/test/Headers/opencl-c-header.cl | 6 + clang/test/Modules/pr130712.cppm | 33 + clang/test/Modules/pr140130.cpp | 33 + clang/test/Modules/sdk-settings-json-dep.m | 53 + clang/test/OpenMP/metadirective_messages.cpp | 10 + clang/test/OpenMP/openmp_non_c_directives.c | 12 + .../test/Preprocessor/riscv-target-features.c | 12 + clang/test/Refactor/source-loc-zero.cpp | 17 + clang/test/Sema/atomic-expr.c | 17 + clang/test/Sema/bitfield-layout.c | 44 +- clang/test/Sema/bitfield-layout_1.c | 1 + clang/test/Sema/builtins-wasm.c | 3 + clang/test/Sema/mms-bitfields.c | 6 +- clang/test/SemaCXX/bitfield.cpp | 1 + .../SemaCXX/constant-expression-p2280r4.cpp | 21 + clang/test/SemaCXX/consteval-assert.cpp | 34 + clang/test/SemaCXX/cxx2a-consteval.cpp | 22 + .../SemaCXX/cxx2a-three-way-comparison.cpp | 9 + clang/test/SemaCXX/cxx2b-deducing-this.cpp | 27 + clang/test/SemaCXX/dllexport.cpp | 2 + clang/test/SemaCXX/dllimport.cpp | 3 + .../SemaCXX/libstdcxx_format_kind_hack.cpp | 17 + clang/test/SemaCXX/libstdcxx_gets_hack.cpp | 28 - .../libstdcxx_pointer_return_false_hack.cpp | 34 - .../SemaCXX/ms_struct-bitfield-padding.cpp | 196 + ...overload-resolution-deferred-templates.cpp | 42 + .../warn-implicit-unicode-conversions.cpp | 151 + clang/test/SemaCXX/warn-nrvo.cpp | 73 + clang/test/SemaObjCXX/cxxoperator-selector.mm | 5 + clang/test/SemaOpenACC/gh139894.cpp | 14 + clang/test/SemaTemplate/GH55509.cpp | 35 + .../SemaTemplate/concepts-out-of-line-def.cpp | 15 + clang/test/lit.cfg.py | 2 + clang/tools/clang-refactor/ClangRefactor.cpp | 
3 +- clang/tools/clang-scan-deps/ClangScanDeps.cpp | 12 +- clang/unittests/AST/ASTImporterTest.cpp | 6 +- clang/unittests/CIR/CMakeLists.txt | 16 + clang/unittests/CIR/PointerLikeTest.cpp | 364 + clang/unittests/CMakeLists.txt | 4 +- clang/unittests/Driver/ToolChainTest.cpp | 18 +- clang/utils/TableGen/NeonEmitter.cpp | 14 +- compiler-rt/cmake/Modules/AddCompilerRT.cmake | 2 +- compiler-rt/cmake/builtin-config-ix.cmake | 2 +- compiler-rt/lib/builtins/CMakeLists.txt | 1 + compiler-rt/lib/builtins/aarch64/chkstk.S | 14 +- compiler-rt/lib/builtins/aarch64/lse.S | 4 +- .../builtins/aarch64/sme-libc-mem-routines.S | 2 +- compiler-rt/lib/builtins/clear_cache.c | 7 +- compiler-rt/lib/builtins/cpu_model/aarch64.c | 3 +- compiler-rt/lib/builtins/cpu_model/aarch64.h | 3 +- compiler-rt/lib/builtins/fp_compare_impl.inc | 2 +- compiler-rt/lib/builtins/fp_lib.h | 2 +- compiler-rt/lib/builtins/udivmodti4.c | 2 +- .../builtins/Unit/enable_execute_stack_test.c | 13 + .../test/builtins/Unit/fixunstfdi_test.c | 4 +- compiler-rt/test/builtins/Unit/multc3_test.c | 4 +- .../Posix/sanitizer_set_report_path_test.cpp | 4 +- .../include/flang-rt/runtime/emit-encoded.h | 2 +- flang-rt/lib/runtime/assign.cpp | 5 +- flang-rt/lib/runtime/edit-input.cpp | 8 +- flang/docs/ModFiles.md | 11 + flang/include/flang/Parser/preprocessor.h | 1 + flang/include/flang/Semantics/symbol.h | 1 + flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 6 +- flang/lib/Lower/OpenMP/Utils.cpp | 8 +- .../Transforms/ControlFlowConverter.cpp | 13 +- flang/lib/Parser/openmp-parsers.cpp | 16 +- flang/lib/Parser/preprocessor.cpp | 76 +- flang/lib/Semantics/check-omp-structure.cpp | 36 +- flang/lib/Semantics/expression.cpp | 13 + .../Fir/convert-to-llvm-openmp-and-fir.fir | 5 +- flang/test/Fir/vector-always.fir | 4 +- flang/test/Integration/unroll.f90 | 18 +- flang/test/Integration/unroll_and_jam.f90 | 20 +- flang/test/Integration/vector-always.f90 | 8 +- .../OpenACC/acc-data-unwrap-defaultbounds.f90 | 4 +- 
flang/test/Lower/OpenACC/acc-data.f90 | 4 +- .../acc-enter-data-unwrap-defaultbounds.f90 | 10 +- flang/test/Lower/OpenACC/acc-enter-data.f90 | 10 +- .../acc-exit-data-unwrap-defaultbounds.f90 | 14 +- flang/test/Lower/OpenACC/acc-exit-data.f90 | 14 +- flang/test/Lower/OpenACC/acc-kernels-loop.f90 | 10 +- flang/test/Lower/OpenACC/acc-kernels.f90 | 4 +- .../test/Lower/OpenACC/acc-parallel-loop.f90 | 10 +- flang/test/Lower/OpenACC/acc-parallel.f90 | 4 +- flang/test/Lower/OpenACC/acc-serial-loop.f90 | 10 +- flang/test/Lower/OpenACC/acc-serial.f90 | 4 +- flang/test/Lower/OpenACC/acc-update.f90 | 12 +- flang/test/Lower/OpenACC/acc-wait.f90 | 2 +- .../OpenMP/Todo/taskloop-inreduction.f90 | 13 + .../Lower/OpenMP/Todo/taskloop-reduction.f90 | 13 + flang/test/Lower/OpenMP/target.f90 | 2 +- .../Preprocessing/func-on-command-line.F90 | 4 + .../Semantics/OpenACC/acc-kernels-loop.f90 | 9 + .../OpenACC/acc-parallel-loop-validity.f90 | 9 + .../Semantics/OpenACC/acc-serial-loop.f90 | 9 + .../OpenMP/cancellation-construct-type.f90 | 11 + flang/test/Semantics/pad-hollerith-arg.f | 5 + libc/config/config.json | 2 +- libc/config/linux/riscv/entrypoints.txt | 5 +- libc/docs/configure.rst | 2 +- libc/docs/dev/code_style.rst | 2 +- libc/docs/dev/source_tree_layout.rst | 2 +- libc/hdr/types/ACTION.h | 22 + libc/hdr/types/CMakeLists.txt | 9 + libc/include/sys/syscall.h.def | 8 + libc/src/__support/FPUtil/dyadic_float.h | 2 +- libc/src/__support/GPU/utils.h | 12 + libc/src/poll/linux/poll.cpp | 21 +- libc/src/search/CMakeLists.txt | 2 + libc/src/search/hsearch.h | 2 +- libc/src/search/hsearch_r.h | 3 +- libc/src/sys/time/linux/utimes.cpp | 22 +- libclc/clc/include/clc/clcmacro.h | 135 +- libclc/clc/lib/generic/math/clc_lgamma_r.cl | 24 +- libclc/cmake/modules/AddLibclc.cmake | 5 +- libcxx/docs/CodingGuidelines.rst | 4 +- libcxx/docs/DesignDocs/FileTimeType.rst | 16 +- libcxx/docs/ReleaseNotes/21.rst | 3 + libcxx/docs/TestingLibcxx.rst | 4 +- .../__cxx03/__algorithm/adjacent_find.h | 7 +- 
libcxx/include/__cxx03/__algorithm/all_of.h | 2 +- libcxx/include/__cxx03/__algorithm/any_of.h | 2 +- .../__cxx03/__algorithm/binary_search.h | 4 +- libcxx/include/__cxx03/__algorithm/comp.h | 4 +- .../__cxx03/__algorithm/comp_ref_type.h | 13 +- libcxx/include/__cxx03/__algorithm/copy.h | 24 +- .../__cxx03/__algorithm/copy_backward.h | 19 +- libcxx/include/__cxx03/__algorithm/copy_if.h | 2 +- .../__cxx03/__algorithm/copy_move_common.h | 10 +- libcxx/include/__cxx03/__algorithm/copy_n.h | 6 +- libcxx/include/__cxx03/__algorithm/count.h | 8 +- libcxx/include/__cxx03/__algorithm/count_if.h | 3 +- libcxx/include/__cxx03/__algorithm/equal.h | 8 +- .../include/__cxx03/__algorithm/equal_range.h | 6 +- libcxx/include/__cxx03/__algorithm/fill.h | 7 +- libcxx/include/__cxx03/__algorithm/fill_n.h | 14 +- libcxx/include/__cxx03/__algorithm/find.h | 23 +- libcxx/include/__cxx03/__algorithm/find_end.h | 12 +- .../__cxx03/__algorithm/find_first_of.h | 6 +- libcxx/include/__cxx03/__algorithm/find_if.h | 2 +- .../include/__cxx03/__algorithm/find_if_not.h | 2 +- .../__cxx03/__algorithm/find_segment_if.h | 2 +- libcxx/include/__cxx03/__algorithm/for_each.h | 3 +- .../__cxx03/__algorithm/for_each_segment.h | 3 +- libcxx/include/__cxx03/__algorithm/generate.h | 3 +- .../include/__cxx03/__algorithm/generate_n.h | 3 +- .../__cxx03/__algorithm/half_positive.h | 4 +- libcxx/include/__cxx03/__algorithm/includes.h | 6 +- libcxx/include/__cxx03/__algorithm/is_heap.h | 4 +- .../__cxx03/__algorithm/is_heap_until.h | 6 +- .../__cxx03/__algorithm/is_partitioned.h | 2 +- .../__cxx03/__algorithm/is_permutation.h | 14 +- .../include/__cxx03/__algorithm/is_sorted.h | 5 +- .../__cxx03/__algorithm/is_sorted_until.h | 6 +- .../include/__cxx03/__algorithm/iter_swap.h | 4 +- .../__cxx03/__algorithm/iterator_operations.h | 29 +- .../__algorithm/lexicographical_compare.h | 6 +- .../include/__cxx03/__algorithm/lower_bound.h | 10 +- .../include/__cxx03/__algorithm/make_heap.h | 7 +- 
.../__cxx03/__algorithm/make_projected.h | 12 +- libcxx/include/__cxx03/__algorithm/max.h | 4 +- .../include/__cxx03/__algorithm/max_element.h | 6 +- libcxx/include/__cxx03/__algorithm/merge.h | 6 +- libcxx/include/__cxx03/__algorithm/min.h | 4 +- .../include/__cxx03/__algorithm/min_element.h | 9 +- libcxx/include/__cxx03/__algorithm/minmax.h | 4 +- .../__cxx03/__algorithm/minmax_element.h | 11 +- libcxx/include/__cxx03/__algorithm/mismatch.h | 14 +- libcxx/include/__cxx03/__algorithm/move.h | 25 +- .../__cxx03/__algorithm/move_backward.h | 18 +- .../__cxx03/__algorithm/next_permutation.h | 7 +- libcxx/include/__cxx03/__algorithm/none_of.h | 2 +- .../include/__cxx03/__algorithm/nth_element.h | 10 +- .../__cxx03/__algorithm/partial_sort.h | 8 +- .../__cxx03/__algorithm/partial_sort_copy.h | 6 +- .../include/__cxx03/__algorithm/partition.h | 8 +- .../__cxx03/__algorithm/partition_copy.h | 2 +- .../__cxx03/__algorithm/partition_point.h | 2 +- libcxx/include/__cxx03/__algorithm/pop_heap.h | 7 +- .../__cxx03/__algorithm/prev_permutation.h | 7 +- .../include/__cxx03/__algorithm/push_heap.h | 9 +- libcxx/include/__cxx03/__algorithm/remove.h | 2 +- .../include/__cxx03/__algorithm/remove_copy.h | 2 +- .../__cxx03/__algorithm/remove_copy_if.h | 2 +- .../include/__cxx03/__algorithm/remove_if.h | 2 +- libcxx/include/__cxx03/__algorithm/replace.h | 2 +- .../__cxx03/__algorithm/replace_copy.h | 2 +- .../__cxx03/__algorithm/replace_copy_if.h | 2 +- .../include/__cxx03/__algorithm/replace_if.h | 2 +- libcxx/include/__cxx03/__algorithm/reverse.h | 9 +- .../__cxx03/__algorithm/reverse_copy.h | 2 +- libcxx/include/__cxx03/__algorithm/rotate.h | 22 +- .../include/__cxx03/__algorithm/rotate_copy.h | 2 +- libcxx/include/__cxx03/__algorithm/search.h | 12 +- libcxx/include/__cxx03/__algorithm/search_n.h | 12 +- .../__cxx03/__algorithm/set_difference.h | 7 +- .../__cxx03/__algorithm/set_intersection.h | 21 +- .../__algorithm/set_symmetric_difference.h | 9 +- 
.../include/__cxx03/__algorithm/set_union.h | 9 +- libcxx/include/__cxx03/__algorithm/shuffle.h | 13 +- .../include/__cxx03/__algorithm/sift_down.h | 4 +- libcxx/include/__cxx03/__algorithm/sort.h | 20 +- .../include/__cxx03/__algorithm/sort_heap.h | 7 +- .../include/__cxx03/__algorithm/swap_ranges.h | 6 +- .../include/__cxx03/__algorithm/transform.h | 4 +- libcxx/include/__cxx03/__algorithm/unique.h | 6 +- .../include/__cxx03/__algorithm/unique_copy.h | 10 +- .../include/__cxx03/__algorithm/unwrap_iter.h | 15 +- .../__cxx03/__algorithm/unwrap_range.h | 4 +- .../include/__cxx03/__algorithm/upper_bound.h | 6 +- libcxx/include/__cxx03/__atomic/atomic.h | 10 +- libcxx/include/__cxx03/__atomic/atomic_base.h | 6 +- libcxx/include/__cxx03/__atomic/atomic_flag.h | 2 +- .../__cxx03/__atomic/cxx_atomic_impl.h | 8 +- .../include/__cxx03/__atomic/to_gcc_order.h | 4 +- libcxx/include/__cxx03/__bit/blsr.h | 10 +- libcxx/include/__cxx03/__bit/countl.h | 12 +- libcxx/include/__cxx03/__bit/countr.h | 10 +- libcxx/include/__cxx03/__bit/invert_if.h | 2 +- libcxx/include/__cxx03/__bit/popcount.h | 10 +- libcxx/include/__cxx03/__bit/rotate.h | 4 +- libcxx/include/__cxx03/__bit_reference | 161 +- libcxx/include/__cxx03/__chrono/duration.h | 99 +- .../include/__cxx03/__chrono/steady_clock.h | 2 +- .../include/__cxx03/__chrono/system_clock.h | 2 +- libcxx/include/__cxx03/__chrono/time_point.h | 43 +- .../__condition_variable/condition_variable.h | 2 +- libcxx/include/__cxx03/__config | 43 +- .../__cxx03/__debug_utils/randomize_range.h | 2 +- .../__cxx03/__debug_utils/sanitizers.h | 2 +- .../strict_weak_ordering_check.h | 2 +- .../__cxx03/__functional/binary_function.h | 8 +- .../__cxx03/__functional/binary_negate.h | 12 +- .../include/__cxx03/__functional/binder1st.h | 5 +- .../include/__cxx03/__functional/binder2nd.h | 5 +- .../include/__cxx03/__functional/identity.h | 2 +- libcxx/include/__cxx03/__functional/mem_fn.h | 9 +- .../__cxx03/__functional/mem_fun_ref.h | 37 +- 
.../include/__cxx03/__functional/operations.h | 68 +- .../__functional/pointer_to_binary_function.h | 6 +- .../__functional/pointer_to_unary_function.h | 6 +- .../__cxx03/__functional/reference_wrapper.h | 20 +- .../__cxx03/__functional/unary_function.h | 6 +- .../__cxx03/__functional/unary_negate.h | 14 +- .../__cxx03/__functional/weak_result_type.h | 22 +- libcxx/include/__cxx03/__fwd/array.h | 4 +- libcxx/include/__cxx03/__fwd/pair.h | 6 +- libcxx/include/__cxx03/__hash_table | 72 +- libcxx/include/__cxx03/__iterator/access.h | 4 +- libcxx/include/__cxx03/__iterator/advance.h | 8 +- .../__cxx03/__iterator/back_insert_iterator.h | 17 +- .../include/__cxx03/__iterator/bounded_iter.h | 58 +- libcxx/include/__cxx03/__iterator/distance.h | 6 +- .../__iterator/front_insert_iterator.h | 15 +- .../__cxx03/__iterator/insert_iterator.h | 14 +- .../__cxx03/__iterator/istream_iterator.h | 2 +- .../__cxx03/__iterator/istreambuf_iterator.h | 2 +- libcxx/include/__cxx03/__iterator/iterator.h | 2 +- .../__cxx03/__iterator/move_iterator.h | 63 +- libcxx/include/__cxx03/__iterator/next.h | 2 +- libcxx/include/__cxx03/__iterator/prev.h | 2 +- .../__cxx03/__iterator/reverse_iterator.h | 69 +- libcxx/include/__cxx03/__iterator/wrap_iter.h | 81 +- libcxx/include/__cxx03/__locale | 14 +- libcxx/include/__cxx03/__math/traits.h | 48 +- libcxx/include/__cxx03/__memory/addressof.h | 2 +- .../__cxx03/__memory/allocate_at_least.h | 3 +- libcxx/include/__cxx03/__memory/allocator.h | 65 +- .../__cxx03/__memory/allocator_traits.h | 30 +- .../include/__cxx03/__memory/assume_aligned.h | 2 +- libcxx/include/__cxx03/__memory/auto_ptr.h | 6 +- .../__cxx03/__memory/builtin_new_allocator.h | 2 +- .../__cxx03/__memory/compressed_pair.h | 52 +- .../include/__cxx03/__memory/construct_at.h | 11 +- .../include/__cxx03/__memory/pointer_traits.h | 15 +- .../__cxx03/__memory/raw_storage_iterator.h | 3 +- libcxx/include/__cxx03/__memory/shared_ptr.h | 18 +- .../include/__cxx03/__memory/swap_allocator.h | 9 +- 
libcxx/include/__cxx03/__memory/temp_value.h | 11 +- .../__cxx03/__memory/temporary_buffer.h | 4 +- .../__memory/uninitialized_algorithms.h | 19 +- libcxx/include/__cxx03/__memory/unique_ptr.h | 144 +- libcxx/include/__cxx03/__memory/voidify.h | 2 +- libcxx/include/__cxx03/__mutex/mutex.h | 2 +- libcxx/include/__cxx03/__mutex/once_flag.h | 2 +- libcxx/include/__cxx03/__numeric/accumulate.h | 5 +- .../__cxx03/__numeric/adjacent_difference.h | 8 +- .../include/__cxx03/__numeric/inner_product.h | 4 +- libcxx/include/__cxx03/__numeric/iota.h | 3 +- .../include/__cxx03/__numeric/partial_sum.h | 4 +- .../__cxx03/__random/clamp_to_integral.h | 2 +- .../__cxx03/__random/discard_block_engine.h | 12 +- .../__random/independent_bits_engine.h | 47 +- .../__cxx03/__random/is_seed_sequence.h | 2 +- .../__random/linear_congruential_engine.h | 32 +- .../__random/mersenne_twister_engine.h | 168 +- .../include/__cxx03/__random/random_device.h | 8 +- .../__cxx03/__random/shuffle_order_engine.h | 20 +- .../__random/subtract_with_carry_engine.h | 27 +- .../__random/uniform_int_distribution.h | 8 +- libcxx/include/__cxx03/__split_buffer | 191 +- libcxx/include/__cxx03/__string/char_traits.h | 138 +- .../__cxx03/__string/constexpr_c_functions.h | 21 +- .../__cxx03/__system_error/error_category.h | 2 +- .../__cxx03/__thread/poll_with_backoff.h | 4 +- libcxx/include/__cxx03/__thread/this_thread.h | 2 +- libcxx/include/__cxx03/__tree | 47 +- .../__cxx03/__type_traits/aligned_storage.h | 4 +- .../__cxx03/__type_traits/aligned_union.h | 2 +- .../__cxx03/__type_traits/integral_constant.h | 6 +- libcxx/include/__cxx03/__type_traits/invoke.h | 25 +- .../__type_traits/is_constant_evaluated.h | 2 +- .../__cxx03/__type_traits/is_literal_type.h | 3 +- .../__cxx03/__type_traits/is_swappable.h | 6 +- .../include/__cxx03/__type_traits/result_of.h | 2 +- .../__cxx03/__utility/convert_to_integral.h | 25 +- .../__cxx03/__utility/exception_guard.h | 25 +- libcxx/include/__cxx03/__utility/forward.h | 4 +- 
.../__cxx03/__utility/is_pointer_in_range.h | 4 +- .../__cxx03/__utility/is_valid_range.h | 3 +- libcxx/include/__cxx03/__utility/move.h | 4 +- libcxx/include/__cxx03/__utility/no_destroy.h | 2 +- libcxx/include/__cxx03/__utility/pair.h | 58 +- libcxx/include/__cxx03/__utility/rel_ops.h | 8 +- libcxx/include/__cxx03/__utility/swap.h | 6 +- libcxx/include/__cxx03/array | 156 +- libcxx/include/__cxx03/bitset | 303 +- libcxx/include/__cxx03/cmath | 12 +- libcxx/include/__cxx03/codecvt | 24 +- libcxx/include/__cxx03/complex | 272 +- libcxx/include/__cxx03/cwchar | 7 +- libcxx/include/__cxx03/deque | 33 +- libcxx/include/__cxx03/forward_list | 27 +- libcxx/include/__cxx03/limits | 632 +- libcxx/include/__cxx03/list | 32 +- libcxx/include/__cxx03/locale | 13 +- libcxx/include/__cxx03/map | 49 +- libcxx/include/__cxx03/new | 4 +- libcxx/include/__cxx03/queue | 20 +- libcxx/include/__cxx03/ratio | 16 +- libcxx/include/__cxx03/regex | 27 +- libcxx/include/__cxx03/set | 30 +- libcxx/include/__cxx03/stack | 7 +- libcxx/include/__cxx03/string | 851 +- libcxx/include/__cxx03/string_view | 234 +- libcxx/include/__cxx03/typeinfo | 10 +- libcxx/include/__cxx03/unordered_map | 60 +- libcxx/include/__cxx03/unordered_set | 18 +- libcxx/include/__cxx03/vector | 759 +- libcxx/include/__flat_set/utils.h | 4 +- libcxx/include/__format/format_functions.h | 43 + libcxx/include/__fwd/pair.h | 6 + .../__memory/uses_allocator_construction.h | 9 +- libcxx/include/__node_handle | 32 +- libcxx/include/__tree | 112 +- libcxx/include/__vector/vector_bool.h | 7 +- libcxx/include/map | 150 +- libcxx/include/print | 2 +- libcxx/src/.clang-tidy | 16 +- libcxx/src/include/overridable_function.h | 10 +- libcxx/src/locale.cpp | 2 +- .../algorithms/pstl.stable_sort.bench.cpp | 42 - .../algorithms/ranges_sort.bench.cpp | 40 - .../algorithms/ranges_stable_sort.bench.cpp | 40 - .../test/benchmarks/algorithms/sort.bench.cpp | 38 - .../benchmarks/algorithms/sorting/common.h | 141 + 
.../algorithms/sorting/is_sorted.bench.cpp | 82 + .../sorting/is_sorted_until.bench.cpp | 82 + .../algorithms/sorting/partial_sort.bench.cpp | 95 + .../sorting/partial_sort_copy.bench.cpp | 90 + .../algorithms/sorting/sort.bench.cpp | 91 + .../algorithms/sorting/stable_sort.bench.cpp | 159 + .../algorithms/stable_sort.bench.cpp | 40 - .../test/benchmarks/format/format.bench.cpp | 11 + libcxx/test/configs/cmake-bridge.cfg.in | 1 + libcxx/test/libcxx/clang_tidy.sh.py | 11 + .../tree_key_value_traits.pass.cpp | 4 - .../flat.multiset/insert_range.pass.cpp | 43 + .../flat.set/insert_range.pass.cpp | 43 + .../libcxx/gdb/gdb_pretty_printer_test.sh.cpp | 5 +- .../alg.swap/ranges.swap_ranges.pass.cpp | 224 +- .../alg.swap/swap_ranges.pass.cpp | 31 +- .../alg.nonmodifying/alg.equal/equal.pass.cpp | 2 + .../alg.nonmodifying/alg.find/find.pass.cpp | 1 + .../map.modifiers/insert_or_assign.pass.cpp | 24 +- .../vector.bool/enabled_hash.pass.cpp | 7 +- .../vector.bool/vector_bool.pass.cpp | 46 +- .../std/localization/codecvt_unicode.pass.cpp | 12 +- .../char16_t_char8_t_in.pass.cpp | 2 +- .../char16_t_char8_t_out.pass.cpp | 2 +- .../char32_t_char8_t_in.pass.cpp | 2 +- .../char32_t_char8_t_out.pass.cpp | 2 +- .../assign2.pass.cpp | 4 +- libcxx/utils/ci/run-buildbot | 1 + libcxx/utils/gdb/libcxx/printers.py | 4 +- libcxx/utils/libcxx/test/features.py | 4 + libcxx/utils/sym_diff.py | 5 + libcxxabi/src/demangle/ItaniumDemangle.h | 7 +- lld/COFF/COFFLinkerContext.h | 8 + lld/COFF/Chunks.cpp | 2 +- lld/COFF/DLL.cpp | 74 +- lld/COFF/Driver.cpp | 40 +- lld/COFF/InputFiles.cpp | 4 +- lld/COFF/Options.td | 1 + lld/COFF/SymbolTable.cpp | 2 +- lld/COFF/Writer.cpp | 13 +- lld/ELF/Arch/ARM.cpp | 2 +- lld/ELF/Driver.cpp | 2 +- lld/ELF/Writer.cpp | 4 +- lld/docs/ELF/warn_backrefs.rst | 4 +- lld/docs/windows_support.rst | 2 +- lld/test/COFF/arm64ec-entry-mangle.test | 2 +- lld/test/COFF/arm64ec-hybmp.s | 4 +- lld/test/COFF/arm64ec-lib.test | 4 +- lld/test/COFF/arm64ec-patchable-thunks.test | 2 +- 
lld/test/COFF/arm64ec-range-thunks.s | 5 + lld/test/COFF/arm64ec.test | 9 +- lld/test/COFF/arm64x-altnames.s | 6 + lld/test/COFF/arm64x-buildid.s | 3 + lld/test/COFF/arm64x-comm.s | 3 + lld/test/COFF/arm64x-crt-sec.s | 3 + lld/test/COFF/arm64x-ctors-sec.s | 4 + lld/test/COFF/arm64x-guardcf.s | 42 +- lld/test/COFF/arm64x-import.test | 150 +- lld/test/COFF/arm64x-sameaddress.test | 56 + lld/test/COFF/arm64x-symtab.s | 16 + lld/test/COFF/arm64x-wrap.s | 4 + lld/test/COFF/autoimport-arm64ec-data.test | 2 +- lld/test/ELF/link-open-file.test | 12 +- lldb/docs/resources/build.rst | 2 +- lldb/docs/resources/contributing.rst | 2 +- lldb/docs/resources/debugging.rst | 16 +- lldb/docs/resources/qemu-testing.rst | 2 +- lldb/docs/use/variable.rst | 2 +- lldb/include/lldb/Core/Address.h | 13 +- .../test/tools/lldb-dap/dap_server.py | 310 +- .../test/tools/lldb-dap/lldbdap_testcase.py | 246 +- lldb/source/Core/Address.cpp | 15 +- lldb/source/Interpreter/Options.cpp | 2 +- .../source/Plugins/ABI/AArch64/ABIAArch64.cpp | 42 + lldb/source/Plugins/ABI/AArch64/ABIAArch64.h | 3 + .../Plugins/ABI/AArch64/ABIMacOSX_arm64.cpp | 45 - .../Plugins/ABI/AArch64/ABIMacOSX_arm64.h | 4 - .../Plugins/ABI/AArch64/ABISysV_arm64.cpp | 44 - .../Plugins/ABI/AArch64/ABISysV_arm64.h | 4 - .../Clang/ASTStructExtractor.cpp | 3 +- .../ObjectFile/XCOFF/ObjectFileXCOFF.cpp | 57 +- .../ObjectFile/XCOFF/ObjectFileXCOFF.h | 12 + .../source/Plugins/Process/AIX/CMakeLists.txt | 1 + .../Plugins/Process/AIX/NativeThreadAIX.cpp | 58 + .../Plugins/Process/AIX/NativeThreadAIX.h | 53 + .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 4 +- .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 8 +- lldb/source/Symbol/FuncUnwinders.cpp | 39 +- lldb/source/Target/DynamicRegisterInfo.cpp | 5 +- lldb/source/Target/RegisterContextUnwind.cpp | 40 +- lldb/source/Target/Target.cpp | 3 +- .../unwind/frameless-faulted/Makefile | 2 +- .../tools/lldb-dap/attach/TestDAP_attach.py | 18 +- .../TestDAP_breakpointEvents.py | 7 +- 
.../tools/lldb-dap/cancel/TestDAP_cancel.py | 10 +- .../lldb-dap/commands/TestDAP_commands.py | 5 +- .../completions/TestDAP_completions.py | 10 +- .../tools/lldb-dap/console/TestDAP_console.py | 7 +- .../lldb-dap/coreFile/TestDAP_coreFile.py | 10 +- .../lldb-dap/exception/TestDAP_exception.py | 1 + lldb/test/API/tools/lldb-dap/io/TestDAP_io.py | 4 - .../tools/lldb-dap/launch/TestDAP_launch.py | 35 +- .../tools/lldb-dap/module/TestDAP_module.py | 4 +- .../tools/lldb-dap/output/TestDAP_output.py | 4 +- .../tools/lldb-dap/restart/TestDAP_restart.py | 43 +- .../lldb-dap/stop-hooks/TestDAP_stop_hooks.py | 5 - .../lldb-dap/variables/TestDAP_variables.py | 2 +- .../Shell/ObjectFile/XCOFF/basic-info32.yaml | 110 + .../Inputs/basic-block-sections-with-dwarf.s | 59 +- ...asic-block-sections-with-dwarf-static.test | 24 +- lldb/tools/debugserver/source/CMakeLists.txt | 17 +- lldb/tools/lldb-dap/DAP.cpp | 14 +- lldb/tools/lldb-dap/DAP.h | 7 +- lldb/tools/lldb-dap/DAPError.cpp | 9 + lldb/tools/lldb-dap/DAPError.h | 11 +- .../Handler/ContinueRequestHandler.cpp | 5 +- lldb/tools/lldb-dap/Handler/RequestHandler.h | 83 +- .../lldb-dap/Handler/ScopesRequestHandler.cpp | 122 +- lldb/tools/lldb-dap/JSONUtils.cpp | 8 +- lldb/tools/lldb-dap/JSONUtils.h | 21 - .../lldb-dap/Protocol/ProtocolRequests.cpp | 14 + .../lldb-dap/Protocol/ProtocolRequests.h | 15 +- .../tools/lldb-dap/Protocol/ProtocolTypes.cpp | 326 +- lldb/tools/lldb-dap/Protocol/ProtocolTypes.h | 103 +- lldb/tools/lldb-dap/Watchpoint.cpp | 2 +- lldb/tools/lldb-dap/Watchpoint.h | 2 +- lldb/tools/lldb-dap/package.json | 20 + lldb/tools/lldb-dap/src-ts/extension.ts | 10 +- .../src-ts/ui/modules-data-provider.ts | 98 +- lldb/unittests/DAP/CMakeLists.txt | 4 + lldb/unittests/DAP/DAPTest.cpp | 38 + lldb/unittests/DAP/Handler/DisconnectTest.cpp | 35 + lldb/unittests/DAP/ProtocolTypesTest.cpp | 404 +- lldb/unittests/DAP/TestBase.cpp | 70 + lldb/unittests/DAP/TestBase.h | 48 + lldb/unittests/DAP/TransportTest.cpp | 94 + 
llvm/cmake/modules/HandleLLVMOptions.cmake | 2 + llvm/docs/AMDGPUUsage.rst | 12 +- llvm/docs/Coroutines.rst | 9 +- llvm/docs/GitHub.rst | 6 +- llvm/docs/GlobalISel/KnownBits.rst | 8 +- llvm/docs/LangRef.rst | 84 +- llvm/docs/MLGO.rst | 177 +- llvm/docs/NVPTXUsage.rst | 10 +- llvm/docs/RISCVUsage.rst | 4 + llvm/docs/ReleaseNotes.md | 5 +- llvm/include/llvm-c/Orc.h | 2 +- llvm/include/llvm/ADT/APFixedPoint.h | 5 +- llvm/include/llvm/ADT/APFloat.h | 5 +- llvm/include/llvm/ADT/APInt.h | 4 +- llvm/include/llvm/ADT/BitmaskEnum.h | 6 + llvm/include/llvm/ADT/DynamicAPInt.h | 2 + llvm/include/llvm/ADT/EquivalenceClasses.h | 2 + llvm/include/llvm/ADT/ImmutableSet.h | 10 +- llvm/include/llvm/ADT/SlowDynamicAPInt.h | 3 + llvm/include/llvm/ADT/TrieRawHashMap.h | 6 + llvm/include/llvm/ADT/Twine.h | 10 +- llvm/include/llvm/Analysis/AliasAnalysis.h | 15 +- .../llvm/Analysis/LoopAccessAnalysis.h | 9 +- llvm/include/llvm/BinaryFormat/DXContainer.h | 2 +- .../BinaryFormat/DXContainerConstants.def | 6 +- llvm/include/llvm/Bitstream/BitCodes.h | 12 +- llvm/include/llvm/CodeGen/GCMetadata.h | 39 +- .../CodeGen/GlobalISel/GISelValueTracking.h | 30 +- llvm/include/llvm/CodeGen/ISDOpcodes.h | 18 + llvm/include/llvm/CodeGen/MachineInstr.h | 2 +- llvm/include/llvm/CodeGen/MachineOperand.h | 10 +- llvm/include/llvm/CodeGen/Passes.h | 960 +- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 2 +- .../llvm/DebugInfo/DWARF/DWARFContext.h | 5 + llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h | 6 +- .../GSYM/{GsymDIContext.h => GsymContext.h} | 16 +- llvm/include/llvm/Demangle/ItaniumDemangle.h | 7 +- .../llvm/Frontend/Directive/DirectiveBase.td | 12 + .../llvm/Frontend/HLSL/HLSLRootSignature.h | 45 +- llvm/include/llvm/Frontend/OpenACC/ACC.td | 160 +- llvm/include/llvm/Frontend/OpenMP/OMP.td | 82 +- llvm/include/llvm/IR/DataLayout.h | 71 +- llvm/include/llvm/IR/DerivedTypes.h | 17 + llvm/include/llvm/IR/GCStrategy.h | 1 + llvm/include/llvm/IR/IRBuilder.h | 131 +- llvm/include/llvm/IR/Intrinsics.td | 2 
+- llvm/include/llvm/IR/IntrinsicsNVVM.td | 3050 +- llvm/include/llvm/IR/IntrinsicsRISCV.td | 6 + llvm/include/llvm/IR/IntrinsicsRISCVXAndes.td | 17 + llvm/include/llvm/IR/IntrinsicsSPIRV.td | 5 + llvm/include/llvm/IR/Metadata.h | 4 +- llvm/include/llvm/IR/ModuleSummaryIndex.h | 4 +- llvm/include/llvm/IR/User.h | 6 +- llvm/include/llvm/InitializePasses.h | 2 +- .../llvm/MC/DXContainerRootSignature.h | 69 +- .../llvm/Passes/MachinePassRegistry.def | 2 + .../include/llvm/ProfileData/DataAccessProf.h | 214 + llvm/include/llvm/ProfileData/InstrProf.h | 17 +- llvm/include/llvm/Support/BranchProbability.h | 4 +- llvm/include/llvm/Support/ConvertUTF.h | 4 + llvm/include/llvm/Support/DebugCounter.h | 4 +- llvm/include/llvm/Support/FileOutputBuffer.h | 5 +- llvm/include/llvm/Support/KnownBits.h | 5 +- llvm/include/llvm/Support/SMTAPI.h | 16 +- llvm/include/llvm/Support/ScaledNumber.h | 12 +- llvm/include/llvm/TableGen/DirectiveEmitter.h | 14 +- llvm/include/llvm/TableGen/Record.h | 72 +- .../include/llvm/Target/TargetSelectionDAG.td | 7 + .../llvm/Transforms/Coroutines/CoroShape.h | 11 +- .../Transforms/Vectorize/EVLIndVarSimplify.h | 31 + llvm/lib/Analysis/AliasAnalysis.cpp | 8 +- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 33 +- llvm/lib/Analysis/ValueTracking.cpp | 3 - llvm/lib/CGData/StableFunctionMap.cpp | 10 +- llvm/lib/CGData/StableFunctionMapRecord.cpp | 4 +- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h | 6 +- llvm/lib/CodeGen/CodeGenPrepare.cpp | 41 +- llvm/lib/CodeGen/GCMetadata.cpp | 21 +- .../CodeGen/GlobalISel/GISelValueTracking.cpp | 47 +- .../CodeGen/GlobalISel/InstructionSelect.cpp | 8 +- llvm/lib/CodeGen/GlobalISel/Legalizer.cpp | 9 +- llvm/lib/CodeGen/MIRPrinter.cpp | 5 +- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 9 +- llvm/lib/CodeGen/MachinePipeliner.cpp | 125 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 57 +- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 37 +- .../SelectionDAG/LegalizeFloatTypes.cpp | 38 +- 
.../SelectionDAG/LegalizeIntegerTypes.cpp | 34 +- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 4 + .../SelectionDAG/LegalizeTypesGeneric.cpp | 17 +- .../SelectionDAG/LegalizeVectorTypes.cpp | 56 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 20 + .../SelectionDAG/SelectionDAGBuilder.cpp | 20 +- .../SelectionDAG/SelectionDAGDumper.cpp | 4 + .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 1 + .../CodeGen/SelectionDAG/TargetLowering.cpp | 51 +- llvm/lib/CodeGen/ShadowStackGCLowering.cpp | 2 +- llvm/lib/DebugInfo/DWARF/DWARFContext.cpp | 9 +- llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp | 215 +- llvm/lib/DebugInfo/GSYM/CMakeLists.txt | 2 +- llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp | 19 +- .../{GsymDIContext.cpp => GsymContext.cpp} | 26 +- .../DebugInfo/LogicalView/Core/LVRange.cpp | 2 +- .../DebugInfo/LogicalView/Core/LVReader.cpp | 7 +- .../DebugInfo/LogicalView/Core/LVScope.cpp | 5 +- .../LogicalView/Readers/LVDWARFReader.cpp | 2 +- llvm/lib/DebugInfo/Symbolize/Symbolize.cpp | 6 +- .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 2 +- llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp | 62 + llvm/lib/IR/AutoUpgrade.cpp | 30 +- llvm/lib/IR/IRBuilder.cpp | 109 +- llvm/lib/IR/Verifier.cpp | 15 +- llvm/lib/LTO/LTOBackend.cpp | 48 +- llvm/lib/MC/DXContainerRootSignature.cpp | 40 +- llvm/lib/MC/MCParser/AsmParser.cpp | 9 - llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 30 +- llvm/lib/Passes/CMakeLists.txt | 1 + llvm/lib/Passes/PassBuilder.cpp | 2 + llvm/lib/Passes/PassBuilderPipelines.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 1 + llvm/lib/ProfileData/CMakeLists.txt | 1 + llvm/lib/ProfileData/DataAccessProf.cpp | 265 + llvm/lib/ProfileData/InstrProf.cpp | 8 +- llvm/lib/ProfileData/InstrProfWriter.cpp | 4 +- llvm/lib/Support/APFixedPoint.cpp | 3 + llvm/lib/Support/ConvertUTFWrapper.cpp | 10 + llvm/lib/Support/DebugCounter.cpp | 2 + llvm/lib/Support/DynamicAPInt.cpp | 4 +- llvm/lib/Support/FileOutputBuffer.cpp | 4 +- llvm/lib/Support/KnownBits.cpp | 5 +- 
llvm/lib/Support/ScaledNumber.cpp | 4 +- llvm/lib/Support/SlowDynamicAPInt.cpp | 4 +- llvm/lib/Support/Z3Solver.cpp | 2 + llvm/lib/TableGen/Record.cpp | 333 +- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 13 +- llvm/lib/Target/AArch64/AArch64CollectLOH.cpp | 9 +- .../Target/AArch64/AArch64FrameLowering.cpp | 12 +- .../Target/AArch64/AArch64ISelLowering.cpp | 368 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 8 +- .../lib/Target/AArch64/AArch64InstrFormats.td | 2 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 40 +- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 18 +- .../AArch64/AArch64TargetTransformInfo.cpp | 48 +- .../GISel/AArch64O0PreLegalizerCombiner.cpp | 9 +- .../GISel/AArch64PostLegalizerCombiner.cpp | 9 +- .../GISel/AArch64PreLegalizerCombiner.cpp | 9 +- .../AArch64/Utils/AArch64SMEAttributes.cpp | 67 +- .../AArch64/Utils/AArch64SMEAttributes.h | 115 +- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 8 +- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 18 +- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 + .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 6 + .../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 9 +- .../AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 9 +- .../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 39 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 +- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 124 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 446 +- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 1 + llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 61 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 13 +- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 4 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 12 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 16 +- llvm/lib/Target/ARM/ARMConstantIslandPass.cpp | 3 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 23 +- .../Target/CSKY/CSKYConstantIslandPass.cpp | 3 +- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 60 +- llvm/lib/Target/DirectX/DXILShaderFlags.cpp | 115 +- 
llvm/lib/Target/DirectX/DXILShaderFlags.h | 10 +- .../Target/DirectX/DirectXTargetMachine.cpp | 2 +- .../Hexagon/AsmParser/HexagonAsmParser.cpp | 5 + .../Target/Hexagon/HexagonISelLowering.cpp | 18 +- .../Target/Hexagon/HexagonISelLoweringHVX.cpp | 12 +- llvm/lib/Target/Hexagon/HexagonPatterns.td | 8 +- llvm/lib/Target/Hexagon/HexagonPatternsHVX.td | 16 +- llvm/lib/Target/M68k/CMakeLists.txt | 2 + llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp | 2 +- llvm/lib/Target/M68k/M68kISelLowering.cpp | 59 +- llvm/lib/Target/M68k/M68kISelLowering.h | 73 - llvm/lib/Target/M68k/M68kInstrData.td | 16 +- llvm/lib/Target/M68k/M68kInstrInfo.td | 42 +- llvm/lib/Target/M68k/M68kSelectionDAGInfo.cpp | 19 + llvm/lib/Target/M68k/M68kSelectionDAGInfo.h | 28 + llvm/lib/Target/M68k/M68kSubtarget.cpp | 13 +- llvm/lib/Target/M68k/M68kSubtarget.h | 12 +- .../M68k/MCTargetDesc/M68kMCCodeEmitter.cpp | 12 + .../Mips/MCTargetDesc/MipsMCAsmInfo.cpp | 2 + .../Target/Mips/MipsConstantIslandPass.cpp | 3 +- llvm/lib/Target/Mips/MipsISelLowering.cpp | 29 + llvm/lib/Target/Mips/MipsISelLowering.h | 1 + llvm/lib/Target/Mips/MipsInstrInfo.cpp | 8 + llvm/lib/Target/Mips/MipsInstrInfo.h | 2 + .../Target/Mips/MipsPostLegalizerCombiner.cpp | 9 +- .../Target/Mips/MipsPreLegalizerCombiner.cpp | 9 +- llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h | 14 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 28 - llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 - llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 17 + llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 46 +- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 22 +- .../RISCV/Disassembler/RISCVDisassembler.cpp | 14 +- .../GISel/RISCVO0PreLegalizerCombiner.cpp | 9 +- .../GISel/RISCVPostLegalizerCombiner.cpp | 9 +- .../RISCV/GISel/RISCVPreLegalizerCombiner.cpp | 9 +- .../MCTargetDesc/RISCVTargetStreamer.cpp | 56 + .../RISCV/MCTargetDesc/RISCVTargetStreamer.h | 1 + llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 15 +- llvm/lib/Target/RISCV/RISCVFeatures.td | 16 + 
llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 19 +- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 99 +- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h | 6 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 224 +- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 119 +- llvm/lib/Target/RISCV/RISCVInstrInfo.h | 12 + llvm/lib/Target/RISCV/RISCVInstrInfo.td | 23 +- llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 15 +- llvm/lib/Target/RISCV/RISCVInstrInfoF.td | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoQ.td | 167 + llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 73 +- llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 102 + llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td | 36 + llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td | 14 + .../Target/RISCV/RISCVMakeCompressible.cpp | 29 +- .../lib/Target/RISCV/RISCVMergeBaseOffset.cpp | 2 + llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 27 + llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 21 + llvm/lib/Target/RISCV/RISCVSchedGenericOOO.td | 2 + llvm/lib/Target/RISCV/RISCVSchedMIPSP8700.td | 1 + llvm/lib/Target/RISCV/RISCVSchedRocket.td | 1 + llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 1 + llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td | 1 + llvm/lib/Target/RISCV/RISCVSchedSiFiveP500.td | 1 + llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td | 1 + .../lib/Target/RISCV/RISCVSchedSpacemitX60.td | 1 + .../Target/RISCV/RISCVSchedSyntacoreSCR345.td | 1 + .../Target/RISCV/RISCVSchedSyntacoreSCR7.td | 1 + .../lib/Target/RISCV/RISCVSchedTTAscalonD8.td | 1 + .../Target/RISCV/RISCVSchedXiangShanNanHu.td | 1 + llvm/lib/Target/RISCV/RISCVSchedule.td | 121 +- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 7 + llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 3 +- llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 2 +- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 12 +- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 18 +- llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 2 +- .../SPIRV/SPIRVPreLegalizerCombiner.cpp | 9 +- 
llvm/lib/Target/Sparc/SparcISelLowering.cpp | 12 +- .../Target/SystemZ/SystemZFrameLowering.cpp | 2 +- .../Target/SystemZ/SystemZISelLowering.cpp | 32 +- .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 4 + llvm/lib/Target/X86/X86ISelLowering.cpp | 131 +- llvm/lib/TargetParser/RISCVISAInfo.cpp | 5 +- llvm/lib/TextAPI/InterfaceFile.cpp | 3 +- llvm/lib/Transforms/Coroutines/CoroEarly.cpp | 42 +- llvm/lib/Transforms/Coroutines/Coroutines.cpp | 17 +- .../lib/Transforms/IPO/ForceFunctionAttrs.cpp | 8 +- .../IPO/MemProfContextDisambiguation.cpp | 49 +- llvm/lib/Transforms/IPO/SampleProfile.cpp | 4 +- .../lib/Transforms/IPO/WholeProgramDevirt.cpp | 37 +- .../InstCombine/InstructionCombining.cpp | 5 +- .../ObjCARC/ARCRuntimeEntryPoints.h | 8 + llvm/lib/Transforms/ObjCARC/CMakeLists.txt | 1 + llvm/lib/Transforms/ObjCARC/ObjCARC.cpp | 29 + llvm/lib/Transforms/ObjCARC/ObjCARC.h | 7 +- .../Transforms/ObjCARC/ObjCARCContract.cpp | 46 +- llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 2 +- .../Scalar/ConstraintElimination.cpp | 25 +- .../Transforms/Scalar/InferAddressSpaces.cpp | 13 +- .../Transforms/Scalar/LoopIdiomRecognize.cpp | 52 +- llvm/lib/Transforms/Utils/CloneFunction.cpp | 15 +- .../Transforms/Utils/MemoryTaggingSupport.cpp | 5 +- llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 + .../Vectorize/EVLIndVarSimplify.cpp | 301 + .../Transforms/Vectorize/LoopVectorize.cpp | 73 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 254 +- .../Transforms/Vectorize/VPRecipeBuilder.h | 10 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 12 +- llvm/lib/Transforms/Vectorize/VPlan.h | 334 +- .../Transforms/Vectorize/VPlanAnalysis.cpp | 22 + llvm/lib/Transforms/Vectorize/VPlanAnalysis.h | 2 + .../Transforms/Vectorize/VPlanPatternMatch.h | 19 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 80 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 257 +- .../Transforms/Vectorize/VPlanTransforms.h | 7 + llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 6 +- llvm/lib/Transforms/Vectorize/VPlanUtils.h | 17 
+- llvm/lib/Transforms/Vectorize/VPlanValue.h | 2 + .../Transforms/Vectorize/VPlanVerifier.cpp | 37 +- llvm/test/Analysis/CostModel/AArch64/div.ll | 32 +- llvm/test/Analysis/CostModel/AArch64/rem.ll | 32 +- .../CostModel/AArch64/shuffle-broadcast.ll | 48 +- .../CostModel/AArch64/shuffle-load.ll | 12 +- .../CostModel/AArch64/shuffle-other.ll | 4 +- .../CostModel/AArch64/shuffle-store.ll | 26 +- llvm/test/Assembler/amdgcn-unreachable.ll | 32 + .../Assembler/autoupgrade-thread-pointer.ll | 4 +- .../AArch64/GlobalISel/knownbits-const.mir | 27 + .../test/CodeGen/AArch64/aarch64-sme-stubs.ll | 47 + .../argument-blocks-array-of-struct.ll | 493 +- .../CodeGen/AArch64/arm64-arith-saturating.ll | 4 +- llvm/test/CodeGen/AArch64/arm64-vshift.ll | 11 + .../AArch64/arm64ec-hybrid-patchable.ll | 12 +- llvm/test/CodeGen/AArch64/arm64ec-varargs.ll | 82 +- llvm/test/CodeGen/AArch64/bitcast-extend.ll | 8 +- llvm/test/CodeGen/AArch64/darwinpcs-tail.ll | 4 +- .../AArch64/fix-shuffle-vector-be-rev.ll | 4 +- .../CodeGen/AArch64/fp16-vector-shuffle.ll | 10 +- llvm/test/CodeGen/AArch64/itofp.ll | 112 +- llvm/test/CodeGen/AArch64/neon-bitcast.ll | 4 +- .../CodeGen/AArch64/neon-insert-sve-elt.ll | 12 +- .../CodeGen/AArch64/neon-insextbitcast.ll | 57 +- llvm/test/CodeGen/AArch64/nofpclass.ll | 182 + llvm/test/CodeGen/AArch64/reserveXreg.ll | 3 +- llvm/test/CodeGen/AArch64/shuffle-extend.ll | 20 +- .../test/CodeGen/AArch64/sme-peephole-opts.ll | 23 +- llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 4 +- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 68 +- .../CodeGen/AArch64/stack-tagging-prologue.ll | 2 +- .../CodeGen/AArch64/sve-intrinsics-while.ll | 8 +- llvm/test/CodeGen/AArch64/sve2-bsl.ll | 13 + llvm/test/CodeGen/AArch64/vararg-tallcall.ll | 22 +- llvm/test/CodeGen/AArch64/variant-pcs.ll | 3 + llvm/test/CodeGen/AArch64/vector-fcvt.ll | 48 +- .../test/CodeGen/AArch64/vselect-constants.ll | 7 +- llvm/test/CodeGen/AArch64/win64_vararg2.ll | 23 +- .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll | 68 
+- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 159 +- .../GlobalISel/combine-shift-amount-zext.mir | 146 + .../GlobalISel/dropped_debug_info_assert.ll | 82 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 32 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 66 +- .../GlobalISel/irtranslator-inline-asm.ll | 14 +- .../GlobalISel/irtranslator-metadata.ll | 3 +- .../load-legalize-range-metadata.ll | 6 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 105 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll | 6 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 77 +- .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 196 +- .../AMDGPU/GlobalISel/shl-ext-reduce.ll | 7 +- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 108 +- .../CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll | 58 +- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll | 24 +- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 33 +- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 258025 +++++++++++---- .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll | 20846 +- .../CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll | 3348 +- .../CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll | 1024 +- .../CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll | 7968 +- .../CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll | 4171 +- .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 36624 +- .../CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll | 5048 +- .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 37347 ++- .../CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll | 8300 +- .../CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll | 5871 +- .../CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll | 12957 +- .../CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll | 14652 +- .../CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll | 1403 +- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 80561 ++++- .../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll | 29953 +- .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 32586 +- .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll | 14264 +- .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 36241 +- .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 40427 ++- 
.../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 44733 ++- .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 59944 +++- .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 69883 ++-- .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll | 10983 +- .../test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll | 36 +- llvm/test/CodeGen/AMDGPU/anyext.ll | 8 +- llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll | 21 +- .../atomic_optimizations_local_pointer.ll | 18 - llvm/test/CodeGen/AMDGPU/bitreverse.ll | 9 +- .../CodeGen/AMDGPU/calling-conventions.ll | 413 +- .../CodeGen/AMDGPU/cgp-bitfield-extract.ll | 14 +- llvm/test/CodeGen/AMDGPU/commute-compares.ll | 979 +- llvm/test/CodeGen/AMDGPU/ctlz.ll | 28 +- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 16 +- llvm/test/CodeGen/AMDGPU/cttz.ll | 15 +- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 44 +- llvm/test/CodeGen/AMDGPU/fneg.ll | 13 +- ....path.ll => fptrunc.v2f16.no.fast.math.ll} | 14 + .../AMDGPU/lower-kernel-and-module-lds.ll | 18 +- llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll | 14 +- .../AMDGPU/lower-lds-struct-aa-memcpy.ll | 31 +- .../AMDGPU/lower-lds-struct-aa-merge.ll | 37 +- .../CodeGen/AMDGPU/lower-lds-struct-aa.ll | 51 +- .../lower-module-lds-all-indirect-accesses.ll | 12 +- ...ect-extern-uses-max-reachable-alignment.ll | 35 +- .../AMDGPU/lower-module-lds-via-hybrid.ll | 33 +- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 14 +- llvm/test/CodeGen/AMDGPU/min.ll | 466 +- llvm/test/CodeGen/AMDGPU/mmra.ll | 42 +- .../AMDGPU/sdwa-peephole-cndmask-fail.ll | 65 + .../AMDGPU/sdwa-peephole-cndmask-wave32.mir | 89 + llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 19 +- llvm/test/CodeGen/AMDGPU/sext-in-reg.ll | 13 +- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 6 +- llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 234 +- llvm/test/CodeGen/AMDGPU/sra.ll | 43 +- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 34 +- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 21 +- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 2 +- llvm/test/CodeGen/AMDGPU/zero_extend.ll | 7 +- llvm/test/CodeGen/ARM/nofpclass.ll 
| 37 + .../DirectX/ShaderFlags/disable-opt-cs.ll | 2 +- .../DirectX/ShaderFlags/disable-opt-lib.ll | 2 +- .../DirectX/ShaderFlags/low-precision.ll | 22 +- .../max-64-uavs-array-valver1.5.ll | 2 +- .../max-64-uavs-array-valver1.6.ll | 2 +- .../DirectX/ShaderFlags/max-64-uavs.ll | 2 +- .../ShaderFlags/res-may-not-alias-sm6.7.ll | 8 +- .../uavs-at-every-stage-lib-valver1.7.ll | 2 +- .../ShaderFlags/uavs-at-every-stage-vs.ll | 2 +- .../ShaderFlags/use-native-low-precision-0.ll | 22 +- .../ShaderFlags/use-native-low-precision-1.ll | 26 +- llvm/test/CodeGen/DirectX/llc-pipeline.ll | 3 +- llvm/test/CodeGen/Hexagon/fminmax-v67.ll | 45 +- llvm/test/CodeGen/Hexagon/fminmax.ll | 45 +- llvm/test/CodeGen/Mips/nofpclass.ll | 224 + .../CodeGen/Mips/private-global-prefix.ll | 24 + llvm/test/CodeGen/Mips/qnan.ll | 14 + llvm/test/CodeGen/Mips/unreachable.ll | 13 + llvm/test/CodeGen/NVPTX/convert-sm100a.ll | 82 + .../CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll | 46 + llvm/test/CodeGen/NVPTX/cp-async-bulk.ll | 6 +- llvm/test/CodeGen/NVPTX/shift-opt.ll | 182 + llvm/test/CodeGen/RISCV/attributes.ll | 8 + llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll | 58 +- .../CodeGen/RISCV/ctz_zero_return_test.ll | 360 +- .../test/CodeGen/RISCV/double-calling-conv.ll | 4 +- llvm/test/CodeGen/RISCV/double-convert.ll | 113 +- llvm/test/CodeGen/RISCV/double-imm.ll | 7 +- llvm/test/CodeGen/RISCV/double-mem.ll | 89 +- .../CodeGen/RISCV/double-previous-failure.ll | 14 +- .../CodeGen/RISCV/double-round-conv-sat.ll | 348 +- llvm/test/CodeGen/RISCV/features-info.ll | 2 + .../RISCV/fold-addi-loadstore-zilsd.ll | 30 + .../CodeGen/RISCV/make-compressible-zilsd.mir | 299 + llvm/test/CodeGen/RISCV/mul-expand.ll | 504 +- llvm/test/CodeGen/RISCV/mul.ll | 296 +- .../RISCV/note-gnu-property-zicfiss.ll | 31 + llvm/test/CodeGen/RISCV/riscv-zihintpause.ll | 14 + llvm/test/CodeGen/RISCV/rv64xtheadbb.ll | 160 +- llvm/test/CodeGen/RISCV/rv64zbb.ll | 160 +- .../RISCV/rvv/combine-reduce-add-to-vcpop.ll | 54 +- 
.../CodeGen/RISCV/rvv/known-never-zero.ll | 33 +- .../RISCV/rvv/stack-probing-dynamic.ll | 82 +- llvm/test/CodeGen/RISCV/rvv/vl-opt.mir | 25 + .../RISCV/rvv/xandesvpackfph-vfpmadb.ll | 299 + .../RISCV/rvv/xandesvpackfph-vfpmadt.ll | 299 + .../CodeGen/RISCV/srem-seteq-illegal-types.ll | 231 +- llvm/test/CodeGen/RISCV/stack-offset.ll | 196 + .../CodeGen/RISCV/urem-seteq-illegal-types.ll | 346 +- .../CodeGen/RISCV/xqccmp-additional-stack.ll | 6 +- llvm/test/CodeGen/RISCV/xqcibi.ll | 359 + llvm/test/CodeGen/RISCV/xqcibm-extract.ll | 233 + .../CodeGen/RISCV/zcmp-additional-stack.ll | 7 +- .../CodeGen/RISCV/zdinx-boundary-check.ll | 297 +- llvm/test/CodeGen/RISCV/zdinx-large-spill.mir | 53 +- llvm/test/CodeGen/RISCV/zdinx-memoperand.ll | 5 +- llvm/test/CodeGen/RISCV/zdinx-spill.ll | 71 + llvm/test/CodeGen/RISCV/zilsd.ll | 121 + .../CodeGen/SPIRV/global-var-name-align.ll | 76 + .../CodeGen/SPIRV/hlsl-resources/Packed.ll | 37 + .../pointers/resource-addrspacecast-2.ll | 3 - .../SPIRV/pointers/resource-addrspacecast.ll | 3 - llvm/test/CodeGen/X86/avg-mask.ll | 48 +- llvm/test/CodeGen/X86/avgfloors.ll | 12 +- .../CodeGen/X86/machine-combiner-int-vec.ll | 18 +- llvm/test/CodeGen/X86/nofpclass.ll | 25 + llvm/test/CodeGen/X86/pr63108.ll | 2 +- .../test/CodeGen/X86/vector-shuffle-256-v4.ll | 60 + .../HWAddressSanitizer/alloca-array.ll | 2 +- .../HWAddressSanitizer/alloca-compat.ll | 2 +- .../HWAddressSanitizer/alloca-with-calls.ll | 2 +- .../HWAddressSanitizer/exception-lifetime.ll | 2 +- .../HWAddressSanitizer/prologue.ll | 4 +- .../use-after-scope-setjmp.ll | 2 +- .../HWAddressSanitizer/use-after-scope.ll | 40 +- llvm/test/MC/AsmParser/token.s | 7 + .../Mips/mips32r6/valid-mips32r6.txt | 6 +- .../Mips/mips64r6/valid-mips64r6.txt | 6 +- llvm/test/MC/M68k/Data/Classes/MxMOVEM_MR.s | 4 + llvm/test/MC/M68k/Data/Classes/MxMOVEM_RM.s | 4 + llvm/test/MC/Mips/macro-rem.s | 2 +- llvm/test/MC/RISCV/rv32q-invalid.s | 21 + llvm/test/MC/RISCV/rv64q-invalid.s | 9 + 
llvm/test/MC/RISCV/rv64q-valid.s | 43 + llvm/test/MC/RISCV/rv64zfa-only-valid.s | 19 + llvm/test/MC/RISCV/rvq-aliases-valid.s | 55 + llvm/test/MC/RISCV/rvq-pseudos.s | 12 + llvm/test/MC/RISCV/rvq-valid.s | 184 + llvm/test/MC/RISCV/xandesvdot-valid.s | 51 + llvm/test/MC/RISCV/xqcilia-valid.s | 10 + llvm/test/MC/RISCV/zfa-invalid.s | 13 +- llvm/test/MC/RISCV/zfa-quad-invalid.s | 42 + llvm/test/MC/RISCV/zfa-valid.s | 391 +- llvm/test/TableGen/directive1.td | 17 + llvm/test/TableGen/directive2.td | 17 + llvm/test/ThinLTO/X86/cache-emit-asm.ll | 15 + .../CodeGenPrepare/X86/sink-addr-reuse.ll | 44 + .../and-implied-by-operands.ll | 42 +- .../Transforms/ConstraintElimination/eq.ll | 6 +- .../gep-arithmetic-signed-predicates.ll | 6 +- .../geps-precondition-overflow-check.ll | 12 +- .../loops-bottom-tested-pointer-cmps.ll | 13 +- .../loops-header-tested-pointer-cmps.ll | 6 +- .../or-implied-by-operands.ll | 32 +- .../Transforms/ConstraintElimination/or.ll | 3 +- llvm/test/Transforms/Coroutines/gh105595.ll | 31 + .../ForcedFunctionAttrs/open-file-error.ll | 6 + llvm/test/Transforms/GVN/phi.ll | 5 +- llvm/test/Transforms/GVN/pre-compare.ll | 29 +- llvm/test/Transforms/GVN/readattrs.ll | 22 +- llvm/test/Transforms/GVN/setjmp.ll | 42 +- llvm/test/Transforms/GVN/tbaa.ll | 221 +- llvm/test/Transforms/GVN/vscale.ll | 454 +- .../LoopUnroll/peel-last-iteration.ll | 176 +- .../LoopVectorize/AArch64/blend-costs.ll | 4 +- .../LoopVectorize/AArch64/masked-call.ll | 6 +- .../LoopVectorize/ARM/mve-reduction-types.ll | 4 +- .../LoopVectorize/ARM/mve-reductions.ll | 120 +- .../LoopVectorize/RISCV/evl-iv-simplify.ll | 333 + ...ruction-or-drop-poison-generating-flags.ll | 4 +- .../Transforms/LoopVectorize/RISCV/pr88802.ll | 4 +- .../LoopVectorize/if-conversion-nest.ll | 14 +- .../Transforms/LoopVectorize/if-reduction.ll | 10 +- .../LoopVectorize/no_outside_user.ll | 6 +- .../test/Transforms/LoopVectorize/phi-cost.ll | 4 +- .../pr55167-fold-tail-live-out.ll | 10 +- 
.../LoopVectorize/reduction-inloop-pred.ll | 22 +- .../LoopVectorize/reduction-inloop.ll | 28 +- .../Transforms/LoopVectorize/reduction.ll | 20 +- .../LoopVectorize/single-value-blend-phis.ll | 12 +- .../tail-folding-counting-down.ll | 2 +- .../Transforms/LoopVectorize/uniform-blend.ll | 5 +- .../vplan-printing-reductions.ll | 145 + .../contract-attached-call-retain-to-claim.ll | 35 + .../AArch64/reused-scalar-repeated-in-node.ll | 10 +- .../X86/buildvectors-parent-phi-nodes.ll | 4 +- .../X86/full-matched-bv-with-subvectors.ll | 2 +- .../X86/long-pointer-distance.ll | 8 +- .../X86/matched-bv-schedulable.ll | 4 +- .../X86/matched-nodes-updated.ll | 8 +- .../X86/node-outside-used-only.ll | 40 + .../X86/phi-operand-gathered-loads.ll | 53 + .../reduced-val-vectorized-in-transform.ll | 2 +- .../X86/split-node-num-operands.ll | 2 +- llvm/test/Transforms/SLPVectorizer/revec.ll | 4 +- llvm/test/Transforms/SafeStack/AArch64/abi.ll | 2 +- .../Transforms/SafeStack/AArch64/abi_ssp.ll | 4 +- .../SafeStack/AArch64/unreachable.ll | 2 +- .../virtual-const-prop-begin.ll | 10 +- .../virtual-const-prop-check.ll | 18 +- .../virtual-const-prop-end.ll | 18 +- .../virtual-const-prop-small-alignment-32.ll | 76 +- .../virtual-const-prop-small-alignment-64.ll | 43 +- .../AArch64/skip_unsupported_instructions.s | 6 +- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 2 +- llvm/tools/llvm-objdump/llvm-objdump.cpp | 59 +- llvm/unittests/ADT/BitmaskEnumTest.cpp | 11 + llvm/unittests/CodeGen/CMakeLists.txt | 1 + llvm/unittests/CodeGen/GCMetadata.cpp | 76 + .../CodeGen/GlobalISel/KnownBitsTest.cpp | 46 - llvm/unittests/ProfileData/CMakeLists.txt | 1 + .../ProfileData/DataAccessProfTest.cpp | 181 + .../Support/FileOutputBufferTest.cpp | 2 +- .../Target/AArch64/SMEAttributesTest.cpp | 106 +- .../TargetParser/RISCVISAInfoTest.cpp | 2 + .../Transforms/IPO/WholeProgramDevirt.cpp | 2 +- .../Transforms/Vectorize/CMakeLists.txt | 1 + .../Vectorize/VPlanPatternMatchTest.cpp | 55 + 
.../Transforms/Vectorize/VPlanTest.cpp | 3 +- .../Vectorize/VPlanVerifierTest.cpp | 38 + llvm/utils/TableGen/AsmWriterEmitter.cpp | 11 +- .../utils/TableGen/Basic/DirectiveEmitter.cpp | 77 + .../TableGen/Common/CodeGenDAGPatterns.cpp | 12 +- .../TableGen/Common/VarLenCodeEmitterGen.cpp | 5 +- llvm/utils/TableGen/FastISelEmitter.cpp | 223 +- llvm/utils/TableGen/X86RecognizableInstr.cpp | 806 +- llvm/utils/TableGen/X86RecognizableInstr.h | 25 +- .../clang-tools-extra/clang-doc/BUILD.gn | 1 + .../unittests/clang-doc/BUILD.gn | 2 + .../gn/secondary/clang/unittests/CIR/BUILD.gn | 5 + .../llvm/lib/DebugInfo/GSYM/BUILD.gn | 2 +- .../gn/secondary/llvm/lib/Passes/BUILD.gn | 1 + .../secondary/llvm/lib/ProfileData/BUILD.gn | 1 + .../llvm/lib/Transforms/Vectorize/BUILD.gn | 1 + .../secondary/llvm/unittests/CodeGen/BUILD.gn | 1 + .../llvm/unittests/ProfileData/BUILD.gn | 1 + .../unittests/Transforms/Vectorize/BUILD.gn | 1 + mlir/docs/Bufferization.md | 14 +- mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td | 163 +- .../IR/BufferizableOpInterface.h | 4 +- .../Dialect/Bufferization/IR/Bufferization.h | 8 +- .../Bufferization/IR/BufferizationOps.td | 12 +- .../Bufferization/Transforms/Bufferize.h | 2 +- .../FuncBufferizableOpInterfaceImpl.h | 3 + .../Bufferization/Transforms/Passes.td | 6 +- mlir/include/mlir/Dialect/CMakeLists.txt | 1 - mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 70 + .../mlir/Dialect/OpenACC/OpenACCOps.td | 55 +- .../mlir/Dialect/Polynomial/CMakeLists.txt | 1 - .../mlir/Dialect/Polynomial/IR/CMakeLists.txt | 7 - .../mlir/Dialect/Polynomial/IR/Polynomial.h | 282 - .../mlir/Dialect/Polynomial/IR/Polynomial.td | 350 - .../Polynomial/IR/PolynomialAttributes.h | 17 - .../Polynomial/IR/PolynomialAttributes.td | 222 - .../Dialect/Polynomial/IR/PolynomialDialect.h | 19 - .../Polynomial/IR/PolynomialDialect.td | 55 - .../Dialect/Polynomial/IR/PolynomialOps.h | 21 - .../Dialect/Polynomial/IR/PolynomialTypes.h | 17 - .../Dialect/Polynomial/IR/PolynomialTypes.td | 33 - 
.../SparseTensor/IR/SparseTensorOps.td | 16 +- .../Vector/Transforms/VectorRewritePatterns.h | 15 +- .../mlir/Dialect/X86Vector/X86Vector.td | 25 +- .../Dialect/X86Vector/X86VectorInterfaces.td | 6 +- mlir/include/mlir/IR/OpImplementation.h | 6 + mlir/include/mlir/InitAllDialects.h | 2 - mlir/include/mlir/Target/LLVMIR/Import.h | 6 +- .../include/mlir/Target/LLVMIR/ModuleImport.h | 5 +- .../mlir/Target/LLVMIR/ModuleTranslation.h | 28 +- .../include/mlir/Target/LLVMIR/TypeFromLLVM.h | 5 +- mlir/lib/AsmParser/AsmParserImpl.h | 10 + mlir/lib/AsmParser/Lexer.cpp | 2 +- mlir/lib/AsmParser/TokenKinds.def | 1 + mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp | 2 +- .../Transforms/AffineDataCopyGeneration.cpp | 9 +- .../Affine/Transforms/RaiseMemrefDialect.cpp | 2 +- .../ArmSME/Transforms/TileAllocation.cpp | 8 +- .../Transforms/LegalizeForLLVMExport.cpp | 84 +- .../IR/BufferDeallocationOpInterface.cpp | 3 +- .../IR/BufferizableOpInterface.cpp | 16 +- .../Bufferization/IR/BufferizationOps.cpp | 88 +- .../Bufferization/Transforms/Bufferize.cpp | 38 +- .../FuncBufferizableOpInterfaceImpl.cpp | 44 +- .../Transforms/OneShotAnalysis.cpp | 2 +- .../Transforms/OneShotModuleBufferize.cpp | 12 +- mlir/lib/Dialect/CMakeLists.txt | 1 - mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp | 5 +- .../TransformOps/LinalgTransformOps.cpp | 86 + .../Transforms/ConvertToDestinationStyle.cpp | 8 +- .../Transforms/DataLayoutPropagation.cpp | 23 +- ...DecomposeGenericByUnfoldingPermutation.cpp | 2 +- .../Linalg/Transforms/TilingInterfaceImpl.cpp | 4 +- .../Linalg/Transforms/Vectorization.cpp | 101 +- .../Linalg/Transforms/WinogradConv2D.cpp | 2 +- mlir/lib/Dialect/Mesh/IR/MeshOps.cpp | 2 +- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 93 +- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 47 +- mlir/lib/Dialect/Polynomial/CMakeLists.txt | 1 - mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt | 26 - mlir/lib/Dialect/Polynomial/IR/Polynomial.cpp | 68 - .../Polynomial/IR/PolynomialAttributes.cpp | 236 - 
.../IR/PolynomialCanonicalization.td | 44 - .../Polynomial/IR/PolynomialDialect.cpp | 49 - .../Dialect/Polynomial/IR/PolynomialOps.cpp | 298 - mlir/lib/Dialect/Quant/IR/QuantOps.cpp | 2 +- .../Transforms/StructuralTypeConversions.cpp | 2 +- .../SparseTensor/IR/SparseTensorDialect.cpp | 4 +- .../Transforms/SparseGPUCodegen.cpp | 2 +- .../Transforms/Sparsification.cpp | 2 +- .../Transforms/Utils/CodegenEnv.cpp | 9 +- .../Transforms/Utils/CodegenUtils.cpp | 41 +- .../Transforms/Utils/LoopEmitter.cpp | 2 +- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 2 +- .../BufferizableOpInterfaceImpl.cpp | 6 +- .../Tosa/Transforms/TosaProfileCompliance.cpp | 5 +- mlir/lib/Dialect/Utils/StaticValueUtils.cpp | 4 +- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 32 +- .../Vector/Transforms/VectorLinearize.cpp | 421 +- mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp | 3 +- .../Dialect/X86Vector/IR/X86VectorDialect.cpp | 72 +- .../Transforms/LegalizeForLLVMExport.cpp | 21 +- mlir/lib/ExecutionEngine/JitRunner.cpp | 10 +- mlir/lib/Pass/Pass.cpp | 2 +- mlir/lib/TableGen/Pattern.cpp | 4 +- mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp | 8 +- .../LLVMIR/LLVMToLLVMIRTranslation.cpp | 17 +- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 24 +- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 54 +- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 6 +- mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp | 19 +- mlir/lib/Transforms/CompositePass.cpp | 4 +- .../MeshToMPI/convert-mesh-to-mpi.mlir | 2 +- .../Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 25 + .../OpenACCToSCF/convert-openacc-to-scf.mlir | 12 +- mlir/test/Dialect/Affine/loop-fusion-4.mlir | 2 +- mlir/test/Dialect/Arith/bufferize.mlir | 6 +- .../Dialect/ArmSVE/legalize-for-llvm.mlir | 53 + mlir/test/Dialect/ArmSVE/roundtrip.mlir | 11 + .../dealloc-other.mlir | 4 +- ...ne-shot-bufferize-allow-return-allocs.mlir | 4 +- .../one-shot-bufferize-analysis.mlir | 20 +- .../one-shot-bufferize-encodings.mlir | 12 +- .../one-shot-bufferize-partial.mlir | 22 +- 
.../Transforms/one-shot-bufferize.mlir | 16 +- ...ule-bufferize-force-copy-before-write.mlir | 12 +- .../Transforms/one-shot-module-bufferize.mlir | 10 +- .../Transforms/tensorlike-bufferlike.mlir | 4 +- .../Transforms/transform-ops.mlir | 6 +- .../Dialect/Bufferization/canonicalize.mlir | 32 +- mlir/test/Dialect/Bufferization/ops.mlir | 8 +- .../ControlFlow/one-shot-bufferize.mlir | 4 +- mlir/test/Dialect/LLVMIR/roundtrip.mlir | 8 + mlir/test/Dialect/Linalg/bufferize.mlir | 14 +- .../Linalg/data-layout-propagation.mlir | 31 +- mlir/test/Dialect/Linalg/hoisting.mlir | 4 +- .../transform-op-bufferize-to-allocation.mlir | 4 +- .../transform-op-fuse-into-containing.mlir | 100 + mlir/test/Dialect/Linalg/vectorization.mlir | 8 +- .../Dialect/MemRef/normalize-memrefs.mlir | 2 +- mlir/test/Dialect/OpenACC/invalid.mlir | 6 +- mlir/test/Dialect/OpenACC/ops.mlir | 100 +- mlir/test/Dialect/OpenMP/ops.mlir | 10 +- mlir/test/Dialect/Polynomial/attributes.mlir | 73 - .../Dialect/Polynomial/canonicalization.mlir | 47 - mlir/test/Dialect/Polynomial/ops.mlir | 112 - mlir/test/Dialect/Polynomial/ops_errors.mlir | 126 - mlir/test/Dialect/Polynomial/types.mlir | 65 - mlir/test/Dialect/SCF/bufferize.mlir | 12 +- .../SCF/one-shot-bufferize-encodings.mlir | 8 +- mlir/test/Dialect/Shape/bufferize.mlir | 2 +- .../SparseTensor/GPU/gpu_matmul24_lib.mlir | 6 +- .../SparseTensor/GPU/gpu_matmul_lib.mlir | 4 +- .../SparseTensor/GPU/gpu_matvec_lib.mlir | 4 +- .../GPU/gpu_sampled_matmul_lib.mlir | 4 +- .../SparseTensor/GPU/gpu_sddmm_lib.mlir | 4 +- .../SparseTensor/constant_index_map.mlir | 4 +- mlir/test/Dialect/SparseTensor/dense.mlir | 6 +- .../fuse_sparse_pad_with_consumer.mlir | 2 +- .../test/Dialect/SparseTensor/sorted_coo.mlir | 4 +- mlir/test/Dialect/SparseTensor/sparse_1d.mlir | 60 +- mlir/test/Dialect/SparseTensor/sparse_2d.mlir | 78 +- mlir/test/Dialect/SparseTensor/sparse_3d.mlir | 82 +- .../Dialect/SparseTensor/sparse_affine.mlir | 16 +- .../Dialect/SparseTensor/sparse_batch.mlir | 2 
+- .../Dialect/SparseTensor/sparse_fp_ops.mlir | 22 +- .../Dialect/SparseTensor/sparse_fusion.mlir | 2 +- .../Dialect/SparseTensor/sparse_int_ops.mlir | 34 +- .../Dialect/SparseTensor/sparse_kernels.mlir | 18 +- .../sparse_kernels_to_iterator.mlir | 2 +- .../Dialect/SparseTensor/sparse_lower.mlir | 8 +- .../SparseTensor/sparse_lower_col.mlir | 8 +- .../SparseTensor/sparse_lower_inplace.mlir | 8 +- mlir/test/Dialect/SparseTensor/sparse_nd.mlir | 4 +- .../Dialect/SparseTensor/sparse_outbuf.mlir | 6 +- .../Dialect/SparseTensor/sparse_pack.mlir | 12 +- .../SparseTensor/sparse_parallel_reduce.mlir | 4 +- .../Dialect/SparseTensor/sparse_perm.mlir | 4 +- .../SparseTensor/sparse_perm_lower.mlir | 4 +- .../Dialect/SparseTensor/sparse_scalars.mlir | 4 +- .../Dialect/SparseTensor/sparse_sddmm.mlir | 10 +- .../SparseTensor/sparse_sddmm_org.mlir | 4 +- .../SparseTensor/sparse_vector_chain.mlir | 2 +- .../SparseTensor/sparse_vector_index.mlir | 4 +- mlir/test/Dialect/SparseTensor/spy_sddmm.mlir | 4 +- .../Dialect/SparseTensor/spy_sddmm_bsr.mlir | 4 +- .../Dialect/SparseTensor/unused-tensor.mlir | 4 +- .../SparseTensor/vectorize_reduction.mlir | 28 +- mlir/test/Dialect/Tensor/bufferize.mlir | 42 +- mlir/test/Dialect/Vector/bufferize.mlir | 6 +- mlir/test/Dialect/Vector/canonicalize.mlir | 22 - .../Vector/canonicalize/vector-transpose.mlir | 93 +- mlir/test/Dialect/Vector/linearize.mlir | 135 +- .../Vector/vector-transpose-lowering.mlir | 16 +- mlir/test/IR/parser.mlir | 7 + mlir/test/IR/parser_dialect_loading.mlir | 19 - .../Tosa/CPU/test-maxpool-dynamic.mlir | 4 +- .../Dialect/Vector/CPU/AMX/mulf-full.mlir | 4 +- .../Dialect/Vector/CPU/AMX/muli-full.mlir | 4 +- .../Import/import-structs-as-literals.ll | 13 + .../test/Target/LLVMIR/Import/instructions.ll | 45 +- mlir/test/Target/LLVMIR/Import/struct.ll | 10 + mlir/test/Target/LLVMIR/arm-sve.mlir | 44 + mlir/test/Target/LLVMIR/blockaddress.mlir | 29 + mlir/test/Target/LLVMIR/llvmir.mlir | 24 +- 
mlir/test/Transforms/composite-pass.mlir | 6 +- .../Linalg/TestLinalgFusionTransforms.cpp | 3 +- mlir/test/lib/Dialect/Test/CMakeLists.txt | 1 - mlir/test/lib/Dialect/Test/TestAttrDefs.td | 30 +- mlir/test/lib/Dialect/Test/TestAttributes.cpp | 18 + mlir/test/lib/Dialect/Test/TestAttributes.h | 4 +- mlir/test/lib/Dialect/Test/TestOps.td | 14 +- .../Dialect/Vector/TestVectorTransforms.cpp | 12 +- .../verify-entry-point-result.mlir | 7 - mlir/test/mlir-runner/verify-entry-point.mlir | 48 + mlir/test/mlir-tblgen/cpp-class-comments.td | 139 + mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp | 9 +- mlir/tools/mlir-tblgen/CMakeLists.txt | 1 + mlir/tools/mlir-tblgen/CppGenUtilities.cpp | 39 + mlir/tools/mlir-tblgen/CppGenUtilities.h | 29 + mlir/tools/mlir-tblgen/DialectGen.cpp | 8 +- mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 9 +- mlir/tools/mlir-tblgen/OpDocGen.cpp | 17 +- mlir/tools/mlir-tblgen/OpInterfacesGen.cpp | 9 +- mlir/unittests/Dialect/CMakeLists.txt | 1 - .../Dialect/Polynomial/CMakeLists.txt | 8 - .../Dialect/Polynomial/PolynomialMathTest.cpp | 44 - .../tree-sitter-mlir/dialect/bufferization.js | 45 +- .../tree-sitter-mlir/queries/highlights.scm | 2 +- .../common/include/PluginInterface.h | 3 - .../test/offloading/gpupgo/pgo_atomic_teams.c | 102 + .../offloading/gpupgo/pgo_atomic_threads.c | 84 + .../gpupgo/{pgo2.c => pgo_device_and_host.c} | 11 +- .../gpupgo/{pgo1.c => pgo_device_only.c} | 20 +- openmp/runtime/src/ompt-internal.h | 3 +- openmp/runtime/src/ompt-specific.h | 7 +- openmp/runtime/test/ompt/callback.h | 8 + third-party/unittest/googletest/README.LLVM | 3 + .../googletest/include/gtest/gtest-printers.h | 8 +- utils/bazel/configure.bzl | 2 +- .../lldb/source/Plugins/BUILD.bazel | 3 +- .../llvm-project-overlay/mlir/BUILD.bazel | 91 - .../mlir/python/BUILD.bazel | 2 +- .../mlir/test/BUILD.bazel | 3 +- 1592 files changed, 693079 insertions(+), 219652 deletions(-) create mode 100644 clang/bindings/python/tests/cindex/test_lib.py create mode 100644 
clang/docs/analyzer/user-docs/Options.rst.in create mode 100644 clang/docs/tools/generate_analyzer_options_docs.py create mode 100644 clang/include/clang/CIR/Dialect/OpenACC/CIROpenACCTypeInterfaces.h create mode 100644 clang/include/clang/CIR/Dialect/OpenACC/RegisterOpenACCExtensions.h create mode 100644 clang/lib/CIR/Dialect/OpenACC/CIROpenACCTypeInterfaces.cpp create mode 100644 clang/lib/CIR/Dialect/OpenACC/CMakeLists.txt create mode 100644 clang/lib/CIR/Dialect/OpenACC/RegisterOpenACCExtensions.cpp create mode 100644 clang/test/AST/ByteCode/lifetimes26.cpp create mode 100644 clang/test/Analysis/ftime-trace-no-init.cpp create mode 100644 clang/test/Analysis/generate_analyzer_options_docs.test create mode 100644 clang/test/CIR/CodeGen/switch_flat_op.cpp create mode 100644 clang/test/CIR/IR/switch-flat.cir create mode 100644 clang/test/CIR/Transforms/switch.cir create mode 100644 clang/test/CodeCompletion/source-loc-zero.cpp create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c create mode 100644 clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp create mode 100644 clang/test/CodeGen/RISCV/riscv-zihintpause.c create mode 100644 clang/test/CodeGen/attr-counted-by-for-pointers.c create mode 100644 clang/test/CodeGenHLSL/RootSignature.hlsl create mode 100644 clang/test/CodeGenHLSL/convergence/global_array.hlsl create mode 100644 clang/test/Headers/__clang_hip_cmath-return_types.hip create mode 100644 clang/test/Modules/pr130712.cppm create mode 100644 clang/test/Modules/pr140130.cpp create mode 100644 clang/test/Modules/sdk-settings-json-dep.m create mode 100644 clang/test/OpenMP/openmp_non_c_directives.c create mode 100644 clang/test/Refactor/source-loc-zero.cpp create mode 100644 clang/test/SemaCXX/consteval-assert.cpp create mode 100644 clang/test/SemaCXX/libstdcxx_format_kind_hack.cpp delete mode 100644 clang/test/SemaCXX/libstdcxx_gets_hack.cpp delete mode 100644 clang/test/SemaCXX/libstdcxx_pointer_return_false_hack.cpp 
create mode 100644 clang/test/SemaCXX/ms_struct-bitfield-padding.cpp create mode 100644 clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp create mode 100644 clang/test/SemaCXX/warn-nrvo.cpp create mode 100644 clang/test/SemaOpenACC/gh139894.cpp create mode 100644 clang/unittests/CIR/CMakeLists.txt create mode 100644 clang/unittests/CIR/PointerLikeTest.cpp create mode 100644 flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 create mode 100644 flang/test/Preprocessing/func-on-command-line.F90 create mode 100644 flang/test/Semantics/OpenMP/cancellation-construct-type.f90 create mode 100644 flang/test/Semantics/pad-hollerith-arg.f create mode 100644 libc/hdr/types/ACTION.h delete mode 100644 libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp delete mode 100644 libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp delete mode 100644 libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp delete mode 100644 libcxx/test/benchmarks/algorithms/sort.bench.cpp create mode 100644 libcxx/test/benchmarks/algorithms/sorting/common.h create mode 100644 libcxx/test/benchmarks/algorithms/sorting/is_sorted.bench.cpp create mode 100644 libcxx/test/benchmarks/algorithms/sorting/is_sorted_until.bench.cpp create mode 100644 libcxx/test/benchmarks/algorithms/sorting/partial_sort.bench.cpp create mode 100644 libcxx/test/benchmarks/algorithms/sorting/partial_sort_copy.bench.cpp create mode 100644 libcxx/test/benchmarks/algorithms/sorting/sort.bench.cpp create mode 100644 libcxx/test/benchmarks/algorithms/sorting/stable_sort.bench.cpp delete mode 100644 libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp create mode 100644 libcxx/test/libcxx/clang_tidy.sh.py create mode 100644 libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp create mode 100644 libcxx/test/libcxx/containers/container.adaptors/flat.set/insert_range.pass.cpp create mode 100644 
lld/test/COFF/arm64x-sameaddress.test create mode 100644 lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp create mode 100644 lldb/source/Plugins/Process/AIX/NativeThreadAIX.h create mode 100644 lldb/test/Shell/ObjectFile/XCOFF/basic-info32.yaml create mode 100644 lldb/unittests/DAP/DAPTest.cpp create mode 100644 lldb/unittests/DAP/Handler/DisconnectTest.cpp create mode 100644 lldb/unittests/DAP/TestBase.cpp create mode 100644 lldb/unittests/DAP/TestBase.h create mode 100644 lldb/unittests/DAP/TransportTest.cpp rename llvm/include/llvm/DebugInfo/GSYM/{GsymDIContext.h => GsymContext.h} (81%) create mode 100644 llvm/include/llvm/IR/IntrinsicsRISCVXAndes.td create mode 100644 llvm/include/llvm/ProfileData/DataAccessProf.h create mode 100644 llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h rename llvm/lib/DebugInfo/GSYM/{GsymDIContext.cpp => GsymContext.cpp} (82%) create mode 100644 llvm/lib/ProfileData/DataAccessProf.cpp create mode 100644 llvm/lib/Target/M68k/M68kSelectionDAGInfo.cpp create mode 100644 llvm/lib/Target/M68k/M68kSelectionDAGInfo.h create mode 100644 llvm/lib/Target/RISCV/RISCVInstrInfoQ.td create mode 100644 llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp create mode 100644 llvm/test/Assembler/amdgcn-unreachable.ll create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/knownbits-const.mir create mode 100644 llvm/test/CodeGen/AArch64/aarch64-sme-stubs.ll create mode 100644 llvm/test/CodeGen/AArch64/nofpclass.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir rename llvm/test/CodeGen/AMDGPU/{fptrunc.v2f16.no.fast.path.ll => fptrunc.v2f16.no.fast.math.ll} (93%) create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll create mode 100644 llvm/test/CodeGen/ARM/nofpclass.ll create mode 100644 llvm/test/CodeGen/Mips/nofpclass.ll create mode 100644 llvm/test/CodeGen/Mips/private-global-prefix.ll create mode 100644 llvm/test/CodeGen/Mips/qnan.ll create mode 100644 
llvm/test/CodeGen/Mips/unreachable.ll create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll create mode 100644 llvm/test/CodeGen/NVPTX/shift-opt.ll create mode 100644 llvm/test/CodeGen/RISCV/fold-addi-loadstore-zilsd.ll create mode 100644 llvm/test/CodeGen/RISCV/make-compressible-zilsd.mir create mode 100644 llvm/test/CodeGen/RISCV/note-gnu-property-zicfiss.ll create mode 100644 llvm/test/CodeGen/RISCV/riscv-zihintpause.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/xandesvpackfph-vfpmadb.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/xandesvpackfph-vfpmadt.ll create mode 100644 llvm/test/CodeGen/RISCV/xqcibi.ll create mode 100644 llvm/test/CodeGen/RISCV/xqcibm-extract.ll create mode 100644 llvm/test/CodeGen/RISCV/zdinx-spill.ll create mode 100644 llvm/test/CodeGen/RISCV/zilsd.ll create mode 100644 llvm/test/CodeGen/SPIRV/global-var-name-align.ll create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-resources/Packed.ll create mode 100644 llvm/test/CodeGen/X86/nofpclass.ll create mode 100644 llvm/test/MC/AsmParser/token.s create mode 100644 llvm/test/MC/RISCV/rv32q-invalid.s create mode 100644 llvm/test/MC/RISCV/rv64q-invalid.s create mode 100644 llvm/test/MC/RISCV/rv64q-valid.s create mode 100644 llvm/test/MC/RISCV/rv64zfa-only-valid.s create mode 100644 llvm/test/MC/RISCV/rvq-aliases-valid.s create mode 100644 llvm/test/MC/RISCV/rvq-pseudos.s create mode 100644 llvm/test/MC/RISCV/rvq-valid.s create mode 100644 llvm/test/MC/RISCV/xandesvdot-valid.s create mode 100644 llvm/test/MC/RISCV/zfa-quad-invalid.s create mode 100644 llvm/test/ThinLTO/X86/cache-emit-asm.ll create mode 100644 llvm/test/Transforms/CodeGenPrepare/X86/sink-addr-reuse.ll create mode 100644 llvm/test/Transforms/Coroutines/gh105595.ll create mode 100644 llvm/test/Transforms/ForcedFunctionAttrs/open-file-error.ll create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll create mode 100644 llvm/test/Transforms/ObjCARC/contract-attached-call-retain-to-claim.ll 
create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/phi-operand-gathered-loads.ll create mode 100644 llvm/unittests/CodeGen/GCMetadata.cpp create mode 100644 llvm/unittests/ProfileData/DataAccessProfTest.cpp create mode 100644 llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp create mode 100644 llvm/utils/gn/secondary/clang/unittests/CIR/BUILD.gn delete mode 100644 mlir/include/mlir/Dialect/Polynomial/CMakeLists.txt delete mode 100644 mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt delete mode 100644 mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h delete mode 100644 mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td delete mode 100644 mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.h delete mode 100644 mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td delete mode 100644 mlir/include/mlir/Dialect/Polynomial/IR/PolynomialDialect.h delete mode 100644 mlir/include/mlir/Dialect/Polynomial/IR/PolynomialDialect.td delete mode 100644 mlir/include/mlir/Dialect/Polynomial/IR/PolynomialOps.h delete mode 100644 mlir/include/mlir/Dialect/Polynomial/IR/PolynomialTypes.h delete mode 100644 mlir/include/mlir/Dialect/Polynomial/IR/PolynomialTypes.td delete mode 100644 mlir/lib/Dialect/Polynomial/CMakeLists.txt delete mode 100644 mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt delete mode 100644 mlir/lib/Dialect/Polynomial/IR/Polynomial.cpp delete mode 100644 mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp delete mode 100644 mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td delete mode 100644 mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp delete mode 100644 mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp delete mode 100644 mlir/test/Dialect/Polynomial/attributes.mlir delete mode 100644 mlir/test/Dialect/Polynomial/canonicalization.mlir delete mode 100644 mlir/test/Dialect/Polynomial/ops.mlir delete mode 100644 
mlir/test/Dialect/Polynomial/ops_errors.mlir delete mode 100644 mlir/test/Dialect/Polynomial/types.mlir delete mode 100644 mlir/test/IR/parser_dialect_loading.mlir create mode 100644 mlir/test/Target/LLVMIR/Import/import-structs-as-literals.ll create mode 100644 mlir/test/Target/LLVMIR/Import/struct.ll delete mode 100644 mlir/test/mlir-runner/verify-entry-point-result.mlir create mode 100644 mlir/test/mlir-runner/verify-entry-point.mlir create mode 100644 mlir/test/mlir-tblgen/cpp-class-comments.td create mode 100644 mlir/tools/mlir-tblgen/CppGenUtilities.cpp create mode 100644 mlir/tools/mlir-tblgen/CppGenUtilities.h delete mode 100644 mlir/unittests/Dialect/Polynomial/CMakeLists.txt delete mode 100644 mlir/unittests/Dialect/Polynomial/PolynomialMathTest.cpp create mode 100644 offload/test/offloading/gpupgo/pgo_atomic_teams.c create mode 100644 offload/test/offloading/gpupgo/pgo_atomic_threads.c rename offload/test/offloading/gpupgo/{pgo2.c => pgo_device_and_host.c} (95%) rename offload/test/offloading/gpupgo/{pgo1.c => pgo_device_only.c} (85%) diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index ebf68e0690fa5..69ba47a5e75d2 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -702,6 +702,7 @@ mlgo: - llvm/unittests/CodeGen/ML* - llvm/test/CodeGen/MLRegAlloc/** - llvm/utils/mlgo-utils/** + - llvm/docs/MLGO.rst tools:llvm-exegesis: - llvm/tools/llvm-exegesis/** diff --git a/bolt/include/bolt/Profile/Heatmap.h b/bolt/include/bolt/Profile/Heatmap.h index fc1e2cd30011e..9813e7fed486d 100644 --- a/bolt/include/bolt/Profile/Heatmap.h +++ b/bolt/include/bolt/Profile/Heatmap.h @@ -52,6 +52,9 @@ class Heatmap { : BucketSize(BucketSize), MinAddress(MinAddress), MaxAddress(MaxAddress), TextSections(TextSections) {} + uint64_t HotStart{0}; + uint64_t HotEnd{0}; + inline bool ignoreAddress(uint64_t Address) const { return (Address > MaxAddress) || (Address < MinAddress); } diff --git a/bolt/lib/Core/BinaryFunction.cpp 
b/bolt/lib/Core/BinaryFunction.cpp index 8367d2d686991..851fa36a6b4b7 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -3326,7 +3326,7 @@ void BinaryFunction::duplicateConstantIslands() { static std::string constructFilename(std::string Filename, std::string Annotation, std::string Suffix) { - std::replace(Filename.begin(), Filename.end(), '/', '-'); + llvm::replace(Filename, '/', '-'); if (!Annotation.empty()) Annotation.insert(0, "-"); if (Filename.size() + Annotation.size() + Suffix.size() > MAX_PATH) { diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp index 136c23d50df64..072a152119ae2 100644 --- a/bolt/lib/Core/DIEBuilder.cpp +++ b/bolt/lib/Core/DIEBuilder.cpp @@ -437,10 +437,10 @@ getUnitForOffset(DIEBuilder &Builder, DWARFContext &DWCtx, // This is a work around for XCode clang. There is a build error when we // pass DWCtx.compile_units() to llvm::upper_bound std::call_once(InitVectorFlag, initCUVector); - auto CUIter = std::upper_bound(CUOffsets.begin(), CUOffsets.end(), Offset, - [](uint64_t LHS, const DWARFUnit *RHS) { - return LHS < RHS->getNextUnitOffset(); - }); + auto CUIter = llvm::upper_bound(CUOffsets, Offset, + [](uint64_t LHS, const DWARFUnit *RHS) { + return LHS < RHS->getNextUnitOffset(); + }); CU = CUIter != CUOffsets.end() ? (*CUIter) : nullptr; } return CU; diff --git a/bolt/lib/Passes/AsmDump.cpp b/bolt/lib/Passes/AsmDump.cpp index c0be0116f20fb..0bc5a06f53ac2 100644 --- a/bolt/lib/Passes/AsmDump.cpp +++ b/bolt/lib/Passes/AsmDump.cpp @@ -109,7 +109,7 @@ void dumpFunction(const BinaryFunction &BF) { } std::string PrintName = BF.getPrintName(); - std::replace(PrintName.begin(), PrintName.end(), '/', '-'); + llvm::replace(PrintName, '/', '-'); std::string Filename = opts::AsmDump.empty() ? 
(PrintName + ".s") diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 420ffc8e01c5c..e356481bbdc7c 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -35,7 +35,7 @@ static const char *dynoStatsOptName(const bolt::DynoStats::Category C) { OptNames[C] = bolt::DynoStats::Description(C); - std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-'); + llvm::replace(OptNames[C], ' ', '-'); return OptNames[C].c_str(); } diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index c7db9d262e942..6beb60741406e 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -1316,6 +1316,14 @@ std::error_code DataAggregator::printLBRHeatMap() { } Heatmap HM(opts::HeatmapBlock, opts::HeatmapMinAddress, opts::HeatmapMaxAddress, getTextSections(BC)); + auto getSymbolValue = [&](const MCSymbol *Symbol) -> uint64_t { + if (Symbol) + if (ErrorOr SymValue = BC->getSymbolValue(*Symbol)) + return SymValue.get(); + return 0; + }; + HM.HotStart = getSymbolValue(BC->getHotTextStartSymbol()); + HM.HotEnd = getSymbolValue(BC->getHotTextEndSymbol()); if (!NumTotalSamples) { if (opts::BasicAggregation) { diff --git a/bolt/lib/Profile/Heatmap.cpp b/bolt/lib/Profile/Heatmap.cpp index 003db3cc61137..c66c2e5487613 100644 --- a/bolt/lib/Profile/Heatmap.cpp +++ b/bolt/lib/Profile/Heatmap.cpp @@ -8,6 +8,7 @@ #include "bolt/Profile/Heatmap.h" #include "bolt/Utils/CommandLineOpts.h" +#include "llvm/ADT/AddressRanges.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" @@ -313,6 +314,9 @@ void Heatmap::printSectionHotness(raw_ostream &OS) const { UnmappedHotness += Frequency; }; + AddressRange HotTextRange(HotStart, HotEnd); + StringRef HotTextName = "[hot text]"; + for (const std::pair &KV : Map) { NumTotalCounts += KV.second; // We map an address bucket to the first section (lowest address) @@ -328,15 +332,24 @@ void 
Heatmap::printSectionHotness(raw_ostream &OS) const { } SectionHotness[TextSections[TextSectionIndex].Name] += KV.second; ++BucketUtilization[TextSections[TextSectionIndex].Name]; + if (HotTextRange.contains(Address)) { + SectionHotness[HotTextName] += KV.second; + ++BucketUtilization[HotTextName]; + } } + std::vector Sections(TextSections); + // Append synthetic hot text section to TextSections + if (!HotTextRange.empty()) + Sections.emplace_back(SectionNameAndRange{HotTextName, HotStart, HotEnd}); + assert(NumTotalCounts > 0 && "total number of heatmap buckets should be greater than 0"); OS << "Section Name, Begin Address, End Address, Percentage Hotness, " << "Utilization Pct, Partition Score\n"; const uint64_t MappedCounts = NumTotalCounts - UnmappedHotness; - for (const auto [Name, Begin, End] : TextSections) { + for (const auto [Name, Begin, End] : Sections) { const float Hotness = 1. * SectionHotness[Name] / NumTotalCounts; const float MappedHotness = MappedCounts ? 1. * SectionHotness[Name] / MappedCounts : 0; diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 614938d0e3b65..dd519431fb2e3 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -968,8 +968,9 @@ void RewriteInstance::discoverFileObjects() { continue; } - // Ignore input hot markers - if (SymName == "__hot_start" || SymName == "__hot_end") + // Ignore input hot markers unless in heatmap mode + if ((SymName == "__hot_start" || SymName == "__hot_end") && + !opts::HeatmapMode) continue; FileSymRefs.emplace(SymbolAddress, Symbol); diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s index 44e3bf21c14c0..4994cfb541eef 100644 --- a/bolt/test/X86/callcont-fallthru.s +++ b/bolt/test/X86/callcont-fallthru.s @@ -6,7 +6,7 @@ # RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib # RUN: link_fdata %s %t %t.pat PREAGGT1 # RUN: link_fdata %s %t %t.pat2 PREAGGT2 -# RUN: link_fdata %s %t 
%t.patplt PREAGGPLT +# RUN-DISABLED: link_fdata %s %t %t.patplt PREAGGPLT # RUN: llvm-strip --strip-unneeded %t -o %t.strip # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh @@ -26,8 +26,8 @@ ## Check pre-aggregated traces don't report zero-sized PLT fall-through as ## invalid trace -# RUN: llvm-bolt %t.strip --pa -p %t.patplt -o %t.out | FileCheck %s \ -# RUN: --check-prefix=CHECK-PLT +# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.patplt -o %t.out | FileCheck %s \ +# RUN-DISABLED: --check-prefix=CHECK-PLT # CHECK-PLT: traces mismatching disassembled function contents: 0 .globl foo diff --git a/bolt/test/X86/heatmap-preagg.test b/bolt/test/X86/heatmap-preagg.test index 702dc804f5133..306e74800a353 100644 --- a/bolt/test/X86/heatmap-preagg.test +++ b/bolt/test/X86/heatmap-preagg.test @@ -13,6 +13,7 @@ RUN: --reorder-functions=cdsort --enable-bat --dyno-stats --skip-funcs=main RUN: llvm-bolt-heatmap %t.out -o %t2 --pa -p %p/Inputs/blarge_new_bat.preagg.txt \ RUN: 2>&1 | FileCheck --check-prefix CHECK-HEATMAP-BAT %s RUN: FileCheck %s --check-prefix CHECK-SEC-HOT-BAT --input-file %t2-section-hotness.csv +RUN: llvm-nm -n %t.out | FileCheck %s --check-prefix=CHECK-HOT-SYMS CHECK-HEATMAP: PERF2BOLT: read 81 aggregated LBR entries CHECK-HEATMAP: HEATMAP: invalid traces: 1 @@ -33,3 +34,6 @@ CHECK-SEC-HOT-BAT-NEXT: .bolt.org.text, 0x4010b0, 0x401c25, 38.3385, 51.0638, 0. 
CHECK-SEC-HOT-BAT-NEXT: .fini, 0x401c28, 0x401c35, 0.0000, 0.0000, 0.0000 CHECK-SEC-HOT-BAT-NEXT: .text, 0x800000, 0x8002cc, 38.7595, 91.6667, 0.3553 CHECK-SEC-HOT-BAT-NEXT: .text.cold, 0x800300, 0x800415, 0.0000, 0.0000, 0.0000 +CHECK-SEC-HOT-BAT-NEXT: [hot text], 0x800000, 0x8002cc, 38.7595, 91.6667, 0.3553 +CHECK-HOT-SYMS: 800000 W __hot_start +CHECK-HOT-SYMS: 8002cc W __hot_end diff --git a/clang-tools-extra/clang-doc/HTMLGenerator.cpp b/clang-tools-extra/clang-doc/HTMLGenerator.cpp index 4ad5ba29b28b8..eea9c6bcd18ad 100644 --- a/clang-tools-extra/clang-doc/HTMLGenerator.cpp +++ b/clang-tools-extra/clang-doc/HTMLGenerator.cpp @@ -1022,12 +1022,12 @@ static llvm::Error serializeIndex(ClangDocContext &CDCtx) { // JavaScript from escaping characters incorrectly, and introducing bad paths // in the URLs. std::string RootPathEscaped = RootPath.str().str(); - std::replace(RootPathEscaped.begin(), RootPathEscaped.end(), '\\', '/'); + llvm::replace(RootPathEscaped, '\\', '/'); OS << "var RootPath = \"" << RootPathEscaped << "\";\n"; llvm::SmallString<128> Base(CDCtx.Base); std::string BaseEscaped = Base.str().str(); - std::replace(BaseEscaped.begin(), BaseEscaped.end(), '\\', '/'); + llvm::replace(BaseEscaped, '\\', '/'); OS << "var Base = \"" << BaseEscaped << "\";\n"; CDCtx.Idx.sort(); diff --git a/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp b/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp index 42d358a15083a..ba89070be59cc 100644 --- a/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.cpp @@ -49,9 +49,9 @@ std::string LLVMHeaderGuardCheck::getHeaderGuard(StringRef Filename, if (PosLLVM != StringRef::npos) Guard = Guard.substr(PosLLVM); - std::replace(Guard.begin(), Guard.end(), '/', '_'); - std::replace(Guard.begin(), Guard.end(), '.', '_'); - std::replace(Guard.begin(), Guard.end(), '-', '_'); + llvm::replace(Guard, '/', '_'); + llvm::replace(Guard, '.', '_'); + llvm::replace(Guard, 
'-', '_'); // The prevalent style in clang is LLVM_CLANG_FOO_BAR_H if (StringRef(Guard).starts_with("clang")) diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp index 3f63eec2c51a8..04040d580b6f1 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp @@ -496,7 +496,7 @@ void IdentifierNamingCheck::HungarianNotation::loadFileConfig( StringRef Val = Options.get(Buffer, ""); if (!Val.empty()) { std::string Type = PrimType.str(); - std::replace(Type.begin(), Type.end(), '-', ' '); + llvm::replace(Type, '-', ' '); HNOption.PrimitiveType[Type] = Val.str(); } } @@ -1358,7 +1358,7 @@ IdentifierNamingCheck::getFailureInfo( std::string KindName = fixupWithCase(Type, StyleNames[SK], ND, Style, HNOption, IdentifierNamingCheck::CT_LowerCase); - std::replace(KindName.begin(), KindName.end(), '_', ' '); + llvm::replace(KindName, '_', ' '); std::string Fixup = fixupWithStyle(Type, Name, Style, HNOption, ND); if (StringRef(Fixup) == Name) { diff --git a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp index 97c20cf200fa2..f8fd5e91d90d1 100644 --- a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp @@ -202,8 +202,7 @@ bool MagicNumbersCheck::isIgnoredValue(const IntegerLiteral *Literal) const { if (IgnorePowersOf2IntegerValues && IntValue.isPowerOf2()) return true; - return std::binary_search(IgnoredIntegerValues.begin(), - IgnoredIntegerValues.end(), Value); + return llvm::binary_search(IgnoredIntegerValues, Value); } bool MagicNumbersCheck::isIgnoredValue(const FloatingLiteral *Literal) const { @@ -213,14 +212,12 @@ bool MagicNumbersCheck::isIgnoredValue(const FloatingLiteral *Literal) const { if (&FloatValue.getSemantics() == 
&llvm::APFloat::IEEEsingle()) { const float Value = FloatValue.convertToFloat(); - return std::binary_search(IgnoredFloatingPointValues.begin(), - IgnoredFloatingPointValues.end(), Value); + return llvm::binary_search(IgnoredFloatingPointValues, Value); } if (&FloatValue.getSemantics() == &llvm::APFloat::IEEEdouble()) { const double Value = FloatValue.convertToDouble(); - return std::binary_search(IgnoredDoublePointValues.begin(), - IgnoredDoublePointValues.end(), Value); + return llvm::binary_search(IgnoredDoublePointValues, Value); } return false; diff --git a/clang-tools-extra/clangd/Diagnostics.cpp b/clang-tools-extra/clangd/Diagnostics.cpp index 28bb994a9e99a..bc605fda5b0ce 100644 --- a/clang-tools-extra/clangd/Diagnostics.cpp +++ b/clang-tools-extra/clangd/Diagnostics.cpp @@ -800,7 +800,7 @@ void StoreDiags::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel, M << "'"; } // Don't allow source code to inject newlines into diagnostics. - std::replace(Message.begin(), Message.end(), '\n', ' '); + llvm::replace(Message, '\n', ' '); } } if (Message.empty()) // either !SyntheticMessage, or we failed to make one. 
diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp index 62f220b32bd10..91fd3b0f8567b 100644 --- a/clang-tools-extra/clangd/FindTarget.cpp +++ b/clang-tools-extra/clangd/FindTarget.cpp @@ -57,7 +57,7 @@ LLVM_ATTRIBUTE_UNUSED std::string nodeToString(const DynTypedNode &N) { OS << ": "; N.print(OS, PrintingPolicy(LangOptions())); } - std::replace(S.begin(), S.end(), '\n', ' '); + llvm::replace(S, '\n', ' '); return S; } diff --git a/clang-tools-extra/clangd/index/FileIndex.cpp b/clang-tools-extra/clangd/index/FileIndex.cpp index 0fe069783d64f..c49de377d54ca 100644 --- a/clang-tools-extra/clangd/index/FileIndex.cpp +++ b/clang-tools-extra/clangd/index/FileIndex.cpp @@ -79,7 +79,8 @@ SlabTuple indexSymbols(ASTContext &AST, Preprocessor &PP, SymbolCollector Collector(std::move(CollectorOpts)); Collector.setPreprocessor(PP); - index::indexTopLevelDecls(AST, PP, DeclsToIndex, Collector, IndexOpts); + index::indexTopLevelDecls(AST, PP, DeclsToIndex, Collector, + std::move(IndexOpts)); if (MacroRefsToIndex) Collector.handleMacros(*MacroRefsToIndex); diff --git a/clang-tools-extra/clangd/index/dex/dexp/Dexp.cpp b/clang-tools-extra/clangd/index/dex/dexp/Dexp.cpp index d192749870d6f..fa451daf1bb52 100644 --- a/clang-tools-extra/clangd/index/dex/dexp/Dexp.cpp +++ b/clang-tools-extra/clangd/index/dex/dexp/Dexp.cpp @@ -381,7 +381,7 @@ std::unique_ptr openIndex(llvm::StringRef Index) { bool runCommand(std::string Request, const SymbolIndex &Index) { // Split on spaces and add required null-termination. 
- std::replace(Request.begin(), Request.end(), ' ', '\0'); + llvm::replace(Request, ' ', '\0'); llvm::SmallVector Args; llvm::StringRef(Request).split(Args, '\0', /*MaxSplit=*/-1, /*KeepEmpty=*/false); diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp index 576e863c8a9d2..a8f1ddf64d34b 100644 --- a/clang-tools-extra/modularize/ModularizeUtilities.cpp +++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp @@ -443,7 +443,7 @@ static std::string replaceDotDot(StringRef Path) { // \returns The file path in canonical form. std::string ModularizeUtilities::getCanonicalPath(StringRef FilePath) { std::string Tmp(replaceDotDot(FilePath)); - std::replace(Tmp.begin(), Tmp.end(), '\\', '/'); + llvm::replace(Tmp, '\\', '/'); StringRef Tmp2(Tmp); if (Tmp2.starts_with("./")) Tmp = std::string(Tmp2.substr(2)); diff --git a/clang-tools-extra/modularize/ModuleAssistant.cpp b/clang-tools-extra/modularize/ModuleAssistant.cpp index b4ec96617449f..7519056833040 100644 --- a/clang-tools-extra/modularize/ModuleAssistant.cpp +++ b/clang-tools-extra/modularize/ModuleAssistant.cpp @@ -156,8 +156,8 @@ ensureNoCollisionWithReservedName(llvm::StringRef MightBeReservedName) { static std::string ensureVaidModuleName(llvm::StringRef MightBeInvalidName) { std::string SafeName(MightBeInvalidName); - std::replace(SafeName.begin(), SafeName.end(), '-', '_'); - std::replace(SafeName.begin(), SafeName.end(), '.', '_'); + llvm::replace(SafeName, '-', '_'); + llvm::replace(SafeName, '.', '_'); if (isdigit(SafeName[0])) SafeName = "_" + SafeName; return SafeName; @@ -192,7 +192,7 @@ static bool addModuleDescription(Module *RootModule, return true; } // Make canonical. - std::replace(FilePath.begin(), FilePath.end(), '\\', '/'); + llvm::replace(FilePath, '\\', '/'); // Insert module into tree, using subdirectories as submodules. 
for (llvm::sys::path::const_iterator I = llvm::sys::path::begin(FilePath), E = llvm::sys::path::end(FilePath); diff --git a/clang-tools-extra/modularize/PreprocessorTracker.cpp b/clang-tools-extra/modularize/PreprocessorTracker.cpp index 336f570217933..04abb2733f5a7 100644 --- a/clang-tools-extra/modularize/PreprocessorTracker.cpp +++ b/clang-tools-extra/modularize/PreprocessorTracker.cpp @@ -904,7 +904,7 @@ class PreprocessorTrackerImpl : public PreprocessorTracker { // Convert to a canonical path. std::string getCanonicalPath(llvm::StringRef path) const { std::string CanonicalPath(path); - std::replace(CanonicalPath.begin(), CanonicalPath.end(), '\\', '/'); + llvm::replace(CanonicalPath, '\\', '/'); return CanonicalPath; } diff --git a/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp b/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp index 4c916fa30685b..f92e406c652ea 100644 --- a/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp +++ b/clang-tools-extra/pp-trace/PPCallbacksTracker.cpp @@ -44,7 +44,7 @@ static std::string getSourceLocationString(Preprocessor &PP, std::string Result = SS.str(); // YAML treats backslash as escape, so use forward slashes. - std::replace(Result.begin(), Result.end(), '\\', '/'); + llvm::replace(Result, '\\', '/'); return Result; } @@ -653,7 +653,7 @@ void PPCallbacksTracker::appendFilePathArgument(const char *Name, llvm::StringRef Value) { std::string Path(Value); // YAML treats backslash as escape, so use forward slashes. - std::replace(Path.begin(), Path.end(), '\\', '/'); + llvm::replace(Path, '\\', '/'); appendQuotedArgument(Name, Path); } diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index 4ff7f318416b7..a49441e815004 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -62,7 +62,24 @@ # # o implement additional SourceLocation, SourceRange, and File methods. 
-from ctypes import * +from ctypes import ( + Array, + CDLL, + CFUNCTYPE, + POINTER, + Structure, + byref, + c_char_p, + c_int, + c_longlong, + c_uint, + c_ulong, + c_ulonglong, + c_void_p, + cast, + cdll, + py_object, +) import os import sys @@ -73,6 +90,7 @@ Callable, cast as Tcast, Generic, + Iterator, Optional, Sequence, Type as TType, @@ -1544,6 +1562,24 @@ class ExceptionSpecificationKind(BaseEnumeration): ### Cursors ### +def cursor_null_guard(func): + """ + This decorator is used to ensure that no methods are called on null-cursors. + The bindings map null cursors to `None`, so users are not expected + to encounter them. + + If necessary, you can check whether a cursor is the null-cursor by + calling its `is_null` method. + """ + + def inner(self, *args, **kwargs): + if self.is_null(): + raise Exception("Tried calling method on a null-cursor.") + return func(self, *args, **kwargs) + + return inner + + class Cursor(Structure): """ The Cursor class represents a reference to an element within the AST. It @@ -1552,68 +1588,81 @@ class Cursor(Structure): _fields_ = [("_kind_id", c_int), ("xdata", c_int), ("data", c_void_p * 3)] - @staticmethod - def from_location(tu, location): - # We store a reference to the TU in the instance so the TU won't get - # collected before the cursor. 
- cursor = conf.lib.clang_getCursor(tu, location) - cursor._tu = tu + _tu: TranslationUnit - return cursor + @staticmethod + def from_location(tu: TranslationUnit, location: SourceLocation) -> Cursor | None: + return Cursor.from_result(conf.lib.clang_getCursor(tu, location), tu) - def __eq__(self, other): + # This function is not null-guarded because it is used in cursor_null_guard itself + def __eq__(self, other: object) -> bool: if not isinstance(other, Cursor): return False return conf.lib.clang_equalCursors(self, other) # type: ignore [no-any-return] - def __ne__(self, other): + # Not null-guarded for consistency with __eq__ + def __ne__(self, other: object) -> bool: return not self.__eq__(other) + @cursor_null_guard def __hash__(self) -> int: return self.hash - def is_definition(self): + # This function is not null-guarded because it is used in cursor_null_guard itself + def is_null(self) -> bool: + return self == conf.null_cursor + + @cursor_null_guard + def is_definition(self) -> bool: """ Returns true if the declaration pointed at by the cursor is also a definition of that entity. """ return conf.lib.clang_isCursorDefinition(self) # type: ignore [no-any-return] - def is_const_method(self): + @cursor_null_guard + def is_const_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared 'const'. 
""" return conf.lib.clang_CXXMethod_isConst(self) # type: ignore [no-any-return] - def is_converting_constructor(self): + @cursor_null_guard + def is_converting_constructor(self) -> bool: """Returns True if the cursor refers to a C++ converting constructor.""" return conf.lib.clang_CXXConstructor_isConvertingConstructor(self) # type: ignore [no-any-return] - def is_copy_constructor(self): + @cursor_null_guard + def is_copy_constructor(self) -> bool: """Returns True if the cursor refers to a C++ copy constructor.""" return conf.lib.clang_CXXConstructor_isCopyConstructor(self) # type: ignore [no-any-return] - def is_default_constructor(self): + @cursor_null_guard + def is_default_constructor(self) -> bool: """Returns True if the cursor refers to a C++ default constructor.""" return conf.lib.clang_CXXConstructor_isDefaultConstructor(self) # type: ignore [no-any-return] - def is_move_constructor(self): + @cursor_null_guard + def is_move_constructor(self) -> bool: """Returns True if the cursor refers to a C++ move constructor.""" return conf.lib.clang_CXXConstructor_isMoveConstructor(self) # type: ignore [no-any-return] - def is_default_method(self): + @cursor_null_guard + def is_default_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared '= default'. """ return conf.lib.clang_CXXMethod_isDefaulted(self) # type: ignore [no-any-return] - def is_deleted_method(self): + @cursor_null_guard + def is_deleted_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared '= delete'. """ return conf.lib.clang_CXXMethod_isDeleted(self) # type: ignore [no-any-return] - def is_copy_assignment_operator_method(self): + @cursor_null_guard + def is_copy_assignment_operator_method(self) -> bool: """Returnrs True if the cursor refers to a copy-assignment operator. 
A copy-assignment operator `X::operator=` is a non-static, @@ -1638,7 +1687,8 @@ class Bar { """ return conf.lib.clang_CXXMethod_isCopyAssignmentOperator(self) # type: ignore [no-any-return] - def is_move_assignment_operator_method(self): + @cursor_null_guard + def is_move_assignment_operator_method(self) -> bool: """Returnrs True if the cursor refers to a move-assignment operator. A move-assignment operator `X::operator=` is a non-static, @@ -1663,7 +1713,8 @@ class Bar { """ return conf.lib.clang_CXXMethod_isMoveAssignmentOperator(self) # type: ignore [no-any-return] - def is_explicit_method(self): + @cursor_null_guard + def is_explicit_method(self) -> bool: """Determines if a C++ constructor or conversion function is explicit, returning 1 if such is the case and 0 otherwise. @@ -1708,41 +1759,48 @@ class Foo { """ return conf.lib.clang_CXXMethod_isExplicit(self) # type: ignore [no-any-return] - def is_mutable_field(self): + @cursor_null_guard + def is_mutable_field(self) -> bool: """Returns True if the cursor refers to a C++ field that is declared 'mutable'. """ return conf.lib.clang_CXXField_isMutable(self) # type: ignore [no-any-return] - def is_pure_virtual_method(self): + @cursor_null_guard + def is_pure_virtual_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared pure virtual. """ return conf.lib.clang_CXXMethod_isPureVirtual(self) # type: ignore [no-any-return] - def is_static_method(self): + @cursor_null_guard + def is_static_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared 'static'. """ return conf.lib.clang_CXXMethod_isStatic(self) # type: ignore [no-any-return] - def is_virtual_method(self): + @cursor_null_guard + def is_virtual_method(self) -> bool: """Returns True if the cursor refers to a C++ member function or member function template that is declared 'virtual'. 
""" return conf.lib.clang_CXXMethod_isVirtual(self) # type: ignore [no-any-return] - def is_abstract_record(self): + @cursor_null_guard + def is_abstract_record(self) -> bool: """Returns True if the cursor refers to a C++ record declaration that has pure virtual member functions. """ return conf.lib.clang_CXXRecord_isAbstract(self) # type: ignore [no-any-return] - def is_scoped_enum(self): + @cursor_null_guard + def is_scoped_enum(self) -> bool: """Returns True if the cursor refers to a scoped enum declaration.""" return conf.lib.clang_EnumDecl_isScoped(self) # type: ignore [no-any-return] - def get_definition(self): + @cursor_null_guard + def get_definition(self) -> Cursor | None: """ If the cursor is a reference to a declaration or a declaration of some entity, return a cursor that points to the definition of that @@ -1752,7 +1810,8 @@ def get_definition(self): # declaration prior to issuing the lookup. return Cursor.from_result(conf.lib.clang_getCursorDefinition(self), self) - def get_usr(self): + @cursor_null_guard + def get_usr(self) -> str: """Return the Unified Symbol Resolution (USR) for the entity referenced by the given cursor. 
@@ -1763,19 +1822,22 @@ def get_usr(self): another translation unit.""" return _CXString.from_result(conf.lib.clang_getCursorUSR(self)) - def get_included_file(self): + @cursor_null_guard + def get_included_file(self) -> File: """Returns the File that is included by the current inclusion cursor.""" assert self.kind == CursorKind.INCLUSION_DIRECTIVE return File.from_result(conf.lib.clang_getIncludedFile(self), self) @property - def kind(self): + @cursor_null_guard + def kind(self) -> CursorKind: """Return the kind of this cursor.""" return CursorKind.from_id(self._kind_id) @property - def spelling(self): + @cursor_null_guard + def spelling(self) -> str: """Return the spelling of the entity pointed at by the cursor.""" if not hasattr(self, "_spelling"): self._spelling = _CXString.from_result( @@ -1784,7 +1846,8 @@ def spelling(self): return self._spelling - def pretty_printed(self, policy): + @cursor_null_guard + def pretty_printed(self, policy: PrintingPolicy) -> str: """ Pretty print declarations. Parameters: @@ -1795,7 +1858,8 @@ def pretty_printed(self, policy): ) @property - def displayname(self): + @cursor_null_guard + def displayname(self) -> str: """ Return the display name for the entity referenced by this cursor. @@ -1811,7 +1875,8 @@ def displayname(self): return self._displayname @property - def mangled_name(self): + @cursor_null_guard + def mangled_name(self) -> str: """Return the mangled name for the entity referenced by this cursor.""" if not hasattr(self, "_mangled_name"): self._mangled_name = _CXString.from_result( @@ -1821,18 +1886,20 @@ def mangled_name(self): return self._mangled_name @property - def location(self): + @cursor_null_guard + def location(self) -> SourceLocation: """ Return the source location (the starting character) of the entity pointed at by the cursor. 
""" if not hasattr(self, "_loc"): - self._loc = conf.lib.clang_getCursorLocation(self) + self._loc: SourceLocation = conf.lib.clang_getCursorLocation(self) return self._loc @property - def linkage(self): + @cursor_null_guard + def linkage(self) -> LinkageKind: """Return the linkage of this cursor.""" if not hasattr(self, "_linkage"): self._linkage = conf.lib.clang_getCursorLinkage(self) @@ -1840,7 +1907,8 @@ def linkage(self): return LinkageKind.from_id(self._linkage) @property - def tls_kind(self): + @cursor_null_guard + def tls_kind(self) -> TLSKind: """Return the thread-local storage (TLS) kind of this cursor.""" if not hasattr(self, "_tls_kind"): self._tls_kind = conf.lib.clang_getCursorTLSKind(self) @@ -1848,18 +1916,20 @@ def tls_kind(self): return TLSKind.from_id(self._tls_kind) @property - def extent(self): + @cursor_null_guard + def extent(self) -> SourceRange: """ Return the source range (the range of text) occupied by the entity pointed at by the cursor. """ if not hasattr(self, "_extent"): - self._extent = conf.lib.clang_getCursorExtent(self) + self._extent: SourceRange = conf.lib.clang_getCursorExtent(self) return self._extent @property - def storage_class(self): + @cursor_null_guard + def storage_class(self) -> StorageClass: """ Retrieves the storage class (if any) of the entity pointed at by the cursor. @@ -1870,7 +1940,8 @@ def storage_class(self): return StorageClass.from_id(self._storage_class) @property - def availability(self): + @cursor_null_guard + def availability(self) -> AvailabilityKind: """ Retrieves the availability of the entity pointed at by the cursor. 
""" @@ -1880,7 +1951,8 @@ def availability(self): return AvailabilityKind.from_id(self._availability) @property - def binary_operator(self): + @cursor_null_guard + def binary_operator(self) -> BinaryOperator: """ Retrieves the opcode if this cursor points to a binary operator :return: @@ -1892,7 +1964,8 @@ def binary_operator(self): return BinaryOperator.from_id(self._binopcode) @property - def access_specifier(self): + @cursor_null_guard + def access_specifier(self) -> AccessSpecifier: """ Retrieves the access specifier (if any) of the entity pointed at by the cursor. @@ -1903,7 +1976,8 @@ def access_specifier(self): return AccessSpecifier.from_id(self._access_specifier) @property - def type(self): + @cursor_null_guard + def type(self) -> Type: """ Retrieve the Type (if any) of the entity pointed at by the cursor. """ @@ -1913,7 +1987,8 @@ def type(self): return self._type @property - def canonical(self): + @cursor_null_guard + def canonical(self) -> Cursor: """Return the canonical Cursor corresponding to this Cursor. The canonical cursor is the cursor which is representative for the @@ -1922,14 +1997,15 @@ def canonical(self): declarations will be identical. """ if not hasattr(self, "_canonical"): - self._canonical = Cursor.from_cursor_result( + self._canonical = Cursor.from_non_null_cursor_result( conf.lib.clang_getCanonicalCursor(self), self ) return self._canonical @property - def result_type(self): + @cursor_null_guard + def result_type(self) -> Type: """Retrieve the Type of the result for this Cursor.""" if not hasattr(self, "_result_type"): self._result_type = Type.from_result( @@ -1939,7 +2015,8 @@ def result_type(self): return self._result_type @property - def exception_specification_kind(self): + @cursor_null_guard + def exception_specification_kind(self) -> ExceptionSpecificationKind: """ Retrieve the exception specification kind, which is one of the values from the ExceptionSpecificationKind enumeration. 
@@ -1953,7 +2030,8 @@ def exception_specification_kind(self): return self._exception_specification_kind @property - def underlying_typedef_type(self): + @cursor_null_guard + def underlying_typedef_type(self) -> Type: """Return the underlying type of a typedef declaration. Returns a Type for the typedef this cursor is a declaration for. If @@ -1968,7 +2046,8 @@ def underlying_typedef_type(self): return self._underlying_type @property - def enum_type(self): + @cursor_null_guard + def enum_type(self) -> Type: """Return the integer type of an enum declaration. Returns a Type corresponding to an integer. If the cursor is not for an @@ -1983,9 +2062,11 @@ def enum_type(self): return self._enum_type @property - def enum_value(self): + @cursor_null_guard + def enum_value(self) -> int: """Return the value of an enum constant.""" if not hasattr(self, "_enum_value"): + self._enum_value: int assert self.kind == CursorKind.ENUM_CONSTANT_DECL # Figure out the underlying type of the enum to know if it # is a signed or unsigned quantity. 
@@ -2009,7 +2090,8 @@ def enum_value(self): return self._enum_value @property - def objc_type_encoding(self): + @cursor_null_guard + def objc_type_encoding(self) -> str: """Return the Objective-C type encoding as a str.""" if not hasattr(self, "_objc_type_encoding"): self._objc_type_encoding = _CXString.from_result( @@ -2019,15 +2101,17 @@ def objc_type_encoding(self): return self._objc_type_encoding @property - def hash(self): + @cursor_null_guard + def hash(self) -> int: """Returns a hash of the cursor as an int.""" if not hasattr(self, "_hash"): - self._hash = conf.lib.clang_hashCursor(self) + self._hash: int = conf.lib.clang_hashCursor(self) return self._hash @property - def semantic_parent(self): + @cursor_null_guard + def semantic_parent(self) -> Cursor | None: """Return the semantic parent for this cursor.""" if not hasattr(self, "_semantic_parent"): self._semantic_parent = Cursor.from_cursor_result( @@ -2037,7 +2121,8 @@ def semantic_parent(self): return self._semantic_parent @property - def lexical_parent(self): + @cursor_null_guard + def lexical_parent(self) -> Cursor | None: """Return the lexical parent for this cursor.""" if not hasattr(self, "_lexical_parent"): self._lexical_parent = Cursor.from_cursor_result( @@ -2047,6 +2132,7 @@ def lexical_parent(self): return self._lexical_parent @property + @cursor_null_guard def specialized_template(self) -> Cursor | None: """Return the primary template that this cursor is a specialization of, if any.""" return Cursor.from_cursor_result( @@ -2054,14 +2140,16 @@ def specialized_template(self) -> Cursor | None: ) @property - def translation_unit(self): + @cursor_null_guard + def translation_unit(self) -> TranslationUnit: """Returns the TranslationUnit to which this Cursor belongs.""" # If this triggers an AttributeError, the instance was not properly # created. 
return self._tu @property - def referenced(self): + @cursor_null_guard + def referenced(self) -> Cursor | None: """ For a cursor that is a reference, returns a cursor representing the entity that it references. @@ -2074,54 +2162,62 @@ def referenced(self): return self._referenced @property - def brief_comment(self): + @cursor_null_guard + def brief_comment(self) -> str: """Returns the brief comment text associated with that Cursor""" return _CXString.from_result(conf.lib.clang_Cursor_getBriefCommentText(self)) @property - def raw_comment(self): + @cursor_null_guard + def raw_comment(self) -> str: """Returns the raw comment text associated with that Cursor""" return _CXString.from_result(conf.lib.clang_Cursor_getRawCommentText(self)) - def get_arguments(self): + @cursor_null_guard + def get_arguments(self) -> Iterator[Cursor | None]: """Return an iterator for accessing the arguments of this cursor.""" num_args = conf.lib.clang_Cursor_getNumArguments(self) for i in range(0, num_args): yield Cursor.from_result(conf.lib.clang_Cursor_getArgument(self, i), self) - def get_num_template_arguments(self): + @cursor_null_guard + def get_num_template_arguments(self) -> int: """Returns the number of template args associated with this cursor.""" return conf.lib.clang_Cursor_getNumTemplateArguments(self) # type: ignore [no-any-return] - def get_template_argument_kind(self, num): + @cursor_null_guard + def get_template_argument_kind(self, num: int) -> TemplateArgumentKind: """Returns the TemplateArgumentKind for the indicated template argument.""" return TemplateArgumentKind.from_id( conf.lib.clang_Cursor_getTemplateArgumentKind(self, num) ) - def get_template_argument_type(self, num): + @cursor_null_guard + def get_template_argument_type(self, num: int) -> Type: """Returns the CXType for the indicated template argument.""" return Type.from_result( conf.lib.clang_Cursor_getTemplateArgumentType(self, num), (self, num) ) - def get_template_argument_value(self, num): + 
@cursor_null_guard + def get_template_argument_value(self, num: int) -> int: """Returns the value of the indicated arg as a signed 64b integer.""" return conf.lib.clang_Cursor_getTemplateArgumentValue(self, num) # type: ignore [no-any-return] - def get_template_argument_unsigned_value(self, num): + @cursor_null_guard + def get_template_argument_unsigned_value(self, num: int) -> int: """Returns the value of the indicated arg as an unsigned 64b integer.""" return conf.lib.clang_Cursor_getTemplateArgumentUnsignedValue(self, num) # type: ignore [no-any-return] - def get_children(self): + @cursor_null_guard + def get_children(self) -> Iterator[Cursor]: """Return an iterator for accessing the children of this cursor.""" # FIXME: Expose iteration from CIndex, PR6125. - def visitor(child, parent, children): + def visitor(child: Cursor, _: Cursor, children: list[Cursor]) -> int: # FIXME: Document this assertion in API. - # FIXME: There should just be an isNull method. - assert child != conf.lib.clang_getNullCursor() + assert not child.is_null() # Create reference to TU so it isn't GC'd before Cursor. child._tu = self._tu @@ -2132,7 +2228,8 @@ def visitor(child, parent, children): conf.lib.clang_visitChildren(self, cursor_visit_callback(visitor), children) return iter(children) - def walk_preorder(self): + @cursor_null_guard + def walk_preorder(self) -> Iterator[Cursor]: """Depth-first preorder walk over the cursor and its descendants. Yields cursors. @@ -2142,7 +2239,8 @@ def walk_preorder(self): for descendant in child.walk_preorder(): yield descendant - def get_tokens(self): + @cursor_null_guard + def get_tokens(self) -> Iterator[Token]: """Obtain Token instances formulating that compose this Cursor. This is a generator for Token instances. 
It returns all tokens which @@ -2150,19 +2248,23 @@ def get_tokens(self): """ return TokenGroup.get_tokens(self._tu, self.extent) - def get_field_offsetof(self): + @cursor_null_guard + def get_field_offsetof(self) -> int: """Returns the offsetof the FIELD_DECL pointed by this Cursor.""" return conf.lib.clang_Cursor_getOffsetOfField(self) # type: ignore [no-any-return] - def get_base_offsetof(self, parent): + @cursor_null_guard + def get_base_offsetof(self, parent: Cursor) -> int: """Returns the offsetof the CXX_BASE_SPECIFIER pointed by this Cursor.""" return conf.lib.clang_getOffsetOfBase(parent, self) # type: ignore [no-any-return] - def is_virtual_base(self): + @cursor_null_guard + def is_virtual_base(self) -> bool: """Returns whether the CXX_BASE_SPECIFIER pointed by this Cursor is virtual.""" return conf.lib.clang_isVirtualBase(self) # type: ignore [no-any-return] - def is_anonymous(self): + @cursor_null_guard + def is_anonymous(self) -> bool: """ Check whether this is a record type without a name, or a field where the type is a record type without a name. @@ -2174,7 +2276,8 @@ def is_anonymous(self): return self.type.get_declaration().is_anonymous() return conf.lib.clang_Cursor_isAnonymous(self) # type: ignore [no-any-return] - def is_anonymous_record_decl(self): + @cursor_null_guard + def is_anonymous_record_decl(self) -> bool: """ Check if the record is an anonymous union as defined in the C/C++ standard (or an "anonymous struct", the corresponding non-standard extension for @@ -2184,18 +2287,21 @@ def is_anonymous_record_decl(self): return self.type.get_declaration().is_anonymous_record_decl() return conf.lib.clang_Cursor_isAnonymousRecordDecl(self) # type: ignore [no-any-return] - def is_bitfield(self): + @cursor_null_guard + def is_bitfield(self) -> bool: """ Check if the field is a bitfield. 
""" return conf.lib.clang_Cursor_isBitField(self) # type: ignore [no-any-return] - def get_bitfield_width(self): + @cursor_null_guard + def get_bitfield_width(self) -> int: """ Retrieve the width of a bitfield. """ return conf.lib.clang_getFieldDeclBitWidth(self) # type: ignore [no-any-return] + @cursor_null_guard def has_attrs(self) -> bool: """ Determine whether the given cursor has any attributes. @@ -2203,10 +2309,9 @@ def has_attrs(self) -> bool: return bool(conf.lib.clang_Cursor_hasAttrs(self)) @staticmethod - def from_result(res, arg): + def from_result(res: Cursor, arg: Cursor | TranslationUnit | Type) -> Cursor | None: assert isinstance(res, Cursor) - # FIXME: There should just be an isNull method. - if res == conf.lib.clang_getNullCursor(): + if res.is_null(): return None # Store a reference to the TU in the Python object so it won't get GC'd @@ -2223,14 +2328,22 @@ def from_result(res, arg): return res @staticmethod - def from_cursor_result(res, arg): + def from_cursor_result(res: Cursor, arg: Cursor) -> Cursor | None: assert isinstance(res, Cursor) - if res == conf.lib.clang_getNullCursor(): + if res.is_null(): return None res._tu = arg._tu return res + @staticmethod + def from_non_null_cursor_result(res: Cursor, arg: Cursor | Type) -> Cursor: + assert isinstance(res, Cursor) + assert not res.is_null() + + res._tu = arg._tu + return res + class BinaryOperator(BaseEnumeration): """ @@ -2664,7 +2777,9 @@ def get_declaration(self): """ Return the cursor for the declaration of the given type. 
""" - return Cursor.from_result(conf.lib.clang_getTypeDeclaration(self), self) + return Cursor.from_non_null_cursor_result( + conf.lib.clang_getTypeDeclaration(self), self + ) def get_result(self): """ @@ -2724,7 +2839,7 @@ def get_fields(self): """Return an iterator for accessing the fields of this type.""" def visitor(field, children): - assert field != conf.lib.clang_getNullCursor() + assert not field.is_null() # Create reference to TU so it isn't GC'd before Cursor. field._tu = self._tu @@ -2739,7 +2854,7 @@ def get_bases(self): """Return an iterator for accessing the base classes of this type.""" def visitor(base, children): - assert base != conf.lib.clang_getNullCursor() + assert not base.is_null() # Create reference to TU so it isn't GC'd before Cursor. base._tu = self._tu @@ -2754,7 +2869,7 @@ def get_methods(self): """Return an iterator for accessing the methods of this type.""" def visitor(method, children): - assert method != conf.lib.clang_getNullCursor() + assert not method.is_null() # Create reference to TU so it isn't GC'd before Cursor. 
method._tu = self._tu @@ -3952,6 +4067,7 @@ def set_property(self, property, value): ("clang_equalRanges", [SourceRange, SourceRange], bool), ("clang_equalTypes", [Type, Type], bool), ("clang_formatDiagnostic", [Diagnostic, c_uint], _CXString), + ("clang_getAddressSpace", [Type], c_uint), ("clang_getArgType", [Type, c_uint], Type), ("clang_getArrayElementType", [Type], Type), ("clang_getArraySize", [Type], c_longlong), @@ -3970,8 +4086,10 @@ def set_property(self, property, value): ("clang_getCursorAvailability", [Cursor], c_int), ("clang_getCursorDefinition", [Cursor], Cursor), ("clang_getCursorDisplayName", [Cursor], _CXString), + ("clang_getCursorExceptionSpecificationType", [Cursor], c_int), ("clang_getCursorExtent", [Cursor], SourceRange), ("clang_getCursorLexicalParent", [Cursor], Cursor), + ("clang_getCursorLinkage", [Cursor], c_int), ("clang_getCursorLocation", [Cursor], SourceLocation), ("clang_getCursorPrettyPrinted", [Cursor, PrintingPolicy], _CXString), ("clang_getCursorPrintingPolicy", [Cursor], c_object_p), @@ -3980,6 +4098,7 @@ def set_property(self, property, value): ("clang_getCursorResultType", [Cursor], Type), ("clang_getCursorSemanticParent", [Cursor], Cursor), ("clang_getCursorSpelling", [Cursor], _CXString), + ("clang_getCursorTLSKind", [Cursor], c_int), ("clang_getCursorType", [Cursor], Type), ("clang_getCursorUSR", [Cursor], _CXString), ("clang_Cursor_getMangling", [Cursor], _CXString), @@ -4005,6 +4124,7 @@ def set_property(self, property, value): ("clang_getEnumConstantDeclUnsignedValue", [Cursor], c_ulonglong), ("clang_getEnumConstantDeclValue", [Cursor], c_longlong), ("clang_getEnumDeclIntegerType", [Cursor], Type), + ("clang_getExceptionSpecificationType", [Type], c_int), ("clang_getFile", [TranslationUnit, c_interop_string], c_object_p), ("clang_getFileName", [File], _CXString), ("clang_getFileTime", [File], c_uint), @@ -4101,6 +4221,7 @@ def set_property(self, property, value): ("clang_Cursor_getBriefCommentText", [Cursor], 
_CXString), ("clang_Cursor_getRawCommentText", [Cursor], _CXString), ("clang_Cursor_getOffsetOfField", [Cursor], c_longlong), + ("clang_Cursor_getStorageClass", [Cursor], c_int), ("clang_Cursor_isAnonymous", [Cursor], bool), ("clang_Cursor_isAnonymousRecordDecl", [Cursor], bool), ("clang_Cursor_isBitField", [Cursor], bool), @@ -4224,6 +4345,7 @@ def set_compatibility_check(check_status: bool) -> None: def lib(self) -> CDLL: lib = self.get_cindex_library() register_functions(lib, not Config.compatibility_check) + self.null_cursor = lib.clang_getNullCursor() Config.loaded = True return lib diff --git a/clang/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py index b90a0495ca7be..eb0d1d50601a6 100644 --- a/clang/bindings/python/tests/cindex/test_cursor.py +++ b/clang/bindings/python/tests/cindex/test_cursor.py @@ -12,6 +12,7 @@ TemplateArgumentKind, TranslationUnit, TypeKind, + conf, ) if "CLANG_LIBRARY_PATH" in os.environ: @@ -1050,3 +1051,16 @@ def test_equality(self): self.assertEqual(cursor1, cursor1_2) self.assertNotEqual(cursor1, cursor2) self.assertNotEqual(cursor1, "foo") + + def test_null_cursor(self): + tu = get_tu("int a = 729;") + + for cursor in tu.cursor.walk_preorder(): + self.assertFalse(cursor.is_null()) + + nc = conf.lib.clang_getNullCursor() + self.assertTrue(nc.is_null()) + with self.assertRaises(Exception): + nc.is_definition() + with self.assertRaises(Exception): + nc.spelling diff --git a/clang/bindings/python/tests/cindex/test_lib.py b/clang/bindings/python/tests/cindex/test_lib.py new file mode 100644 index 0000000000000..5e88ebf9d8448 --- /dev/null +++ b/clang/bindings/python/tests/cindex/test_lib.py @@ -0,0 +1,31 @@ +import os + +import clang.cindex + +if "CLANG_LIBRARY_PATH" in os.environ: + clang.cindex.Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) + +import unittest +import ast + + +class TestLib(unittest.TestCase): + def test_functions_registered(self): + def 
get_function_spelling(node): + # The call expressions we are interested in have + # their spelling in .attr, not .id + if hasattr(node, "attr"): + return node.attr + return "" + + filename = clang.cindex.__file__ + with open(filename) as file: + root = ast.parse(file.read()) + functions = [ + get_function_spelling(node.func) + for node in ast.walk(root) + if isinstance(node, ast.Call) + ] + used_functions = set([func for func in functions if func.startswith("clang_")]) + registered_functions = set([item[0] for item in clang.cindex.FUNCTION_LIST]) + self.assertEqual(used_functions - registered_functions, set()) diff --git a/clang/docs/CMakeLists.txt b/clang/docs/CMakeLists.txt index ca625efc6ccef..1f06c040c96cb 100644 --- a/clang/docs/CMakeLists.txt +++ b/clang/docs/CMakeLists.txt @@ -134,6 +134,34 @@ if (LLVM_ENABLE_SPHINX) gen_rst_file_from_td(DiagnosticsReference.rst -gen-diag-docs ../include/clang/Basic/Diagnostic.td "${docs_targets}") gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Driver/ClangOptionDocs.td "${docs_targets}") + # Another generated file from a different source + set(docs_tools_dir ${CMAKE_CURRENT_SOURCE_DIR}/tools) + set(aopts_rst_rel_path analyzer/user-docs/Options.rst) + set(aopts_rst "${CMAKE_CURRENT_BINARY_DIR}/${aopts_rst_rel_path}") + set(analyzeroptions_def "${CMAKE_CURRENT_SOURCE_DIR}/../include/clang/StaticAnalyzer/Core/AnalyzerOptions.def") + set(aopts_rst_in "${CMAKE_CURRENT_SOURCE_DIR}/${aopts_rst_rel_path}.in") + add_custom_command( + OUTPUT ${aopts_rst} + COMMAND ${Python3_EXECUTABLE} generate_analyzer_options_docs.py + --options-def "${analyzeroptions_def}" + --template "${aopts_rst_in}" + --out "${aopts_rst}" + WORKING_DIRECTORY ${docs_tools_dir} + VERBATIM + COMMENT "Generating ${aopts_rst}" + DEPENDS ${docs_tools_dir}/${generate_aopts_docs} + ${aopts_rst_in} + copy-clang-rst-docs + ) + add_custom_target(generate-analyzer-options-rst DEPENDS ${aopts_rst}) + foreach(target ${docs_targets}) + 
add_dependencies(${target} generate-analyzer-options-rst) + endforeach() + + # Technically this is redundant because generate-analyzer-options-rst + # depends on the copy operation (because it wants to drop a generated file + # into a subdirectory of the copied tree), but I'm leaving it here for the + # sake of clarity. foreach(target ${docs_targets}) add_dependencies(${target} copy-clang-rst-docs) endforeach() diff --git a/clang/docs/DebuggingCoroutines.rst b/clang/docs/DebuggingCoroutines.rst index 7f464c1f4f28c..80df321340724 100644 --- a/clang/docs/DebuggingCoroutines.rst +++ b/clang/docs/DebuggingCoroutines.rst @@ -255,7 +255,7 @@ However, when optimizations are enabled, the printed result changes drastically: {__resume_fn = 0x401280 , __destroy_fn = 0x401390 , __promise = {count = 1}, __int_32_0 = 43, __coro_index = 1 '\001'} Unused values are optimized out, as well as the name of the local variable `a`. -The only information remained is the value of a 32 bit integer. In this simple +The only information remained is the value of a 32-bit integer. In this simple case, it seems to be pretty clear that `__int_32_0` represents `a`. However, it is not true. diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst index 0c08b6a70b4ce..8a44db79a07ff 100644 --- a/clang/docs/InternalsManual.rst +++ b/clang/docs/InternalsManual.rst @@ -3560,7 +3560,7 @@ For example: // expected-note {{{evaluates to '{{2, 3, 4}} == {0, 3, 4}'}}} -The intent is to allow the delimeter to be wider than the longest `{` or `}` +The intent is to allow the delimiter to be wider than the longest `{` or `}` brace sequence in the content, so that if your expected text contains `{{{` (three braces) it may be delimited with `{{{{` (four braces), and so on. 
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index f56f2a640bb36..a40dd4d1a1673 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -497,7 +497,7 @@ __const, __const__, __volatile, __volatile__, __restrict, __restrict__ ---------------------------------------------------------------------- These are alternate spellings for their non-underscore counterparts, but are -available in all langauge modes. +available in all language modes. __decltype ---------- @@ -526,7 +526,7 @@ __typeof, __typeof__, __typeof_unqual, __typeof_unqual__ -------------------------------------------------------- ``__typeof`` and ``__typeof__`` are alternate spellings for ``typeof``, but are -available in all langauge modes. These spellings result in the operand, +available in all language modes. These spellings result in the operand, retaining all qualifiers. ``__typeof_unqual`` and ``__typeof_unqual__`` are alternate spellings for the @@ -2043,7 +2043,7 @@ references can be used instead of numeric references. } -Constexpr strings in GNU ASM statememts +Constexpr strings in GNU ASM statements ======================================= In C++11 mode (and greater), Clang supports specifying the template, @@ -4412,7 +4412,7 @@ It is undefined behavior to call this function on an already initialized A builtin function for the target-specific ``va_start`` function-like macro, available only in C23 and later. The builtin accepts zero or one argument for the ellipsis (``...``). If such an argument is provided, it should be the name -of the parameter preceeding the ellipsis, which is used for compatibility with +of the parameter preceding the ellipsis, which is used for compatibility with C versions before C23. It is an error to provide two or more variadic arguments. This function initializes the given ``__builtin_va_list`` object. 
It is undefined behavior to call this function on an already initialized diff --git a/clang/docs/Modules.rst b/clang/docs/Modules.rst index 69a45b7fd9ace..acbe45e0be970 100644 --- a/clang/docs/Modules.rst +++ b/clang/docs/Modules.rst @@ -613,10 +613,10 @@ tls A specific target feature (e.g., ``sse4``, ``avx``, ``neon``) is available. *platform/os* - A os/platform variant (e.g. ``freebsd``, ``win32``, ``windows``, ``linux``, ``ios``, ``macos``, ``iossimulator``) is available. + An os/platform variant (e.g. ``freebsd``, ``win32``, ``windows``, ``linux``, ``ios``, ``macos``, ``iossimulator``) is available. *environment* - A environment variant (e.g. ``gnu``, ``gnueabi``, ``android``, ``msvc``) is available. + An environment variant (e.g. ``gnu``, ``gnueabi``, ``android``, ``msvc``) is available. **Example:** The ``std`` module can be extended to also include C++ and C++11 headers using a *requires-declaration*: diff --git a/clang/docs/PointerAuthentication.rst b/clang/docs/PointerAuthentication.rst index b9341a9c3b6a8..913291c954447 100644 --- a/clang/docs/PointerAuthentication.rst +++ b/clang/docs/PointerAuthentication.rst @@ -554,7 +554,7 @@ with this idea: - It's unclear whether this kind of encryption is even possible without increasing the storage size of a signed pointer. If the storage size can be - increased, brute-force atacks can be equally well mitigated by simply storing + increased, brute-force attacks can be equally well mitigated by simply storing a larger signature. 
- It would likely be impossible to implement a ``strip`` operation, which might diff --git a/clang/docs/RealtimeSanitizer.rst b/clang/docs/RealtimeSanitizer.rst index f5d29af2bef3c..b842094445f5d 100644 --- a/clang/docs/RealtimeSanitizer.rst +++ b/clang/docs/RealtimeSanitizer.rst @@ -187,7 +187,7 @@ A **partial** list of flags RealtimeSanitizer respects: * - ``abort_on_error`` - OS dependent - boolean - - If true, the tool calls ``abort()`` instead of ``_exit()`` after printing the error report. On some OSes (MacOS, for exmple) this is beneficial because a better stack trace is emitted on crash. + - If true, the tool calls ``abort()`` instead of ``_exit()`` after printing the error report. On some OSes (MacOS, for example) this is beneficial because a better stack trace is emitted on crash. * - ``symbolize`` - ``true`` - boolean @@ -279,7 +279,7 @@ In general, ``ScopedDisabler`` should be preferred, as it is the most performant - Run-time - Stack - High - - Suppresses any stack trace contaning the specified pattern. + - Suppresses any stack trace containing the specified pattern. ``ScopedDisabler`` diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index bc13d02e2d20b..537f29521fb7f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -58,6 +58,8 @@ C++ Specific Potentially Breaking Changes - The type trait builtin ``__is_referenceable`` has been removed, since it has very few users and all the type traits that could benefit from it in the standard library already have their own bespoke builtins. +- A workaround for libstdc++4.7 has been removed. Note that 4.8.3 remains the oldest + supported libstdc++ version. ABI Changes in This Version --------------------------- @@ -77,6 +79,11 @@ Clang Frontend Potentially Breaking Changes Clang Python Bindings Potentially Breaking Changes -------------------------------------------------- +- ``Cursor.from_location`` now returns ``None`` instead of a null cursor. 
+ This eliminates the last known source of null cursors. +- Almost all ``Cursor`` methods now assert that they are called on non-null cursors. + Most of the time null cursors were mapped to ``None``, + so no widespread breakages are expected. What's New in Clang |release|? ============================== @@ -118,6 +125,9 @@ C++23 Feature Support C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ +- Fixed a crash with a defaulted spaceship (``<=>``) operator when the class + contains a member declaration of vector type. Vector types cannot yet be + compared directly, so this causes the operator to be deleted. (#GH137452) C++17 Feature Support ^^^^^^^^^^^^^^^^^^^^^ @@ -303,6 +313,8 @@ New Compiler Flags - New option ``-ftime-report-json`` added which outputs the same timing data as ``-ftime-report`` but formatted as JSON. +- New option ``-Wnrvo`` added and disabled by default to warn about missed NRVO opportunites. + Deprecated Compiler Flags ------------------------- @@ -477,7 +489,7 @@ Improvements to Clang's diagnostics - An error is now emitted when a ``musttail`` call is made to a function marked with the ``not_tail_called`` attribute. (#GH133509). -- ``-Whigher-precisision-for-complex-divison`` warns when: +- ``-Whigher-precision-for-complex-divison`` warns when: - The divisor is complex. - When the complex division happens in a higher precision type due to arithmetic promotion. @@ -515,14 +527,36 @@ Improvements to Clang's diagnostics - Several compatibility diagnostics that were incorrectly being grouped under ``-Wpre-c++20-compat`` are now part of ``-Wc++20-compat``. (#GH138775) -- Improved the ``-Wtautological-overlap-compare`` diagnostics to warn about overlapping and non-overlapping ranges involving character literals and floating-point literals. +- Improved the ``-Wtautological-overlap-compare`` diagnostics to warn about overlapping and non-overlapping ranges involving character literals and floating-point literals. 
The warning message for non-overlapping cases has also been improved (#GH13473). - Fixed a duplicate diagnostic when performing typo correction on function template calls with explicit template arguments. (#GH139226) -- An error is now emitted when OpenMP ``collapse`` and ``ordered`` clauses have an - argument larger than what can fit within a 64-bit integer. +- Explanatory note is printed when ``assert`` fails during evaluation of a + constant expression. Prior to this, the error inaccurately implied that assert + could not be used at all in a constant expression (#GH130458) + +- A new off-by-default warning ``-Wms-bitfield-padding`` has been added to alert to cases where bit-field + packing may differ under the MS struct ABI (#GH117428). + +- ``-Watomic-access`` no longer fires on unreachable code. e.g., + + .. code-block:: c + + _Atomic struct S { int a; } s; + void func(void) { + if (0) + s.a = 12; // Previously diagnosed with -Watomic-access, now silenced + s.a = 12; // Still diagnosed with -Watomic-access + return; + s.a = 12; // Previously diagnosed, now silenced + } + + +- A new ``-Wcharacter-conversion`` warns where comparing or implicitly converting + between different Unicode character types (``char8_t``, ``char16_t``, ``char32_t``). + This warning only triggers in C++ as these types are aliases in C. (#GH138526) Improvements to Clang's time-trace ---------------------------------- @@ -582,14 +616,20 @@ Bug Fixes in This Version ``#include`` directive. (#GH138094) - Fixed a crash during constant evaluation involving invalid lambda captures (#GH138832) +- Fixed a crash when instantiating an invalid dependent friend template specialization. + (#GH139052) - Fixed a crash with an invalid member function parameter list with a default argument which contains a pragma. (#GH113722) - Fixed assertion failures when generating name lookup table in modules. (#GH61065, #GH134739) +- Fixed an assertion failure in constant compound literal statements. 
(#GH139160) +- Fix crash due to unknown references and pointer implementation and handling of + base classes. (GH139452) +- Fixed an assertion failure in serialization of constexpr structs containing unions. (#GH140130) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- The behvaiour of ``__add_pointer`` and ``__remove_pointer`` for Objective-C++'s ``id`` and interfaces has been fixed. +- The behaviour of ``__add_pointer`` and ``__remove_pointer`` for Objective-C++'s ``id`` and interfaces has been fixed. - The signature for ``__builtin___clear_cache`` was changed from ``void(char *, char *)`` to ``void(void *, void *)`` to match GCC's signature @@ -664,7 +704,7 @@ Bug Fixes to C++ Support not in the last position. - Disallow overloading on struct vs class on dependent types, which is IFNDR, as this makes the problem diagnosable. -- Improved preservation of the presence or abscence of typename specifier when +- Improved preservation of the presence or absence of typename specifier when printing types in diagnostics. - Clang now correctly parses ``if constexpr`` expressions in immediate function context. (#GH123524) - Fixed an assertion failure affecting code that uses C++23 "deducing this". (#GH130272) @@ -697,6 +737,9 @@ Bug Fixes to C++ Support - Fixed the handling of pack indexing types in the constraints of a member function redeclaration. (#GH138255) - Clang now correctly parses arbitrary order of ``[[]]``, ``__attribute__`` and ``alignas`` attributes for declarations (#GH133107) - Fixed a crash when forming an invalid function type in a dependent context. (#GH138657) (#GH115725) (#GH68852) +- Fixed a function declaration mismatch that caused inconsistencies between concepts and variable template declarations. (#GH139476) +- Clang no longer segfaults when there is a configuration mismatch between modules and their users (http://crbug.com/400353616). 
+- Fix an incorrect deduction when calling an explicit object member function template through an overload set address. Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -796,6 +839,8 @@ RISC-V Support - `Zicsr` / `Zifencei` are allowed to be duplicated in the presence of `g` in `-march`. +- Add support for the `__builtin_riscv_pause()` intrinsic from the `Zihintpause` extension. + CUDA/HIP Language Changes ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -845,6 +890,11 @@ clang-format - Add ``OneLineFormatOffRegex`` option for turning formatting off for one line. - Add ``SpaceAfterOperatorKeyword`` option. +clang-refactor +-------------- +- Reject `0` as column or line number in 1-based command-line source locations. + Fixes crash caused by `0` input in `-selection=::[-:]`. (#GH139457) + libclang -------- - Fixed a bug in ``clang_File_isEqual`` that sometimes led to different @@ -863,6 +913,8 @@ libclang Code Completion --------------- +- Reject `0` as column or line number in 1-based command-line source locations. + Fixes crash caused by `0` input in `-code-completion-at=::`. (#GH139457) Static Analyzer --------------- @@ -931,6 +983,12 @@ OpenMP Support - Fixed a crashing bug with a malformed ``cancel`` directive. (#GH139360) - Fixed a crashing bug with ``omp distribute dist_schedule`` if the argument to ``dist_schedule`` was not strictly positive. (#GH139266) +- Fixed two crashing bugs with a malformed ``metadirective`` directive. One was + a crash if the next token after ``metadirective`` was a paren, bracket, or + brace. The other was if the next token after the meta directive was not an + open parenthesis. (#GH139665) +- An error is now emitted when OpenMP ``collapse`` and ``ordered`` clauses have + an argument larger than what can fit within a 64-bit integer. 
Improvements ^^^^^^^^^^^^ diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst index 2ca014f3fd831..933a57ff34dd9 100644 --- a/clang/docs/StandardCPlusPlusModules.rst +++ b/clang/docs/StandardCPlusPlusModules.rst @@ -709,7 +709,7 @@ Before Clang 19, a change in BMI of any (transitive) dependency would cause the outputs of the BMI to change. Starting with Clang 19, changes to non-direct dependencies should not directly affect the output BMI, unless they affect the results of the compilations. We expect that there are many more opportunities -for this optimization than we currently have realized and would appreaciate +for this optimization than we currently have realized and would appreciate feedback about missed optimization opportunities. For example, .. code-block:: c++ diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 6f804a10748d8..d3a0a3a46db33 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -921,7 +921,7 @@ Clang options that don't fit neatly into other categories. Instruct clang not to emit the signature string for blocks. Disabling the string can potentially break existing code that relies on it. Users should - carefully consider this possibiilty when using the flag. + carefully consider this possibility when using the flag. .. _configuration-files: @@ -2792,6 +2792,9 @@ usual build cycle when using sample profilers for optimization: $ llvm-profgen --binary=./code --output=code.prof --perfdata=perf.data + Please note, ``perf.data`` must be collected with ``-b`` flag to Linux ``perf`` + for the above step to work. + When using SEP the output is in the textual format corresponding to ``llvm-profgen --perfscript``. For example: @@ -3195,7 +3198,7 @@ indexed format, regardeless whether it is produced by frontend or the IR pass. .. 
option:: -fprofile-continuous Enables the continuous instrumentation profiling where profile counter updates - are continuously synced to a file. This option sets any neccessary modifiers + are continuously synced to a file. This option sets any necessary modifiers (currently ``%c``) in the default profile filename and passes any necessary flags to the middle-end to support this mode. Value profiling is not supported in continuous mode. @@ -3324,7 +3327,7 @@ on the ``-fprofile-generate`` and the ``-fprofile-use`` flags. * ``__LLVM_INSTR_PROFILE_USE``: defined when one of ``-fprofile-use``/``-fprofile-instr-use`` is in effect. -The two macros can be used to provide more flexibiilty so a user program +The two macros can be used to provide more flexibility so a user program can execute code specifically intended for profile generate or profile use. For example, a user program can have special logging during profile generate: @@ -4151,7 +4154,7 @@ There is a set of concrete HW architectures that OpenCL can be compiled for. Generic Targets ^^^^^^^^^^^^^^^ -- A SPIR-V binary can be produced for 32 or 64 bit targets. +- A SPIR-V binary can be produced for 32- or 64-bit targets. .. code-block:: console diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index f91b2af1fd105..c2ae80c47eca1 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -1460,7 +1460,7 @@ overflow occurs), the checker assumes that the the index (more precisely, the memory offeset) is within bounds. However, if :ref:`optin-taint-GenericTaint` is enabled and the index/offset is -tainted (i.e. it is influenced by an untrusted souce), then this checker +tainted (i.e. it is influenced by an untrusted source), then this checker reports the potential out of bounds access: .. code-block:: c @@ -2710,7 +2710,7 @@ Check for proper uses of CFNumber APIs. 
CFNumberRef test(unsigned char x) { return CFNumberCreate(0, kCFNumberSInt16Type, &x); - // warn: 8 bit integer is used to initialize a 16 bit integer + // warn: 8-bit integer is used to initialize a 16-bit integer } .. _osx-coreFoundation-CFRetainRelease: diff --git a/clang/docs/analyzer/user-docs.rst b/clang/docs/analyzer/user-docs.rst index e265f033a2c54..67c1dfaa40965 100644 --- a/clang/docs/analyzer/user-docs.rst +++ b/clang/docs/analyzer/user-docs.rst @@ -8,6 +8,7 @@ Contents: user-docs/Installation user-docs/CommandLineUsage + user-docs/Options user-docs/UsingWithXCode user-docs/FilingBugs user-docs/CrossTranslationUnit diff --git a/clang/docs/analyzer/user-docs/CommandLineUsage.rst b/clang/docs/analyzer/user-docs/CommandLineUsage.rst index 59f8187f374a9..0252de80b788f 100644 --- a/clang/docs/analyzer/user-docs/CommandLineUsage.rst +++ b/clang/docs/analyzer/user-docs/CommandLineUsage.rst @@ -194,6 +194,8 @@ When compiling your application to run on the simulator, it is important that ** If you aren't certain which compiler Xcode uses to build your project, try just running ``xcodebuild`` (without **scan-build**). You should see the full path to the compiler that Xcode is using, and use that as an argument to ``--use-cc``. +.. _command-line-usage-CodeChecker: + CodeChecker ----------- diff --git a/clang/docs/analyzer/user-docs/Installation.rst b/clang/docs/analyzer/user-docs/Installation.rst index f1656fc80c2e3..d84007328e5dc 100644 --- a/clang/docs/analyzer/user-docs/Installation.rst +++ b/clang/docs/analyzer/user-docs/Installation.rst @@ -28,7 +28,7 @@ Packaged builds for other platforms may eventually be provided, but we need volu [Legacy] Using Packaged Builds ------------------------------ -To use the legacy pacakge builds, simply unpack it anywhere. If the build archive has the name **``checker-XXX.tar.bz2``** then the archive will expand to a directory called **``checker-XXX``**. 
You do not need to place this directory or the contents of this directory in any special place. Uninstalling the analyzer is as simple as deleting this directory. +To use the legacy package builds, simply unpack it anywhere. If the build archive has the name **``checker-XXX.tar.bz2``** then the archive will expand to a directory called **``checker-XXX``**. You do not need to place this directory or the contents of this directory in any special place. Uninstalling the analyzer is as simple as deleting this directory. Most of the files in the **``checker-XXX``** directory will be supporting files for the analyzer that you can simply ignore. Most users will only care about two files, which are located at the top of the **``checker-XXX``** directory: diff --git a/clang/docs/analyzer/user-docs/Options.rst.in b/clang/docs/analyzer/user-docs/Options.rst.in new file mode 100644 index 0000000000000..0d2883fb9ead1 --- /dev/null +++ b/clang/docs/analyzer/user-docs/Options.rst.in @@ -0,0 +1,114 @@ +======================== +Configuring the Analyzer +======================== + +The clang static analyzer supports two kinds of options: + +1. Global **analyzer options** influence the behavior of the analyzer engine. + They are documented on this page, in the section :ref:`List of analyzer + options`. +2. The **checker options** belong to individual checkers (e.g. + ``core.BitwiseShift:Pedantic`` and ``unix.Stream:Pedantic`` are completely + separate options) and customize the behavior of that particular checker. + These are documented within the documentation of each individual checker at + :doc:`../checkers`. + +Assigning values to options +=========================== + +With the compiler frontend +-------------------------- + +All options can be configured by using the ``-analyzer-config`` flag of ``clang +-cc1`` (the so-called *compiler frontend* part of clang). 
The values of the +options are specified with the syntax ``-analyzer-config +OPT=VAL,OPT2=VAL2,...`` which supports specifying multiple options, but +separate flags like ``-analyzer-config OPT=VAL -analyzer-config OPT2=VAL2`` are +also accepted (with equivalent behavior). Analyzer options and checker options +can be freely intermixed here because it's easy to recognize that checker +option names are always prefixed with ``some.groups.NameOfChecker:``. + +.. warning:: + This is an internal interface, one should prefer `clang --analyze ...` for + regular use. Clang does not intend to preserve backwards compatibility or + announce breaking changes within the flags accepted by ``clang -cc1`` + (but ``-analyzer-config`` survived many years without major changes). + +With the clang driver +--------------------- + +In a conventional workflow ``clang -cc1`` (which is a low-level internal +interface) is invoked indirectly by the clang *driver* (i.e. plain ``clang`` +without the ``-cc1`` flag), which acts as an "even more frontend" wrapper layer +around the ``clang -cc1`` *compiler frontend*. In this situation **each** +command line argument intended for the *compiler frontend* must be prefixed +with ``-Xclang``. + +For example the following command analyzes ``foo.c`` in :ref:`shallow mode +` with :ref:`loop unrolling +`: + +:: + + clang --analyze -Xclang -analyzer-config -Xclang mode=shallow,unroll-loops=true foo.c + +When this is executed, the *driver* will compose and execute the following +``clang -cc1`` command (which can be inspected by passing the ``-v`` flag to +the *driver*): + +:: + + clang -cc1 -analyze [...] -analyzer-config mode=shallow,unroll-loops=true foo.c + +Here ``[...]`` stands for dozens of low-level flags which ensure that ``clang +-cc1`` does the right thing (e.g. ``-fcolor-diagnostics`` when it's suitable; +``-analyzer-checker`` flags to enable the default set of checkers). 
Also +note the distinction that the ``clang`` *driver* requires ``--analyze`` (double +dashes) while the ``clang -cc1`` *compiler frontend* requires ``-analyze`` +(single dash). + +.. note:: + The flag ``-Xanalyzer`` is equivalent to ``-Xclang`` in these situations + (but doesn't forward other options of the clang frontend). + +With CodeChecker +---------------- + +If the analysis is performed through :ref:`CodeChecker +` (which e.g. supports the analysis of a whole +project instead of a single file) then it will act as another indirection +layer. CodeChecker provides separate command-line flags called +``--analyzer-config`` (for analyzer options) and ``--checker-config`` (for +checker options): + +:: + + CodeChecker analyze -o outdir --checker-config clangsa:unix.Stream:Pedantic=true \ + --analyzer-config clangsa:mode=shallow clangsa:unroll-loops=true \ + -- compile_commands.json + +These CodeChecker flags may be followed by multiple ``OPT=VAL`` pairs as +separate arguments (and this is why the example needs to use ``--`` before +``compile_commands.json``). The option names are all prefixed with ``clangsa:`` +to ensure that they are passed to the clang static analyzer (and not other +analyzer tools that are also supported by CodeChecker). + +.. _list-of-analyzer-options: + +List of analyzer options +======================== + +.. warning:: + These options are primarily intended for development purposes and + non-default values are usually unsupported. Changing their values may + drastically alter the behavior of the analyzer, and may even result in + instabilities or crashes! Crash reports are welcome and depending on the + severity they may be fixed. + +.. + The contents of this section are automatically generated by the script + clang/docs/tools/generate_analyzer_options_docs.py from the header file + AnalyzerOptions.def to ensure that the RST/web documentation is synchronized + with the command line help options. + +.. 
OPTIONS_LIST_PLACEHOLDER diff --git a/clang/docs/tools/generate_analyzer_options_docs.py b/clang/docs/tools/generate_analyzer_options_docs.py new file mode 100644 index 0000000000000..26c098d8514a0 --- /dev/null +++ b/clang/docs/tools/generate_analyzer_options_docs.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python3 +# A tool to automatically generate documentation for the config options of the +# clang static analyzer by reading `AnalyzerOptions.def`. + +import argparse +from collections import namedtuple +from enum import Enum, auto +import re +import sys +import textwrap + + +# The following code implements a trivial parser for the narrow subset of C++ +# which is used in AnalyzerOptions.def. This supports the following features: +# - ignores preprocessor directives, even if they are continued with \ at EOL +# - ignores comments: both /* ... */ and // ... +# - parses string literals (even if they contain \" escapes) +# - concatenates adjacent string literals +# - parses numbers even if they contain ' as a thousands separator +# - recognizes MACRO(arg1, arg2, ..., argN) calls + + +class TT(Enum): + "Token type enum." + number = auto() + ident = auto() + string = auto() + punct = auto() + + +TOKENS = [ + (re.compile(r"-?[0-9']+"), TT.number), + (re.compile(r"\w+"), TT.ident), + (re.compile(r'"([^\\"]|\\.)*"'), TT.string), + (re.compile(r"[(),]"), TT.punct), + (re.compile(r"/\*((?!\*/).)*\*/", re.S), None), # C-style comment + (re.compile(r"//.*\n"), None), # C++ style oneline comment + (re.compile(r"#.*(\\\n.*)*(?", which is + # OK for a terse command line printout, but should be prettified for web + # documentation. + # Moreover, the option ctu-invocation-list shows some example file content + # which is formatted as a preformatted block. 
+ paragraphs = [desc] + extra = "" + if m := re.search(r"(^|\s)Value:", desc): + err_handler.record_use_of_tweak("accepted values") + paragraphs = [desc[: m.start()], "Accepted values:" + desc[m.end() :]] + elif m := re.search(r"\s*Example file.content:", desc): + err_handler.record_use_of_tweak("example file content") + paragraphs = [desc[: m.start()]] + extra = "Example file content::\n\n " + desc[m.end() :] + "\n\n" + + wrapped = [textwrap.fill(p, width=80) for p in paragraphs if p.strip()] + + return "\n\n".join(wrapped + [""]) + extra + + +def default_to_rst(tok): + if tok.kind == TT.string: + if tok.code == '""': + return "(empty string)" + return tok.code + if tok.kind == TT.ident: + return tok.code + if tok.kind == TT.number: + return tok.code.replace("'", "") + raise ValueError(f"unexpected token as default value: {tok.kind.name}") + + +def defaults_to_rst_paragraph(defaults): + strs = [default_to_rst(d) for d in defaults] + + if len(strs) == 1: + return f"Default value: {strs[0]}\n\n" + if len(strs) == 2: + return ( + f"Default value: {strs[0]} (in shallow mode) / {strs[1]} (in deep mode)\n\n" + ) + raise ValueError("unexpected count of default values: %d" % len(defaults)) + + +def macro_call_to_rst_paragraphs(macro_call): + try: + arg_count = len(macro_call.args) + param_count = MACRO_NAMES_PARAMCOUNTS[macro_call.name] + if arg_count != param_count: + raise ValueError( + f"expected {param_count} arguments for {macro_call.name}, found {arg_count}" + ) + + _, _, cmdflag, desc, *defaults = macro_call.args + + return ( + cmdflag_to_rst_title(cmdflag) + + desc_to_rst_paragraphs(desc) + + defaults_to_rst_paragraph(defaults) + ) + except ValueError as ve: + err_handler.report_error(ve.args[0]) + return "" + + +def get_option_list(input_file): + with open(input_file, encoding="utf-8") as f: + contents = f.read() + tokens = join_strings(tokenize(contents)) + macro_calls = get_calls(tokens, MACRO_NAMES_PARAMCOUNTS) + + result = "" + for mc in macro_calls: + result 
+= macro_call_to_rst_paragraphs(mc) + return result + + +p = argparse.ArgumentParser() +p.add_argument("--options-def", help="path to AnalyzerOptions.def") +p.add_argument("--template", help="template file") +p.add_argument("--out", help="output file") +opts = p.parse_args() + +with open(opts.template, encoding="utf-8") as f: + doc_template = f.read() + +PLACEHOLDER = ".. OPTIONS_LIST_PLACEHOLDER\n" + +rst_output = doc_template.replace(PLACEHOLDER, get_option_list(opts.options_def)) + +err_handler.report_unused_tweaks() + +with open(opts.out, "w", newline="", encoding="utf-8") as f: + f.write(rst_output) + +if err_handler.seen_errors: + sys.exit(1) diff --git a/clang/include/clang/AST/ASTConcept.h b/clang/include/clang/AST/ASTConcept.h index 078e1e848f393..c8f6330a73bb1 100644 --- a/clang/include/clang/AST/ASTConcept.h +++ b/clang/include/clang/AST/ASTConcept.h @@ -93,11 +93,11 @@ struct ASTConstraintSatisfaction final : bool ContainsErrors : 1; const UnsatisfiedConstraintRecord *begin() const { - return getTrailingObjects(); + return getTrailingObjects(); } const UnsatisfiedConstraintRecord *end() const { - return getTrailingObjects() + NumRecords; + return getTrailingObjects() + NumRecords; } ASTConstraintSatisfaction(const ASTContext &C, diff --git a/clang/include/clang/AST/ASTDiagnostic.h b/clang/include/clang/AST/ASTDiagnostic.h index ef22249828629..baa410e3e4a03 100644 --- a/clang/include/clang/AST/ASTDiagnostic.h +++ b/clang/include/clang/AST/ASTDiagnostic.h @@ -38,6 +38,9 @@ namespace clang { /// is initialized before passing it in. 
QualType desugarForDiagnostic(ASTContext &Context, QualType QT, bool &ShouldAKA); + + std::string FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T); + } // end namespace clang #endif diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index f1013c57e008f..9290ff3764c8c 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -185,7 +185,7 @@ class PragmaCommentDecl final PragmaMSCommentKind getCommentKind() const { return CommentKind; } - StringRef getArg() const { return getTrailingObjects(); } + StringRef getArg() const { return getTrailingObjects(); } // Implement isa/cast/dyncast/etc. static bool classof(const Decl *D) { return classofKind(D->getKind()); } @@ -217,8 +217,8 @@ class PragmaDetectMismatchDecl final static PragmaDetectMismatchDecl * CreateDeserialized(ASTContext &C, GlobalDeclID ID, unsigned NameValueSize); - StringRef getName() const { return getTrailingObjects(); } - StringRef getValue() const { return getTrailingObjects() + ValueStart; } + StringRef getName() const { return getTrailingObjects(); } + StringRef getValue() const { return getTrailingObjects() + ValueStart; } // Implement isa/cast/dyncast/etc. static bool classof(const Decl *D) { return classofKind(D->getKind()); } @@ -1991,7 +1991,7 @@ class FunctionDecl : public DeclaratorDecl, /// Get the unqualified lookup results that should be used in this /// defaulted function definition. 
ArrayRef getUnqualifiedLookups() const { - return {getTrailingObjects(), NumLookups}; + return getTrailingObjects(NumLookups); } StringLiteral *getDeletedMessage() const { @@ -4780,13 +4780,9 @@ class OutlinedFunctionDecl final explicit OutlinedFunctionDecl(DeclContext *DC, unsigned NumParams); - ImplicitParamDecl *const *getParams() const { - return getTrailingObjects(); - } + ImplicitParamDecl *const *getParams() const { return getTrailingObjects(); } - ImplicitParamDecl **getParams() { - return getTrailingObjects(); - } + ImplicitParamDecl **getParams() { return getTrailingObjects(); } public: friend class ASTDeclReader; @@ -4857,13 +4853,9 @@ class CapturedDecl final explicit CapturedDecl(DeclContext *DC, unsigned NumParams); - ImplicitParamDecl *const *getParams() const { - return getTrailingObjects(); - } + ImplicitParamDecl *const *getParams() const { return getTrailingObjects(); } - ImplicitParamDecl **getParams() { - return getTrailingObjects(); - } + ImplicitParamDecl **getParams() { return getTrailingObjects(); } public: friend class ASTDeclReader; @@ -5187,12 +5179,10 @@ class HLSLRootSignatureDecl final unsigned NumElems; - llvm::hlsl::rootsig::RootElement *getElems() { - return getTrailingObjects(); - } + llvm::hlsl::rootsig::RootElement *getElems() { return getTrailingObjects(); } const llvm::hlsl::rootsig::RootElement *getElems() const { - return getTrailingObjects(); + return getTrailingObjects(); } HLSLRootSignatureDecl(DeclContext *DC, SourceLocation Loc, IdentifierInfo *ID, diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index fa58ed59484ad..d40d11cbe1a3b 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -2606,9 +2606,6 @@ class CXXConstructorDecl final size_t numTrailingObjects(OverloadToken) const { return CXXConstructorDeclBits.IsInheritingConstructor; } - size_t numTrailingObjects(OverloadToken) const { - return CXXConstructorDeclBits.HasTrailingExplicitSpecifier; - } 
ExplicitSpecifier getExplicitSpecifierInternal() const { if (CXXConstructorDeclBits.HasTrailingExplicitSpecifier) @@ -2625,8 +2622,12 @@ class CXXConstructorDecl final }; uint64_t getTrailingAllocKind() const { - return numTrailingObjects(OverloadToken()) | - (numTrailingObjects(OverloadToken()) << 1); + uint64_t Kind = 0; + if (CXXConstructorDeclBits.IsInheritingConstructor) + Kind |= TAKInheritsConstructor; + if (CXXConstructorDeclBits.HasTrailingExplicitSpecifier) + Kind |= TAKHasTailExplicit; + return Kind; } public: @@ -3864,7 +3865,7 @@ class UsingPackDecl final InstantiatedFrom ? InstantiatedFrom->getDeclName() : DeclarationName()), InstantiatedFrom(InstantiatedFrom), NumExpansions(UsingDecls.size()) { - llvm::uninitialized_copy(UsingDecls, getTrailingObjects()); + llvm::uninitialized_copy(UsingDecls, getTrailingObjects()); } void anchor() override; @@ -3882,7 +3883,7 @@ class UsingPackDecl final /// Get the set of using declarations that this pack expanded into. Note that /// some of these may still be unresolved. ArrayRef expansions() const { - return getTrailingObjects(NumExpansions); + return getTrailingObjects(NumExpansions); } static UsingPackDecl *Create(ASTContext &C, DeclContext *DC, @@ -4235,7 +4236,7 @@ class DecompositionDecl final : VarDecl(Decomposition, C, DC, StartLoc, LSquareLoc, nullptr, T, TInfo, SC), NumBindings(Bindings.size()) { - llvm::uninitialized_copy(Bindings, getTrailingObjects()); + llvm::uninitialized_copy(Bindings, getTrailingObjects()); for (auto *B : Bindings) { B->setDecomposedDecl(this); if (B->isParameterPack() && B->getBinding()) { @@ -4262,8 +4263,8 @@ class DecompositionDecl final unsigned NumBindings); // Provide the range of bindings which may have a nested pack. - llvm::ArrayRef bindings() const { - return {getTrailingObjects(), NumBindings}; + ArrayRef bindings() const { + return getTrailingObjects(NumBindings); } // Provide a flattened range to visit each binding. 
diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index 80c97681d9163..8d8b1ca938829 100644 --- a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -712,7 +712,7 @@ class DependentFunctionTemplateSpecializationInfo final /// Returns the candidates for the primary function template. ArrayRef getCandidates() const { - return {getTrailingObjects(), NumCandidates}; + return getTrailingObjects(NumCandidates); } }; @@ -1325,8 +1325,7 @@ class TemplateTypeParmDecl final : public TypeDecl, /// Returns the type constraint associated with this template parameter (if /// any). const TypeConstraint *getTypeConstraint() const { - return TypeConstraintInitialized ? getTrailingObjects() : - nullptr; + return TypeConstraintInitialized ? getTrailingObjects() : nullptr; } void setTypeConstraint(ConceptReference *CR, @@ -1711,7 +1710,7 @@ class TemplateTemplateParmDecl final /// pack. TemplateParameterList *getExpansionTemplateParameters(unsigned I) const { assert(I < NumExpandedParams && "Out-of-range expansion type index"); - return getTrailingObjects()[I]; + return getTrailingObjects()[I]; } const DefArgStorage &getDefaultArgStorage() const { return DefaultArgument; } @@ -1859,7 +1858,8 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl, /// This needs to be cached as deduction is performed during declaration, /// and we need the information to be preserved so that it is consistent /// during instantiation. 
- bool StrictPackMatch : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned StrictPackMatch : 1; protected: ClassTemplateSpecializationDecl(ASTContext &Context, Kind DK, TagKind TK, @@ -3254,8 +3254,7 @@ class ImplicitConceptSpecializationDecl final unsigned NumTemplateArgs); ArrayRef getTemplateArguments() const { - return ArrayRef(getTrailingObjects(), - NumTemplateArgs); + return getTrailingObjects(NumTemplateArgs); } void setTemplateArguments(ArrayRef Converted); diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index a83320a7ddec2..1e6749dda71fe 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -4566,9 +4566,11 @@ class ShuffleVectorExpr : public Expr { void setExprs(const ASTContext &C, ArrayRef Exprs); - llvm::APSInt getShuffleMaskIdx(const ASTContext &Ctx, unsigned N) const { + llvm::APSInt getShuffleMaskIdx(unsigned N) const { assert((N < NumExprs - 2) && "Shuffle idx out of range!"); - return getExpr(N+2)->EvaluateKnownConstInt(Ctx); + assert(isa(getExpr(N + 2)) && + "Index expression must be a ConstantExpr"); + return cast(getExpr(N + 2))->getAPValueResult().getInt(); } // Iterators diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h index fd101336acd9d..67fbdfeb0702f 100644 --- a/clang/include/clang/AST/OpenACCClause.h +++ b/clang/include/clang/AST/OpenACCClause.h @@ -293,7 +293,7 @@ class OpenACCDeviceTypeClause final "Only a single asterisk version is permitted, and must be the " "only one"); - llvm::uninitialized_copy(Archs, getTrailingObjects()); + llvm::uninitialized_copy(Archs, getTrailingObjects()); } public: @@ -307,8 +307,7 @@ class OpenACCDeviceTypeClause final } ArrayRef getArchitectures() const { - return ArrayRef( - getTrailingObjects(), NumArchs); + return getTrailingObjects(NumArchs); } static OpenACCDeviceTypeClause * @@ -421,9 +420,7 @@ class OpenACCSelfClause final // Intentionally internal, meant to be an implementation detail of everything 
// else. All non-internal uses should go through getConditionExpr/getVarList. - llvm::ArrayRef getExprs() const { - return {getTrailingObjects(), NumExprs}; - } + ArrayRef getExprs() const { return getTrailingObjects(NumExprs); } public: static bool classof(const OpenACCClause *C) { @@ -472,8 +469,8 @@ class OpenACCSelfClause final child_range children() { return child_range( - reinterpret_cast(getTrailingObjects()), - reinterpret_cast(getTrailingObjects() + NumExprs)); + reinterpret_cast(getTrailingObjects()), + reinterpret_cast(getTrailingObjects() + NumExprs)); } const_child_range children() const { @@ -506,6 +503,14 @@ class OpenACCClauseWithExprs : public OpenACCClauseWithParams { Exprs = NewExprs; } + /// Used only for initialization, the leaf class can initialize this to + /// trailing storage, and initialize the data in the trailing storage as well. + void setExprs(MutableArrayRef NewStorage, ArrayRef Exprs) { + assert(NewStorage.size() == Exprs.size()); + llvm::uninitialized_copy(Exprs, NewStorage.begin()); + setExprs(NewStorage); + } + /// Gets the entire list of expressions, but leave it to the /// individual clauses to expose this how they'd like. llvm::ArrayRef getExprs() const { return Exprs; } @@ -538,10 +543,10 @@ class OpenACCWaitClause final QueuesLoc(QueuesLoc) { // The first element of the trailing storage is always the devnum expr, // whether it is used or not. 
- auto *Exprs = getTrailingObjects(); + auto *Exprs = getTrailingObjects(); llvm::uninitialized_copy(ArrayRef(DevNumExpr), Exprs); llvm::uninitialized_copy(QueueIdExprs, Exprs + 1); - setExprs(getTrailingObjects(QueueIdExprs.size() + 1)); + setExprs(getTrailingObjects(QueueIdExprs.size() + 1)); } public: @@ -578,8 +583,7 @@ class OpenACCNumGangsClause final ArrayRef IntExprs, SourceLocation EndLoc) : OpenACCClauseWithExprs(OpenACCClauseKind::NumGangs, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(IntExprs, getTrailingObjects()); - setExprs(getTrailingObjects(IntExprs.size())); + setExprs(getTrailingObjects(IntExprs.size()), IntExprs); } public: @@ -607,8 +611,7 @@ class OpenACCTileClause final ArrayRef SizeExprs, SourceLocation EndLoc) : OpenACCClauseWithExprs(OpenACCClauseKind::Tile, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(SizeExprs, getTrailingObjects()); - setExprs(getTrailingObjects(SizeExprs.size())); + setExprs(getTrailingObjects(SizeExprs.size()), SizeExprs); } public: @@ -845,8 +848,7 @@ class OpenACCPrivateClause final ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::Private, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -867,8 +869,7 @@ class OpenACCFirstPrivateClause final ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::FirstPrivate, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -889,8 +890,7 @@ class OpenACCDevicePtrClause final ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::DevicePtr, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - 
setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -911,8 +911,7 @@ class OpenACCAttachClause final ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::Attach, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -933,8 +932,7 @@ class OpenACCDetachClause final ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::Detach, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -955,8 +953,7 @@ class OpenACCDeleteClause final ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::Delete, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -977,8 +974,7 @@ class OpenACCUseDeviceClause final ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::UseDevice, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -999,8 +995,7 @@ class OpenACCNoCreateClause final ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::NoCreate, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -1021,8 +1016,7 @@ class OpenACCPresentClause final ArrayRef VarList, SourceLocation EndLoc) : 
OpenACCClauseWithVarList(OpenACCClauseKind::Present, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -1042,8 +1036,7 @@ class OpenACCHostClause final ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::Host, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -1065,8 +1058,7 @@ class OpenACCDeviceClause final ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::Device, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -1093,8 +1085,7 @@ class OpenACCCopyClause final Spelling == OpenACCClauseKind::PCopy || Spelling == OpenACCClauseKind::PresentOrCopy) && "Invalid clause kind for copy-clause"); - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -1127,8 +1118,7 @@ class OpenACCCopyInClause final Spelling == OpenACCClauseKind::PCopyIn || Spelling == OpenACCClauseKind::PresentOrCopyIn) && "Invalid clause kind for copyin-clause"); - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -1160,8 +1150,7 @@ class OpenACCCopyOutClause final Spelling == OpenACCClauseKind::PCopyOut || Spelling == OpenACCClauseKind::PresentOrCopyOut) && "Invalid clause kind for copyout-clause"); - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + 
setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -1193,8 +1182,7 @@ class OpenACCCreateClause final Spelling == OpenACCClauseKind::PCreate || Spelling == OpenACCClauseKind::PresentOrCreate) && "Invalid clause kind for create-clause"); - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -1223,8 +1211,7 @@ class OpenACCReductionClause final : OpenACCClauseWithVarList(OpenACCClauseKind::Reduction, BeginLoc, LParenLoc, EndLoc), Op(Operator) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -1249,8 +1236,7 @@ class OpenACCLinkClause final ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::Link, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: @@ -1273,8 +1259,7 @@ class OpenACCDeviceResidentClause final ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(OpenACCClauseKind::DeviceResident, BeginLoc, LParenLoc, EndLoc) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); - setExprs(getTrailingObjects(VarList.size())); + setExprs(getTrailingObjects(VarList.size()), VarList); } public: diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index cdecc812f7fb9..6fd16bc0f03be 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -295,8 +295,7 @@ template class OMPVarListClause : public OMPClause { /// Fetches list of variables associated with this clause. 
MutableArrayRef getVarRefs() { - return MutableArrayRef( - static_cast(this)->template getTrailingObjects(), NumVars); + return static_cast(this)->template getTrailingObjects(NumVars); } /// Sets the list of variables for this clause. @@ -336,8 +335,7 @@ template class OMPVarListClause : public OMPClause { /// Fetches list of all variables in the clause. ArrayRef getVarRefs() const { - return llvm::ArrayRef( - static_cast(this)->template getTrailingObjects(), + return static_cast(this)->template getTrailingObjects( NumVars); } }; @@ -382,10 +380,8 @@ template class OMPDirectiveListClause : public OMPClause { } MutableArrayRef getDirectiveKinds() { - return MutableArrayRef( - static_cast(this) - ->template getTrailingObjects(), - NumKinds); + return static_cast(this) + ->template getTrailingObjects(NumKinds); } void setDirectiveKinds(ArrayRef DK) { @@ -984,14 +980,12 @@ class OMPSizesClause final /// Returns the tile size expressions. MutableArrayRef getSizesRefs() { - return MutableArrayRef(static_cast(this) - ->template getTrailingObjects(), - NumSizes); + return static_cast(this) + ->template getTrailingObjects(NumSizes); } ArrayRef getSizesRefs() const { - return ArrayRef(static_cast(this) - ->template getTrailingObjects(), - NumSizes); + return static_cast(this) + ->template getTrailingObjects(NumSizes); } /// Sets the tile size expressions. @@ -1090,14 +1084,12 @@ class OMPPermutationClause final /// Returns the permutation index expressions. 
///@{ MutableArrayRef getArgsRefs() { - return MutableArrayRef(static_cast(this) - ->template getTrailingObjects(), - NumLoops); + return static_cast(this) + ->template getTrailingObjects(NumLoops); } ArrayRef getArgsRefs() const { - return ArrayRef(static_cast(this) - ->template getTrailingObjects(), - NumLoops); + return static_cast(this) + ->template getTrailingObjects(NumLoops); } ///@} @@ -3841,7 +3833,7 @@ class OMPReductionClause final return MutableArrayRef(getLHSExprs().end(), varlist_size()); } ArrayRef getRHSExprs() const { - return llvm::ArrayRef(getLHSExprs().end(), varlist_size()); + return ArrayRef(getLHSExprs().end(), varlist_size()); } /// Set list of helper reduction expressions, required for proper @@ -5925,18 +5917,15 @@ class OMPMappableExprListClause : public OMPVarListClause, /// Get the unique declarations that are in the trailing objects of the /// class. MutableArrayRef getUniqueDeclsRef() { - return MutableArrayRef( - static_cast(this)->template getTrailingObjects(), + return static_cast(this)->template getTrailingObjects( NumUniqueDeclarations); } /// Get the unique declarations that are in the trailing objects of the /// class. ArrayRef getUniqueDeclsRef() const { - return ArrayRef( - static_cast(this) - ->template getTrailingObjects(), - NumUniqueDeclarations); + return static_cast(this) + ->template getTrailingObjects(NumUniqueDeclarations); } /// Set the unique declarations that are in the trailing objects of the @@ -5950,16 +5939,14 @@ class OMPMappableExprListClause : public OMPVarListClause, /// Get the number of lists per declaration that are in the trailing /// objects of the class. MutableArrayRef getDeclNumListsRef() { - return MutableArrayRef( - static_cast(this)->template getTrailingObjects(), + return static_cast(this)->template getTrailingObjects( NumUniqueDeclarations); } /// Get the number of lists per declaration that are in the trailing /// objects of the class. 
ArrayRef getDeclNumListsRef() const { - return ArrayRef( - static_cast(this)->template getTrailingObjects(), + return static_cast(this)->template getTrailingObjects( NumUniqueDeclarations); } @@ -5999,18 +5986,14 @@ class OMPMappableExprListClause : public OMPVarListClause, /// Get the components that are in the trailing objects of the class. MutableArrayRef getComponentsRef() { - return MutableArrayRef( - static_cast(this) - ->template getTrailingObjects(), - NumComponents); + return static_cast(this) + ->template getTrailingObjects(NumComponents); } /// Get the components that are in the trailing objects of the class. ArrayRef getComponentsRef() const { - return ArrayRef( - static_cast(this) - ->template getTrailingObjects(), - NumComponents); + return static_cast(this) + ->template getTrailingObjects(NumComponents); } /// Set the components that are in the trailing objects of the class. diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td index 33336d57b6298..111a3e44f2fd5 100644 --- a/clang/include/clang/AST/PropertiesBase.td +++ b/clang/include/clang/AST/PropertiesBase.td @@ -414,7 +414,8 @@ let Class = PropertyTypeCase in { let Read = [{ node.getUnionValue() }]; } def : Creator<[{ - return APValue(cast(fieldDecl), std::move(value)); + // node.getUnionField() / fieldDecl can be null, thus, using `cast_if_present` + return APValue(cast_if_present(fieldDecl), std::move(value)); }]>; } let Class = PropertyTypeCase in { diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 180f3623983de..5c8c0e1cf1d00 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2521,6 +2521,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { bool isChar16Type() const; bool isChar32Type() const; bool isAnyCharacterType() const; + bool isUnicodeCharacterType() const; bool isIntegralType(const ASTContext &Ctx) const; /// Determine whether this type is an integral or 
enumeration type. diff --git a/clang/include/clang/Analysis/Analyses/ThreadSafetyTIL.h b/clang/include/clang/Analysis/Analyses/ThreadSafetyTIL.h index 9f365d1a3b655..14c5b679428a3 100644 --- a/clang/include/clang/Analysis/Analyses/ThreadSafetyTIL.h +++ b/clang/include/clang/Analysis/Analyses/ThreadSafetyTIL.h @@ -52,6 +52,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -1664,7 +1665,8 @@ class BasicBlock : public SExpr { unsigned BlockID : 31; // Bit to determine if a block has been visited during a traversal. - bool Visited : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned Visited : 1; // Predecessor blocks in the CFG. BlockArray Predecessors; diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index ccd13a4cca4dd..a6a7482a94a29 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4739,7 +4739,8 @@ def Error : InheritableAttr { def RootSignature : Attr { /// [RootSignature(Signature)] let Spellings = [Microsoft<"RootSignature">]; - let Args = [IdentifierArgument<"Signature">]; + let Args = [IdentifierArgument<"SignatureIdent">, + DeclArgument]; let Subjects = SubjectList<[Function], ErrorDiag, "'function'">; let LangOpts = [HLSL]; @@ -4789,6 +4790,7 @@ def HLSLResourceBinding: InheritableAttr { RegisterType RegType; std::optional SlotNumber; unsigned SpaceNumber; + std::optional ImplicitBindingOrderID; public: void setBinding(RegisterType RT, std::optional SlotNum, unsigned SpaceNum) { @@ -4810,6 +4812,16 @@ def HLSLResourceBinding: InheritableAttr { unsigned getSpaceNumber() const { return SpaceNumber; } + void setImplicitBindingOrderID(uint32_t Value) { + ImplicitBindingOrderID = Value; + } + bool hasImplicitBindingOrderID() const { + return ImplicitBindingOrderID.has_value(); + } + uint32_t getImplicitBindingOrderID() const { + 
assert(hasImplicitBindingOrderID() && "attribute does not have implicit binding order id"); + return ImplicitBindingOrderID.value(); + } }]; } diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 5fb5f16680b41..65d66dd398ad1 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -8680,6 +8680,7 @@ its underlying representation to be a WebAssembly ``funcref``. def PreferredTypeDocumentation : Documentation { let Category = DocCatField; + let Label = "langext-preferred_type_documentation"; let Content = [{ This attribute allows adjusting the type of a bit-field in debug information. This can be helpful when a bit-field is intended to store an enumeration value, diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 11b1e247237a7..187d3b5ed24a7 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4819,6 +4819,12 @@ def HLSLResourceHandleFromBinding : LangBuiltin<"HLSL_LANG"> { let Prototype = "void(...)"; } +def HLSLResourceHandleFromImplicitBinding : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_resource_handlefromimplicitbinding"]; + let Attributes = [NoThrow]; + let Prototype = "void(...)"; +} + def HLSLAll : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_all"]; let Attributes = [NoThrow, Const]; diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index f797e29fe66a3..2cea44e224674 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -620,6 +620,12 @@ def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>; +def __nvvm_ff_to_e2m1x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_e2m1x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_e2m1x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_e2m1x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; diff --git a/clang/include/clang/Basic/BuiltinsRISCV.td b/clang/include/clang/Basic/BuiltinsRISCV.td index 3263603a8a1cf..b2cd5648e008f 100644 --- a/clang/include/clang/Basic/BuiltinsRISCV.td +++ b/clang/include/clang/Basic/BuiltinsRISCV.td @@ -147,6 +147,12 @@ def ntl_load : RISCVBuiltin<"void(...)">; def ntl_store : RISCVBuiltin<"void(...)">; } // Features = "zihintntl", Attributes = [CustomTypeChecking] +//===----------------------------------------------------------------------===// +// Zihintpause extension. +//===----------------------------------------------------------------------===// +let Features = "zihintpause", Attributes = [NoThrow] in +def pause : RISCVBuiltin<"void()">; + //===----------------------------------------------------------------------===// // XCV extensions. 
//===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index ab480369b3820..e2afcc08064b2 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -192,6 +192,7 @@ TARGET_BUILTIN(__builtin_wasm_replace_lane_f16x8, "V8hV8hIif", "nc", "fp16") // in which case the argument spec (second argument) is unused. TARGET_BUILTIN(__builtin_wasm_ref_null_extern, "i", "nct", "reference-types") +TARGET_BUILTIN(__builtin_wasm_ref_is_null_extern, "ii", "nct", "reference-types") // A funcref represented as a function pointer with the funcref attribute // attached to the type, therefore SemaChecking will check for the right diff --git a/clang/include/clang/Basic/DiagnosticCategories.h b/clang/include/clang/Basic/DiagnosticCategories.h index 839f8dee3ca89..52bb7a268b418 100644 --- a/clang/include/clang/Basic/DiagnosticCategories.h +++ b/clang/include/clang/Basic/DiagnosticCategories.h @@ -11,7 +11,7 @@ namespace clang { namespace diag { - enum { + enum DiagCategory { #define GET_CATEGORY_TABLE #define CATEGORY(X, ENUM) ENUM, #include "clang/Basic/DiagnosticGroups.inc" diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index b15cba698030c..4da8f80345ddc 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -670,6 +670,10 @@ def note_drv_verify_prefix_spelling : Note< "-verify prefixes must start with a letter and contain only alphanumeric" " characters, hyphens, and underscores">; +def note_command_line_code_loc_requirement + : Note<"-code-completion-at=:: requires and " + " to be integers greater than zero">; + def warn_drv_global_isel_incomplete : Warning< "-fglobal-isel support for the '%0' architecture is incomplete">, InGroup; diff --git 
a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 7b0dcde44296e..616f2555931f5 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -111,6 +111,7 @@ def EnumConversion : DiagGroup<"enum-conversion", ImplicitEnumEnumCast, EnumFloatConversion, EnumCompareConditional]>; +def CharacterConversion : DiagGroup<"character-conversion">; def DeprecatedOFast : DiagGroup<"deprecated-ofast">; def ObjCSignedCharBoolImplicitIntConversion : DiagGroup<"objc-signed-char-bool-implicit-int-conversion">; @@ -690,6 +691,52 @@ def Packed : DiagGroup<"packed", [PackedNonPod]>; def PaddedBitField : DiagGroup<"padded-bitfield">; def Padded : DiagGroup<"padded", [PaddedBitField]>; def UnalignedAccess : DiagGroup<"unaligned-access">; +def MSBitfieldCompatibility : DiagGroup<"ms-bitfield-padding"> { + code Documentation = [{ + Under the Microsoft ABI, adjacent bit-fields are not packed if the + underlying type has a different storage size. This warning indicates that a + pair of adjacent bit-fields may not pack in the same way due to this behavioural + difference. + + This can occur when mixing different types explicitly: + + .. code-block:: c++ + + struct S { + uint16_t field1 : 1; + uint32_t field2 : 1; + }; + + or more subtly through enums + + .. code-block:: c++ + + enum Enum1 { /* ... */ }; + enum class Enum2 : unsigned char { /* ... */ }; + struct S { + Enum1 field1 : 1; + Enum2 field2 : 1; + }; + + In each of these cases under the Microsoft ABI the second bit-field + will not be packed with the preceding bit-field, and instead will be aligned + as if the fields were each separately defined integer fields of their respective + storage size. For binary compatibility this is obviously and observably + incompatible, however where bit-fields are being used solely for memory use + reduction this incomplete packing may silently increase the size of objects vs + what is expected. 
+ + This issue can be addressed by ensuring the storage type of each bit-field is + the same, either by explicitly using the same integer type, or in the case of + enum types declaring the enum types with the same storage size. For enum types + where you cannot specify the underlying type, the options are to either switch + to int sized storage for all specifiers or to resort to declaring the + bit-fields with explicit integer storage types and cast in and out of the field. + If such a solution is required the + :ref:`preferred_type ` attribute can be + used to convey the actual field type to debuggers and other tooling. + }]; +} def PessimizingMove : DiagGroup<"pessimizing-move">; def ReturnStdMove : DiagGroup<"return-std-move">; @@ -1073,6 +1120,7 @@ def Parentheses : DiagGroup<"parentheses", // - __null-to-integer conversion warnings are on by default def Conversion : DiagGroup<"conversion", [BoolConversion, + CharacterConversion, ConstantConversion, EnumConversion, BitFieldEnumConversion, diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 3efe9593b8633..d78a757c72e4a 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -164,6 +164,8 @@ def err_ice_too_large : Error< "integer constant expression evaluates to value %0 that cannot be " "represented in a %1-bit %select{signed|unsigned}2 integer type">; def err_expr_not_string_literal : Error<"expression is not a string literal">; +def note_constexpr_assert_failed : Note< + "assertion failed during evaluation of constant expression">; // Semantic analysis of constant literals. 
def ext_predef_outside_function : Warning< @@ -4367,6 +4369,29 @@ def warn_address_of_reference_bool_conversion : Warning< "code; pointer may be assumed to always convert to true">, InGroup; +def warn_impcast_unicode_char_type + : Warning<"implicit conversion from %0 to %1 may change the meaning of the " + "represented code unit">, + InGroup; +def warn_impcast_unicode_precision + : Warning<"implicit conversion from %0 to %1 may lose precision and change " + "the meaning of the represented code unit">, + InGroup; +def warn_impcast_unicode_char_type_constant + : Warning<"implicit conversion from %0 to %1 changes the meaning of the " + "%select{code unit|code point}2 '%3'">, + InGroup; + +def warn_comparison_unicode_mixed_types + : Warning<"comparing values of different Unicode code unit types %0 and %1 " + "may compare different code points">, + InGroup; + +def warn_comparison_unicode_mixed_types_constant + : Warning<"comparing values of different Unicode code unit types %0 and %1 " + "compares unrelated code units '%2' and '%3'">, + InGroup; + def warn_xor_used_as_pow : Warning< "result of '%0' is %1; did you mean exponentiation?">, InGroup; @@ -6541,6 +6566,13 @@ def note_change_bitfield_sign : Note< "consider making the bit-field type %select{unsigned|signed}0">; def note_bitfield_preferred_type : Note<"preferred type for bit-field %0 specified here">; +def warn_ms_bitfield_mismatched_storage_packing : Warning< + "bit-field %0 of type %1 has a different storage size than the " + "preceding bit-field (%2 vs %3 bytes) and will not be packed under " + "the Microsoft ABI">, + InGroup, DefaultIgnore; +def note_ms_bitfield_mismatched_storage_size_previous : Note< + "preceding bit-field %0 declared here with type %1">; def warn_missing_braces : Warning< "suggest braces around initialization of subobject">, @@ -6825,7 +6857,7 @@ def err_counted_by_on_incomplete_type_on_use : Error < def note_counted_by_consider_completing_pointee_ty : Note< "consider providing a complete 
definition for %0">; - + def note_counted_by_consider_using_sized_by : Note< "consider using '__sized_by%select{|_or_null}0' instead of " "'__counted_by%select{|_or_null}0'">; @@ -7724,6 +7756,11 @@ def warn_comparison_of_mixed_enum_types_switch : Warning< "%diff{ ($ and $)|}0,1">, InGroup; +def warn_arith_conv_mixed_unicode_types + : Warning<"%sub{select_arith_conv_kind}0 " + "different Unicode character types %1 and %2">, + InGroup; + def err_typecheck_assign_const : Error< "%select{" "cannot assign to return value because function %1 returns a const value|" @@ -10176,6 +10213,9 @@ def note_defaulted_comparison_no_viable_function_synthesized : Note< def note_defaulted_comparison_not_rewritten_callee : Note< "defaulted %0 is implicitly deleted because this non-rewritten comparison " "function would be the best match for the comparison">; +def note_defaulted_comparison_vector_types : Note< + "defaulted %0 is implicitly deleted because defaulted comparison of vector " + "types is not supported">; def note_defaulted_comparison_not_rewritten_conversion : Note< "defaulted %0 is implicitly deleted because a builtin comparison function " "using this conversion would be the best match for the comparison">; @@ -12405,6 +12445,10 @@ def warn_zero_as_null_pointer_constant : Warning< "zero as null pointer constant">, InGroup>, DefaultIgnore; +def warn_not_eliding_copy_on_return : Warning< + "not eliding copy on return">, + InGroup>, DefaultIgnore; + def err_nullability_cs_multilevel : Error< "nullability keyword %0 cannot be applied to multi-level pointer type %1">; def note_nullability_type_specifier : Note< @@ -13005,6 +13049,8 @@ def err_wasm_reftype_multidimensional_array : Error< "multi-dimensional arrays of WebAssembly references are not allowed">; def err_wasm_builtin_arg_must_be_table_type : Error < "%ordinal0 argument must be a WebAssembly table">; +def err_wasm_builtin_arg_must_be_externref_type : Error < + "%ordinal0 argument must be an externref">; def 
err_wasm_builtin_arg_must_match_table_element_type : Error < "%ordinal0 argument must match the element type of the WebAssembly table in the %ordinal1 argument">; def err_wasm_builtin_arg_must_be_integer_type : Error < diff --git a/clang/include/clang/Basic/JsonSupport.h b/clang/include/clang/Basic/JsonSupport.h index bcaa3d364444e..51fb76c011067 100644 --- a/clang/include/clang/Basic/JsonSupport.h +++ b/clang/include/clang/Basic/JsonSupport.h @@ -106,7 +106,7 @@ inline void printSourceLocationAsJson(raw_ostream &Out, SourceLocation Loc, return llvm::is_contained(ForbiddenChars, Char); }); // Handle windows-specific path delimiters. - std::replace(filename.begin(), filename.end(), '\\', '/'); + llvm::replace(filename, '\\', '/'); } Out << "\"line\": " << PLoc.getLine() << ", \"column\": " << PLoc.getColumn() diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h index 62cc8acf9588b..3d035f0a5f787 100644 --- a/clang/include/clang/Basic/Module.h +++ b/clang/include/clang/Basic/Module.h @@ -888,7 +888,7 @@ class VisibleModuleSet { /// Get the location at which the import of a module was triggered. SourceLocation getImportLoc(const Module *M) const { - return M->getVisibilityID() < ImportLocs.size() + return M && M->getVisibilityID() < ImportLocs.size() ? ImportLocs[M->getVisibilityID()] : SourceLocation(); } diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 77763a5b1d574..652dc064a7b1c 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1036,8 +1036,8 @@ class TargetInfo : public TransferrableTargetInfo, /// Returns target-specific min and max values VScale_Range. 
virtual std::optional> - getVScaleRange(const LangOptions &LangOpts, - bool IsArmStreamingFunction) const { + getVScaleRange(const LangOptions &LangOpts, bool IsArmStreamingFunction, + llvm::StringMap *FeatureMap = nullptr) const { return std::nullopt; } /// The __builtin_clz* and __builtin_ctz* built-in diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index ab0051efe5159..7251cc2d1759a 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -279,10 +279,10 @@ def OP_CVT_F32_BF16 // Splat operation - performs a range-checked splat over a vector def SPLAT : WInst<"splat_lane", ".(!q)I", - "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl", + "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPlmQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; def SPLATQ : WInst<"splat_laneq", ".(!Q)I", - "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl", + "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPlmQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; let TargetGuard = "bf16,neon" in { @@ -547,19 +547,19 @@ def VST4_LANE_F16 : WInst<"vst4_lane", "v*(4!)I", "hQh", // E.3.16 Extract lanes from a vector let InstName = "vmov" in def VGET_LANE : IInst<"vget_lane", "1.I", - "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl", + "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlmQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; //////////////////////////////////////////////////////////////////////////////// // E.3.17 Set lanes within a vector let InstName = "vmov" in def VSET_LANE : IInst<"vset_lane", ".1.I", - "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl", + "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlmQm", [ImmCheck<2, ImmCheckLaneIndex, 1>]>; //////////////////////////////////////////////////////////////////////////////// // E.3.18 Initialize a vector from bit pattern -def VCREATE : NoTestOpInst<"vcreate", ".(IU>)", "csihfUcUsUiUlPcPsl", OP_CAST> { +def VCREATE : NoTestOpInst<"vcreate", 
".(IU>)", "csihfUcUsUiUlPcPslm", OP_CAST> { let BigEndianSafe = 1; } @@ -567,20 +567,20 @@ def VCREATE : NoTestOpInst<"vcreate", ".(IU>)", "csihfUcUsUiUlPcPsl", OP_CAST> { // E.3.19 Set all lanes to same value let InstName = "vmov" in { def VDUP_N : WOpInst<"vdup_n", ".1", - "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUl", + "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUlmQm", OP_DUP>; def VMOV_N : WOpInst<"vmov_n", ".1", - "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUl", + "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUlmQm", OP_DUP>; } let InstName = "" in def VDUP_LANE: WOpInst<"vdup_lane", ".qI", - "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUl", + "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUlmQm", OP_DUP_LN>; //////////////////////////////////////////////////////////////////////////////// // E.3.20 Combining vectors -def VCOMBINE : NoTestOpInst<"vcombine", "Q..", "csilhfUcUsUiUlPcPs", OP_CONC>; +def VCOMBINE : NoTestOpInst<"vcombine", "Q..", "csilhfUcUsUiUlPcPsm", OP_CONC>; //////////////////////////////////////////////////////////////////////////////// // E.3.21 Splitting vectors @@ -589,8 +589,8 @@ def VCOMBINE : NoTestOpInst<"vcombine", "Q..", "csilhfUcUsUiUlPcPs", OP_CONC>; // versions of these intrinsics in both AArch32 and AArch64 architectures. See // D45668 for more details. 
let InstName = "vmov" in { -def VGET_HIGH : NoTestOpInst<"vget_high", ".Q", "csilhfUcUsUiUlPcPs", OP_HI>; -def VGET_LOW : NoTestOpInst<"vget_low", ".Q", "csilhfUcUsUiUlPcPs", OP_LO>; +def VGET_HIGH : NoTestOpInst<"vget_high", ".Q", "csilhfUcUsUiUlPcPsm", OP_HI>; +def VGET_LOW : NoTestOpInst<"vget_low", ".Q", "csilhfUcUsUiUlPcPsm", OP_LO>; } //////////////////////////////////////////////////////////////////////////////// @@ -619,16 +619,16 @@ def VQMOVUN : SInst<"vqmovun", "(; //////////////////////////////////////////////////////////////////////////////// // E.3.23-24 Table lookup, Extended table lookup let InstName = "vtbl" in { -def VTBL1 : WInst<"vtbl1", "..p", "UccPc">; -def VTBL2 : WInst<"vtbl2", ".2p", "UccPc">; -def VTBL3 : WInst<"vtbl3", ".3p", "UccPc">; -def VTBL4 : WInst<"vtbl4", ".4p", "UccPc">; +def VTBL1 : WInst<"vtbl1", "..p", "UccPcm">; +def VTBL2 : WInst<"vtbl2", ".2p", "UccPcm">; +def VTBL3 : WInst<"vtbl3", ".3p", "UccPcm">; +def VTBL4 : WInst<"vtbl4", ".4p", "UccPcm">; } let InstName = "vtbx" in { -def VTBX1 : WInst<"vtbx1", "...p", "UccPc">; -def VTBX2 : WInst<"vtbx2", "..2p", "UccPc">; -def VTBX3 : WInst<"vtbx3", "..3p", "UccPc">; -def VTBX4 : WInst<"vtbx4", "..4p", "UccPc">; +def VTBX1 : WInst<"vtbx1", "...p", "UccPcm">; +def VTBX2 : WInst<"vtbx2", "..2p", "UccPcm">; +def VTBX3 : WInst<"vtbx3", "..3p", "UccPcm">; +def VTBX4 : WInst<"vtbx4", "..4p", "UccPcm">; } //////////////////////////////////////////////////////////////////////////////// @@ -677,15 +677,15 @@ def VQDMLSL_N : SOpInst<"vqdmlsl_n", "(>Q)(>Q).1", "si", OP_QDMLSL_N>; //////////////////////////////////////////////////////////////////////////////// // E.3.26 Vector Extract def VEXT : WInst<"vext", "...I", - "cUcPcsUsPsiUilUlfQcQUcQPcQsQUsQPsQiQUiQlQUlQf", + "cUcPcsUsPsiUilUlfQcQUcQPcQsQUsQPsQiQUiQlQUlQfmQm", [ImmCheck<2, ImmCheckLaneIndex, 0>]>; //////////////////////////////////////////////////////////////////////////////// // E.3.27 Reverse vector elements -def VREV64 : 
WOpInst<"vrev64", "..", "csiUcUsUiPcPsfQcQsQiQUcQUsQUiQPcQPsQf", +def VREV64 : WOpInst<"vrev64", "..", "csiUcUsUiPcPsfQcQsQiQUcQUsQUiQPcQPsQfmQm", OP_REV64>; -def VREV32 : WOpInst<"vrev32", "..", "csUcUsPcPsQcQsQUcQUsQPcQPs", OP_REV32>; -def VREV16 : WOpInst<"vrev16", "..", "cUcPcQcQUcQPc", OP_REV16>; +def VREV32 : WOpInst<"vrev32", "..", "csUcUsPcPsQcQsQUcQUsQPcQPsmQm", OP_REV32>; +def VREV16 : WOpInst<"vrev16", "..", "cUcPcQcQUcQPcmQm", OP_REV16>; //////////////////////////////////////////////////////////////////////////////// // E.3.28 Other single operand arithmetic @@ -709,13 +709,13 @@ def VBIC : LOpInst<"vbic", "...", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_ANDN>; def VORN : LOpInst<"vorn", "...", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_ORN>; let isHiddenLInst = 1 in def VBSL : SInst<"vbsl", ".U..", - "csilUcUsUiUlfPcPsQcQsQiQlQUcQUsQUiQUlQfQPcQPs">; + "csilUcUsUiUlfPcPsQcQsQiQlQUcQUsQUiQUlQfQPcQPsmQm">; //////////////////////////////////////////////////////////////////////////////// // E.3.30 Transposition operations -def VTRN : WInst<"vtrn", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">; -def VZIP : WInst<"vzip", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">; -def VUZP : WInst<"vuzp", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">; +def VTRN : WInst<"vtrn", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPsmQm">; +def VZIP : WInst<"vzip", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPsmQm">; +def VUZP : WInst<"vuzp", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPsmQm">; //////////////////////////////////////////////////////////////////////////////// @@ -1028,19 +1028,19 @@ def GET_LANE : IInst<"vget_lane", "1.I", "dQdPlQPl", def SET_LANE : IInst<"vset_lane", ".1.I", "dQdPlQPl", [ImmCheck<2, ImmCheckLaneIndex, 1>]>; def COPY_LANE : IOpInst<"vcopy_lane", "..I.I", - "csilUcUsUiUlPcPsPlfd", OP_COPY_LN>; + "csilUcUsUiUlPcPsPlfdm", OP_COPY_LN>; def COPYQ_LANE : IOpInst<"vcopy_lane", "..IqI", - "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>; + 
"QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPlQm", OP_COPY_LN>; def COPY_LANEQ : IOpInst<"vcopy_laneq", "..IQI", - "csilPcPsPlUcUsUiUlfd", OP_COPY_LN>; + "csilPcPsPlUcUsUiUlfdm", OP_COPY_LN>; def COPYQ_LANEQ : IOpInst<"vcopy_laneq", "..I.I", - "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>; + "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPlQm", OP_COPY_LN>; //////////////////////////////////////////////////////////////////////////////// // Set all lanes to same value def VDUP_LANE1: WOpInst<"vdup_lane", ".qI", "dQdPlQPl", OP_DUP_LN>; def VDUP_LANE2: WOpInst<"vdup_laneq", ".QI", - "csilUcUsUiUlPcPshfdQcQsQiQlQPcQPsQUcQUsQUiQUlQhQfQdPlQPl", + "csilUcUsUiUlPcPshfdQcQsQiQlQPcQPsQUcQUsQUiQUlQhQfQdPlQPlmQm", OP_DUP_LN>; def DUP_N : WOpInst<"vdup_n", ".1", "dQdPlQPl", OP_DUP>; def MOV_N : WOpInst<"vmov_n", ".1", "dQdPlQPl", OP_DUP>; @@ -1266,31 +1266,31 @@ def FMINNM_S64 : SInst<"vminnm", "...", "dQd">; //////////////////////////////////////////////////////////////////////////////// // Permutation def VTRN1 : SOpInst<"vtrn1", "...", - "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_TRN1>; + "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPlmQm", OP_TRN1>; def VZIP1 : SOpInst<"vzip1", "...", - "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_ZIP1>; + "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPlmQm", OP_ZIP1>; def VUZP1 : SOpInst<"vuzp1", "...", - "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_UZP1>; + "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPlmQm", OP_UZP1>; def VTRN2 : SOpInst<"vtrn2", "...", - "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_TRN2>; + "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPlmQm", OP_TRN2>; def VZIP2 : SOpInst<"vzip2", "...", - "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_ZIP2>; + "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPlmQm", OP_ZIP2>; def VUZP2 : SOpInst<"vuzp2", "...", - "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_UZP2>; + "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPlmQm", OP_UZP2>; 
//////////////////////////////////////////////////////////////////////////////// // Table lookup let InstName = "vtbl" in { -def VQTBL1_A64 : WInst<"vqtbl1", ".QU", "UccPcQUcQcQPc">; -def VQTBL2_A64 : WInst<"vqtbl2", ".(2Q)U", "UccPcQUcQcQPc">; -def VQTBL3_A64 : WInst<"vqtbl3", ".(3Q)U", "UccPcQUcQcQPc">; -def VQTBL4_A64 : WInst<"vqtbl4", ".(4Q)U", "UccPcQUcQcQPc">; +def VQTBL1_A64 : WInst<"vqtbl1", ".QU", "UccPcQUcQcQPcmQm">; +def VQTBL2_A64 : WInst<"vqtbl2", ".(2Q)U", "UccPcQUcQcQPcmQm">; +def VQTBL3_A64 : WInst<"vqtbl3", ".(3Q)U", "UccPcQUcQcQPcmQm">; +def VQTBL4_A64 : WInst<"vqtbl4", ".(4Q)U", "UccPcQUcQcQPcmQm">; } let InstName = "vtbx" in { -def VQTBX1_A64 : WInst<"vqtbx1", "..QU", "UccPcQUcQcQPc">; -def VQTBX2_A64 : WInst<"vqtbx2", "..(2Q)U", "UccPcQUcQcQPc">; -def VQTBX3_A64 : WInst<"vqtbx3", "..(3Q)U", "UccPcQUcQcQPc">; -def VQTBX4_A64 : WInst<"vqtbx4", "..(4Q)U", "UccPcQUcQcQPc">; +def VQTBX1_A64 : WInst<"vqtbx1", "..QU", "UccPcQUcQcQPcmQm">; +def VQTBX2_A64 : WInst<"vqtbx2", "..(2Q)U", "UccPcQUcQcQPcmQm">; +def VQTBX3_A64 : WInst<"vqtbx3", "..(3Q)U", "UccPcQUcQcQPcmQm">; +def VQTBX4_A64 : WInst<"vqtbx4", "..(4Q)U", "UccPcQUcQcQPcmQm">; } //////////////////////////////////////////////////////////////////////////////// @@ -1654,9 +1654,9 @@ def SCALAR_SQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "111.I", "SsSi", OP_SCALAR_Q def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLSH_LN>; } // TargetGuard = "v8.1a" -def SCALAR_VDUP_LANE : IInst<"vdup_lane", "1.I", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs", +def SCALAR_VDUP_LANE : IInst<"vdup_lane", "1.I", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPsSm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; -def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs", +def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPsSm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; } // ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" @@ -2090,17 +2090,17 @@ let ArchGuard = 
"defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "r // Lookup table read with 2-bit/4-bit indices let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in { - def VLUTI2_B : SInst<"vluti2_lane", "Q.(qU)I", "cUcPcQcQUcQPc", + def VLUTI2_B : SInst<"vluti2_lane", "Q.(qU)I", "cUcPcmQcQUcQPcQm", [ImmCheck<2, ImmCheck0_1>]>; - def VLUTI2_B_Q : SInst<"vluti2_laneq", "Q.(QU)I", "cUcPcQcQUcQPc", + def VLUTI2_B_Q : SInst<"vluti2_laneq", "Q.(QU)I", "cUcPcmQcQUcQPcQm", [ImmCheck<2, ImmCheck0_3>]>; def VLUTI2_H : SInst<"vluti2_lane", "Q.(]>; def VLUTI2_H_Q : SInst<"vluti2_laneq", "Q.(]>; - def VLUTI4_B : SInst<"vluti4_lane", "..(qU)I", "QcQUcQPc", + def VLUTI4_B : SInst<"vluti4_lane", "..(qU)I", "QcQUcQPcQm", [ImmCheck<2, ImmCheck0_0>]>; - def VLUTI4_B_Q : SInst<"vluti4_laneq", "..UI", "QcQUcQPc", + def VLUTI4_B_Q : SInst<"vluti4_laneq", "..UI", "QcQUcQPcQm", [ImmCheck<2, ImmCheck0_1>]>; def VLUTI4_H_X2 : SInst<"vluti4_lane_x2", ".2(]>; diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 9f5fa266742e8..71b9a816669bc 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -269,7 +269,7 @@ def PtrStrideOp : CIR_Op<"ptr_stride", let extraClassDeclaration = [{ // Get type pointed by the base pointer. mlir::Type getElementTy() { - return mlir::cast(getBase().getType()).getPointee(); + return getBase().getType().getPointee(); } }]; } @@ -971,6 +971,52 @@ def SwitchOp : CIR_Op<"switch", }]; } +//===----------------------------------------------------------------------===// +// SwitchFlatOp +//===----------------------------------------------------------------------===// + +def SwitchFlatOp : CIR_Op<"switch.flat", [AttrSizedOperandSegments, + Terminator]> { + + let description = [{ + The `cir.switch.flat` operation is a region-less and simplified + version of the `cir.switch`. 
+ Its representation is closer to LLVM IR dialect + than the C/C++ language feature. + }]; + + let arguments = (ins + CIR_IntType:$condition, + Variadic:$defaultOperands, + VariadicOfVariadic:$caseOperands, + ArrayAttr:$caseValues, + DenseI32ArrayAttr:$case_operand_segments + ); + + let successors = (successor + AnySuccessor:$defaultDestination, + VariadicSuccessor:$caseDestinations + ); + + let assemblyFormat = [{ + $condition `:` type($condition) `,` + $defaultDestination (`(` $defaultOperands^ `:` type($defaultOperands) `)`)? + custom(ref(type($condition)), $caseValues, + $caseDestinations, $caseOperands, + type($caseOperands)) + attr-dict + }]; + + let builders = [ + OpBuilder<(ins "mlir::Value":$condition, + "mlir::Block *":$defaultDestination, + "mlir::ValueRange":$defaultOperands, + CArg<"llvm::ArrayRef", "{}">:$caseValues, + CArg<"mlir::BlockRange", "{}">:$caseDestinations, + CArg<"llvm::ArrayRef", "{}">:$caseOperands)> + ]; +} + //===----------------------------------------------------------------------===// // BrOp //===----------------------------------------------------------------------===// @@ -1710,7 +1756,7 @@ def GetMemberOp : CIR_Op<"get_member"> { /// Return the result type. cir::PointerType getResultTy() { - return mlir::cast(getResult().getType()); + return getResult().getType(); } }]; diff --git a/clang/include/clang/CIR/Dialect/OpenACC/CIROpenACCTypeInterfaces.h b/clang/include/clang/CIR/Dialect/OpenACC/CIROpenACCTypeInterfaces.h new file mode 100644 index 0000000000000..3011245cd8a03 --- /dev/null +++ b/clang/include/clang/CIR/Dialect/OpenACC/CIROpenACCTypeInterfaces.h @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains external dialect interfaces for CIR. +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_CIR_DIALECT_OPENACC_CIROPENACCTYPEINTERFACES_H +#define CLANG_CIR_DIALECT_OPENACC_CIROPENACCTYPEINTERFACES_H + +#include "mlir/Dialect/OpenACC/OpenACC.h" + +namespace cir::acc { + +template +struct OpenACCPointerLikeModel + : public mlir::acc::PointerLikeType::ExternalModel< + OpenACCPointerLikeModel, T> { + mlir::Type getElementType(mlir::Type pointer) const { + return mlir::cast(pointer).getPointee(); + } + mlir::acc::VariableTypeCategory + getPointeeTypeCategory(mlir::Type pointer, + mlir::TypedValue varPtr, + mlir::Type varType) const; +}; + +} // namespace cir::acc + +#endif // CLANG_CIR_DIALECT_OPENACC_CIROPENACCTYPEINTERFACES_H diff --git a/clang/include/clang/CIR/Dialect/OpenACC/RegisterOpenACCExtensions.h b/clang/include/clang/CIR/Dialect/OpenACC/RegisterOpenACCExtensions.h new file mode 100644 index 0000000000000..13780a01ea1bb --- /dev/null +++ b/clang/include/clang/CIR/Dialect/OpenACC/RegisterOpenACCExtensions.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_CIR_DIALECT_OPENACC_REGISTEROPENACCEXTENSIONS_H +#define CLANG_CIR_DIALECT_OPENACC_REGISTEROPENACCEXTENSIONS_H + +namespace mlir { +class DialectRegistry; +} // namespace mlir + +namespace cir::acc { + +void registerOpenACCExtensions(mlir::DialectRegistry ®istry); + +} // namespace cir::acc + +#endif // CLANG_CIR_DIALECT_OPENACC_REGISTEROPENACCEXTENSIONS_H diff --git a/clang/include/clang/Frontend/CommandLineSourceLoc.h b/clang/include/clang/Frontend/CommandLineSourceLoc.h index 074800a881a89..b07ffcb65c067 100644 --- a/clang/include/clang/Frontend/CommandLineSourceLoc.h +++ b/clang/include/clang/Frontend/CommandLineSourceLoc.h @@ -24,7 +24,9 @@ namespace clang { /// A source location that has been parsed on the command line. struct ParsedSourceLocation { std::string FileName; + // The 1-based line number unsigned Line; + // The 1-based column number unsigned Column; public: @@ -38,7 +40,8 @@ struct ParsedSourceLocation { // If both tail splits were valid integers, return success. if (!ColSplit.second.getAsInteger(10, PSL.Column) && - !LineSplit.second.getAsInteger(10, PSL.Line)) { + !LineSplit.second.getAsInteger(10, PSL.Line) && + !(PSL.Column == 0 || PSL.Line == 0)) { PSL.FileName = std::string(LineSplit.first); // On the command-line, stdin may be specified via "-". Inside the @@ -89,8 +92,12 @@ struct ParsedSourceRange { // probably belongs to the filename which menas the whole // string should be parsed. RangeSplit.first = Str; - } else + } else { + // Column and line numbers are 1-based. 
+ if (EndLine == 0 || EndColumn == 0) + return std::nullopt; HasEndLoc = true; + } } auto Begin = ParsedSourceLocation::FromString(RangeSplit.first); if (Begin.FileName.empty()) diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 00538fd9a00b5..e6492b81dfff8 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -29,28 +29,28 @@ #include namespace clang { - class PragmaHandler; - class Scope; - class BalancedDelimiterTracker; - class CorrectionCandidateCallback; - class DeclGroupRef; - class DiagnosticBuilder; - struct LoopHint; - class Parser; - class ParsingDeclRAIIObject; - class ParsingDeclSpec; - class ParsingDeclarator; - class ParsingFieldDeclarator; - class ColonProtectionRAIIObject; - class InMessageExpressionRAIIObject; - class PoisonSEHIdentifiersRAIIObject; - class OMPClause; - class OpenACCClause; - class ObjCTypeParamList; - struct OMPTraitProperty; - struct OMPTraitSelector; - struct OMPTraitSet; - class OMPTraitInfo; +class PragmaHandler; +class Scope; +class BalancedDelimiterTracker; +class CorrectionCandidateCallback; +class DeclGroupRef; +class DiagnosticBuilder; +struct LoopHint; +class Parser; +class ParsingDeclRAIIObject; +class ParsingDeclSpec; +class ParsingDeclarator; +class ParsingFieldDeclarator; +class ColonProtectionRAIIObject; +class InMessageExpressionRAIIObject; +class PoisonSEHIdentifiersRAIIObject; +class OMPClause; +class OpenACCClause; +class ObjCTypeParamList; +struct OMPTraitProperty; +struct OMPTraitSelector; +struct OMPTraitSet; +class OMPTraitInfo; enum class AnnotatedNameKind { /// Annotation has failed and emitted an error. @@ -153,573 +153,438 @@ enum class CXX11AttributeKind { /// parsing units of the grammar, productions are invoked to handle whatever has /// been read. /// +/// \nosubgrouping class Parser : public CodeCompletionHandler { + // Table of Contents + // ----------------- + // 1. Parsing (Parser.cpp) + // 2. 
C++ Class Inline Methods (ParseCXXInlineMethods.cpp) + // 3. Declarations (ParseDecl.cpp) + // 4. C++ Declarations (ParseDeclCXX.cpp) + // 5. Expressions (ParseExpr.cpp) + // 6. C++ Expressions (ParseExprCXX.cpp) + // 7. HLSL Constructs (ParseHLSL.cpp) + // 8. Initializers (ParseInit.cpp) + // 9. Objective-C Constructs (ParseObjc.cpp) + // 10. OpenACC Constructs (ParseOpenACC.cpp) + // 11. OpenMP Constructs (ParseOpenMP.cpp) + // 12. Pragmas (ParsePragma.cpp) + // 13. Statements (ParseStmt.cpp) + // 14. `inline asm` Statement (ParseStmtAsm.cpp) + // 15. C++ Templates (ParseTemplate.cpp) + // 16. Tentative Parsing (ParseTentative.cpp) + + /// \name Parsing + /// Implementations are in Parser.cpp + ///@{ + +public: friend class ColonProtectionRAIIObject; - friend class ParsingOpenMPDirectiveRAII; - friend class ParsingOpenACCDirectiveRAII; - friend class InMessageExpressionRAIIObject; - friend class OffsetOfStateRAIIObject; friend class PoisonSEHIdentifiersRAIIObject; - friend class ObjCDeclContextSwitch; friend class ParenBraceBracketBalancer; friend class BalancedDelimiterTracker; - Preprocessor &PP; + Parser(Preprocessor &PP, Sema &Actions, bool SkipFunctionBodies); + ~Parser() override; - /// Tok - The current token we are peeking ahead. All parsing methods assume - /// that this is valid. - Token Tok; + const LangOptions &getLangOpts() const { return PP.getLangOpts(); } + const TargetInfo &getTargetInfo() const { return PP.getTargetInfo(); } + Preprocessor &getPreprocessor() const { return PP; } + Sema &getActions() const { return Actions; } + AttributeFactory &getAttrFactory() { return AttrFactory; } - // PrevTokLocation - The location of the token we previously - // consumed. This token is used for diagnostics where we expected to - // see a token following another token (e.g., the ';' at the end of - // a statement). 
- SourceLocation PrevTokLocation; + const Token &getCurToken() const { return Tok; } + Scope *getCurScope() const { return Actions.getCurScope(); } - /// Tracks an expected type for the current token when parsing an expression. - /// Used by code completion for ranking. - PreferredTypeBuilder PreferredType; + void incrementMSManglingNumber() const { + return Actions.incrementMSManglingNumber(); + } - unsigned short ParenCount = 0, BracketCount = 0, BraceCount = 0; - unsigned short MisplacedModuleBeginCount = 0; + // Type forwarding. All of these are statically 'void*', but they may all be + // different actual classes based on the actions in place. + typedef OpaquePtr DeclGroupPtrTy; + typedef OpaquePtr TemplateTy; - /// Actions - These are the callbacks we invoke as we parse various constructs - /// in the file. - Sema &Actions; + /// Initialize - Warm up the parser. + /// + void Initialize(); - DiagnosticsEngine &Diags; + /// Parse the first top-level declaration in a translation unit. + /// + /// \verbatim + /// translation-unit: + /// [C] external-declaration + /// [C] translation-unit external-declaration + /// [C++] top-level-declaration-seq[opt] + /// [C++20] global-module-fragment[opt] module-declaration + /// top-level-declaration-seq[opt] private-module-fragment[opt] + /// \endverbatim + /// + /// Note that in C, it is an error if there is no first declaration. + bool ParseFirstTopLevelDecl(DeclGroupPtrTy &Result, + Sema::ModuleImportState &ImportState); - StackExhaustionHandler StackHandler; + /// ParseTopLevelDecl - Parse one top-level declaration, return whatever the + /// action tells us to. This returns true if the EOF was encountered. 
+ /// + /// \verbatim + /// top-level-declaration: + /// declaration + /// [C++20] module-import-declaration + /// \endverbatim + bool ParseTopLevelDecl(DeclGroupPtrTy &Result, + Sema::ModuleImportState &ImportState); + bool ParseTopLevelDecl() { + DeclGroupPtrTy Result; + Sema::ModuleImportState IS = Sema::ModuleImportState::NotACXX20Module; + return ParseTopLevelDecl(Result, IS); + } - /// ScopeCache - Cache scopes to reduce malloc traffic. - static constexpr int ScopeCacheSize = 16; - unsigned NumCachedScopes; - Scope *ScopeCache[ScopeCacheSize]; + /// ConsumeToken - Consume the current 'peek token' and lex the next one. + /// This does not work with special tokens: string literals, code completion, + /// annotation tokens and balanced tokens must be handled using the specific + /// consume methods. + /// Returns the location of the consumed token. + SourceLocation ConsumeToken() { + assert(!isTokenSpecial() && + "Should consume special tokens with Consume*Token"); + PrevTokLocation = Tok.getLocation(); + PP.Lex(Tok); + return PrevTokLocation; + } - /// Identifiers used for SEH handling in Borland. These are only - /// allowed in particular circumstances - // __except block - IdentifierInfo *Ident__exception_code, - *Ident___exception_code, - *Ident_GetExceptionCode; - // __except filter expression - IdentifierInfo *Ident__exception_info, - *Ident___exception_info, - *Ident_GetExceptionInfo; - // __finally - IdentifierInfo *Ident__abnormal_termination, - *Ident___abnormal_termination, - *Ident_AbnormalTermination; + bool TryConsumeToken(tok::TokenKind Expected) { + if (Tok.isNot(Expected)) + return false; + assert(!isTokenSpecial() && + "Should consume special tokens with Consume*Token"); + PrevTokLocation = Tok.getLocation(); + PP.Lex(Tok); + return true; + } - /// Contextual keywords for Microsoft extensions. 
- IdentifierInfo *Ident__except; - mutable IdentifierInfo *Ident_sealed; - mutable IdentifierInfo *Ident_abstract; + bool TryConsumeToken(tok::TokenKind Expected, SourceLocation &Loc) { + if (!TryConsumeToken(Expected)) + return false; + Loc = PrevTokLocation; + return true; + } - /// Ident_super - IdentifierInfo for "super", to support fast - /// comparison. - IdentifierInfo *Ident_super; - /// Ident_vector, Ident_bool, Ident_Bool - cached IdentifierInfos for "vector" - /// and "bool" fast comparison. Only present if AltiVec or ZVector are - /// enabled. - IdentifierInfo *Ident_vector; - IdentifierInfo *Ident_bool; - IdentifierInfo *Ident_Bool; - /// Ident_pixel - cached IdentifierInfos for "pixel" fast comparison. - /// Only present if AltiVec enabled. - IdentifierInfo *Ident_pixel; + /// ConsumeAnyToken - Dispatch to the right Consume* method based on the + /// current token type. This should only be used in cases where the type of + /// the token really isn't known, e.g. in error recovery. + SourceLocation ConsumeAnyToken(bool ConsumeCodeCompletionTok = false) { + if (isTokenParen()) + return ConsumeParen(); + if (isTokenBracket()) + return ConsumeBracket(); + if (isTokenBrace()) + return ConsumeBrace(); + if (isTokenStringLiteral()) + return ConsumeStringToken(); + if (Tok.is(tok::code_completion)) + return ConsumeCodeCompletionTok ? ConsumeCodeCompletionToken() + : handleUnexpectedCodeCompletionToken(); + if (Tok.isAnnotation()) + return ConsumeAnnotationToken(); + return ConsumeToken(); + } - /// Objective-C contextual keywords. - IdentifierInfo *Ident_instancetype; + SourceLocation getEndOfPreviousToken() { + return PP.getLocForEndOfToken(PrevTokLocation); + } - /// Identifier for "introduced". - IdentifierInfo *Ident_introduced; + /// GetLookAheadToken - This peeks ahead N tokens and returns that token + /// without consuming any tokens. LookAhead(0) returns 'Tok', LookAhead(1) + /// returns the token after Tok, etc. 
+ /// + /// Note that this differs from the Preprocessor's LookAhead method, because + /// the Parser always has one token lexed that the preprocessor doesn't. + /// + const Token &GetLookAheadToken(unsigned N) { + if (N == 0 || Tok.is(tok::eof)) + return Tok; + return PP.LookAhead(N - 1); + } - /// Identifier for "deprecated". - IdentifierInfo *Ident_deprecated; + /// NextToken - This peeks ahead one token and returns it without + /// consuming it. + const Token &NextToken() { return PP.LookAhead(0); } - /// Identifier for "obsoleted". - IdentifierInfo *Ident_obsoleted; + /// getTypeAnnotation - Read a parsed type out of an annotation token. + static TypeResult getTypeAnnotation(const Token &Tok) { + if (!Tok.getAnnotationValue()) + return TypeError(); + return ParsedType::getFromOpaquePtr(Tok.getAnnotationValue()); + } - /// Identifier for "unavailable". - IdentifierInfo *Ident_unavailable; + /// TryAnnotateTypeOrScopeToken - If the current token position is on a + /// typename (possibly qualified in C++) or a C++ scope specifier not followed + /// by a typename, TryAnnotateTypeOrScopeToken will replace one or more tokens + /// with a single annotation token representing the typename or C++ scope + /// respectively. + /// This simplifies handling of C++ scope specifiers and allows efficient + /// backtracking without the need to re-parse and resolve nested-names and + /// typenames. + /// It will mainly be called when we expect to treat identifiers as typenames + /// (if they are typenames). For example, in C we do not expect identifiers + /// inside expressions to be treated as typenames so it will not be called + /// for expressions in C. + /// The benefit for C/ObjC is that a typename will be annotated and + /// Actions.getTypeName will not be needed to be called again (e.g. + /// getTypeName will not be called twice, once to check whether we have a + /// declaration specifier, and another one to get the actual type inside + /// ParseDeclarationSpecifiers). 
+ /// + /// This returns true if an error occurred. + /// + /// Note that this routine emits an error if you call it with ::new or + /// ::delete as the current tokens, so only call it in contexts where these + /// are invalid. + bool + TryAnnotateTypeOrScopeToken(ImplicitTypenameContext AllowImplicitTypename = + ImplicitTypenameContext::No); - /// Identifier for "message". - IdentifierInfo *Ident_message; + /// Try to annotate a type or scope token, having already parsed an + /// optional scope specifier. \p IsNewScope should be \c true unless the scope + /// specifier was extracted from an existing tok::annot_cxxscope annotation. + bool TryAnnotateTypeOrScopeTokenAfterScopeSpec( + CXXScopeSpec &SS, bool IsNewScope, + ImplicitTypenameContext AllowImplicitTypename); - /// Identifier for "strict". - IdentifierInfo *Ident_strict; + /// TryAnnotateCXXScopeToken - Like TryAnnotateTypeOrScopeToken but only + /// annotates C++ scope specifiers and template-ids. This returns + /// true if there was an error that could not be recovered from. + /// + /// Note that this routine emits an error if you call it with ::new or + /// ::delete as the current tokens, so only call it in contexts where these + /// are invalid. + bool TryAnnotateCXXScopeToken(bool EnteringContext = false); - /// Identifier for "replacement". - IdentifierInfo *Ident_replacement; + bool MightBeCXXScopeToken() { + return getLangOpts().CPlusPlus && + (Tok.is(tok::identifier) || Tok.is(tok::coloncolon) || + (Tok.is(tok::annot_template_id) && + NextToken().is(tok::coloncolon)) || + Tok.is(tok::kw_decltype) || Tok.is(tok::kw___super)); + } + bool TryAnnotateOptionalCXXScopeToken(bool EnteringContext = false) { + return MightBeCXXScopeToken() && TryAnnotateCXXScopeToken(EnteringContext); + } - /// Identifier for "environment". 
- IdentifierInfo *Ident_environment; + //===--------------------------------------------------------------------===// + // Scope manipulation - /// Identifiers used by the 'external_source_symbol' attribute. - IdentifierInfo *Ident_language, *Ident_defined_in, - *Ident_generated_declaration, *Ident_USR; + /// ParseScope - Introduces a new scope for parsing. The kind of + /// scope is determined by ScopeFlags. Objects of this type should + /// be created on the stack to coincide with the position where the + /// parser enters the new scope, and this object's constructor will + /// create that new scope. Similarly, once the object is destroyed + /// the parser will exit the scope. + class ParseScope { + Parser *Self; + ParseScope(const ParseScope &) = delete; + void operator=(const ParseScope &) = delete; - /// C++11 contextual keywords. - mutable IdentifierInfo *Ident_final; - mutable IdentifierInfo *Ident_GNU_final; - mutable IdentifierInfo *Ident_override; - mutable IdentifierInfo *Ident_trivially_relocatable_if_eligible; - mutable IdentifierInfo *Ident_replaceable_if_eligible; + public: + // ParseScope - Construct a new object to manage a scope in the + // parser Self where the new Scope is created with the flags + // ScopeFlags, but only when we aren't about to enter a compound statement. + ParseScope(Parser *Self, unsigned ScopeFlags, bool EnteredScope = true, + bool BeforeCompoundStmt = false) + : Self(Self) { + if (EnteredScope && !BeforeCompoundStmt) + Self->EnterScope(ScopeFlags); + else { + if (BeforeCompoundStmt) + Self->incrementMSManglingNumber(); - // C++2a contextual keywords. - mutable IdentifierInfo *Ident_import; - mutable IdentifierInfo *Ident_module; + this->Self = nullptr; + } + } - // C++ type trait keywords that can be reverted to identifiers and still be - // used as type traits. - llvm::SmallDenseMap RevertibleTypeTraits; + // Exit - Exit the scope associated with this object now, rather + // than waiting until the object is destroyed. 
+ void Exit() { + if (Self) { + Self->ExitScope(); + Self = nullptr; + } + } - std::unique_ptr AlignHandler; - std::unique_ptr GCCVisibilityHandler; - std::unique_ptr OptionsHandler; - std::unique_ptr PackHandler; - std::unique_ptr MSStructHandler; - std::unique_ptr UnusedHandler; - std::unique_ptr WeakHandler; - std::unique_ptr RedefineExtnameHandler; - std::unique_ptr FPContractHandler; - std::unique_ptr OpenCLExtensionHandler; - std::unique_ptr OpenMPHandler; - std::unique_ptr OpenACCHandler; - std::unique_ptr PCSectionHandler; - std::unique_ptr MSCommentHandler; - std::unique_ptr MSDetectMismatchHandler; - std::unique_ptr FPEvalMethodHandler; - std::unique_ptr FloatControlHandler; - std::unique_ptr MSPointersToMembers; - std::unique_ptr MSVtorDisp; - std::unique_ptr MSInitSeg; - std::unique_ptr MSDataSeg; - std::unique_ptr MSBSSSeg; - std::unique_ptr MSConstSeg; - std::unique_ptr MSCodeSeg; - std::unique_ptr MSSection; - std::unique_ptr MSStrictGuardStackCheck; - std::unique_ptr MSRuntimeChecks; - std::unique_ptr MSIntrinsic; - std::unique_ptr MSFunction; - std::unique_ptr MSOptimize; - std::unique_ptr MSFenvAccess; - std::unique_ptr MSAllocText; - std::unique_ptr CUDAForceHostDeviceHandler; - std::unique_ptr OptimizeHandler; - std::unique_ptr LoopHintHandler; - std::unique_ptr UnrollHintHandler; - std::unique_ptr NoUnrollHintHandler; - std::unique_ptr UnrollAndJamHintHandler; - std::unique_ptr NoUnrollAndJamHintHandler; - std::unique_ptr FPHandler; - std::unique_ptr STDCFenvAccessHandler; - std::unique_ptr STDCFenvRoundHandler; - std::unique_ptr STDCCXLIMITHandler; - std::unique_ptr STDCUnknownHandler; - std::unique_ptr AttributePragmaHandler; - std::unique_ptr MaxTokensHerePragmaHandler; - std::unique_ptr MaxTokensTotalPragmaHandler; - std::unique_ptr RISCVPragmaHandler; + ~ParseScope() { Exit(); } + }; - std::unique_ptr CommentSemaHandler; + /// Introduces zero or more scopes for parsing. The scopes will all be exited + /// when the object is destroyed. 
+ class MultiParseScope { + Parser &Self; + unsigned NumScopes = 0; - /// Whether the '>' token acts as an operator or not. This will be - /// true except when we are parsing an expression within a C++ - /// template argument list, where the '>' closes the template - /// argument list. - bool GreaterThanIsOperator; + MultiParseScope(const MultiParseScope &) = delete; - /// ColonIsSacred - When this is false, we aggressively try to recover from - /// code like "foo : bar" as if it were a typo for "foo :: bar". This is not - /// safe in case statements and a few other things. This is managed by the - /// ColonProtectionRAIIObject RAII object. - bool ColonIsSacred; + public: + MultiParseScope(Parser &Self) : Self(Self) {} + void Enter(unsigned ScopeFlags) { + Self.EnterScope(ScopeFlags); + ++NumScopes; + } + void Exit() { + while (NumScopes) { + Self.ExitScope(); + --NumScopes; + } + } + ~MultiParseScope() { Exit(); } + }; - /// Parsing OpenMP directive mode. - bool OpenMPDirectiveParsing = false; + /// EnterScope - Start a new scope. + void EnterScope(unsigned ScopeFlags); - /// Parsing OpenACC directive mode. - bool OpenACCDirectiveParsing = false; + /// ExitScope - Pop a scope off the scope stack. + void ExitScope(); - /// Currently parsing a situation where an OpenACC array section could be - /// legal, such as a 'var-list'. - bool AllowOpenACCArraySections = false; + //===--------------------------------------------------------------------===// + // Diagnostic Emission and Error recovery. - /// RAII object to set reset OpenACC parsing a context where Array Sections - /// are allowed. 
- class OpenACCArraySectionRAII { - Parser &P; + DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID); + DiagnosticBuilder Diag(const Token &Tok, unsigned DiagID); + DiagnosticBuilder Diag(unsigned DiagID) { return Diag(Tok, DiagID); } - public: - OpenACCArraySectionRAII(Parser &P) : P(P) { - assert(!P.AllowOpenACCArraySections); - P.AllowOpenACCArraySections = true; - } - ~OpenACCArraySectionRAII() { - assert(P.AllowOpenACCArraySections); - P.AllowOpenACCArraySections = false; - } + DiagnosticBuilder DiagCompat(SourceLocation Loc, unsigned CompatDiagId); + DiagnosticBuilder DiagCompat(const Token &Tok, unsigned CompatDiagId); + DiagnosticBuilder DiagCompat(unsigned CompatDiagId) { + return DiagCompat(Tok, CompatDiagId); + } + + /// Control flags for SkipUntil functions. + enum SkipUntilFlags { + StopAtSemi = 1 << 0, ///< Stop skipping at semicolon + /// Stop skipping at specified token, but don't skip the token itself + StopBeforeMatch = 1 << 1, + StopAtCodeCompletion = 1 << 2 ///< Stop at code completion }; - /// When true, we are directly inside an Objective-C message - /// send expression. + friend constexpr SkipUntilFlags operator|(SkipUntilFlags L, + SkipUntilFlags R) { + return static_cast<SkipUntilFlags>(static_cast<unsigned>(L) | + static_cast<unsigned>(R)); + } + + /// SkipUntil - Read tokens until we get to the specified token, then consume + /// it (unless StopBeforeMatch is specified). Because we cannot guarantee + /// that the token will ever occur, this skips to the next token, or to some + /// likely good stopping point. If Flags has StopAtSemi flag, skipping will + /// stop at a ';' character. Balances (), [], and {} delimiter tokens while + /// skipping. /// - /// This is managed by the \c InMessageExpressionRAIIObject class, and - /// should not be set directly. - bool InMessageExpression; + /// If SkipUntil finds the specified token, it returns true, otherwise it + /// returns false. 
+ bool SkipUntil(tok::TokenKind T, + SkipUntilFlags Flags = static_cast<SkipUntilFlags>(0)) { + return SkipUntil(llvm::ArrayRef(T), Flags); + } + bool SkipUntil(tok::TokenKind T1, tok::TokenKind T2, + SkipUntilFlags Flags = static_cast<SkipUntilFlags>(0)) { + tok::TokenKind TokArray[] = {T1, T2}; + return SkipUntil(TokArray, Flags); + } + bool SkipUntil(tok::TokenKind T1, tok::TokenKind T2, tok::TokenKind T3, + SkipUntilFlags Flags = static_cast<SkipUntilFlags>(0)) { + tok::TokenKind TokArray[] = {T1, T2, T3}; + return SkipUntil(TokArray, Flags); + } - /// Gets set to true after calling ProduceSignatureHelp, it is for a - /// workaround to make sure ProduceSignatureHelp is only called at the deepest - /// function call. - bool CalledSignatureHelp = false; + /// SkipUntil - Read tokens until we get to the specified token, then consume + /// it (unless no flag StopBeforeMatch). Because we cannot guarantee that the + /// token will ever occur, this skips to the next token, or to some likely + /// good stopping point. If StopAtSemi is true, skipping will stop at a ';' + /// character. + /// + /// If SkipUntil finds the specified token, it returns true, otherwise it + /// returns false. + bool SkipUntil(ArrayRef<tok::TokenKind> Toks, + SkipUntilFlags Flags = static_cast<SkipUntilFlags>(0)); - OffsetOfKind OffsetOfState = OffsetOfKind::Outside; +private: + Preprocessor &PP; - /// The "depth" of the template parameters currently being parsed. - unsigned TemplateParameterDepth; + /// Tok - The current token we are peeking ahead. All parsing methods assume + /// that this is valid. + Token Tok; - /// Current kind of OpenMP clause - OpenMPClauseKind OMPClauseKind = llvm::omp::OMPC_unknown; + // PrevTokLocation - The location of the token we previously + // consumed. This token is used for diagnostics where we expected to + // see a token following another token (e.g., the ';' at the end of + // a statement). + SourceLocation PrevTokLocation; - /// RAII class that manages the template parameter depth. 
- class TemplateParameterDepthRAII { - unsigned &Depth; - unsigned AddedLevels; - public: - explicit TemplateParameterDepthRAII(unsigned &Depth) - : Depth(Depth), AddedLevels(0) {} + /// Tracks an expected type for the current token when parsing an expression. + /// Used by code completion for ranking. + PreferredTypeBuilder PreferredType; - ~TemplateParameterDepthRAII() { - Depth -= AddedLevels; - } + unsigned short ParenCount = 0, BracketCount = 0, BraceCount = 0; + unsigned short MisplacedModuleBeginCount = 0; - void operator++() { - ++Depth; - ++AddedLevels; - } - void addDepth(unsigned D) { - Depth += D; - AddedLevels += D; - } - void setAddedDepth(unsigned D) { - Depth = Depth - AddedLevels + D; - AddedLevels = D; - } + /// Actions - These are the callbacks we invoke as we parse various constructs + /// in the file. + Sema &Actions; - unsigned getDepth() const { return Depth; } - unsigned getOriginalDepth() const { return Depth - AddedLevels; } - }; + DiagnosticsEngine &Diags; - /// Factory object for creating ParsedAttr objects. - AttributeFactory AttrFactory; + StackExhaustionHandler StackHandler; - /// Gathers and cleans up TemplateIdAnnotations when parsing of a - /// top-level declaration is finished. - SmallVector TemplateIds; + /// ScopeCache - Cache scopes to reduce malloc traffic. + static constexpr int ScopeCacheSize = 16; + unsigned NumCachedScopes; + Scope *ScopeCache[ScopeCacheSize]; - /// Don't destroy template annotations in MaybeDestroyTemplateIds even if - /// we're at the end of a declaration. Instead, we defer the destruction until - /// after a top-level declaration. - /// Use DelayTemplateIdDestructionRAII rather than setting it directly. - bool DelayTemplateIdDestruction = false; + /// Identifiers used for SEH handling in Borland. 
These are only + /// allowed in particular circumstances + // __except block + IdentifierInfo *Ident__exception_code, *Ident___exception_code, + *Ident_GetExceptionCode; + // __except filter expression + IdentifierInfo *Ident__exception_info, *Ident___exception_info, + *Ident_GetExceptionInfo; + // __finally + IdentifierInfo *Ident__abnormal_termination, *Ident___abnormal_termination, + *Ident_AbnormalTermination; - void MaybeDestroyTemplateIds() { - if (DelayTemplateIdDestruction) - return; - if (!TemplateIds.empty() && - (Tok.is(tok::eof) || !PP.mightHavePendingAnnotationTokens())) - DestroyTemplateIds(); - } - void DestroyTemplateIds(); + /// Contextual keywords for Microsoft extensions. + IdentifierInfo *Ident__except; - /// RAII object to destroy TemplateIdAnnotations where possible, from a - /// likely-good position during parsing. - struct DestroyTemplateIdAnnotationsRAIIObj { - Parser &Self; + // C++2a contextual keywords. + mutable IdentifierInfo *Ident_import; + mutable IdentifierInfo *Ident_module; - DestroyTemplateIdAnnotationsRAIIObj(Parser &Self) : Self(Self) {} - ~DestroyTemplateIdAnnotationsRAIIObj() { Self.MaybeDestroyTemplateIds(); } - }; + std::unique_ptr<CommentHandler> CommentSemaHandler; - struct DelayTemplateIdDestructionRAII { - Parser &Self; - bool PrevDelayTemplateIdDestruction; + /// Gets set to true after calling ProduceSignatureHelp, it is for a + /// workaround to make sure ProduceSignatureHelp is only called at the deepest + /// function call. + bool CalledSignatureHelp = false; - DelayTemplateIdDestructionRAII(Parser &Self, - bool DelayTemplateIdDestruction) noexcept - : Self(Self), - PrevDelayTemplateIdDestruction(Self.DelayTemplateIdDestruction) { - Self.DelayTemplateIdDestruction = DelayTemplateIdDestruction; - } + IdentifierInfo *getSEHExceptKeyword(); - ~DelayTemplateIdDestructionRAII() noexcept { - Self.DelayTemplateIdDestruction = PrevDelayTemplateIdDestruction; - } - }; + /// Whether to skip parsing of function bodies. 
+ /// + /// This option can be used, for example, to speed up searches for + /// declarations/definitions when indexing. + bool SkipFunctionBodies; - /// Identifiers which have been declared within a tentative parse. - SmallVector TentativelyDeclaredIdentifiers; - - /// Tracker for '<' tokens that might have been intended to be treated as an - /// angle bracket instead of a less-than comparison. - /// - /// This happens when the user intends to form a template-id, but typoes the - /// template-name or forgets a 'template' keyword for a dependent template - /// name. - /// - /// We track these locations from the point where we see a '<' with a - /// name-like expression on its left until we see a '>' or '>>' that might - /// match it. - struct AngleBracketTracker { - /// Flags used to rank candidate template names when there is more than one - /// '<' in a scope. - enum Priority : unsigned short { - /// A non-dependent name that is a potential typo for a template name. - PotentialTypo = 0x0, - /// A dependent name that might instantiate to a template-name. - DependentName = 0x2, - - /// A space appears before the '<' token. - SpaceBeforeLess = 0x0, - /// No space before the '<' token - NoSpaceBeforeLess = 0x1, - - LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue*/ DependentName) - }; - - struct Loc { - Expr *TemplateName; - SourceLocation LessLoc; - AngleBracketTracker::Priority Priority; - unsigned short ParenCount, BracketCount, BraceCount; - - bool isActive(Parser &P) const { - return P.ParenCount == ParenCount && P.BracketCount == BracketCount && - P.BraceCount == BraceCount; - } - - bool isActiveOrNested(Parser &P) const { - return isActive(P) || P.ParenCount > ParenCount || - P.BracketCount > BracketCount || P.BraceCount > BraceCount; - } - }; - - SmallVector Locs; - - /// Add an expression that might have been intended to be a template name. 
- /// In the case of ambiguity, we arbitrarily select the innermost such - /// expression, for example in 'foo < bar < baz', 'bar' is the current - /// candidate. No attempt is made to track that 'foo' is also a candidate - /// for the case where we see a second suspicious '>' token. - void add(Parser &P, Expr *TemplateName, SourceLocation LessLoc, - Priority Prio) { - if (!Locs.empty() && Locs.back().isActive(P)) { - if (Locs.back().Priority <= Prio) { - Locs.back().TemplateName = TemplateName; - Locs.back().LessLoc = LessLoc; - Locs.back().Priority = Prio; - } - } else { - Locs.push_back({TemplateName, LessLoc, Prio, - P.ParenCount, P.BracketCount, P.BraceCount}); - } - } - - /// Mark the current potential missing template location as having been - /// handled (this happens if we pass a "corresponding" '>' or '>>' token - /// or leave a bracket scope). - void clear(Parser &P) { - while (!Locs.empty() && Locs.back().isActiveOrNested(P)) - Locs.pop_back(); - } - - /// Get the current enclosing expression that might hve been intended to be - /// a template name. - Loc *getCurrent(Parser &P) { - if (!Locs.empty() && Locs.back().isActive(P)) - return &Locs.back(); - return nullptr; - } - }; - - AngleBracketTracker AngleBrackets; - - IdentifierInfo *getSEHExceptKeyword(); - - /// True if we are within an Objective-C container while parsing C-like decls. - /// - /// This is necessary because Sema thinks we have left the container - /// to parse the C-like decls, meaning Actions.ObjC().getObjCDeclContext() - /// will be NULL. - bool ParsingInObjCContainer; - - /// Whether to skip parsing of function bodies. - /// - /// This option can be used, for example, to speed up searches for - /// declarations/definitions when indexing. - bool SkipFunctionBodies; - - /// The location of the expression statement that is being parsed right now. - /// Used to determine if an expression that is being parsed is a statement or - /// just a regular sub-expression. 
- SourceLocation ExprStatementTokLoc; - - /// Flags describing a context in which we're parsing a statement. - enum class ParsedStmtContext { - /// This context permits declarations in language modes where declarations - /// are not statements. - AllowDeclarationsInC = 0x1, - /// This context permits standalone OpenMP directives. - AllowStandaloneOpenMPDirectives = 0x2, - /// This context is at the top level of a GNU statement expression. - InStmtExpr = 0x4, - - /// The context of a regular substatement. - SubStmt = 0, - /// The context of a compound-statement. - Compound = AllowDeclarationsInC | AllowStandaloneOpenMPDirectives, - - LLVM_MARK_AS_BITMASK_ENUM(InStmtExpr) - }; - - /// Act on an expression statement that might be the last statement in a - /// GNU statement expression. Checks whether we are actually at the end of - /// a statement expression and builds a suitable expression statement. - StmtResult handleExprStmt(ExprResult E, ParsedStmtContext StmtCtx); - -public: - Parser(Preprocessor &PP, Sema &Actions, bool SkipFunctionBodies); - ~Parser() override; - - const LangOptions &getLangOpts() const { return PP.getLangOpts(); } - const TargetInfo &getTargetInfo() const { return PP.getTargetInfo(); } - Preprocessor &getPreprocessor() const { return PP; } - Sema &getActions() const { return Actions; } - AttributeFactory &getAttrFactory() { return AttrFactory; } - - const Token &getCurToken() const { return Tok; } - Scope *getCurScope() const { return Actions.getCurScope(); } - void incrementMSManglingNumber() const { - return Actions.incrementMSManglingNumber(); - } - - ObjCContainerDecl *getObjCDeclContext() const { - return Actions.ObjC().getObjCDeclContext(); - } - - // Type forwarding. All of these are statically 'void*', but they may all be - // different actual classes based on the actions in place. 
- typedef OpaquePtr DeclGroupPtrTy; - typedef OpaquePtr TemplateTy; - - typedef SmallVector TemplateParameterLists; - - typedef Sema::FullExprArg FullExprArg; - - /// A SmallVector of statements. - typedef SmallVector StmtVector; - - // Parsing methods. - - /// Initialize - Warm up the parser. - /// - void Initialize(); - - /// Parse the first top-level declaration in a translation unit. - bool ParseFirstTopLevelDecl(DeclGroupPtrTy &Result, - Sema::ModuleImportState &ImportState); - - /// ParseTopLevelDecl - Parse one top-level declaration. Returns true if - /// the EOF was encountered. - bool ParseTopLevelDecl(DeclGroupPtrTy &Result, - Sema::ModuleImportState &ImportState); - bool ParseTopLevelDecl() { - DeclGroupPtrTy Result; - Sema::ModuleImportState IS = Sema::ModuleImportState::NotACXX20Module; - return ParseTopLevelDecl(Result, IS); - } - - /// ConsumeToken - Consume the current 'peek token' and lex the next one. - /// This does not work with special tokens: string literals, code completion, - /// annotation tokens and balanced tokens must be handled using the specific - /// consume methods. - /// Returns the location of the consumed token. - SourceLocation ConsumeToken() { - assert(!isTokenSpecial() && - "Should consume special tokens with Consume*Token"); - PrevTokLocation = Tok.getLocation(); - PP.Lex(Tok); - return PrevTokLocation; - } - - bool TryConsumeToken(tok::TokenKind Expected) { - if (Tok.isNot(Expected)) - return false; - assert(!isTokenSpecial() && - "Should consume special tokens with Consume*Token"); - PrevTokLocation = Tok.getLocation(); - PP.Lex(Tok); - return true; - } - - bool TryConsumeToken(tok::TokenKind Expected, SourceLocation &Loc) { - if (!TryConsumeToken(Expected)) - return false; - Loc = PrevTokLocation; - return true; - } - - /// ConsumeAnyToken - Dispatch to the right Consume* method based on the - /// current token type. This should only be used in cases where the type of - /// the token really isn't known, e.g. 
in error recovery. - SourceLocation ConsumeAnyToken(bool ConsumeCodeCompletionTok = false) { - if (isTokenParen()) - return ConsumeParen(); - if (isTokenBracket()) - return ConsumeBracket(); - if (isTokenBrace()) - return ConsumeBrace(); - if (isTokenStringLiteral()) - return ConsumeStringToken(); - if (Tok.is(tok::code_completion)) - return ConsumeCodeCompletionTok ? ConsumeCodeCompletionToken() - : handleUnexpectedCodeCompletionToken(); - if (Tok.isAnnotation()) - return ConsumeAnnotationToken(); - return ConsumeToken(); - } - - - SourceLocation getEndOfPreviousToken() { - return PP.getLocForEndOfToken(PrevTokLocation); - } - - /// Retrieve the underscored keyword (_Nonnull, _Nullable) that corresponds - /// to the given nullability kind. - IdentifierInfo *getNullabilityKeyword(NullabilityKind nullability) { - return Actions.getNullabilityKeyword(nullability); - } - -private: - //===--------------------------------------------------------------------===// - // Low-Level token peeking and consumption methods. - // + //===--------------------------------------------------------------------===// + // Low-Level token peeking and consumption methods. + // /// isTokenParen - Return true if the cur token is '(' or ')'. - bool isTokenParen() const { - return Tok.isOneOf(tok::l_paren, tok::r_paren); - } + bool isTokenParen() const { return Tok.isOneOf(tok::l_paren, tok::r_paren); } /// isTokenBracket - Return true if the cur token is '[' or ']'. bool isTokenBracket() const { return Tok.isOneOf(tok::l_square, tok::r_square); } /// isTokenBrace - Return true if the cur token is '{' or '}'. - bool isTokenBrace() const { - return Tok.isOneOf(tok::l_brace, tok::r_brace); - } + bool isTokenBrace() const { return Tok.isOneOf(tok::l_brace, tok::r_brace); } /// isTokenStringLiteral - True if this token is a string-literal. 
bool isTokenStringLiteral() const { return tok::isStringLiteral(Tok.getKind()); @@ -737,10 +602,10 @@ class Parser : public CodeCompletionHandler { /// Return the current token to the token stream and make the given /// token the current token. void UnconsumeToken(Token &Consumed) { - Token Next = Tok; - PP.EnterToken(Consumed, /*IsReinject*/true); - PP.Lex(Tok); - PP.EnterToken(Next, /*IsReinject*/true); + Token Next = Tok; + PP.EnterToken(Consumed, /*IsReinject*/ true); + PP.Lex(Tok); + PP.EnterToken(Next, /*IsReinject*/ true); } SourceLocation ConsumeAnnotationToken() { @@ -759,7 +624,7 @@ class Parser : public CodeCompletionHandler { ++ParenCount; else if (ParenCount) { AngleBrackets.clear(*this); - --ParenCount; // Don't let unbalanced )'s drive the count negative. + --ParenCount; // Don't let unbalanced )'s drive the count negative. } PrevTokLocation = Tok.getLocation(); PP.Lex(Tok); @@ -774,7 +639,7 @@ class Parser : public CodeCompletionHandler { ++BracketCount; else if (BracketCount) { AngleBrackets.clear(*this); - --BracketCount; // Don't let unbalanced ]'s drive the count negative. + --BracketCount; // Don't let unbalanced ]'s drive the count negative. } PrevTokLocation = Tok.getLocation(); @@ -790,7 +655,7 @@ class Parser : public CodeCompletionHandler { ++BraceCount; else if (BraceCount) { AngleBrackets.clear(*this); - --BraceCount; // Don't let unbalanced }'s drive the count negative. + --BraceCount; // Don't let unbalanced }'s drive the count negative. } PrevTokLocation = Tok.getLocation(); @@ -847,158 +712,22 @@ class Parser : public CodeCompletionHandler { Kind == tok::annot_repl_input_end; } - /// Checks if the \p Level is valid for use in a fold expression. - bool isFoldOperator(prec::Level Level) const; - - /// Checks if the \p Kind is a valid operator for fold expressions. 
- bool isFoldOperator(tok::TokenKind Kind) const; + static void setTypeAnnotation(Token &Tok, TypeResult T) { + assert((T.isInvalid() || T.get()) && + "produced a valid-but-null type annotation?"); + Tok.setAnnotationValue(T.isInvalid() ? nullptr : T.get().getAsOpaquePtr()); + } - /// Initialize all pragma handlers. - void initializePragmaHandlers(); - - /// Destroy and reset all pragma handlers. - void resetPragmaHandlers(); - - /// Handle the annotation token produced for #pragma unused(...) - void HandlePragmaUnused(); - - /// Handle the annotation token produced for - /// #pragma GCC visibility... - void HandlePragmaVisibility(); - - /// Handle the annotation token produced for - /// #pragma pack... - void HandlePragmaPack(); - - /// Handle the annotation token produced for - /// #pragma ms_struct... - void HandlePragmaMSStruct(); - - void HandlePragmaMSPointersToMembers(); - - void HandlePragmaMSVtorDisp(); - - void HandlePragmaMSPragma(); - bool HandlePragmaMSSection(StringRef PragmaName, - SourceLocation PragmaLocation); - bool HandlePragmaMSSegment(StringRef PragmaName, - SourceLocation PragmaLocation); - bool HandlePragmaMSInitSeg(StringRef PragmaName, - SourceLocation PragmaLocation); - bool HandlePragmaMSStrictGuardStackCheck(StringRef PragmaName, - SourceLocation PragmaLocation); - bool HandlePragmaMSFunction(StringRef PragmaName, - SourceLocation PragmaLocation); - bool HandlePragmaMSAllocText(StringRef PragmaName, - SourceLocation PragmaLocation); - bool HandlePragmaMSOptimize(StringRef PragmaName, - SourceLocation PragmaLocation); - - /// Handle the annotation token produced for - /// #pragma align... - void HandlePragmaAlign(); - - /// Handle the annotation token produced for - /// #pragma clang __debug dump... - void HandlePragmaDump(); - - /// Handle the annotation token produced for - /// #pragma weak id... - void HandlePragmaWeak(); - - /// Handle the annotation token produced for - /// #pragma weak id = id... 
- void HandlePragmaWeakAlias(); - - /// Handle the annotation token produced for - /// #pragma redefine_extname... - void HandlePragmaRedefineExtname(); - - /// Handle the annotation token produced for - /// #pragma STDC FP_CONTRACT... - void HandlePragmaFPContract(); - - /// Handle the annotation token produced for - /// #pragma STDC FENV_ACCESS... - void HandlePragmaFEnvAccess(); - - /// Handle the annotation token produced for - /// #pragma STDC FENV_ROUND... - void HandlePragmaFEnvRound(); - - /// Handle the annotation token produced for - /// #pragma STDC CX_LIMITED_RANGE... - void HandlePragmaCXLimitedRange(); - - /// Handle the annotation token produced for - /// #pragma float_control - void HandlePragmaFloatControl(); - - /// \brief Handle the annotation token produced for - /// #pragma clang fp ... - void HandlePragmaFP(); - - /// Handle the annotation token produced for - /// #pragma OPENCL EXTENSION... - void HandlePragmaOpenCLExtension(); - - /// Handle the annotation token produced for - /// #pragma clang __debug captured - StmtResult HandlePragmaCaptured(); - - /// Handle the annotation token produced for - /// #pragma clang loop and #pragma unroll. - bool HandlePragmaLoopHint(LoopHint &Hint); - - bool ParsePragmaAttributeSubjectMatchRuleSet( - attr::ParsedSubjectMatchRuleSet &SubjectMatchRules, - SourceLocation &AnyLoc, SourceLocation &LastMatchRuleEndLoc); - - void HandlePragmaAttribute(); - - /// GetLookAheadToken - This peeks ahead N tokens and returns that token - /// without consuming any tokens. LookAhead(0) returns 'Tok', LookAhead(1) - /// returns the token after Tok, etc. - /// - /// Note that this differs from the Preprocessor's LookAhead method, because - /// the Parser always has one token lexed that the preprocessor doesn't. 
- /// - const Token &GetLookAheadToken(unsigned N) { - if (N == 0 || Tok.is(tok::eof)) return Tok; - return PP.LookAhead(N-1); - } - -public: - /// NextToken - This peeks ahead one token and returns it without - /// consuming it. - const Token &NextToken() { - return PP.LookAhead(0); - } - - /// getTypeAnnotation - Read a parsed type out of an annotation token. - static TypeResult getTypeAnnotation(const Token &Tok) { - if (!Tok.getAnnotationValue()) - return TypeError(); - return ParsedType::getFromOpaquePtr(Tok.getAnnotationValue()); - } - -private: - static void setTypeAnnotation(Token &Tok, TypeResult T) { - assert((T.isInvalid() || T.get()) && - "produced a valid-but-null type annotation?"); - Tok.setAnnotationValue(T.isInvalid() ? nullptr : T.get().getAsOpaquePtr()); - } - - static NamedDecl *getNonTypeAnnotation(const Token &Tok) { - return static_cast(Tok.getAnnotationValue()); - } + static NamedDecl *getNonTypeAnnotation(const Token &Tok) { + return static_cast(Tok.getAnnotationValue()); + } static void setNonTypeAnnotation(Token &Tok, NamedDecl *ND) { Tok.setAnnotationValue(ND); } static IdentifierInfo *getIdentifierAnnotation(const Token &Tok) { - return static_cast(Tok.getAnnotationValue()); + return static_cast(Tok.getAnnotationValue()); } static void setIdentifierAnnotation(Token &Tok, IdentifierInfo *ND) { @@ -1017,29 +746,16 @@ class Parser : public CodeCompletionHandler { Tok.setAnnotationValue(ER.getAsOpaquePointer()); } -public: - // If NeedType is true, then TryAnnotateTypeOrScopeToken will try harder to - // find a type name by attempting typo correction. 
- bool - TryAnnotateTypeOrScopeToken(ImplicitTypenameContext AllowImplicitTypename = - ImplicitTypenameContext::No); - bool TryAnnotateTypeOrScopeTokenAfterScopeSpec( - CXXScopeSpec &SS, bool IsNewScope, - ImplicitTypenameContext AllowImplicitTypename); - bool TryAnnotateCXXScopeToken(bool EnteringContext = false); - - bool MightBeCXXScopeToken() { - return getLangOpts().CPlusPlus && - (Tok.is(tok::identifier) || Tok.is(tok::coloncolon) || - (Tok.is(tok::annot_template_id) && - NextToken().is(tok::coloncolon)) || - Tok.is(tok::kw_decltype) || Tok.is(tok::kw___super)); - } - bool TryAnnotateOptionalCXXScopeToken(bool EnteringContext = false) { - return MightBeCXXScopeToken() && TryAnnotateCXXScopeToken(EnteringContext); - } - -private: + /// Attempt to classify the name at the current token position. This may + /// form a type, scope or primary expression annotation, or replace the token + /// with a typo-corrected keyword. This is only appropriate when the current + /// name must refer to an entity which has already been declared. + /// + /// \param CCC Indicates how to perform typo-correction for this name. If + /// NULL, no typo correction will be performed. + /// \param AllowImplicitTypename Whether we are in a context where a dependent + /// nested-name-specifier without typename is treated as a type (e.g. + /// T::type). AnnotatedNameKind TryAnnotateName(CorrectionCandidateCallback *CCC = nullptr, ImplicitTypenameContext AllowImplicitTypename = @@ -1048,50 +764,6 @@ class Parser : public CodeCompletionHandler { /// Push a tok::annot_cxxscope token onto the token stream. void AnnotateScopeToken(CXXScopeSpec &SS, bool IsNewAnnotation); - /// TryAltiVecToken - Check for context-sensitive AltiVec identifier tokens, - /// replacing them with the non-context-sensitive keywords. This returns - /// true if the token was replaced. 
- bool TryAltiVecToken(DeclSpec &DS, SourceLocation Loc, - const char *&PrevSpec, unsigned &DiagID, - bool &isInvalid) { - if (!getLangOpts().AltiVec && !getLangOpts().ZVector) - return false; - - if (Tok.getIdentifierInfo() != Ident_vector && - Tok.getIdentifierInfo() != Ident_bool && - Tok.getIdentifierInfo() != Ident_Bool && - (!getLangOpts().AltiVec || Tok.getIdentifierInfo() != Ident_pixel)) - return false; - - return TryAltiVecTokenOutOfLine(DS, Loc, PrevSpec, DiagID, isInvalid); - } - - /// TryAltiVecVectorToken - Check for context-sensitive AltiVec vector - /// identifier token, replacing it with the non-context-sensitive __vector. - /// This returns true if the token was replaced. - bool TryAltiVecVectorToken() { - if ((!getLangOpts().AltiVec && !getLangOpts().ZVector) || - Tok.getIdentifierInfo() != Ident_vector) return false; - return TryAltiVecVectorTokenOutOfLine(); - } - - bool TryAltiVecVectorTokenOutOfLine(); - bool TryAltiVecTokenOutOfLine(DeclSpec &DS, SourceLocation Loc, - const char *&PrevSpec, unsigned &DiagID, - bool &isInvalid); - - /// Returns true if the current token is the identifier 'instancetype'. - /// - /// Should only be used in Objective-C language modes. - bool isObjCInstancetype() { - assert(getLangOpts().ObjC); - if (Tok.isAnnotation()) - return false; - if (!Ident_instancetype) - Ident_instancetype = PP.getIdentifierInfo("instancetype"); - return Tok.getIdentifierInfo() == Ident_instancetype; - } - /// TryKeywordIdentFallback - For compatibility with system headers using /// keywords as identifiers, attempt to convert the current token to an /// identifier and optionally disable the keyword for the remainder of the @@ -1099,113 +771,29 @@ class Parser : public CodeCompletionHandler { /// otherwise emits a diagnostic and returns true. bool TryKeywordIdentFallback(bool DisableKeyword); - /// Get the TemplateIdAnnotation from the token. 
+ /// Get the TemplateIdAnnotation from the token and put it in the + /// cleanup pool so that it gets destroyed when parsing the current top level + /// declaration is finished. TemplateIdAnnotation *takeTemplateIdAnnotation(const Token &tok); - /// TentativeParsingAction - An object that is used as a kind of "tentative - /// parsing transaction". It gets instantiated to mark the token position and - /// after the token consumption is done, Commit() or Revert() is called to - /// either "commit the consumed tokens" or revert to the previously marked - /// token position. Example: + /// ExpectAndConsume - The parser expects that 'ExpectedTok' is next in the + /// input. If so, it is consumed and false is returned. /// - /// TentativeParsingAction TPA(*this); - /// ConsumeToken(); - /// .... - /// TPA.Revert(); + /// If a trivial punctuator misspelling is encountered, a FixIt error + /// diagnostic is issued and false is returned after recovery. /// - /// If the Unannotated parameter is true, any token annotations created - /// during the tentative parse are reverted. 
- class TentativeParsingAction { - Parser &P; - PreferredTypeBuilder PrevPreferredType; - Token PrevTok; - size_t PrevTentativelyDeclaredIdentifierCount; - unsigned short PrevParenCount, PrevBracketCount, PrevBraceCount; - bool isActive; - - public: - explicit TentativeParsingAction(Parser &p, bool Unannotated = false) - : P(p), PrevPreferredType(P.PreferredType) { - PrevTok = P.Tok; - PrevTentativelyDeclaredIdentifierCount = - P.TentativelyDeclaredIdentifiers.size(); - PrevParenCount = P.ParenCount; - PrevBracketCount = P.BracketCount; - PrevBraceCount = P.BraceCount; - P.PP.EnableBacktrackAtThisPos(Unannotated); - isActive = true; - } - void Commit() { - assert(isActive && "Parsing action was finished!"); - P.TentativelyDeclaredIdentifiers.resize( - PrevTentativelyDeclaredIdentifierCount); - P.PP.CommitBacktrackedTokens(); - isActive = false; - } - void Revert() { - assert(isActive && "Parsing action was finished!"); - P.PP.Backtrack(); - P.PreferredType = PrevPreferredType; - P.Tok = PrevTok; - P.TentativelyDeclaredIdentifiers.resize( - PrevTentativelyDeclaredIdentifierCount); - P.ParenCount = PrevParenCount; - P.BracketCount = PrevBracketCount; - P.BraceCount = PrevBraceCount; - isActive = false; - } - ~TentativeParsingAction() { - assert(!isActive && "Forgot to call Commit or Revert!"); - } - }; - /// A TentativeParsingAction that automatically reverts in its destructor. - /// Useful for disambiguation parses that will always be reverted. - class RevertingTentativeParsingAction - : private Parser::TentativeParsingAction { - public: - using TentativeParsingAction::TentativeParsingAction; - - ~RevertingTentativeParsingAction() { Revert(); } - }; - - /// ObjCDeclContextSwitch - An object used to switch context from - /// an objective-c decl context to its enclosing decl context and - /// back. 
- class ObjCDeclContextSwitch { - Parser &P; - ObjCContainerDecl *DC; - SaveAndRestore WithinObjCContainer; - public: - explicit ObjCDeclContextSwitch(Parser &p) - : P(p), DC(p.getObjCDeclContext()), - WithinObjCContainer(P.ParsingInObjCContainer, DC != nullptr) { - if (DC) - P.Actions.ObjC().ActOnObjCTemporaryExitContainerContext(DC); - } - ~ObjCDeclContextSwitch() { - if (DC) - P.Actions.ObjC().ActOnObjCReenterContainerContext(DC); - } - }; - - /// ExpectAndConsume - The parser expects that 'ExpectedTok' is next in the - /// input. If so, it is consumed and false is returned. - /// - /// If a trivial punctuator misspelling is encountered, a FixIt error - /// diagnostic is issued and false is returned after recovery. - /// - /// If the input is malformed, this emits the specified diagnostic and true is - /// returned. - bool ExpectAndConsume(tok::TokenKind ExpectedTok, - unsigned Diag = diag::err_expected, - StringRef DiagMsg = ""); + /// If the input is malformed, this emits the specified diagnostic and true is + /// returned. + bool ExpectAndConsume(tok::TokenKind ExpectedTok, + unsigned Diag = diag::err_expected, + StringRef DiagMsg = ""); /// The parser expects a semicolon and, if present, will consume it. /// /// If the next token is not a semicolon, this emits the specified diagnostic, /// or, if there's just some closing-delimiter noise (e.g., ')' or ']') prior /// to the semicolon, consumes that extra token. - bool ExpectAndConsumeSemi(unsigned DiagID , StringRef TokenUsed = ""); + bool ExpectAndConsumeSemi(unsigned DiagID, StringRef TokenUsed = ""); /// Consume any extra semi-colons until the end of the line. 
void ConsumeExtraSemi(ExtraSemiKind Kind, DeclSpec::TST T = TST_unspecified); @@ -1239,87 +827,6 @@ class Parser : public CodeCompletionHandler { void diagnoseUseOfC11Keyword(const Token &Tok); -public: - //===--------------------------------------------------------------------===// - // Scope manipulation - - /// ParseScope - Introduces a new scope for parsing. The kind of - /// scope is determined by ScopeFlags. Objects of this type should - /// be created on the stack to coincide with the position where the - /// parser enters the new scope, and this object's constructor will - /// create that new scope. Similarly, once the object is destroyed - /// the parser will exit the scope. - class ParseScope { - Parser *Self; - ParseScope(const ParseScope &) = delete; - void operator=(const ParseScope &) = delete; - - public: - // ParseScope - Construct a new object to manage a scope in the - // parser Self where the new Scope is created with the flags - // ScopeFlags, but only when we aren't about to enter a compound statement. - ParseScope(Parser *Self, unsigned ScopeFlags, bool EnteredScope = true, - bool BeforeCompoundStmt = false) - : Self(Self) { - if (EnteredScope && !BeforeCompoundStmt) - Self->EnterScope(ScopeFlags); - else { - if (BeforeCompoundStmt) - Self->incrementMSManglingNumber(); - - this->Self = nullptr; - } - } - - // Exit - Exit the scope associated with this object now, rather - // than waiting until the object is destroyed. - void Exit() { - if (Self) { - Self->ExitScope(); - Self = nullptr; - } - } - - ~ParseScope() { - Exit(); - } - }; - - /// Introduces zero or more scopes for parsing. The scopes will all be exited - /// when the object is destroyed. 
- class MultiParseScope { - Parser &Self; - unsigned NumScopes = 0; - - MultiParseScope(const MultiParseScope&) = delete; - - public: - MultiParseScope(Parser &Self) : Self(Self) {} - void Enter(unsigned ScopeFlags) { - Self.EnterScope(ScopeFlags); - ++NumScopes; - } - void Exit() { - while (NumScopes) { - Self.ExitScope(); - --NumScopes; - } - } - ~MultiParseScope() { - Exit(); - } - }; - - /// EnterScope - Start a new scope. - void EnterScope(unsigned ScopeFlags); - - /// ExitScope - Pop a scope off the scope stack. - void ExitScope(); - - /// Re-enter the template scopes for a declaration that might be a template. - unsigned ReenterTemplateScopes(MultiParseScope &S, Decl *D); - -private: /// RAII object used to modify the scope flags for the current scope. class ParseScopeFlags { Scope *CurScope; @@ -1328,119 +835,297 @@ class Parser : public CodeCompletionHandler { void operator=(const ParseScopeFlags &) = delete; public: + /// Set the flags for the current scope to ScopeFlags. If ManageFlags is + /// false, this object does nothing. ParseScopeFlags(Parser *Self, unsigned ScopeFlags, bool ManageFlags = true); + + /// Restore the flags for the current scope to what they were before this + /// object overrode them. ~ParseScopeFlags(); }; + /// Emits a diagnostic suggesting parentheses surrounding a + /// given range. + /// + /// \param Loc The location where we'll emit the diagnostic. + /// \param DK The kind of diagnostic to emit. + /// \param ParenRange Source range enclosing code that should be + /// parenthesized. + void SuggestParentheses(SourceLocation Loc, unsigned DK, + SourceRange ParenRange); + //===--------------------------------------------------------------------===// - // Diagnostic Emission and Error recovery. + // C99 6.9: External Definitions. 
-public: - DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID); - DiagnosticBuilder Diag(const Token &Tok, unsigned DiagID); - DiagnosticBuilder Diag(unsigned DiagID) { - return Diag(Tok, DiagID); - } + /// ParseExternalDeclaration: + /// + /// The `Attrs` that are passed in are C++11 attributes and appertain to the + /// declaration. + /// + /// \verbatim + /// external-declaration: [C99 6.9], declaration: [C++ dcl.dcl] + /// function-definition + /// declaration + /// [GNU] asm-definition + /// [GNU] __extension__ external-declaration + /// [OBJC] objc-class-definition + /// [OBJC] objc-class-declaration + /// [OBJC] objc-alias-declaration + /// [OBJC] objc-protocol-definition + /// [OBJC] objc-method-definition + /// [OBJC] @end + /// [C++] linkage-specification + /// [GNU] asm-definition: + /// simple-asm-expr ';' + /// [C++11] empty-declaration + /// [C++11] attribute-declaration + /// + /// [C++11] empty-declaration: + /// ';' + /// + /// [C++0x/GNU] 'extern' 'template' declaration + /// + /// [C++20] module-import-declaration + /// \endverbatim + /// + DeclGroupPtrTy ParseExternalDeclaration(ParsedAttributes &DeclAttrs, + ParsedAttributes &DeclSpecAttrs, + ParsingDeclSpec *DS = nullptr); - DiagnosticBuilder DiagCompat(SourceLocation Loc, unsigned CompatDiagId); - DiagnosticBuilder DiagCompat(const Token &Tok, unsigned CompatDiagId); - DiagnosticBuilder DiagCompat(unsigned CompatDiagId) { - return DiagCompat(Tok, CompatDiagId); - } + /// Determine whether the current token, if it occurs after a + /// declarator, continues a declaration or declaration list. + bool isDeclarationAfterDeclarator(); -private: - void SuggestParentheses(SourceLocation Loc, unsigned DK, - SourceRange ParenRange); - void CheckNestedObjCContexts(SourceLocation AtLoc); + /// Determine whether the current token, if it occurs after a + /// declarator, indicates the start of a function definition. 
+ bool isStartOfFunctionDefinition(const ParsingDeclarator &Declarator); -public: + DeclGroupPtrTy ParseDeclarationOrFunctionDefinition( + ParsedAttributes &DeclAttrs, ParsedAttributes &DeclSpecAttrs, + ParsingDeclSpec *DS = nullptr, AccessSpecifier AS = AS_none); - /// Control flags for SkipUntil functions. - enum SkipUntilFlags { - StopAtSemi = 1 << 0, ///< Stop skipping at semicolon - /// Stop skipping at specified token, but don't skip the token itself - StopBeforeMatch = 1 << 1, - StopAtCodeCompletion = 1 << 2 ///< Stop at code completion - }; + /// Parse either a function-definition or a declaration. We can't tell which + /// we have until we read up to the compound-statement in function-definition. + /// TemplateParams, if non-NULL, provides the template parameters when we're + /// parsing a C++ template-declaration. + /// + /// \verbatim + /// function-definition: [C99 6.9.1] + /// decl-specs declarator declaration-list[opt] compound-statement + /// [C90] function-definition: [C99 6.7.1] - implicit int result + /// [C90] decl-specs[opt] declarator declaration-list[opt] compound-statement + /// + /// declaration: [C99 6.7] + /// declaration-specifiers init-declarator-list[opt] ';' + /// [!C99] init-declarator-list ';' [TODO: warn in c99 mode] + /// [OMP] threadprivate-directive + /// [OMP] allocate-directive [TODO] + /// \endverbatim + /// + DeclGroupPtrTy ParseDeclOrFunctionDefInternal(ParsedAttributes &Attrs, + ParsedAttributes &DeclSpecAttrs, + ParsingDeclSpec &DS, + AccessSpecifier AS); - friend constexpr SkipUntilFlags operator|(SkipUntilFlags L, - SkipUntilFlags R) { - return static_cast(static_cast(L) | - static_cast(R)); - } + void SkipFunctionBody(); - /// SkipUntil - Read tokens until we get to the specified token, then consume - /// it (unless StopBeforeMatch is specified). Because we cannot guarantee - /// that the token will ever occur, this skips to the next token, or to some - /// likely good stopping point. 
If Flags has StopAtSemi flag, skipping will - /// stop at a ';' character. Balances (), [], and {} delimiter tokens while - /// skipping. + struct ParsedTemplateInfo; + class LateParsedAttrList; + + /// ParseFunctionDefinition - We parsed and verified that the specified + /// Declarator is well formed. If this is a K&R-style function, read the + /// parameters declaration-list, then start the compound-statement. /// - /// If SkipUntil finds the specified token, it returns true, otherwise it - /// returns false. - bool SkipUntil(tok::TokenKind T, - SkipUntilFlags Flags = static_cast(0)) { - return SkipUntil(llvm::ArrayRef(T), Flags); - } - bool SkipUntil(tok::TokenKind T1, tok::TokenKind T2, - SkipUntilFlags Flags = static_cast(0)) { - tok::TokenKind TokArray[] = {T1, T2}; - return SkipUntil(TokArray, Flags); - } - bool SkipUntil(tok::TokenKind T1, tok::TokenKind T2, tok::TokenKind T3, - SkipUntilFlags Flags = static_cast(0)) { - tok::TokenKind TokArray[] = {T1, T2, T3}; - return SkipUntil(TokArray, Flags); - } - bool SkipUntil(ArrayRef Toks, - SkipUntilFlags Flags = static_cast(0)); + /// \verbatim + /// function-definition: [C99 6.9.1] + /// decl-specs declarator declaration-list[opt] compound-statement + /// [C90] function-definition: [C99 6.7.1] - implicit int result + /// [C90] decl-specs[opt] declarator declaration-list[opt] compound-statement + /// [C++] function-definition: [C++ 8.4] + /// decl-specifier-seq[opt] declarator ctor-initializer[opt] + /// function-body + /// [C++] function-definition: [C++ 8.4] + /// decl-specifier-seq[opt] declarator function-try-block + /// \endverbatim + /// + Decl *ParseFunctionDefinition( + ParsingDeclarator &D, + const ParsedTemplateInfo &TemplateInfo = ParsedTemplateInfo(), + LateParsedAttrList *LateParsedAttrs = nullptr); - /// SkipMalformedDecl - Read tokens until we get to some likely good stopping - /// point for skipping past a simple-declaration. 
- void SkipMalformedDecl(); + /// ParseKNRParamDeclarations - Parse 'declaration-list[opt]' which provides + /// types for a function with a K&R-style identifier list for arguments. + void ParseKNRParamDeclarations(Declarator &D); - /// The location of the first statement inside an else that might - /// have a missleading indentation. If there is no - /// MisleadingIndentationChecker on an else active, this location is invalid. - SourceLocation MisleadingIndentationElseLoc; + /// ParseSimpleAsm + /// + /// \verbatim + /// [GNU] simple-asm-expr: + /// 'asm' '(' asm-string-literal ')' + /// \endverbatim + /// + /// EndLoc is filled with the location of the last token of the simple-asm. + ExprResult ParseSimpleAsm(bool ForAsmLabel, SourceLocation *EndLoc); -private: - //===--------------------------------------------------------------------===// - // Lexing and parsing of C++ inline methods. + /// ParseAsmStringLiteral - This is just a normal string-literal, but is not + /// allowed to be a wide string, and is not subject to character translation. + /// Unlike GCC, we also diagnose an empty string literal when parsing for an + /// asm label as opposed to an asm statement, because such a construct does + /// not behave well. + /// + /// \verbatim + /// [GNU] asm-string-literal: + /// string-literal + /// \endverbatim + /// + ExprResult ParseAsmStringLiteral(bool ForAsmLabel); - struct ParsingClass; + /// Describes the condition of a Microsoft __if_exists or + /// __if_not_exists block. + struct IfExistsCondition { + /// The location of the initial keyword. + SourceLocation KeywordLoc; + /// Whether this is an __if_exists block (rather than an + /// __if_not_exists block). + bool IsIfExists; - /// [class.mem]p1: "... the class is regarded as complete within - /// - function bodies - /// - default arguments - /// - exception-specifications (TODO: C++0x) - /// - and brace-or-equal-initializers for non-static data members - /// (including such things in nested classes)." 
- /// LateParsedDeclarations build the tree of those elements so they can - /// be parsed after parsing the top-level class. - class LateParsedDeclaration { - public: - virtual ~LateParsedDeclaration(); + /// Nested-name-specifier preceding the name. + CXXScopeSpec SS; - virtual void ParseLexedMethodDeclarations(); - virtual void ParseLexedMemberInitializers(); - virtual void ParseLexedMethodDefs(); - virtual void ParseLexedAttributes(); - virtual void ParseLexedPragmas(); + /// The name we're looking for. + UnqualifiedId Name; + + /// The behavior of this __if_exists or __if_not_exists block + /// should. + IfExistsBehavior Behavior; }; - /// Inner node of the LateParsedDeclaration tree that parses - /// all its members recursively. - class LateParsedClass : public LateParsedDeclaration { - public: - LateParsedClass(Parser *P, ParsingClass *C); - ~LateParsedClass() override; + bool ParseMicrosoftIfExistsCondition(IfExistsCondition &Result); + void ParseMicrosoftIfExistsExternalDeclaration(); - void ParseLexedMethodDeclarations() override; - void ParseLexedMemberInitializers() override; - void ParseLexedMethodDefs() override; - void ParseLexedAttributes() override; - void ParseLexedPragmas() override; + //===--------------------------------------------------------------------===// + // Modules + + /// Parse a declaration beginning with the 'module' keyword or C++20 + /// context-sensitive keyword (optionally preceded by 'export'). 
+ /// + /// \verbatim + /// module-declaration: [C++20] + /// 'export'[opt] 'module' module-name attribute-specifier-seq[opt] ';' + /// + /// global-module-fragment: [C++2a] + /// 'module' ';' top-level-declaration-seq[opt] + /// module-declaration: [C++2a] + /// 'export'[opt] 'module' module-name module-partition[opt] + /// attribute-specifier-seq[opt] ';' + /// private-module-fragment: [C++2a] + /// 'module' ':' 'private' ';' top-level-declaration-seq[opt] + /// \endverbatim + DeclGroupPtrTy ParseModuleDecl(Sema::ModuleImportState &ImportState); + + /// Parse a module import declaration. This is essentially the same for + /// Objective-C and C++20 except for the leading '@' (in ObjC) and the + /// trailing optional attributes (in C++). + /// + /// \verbatim + /// [ObjC] @import declaration: + /// '@' 'import' module-name ';' + /// [ModTS] module-import-declaration: + /// 'import' module-name attribute-specifier-seq[opt] ';' + /// [C++20] module-import-declaration: + /// 'export'[opt] 'import' module-name + /// attribute-specifier-seq[opt] ';' + /// 'export'[opt] 'import' module-partition + /// attribute-specifier-seq[opt] ';' + /// 'export'[opt] 'import' header-name + /// attribute-specifier-seq[opt] ';' + /// \endverbatim + Decl *ParseModuleImport(SourceLocation AtLoc, + Sema::ModuleImportState &ImportState); + + /// Try recover parser when module annotation appears where it must not + /// be found. + /// \returns false if the recover was successful and parsing may be continued, + /// or true if parser must bail out to top level and handle the token there. + bool parseMisplacedModuleImport(); + + bool tryParseMisplacedModuleImport() { + tok::TokenKind Kind = Tok.getKind(); + if (Kind == tok::annot_module_begin || Kind == tok::annot_module_end || + Kind == tok::annot_module_include) + return parseMisplacedModuleImport(); + return false; + } + + /// Parse a C++ / Objective-C module name (both forms use the same + /// grammar). 
+ /// + /// \verbatim + /// module-name: + /// module-name-qualifier[opt] identifier + /// module-name-qualifier: + /// module-name-qualifier[opt] identifier '.' + /// \endverbatim + bool ParseModuleName(SourceLocation UseLoc, + SmallVectorImpl &Path, bool IsImport); + + //===--------------------------------------------------------------------===// + // Preprocessor code-completion pass-through + void CodeCompleteDirective(bool InConditional) override; + void CodeCompleteInConditionalExclusion() override; + void CodeCompleteMacroName(bool IsDefinition) override; + void CodeCompletePreprocessorExpression() override; + void CodeCompleteMacroArgument(IdentifierInfo *Macro, MacroInfo *MacroInfo, + unsigned ArgumentIndex) override; + void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled) override; + void CodeCompleteNaturalLanguage() override; + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name C++ Class Inline Methods + /// Implementations are in ParseCXXInlineMethods.cpp + ///@{ + +private: + struct ParsingClass; + + /// [class.mem]p1: "... the class is regarded as complete within + /// - function bodies + /// - default arguments + /// - exception-specifications (TODO: C++0x) + /// - and brace-or-equal-initializers for non-static data members + /// (including such things in nested classes)." + /// LateParsedDeclarations build the tree of those elements so they can + /// be parsed after parsing the top-level class. + class LateParsedDeclaration { + public: + virtual ~LateParsedDeclaration(); + + virtual void ParseLexedMethodDeclarations(); + virtual void ParseLexedMemberInitializers(); + virtual void ParseLexedMethodDefs(); + virtual void ParseLexedAttributes(); + virtual void ParseLexedPragmas(); + }; + + /// Inner node of the LateParsedDeclaration tree that parses + /// all its members recursively. 
+ class LateParsedClass : public LateParsedDeclaration { + public: + LateParsedClass(Parser *P, ParsingClass *C); + ~LateParsedClass() override; + + void ParseLexedMethodDeclarations() override; + void ParseLexedMemberInitializers() override; + void ParseLexedMethodDefs() override; + void ParseLexedAttributes() override; + void ParseLexedPragmas() override; // Delete copy constructor and copy assignment operator. LateParsedClass(const LateParsedClass &) = delete; @@ -1463,11 +1148,11 @@ class Parser : public CodeCompletionHandler { IdentifierInfo &AttrName; IdentifierInfo *MacroII = nullptr; SourceLocation AttrNameLoc; - SmallVector Decls; + SmallVector Decls; explicit LateParsedAttribute(Parser *P, IdentifierInfo &Name, SourceLocation Loc) - : Self(P), AttrName(Name), AttrNameLoc(Loc) {} + : Self(P), AttrName(Name), AttrNameLoc(Loc) {} void ParseLexedAttributes() override; @@ -1495,7 +1180,7 @@ class Parser : public CodeCompletionHandler { }; // A list of late-parsed attributes. Used by ParseGNUAttributes. - class LateParsedAttrList: public SmallVector { + class LateParsedAttrList : public SmallVector { public: LateParsedAttrList(bool PSoon = false, bool LateAttrParseExperimentalExtOnly = false) @@ -1532,9 +1217,9 @@ class Parser : public CodeCompletionHandler { /// occurs within a member function declaration inside the class /// (C++ [class.mem]p2). struct LateParsedDefaultArgument { - explicit LateParsedDefaultArgument(Decl *P, - std::unique_ptr Toks = nullptr) - : Param(P), Toks(std::move(Toks)) { } + explicit LateParsedDefaultArgument( + Decl *P, std::unique_ptr Toks = nullptr) + : Param(P), Toks(std::move(Toks)) {} /// Param - The parameter declaration for this parameter. Decl *Param; @@ -1577,8 +1262,7 @@ class Parser : public CodeCompletionHandler { /// member whose parsing must to be delayed until the class is completely /// defined (C++11 [class.mem]p2). 
struct LateParsedMemberInitializer : public LateParsedDeclaration { - LateParsedMemberInitializer(Parser *P, Decl *FD) - : Self(P), Field(FD) { } + LateParsedMemberInitializer(Parser *P, Decl *FD) : Self(P), Field(FD) {} void ParseLexedMemberInitializers() override; @@ -1598,1432 +1282,1716 @@ class Parser : public CodeCompletionHandler { /// the method declarations and possibly attached inline definitions /// will be stored here with the tokens that will be parsed to create those /// entities. - typedef SmallVector LateParsedDeclarationsContainer; - - /// Representation of a class that has been parsed, including - /// any member function declarations or definitions that need to be - /// parsed after the corresponding top-level class is complete. - struct ParsingClass { - ParsingClass(Decl *TagOrTemplate, bool TopLevelClass, bool IsInterface) - : TopLevelClass(TopLevelClass), IsInterface(IsInterface), - TagOrTemplate(TagOrTemplate) {} - - /// Whether this is a "top-level" class, meaning that it is - /// not nested within another class. - bool TopLevelClass : 1; - - /// Whether this class is an __interface. - bool IsInterface : 1; - - /// The class or class template whose definition we are parsing. - Decl *TagOrTemplate; - - /// LateParsedDeclarations - Method declarations, inline definitions and - /// nested classes that contain pieces whose parsing will be delayed until - /// the top-level class is fully defined. - LateParsedDeclarationsContainer LateParsedDeclarations; - }; - - /// The stack of classes that is currently being - /// parsed. Nested and local classes will be pushed onto this stack - /// when they are parsed, and removed afterward. - std::stack ClassStack; - - ParsingClass &getCurrentClass() { - assert(!ClassStack.empty() && "No lexed method stacks!"); - return *ClassStack.top(); - } - - /// RAII object used to manage the parsing of a class definition. 
- class ParsingClassDefinition { - Parser &P; - bool Popped; - Sema::ParsingClassState State; - - public: - ParsingClassDefinition(Parser &P, Decl *TagOrTemplate, bool TopLevelClass, - bool IsInterface) - : P(P), Popped(false), - State(P.PushParsingClass(TagOrTemplate, TopLevelClass, IsInterface)) { - } - - /// Pop this class of the stack. - void Pop() { - assert(!Popped && "Nested class has already been popped"); - Popped = true; - P.PopParsingClass(State); - } - - ~ParsingClassDefinition() { - if (!Popped) - P.PopParsingClass(State); - } - }; - - /// Contains information about any template-specific - /// information that has been parsed prior to parsing declaration - /// specifiers. - struct ParsedTemplateInfo { - ParsedTemplateInfo() : Kind(ParsedTemplateKind::NonTemplate), TemplateParams(nullptr) {} - - ParsedTemplateInfo(TemplateParameterLists *TemplateParams, - bool isSpecialization, - bool lastParameterListWasEmpty = false) - : Kind(isSpecialization? ParsedTemplateKind::ExplicitSpecialization : ParsedTemplateKind::Template), - TemplateParams(TemplateParams), - LastParameterListWasEmpty(lastParameterListWasEmpty) { } - - explicit ParsedTemplateInfo(SourceLocation ExternLoc, - SourceLocation TemplateLoc) - : Kind(ParsedTemplateKind::ExplicitInstantiation), TemplateParams(nullptr), - ExternLoc(ExternLoc), TemplateLoc(TemplateLoc), - LastParameterListWasEmpty(false){ } - - ParsedTemplateKind Kind; - - /// The template parameter lists, for template declarations - /// and explicit specializations. - TemplateParameterLists *TemplateParams; - - /// The location of the 'extern' keyword, if any, for an explicit - /// instantiation - SourceLocation ExternLoc; - - /// The location of the 'template' keyword, for an explicit - /// instantiation. - SourceLocation TemplateLoc; - - /// Whether the last template parameter list was empty. 
- bool LastParameterListWasEmpty; + typedef SmallVector + LateParsedDeclarationsContainer; - SourceRange getSourceRange() const LLVM_READONLY; - }; - - // In ParseCXXInlineMethods.cpp. + /// Utility to re-enter a possibly-templated scope while parsing its + /// late-parsed components. struct ReenterTemplateScopeRAII; - struct ReenterClassScopeRAII; - void LexTemplateFunctionForLateParsing(CachedTokens &Toks); - void ParseLateTemplatedFuncDef(LateParsedTemplate &LPT); - - static void LateTemplateParserCallback(void *P, LateParsedTemplate &LPT); - - Sema::ParsingClassState - PushParsingClass(Decl *TagOrTemplate, bool TopLevelClass, bool IsInterface); - void DeallocateParsedClasses(ParsingClass *Class); - void PopParsingClass(Sema::ParsingClassState); + /// Utility to re-enter a class scope while parsing its late-parsed + /// components. + struct ReenterClassScopeRAII; + /// ParseCXXInlineMethodDef - We parsed and verified that the specified + /// Declarator is a well formed C++ inline method definition. Now lex its body + /// and store its tokens for parsing after the C++ class is complete. NamedDecl *ParseCXXInlineMethodDef(AccessSpecifier AS, const ParsedAttributesView &AccessAttrs, ParsingDeclarator &D, const ParsedTemplateInfo &TemplateInfo, const VirtSpecifiers &VS, SourceLocation PureSpecLoc); + + /// Parse the optional ("message") part of a deleted-function-body. StringLiteral *ParseCXXDeletedFunctionMessage(); + + /// If we've encountered '= delete' in a context where it is ill-formed, such + /// as in the declaration of a non-function, also skip the ("message") part if + /// it is present to avoid issuing further diagnostics. void SkipDeletedFunctionBody(); + + /// ParseCXXNonStaticMemberInitializer - We parsed and verified that the + /// specified Declarator is a well formed C++ non-static data member + /// declaration. Now lex its initializer and store its tokens for parsing + /// after the class is complete. 
void ParseCXXNonStaticMemberInitializer(Decl *VarD); + + /// Wrapper class which calls ParseLexedAttribute, after setting up the + /// scope appropriately. void ParseLexedAttributes(ParsingClass &Class); + + /// Parse all attributes in LAs, and attach them to Decl D. void ParseLexedAttributeList(LateParsedAttrList &LAs, Decl *D, bool EnterScope, bool OnDefinition); - void ParseLexedCAttributeList(LateParsedAttrList &LA, bool EnterScope, - ParsedAttributes *OutAttrs = nullptr); - void ParseLexedAttribute(LateParsedAttribute &LA, - bool EnterScope, bool OnDefinition); - void ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope, - ParsedAttributes *OutAttrs = nullptr); + + /// Finish parsing an attribute for which parsing was delayed. + /// This will be called at the end of parsing a class declaration + /// for each LateParsedAttribute. We consume the saved tokens and + /// create an attribute with the arguments filled in. We add this + /// to the Attribute list for the decl. + void ParseLexedAttribute(LateParsedAttribute &LA, bool EnterScope, + bool OnDefinition); + + /// ParseLexedMethodDeclarations - We finished parsing the member + /// specification of a top (non-nested) C++ class. Now go over the + /// stack of method declarations with some parts for which parsing was + /// delayed (such as default arguments) and parse them. void ParseLexedMethodDeclarations(ParsingClass &Class); void ParseLexedMethodDeclaration(LateParsedMethodDeclaration &LM); + + /// ParseLexedMethodDefs - We finished parsing the member specification of a + /// top (non-nested) C++ class. Now go over the stack of lexed methods that + /// were collected during its parsing and parse them all. void ParseLexedMethodDefs(ParsingClass &Class); void ParseLexedMethodDef(LexedMethod &LM); + + /// ParseLexedMemberInitializers - We finished parsing the member + /// specification of a top (non-nested) C++ class. 
Now go over the stack of + /// lexed data member initializers that were collected during its parsing and + /// parse them all. void ParseLexedMemberInitializers(ParsingClass &Class); void ParseLexedMemberInitializer(LateParsedMemberInitializer &MI); - void ParseLexedObjCMethodDefs(LexedMethod &LM, bool parseMethod); - void ParseLexedPragmas(ParsingClass &Class); - void ParseLexedPragma(LateParsedPragma &LP); - bool ConsumeAndStoreFunctionPrologue(CachedTokens &Toks); - bool ConsumeAndStoreInitializer(CachedTokens &Toks, CachedInitKind CIK); - bool ConsumeAndStoreConditional(CachedTokens &Toks); - bool ConsumeAndStoreUntil(tok::TokenKind T1, - CachedTokens &Toks, - bool StopAtSemi = true, - bool ConsumeFinalToken = true) { - return ConsumeAndStoreUntil(T1, T1, Toks, StopAtSemi, ConsumeFinalToken); - } - bool ConsumeAndStoreUntil(tok::TokenKind T1, tok::TokenKind T2, - CachedTokens &Toks, - bool StopAtSemi = true, - bool ConsumeFinalToken = true); - - //===--------------------------------------------------------------------===// - // C99 6.9: External Definitions. - DeclGroupPtrTy ParseExternalDeclaration(ParsedAttributes &DeclAttrs, - ParsedAttributes &DeclSpecAttrs, - ParsingDeclSpec *DS = nullptr); - bool isDeclarationAfterDeclarator(); - bool isStartOfFunctionDefinition(const ParsingDeclarator &Declarator); - DeclGroupPtrTy ParseDeclarationOrFunctionDefinition( - ParsedAttributes &DeclAttrs, ParsedAttributes &DeclSpecAttrs, - ParsingDeclSpec *DS = nullptr, AccessSpecifier AS = AS_none); - DeclGroupPtrTy ParseDeclOrFunctionDefInternal(ParsedAttributes &Attrs, - ParsedAttributes &DeclSpecAttrs, - ParsingDeclSpec &DS, - AccessSpecifier AS); - void SkipFunctionBody(); - Decl *ParseFunctionDefinition(ParsingDeclarator &D, - const ParsedTemplateInfo &TemplateInfo = ParsedTemplateInfo(), - LateParsedAttrList *LateParsedAttrs = nullptr); - void ParseKNRParamDeclarations(Declarator &D); - // EndLoc is filled with the location of the last token of the simple-asm. 
- ExprResult ParseSimpleAsm(bool ForAsmLabel, SourceLocation *EndLoc); - ExprResult ParseAsmStringLiteral(bool ForAsmLabel); + ///@} - // Objective-C External Declarations - void MaybeSkipAttributes(tok::ObjCKeywordKind Kind); - DeclGroupPtrTy ParseObjCAtDirectives(ParsedAttributes &DeclAttrs, - ParsedAttributes &DeclSpecAttrs); - DeclGroupPtrTy ParseObjCAtClassDeclaration(SourceLocation atLoc); - Decl *ParseObjCAtInterfaceDeclaration(SourceLocation AtLoc, - ParsedAttributes &prefixAttrs); - class ObjCTypeParamListScope; - ObjCTypeParamList *parseObjCTypeParamList(); - ObjCTypeParamList *parseObjCTypeParamListOrProtocolRefs( - ObjCTypeParamListScope &Scope, SourceLocation &lAngleLoc, - SmallVectorImpl &protocolIdents, SourceLocation &rAngleLoc, - bool mayBeProtocolList = true); + // + // + // ------------------------------------------------------------------------- + // + // - void HelperActionsForIvarDeclarations(ObjCContainerDecl *interfaceDecl, - SourceLocation atLoc, - BalancedDelimiterTracker &T, - SmallVectorImpl &AllIvarDecls, - bool RBraceMissing); - void ParseObjCClassInstanceVariables(ObjCContainerDecl *interfaceDecl, - tok::ObjCKeywordKind visibility, - SourceLocation atLoc); - bool ParseObjCProtocolReferences(SmallVectorImpl &P, - SmallVectorImpl &PLocs, - bool WarnOnDeclarations, - bool ForObjCContainer, - SourceLocation &LAngleLoc, - SourceLocation &EndProtoLoc, - bool consumeLastToken); + /// \name Declarations + /// Implementations are in ParseDecl.cpp + ///@{ - /// Parse the first angle-bracket-delimited clause for an - /// Objective-C object or object pointer type, which may be either - /// type arguments or protocol qualifiers. 
- void parseObjCTypeArgsOrProtocolQualifiers( - ParsedType baseType, - SourceLocation &typeArgsLAngleLoc, - SmallVectorImpl &typeArgs, - SourceLocation &typeArgsRAngleLoc, - SourceLocation &protocolLAngleLoc, - SmallVectorImpl &protocols, - SmallVectorImpl &protocolLocs, - SourceLocation &protocolRAngleLoc, - bool consumeLastToken, - bool warnOnIncompleteProtocols); +public: + /// SkipMalformedDecl - Read tokens until we get to some likely good stopping + /// point for skipping past a simple-declaration. + /// + /// Skip until we reach something which seems like a sensible place to pick + /// up parsing after a malformed declaration. This will sometimes stop sooner + /// than SkipUntil(tok::r_brace) would, but will never stop later. + void SkipMalformedDecl(); - /// Parse either Objective-C type arguments or protocol qualifiers; if the - /// former, also parse protocol qualifiers afterward. - void parseObjCTypeArgsAndProtocolQualifiers( - ParsedType baseType, - SourceLocation &typeArgsLAngleLoc, - SmallVectorImpl &typeArgs, - SourceLocation &typeArgsRAngleLoc, - SourceLocation &protocolLAngleLoc, - SmallVectorImpl &protocols, - SmallVectorImpl &protocolLocs, - SourceLocation &protocolRAngleLoc, - bool consumeLastToken); - - /// Parse a protocol qualifier type such as '', which is - /// an anachronistic way of writing 'id'. - TypeResult parseObjCProtocolQualifierType(SourceLocation &rAngleLoc); - - /// Parse Objective-C type arguments and protocol qualifiers, extending the - /// current type with the parsed result. 
- TypeResult parseObjCTypeArgsAndProtocolQualifiers(SourceLocation loc, - ParsedType type, - bool consumeLastToken, - SourceLocation &endLoc); - - void ParseObjCInterfaceDeclList(tok::ObjCKeywordKind contextKey, - Decl *CDecl); - DeclGroupPtrTy ParseObjCAtProtocolDeclaration(SourceLocation atLoc, - ParsedAttributes &prefixAttrs); - - struct ObjCImplParsingDataRAII { - Parser &P; - Decl *Dcl; - bool HasCFunction; - typedef SmallVector LateParsedObjCMethodContainer; - LateParsedObjCMethodContainer LateParsedObjCMethods; - - ObjCImplParsingDataRAII(Parser &parser, Decl *D) - : P(parser), Dcl(D), HasCFunction(false) { - P.CurParsedObjCImpl = this; - Finished = false; - } - ~ObjCImplParsingDataRAII(); - - void finish(SourceRange AtEnd); - bool isFinished() const { return Finished; } - - private: - bool Finished; - }; - ObjCImplParsingDataRAII *CurParsedObjCImpl; - void StashAwayMethodOrFunctionBodyTokens(Decl *MDecl); - - DeclGroupPtrTy ParseObjCAtImplementationDeclaration(SourceLocation AtLoc, - ParsedAttributes &Attrs); - DeclGroupPtrTy ParseObjCAtEndDeclaration(SourceRange atEnd); - Decl *ParseObjCAtAliasDeclaration(SourceLocation atLoc); - Decl *ParseObjCPropertySynthesize(SourceLocation atLoc); - Decl *ParseObjCPropertyDynamic(SourceLocation atLoc); - - IdentifierInfo *ParseObjCSelectorPiece(SourceLocation &MethodLocation); - - IdentifierInfo *ObjCTypeQuals[llvm::to_underlying(ObjCTypeQual::NumQuals)]; + /// ParseTypeName + /// \verbatim + /// type-name: [C99 6.7.6] + /// specifier-qualifier-list abstract-declarator[opt] + /// \endverbatim + /// + /// Called type-id in C++. + TypeResult + ParseTypeName(SourceRange *Range = nullptr, + DeclaratorContext Context = DeclaratorContext::TypeName, + AccessSpecifier AS = AS_none, Decl **OwnedType = nullptr, + ParsedAttributes *Attrs = nullptr); - bool isTokIdentifier_in() const; +private: + /// Ident_vector, Ident_bool, Ident_Bool - cached IdentifierInfos for "vector" + /// and "bool" fast comparison. 
Only present if AltiVec or ZVector are + /// enabled. + IdentifierInfo *Ident_vector; + IdentifierInfo *Ident_bool; + IdentifierInfo *Ident_Bool; - ParsedType ParseObjCTypeName(ObjCDeclSpec &DS, DeclaratorContext Ctx, - ParsedAttributes *ParamAttrs); - Decl *ParseObjCMethodPrototype( - tok::ObjCKeywordKind MethodImplKind = tok::objc_not_keyword, - bool MethodDefinition = true); - Decl *ParseObjCMethodDecl(SourceLocation mLoc, tok::TokenKind mType, - tok::ObjCKeywordKind MethodImplKind = tok::objc_not_keyword, - bool MethodDefinition=true); - void ParseObjCPropertyAttribute(ObjCDeclSpec &DS); + /// Ident_pixel - cached IdentifierInfos for "pixel" fast comparison. + /// Only present if AltiVec enabled. + IdentifierInfo *Ident_pixel; - Decl *ParseObjCMethodDefinition(); + /// Identifier for "introduced". + IdentifierInfo *Ident_introduced; -public: - //===--------------------------------------------------------------------===// - // C99 6.5: Expressions. + /// Identifier for "deprecated". + IdentifierInfo *Ident_deprecated; - ExprResult - ParseExpression(TypeCastState isTypeCast = TypeCastState::NotTypeCast); - ExprResult ParseConstantExpressionInExprEvalContext( - TypeCastState isTypeCast = TypeCastState::NotTypeCast); - ExprResult ParseConstantExpression(); - ExprResult ParseArrayBoundExpression(); - ExprResult ParseCaseExpression(SourceLocation CaseLoc); - ExprResult ParseConstraintExpression(); - ExprResult - ParseConstraintLogicalAndExpression(bool IsTrailingRequiresClause); - ExprResult ParseConstraintLogicalOrExpression(bool IsTrailingRequiresClause); - // Expr that doesn't include commas. - ExprResult ParseAssignmentExpression( - TypeCastState isTypeCast = TypeCastState::NotTypeCast); - ExprResult ParseConditionalExpression(); + /// Identifier for "obsoleted". + IdentifierInfo *Ident_obsoleted; - ExprResult ParseMSAsmIdentifier(llvm::SmallVectorImpl &LineToks, - unsigned &NumLineToksConsumed, - bool IsUnevaluated); + /// Identifier for "unavailable". 
+ IdentifierInfo *Ident_unavailable; - ExprResult ParseStringLiteralExpression(bool AllowUserDefinedLiteral = false); - ExprResult ParseUnevaluatedStringLiteralExpression(); + /// Identifier for "message". + IdentifierInfo *Ident_message; -private: - ExprResult ParseStringLiteralExpression(bool AllowUserDefinedLiteral, - bool Unevaluated); + /// Identifier for "strict". + IdentifierInfo *Ident_strict; - ExprResult ParseExpressionWithLeadingAt(SourceLocation AtLoc); + /// Identifier for "replacement". + IdentifierInfo *Ident_replacement; - ExprResult ParseExpressionWithLeadingExtension(SourceLocation ExtLoc); + /// Identifier for "environment". + IdentifierInfo *Ident_environment; - ExprResult ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec); + /// Identifiers used by the 'external_source_symbol' attribute. + IdentifierInfo *Ident_language, *Ident_defined_in, + *Ident_generated_declaration, *Ident_USR; - bool isRevertibleTypeTrait(const IdentifierInfo *Id, - clang::tok::TokenKind *Kind = nullptr); + /// Factory object for creating ParsedAttr objects. + AttributeFactory AttrFactory; - ExprResult ParseCastExpression(CastParseKind ParseKind, - bool isAddressOfOperand, - bool &NotCastExpr, - TypeCastState isTypeCast, - bool isVectorLiteral = false, - bool *NotPrimaryExpression = nullptr); - ExprResult - ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand = false, - TypeCastState isTypeCast = TypeCastState::NotTypeCast, - bool isVectorLiteral = false, - bool *NotPrimaryExpression = nullptr); + /// TryAltiVecToken - Check for context-sensitive AltiVec identifier tokens, + /// replacing them with the non-context-sensitive keywords. This returns + /// true if the token was replaced. + bool TryAltiVecToken(DeclSpec &DS, SourceLocation Loc, const char *&PrevSpec, + unsigned &DiagID, bool &isInvalid) { + if (!getLangOpts().AltiVec && !getLangOpts().ZVector) + return false; - /// Returns true if the next token cannot start an expression. 
- bool isNotExpressionStart(); + if (Tok.getIdentifierInfo() != Ident_vector && + Tok.getIdentifierInfo() != Ident_bool && + Tok.getIdentifierInfo() != Ident_Bool && + (!getLangOpts().AltiVec || Tok.getIdentifierInfo() != Ident_pixel)) + return false; - /// Returns true if the next token would start a postfix-expression - /// suffix. - bool isPostfixExpressionSuffixStart() { - tok::TokenKind K = Tok.getKind(); - return (K == tok::l_square || K == tok::l_paren || - K == tok::period || K == tok::arrow || - K == tok::plusplus || K == tok::minusminus); + return TryAltiVecTokenOutOfLine(DS, Loc, PrevSpec, DiagID, isInvalid); } - bool diagnoseUnknownTemplateId(ExprResult TemplateName, SourceLocation Less); - void checkPotentialAngleBracket(ExprResult &PotentialTemplateName); - bool checkPotentialAngleBracketDelimiter(const AngleBracketTracker::Loc &, - const Token &OpToken); - bool checkPotentialAngleBracketDelimiter(const Token &OpToken) { - if (auto *Info = AngleBrackets.getCurrent(*this)) - return checkPotentialAngleBracketDelimiter(*Info, OpToken); - return false; + /// TryAltiVecVectorToken - Check for context-sensitive AltiVec vector + /// identifier token, replacing it with the non-context-sensitive __vector. + /// This returns true if the token was replaced. + bool TryAltiVecVectorToken() { + if ((!getLangOpts().AltiVec && !getLangOpts().ZVector) || + Tok.getIdentifierInfo() != Ident_vector) + return false; + return TryAltiVecVectorTokenOutOfLine(); } - ExprResult ParsePostfixExpressionSuffix(ExprResult LHS); - ExprResult ParseUnaryExprOrTypeTraitExpression(); - ExprResult ParseBuiltinPrimaryExpression(); - ExprResult ParseSYCLUniqueStableNameExpression(); - - ExprResult ParseExprAfterUnaryExprOrTypeTrait(const Token &OpTok, - bool &isCastExpr, - ParsedType &CastTy, - SourceRange &CastRange); + /// TryAltiVecVectorTokenOutOfLine - Out of line body that should only be + /// called from TryAltiVecVectorToken. 
+ bool TryAltiVecVectorTokenOutOfLine(); + bool TryAltiVecTokenOutOfLine(DeclSpec &DS, SourceLocation Loc, + const char *&PrevSpec, unsigned &DiagID, + bool &isInvalid); - /// ParseExpressionList - Used for C/C++ (argument-)expression-list. - bool ParseExpressionList(SmallVectorImpl &Exprs, - llvm::function_ref ExpressionStarts = - llvm::function_ref(), - bool FailImmediatelyOnInvalidExpr = false, - bool EarlyTypoCorrection = false); + void ParseLexedCAttributeList(LateParsedAttrList &LA, bool EnterScope, + ParsedAttributes *OutAttrs = nullptr); - /// ParseSimpleExpressionList - A simple comma-separated list of expressions, - /// used for misc language extensions. - bool ParseSimpleExpressionList(SmallVectorImpl &Exprs); + /// Finish parsing an attribute for which parsing was delayed. + /// This will be called at the end of parsing a class declaration + /// for each LateParsedAttribute. We consume the saved tokens and + /// create an attribute with the arguments filled in. We add this + /// to the Attribute list for the decl. + void ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope, + ParsedAttributes *OutAttrs = nullptr); - ExprResult ParseParenExpression(ParenParseOption &ExprType, - bool stopIfCastExpr, - bool isTypeCast, - ParsedType &CastTy, - SourceLocation &RParenLoc); + void ParseLexedPragmas(ParsingClass &Class); + void ParseLexedPragma(LateParsedPragma &LP); - ExprResult ParseCXXAmbiguousParenExpression( - ParenParseOption &ExprType, ParsedType &CastTy, - BalancedDelimiterTracker &Tracker, ColonProtectionRAIIObject &ColonProt); - ExprResult ParseCompoundLiteralExpression(ParsedType Ty, - SourceLocation LParenLoc, - SourceLocation RParenLoc); + /// Consume tokens and store them in the passed token container until + /// we've passed the try keyword and constructor initializers and have + /// consumed the opening brace of the function body. The opening brace will be + /// consumed if and only if there was no error. 
+ /// + /// \return True on error. + bool ConsumeAndStoreFunctionPrologue(CachedTokens &Toks); - ExprResult ParseGenericSelectionExpression(); + /// ConsumeAndStoreInitializer - Consume and store the token at the passed + /// token container until the end of the current initializer expression + /// (either a default argument or an in-class initializer for a non-static + /// data member). + /// + /// Returns \c true if we reached the end of something initializer-shaped, + /// \c false if we bailed out. + bool ConsumeAndStoreInitializer(CachedTokens &Toks, CachedInitKind CIK); - ExprResult ParseObjCBoolLiteral(); + /// Consume and store tokens from the '?' to the ':' in a conditional + /// expression. + bool ConsumeAndStoreConditional(CachedTokens &Toks); + bool ConsumeAndStoreUntil(tok::TokenKind T1, CachedTokens &Toks, + bool StopAtSemi = true, + bool ConsumeFinalToken = true) { + return ConsumeAndStoreUntil(T1, T1, Toks, StopAtSemi, ConsumeFinalToken); + } - ExprResult ParseFoldExpression(ExprResult LHS, BalancedDelimiterTracker &T); + /// ConsumeAndStoreUntil - Consume and store the token at the passed token + /// container until the token 'T' is reached (which gets + /// consumed/stored too, if ConsumeFinalToken). + /// If StopAtSemi is true, then we will stop early at a ';' character. + /// Returns true if token 'T1' or 'T2' was found. + /// NOTE: This is a specialized version of Parser::SkipUntil. 
+ bool ConsumeAndStoreUntil(tok::TokenKind T1, tok::TokenKind T2, + CachedTokens &Toks, bool StopAtSemi = true, + bool ConsumeFinalToken = true); //===--------------------------------------------------------------------===// - // C++ Expressions - ExprResult tryParseCXXIdExpression(CXXScopeSpec &SS, bool isAddressOfOperand, - Token &Replacement); - - ExprResult tryParseCXXPackIndexingExpression(ExprResult PackIdExpression); - ExprResult ParseCXXPackIndexingExpression(ExprResult PackIdExpression); - - ExprResult ParseCXXIdExpression(bool isAddressOfOperand = false); + // C99 6.7: Declarations. - bool areTokensAdjacent(const Token &A, const Token &B); + /// A context for parsing declaration specifiers. TODO: flesh this + /// out, there are other significant restrictions on specifiers than + /// would be best implemented in the parser. + enum class DeclSpecContext { + DSC_normal, // normal context + DSC_class, // class context, enables 'friend' + DSC_type_specifier, // C++ type-specifier-seq or C specifier-qualifier-list + DSC_trailing, // C++11 trailing-type-specifier in a trailing return type + DSC_alias_declaration, // C++11 type-specifier-seq in an alias-declaration + DSC_conv_operator, // C++ type-specifier-seq in an conversion operator + DSC_top_level, // top-level/namespace declaration context + DSC_template_param, // template parameter context + DSC_template_arg, // template argument context + DSC_template_type_arg, // template type argument context + DSC_objc_method_result, // ObjC method result context, enables + // 'instancetype' + DSC_condition, // condition declaration context + DSC_association, // A _Generic selection expression's type association + DSC_new, // C++ new expression + }; - void CheckForTemplateAndDigraph(Token &Next, ParsedType ObjectTypePtr, - bool EnteringContext, IdentifierInfo &II, - CXXScopeSpec &SS); + /// Is this a context in which we are parsing just a type-specifier (or + /// trailing-type-specifier)? 
+ static bool isTypeSpecifier(DeclSpecContext DSC) { + switch (DSC) { + case DeclSpecContext::DSC_normal: + case DeclSpecContext::DSC_template_param: + case DeclSpecContext::DSC_template_arg: + case DeclSpecContext::DSC_class: + case DeclSpecContext::DSC_top_level: + case DeclSpecContext::DSC_objc_method_result: + case DeclSpecContext::DSC_condition: + return false; - bool ParseOptionalCXXScopeSpecifier( - CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHasErrors, - bool EnteringContext, bool *MayBePseudoDestructor = nullptr, - bool IsTypename = false, const IdentifierInfo **LastII = nullptr, - bool OnlyNamespace = false, bool InUsingDeclaration = false, - bool Disambiguation = false); - - //===--------------------------------------------------------------------===// - // C++11 5.1.2: Lambda expressions + case DeclSpecContext::DSC_template_type_arg: + case DeclSpecContext::DSC_type_specifier: + case DeclSpecContext::DSC_conv_operator: + case DeclSpecContext::DSC_trailing: + case DeclSpecContext::DSC_alias_declaration: + case DeclSpecContext::DSC_association: + case DeclSpecContext::DSC_new: + return true; + } + llvm_unreachable("Missing DeclSpecContext case"); + } - /// Result of tentatively parsing a lambda-introducer. - enum class LambdaIntroducerTentativeParse { - /// This appears to be a lambda-introducer, which has been fully parsed. - Success, - /// This is a lambda-introducer, but has not been fully parsed, and this - /// function needs to be called again to parse it. - Incomplete, - /// This is definitely an Objective-C message send expression, rather than - /// a lambda-introducer, attribute-specifier, or array designator. - MessageSend, - /// This is not a lambda-introducer. - Invalid, + /// Whether a defining-type-specifier is permitted in a given context. + enum class AllowDefiningTypeSpec { + /// The grammar doesn't allow a defining-type-specifier here, and we must + /// not parse one (eg, because a '{' could mean something else). 
+ No, + /// The grammar doesn't allow a defining-type-specifier here, but we permit + /// one for error recovery purposes. Sema will reject. + NoButErrorRecovery, + /// The grammar allows a defining-type-specifier here, even though it's + /// always invalid. Sema will reject. + YesButInvalid, + /// The grammar allows a defining-type-specifier here, and one can be valid. + Yes }; - // [...] () -> type {...} - ExprResult ParseLambdaExpression(); - ExprResult TryParseLambdaExpression(); - bool - ParseLambdaIntroducer(LambdaIntroducer &Intro, - LambdaIntroducerTentativeParse *Tentative = nullptr); - ExprResult ParseLambdaExpressionAfterIntroducer(LambdaIntroducer &Intro); - - //===--------------------------------------------------------------------===// - // C++ 5.2p1: C++ Casts - ExprResult ParseCXXCasts(); + /// Is this a context in which we are parsing defining-type-specifiers (and + /// so permit class and enum definitions in addition to non-defining class and + /// enum elaborated-type-specifiers)? + static AllowDefiningTypeSpec + isDefiningTypeSpecifierContext(DeclSpecContext DSC, bool IsCPlusPlus) { + switch (DSC) { + case DeclSpecContext::DSC_normal: + case DeclSpecContext::DSC_class: + case DeclSpecContext::DSC_top_level: + case DeclSpecContext::DSC_alias_declaration: + case DeclSpecContext::DSC_objc_method_result: + return AllowDefiningTypeSpec::Yes; - /// Parse a __builtin_bit_cast(T, E), used to implement C++2a std::bit_cast. 
- ExprResult ParseBuiltinBitCast(); + case DeclSpecContext::DSC_condition: + case DeclSpecContext::DSC_template_param: + return AllowDefiningTypeSpec::YesButInvalid; - //===--------------------------------------------------------------------===// - // C++ 5.2p1: C++ Type Identification - ExprResult ParseCXXTypeid(); + case DeclSpecContext::DSC_template_type_arg: + case DeclSpecContext::DSC_type_specifier: + return AllowDefiningTypeSpec::NoButErrorRecovery; - //===--------------------------------------------------------------------===// - // C++ : Microsoft __uuidof Expression - ExprResult ParseCXXUuidof(); + case DeclSpecContext::DSC_association: + return IsCPlusPlus ? AllowDefiningTypeSpec::NoButErrorRecovery + : AllowDefiningTypeSpec::Yes; - //===--------------------------------------------------------------------===// - // C++ 5.2.4: C++ Pseudo-Destructor Expressions - ExprResult ParseCXXPseudoDestructor(Expr *Base, SourceLocation OpLoc, - tok::TokenKind OpKind, - CXXScopeSpec &SS, - ParsedType ObjectType); + case DeclSpecContext::DSC_trailing: + case DeclSpecContext::DSC_conv_operator: + case DeclSpecContext::DSC_template_arg: + case DeclSpecContext::DSC_new: + return AllowDefiningTypeSpec::No; + } + llvm_unreachable("Missing DeclSpecContext case"); + } - //===--------------------------------------------------------------------===// - // C++ 9.3.2: C++ 'this' pointer - ExprResult ParseCXXThis(); + /// Is this a context in which an opaque-enum-declaration can appear? 
+ static bool isOpaqueEnumDeclarationContext(DeclSpecContext DSC) { + switch (DSC) { + case DeclSpecContext::DSC_normal: + case DeclSpecContext::DSC_class: + case DeclSpecContext::DSC_top_level: + return true; - //===--------------------------------------------------------------------===// - // C++ 15: C++ Throw Expression - ExprResult ParseThrowExpression(); + case DeclSpecContext::DSC_alias_declaration: + case DeclSpecContext::DSC_objc_method_result: + case DeclSpecContext::DSC_condition: + case DeclSpecContext::DSC_template_param: + case DeclSpecContext::DSC_template_type_arg: + case DeclSpecContext::DSC_type_specifier: + case DeclSpecContext::DSC_trailing: + case DeclSpecContext::DSC_association: + case DeclSpecContext::DSC_conv_operator: + case DeclSpecContext::DSC_template_arg: + case DeclSpecContext::DSC_new: - ExceptionSpecificationType tryParseExceptionSpecification( - bool Delayed, - SourceRange &SpecificationRange, - SmallVectorImpl &DynamicExceptions, - SmallVectorImpl &DynamicExceptionRanges, - ExprResult &NoexceptExpr, - CachedTokens *&ExceptionSpecTokens); - - // EndLoc is filled with the location of the last token of the specification. - ExceptionSpecificationType ParseDynamicExceptionSpecification( - SourceRange &SpecificationRange, - SmallVectorImpl &Exceptions, - SmallVectorImpl &Ranges); + return false; + } + llvm_unreachable("Missing DeclSpecContext case"); + } - //===--------------------------------------------------------------------===// - // C++0x 8: Function declaration trailing-return-type - TypeResult ParseTrailingReturnType(SourceRange &Range, - bool MayBeFollowedByDirectInit); + /// Is this a context in which we can perform class template argument + /// deduction? 
+ static bool isClassTemplateDeductionContext(DeclSpecContext DSC) { + switch (DSC) { + case DeclSpecContext::DSC_normal: + case DeclSpecContext::DSC_template_param: + case DeclSpecContext::DSC_template_arg: + case DeclSpecContext::DSC_class: + case DeclSpecContext::DSC_top_level: + case DeclSpecContext::DSC_condition: + case DeclSpecContext::DSC_type_specifier: + case DeclSpecContext::DSC_association: + case DeclSpecContext::DSC_conv_operator: + case DeclSpecContext::DSC_new: + return true; - //===--------------------------------------------------------------------===// - // C++ 2.13.5: C++ Boolean Literals - ExprResult ParseCXXBoolLiteral(); + case DeclSpecContext::DSC_objc_method_result: + case DeclSpecContext::DSC_template_type_arg: + case DeclSpecContext::DSC_trailing: + case DeclSpecContext::DSC_alias_declaration: + return false; + } + llvm_unreachable("Missing DeclSpecContext case"); + } - //===--------------------------------------------------------------------===// - // C++ 5.2.3: Explicit type conversion (functional notation) - ExprResult ParseCXXTypeConstructExpression(const DeclSpec &DS); + // Is this a context in which an implicit 'typename' is allowed? + static ImplicitTypenameContext + getImplicitTypenameContext(DeclSpecContext DSC) { + switch (DSC) { + case DeclSpecContext::DSC_class: + case DeclSpecContext::DSC_top_level: + case DeclSpecContext::DSC_type_specifier: + case DeclSpecContext::DSC_template_type_arg: + case DeclSpecContext::DSC_trailing: + case DeclSpecContext::DSC_alias_declaration: + case DeclSpecContext::DSC_template_param: + case DeclSpecContext::DSC_new: + return ImplicitTypenameContext::Yes; - /// ParseCXXSimpleTypeSpecifier - [C++ 7.1.5.2] Simple type specifiers. - /// This should only be called when the current token is known to be part of - /// simple-type-specifier. 
- void ParseCXXSimpleTypeSpecifier(DeclSpec &DS); + case DeclSpecContext::DSC_normal: + case DeclSpecContext::DSC_objc_method_result: + case DeclSpecContext::DSC_condition: + case DeclSpecContext::DSC_template_arg: + case DeclSpecContext::DSC_conv_operator: + case DeclSpecContext::DSC_association: + return ImplicitTypenameContext::No; + } + llvm_unreachable("Missing DeclSpecContext case"); + } - bool ParseCXXTypeSpecifierSeq( - DeclSpec &DS, DeclaratorContext Context = DeclaratorContext::TypeName); + /// Information on a C++0x for-range-initializer found while parsing a + /// declaration which turns out to be a for-range-declaration. + struct ForRangeInit { + SourceLocation ColonLoc; + ExprResult RangeExpr; + SmallVector LifetimeExtendTemps; + bool ParsedForRangeDecl() { return !ColonLoc.isInvalid(); } + }; + struct ForRangeInfo : ForRangeInit { + StmtResult LoopVar; + }; - //===--------------------------------------------------------------------===// - // C++ 5.3.4 and 5.3.5: C++ new and delete - bool ParseExpressionListOrTypeId(SmallVectorImpl &Exprs, - Declarator &D); - void ParseDirectNewDeclarator(Declarator &D); - ExprResult ParseCXXNewExpression(bool UseGlobal, SourceLocation Start); - ExprResult ParseCXXDeleteExpression(bool UseGlobal, - SourceLocation Start); + /// ParseDeclaration - Parse a full 'declaration', which consists of + /// declaration-specifiers, some number of declarators, and a semicolon. + /// 'Context' should be a DeclaratorContext value. This returns the + /// location of the semicolon in DeclEnd. + /// + /// \verbatim + /// declaration: [C99 6.7] + /// block-declaration -> + /// simple-declaration + /// others [FIXME] + /// [C++] template-declaration + /// [C++] namespace-definition + /// [C++] using-directive + /// [C++] using-declaration + /// [C++11/C11] static_assert-declaration + /// others... 
[FIXME] + /// \endverbatim + /// + DeclGroupPtrTy ParseDeclaration(DeclaratorContext Context, + SourceLocation &DeclEnd, + ParsedAttributes &DeclAttrs, + ParsedAttributes &DeclSpecAttrs, + SourceLocation *DeclSpecStart = nullptr); - //===--------------------------------------------------------------------===// - // C++ if/switch/while/for condition expression. - struct ForRangeInfo; - Sema::ConditionResult ParseCXXCondition(StmtResult *InitStmt, - SourceLocation Loc, - Sema::ConditionKind CK, - bool MissingOK, - ForRangeInfo *FRI = nullptr, - bool EnterForConditionScope = false); - DeclGroupPtrTy ParseAliasDeclarationInInitStatement(DeclaratorContext Context, - ParsedAttributes &Attrs); + /// \verbatim + /// simple-declaration: [C99 6.7: declaration] [C++ 7p1: dcl.dcl] + /// declaration-specifiers init-declarator-list[opt] ';' + /// [C++11] attribute-specifier-seq decl-specifier-seq[opt] + /// init-declarator-list ';' + ///[C90/C++]init-declarator-list ';' [TODO] + /// [OMP] threadprivate-directive + /// [OMP] allocate-directive [TODO] + /// + /// for-range-declaration: [C++11 6.5p1: stmt.ranged] + /// attribute-specifier-seq[opt] type-specifier-seq declarator + /// \endverbatim + /// + /// If RequireSemi is false, this does not check for a ';' at the end of the + /// declaration. If it is true, it checks for and eats it. + /// + /// If FRI is non-null, we might be parsing a for-range-declaration instead + /// of a simple-declaration. If we find that we are, we also parse the + /// for-range-initializer, and place it here. + /// + /// DeclSpecStart is used when decl-specifiers are parsed before parsing + /// the Declaration. The SourceLocation for this Decl is set to + /// DeclSpecStart if DeclSpecStart is non-null. 
+ DeclGroupPtrTy + ParseSimpleDeclaration(DeclaratorContext Context, SourceLocation &DeclEnd, + ParsedAttributes &DeclAttrs, + ParsedAttributes &DeclSpecAttrs, bool RequireSemi, + ForRangeInit *FRI = nullptr, + SourceLocation *DeclSpecStart = nullptr); - //===--------------------------------------------------------------------===// - // C++ Coroutines + /// ParseDeclGroup - Having concluded that this is either a function + /// definition or a group of object declarations, actually parse the + /// result. + /// + /// Returns true if this might be the start of a declarator, or a common typo + /// for a declarator. + bool MightBeDeclarator(DeclaratorContext Context); + DeclGroupPtrTy ParseDeclGroup(ParsingDeclSpec &DS, DeclaratorContext Context, + ParsedAttributes &Attrs, + ParsedTemplateInfo &TemplateInfo, + SourceLocation *DeclEnd = nullptr, + ForRangeInit *FRI = nullptr); - ExprResult ParseCoyieldExpression(); + /// Parse 'declaration' after parsing 'declaration-specifiers + /// declarator'. This method parses the remainder of the declaration + /// (including any attributes or initializer, among other things) and + /// finalizes the declaration. + /// + /// \verbatim + /// init-declarator: [C99 6.7] + /// declarator + /// declarator '=' initializer + /// [GNU] declarator simple-asm-expr[opt] attributes[opt] + /// [GNU] declarator simple-asm-expr[opt] attributes[opt] '=' initializer + /// [C++] declarator initializer[opt] + /// + /// [C++] initializer: + /// [C++] '=' initializer-clause + /// [C++] '(' expression-list ')' + /// [C++0x] '=' 'default' [TODO] + /// [C++0x] '=' 'delete' + /// [C++0x] braced-init-list + /// \endverbatim + /// + /// According to the standard grammar, =default and =delete are function + /// definitions, but that definitely doesn't fit with the parser here. 
+ /// + Decl *ParseDeclarationAfterDeclarator( + Declarator &D, + const ParsedTemplateInfo &TemplateInfo = ParsedTemplateInfo()); - //===--------------------------------------------------------------------===// - // C++ Concepts + /// Parse an optional simple-asm-expr and attributes, and attach them to a + /// declarator. Returns true on an error. + bool ParseAsmAttributesAfterDeclarator(Declarator &D); + Decl *ParseDeclarationAfterDeclaratorAndAttributes( + Declarator &D, + const ParsedTemplateInfo &TemplateInfo = ParsedTemplateInfo(), + ForRangeInit *FRI = nullptr); - ExprResult ParseRequiresExpression(); - void ParseTrailingRequiresClause(Declarator &D); + /// ParseImplicitInt - This method is called when we have an non-typename + /// identifier in a declspec (which normally terminates the decl spec) when + /// the declspec has no type specifier. In this case, the declspec is either + /// malformed or is "implicit int" (in K&R and C89). + /// + /// This method handles diagnosing this prettily and returns false if the + /// declspec is done being processed. If it recovers and thinks there may be + /// other pieces of declspec after it, it returns true. + /// + bool ParseImplicitInt(DeclSpec &DS, CXXScopeSpec *SS, + ParsedTemplateInfo &TemplateInfo, AccessSpecifier AS, + DeclSpecContext DSC, ParsedAttributes &Attrs); - //===--------------------------------------------------------------------===// - // C99 6.7.8: Initialization. - - /// ParseInitializer - /// initializer: [C99 6.7.8] - /// assignment-expression - /// '{' ... - ExprResult ParseInitializer() { - if (Tok.isNot(tok::l_brace)) - return ParseAssignmentExpression(); - return ParseBraceInitializer(); + /// Determine the declaration specifier context from the declarator + /// context. + /// + /// \param Context the declarator context, which is one of the + /// DeclaratorContext enumerator values. 
+ DeclSpecContext + getDeclSpecContextFromDeclaratorContext(DeclaratorContext Context); + void + ParseDeclarationSpecifiers(DeclSpec &DS, ParsedTemplateInfo &TemplateInfo, + AccessSpecifier AS = AS_none, + DeclSpecContext DSC = DeclSpecContext::DSC_normal, + LateParsedAttrList *LateAttrs = nullptr) { + return ParseDeclarationSpecifiers(DS, TemplateInfo, AS, DSC, LateAttrs, + getImplicitTypenameContext(DSC)); } - bool MayBeDesignationStart(); - ExprResult ParseBraceInitializer(); - struct DesignatorCompletionInfo { - SmallVectorImpl &InitExprs; - QualType PreferredBaseType; - }; - ExprResult ParseInitializerWithPotentialDesignator(DesignatorCompletionInfo); - ExprResult createEmbedExpr(); - void injectEmbedTokens(); - //===--------------------------------------------------------------------===// - // clang Expressions + /// ParseDeclarationSpecifiers + /// \verbatim + /// declaration-specifiers: [C99 6.7] + /// storage-class-specifier declaration-specifiers[opt] + /// type-specifier declaration-specifiers[opt] + /// [C99] function-specifier declaration-specifiers[opt] + /// [C11] alignment-specifier declaration-specifiers[opt] + /// [GNU] attributes declaration-specifiers[opt] + /// [Clang] '__module_private__' declaration-specifiers[opt] + /// [ObjC1] '__kindof' declaration-specifiers[opt] + /// + /// storage-class-specifier: [C99 6.7.1] + /// 'typedef' + /// 'extern' + /// 'static' + /// 'auto' + /// 'register' + /// [C++] 'mutable' + /// [C++11] 'thread_local' + /// [C11] '_Thread_local' + /// [GNU] '__thread' + /// function-specifier: [C99 6.7.4] + /// [C99] 'inline' + /// [C++] 'virtual' + /// [C++] 'explicit' + /// [OpenCL] '__kernel' + /// 'friend': [C++ dcl.friend] + /// 'constexpr': [C++0x dcl.constexpr] + /// \endverbatim + void + ParseDeclarationSpecifiers(DeclSpec &DS, ParsedTemplateInfo &TemplateInfo, + AccessSpecifier AS, DeclSpecContext DSC, + LateParsedAttrList *LateAttrs, + ImplicitTypenameContext AllowImplicitTypename); - ExprResult 
ParseBlockLiteralExpression(); // ^{...} + /// Determine whether we're looking at something that might be a declarator + /// in a simple-declaration. If it can't possibly be a declarator, maybe + /// diagnose a missing semicolon after a prior tag definition in the decl + /// specifier. + /// + /// \return \c true if an error occurred and this can't be any kind of + /// declaration. + bool DiagnoseMissingSemiAfterTagDefinition( + DeclSpec &DS, AccessSpecifier AS, DeclSpecContext DSContext, + LateParsedAttrList *LateAttrs = nullptr); - //===--------------------------------------------------------------------===// - // Objective-C Expressions - ExprResult ParseObjCAtExpression(SourceLocation AtLocation); - ExprResult ParseObjCStringLiteral(SourceLocation AtLoc); - ExprResult ParseObjCCharacterLiteral(SourceLocation AtLoc); - ExprResult ParseObjCNumericLiteral(SourceLocation AtLoc); - ExprResult ParseObjCBooleanLiteral(SourceLocation AtLoc, bool ArgValue); - ExprResult ParseObjCArrayLiteral(SourceLocation AtLoc); - ExprResult ParseObjCDictionaryLiteral(SourceLocation AtLoc); - ExprResult ParseObjCBoxedExpr(SourceLocation AtLoc); - ExprResult ParseObjCEncodeExpression(SourceLocation AtLoc); - ExprResult ParseObjCSelectorExpression(SourceLocation AtLoc); - ExprResult ParseObjCProtocolExpression(SourceLocation AtLoc); - bool isSimpleObjCMessageExpression(); - ExprResult ParseObjCMessageExpression(); - ExprResult ParseObjCMessageExpressionBody(SourceLocation LBracloc, - SourceLocation SuperLoc, - ParsedType ReceiverType, - Expr *ReceiverExpr); - ExprResult ParseAssignmentExprWithObjCMessageExprStart( - SourceLocation LBracloc, SourceLocation SuperLoc, - ParsedType ReceiverType, Expr *ReceiverExpr); - bool ParseObjCXXMessageReceiver(bool &IsExpr, void *&TypeOrExpr); + void ParseSpecifierQualifierList( + DeclSpec &DS, AccessSpecifier AS = AS_none, + DeclSpecContext DSC = DeclSpecContext::DSC_normal) { + ParseSpecifierQualifierList(DS, getImplicitTypenameContext(DSC), AS, 
DSC); + } - //===--------------------------------------------------------------------===// - // C99 6.8: Statements and Blocks. + /// ParseSpecifierQualifierList + /// \verbatim + /// specifier-qualifier-list: + /// type-specifier specifier-qualifier-list[opt] + /// type-qualifier specifier-qualifier-list[opt] + /// [GNU] attributes specifier-qualifier-list[opt] + /// \endverbatim + /// + void ParseSpecifierQualifierList( + DeclSpec &DS, ImplicitTypenameContext AllowImplicitTypename, + AccessSpecifier AS = AS_none, + DeclSpecContext DSC = DeclSpecContext::DSC_normal); - /// A SmallVector of expressions. - typedef SmallVector ExprVector; + /// ParseEnumSpecifier + /// \verbatim + /// enum-specifier: [C99 6.7.2.2] + /// 'enum' identifier[opt] '{' enumerator-list '}' + ///[C99/C++]'enum' identifier[opt] '{' enumerator-list ',' '}' + /// [GNU] 'enum' attributes[opt] identifier[opt] '{' enumerator-list ',' [opt] + /// '}' attributes[opt] + /// [MS] 'enum' __declspec[opt] identifier[opt] '{' enumerator-list ',' [opt] + /// '}' + /// 'enum' identifier + /// [GNU] 'enum' attributes[opt] identifier + /// + /// [C++11] enum-head '{' enumerator-list[opt] '}' + /// [C++11] enum-head '{' enumerator-list ',' '}' + /// + /// enum-head: [C++11] + /// enum-key attribute-specifier-seq[opt] identifier[opt] enum-base[opt] + /// enum-key attribute-specifier-seq[opt] nested-name-specifier + /// identifier enum-base[opt] + /// + /// enum-key: [C++11] + /// 'enum' + /// 'enum' 'class' + /// 'enum' 'struct' + /// + /// enum-base: [C++11] + /// ':' type-specifier-seq + /// + /// [C++] elaborated-type-specifier: + /// [C++] 'enum' nested-name-specifier[opt] identifier + /// \endverbatim + /// + void ParseEnumSpecifier(SourceLocation TagLoc, DeclSpec &DS, + const ParsedTemplateInfo &TemplateInfo, + AccessSpecifier AS, DeclSpecContext DSC); - StmtResult - ParseStatement(SourceLocation *TrailingElseLoc = nullptr, - ParsedStmtContext StmtCtx = ParsedStmtContext::SubStmt); - StmtResult 
ParseStatementOrDeclaration( - StmtVector &Stmts, ParsedStmtContext StmtCtx, - SourceLocation *TrailingElseLoc = nullptr); - StmtResult ParseStatementOrDeclarationAfterAttributes( - StmtVector &Stmts, ParsedStmtContext StmtCtx, - SourceLocation *TrailingElseLoc, ParsedAttributes &DeclAttrs, - ParsedAttributes &DeclSpecAttrs); - StmtResult ParseExprStatement(ParsedStmtContext StmtCtx); - StmtResult ParseLabeledStatement(ParsedAttributes &Attrs, - ParsedStmtContext StmtCtx); - StmtResult ParseCaseStatement(ParsedStmtContext StmtCtx, - bool MissingCase = false, - ExprResult Expr = ExprResult()); - StmtResult ParseDefaultStatement(ParsedStmtContext StmtCtx); - StmtResult ParseCompoundStatement(bool isStmtExpr = false); - StmtResult ParseCompoundStatement(bool isStmtExpr, - unsigned ScopeFlags); - void ParseCompoundStatementLeadingPragmas(); - void DiagnoseLabelAtEndOfCompoundStatement(); - bool ConsumeNullStmt(StmtVector &Stmts); - StmtResult ParseCompoundStatementBody(bool isStmtExpr = false); - bool ParseParenExprOrCondition(StmtResult *InitStmt, - Sema::ConditionResult &CondResult, - SourceLocation Loc, Sema::ConditionKind CK, - SourceLocation &LParenLoc, - SourceLocation &RParenLoc); - StmtResult ParseIfStatement(SourceLocation *TrailingElseLoc); - StmtResult ParseSwitchStatement(SourceLocation *TrailingElseLoc); - StmtResult ParseWhileStatement(SourceLocation *TrailingElseLoc); - StmtResult ParseDoStatement(); - StmtResult ParseForStatement(SourceLocation *TrailingElseLoc); - StmtResult ParseGotoStatement(); - StmtResult ParseContinueStatement(); - StmtResult ParseBreakStatement(); - StmtResult ParseReturnStatement(); - StmtResult ParseAsmStatement(bool &msAsm); - StmtResult ParseMicrosoftAsmStatement(SourceLocation AsmLoc); - StmtResult ParsePragmaLoopHint(StmtVector &Stmts, ParsedStmtContext StmtCtx, - SourceLocation *TrailingElseLoc, - ParsedAttributes &Attrs); + /// ParseEnumBody - Parse a {} enclosed enumerator-list. 
+ /// \verbatim + /// enumerator-list: + /// enumerator + /// enumerator-list ',' enumerator + /// enumerator: + /// enumeration-constant attributes[opt] + /// enumeration-constant attributes[opt] '=' constant-expression + /// enumeration-constant: + /// identifier + /// \endverbatim + /// + void ParseEnumBody(SourceLocation StartLoc, Decl *TagDecl, + SkipBodyInfo *SkipBody = nullptr); - /// Describes the condition of a Microsoft __if_exists or - /// __if_not_exists block. - struct IfExistsCondition { - /// The location of the initial keyword. - SourceLocation KeywordLoc; - /// Whether this is an __if_exists block (rather than an - /// __if_not_exists block). - bool IsIfExists; + /// ParseStructUnionBody + /// \verbatim + /// struct-contents: + /// struct-declaration-list + /// [EXT] empty + /// [GNU] "struct-declaration-list" without terminating ';' + /// struct-declaration-list: + /// struct-declaration + /// struct-declaration-list struct-declaration + /// [OBC] '@' 'defs' '(' class-name ')' + /// \endverbatim + /// + void ParseStructUnionBody(SourceLocation StartLoc, DeclSpec::TST TagType, + RecordDecl *TagDecl); - /// Nested-name-specifier preceding the name. - CXXScopeSpec SS; + /// ParseStructDeclaration - Parse a struct declaration without the + /// terminating semicolon. + /// + /// Note that a struct declaration refers to a declaration in a struct, + /// not to the declaration of a struct. 
+ /// + /// \verbatim + /// struct-declaration: + /// [C23] attributes-specifier-seq[opt] + /// specifier-qualifier-list struct-declarator-list + /// [GNU] __extension__ struct-declaration + /// [GNU] specifier-qualifier-list + /// struct-declarator-list: + /// struct-declarator + /// struct-declarator-list ',' struct-declarator + /// [GNU] struct-declarator-list ',' attributes[opt] struct-declarator + /// struct-declarator: + /// declarator + /// [GNU] declarator attributes[opt] + /// declarator[opt] ':' constant-expression + /// [GNU] declarator[opt] ':' constant-expression attributes[opt] + /// \endverbatim + /// + void ParseStructDeclaration( + ParsingDeclSpec &DS, + llvm::function_ref FieldsCallback, + LateParsedAttrList *LateFieldAttrs = nullptr); - /// The name we're looking for. - UnqualifiedId Name; + DeclGroupPtrTy ParseTopLevelStmtDecl(); - /// The behavior of this __if_exists or __if_not_exists block - /// should. - IfExistsBehavior Behavior; - }; + /// isDeclarationSpecifier() - Return true if the current token is part of a + /// declaration specifier. + /// + /// \param AllowImplicitTypename whether this is a context where T::type [T + /// dependent] can appear. + /// \param DisambiguatingWithExpression True to indicate that the purpose of + /// this check is to disambiguate between an expression and a declaration. 
+ bool isDeclarationSpecifier(ImplicitTypenameContext AllowImplicitTypename, + bool DisambiguatingWithExpression = false); - bool ParseMicrosoftIfExistsCondition(IfExistsCondition& Result); - void ParseMicrosoftIfExistsStatement(StmtVector &Stmts); - void ParseMicrosoftIfExistsExternalDeclaration(); - void ParseMicrosoftIfExistsClassDeclaration(DeclSpec::TST TagType, - ParsedAttributes &AccessAttrs, - AccessSpecifier &CurAS); - bool ParseMicrosoftIfExistsBraceInitializer(ExprVector &InitExprs, - bool &InitExprsOk); - bool ParseAsmOperandsOpt(SmallVectorImpl &Names, - SmallVectorImpl &Constraints, - SmallVectorImpl &Exprs); + /// isTypeSpecifierQualifier - Return true if the current token could be the + /// start of a specifier-qualifier-list. + bool isTypeSpecifierQualifier(); - //===--------------------------------------------------------------------===// - // C++ 6: Statements and Blocks + /// isKnownToBeTypeSpecifier - Return true if we know that the specified token + /// is definitely a type-specifier. Return false if it isn't part of a type + /// specifier or if we're not sure. + bool isKnownToBeTypeSpecifier(const Token &Tok) const; - StmtResult ParseCXXTryBlock(); - StmtResult ParseCXXTryBlockCommon(SourceLocation TryLoc, bool FnTry = false); - StmtResult ParseCXXCatchBlock(bool FnCatch = false); + /// Starting with a scope specifier, identifier, or + /// template-id that refers to the current class, determine whether + /// this is a constructor declarator. + bool isConstructorDeclarator( + bool Unqualified, bool DeductionGuide = false, + DeclSpec::FriendSpecified IsFriend = DeclSpec::FriendSpecified::No, + const ParsedTemplateInfo *TemplateInfo = nullptr); - //===--------------------------------------------------------------------===// - // MS: SEH Statements and Blocks + /// Diagnoses use of _ExtInt as being deprecated, and diagnoses use of + /// _BitInt as an extension when appropriate. 
+ void DiagnoseBitIntUse(const Token &Tok); - StmtResult ParseSEHTryBlock(); - StmtResult ParseSEHExceptBlock(SourceLocation Loc); - StmtResult ParseSEHFinallyBlock(SourceLocation Loc); - StmtResult ParseSEHLeaveStatement(); + // Check for the start of an attribute-specifier-seq in a context where an + // attribute is not allowed. + bool CheckProhibitedCXX11Attribute() { + assert(Tok.is(tok::l_square)); + if (NextToken().isNot(tok::l_square)) + return false; + return DiagnoseProhibitedCXX11Attribute(); + } - //===--------------------------------------------------------------------===// - // Objective-C Statements + /// DiagnoseProhibitedCXX11Attribute - We have found the opening square + /// brackets of a C++11 attribute-specifier in a location where an attribute + /// is not permitted. By C++11 [dcl.attr.grammar]p6, this is ill-formed. + /// Diagnose this situation. + /// + /// \return \c true if we skipped an attribute-like chunk of tokens, \c false + /// if this doesn't appear to actually be an attribute-specifier, and the + /// caller should try to parse it. + bool DiagnoseProhibitedCXX11Attribute(); - StmtResult ParseObjCAtStatement(SourceLocation atLoc, - ParsedStmtContext StmtCtx); - StmtResult ParseObjCTryStmt(SourceLocation atLoc); - StmtResult ParseObjCThrowStmt(SourceLocation atLoc); - StmtResult ParseObjCSynchronizedStmt(SourceLocation atLoc); - StmtResult ParseObjCAutoreleasePoolStmt(SourceLocation atLoc); + void CheckMisplacedCXX11Attribute(ParsedAttributes &Attrs, + SourceLocation CorrectLocation) { + if (!Tok.isRegularKeywordAttribute() && + (Tok.isNot(tok::l_square) || NextToken().isNot(tok::l_square)) && + Tok.isNot(tok::kw_alignas)) + return; + DiagnoseMisplacedCXX11Attribute(Attrs, CorrectLocation); + } + /// We have found the opening square brackets of a C++11 + /// attribute-specifier in a location where an attribute is not permitted, but + /// we know where the attributes ought to be written. 
Parse them anyway, and + /// provide a fixit moving them to the right place. + void DiagnoseMisplacedCXX11Attribute(ParsedAttributes &Attrs, + SourceLocation CorrectLocation); - //===--------------------------------------------------------------------===// - // C99 6.7: Declarations. + // Usually, `__attribute__((attrib)) class Foo {} var` means that attribute + // applies to var, not the type Foo. + // As an exception to the rule, __declspec(align(...)) before the + // class-key affects the type instead of the variable. + // Also, Microsoft-style [attributes] seem to affect the type instead of the + // variable. + // This function moves attributes that should apply to the type off DS to + // Attrs. + void stripTypeAttributesOffDeclSpec(ParsedAttributes &Attrs, DeclSpec &DS, + TagUseKind TUK); - /// A context for parsing declaration specifiers. TODO: flesh this - /// out, there are other significant restrictions on specifiers than - /// would be best implemented in the parser. - enum class DeclSpecContext { - DSC_normal, // normal context - DSC_class, // class context, enables 'friend' - DSC_type_specifier, // C++ type-specifier-seq or C specifier-qualifier-list - DSC_trailing, // C++11 trailing-type-specifier in a trailing return type - DSC_alias_declaration, // C++11 type-specifier-seq in an alias-declaration - DSC_conv_operator, // C++ type-specifier-seq in an conversion operator - DSC_top_level, // top-level/namespace declaration context - DSC_template_param, // template parameter context - DSC_template_arg, // template argument context - DSC_template_type_arg, // template type argument context - DSC_objc_method_result, // ObjC method result context, enables - // 'instancetype' - DSC_condition, // condition declaration context - DSC_association, // A _Generic selection expression's type association - DSC_new, // C++ new expression - }; + // FixItLoc = possible correct location for the attributes + void ProhibitAttributes(ParsedAttributes &Attrs, + 
SourceLocation FixItLoc = SourceLocation()) { + if (Attrs.Range.isInvalid()) + return; + DiagnoseProhibitedAttributes(Attrs, FixItLoc); + Attrs.clear(); + } - /// Is this a context in which we are parsing just a type-specifier (or - /// trailing-type-specifier)? - static bool isTypeSpecifier(DeclSpecContext DSC) { - switch (DSC) { - case DeclSpecContext::DSC_normal: - case DeclSpecContext::DSC_template_param: - case DeclSpecContext::DSC_template_arg: - case DeclSpecContext::DSC_class: - case DeclSpecContext::DSC_top_level: - case DeclSpecContext::DSC_objc_method_result: - case DeclSpecContext::DSC_condition: - return false; - - case DeclSpecContext::DSC_template_type_arg: - case DeclSpecContext::DSC_type_specifier: - case DeclSpecContext::DSC_conv_operator: - case DeclSpecContext::DSC_trailing: - case DeclSpecContext::DSC_alias_declaration: - case DeclSpecContext::DSC_association: - case DeclSpecContext::DSC_new: - return true; - } - llvm_unreachable("Missing DeclSpecContext case"); + void ProhibitAttributes(ParsedAttributesView &Attrs, + SourceLocation FixItLoc = SourceLocation()) { + if (Attrs.Range.isInvalid()) + return; + DiagnoseProhibitedAttributes(Attrs, FixItLoc); + Attrs.clearListOnly(); } + void DiagnoseProhibitedAttributes(const ParsedAttributesView &Attrs, + SourceLocation FixItLoc); - /// Whether a defining-type-specifier is permitted in a given context. - enum class AllowDefiningTypeSpec { - /// The grammar doesn't allow a defining-type-specifier here, and we must - /// not parse one (eg, because a '{' could mean something else). - No, - /// The grammar doesn't allow a defining-type-specifier here, but we permit - /// one for error recovery purposes. Sema will reject. - NoButErrorRecovery, - /// The grammar allows a defining-type-specifier here, even though it's - /// always invalid. Sema will reject. - YesButInvalid, - /// The grammar allows a defining-type-specifier here, and one can be valid. 
- Yes - }; + // Forbid C++11 and C23 attributes that appear on certain syntactic locations + // which standard permits but we don't supported yet, for example, attributes + // appertain to decl specifiers. + // For the most cases we don't want to warn on unknown type attributes, but + // left them to later diagnoses. However, for a few cases like module + // declarations and module import declarations, we should do it. + void ProhibitCXX11Attributes(ParsedAttributes &Attrs, unsigned AttrDiagID, + unsigned KeywordDiagId, + bool DiagnoseEmptyAttrs = false, + bool WarnOnUnknownAttrs = false); - /// Is this a context in which we are parsing defining-type-specifiers (and - /// so permit class and enum definitions in addition to non-defining class and - /// enum elaborated-type-specifiers)? - static AllowDefiningTypeSpec - isDefiningTypeSpecifierContext(DeclSpecContext DSC, bool IsCPlusPlus) { - switch (DSC) { - case DeclSpecContext::DSC_normal: - case DeclSpecContext::DSC_class: - case DeclSpecContext::DSC_top_level: - case DeclSpecContext::DSC_alias_declaration: - case DeclSpecContext::DSC_objc_method_result: - return AllowDefiningTypeSpec::Yes; + /// Emit warnings for C++11 and C23 attributes that are in a position that + /// clang accepts as an extension. + void DiagnoseCXX11AttributeExtension(ParsedAttributes &Attrs); - case DeclSpecContext::DSC_condition: - case DeclSpecContext::DSC_template_param: - return AllowDefiningTypeSpec::YesButInvalid; + ExprResult ParseUnevaluatedStringInAttribute(const IdentifierInfo &AttrName); - case DeclSpecContext::DSC_template_type_arg: - case DeclSpecContext::DSC_type_specifier: - return AllowDefiningTypeSpec::NoButErrorRecovery; + bool + ParseAttributeArgumentList(const clang::IdentifierInfo &AttrName, + SmallVectorImpl &Exprs, + ParsedAttributeArgumentsProperties ArgsProperties); - case DeclSpecContext::DSC_association: - return IsCPlusPlus ? 
AllowDefiningTypeSpec::NoButErrorRecovery - : AllowDefiningTypeSpec::Yes; + /// Parses syntax-generic attribute arguments for attributes which are + /// known to the implementation, and adds them to the given ParsedAttributes + /// list with the given attribute syntax. Returns the number of arguments + /// parsed for the attribute. + unsigned + ParseAttributeArgsCommon(IdentifierInfo *AttrName, SourceLocation AttrNameLoc, + ParsedAttributes &Attrs, SourceLocation *EndLoc, + IdentifierInfo *ScopeName, SourceLocation ScopeLoc, + ParsedAttr::Form Form); - case DeclSpecContext::DSC_trailing: - case DeclSpecContext::DSC_conv_operator: - case DeclSpecContext::DSC_template_arg: - case DeclSpecContext::DSC_new: - return AllowDefiningTypeSpec::No; - } - llvm_unreachable("Missing DeclSpecContext case"); - } + enum ParseAttrKindMask { + PAKM_GNU = 1 << 0, + PAKM_Declspec = 1 << 1, + PAKM_CXX11 = 1 << 2, + }; - /// Is this a context in which an opaque-enum-declaration can appear? - static bool isOpaqueEnumDeclarationContext(DeclSpecContext DSC) { - switch (DSC) { - case DeclSpecContext::DSC_normal: - case DeclSpecContext::DSC_class: - case DeclSpecContext::DSC_top_level: + /// \brief Parse attributes based on what syntaxes are desired, allowing for + /// the order to vary. e.g. with PAKM_GNU | PAKM_Declspec: + /// __attribute__((...)) __declspec(...) __attribute__((...))) + /// Note that Microsoft attributes (spelled with single square brackets) are + /// not supported by this because of parsing ambiguities with other + /// constructs. + /// + /// There are some attribute parse orderings that should not be allowed in + /// arbitrary order. e.g., + /// + /// \verbatim + /// [[]] __attribute__(()) int i; // OK + /// __attribute__(()) [[]] int i; // Not OK + /// \endverbatim + /// + /// Such situations should use the specific attribute parsing functionality. 
+ void ParseAttributes(unsigned WhichAttrKinds, ParsedAttributes &Attrs, + LateParsedAttrList *LateAttrs = nullptr); + /// \brief Possibly parse attributes based on what syntaxes are desired, + /// allowing for the order to vary. + bool MaybeParseAttributes(unsigned WhichAttrKinds, ParsedAttributes &Attrs, + LateParsedAttrList *LateAttrs = nullptr) { + if (Tok.isOneOf(tok::kw___attribute, tok::kw___declspec) || + isAllowedCXX11AttributeSpecifier()) { + ParseAttributes(WhichAttrKinds, Attrs, LateAttrs); return true; + } + return false; + } - case DeclSpecContext::DSC_alias_declaration: - case DeclSpecContext::DSC_objc_method_result: - case DeclSpecContext::DSC_condition: - case DeclSpecContext::DSC_template_param: - case DeclSpecContext::DSC_template_type_arg: - case DeclSpecContext::DSC_type_specifier: - case DeclSpecContext::DSC_trailing: - case DeclSpecContext::DSC_association: - case DeclSpecContext::DSC_conv_operator: - case DeclSpecContext::DSC_template_arg: - case DeclSpecContext::DSC_new: - - return false; + void MaybeParseGNUAttributes(Declarator &D, + LateParsedAttrList *LateAttrs = nullptr) { + if (Tok.is(tok::kw___attribute)) { + ParsedAttributes Attrs(AttrFactory); + ParseGNUAttributes(Attrs, LateAttrs, &D); + D.takeAttributes(Attrs); } - llvm_unreachable("Missing DeclSpecContext case"); } - /// Is this a context in which we can perform class template argument - /// deduction? 
- static bool isClassTemplateDeductionContext(DeclSpecContext DSC) { - switch (DSC) { - case DeclSpecContext::DSC_normal: - case DeclSpecContext::DSC_template_param: - case DeclSpecContext::DSC_template_arg: - case DeclSpecContext::DSC_class: - case DeclSpecContext::DSC_top_level: - case DeclSpecContext::DSC_condition: - case DeclSpecContext::DSC_type_specifier: - case DeclSpecContext::DSC_association: - case DeclSpecContext::DSC_conv_operator: - case DeclSpecContext::DSC_new: + bool MaybeParseGNUAttributes(ParsedAttributes &Attrs, + LateParsedAttrList *LateAttrs = nullptr) { + if (Tok.is(tok::kw___attribute)) { + ParseGNUAttributes(Attrs, LateAttrs); return true; - - case DeclSpecContext::DSC_objc_method_result: - case DeclSpecContext::DSC_template_type_arg: - case DeclSpecContext::DSC_trailing: - case DeclSpecContext::DSC_alias_declaration: - return false; } - llvm_unreachable("Missing DeclSpecContext case"); + return false; } - // Is this a context in which an implicit 'typename' is allowed? - static ImplicitTypenameContext - getImplicitTypenameContext(DeclSpecContext DSC) { - switch (DSC) { - case DeclSpecContext::DSC_class: - case DeclSpecContext::DSC_top_level: - case DeclSpecContext::DSC_type_specifier: - case DeclSpecContext::DSC_template_type_arg: - case DeclSpecContext::DSC_trailing: - case DeclSpecContext::DSC_alias_declaration: - case DeclSpecContext::DSC_template_param: - case DeclSpecContext::DSC_new: - return ImplicitTypenameContext::Yes; + /// ParseSingleGNUAttribute - Parse a single GNU attribute. 
+ /// + /// \verbatim + /// [GNU] attrib: + /// empty + /// attrib-name + /// attrib-name '(' identifier ')' + /// attrib-name '(' identifier ',' nonempty-expr-list ')' + /// attrib-name '(' argument-expression-list [C99 6.5.2] ')' + /// + /// [GNU] attrib-name: + /// identifier + /// typespec + /// typequal + /// storageclass + /// \endverbatim + bool ParseSingleGNUAttribute(ParsedAttributes &Attrs, SourceLocation &EndLoc, + LateParsedAttrList *LateAttrs = nullptr, + Declarator *D = nullptr); - case DeclSpecContext::DSC_normal: - case DeclSpecContext::DSC_objc_method_result: - case DeclSpecContext::DSC_condition: - case DeclSpecContext::DSC_template_arg: - case DeclSpecContext::DSC_conv_operator: - case DeclSpecContext::DSC_association: - return ImplicitTypenameContext::No; + /// ParseGNUAttributes - Parse a non-empty attributes list. + /// + /// \verbatim + /// [GNU] attributes: + /// attribute + /// attributes attribute + /// + /// [GNU] attribute: + /// '__attribute__' '(' '(' attribute-list ')' ')' + /// + /// [GNU] attribute-list: + /// attrib + /// attribute_list ',' attrib + /// + /// [GNU] attrib: + /// empty + /// attrib-name + /// attrib-name '(' identifier ')' + /// attrib-name '(' identifier ',' nonempty-expr-list ')' + /// attrib-name '(' argument-expression-list [C99 6.5.2] ')' + /// + /// [GNU] attrib-name: + /// identifier + /// typespec + /// typequal + /// storageclass + /// \endverbatim + /// + /// Whether an attribute takes an 'identifier' is determined by the + /// attrib-name. GCC's behavior here is not worth imitating: + /// + /// * In C mode, if the attribute argument list starts with an identifier + /// followed by a ',' or an ')', and the identifier doesn't resolve to + /// a type, it is parsed as an identifier. If the attribute actually + /// wanted an expression, it's out of luck (but it turns out that no + /// attributes work that way, because C constant expressions are very + /// limited). 
+ /// * In C++ mode, if the attribute argument list starts with an identifier, + /// and the attribute *wants* an identifier, it is parsed as an identifier. + /// At block scope, any additional tokens between the identifier and the + /// ',' or ')' are ignored, otherwise they produce a parse error. + /// + /// We follow the C++ model, but don't allow junk after the identifier. + void ParseGNUAttributes(ParsedAttributes &Attrs, + LateParsedAttrList *LateAttrs = nullptr, + Declarator *D = nullptr); + + /// Parse the arguments to a parameterized GNU attribute or + /// a C++11 attribute in "gnu" namespace. + void ParseGNUAttributeArgs(IdentifierInfo *AttrName, + SourceLocation AttrNameLoc, + ParsedAttributes &Attrs, SourceLocation *EndLoc, + IdentifierInfo *ScopeName, SourceLocation ScopeLoc, + ParsedAttr::Form Form, Declarator *D); + IdentifierLoc *ParseIdentifierLoc(); + + unsigned + ParseClangAttributeArgs(IdentifierInfo *AttrName, SourceLocation AttrNameLoc, + ParsedAttributes &Attrs, SourceLocation *EndLoc, + IdentifierInfo *ScopeName, SourceLocation ScopeLoc, + ParsedAttr::Form Form); + + void MaybeParseCXX11Attributes(Declarator &D) { + if (isAllowedCXX11AttributeSpecifier()) { + ParsedAttributes Attrs(AttrFactory); + ParseCXX11Attributes(Attrs); + D.takeAttributes(Attrs); } - llvm_unreachable("Missing DeclSpecContext case"); } - /// Information on a C++0x for-range-initializer found while parsing a - /// declaration which turns out to be a for-range-declaration. 
- struct ForRangeInit { - SourceLocation ColonLoc; - ExprResult RangeExpr; - SmallVector LifetimeExtendTemps; - bool ParsedForRangeDecl() { return !ColonLoc.isInvalid(); } - }; - struct ForRangeInfo : ForRangeInit { - StmtResult LoopVar; - }; + bool MaybeParseCXX11Attributes(ParsedAttributes &Attrs, + bool OuterMightBeMessageSend = false) { + if (isAllowedCXX11AttributeSpecifier(false, OuterMightBeMessageSend)) { + ParseCXX11Attributes(Attrs); + return true; + } + return false; + } - DeclGroupPtrTy ParseDeclaration(DeclaratorContext Context, - SourceLocation &DeclEnd, - ParsedAttributes &DeclAttrs, - ParsedAttributes &DeclSpecAttrs, - SourceLocation *DeclSpecStart = nullptr); - DeclGroupPtrTy - ParseSimpleDeclaration(DeclaratorContext Context, SourceLocation &DeclEnd, - ParsedAttributes &DeclAttrs, - ParsedAttributes &DeclSpecAttrs, bool RequireSemi, - ForRangeInit *FRI = nullptr, - SourceLocation *DeclSpecStart = nullptr); - bool MightBeDeclarator(DeclaratorContext Context); - DeclGroupPtrTy ParseDeclGroup(ParsingDeclSpec &DS, DeclaratorContext Context, - ParsedAttributes &Attrs, - ParsedTemplateInfo &TemplateInfo, - SourceLocation *DeclEnd = nullptr, - ForRangeInit *FRI = nullptr); - Decl *ParseDeclarationAfterDeclarator(Declarator &D, - const ParsedTemplateInfo &TemplateInfo = ParsedTemplateInfo()); - bool ParseAsmAttributesAfterDeclarator(Declarator &D); - Decl *ParseDeclarationAfterDeclaratorAndAttributes( - Declarator &D, - const ParsedTemplateInfo &TemplateInfo = ParsedTemplateInfo(), - ForRangeInit *FRI = nullptr); - Decl *ParseFunctionStatementBody(Decl *Decl, ParseScope &BodyScope); - Decl *ParseFunctionTryBlock(Decl *Decl, ParseScope &BodyScope); + bool MaybeParseMicrosoftAttributes(ParsedAttributes &Attrs) { + bool AttrsParsed = false; + if ((getLangOpts().MicrosoftExt || getLangOpts().HLSL) && + Tok.is(tok::l_square)) { + ParsedAttributes AttrsWithRange(AttrFactory); + ParseMicrosoftAttributes(AttrsWithRange); + AttrsParsed = !AttrsWithRange.empty(); + 
Attrs.takeAllFrom(AttrsWithRange); + } + return AttrsParsed; + } + bool MaybeParseMicrosoftDeclSpecs(ParsedAttributes &Attrs) { + if (getLangOpts().DeclSpecKeyword && Tok.is(tok::kw___declspec)) { + ParseMicrosoftDeclSpecs(Attrs); + return true; + } + return false; + } - /// When in code-completion, skip parsing of the function/method body - /// unless the body contains the code-completion point. + /// \verbatim + /// [MS] decl-specifier: + /// __declspec ( extended-decl-modifier-seq ) /// - /// \returns true if the function body was skipped. - bool trySkippingFunctionBody(); + /// [MS] extended-decl-modifier-seq: + /// extended-decl-modifier[opt] + /// extended-decl-modifier extended-decl-modifier-seq + /// \endverbatim + void ParseMicrosoftDeclSpecs(ParsedAttributes &Attrs); + bool ParseMicrosoftDeclSpecArgs(IdentifierInfo *AttrName, + SourceLocation AttrNameLoc, + ParsedAttributes &Attrs); + void ParseMicrosoftTypeAttributes(ParsedAttributes &attrs); + void ParseWebAssemblyFuncrefTypeAttribute(ParsedAttributes &Attrs); + void DiagnoseAndSkipExtendedMicrosoftTypeAttributes(); + SourceLocation SkipExtendedMicrosoftTypeAttributes(); - bool ParseImplicitInt(DeclSpec &DS, CXXScopeSpec *SS, - ParsedTemplateInfo &TemplateInfo, AccessSpecifier AS, - DeclSpecContext DSC, ParsedAttributes &Attrs); - DeclSpecContext - getDeclSpecContextFromDeclaratorContext(DeclaratorContext Context); - void - ParseDeclarationSpecifiers(DeclSpec &DS, ParsedTemplateInfo &TemplateInfo, - AccessSpecifier AS = AS_none, - DeclSpecContext DSC = DeclSpecContext::DSC_normal, - LateParsedAttrList *LateAttrs = nullptr) { - return ParseDeclarationSpecifiers(DS, TemplateInfo, AS, DSC, LateAttrs, - getImplicitTypenameContext(DSC)); - } - void - ParseDeclarationSpecifiers(DeclSpec &DS, ParsedTemplateInfo &TemplateInfo, - AccessSpecifier AS, DeclSpecContext DSC, - LateParsedAttrList *LateAttrs, - ImplicitTypenameContext AllowImplicitTypename); + void ParseBorlandTypeAttributes(ParsedAttributes &attrs); + 
void ParseOpenCLKernelAttributes(ParsedAttributes &attrs); + void ParseOpenCLQualifiers(ParsedAttributes &Attrs); + void ParseNullabilityTypeSpecifiers(ParsedAttributes &attrs); + void ParseCUDAFunctionAttributes(ParsedAttributes &attrs); + bool isHLSLQualifier(const Token &Tok) const; + void ParseHLSLQualifiers(ParsedAttributes &Attrs); - SourceLocation ParsePackIndexingType(DeclSpec &DS); - void AnnotateExistingIndexedTypeNamePack(ParsedType T, - SourceLocation StartLoc, - SourceLocation EndLoc); + /// Parse a version number. + /// + /// \verbatim + /// version: + /// simple-integer + /// simple-integer '.' simple-integer + /// simple-integer '_' simple-integer + /// simple-integer '.' simple-integer '.' simple-integer + /// simple-integer '_' simple-integer '_' simple-integer + /// \endverbatim + VersionTuple ParseVersionTuple(SourceRange &Range); - bool DiagnoseMissingSemiAfterTagDefinition( - DeclSpec &DS, AccessSpecifier AS, DeclSpecContext DSContext, - LateParsedAttrList *LateAttrs = nullptr); + /// Parse the contents of the "availability" attribute. 
+ /// + /// \verbatim + /// availability-attribute: + /// 'availability' '(' platform ',' opt-strict version-arg-list, + /// opt-replacement, opt-message')' + /// + /// platform: + /// identifier + /// + /// opt-strict: + /// 'strict' ',' + /// + /// version-arg-list: + /// version-arg + /// version-arg ',' version-arg-list + /// + /// version-arg: + /// 'introduced' '=' version + /// 'deprecated' '=' version + /// 'obsoleted' = version + /// 'unavailable' + /// opt-replacement: + /// 'replacement' '=' + /// opt-message: + /// 'message' '=' + /// \endverbatim + void ParseAvailabilityAttribute(IdentifierInfo &Availability, + SourceLocation AvailabilityLoc, + ParsedAttributes &attrs, + SourceLocation *endLoc, + IdentifierInfo *ScopeName, + SourceLocation ScopeLoc, + ParsedAttr::Form Form); - void ParseSpecifierQualifierList( - DeclSpec &DS, AccessSpecifier AS = AS_none, - DeclSpecContext DSC = DeclSpecContext::DSC_normal) { - ParseSpecifierQualifierList(DS, getImplicitTypenameContext(DSC), AS, DSC); - } + /// Parse the contents of the "external_source_symbol" attribute. + /// + /// \verbatim + /// external-source-symbol-attribute: + /// 'external_source_symbol' '(' keyword-arg-list ')' + /// + /// keyword-arg-list: + /// keyword-arg + /// keyword-arg ',' keyword-arg-list + /// + /// keyword-arg: + /// 'language' '=' + /// 'defined_in' '=' + /// 'USR' '=' + /// 'generated_declaration' + /// \endverbatim + void ParseExternalSourceSymbolAttribute(IdentifierInfo &ExternalSourceSymbol, + SourceLocation Loc, + ParsedAttributes &Attrs, + SourceLocation *EndLoc, + IdentifierInfo *ScopeName, + SourceLocation ScopeLoc, + ParsedAttr::Form Form); - void ParseSpecifierQualifierList( - DeclSpec &DS, ImplicitTypenameContext AllowImplicitTypename, - AccessSpecifier AS = AS_none, - DeclSpecContext DSC = DeclSpecContext::DSC_normal); + /// Parse the contents of the "objc_bridge_related" attribute. 
+ /// \verbatim + /// objc_bridge_related '(' related_class ',' opt-class_method ',' opt-instance_method ')' + /// related_class: + /// Identifier + /// + /// opt-class_method: + /// Identifier: | + /// + /// opt-instance_method: + /// Identifier | + /// \endverbatim + /// + void ParseObjCBridgeRelatedAttribute(IdentifierInfo &ObjCBridgeRelated, + SourceLocation ObjCBridgeRelatedLoc, + ParsedAttributes &Attrs, + SourceLocation *EndLoc, + IdentifierInfo *ScopeName, + SourceLocation ScopeLoc, + ParsedAttr::Form Form); - void ParseObjCTypeQualifierList(ObjCDeclSpec &DS, - DeclaratorContext Context); + void ParseSwiftNewTypeAttribute(IdentifierInfo &AttrName, + SourceLocation AttrNameLoc, + ParsedAttributes &Attrs, + SourceLocation *EndLoc, + IdentifierInfo *ScopeName, + SourceLocation ScopeLoc, + ParsedAttr::Form Form); - void ParseEnumSpecifier(SourceLocation TagLoc, DeclSpec &DS, - const ParsedTemplateInfo &TemplateInfo, - AccessSpecifier AS, DeclSpecContext DSC); - void ParseEnumBody(SourceLocation StartLoc, Decl *TagDecl, - SkipBodyInfo *SkipBody = nullptr); - void ParseStructUnionBody(SourceLocation StartLoc, DeclSpec::TST TagType, - RecordDecl *TagDecl); + void ParseTypeTagForDatatypeAttribute(IdentifierInfo &AttrName, + SourceLocation AttrNameLoc, + ParsedAttributes &Attrs, + SourceLocation *EndLoc, + IdentifierInfo *ScopeName, + SourceLocation ScopeLoc, + ParsedAttr::Form Form); - void ParseStructDeclaration( - ParsingDeclSpec &DS, - llvm::function_ref FieldsCallback, - LateParsedAttrList *LateFieldAttrs = nullptr); + void ParseAttributeWithTypeArg(IdentifierInfo &AttrName, + SourceLocation AttrNameLoc, + ParsedAttributes &Attrs, + IdentifierInfo *ScopeName, + SourceLocation ScopeLoc, + ParsedAttr::Form Form); - DeclGroupPtrTy ParseTopLevelStmtDecl(); + void DistributeCLateParsedAttrs(Decl *Dcl, LateParsedAttrList *LateAttrs); - bool isDeclarationSpecifier(ImplicitTypenameContext AllowImplicitTypename, - bool DisambiguatingWithExpression = false); - bool 
isTypeSpecifierQualifier(); + /// Bounds attributes (e.g., counted_by): + /// \verbatim + /// AttrName '(' expression ')' + /// \endverbatim + void ParseBoundsAttribute(IdentifierInfo &AttrName, + SourceLocation AttrNameLoc, ParsedAttributes &Attrs, + IdentifierInfo *ScopeName, SourceLocation ScopeLoc, + ParsedAttr::Form Form); - /// isKnownToBeTypeSpecifier - Return true if we know that the specified token - /// is definitely a type-specifier. Return false if it isn't part of a type - /// specifier or if we're not sure. - bool isKnownToBeTypeSpecifier(const Token &Tok) const; + /// \verbatim + /// [GNU] typeof-specifier: + /// typeof ( expressions ) + /// typeof ( type-name ) + /// [GNU/C++] typeof unary-expression + /// [C23] typeof-specifier: + /// typeof '(' typeof-specifier-argument ')' + /// typeof_unqual '(' typeof-specifier-argument ')' + /// + /// typeof-specifier-argument: + /// expression + /// type-name + /// \endverbatim + /// + void ParseTypeofSpecifier(DeclSpec &DS); - /// Return true if we know that we are definitely looking at a - /// decl-specifier, and isn't part of an expression such as a function-style - /// cast. Return false if it's no a decl-specifier, or we're not sure. - bool isKnownToBeDeclarationSpecifier() { - if (getLangOpts().CPlusPlus) - return isCXXDeclarationSpecifier(ImplicitTypenameContext::No) == - TPResult::True; - return isDeclarationSpecifier(ImplicitTypenameContext::No, true); - } + /// \verbatim + /// [C11] atomic-specifier: + /// _Atomic ( type-name ) + /// \endverbatim + /// + void ParseAtomicSpecifier(DeclSpec &DS); - /// isDeclarationStatement - Disambiguates between a declaration or an - /// expression statement, when parsing function bodies. + /// ParseAlignArgument - Parse the argument to an alignment-specifier. /// - /// \param DisambiguatingWithExpression - True to indicate that the purpose of - /// this check is to disambiguate between an expression and a declaration. 
- /// Returns true for declaration, false for expression. - bool isDeclarationStatement(bool DisambiguatingWithExpression = false) { - if (getLangOpts().CPlusPlus) - return isCXXDeclarationStatement(DisambiguatingWithExpression); - return isDeclarationSpecifier(ImplicitTypenameContext::No, true); - } + /// \verbatim + /// [C11] type-id + /// [C11] constant-expression + /// [C++0x] type-id ...[opt] + /// [C++0x] assignment-expression ...[opt] + /// \endverbatim + ExprResult ParseAlignArgument(StringRef KWName, SourceLocation Start, + SourceLocation &EllipsisLoc, bool &IsType, + ParsedType &Ty); - /// isForInitDeclaration - Disambiguates between a declaration or an - /// expression in the context of the C 'clause-1' or the C++ - // 'for-init-statement' part of a 'for' statement. - /// Returns true for declaration, false for expression. - bool isForInitDeclaration() { - if (getLangOpts().OpenMP) - Actions.OpenMP().startOpenMPLoop(); - if (getLangOpts().CPlusPlus) - return Tok.is(tok::kw_using) || - isCXXSimpleDeclaration(/*AllowForRangeDecl=*/true); - return isDeclarationSpecifier(ImplicitTypenameContext::No, true); - } + /// ParseAlignmentSpecifier - Parse an alignment-specifier, and add the + /// attribute to Attrs. + /// + /// \verbatim + /// alignment-specifier: + /// [C11] '_Alignas' '(' type-id ')' + /// [C11] '_Alignas' '(' constant-expression ')' + /// [C++11] 'alignas' '(' type-id ...[opt] ')' + /// [C++11] 'alignas' '(' assignment-expression ...[opt] ')' + /// \endverbatim + void ParseAlignmentSpecifier(ParsedAttributes &Attrs, + SourceLocation *endLoc = nullptr); + ExprResult ParseExtIntegerArgument(); - /// Determine whether this is a C++1z for-range-identifier. 
- bool isForRangeIdentifier(); + /// \verbatim + /// type-qualifier: + /// ('__ptrauth') '(' constant-expression + /// (',' constant-expression)[opt] + /// (',' constant-expression)[opt] ')' + /// \endverbatim + void ParsePtrauthQualifier(ParsedAttributes &Attrs); - /// Determine whether we are currently at the start of an Objective-C - /// class message that appears to be missing the open bracket '['. - bool isStartOfObjCClassMessageMissingOpenBracket(); + /// DeclaratorScopeObj - RAII object used in Parser::ParseDirectDeclarator to + /// enter a new C++ declarator scope and exit it when the function is + /// finished. + class DeclaratorScopeObj { + Parser &P; + CXXScopeSpec &SS; + bool EnteredScope; + bool CreatedScope; - /// Starting with a scope specifier, identifier, or - /// template-id that refers to the current class, determine whether - /// this is a constructor declarator. - bool isConstructorDeclarator( - bool Unqualified, bool DeductionGuide = false, - DeclSpec::FriendSpecified IsFriend = DeclSpec::FriendSpecified::No, - const ParsedTemplateInfo *TemplateInfo = nullptr); + public: + DeclaratorScopeObj(Parser &p, CXXScopeSpec &ss) + : P(p), SS(ss), EnteredScope(false), CreatedScope(false) {} - /// isTypeIdInParens - Assumes that a '(' was parsed and now we want to know - /// whether the parens contain an expression or a type-id. - /// Returns true for a type-id and false for an expression. 
- bool isTypeIdInParens(bool &isAmbiguous) { - if (getLangOpts().CPlusPlus) - return isCXXTypeId(TentativeCXXTypeIdContext::InParens, isAmbiguous); - isAmbiguous = false; - return isTypeSpecifierQualifier(); - } - bool isTypeIdInParens() { - bool isAmbiguous; - return isTypeIdInParens(isAmbiguous); - } + void EnterDeclaratorScope() { + assert(!EnteredScope && "Already entered the scope!"); + assert(SS.isSet() && "C++ scope was not set!"); - /// Checks whether the current tokens form a type-id or an expression for the - /// purposes of use as the initial operand to a generic selection expression. - /// This requires special handling in C++ because it accepts either a type or - /// an expression, and we need to disambiguate which is which. However, we - /// cannot use the same logic as we've used for sizeof expressions, because - /// that logic relies on the operator only accepting a single argument, - /// whereas _Generic accepts a list of arguments. - bool isTypeIdForGenericSelection() { - if (getLangOpts().CPlusPlus) { - bool isAmbiguous; - return isCXXTypeId(TentativeCXXTypeIdContext::AsGenericSelectionArgument, - isAmbiguous); - } - return isTypeSpecifierQualifier(); - } + CreatedScope = true; + P.EnterScope(0); // Not a decl scope. - /// Checks if the current tokens form type-id or expression. - /// It is similar to isTypeIdInParens but does not suppose that type-id - /// is in parenthesis. - bool isTypeIdUnambiguously() { - if (getLangOpts().CPlusPlus) { - bool isAmbiguous; - return isCXXTypeId(TentativeCXXTypeIdContext::Unambiguous, isAmbiguous); + if (!P.Actions.ActOnCXXEnterDeclaratorScope(P.getCurScope(), SS)) + EnteredScope = true; } - return isTypeSpecifierQualifier(); - } - /// isCXXDeclarationStatement - C++-specialized function that disambiguates - /// between a declaration or an expression statement, when parsing function - /// bodies. Returns true for declaration, false for expression. 
- bool isCXXDeclarationStatement(bool DisambiguatingWithExpression = false); + ~DeclaratorScopeObj() { + if (EnteredScope) { + assert(SS.isSet() && "C++ scope was cleared ?"); + P.Actions.ActOnCXXExitDeclaratorScope(P.getCurScope(), SS); + } + if (CreatedScope) + P.ExitScope(); + } + }; - /// isCXXSimpleDeclaration - C++-specialized function that disambiguates - /// between a simple-declaration or an expression-statement. - /// If during the disambiguation process a parsing error is encountered, - /// the function returns true to let the declaration parsing code handle it. - /// Returns false if the statement is disambiguated as expression. - bool isCXXSimpleDeclaration(bool AllowForRangeDecl); + /// ParseDeclarator - Parse and verify a newly-initialized declarator. + void ParseDeclarator(Declarator &D); + /// A function that parses a variant of direct-declarator. + typedef void (Parser::*DirectDeclParseFunction)(Declarator &); - /// isCXXFunctionDeclarator - Disambiguates between a function declarator or - /// a constructor-style initializer, when parsing declaration statements. - /// Returns true for function declarator and false for constructor-style - /// initializer. Sets 'IsAmbiguous' to true to indicate that this declaration - /// might be a constructor-style initializer. - /// If during the disambiguation process a parsing error is encountered, - /// the function returns true to let the declaration parsing code handle it. - bool isCXXFunctionDeclarator(bool *IsAmbiguous = nullptr, - ImplicitTypenameContext AllowImplicitTypename = - ImplicitTypenameContext::No); + /// ParseDeclaratorInternal - Parse a C or C++ declarator. The + /// direct-declarator is parsed by the function passed to it. Pass null, and + /// the direct-declarator isn't parsed at all, making this function + /// effectively parse the C++ ptr-operator production. 
+ /// + /// If the grammar of this construct is extended, matching changes must also + /// be made to TryParseDeclarator and MightBeDeclarator, and possibly to + /// isConstructorDeclarator. + /// + /// \verbatim + /// declarator: [C99 6.7.5] [C++ 8p4, dcl.decl] + /// [C] pointer[opt] direct-declarator + /// [C++] direct-declarator + /// [C++] ptr-operator declarator + /// + /// pointer: [C99 6.7.5] + /// '*' type-qualifier-list[opt] + /// '*' type-qualifier-list[opt] pointer + /// + /// ptr-operator: + /// '*' cv-qualifier-seq[opt] + /// '&' + /// [C++0x] '&&' + /// [GNU] '&' restrict[opt] attributes[opt] + /// [GNU?] '&&' restrict[opt] attributes[opt] + /// '::'[opt] nested-name-specifier '*' cv-qualifier-seq[opt] + /// \endverbatim + void ParseDeclaratorInternal(Declarator &D, + DirectDeclParseFunction DirectDeclParser); - struct ConditionDeclarationOrInitStatementState; - enum class ConditionOrInitStatement { - Expression, ///< Disambiguated as an expression (either kind). - ConditionDecl, ///< Disambiguated as the declaration form of condition. - InitStmtDecl, ///< Disambiguated as a simple-declaration init-statement. - ForRangeDecl, ///< Disambiguated as a for-range declaration. - Error ///< Can't be any of the above! + enum AttrRequirements { + AR_NoAttributesParsed = 0, ///< No attributes are diagnosed. + AR_GNUAttributesParsedAndRejected = 1 << 0, ///< Diagnose GNU attributes. + AR_GNUAttributesParsed = 1 << 1, + AR_CXX11AttributesParsed = 1 << 2, + AR_DeclspecAttributesParsed = 1 << 3, + AR_AllAttributesParsed = AR_GNUAttributesParsed | AR_CXX11AttributesParsed | + AR_DeclspecAttributesParsed, + AR_VendorAttributesParsed = + AR_GNUAttributesParsed | AR_DeclspecAttributesParsed }; - /// Disambiguates between the different kinds of things that can happen - /// after 'if (' or 'switch ('. This could be one of two different kinds of - /// declaration (depending on whether there is a ';' later) or an expression. 
- ConditionOrInitStatement - isCXXConditionDeclarationOrInitStatement(bool CanBeInitStmt, - bool CanBeForRangeDecl); - bool isCXXTypeId(TentativeCXXTypeIdContext Context, bool &isAmbiguous); - bool isCXXTypeId(TentativeCXXTypeIdContext Context) { - bool isAmbiguous; - return isCXXTypeId(Context, isAmbiguous); - } + /// ParseTypeQualifierListOpt + /// \verbatim + /// type-qualifier-list: [C99 6.7.5] + /// type-qualifier + /// [vendor] attributes + /// [ only if AttrReqs & AR_VendorAttributesParsed ] + /// type-qualifier-list type-qualifier + /// [vendor] type-qualifier-list attributes + /// [ only if AttrReqs & AR_VendorAttributesParsed ] + /// [C++0x] attribute-specifier[opt] is allowed before cv-qualifier-seq + /// [ only if AttReqs & AR_CXX11AttributesParsed ] + /// \endverbatim + /// Note: vendor can be GNU, MS, etc and can be explicitly controlled via + /// AttrRequirements bitmask values. + void ParseTypeQualifierListOpt( + DeclSpec &DS, unsigned AttrReqs = AR_AllAttributesParsed, + bool AtomicOrPtrauthAllowed = true, bool IdentifierRequired = false, + std::optional> CodeCompletionHandler = + std::nullopt); - /// TPResult - Used as the result value for functions whose purpose is to - /// disambiguate C++ constructs by "tentatively parsing" them. 
- enum class TPResult { - True, False, Ambiguous, Error - }; + /// ParseDirectDeclarator + /// \verbatim + /// direct-declarator: [C99 6.7.5] + /// [C99] identifier + /// '(' declarator ')' + /// [GNU] '(' attributes declarator ')' + /// [C90] direct-declarator '[' constant-expression[opt] ']' + /// [C99] direct-declarator '[' type-qual-list[opt] assignment-expr[opt] ']' + /// [C99] direct-declarator '[' 'static' type-qual-list[opt] assign-expr ']' + /// [C99] direct-declarator '[' type-qual-list 'static' assignment-expr ']' + /// [C99] direct-declarator '[' type-qual-list[opt] '*' ']' + /// [C++11] direct-declarator '[' constant-expression[opt] ']' + /// attribute-specifier-seq[opt] + /// direct-declarator '(' parameter-type-list ')' + /// direct-declarator '(' identifier-list[opt] ')' + /// [GNU] direct-declarator '(' parameter-forward-declarations + /// parameter-type-list[opt] ')' + /// [C++] direct-declarator '(' parameter-declaration-clause ')' + /// cv-qualifier-seq[opt] exception-specification[opt] + /// [C++11] direct-declarator '(' parameter-declaration-clause ')' + /// attribute-specifier-seq[opt] cv-qualifier-seq[opt] + /// ref-qualifier[opt] exception-specification[opt] + /// [C++] declarator-id + /// [C++11] declarator-id attribute-specifier-seq[opt] + /// + /// declarator-id: [C++ 8] + /// '...'[opt] id-expression + /// '::'[opt] nested-name-specifier[opt] type-name + /// + /// id-expression: [C++ 5.1] + /// unqualified-id + /// qualified-id + /// + /// unqualified-id: [C++ 5.1] + /// identifier + /// operator-function-id + /// conversion-function-id + /// '~' class-name + /// template-id + /// + /// C++17 adds the following, which we also handle here: + /// + /// simple-declaration: + /// '[' identifier-list ']' brace-or-equal-initializer ';' + /// \endverbatim + /// + /// Note, any additional constructs added here may need corresponding changes + /// in isConstructorDeclarator. 
+ void ParseDirectDeclarator(Declarator &D); + void ParseDecompositionDeclarator(Declarator &D); - /// Determine whether we could have an enum-base. + /// ParseParenDeclarator - We parsed the declarator D up to a paren. This is + /// only called before the identifier, so these are most likely just grouping + /// parens for precedence. If we find that these are actually function + /// parameter parens in an abstract-declarator, we call + /// ParseFunctionDeclarator. /// - /// \p AllowSemi If \c true, then allow a ';' after the enum-base; otherwise - /// only consider this to be an enum-base if the next token is a '{'. + /// \verbatim + /// direct-declarator: + /// '(' declarator ')' + /// [GNU] '(' attributes declarator ')' + /// direct-declarator '(' parameter-type-list ')' + /// direct-declarator '(' identifier-list[opt] ')' + /// [GNU] direct-declarator '(' parameter-forward-declarations + /// parameter-type-list[opt] ')' + /// \endverbatim /// - /// \return \c false if this cannot possibly be an enum base; \c true - /// otherwise. - bool isEnumBase(bool AllowSemi); + void ParseParenDeclarator(Declarator &D); - /// isCXXDeclarationSpecifier - Returns TPResult::True if it is a - /// declaration specifier, TPResult::False if it is not, - /// TPResult::Ambiguous if it could be either a decl-specifier or a - /// function-style cast, and TPResult::Error if a parsing error was - /// encountered. If it could be a braced C++11 function-style cast, returns - /// BracedCastResult. - /// Doesn't consume tokens. - TPResult - isCXXDeclarationSpecifier(ImplicitTypenameContext AllowImplicitTypename, - TPResult BracedCastResult = TPResult::False, - bool *InvalidAsDeclSpec = nullptr); + /// ParseFunctionDeclarator - We are after the identifier and have parsed the + /// declarator D up to a paren, which indicates that we are parsing function + /// arguments. 
+ /// + /// If FirstArgAttrs is non-null, then the caller parsed those attributes + /// immediately after the open paren - they will be applied to the DeclSpec + /// of the first parameter. + /// + /// If RequiresArg is true, then the first argument of the function is + /// required to be present and required to not be an identifier list. + /// + /// For C++, after the parameter-list, it also parses the + /// cv-qualifier-seq[opt], (C++11) ref-qualifier[opt], + /// exception-specification[opt], (C++11) attribute-specifier-seq[opt], + /// (C++11) trailing-return-type[opt] and (C++2a) the trailing + /// requires-clause. + /// + /// \verbatim + /// [C++11] exception-specification: + /// dynamic-exception-specification + /// noexcept-specification + /// \endverbatim + /// + void ParseFunctionDeclarator(Declarator &D, ParsedAttributes &FirstArgAttrs, + BalancedDelimiterTracker &Tracker, + bool IsAmbiguous, bool RequiresArg = false); + void InitCXXThisScopeForDeclaratorIfRelevant( + const Declarator &D, const DeclSpec &DS, + std::optional &ThisScope); - /// Given that isCXXDeclarationSpecifier returns \c TPResult::True or - /// \c TPResult::Ambiguous, determine whether the decl-specifier would be - /// a type-specifier other than a cv-qualifier. - bool isCXXDeclarationSpecifierAType(); + /// ParseRefQualifier - Parses a member function ref-qualifier. Returns + /// true if a ref-qualifier is found. + bool ParseRefQualifier(bool &RefQualifierIsLValueRef, + SourceLocation &RefQualifierLoc); - /// Determine whether the current token sequence might be - /// '<' template-argument-list '>' - /// rather than a less-than expression. - TPResult isTemplateArgumentList(unsigned TokensToSkip); + /// isFunctionDeclaratorIdentifierList - This parameter list may have an + /// identifier list form for a K&R-style function: void foo(a,b,c) + /// + /// Note that identifier-lists are only allowed for normal declarators, not + /// for abstract-declarators. 
+ bool isFunctionDeclaratorIdentifierList(); - /// Determine whether an '(' after an 'explicit' keyword is part of a C++20 - /// 'explicit(bool)' declaration, in earlier language modes where that is an - /// extension. - TPResult isExplicitBool(); + /// ParseFunctionDeclaratorIdentifierList - While parsing a function + /// declarator we found a K&R-style identifier list instead of a typed + /// parameter list. + /// + /// After returning, ParamInfo will hold the parsed parameters. + /// + /// \verbatim + /// identifier-list: [C99 6.7.5] + /// identifier + /// identifier-list ',' identifier + /// \endverbatim + /// + void ParseFunctionDeclaratorIdentifierList( + Declarator &D, SmallVectorImpl &ParamInfo); + void ParseParameterDeclarationClause( + Declarator &D, ParsedAttributes &attrs, + SmallVectorImpl &ParamInfo, + SourceLocation &EllipsisLoc) { + return ParseParameterDeclarationClause( + D.getContext(), attrs, ParamInfo, EllipsisLoc, + D.getCXXScopeSpec().isSet() && + D.isFunctionDeclaratorAFunctionDeclaration()); + } - /// Determine whether an identifier has been tentatively declared as a - /// non-type. Such tentative declarations should not be found to name a type - /// during a tentative parse, but also should not be annotated as a non-type. - bool isTentativelyDeclared(IdentifierInfo *II); + /// ParseParameterDeclarationClause - Parse a (possibly empty) parameter-list + /// after the opening parenthesis. This function will not parse a K&R-style + /// identifier list. + /// + /// DeclContext is the context of the declarator being parsed. If + /// FirstArgAttrs is non-null, then the caller parsed those attributes + /// immediately after the open paren - they will be applied to the DeclSpec of + /// the first parameter. + /// + /// After returning, ParamInfo will hold the parsed parameters. EllipsisLoc + /// will be the location of the ellipsis, if any was parsed. 
+ /// + /// \verbatim + /// parameter-type-list: [C99 6.7.5] + /// parameter-list + /// parameter-list ',' '...' + /// [C++] parameter-list '...' + /// + /// parameter-list: [C99 6.7.5] + /// parameter-declaration + /// parameter-list ',' parameter-declaration + /// + /// parameter-declaration: [C99 6.7.5] + /// declaration-specifiers declarator + /// [C++] declaration-specifiers declarator '=' assignment-expression + /// [C++11] initializer-clause + /// [GNU] declaration-specifiers declarator attributes + /// declaration-specifiers abstract-declarator[opt] + /// [C++] declaration-specifiers abstract-declarator[opt] + /// '=' assignment-expression + /// [GNU] declaration-specifiers abstract-declarator[opt] attributes + /// [C++11] attribute-specifier-seq parameter-declaration + /// [C++2b] attribute-specifier-seq 'this' parameter-declaration + /// \endverbatim + /// + void ParseParameterDeclarationClause( + DeclaratorContext DeclaratorContext, ParsedAttributes &attrs, + SmallVectorImpl &ParamInfo, + SourceLocation &EllipsisLoc, bool IsACXXFunctionDeclaration = false); - // "Tentative parsing" functions, used for disambiguation. If a parsing error - // is encountered they will return TPResult::Error. - // Returning TPResult::True/False indicates that the ambiguity was - // resolved and tentative parsing may stop. TPResult::Ambiguous indicates - // that more tentative parsing is necessary for disambiguation. - // They all consume tokens, so backtracking should be used after calling them. 
+ /// \verbatim + /// [C90] direct-declarator '[' constant-expression[opt] ']' + /// [C99] direct-declarator '[' type-qual-list[opt] assignment-expr[opt] ']' + /// [C99] direct-declarator '[' 'static' type-qual-list[opt] assign-expr ']' + /// [C99] direct-declarator '[' type-qual-list 'static' assignment-expr ']' + /// [C99] direct-declarator '[' type-qual-list[opt] '*' ']' + /// [C++11] direct-declarator '[' constant-expression[opt] ']' + /// attribute-specifier-seq[opt] + /// \endverbatim + void ParseBracketDeclarator(Declarator &D); - TPResult TryParseSimpleDeclaration(bool AllowForRangeDecl); - TPResult TryParseTypeofSpecifier(); - TPResult TryParseProtocolQualifiers(); - TPResult TryParsePtrOperatorSeq(); - TPResult TryParseOperatorId(); - TPResult TryParseInitDeclaratorList(bool MayHaveTrailingReturnType = false); - TPResult TryParseDeclarator(bool mayBeAbstract, bool mayHaveIdentifier = true, - bool mayHaveDirectInit = false, - bool mayHaveTrailingReturnType = false); - TPResult TryParseParameterDeclarationClause( - bool *InvalidAsDeclaration = nullptr, bool VersusTemplateArg = false, - ImplicitTypenameContext AllowImplicitTypename = - ImplicitTypenameContext::No); - TPResult TryParseFunctionDeclarator(bool MayHaveTrailingReturnType = false); - bool NameAfterArrowIsNonType(); - TPResult TryParseBracketDeclarator(); - TPResult TryConsumeDeclarationSpecifier(); + /// Diagnose brackets before an identifier. + void ParseMisplacedBracketDeclarator(Declarator &D); - /// Try to skip a possibly empty sequence of 'attribute-specifier's without - /// full validation of the syntactic structure of attributes. - bool TrySkipAttributes(); + /// Parse the given string as a type. + /// + /// This is a dangerous utility function currently employed only by API notes. + /// It is not a general entry-point for safely parsing types from strings. + /// + /// \param TypeStr The string to be parsed as a type. 
+ /// \param Context The name of the context in which this string is being + /// parsed, which will be used in diagnostics. + /// \param IncludeLoc The location at which this parse was triggered. + TypeResult ParseTypeFromString(StringRef TypeStr, StringRef Context, + SourceLocation IncludeLoc); - /// Diagnoses use of _ExtInt as being deprecated, and diagnoses use of - /// _BitInt as an extension when appropriate. - void DiagnoseBitIntUse(const Token &Tok); + ///@} -public: - TypeResult - ParseTypeName(SourceRange *Range = nullptr, - DeclaratorContext Context = DeclaratorContext::TypeName, - AccessSpecifier AS = AS_none, Decl **OwnedType = nullptr, - ParsedAttributes *Attrs = nullptr); + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name C++ Declarations + /// Implementations are in ParseDeclCXX.cpp + ///@{ private: - void ParseBlockId(SourceLocation CaretLoc); + /// Contextual keywords for Microsoft extensions. + mutable IdentifierInfo *Ident_sealed; + mutable IdentifierInfo *Ident_abstract; - /// Return true if the next token should be treated as a [[]] attribute, - /// or as a keyword that behaves like one. The former is only true if - /// [[]] attributes are enabled, whereas the latter is true whenever - /// such a keyword appears. The arguments are as for - /// isCXX11AttributeSpecifier. - bool isAllowedCXX11AttributeSpecifier(bool Disambiguate = false, - bool OuterMightBeMessageSend = false) { - return (Tok.isRegularKeywordAttribute() || - isCXX11AttributeSpecifier(Disambiguate, OuterMightBeMessageSend) != - CXX11AttributeKind::NotAttributeSpecifier); - } + /// C++11 contextual keywords. 
+ mutable IdentifierInfo *Ident_final; + mutable IdentifierInfo *Ident_GNU_final; + mutable IdentifierInfo *Ident_override; + mutable IdentifierInfo *Ident_trivially_relocatable_if_eligible; + mutable IdentifierInfo *Ident_replaceable_if_eligible; - // Check for the start of an attribute-specifier-seq in a context where an - // attribute is not allowed. - bool CheckProhibitedCXX11Attribute() { - assert(Tok.is(tok::l_square)); - if (NextToken().isNot(tok::l_square)) - return false; - return DiagnoseProhibitedCXX11Attribute(); - } + /// Representation of a class that has been parsed, including + /// any member function declarations or definitions that need to be + /// parsed after the corresponding top-level class is complete. + struct ParsingClass { + ParsingClass(Decl *TagOrTemplate, bool TopLevelClass, bool IsInterface) + : TopLevelClass(TopLevelClass), IsInterface(IsInterface), + TagOrTemplate(TagOrTemplate) {} - bool DiagnoseProhibitedCXX11Attribute(); - void CheckMisplacedCXX11Attribute(ParsedAttributes &Attrs, - SourceLocation CorrectLocation) { - if (!Tok.isRegularKeywordAttribute() && - (Tok.isNot(tok::l_square) || NextToken().isNot(tok::l_square)) && - Tok.isNot(tok::kw_alignas)) - return; - DiagnoseMisplacedCXX11Attribute(Attrs, CorrectLocation); - } - void DiagnoseMisplacedCXX11Attribute(ParsedAttributes &Attrs, - SourceLocation CorrectLocation); - - void stripTypeAttributesOffDeclSpec(ParsedAttributes &Attrs, DeclSpec &DS, - TagUseKind TUK); - - // FixItLoc = possible correct location for the attributes - void ProhibitAttributes(ParsedAttributes &Attrs, - SourceLocation FixItLoc = SourceLocation()) { - if (Attrs.Range.isInvalid()) - return; - DiagnoseProhibitedAttributes(Attrs, FixItLoc); - Attrs.clear(); - } + /// Whether this is a "top-level" class, meaning that it is + /// not nested within another class. 
+ bool TopLevelClass : 1; - void ProhibitAttributes(ParsedAttributesView &Attrs, - SourceLocation FixItLoc = SourceLocation()) { - if (Attrs.Range.isInvalid()) - return; - DiagnoseProhibitedAttributes(Attrs, FixItLoc); - Attrs.clearListOnly(); - } - void DiagnoseProhibitedAttributes(const ParsedAttributesView &Attrs, - SourceLocation FixItLoc); + /// Whether this class is an __interface. + bool IsInterface : 1; - // Forbid C++11 and C23 attributes that appear on certain syntactic locations - // which standard permits but we don't supported yet, for example, attributes - // appertain to decl specifiers. - // For the most cases we don't want to warn on unknown type attributes, but - // left them to later diagnoses. However, for a few cases like module - // declarations and module import declarations, we should do it. - void ProhibitCXX11Attributes(ParsedAttributes &Attrs, unsigned AttrDiagID, - unsigned KeywordDiagId, - bool DiagnoseEmptyAttrs = false, - bool WarnOnUnknownAttrs = false); + /// The class or class template whose definition we are parsing. + Decl *TagOrTemplate; - /// Skip C++11 and C23 attributes and return the end location of the - /// last one. - /// \returns SourceLocation() if there are no attributes. - SourceLocation SkipCXX11Attributes(); + /// LateParsedDeclarations - Method declarations, inline definitions and + /// nested classes that contain pieces whose parsing will be delayed until + /// the top-level class is fully defined. + LateParsedDeclarationsContainer LateParsedDeclarations; + }; - /// Diagnose and skip C++11 and C23 attributes that appear in syntactic - /// locations where attributes are not allowed. - void DiagnoseAndSkipCXX11Attributes(); + /// The stack of classes that is currently being + /// parsed. Nested and local classes will be pushed onto this stack + /// when they are parsed, and removed afterward. 
+ std::stack ClassStack; - /// Emit warnings for C++11 and C23 attributes that are in a position that - /// clang accepts as an extension. - void DiagnoseCXX11AttributeExtension(ParsedAttributes &Attrs); + ParsingClass &getCurrentClass() { + assert(!ClassStack.empty() && "No lexed method stacks!"); + return *ClassStack.top(); + } - ExprResult ParseUnevaluatedStringInAttribute(const IdentifierInfo &AttrName); + /// RAII object used to manage the parsing of a class definition. + class ParsingClassDefinition { + Parser &P; + bool Popped; + Sema::ParsingClassState State; - bool - ParseAttributeArgumentList(const clang::IdentifierInfo &AttrName, - SmallVectorImpl &Exprs, - ParsedAttributeArgumentsProperties ArgsProperties); + public: + ParsingClassDefinition(Parser &P, Decl *TagOrTemplate, bool TopLevelClass, + bool IsInterface) + : P(P), Popped(false), + State(P.PushParsingClass(TagOrTemplate, TopLevelClass, IsInterface)) { + } - /// Parses syntax-generic attribute arguments for attributes which are - /// known to the implementation, and adds them to the given ParsedAttributes - /// list with the given attribute syntax. Returns the number of arguments - /// parsed for the attribute. - unsigned - ParseAttributeArgsCommon(IdentifierInfo *AttrName, SourceLocation AttrNameLoc, - ParsedAttributes &Attrs, SourceLocation *EndLoc, - IdentifierInfo *ScopeName, SourceLocation ScopeLoc, - ParsedAttr::Form Form); + /// Pop this class of the stack. + void Pop() { + assert(!Popped && "Nested class has already been popped"); + Popped = true; + P.PopParsingClass(State); + } - enum ParseAttrKindMask { - PAKM_GNU = 1 << 0, - PAKM_Declspec = 1 << 1, - PAKM_CXX11 = 1 << 2, + ~ParsingClassDefinition() { + if (!Popped) + P.PopParsingClass(State); + } }; - /// \brief Parse attributes based on what syntaxes are desired, allowing for - /// the order to vary. e.g. with PAKM_GNU | PAKM_Declspec: - /// __attribute__((...)) __declspec(...) 
__attribute__((...))) - /// Note that Microsoft attributes (spelled with single square brackets) are - /// not supported by this because of parsing ambiguities with other - /// constructs. + /// Parse a C++ exception-specification if present (C++0x [except.spec]). /// - /// There are some attribute parse orderings that should not be allowed in - /// arbitrary order. e.g., + /// \verbatim + /// exception-specification: + /// dynamic-exception-specification + /// noexcept-specification /// - /// [[]] __attribute__(()) int i; // OK - /// __attribute__(()) [[]] int i; // Not OK + /// noexcept-specification: + /// 'noexcept' + /// 'noexcept' '(' constant-expression ')' + /// \endverbatim + ExceptionSpecificationType tryParseExceptionSpecification( + bool Delayed, SourceRange &SpecificationRange, + SmallVectorImpl &DynamicExceptions, + SmallVectorImpl &DynamicExceptionRanges, + ExprResult &NoexceptExpr, CachedTokens *&ExceptionSpecTokens); + + /// ParseDynamicExceptionSpecification - Parse a C++ + /// dynamic-exception-specification (C++ [except.spec]). + /// EndLoc is filled with the location of the last token of the specification. /// - /// Such situations should use the specific attribute parsing functionality. - void ParseAttributes(unsigned WhichAttrKinds, ParsedAttributes &Attrs, - LateParsedAttrList *LateAttrs = nullptr); - /// \brief Possibly parse attributes based on what syntaxes are desired, - /// allowing for the order to vary. - bool MaybeParseAttributes(unsigned WhichAttrKinds, ParsedAttributes &Attrs, - LateParsedAttrList *LateAttrs = nullptr) { - if (Tok.isOneOf(tok::kw___attribute, tok::kw___declspec) || - isAllowedCXX11AttributeSpecifier()) { - ParseAttributes(WhichAttrKinds, Attrs, LateAttrs); - return true; - } - return false; - } + /// \verbatim + /// dynamic-exception-specification: + /// 'throw' '(' type-id-list [opt] ')' + /// [MS] 'throw' '(' '...' ')' + /// + /// type-id-list: + /// type-id ... [opt] + /// type-id-list ',' type-id ... 
[opt] + /// \endverbatim + /// + ExceptionSpecificationType + ParseDynamicExceptionSpecification(SourceRange &SpecificationRange, + SmallVectorImpl &Exceptions, + SmallVectorImpl &Ranges); - void MaybeParseGNUAttributes(Declarator &D, - LateParsedAttrList *LateAttrs = nullptr) { - if (Tok.is(tok::kw___attribute)) { - ParsedAttributes Attrs(AttrFactory); - ParseGNUAttributes(Attrs, LateAttrs, &D); - D.takeAttributes(Attrs); - } - } + //===--------------------------------------------------------------------===// + // C++0x 8: Function declaration trailing-return-type - bool MaybeParseGNUAttributes(ParsedAttributes &Attrs, - LateParsedAttrList *LateAttrs = nullptr) { - if (Tok.is(tok::kw___attribute)) { - ParseGNUAttributes(Attrs, LateAttrs); - return true; - } - return false; - } + /// ParseTrailingReturnType - Parse a trailing return type on a new-style + /// function declaration. + TypeResult ParseTrailingReturnType(SourceRange &Range, + bool MayBeFollowedByDirectInit); - bool ParseSingleGNUAttribute(ParsedAttributes &Attrs, SourceLocation &EndLoc, - LateParsedAttrList *LateAttrs = nullptr, - Declarator *D = nullptr); - void ParseGNUAttributes(ParsedAttributes &Attrs, - LateParsedAttrList *LateAttrs = nullptr, - Declarator *D = nullptr); - void ParseGNUAttributeArgs(IdentifierInfo *AttrName, - SourceLocation AttrNameLoc, - ParsedAttributes &Attrs, SourceLocation *EndLoc, - IdentifierInfo *ScopeName, SourceLocation ScopeLoc, - ParsedAttr::Form Form, Declarator *D); - IdentifierLoc *ParseIdentifierLoc(); + /// Parse a requires-clause as part of a function declaration. 
+ void ParseTrailingRequiresClause(Declarator &D); - unsigned - ParseClangAttributeArgs(IdentifierInfo *AttrName, SourceLocation AttrNameLoc, - ParsedAttributes &Attrs, SourceLocation *EndLoc, - IdentifierInfo *ScopeName, SourceLocation ScopeLoc, - ParsedAttr::Form Form); + void ParseMicrosoftIfExistsClassDeclaration(DeclSpec::TST TagType, + ParsedAttributes &AccessAttrs, + AccessSpecifier &CurAS); - void ReplayOpenMPAttributeTokens(CachedTokens &OpenMPTokens) { - // If parsing the attributes found an OpenMP directive, emit those tokens - // to the parse stream now. - if (!OpenMPTokens.empty()) { - PP.EnterToken(Tok, /*IsReinject*/ true); - PP.EnterTokenStream(OpenMPTokens, /*DisableMacroExpansion*/ true, - /*IsReinject*/ true); - ConsumeAnyToken(/*ConsumeCodeCompletionTok*/ true); - } - } - void MaybeParseCXX11Attributes(Declarator &D) { - if (isAllowedCXX11AttributeSpecifier()) { - ParsedAttributes Attrs(AttrFactory); - ParseCXX11Attributes(Attrs); - D.takeAttributes(Attrs); - } - } + SourceLocation ParsePackIndexingType(DeclSpec &DS); + void AnnotateExistingIndexedTypeNamePack(ParsedType T, + SourceLocation StartLoc, + SourceLocation EndLoc); - bool MaybeParseCXX11Attributes(ParsedAttributes &Attrs, - bool OuterMightBeMessageSend = false) { - if (isAllowedCXX11AttributeSpecifier(false, OuterMightBeMessageSend)) { - ParseCXX11Attributes(Attrs); - return true; - } - return false; + /// Return true if the next token should be treated as a [[]] attribute, + /// or as a keyword that behaves like one. The former is only true if + /// [[]] attributes are enabled, whereas the latter is true whenever + /// such a keyword appears. The arguments are as for + /// isCXX11AttributeSpecifier. 
+ bool isAllowedCXX11AttributeSpecifier(bool Disambiguate = false, + bool OuterMightBeMessageSend = false) { + return (Tok.isRegularKeywordAttribute() || + isCXX11AttributeSpecifier(Disambiguate, OuterMightBeMessageSend) != + CXX11AttributeKind::NotAttributeSpecifier); } + /// Skip C++11 and C23 attributes and return the end location of the + /// last one. + /// \returns SourceLocation() if there are no attributes. + SourceLocation SkipCXX11Attributes(); + + /// Diagnose and skip C++11 and C23 attributes that appear in syntactic + /// locations where attributes are not allowed. + void DiagnoseAndSkipCXX11Attributes(); + void ParseOpenMPAttributeArgs(const IdentifierInfo *AttrName, CachedTokens &OpenMPTokens); + /// Parse a C++11 or C23 attribute-specifier. + /// + /// \verbatim + /// [C++11] attribute-specifier: + /// '[' '[' attribute-list ']' ']' + /// alignment-specifier + /// + /// [C++11] attribute-list: + /// attribute[opt] + /// attribute-list ',' attribute[opt] + /// attribute '...' + /// attribute-list ',' attribute '...' + /// + /// [C++11] attribute: + /// attribute-token attribute-argument-clause[opt] + /// + /// [C++11] attribute-token: + /// identifier + /// attribute-scoped-token + /// + /// [C++11] attribute-scoped-token: + /// attribute-namespace '::' identifier + /// + /// [C++11] attribute-namespace: + /// identifier + /// \endverbatim void ParseCXX11AttributeSpecifierInternal(ParsedAttributes &Attrs, CachedTokens &OpenMPTokens, SourceLocation *EndLoc = nullptr); @@ -3033,9 +3001,33 @@ class Parser : public CodeCompletionHandler { ParseCXX11AttributeSpecifierInternal(Attrs, OpenMPTokens, EndLoc); ReplayOpenMPAttributeTokens(OpenMPTokens); } + + /// ParseCXX11Attributes - Parse a C++11 or C23 attribute-specifier-seq. 
+ /// + /// \verbatim + /// attribute-specifier-seq: + /// attribute-specifier-seq[opt] attribute-specifier + /// \endverbatim void ParseCXX11Attributes(ParsedAttributes &attrs); + + /// ParseCXX11AttributeArgs -- Parse a C++11 attribute-argument-clause. /// Parses a C++11 (or C23)-style attribute argument list. Returns true /// if this results in adding an attribute to the ParsedAttributes list. + /// + /// \verbatim + /// [C++11] attribute-argument-clause: + /// '(' balanced-token-seq ')' + /// + /// [C++11] balanced-token-seq: + /// balanced-token + /// balanced-token-seq balanced-token + /// + /// [C++11] balanced-token: + /// '(' balanced-token-seq ')' + /// '[' balanced-token-seq ']' + /// '{' balanced-token-seq '}' + /// any token but '(', ')', '[', ']', '{', or '}' + /// \endverbatim bool ParseCXX11AttributeArgs(IdentifierInfo *AttrName, SourceLocation AttrNameLoc, ParsedAttributes &Attrs, SourceLocation *EndLoc, @@ -3043,278 +3035,127 @@ class Parser : public CodeCompletionHandler { SourceLocation ScopeLoc, CachedTokens &OpenMPTokens); - /// Parse a C++23 assume() attribute. Returns true on error. + /// Parse the argument to C++23's [[assume()]] attribute. Returns true on + /// error. bool ParseCXXAssumeAttributeArg(ParsedAttributes &Attrs, IdentifierInfo *AttrName, SourceLocation AttrNameLoc, SourceLocation *EndLoc, ParsedAttr::Form Form); + /// Try to parse an 'identifier' which appears within an attribute-token. + /// + /// \return the parsed identifier on success, and 0 if the next token is not + /// an attribute-token. + /// + /// C++11 [dcl.attr.grammar]p3: + /// If a keyword or an alternative token that satisfies the syntactic + /// requirements of an identifier is contained in an attribute-token, + /// it is considered an identifier. 
IdentifierInfo *TryParseCXX11AttributeIdentifier( SourceLocation &Loc, SemaCodeCompletion::AttributeCompletion Completion = SemaCodeCompletion::AttributeCompletion::None, const IdentifierInfo *EnclosingScope = nullptr); - bool MaybeParseHLSLAnnotations(Declarator &D, - SourceLocation *EndLoc = nullptr, - bool CouldBeBitField = false) { - assert(getLangOpts().HLSL && "MaybeParseHLSLAnnotations is for HLSL only"); - if (Tok.is(tok::colon)) { - ParsedAttributes Attrs(AttrFactory); - ParseHLSLAnnotations(Attrs, EndLoc, CouldBeBitField); - D.takeAttributes(Attrs); - return true; - } - return false; - } - - void MaybeParseHLSLAnnotations(ParsedAttributes &Attrs, - SourceLocation *EndLoc = nullptr) { - assert(getLangOpts().HLSL && "MaybeParseHLSLAnnotations is for HLSL only"); - if (Tok.is(tok::colon)) - ParseHLSLAnnotations(Attrs, EndLoc); - } + /// Parse uuid() attribute when it appears in a [] Microsoft attribute. + void ParseMicrosoftUuidAttributeArgs(ParsedAttributes &Attrs); - void ParseHLSLAnnotations(ParsedAttributes &Attrs, - SourceLocation *EndLoc = nullptr, - bool CouldBeBitField = false); - Decl *ParseHLSLBuffer(SourceLocation &DeclEnd); - - bool MaybeParseMicrosoftAttributes(ParsedAttributes &Attrs) { - bool AttrsParsed = false; - if ((getLangOpts().MicrosoftExt || getLangOpts().HLSL) && - Tok.is(tok::l_square)) { - ParsedAttributes AttrsWithRange(AttrFactory); - ParseMicrosoftAttributes(AttrsWithRange); - AttrsParsed = !AttrsWithRange.empty(); - Attrs.takeAllFrom(AttrsWithRange); - } - return AttrsParsed; - } - void ParseMicrosoftUuidAttributeArgs(ParsedAttributes &Attrs); - void ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs); + /// ParseMicrosoftAttributes - Parse Microsoft attributes [Attr] + /// + /// \verbatim + /// [MS] ms-attribute: + /// '[' token-seq ']' + /// + /// [MS] ms-attribute-seq: + /// ms-attribute[opt] + /// ms-attribute ms-attribute-seq + /// \endverbatim void ParseMicrosoftAttributes(ParsedAttributes &Attrs); - bool 
MaybeParseMicrosoftDeclSpecs(ParsedAttributes &Attrs) { - if (getLangOpts().DeclSpecKeyword && Tok.is(tok::kw___declspec)) { - ParseMicrosoftDeclSpecs(Attrs); - return true; - } - return false; - } - void ParseMicrosoftDeclSpecs(ParsedAttributes &Attrs); - bool ParseMicrosoftDeclSpecArgs(IdentifierInfo *AttrName, - SourceLocation AttrNameLoc, - ParsedAttributes &Attrs); - void ParseMicrosoftTypeAttributes(ParsedAttributes &attrs); - void ParseWebAssemblyFuncrefTypeAttribute(ParsedAttributes &Attrs); - void DiagnoseAndSkipExtendedMicrosoftTypeAttributes(); - SourceLocation SkipExtendedMicrosoftTypeAttributes(); + void ParseMicrosoftInheritanceClassAttributes(ParsedAttributes &attrs); void ParseNullabilityClassAttributes(ParsedAttributes &attrs); - void ParseBorlandTypeAttributes(ParsedAttributes &attrs); - void ParseOpenCLKernelAttributes(ParsedAttributes &attrs); - void ParseOpenCLQualifiers(ParsedAttributes &Attrs); - void ParseNullabilityTypeSpecifiers(ParsedAttributes &attrs); - void ParseCUDAFunctionAttributes(ParsedAttributes &attrs); - bool isHLSLQualifier(const Token &Tok) const; - void ParseHLSLQualifiers(ParsedAttributes &Attrs); - - VersionTuple ParseVersionTuple(SourceRange &Range); - void ParseAvailabilityAttribute(IdentifierInfo &Availability, - SourceLocation AvailabilityLoc, - ParsedAttributes &attrs, - SourceLocation *endLoc, - IdentifierInfo *ScopeName, - SourceLocation ScopeLoc, - ParsedAttr::Form Form); - - std::optional ParseAvailabilitySpec(); - ExprResult ParseAvailabilityCheckExpr(SourceLocation StartLoc); - - void ParseExternalSourceSymbolAttribute(IdentifierInfo &ExternalSourceSymbol, - SourceLocation Loc, - ParsedAttributes &Attrs, - SourceLocation *EndLoc, - IdentifierInfo *ScopeName, - SourceLocation ScopeLoc, - ParsedAttr::Form Form); - - void ParseObjCBridgeRelatedAttribute(IdentifierInfo &ObjCBridgeRelated, - SourceLocation ObjCBridgeRelatedLoc, - ParsedAttributes &Attrs, - SourceLocation *EndLoc, - IdentifierInfo *ScopeName, - 
SourceLocation ScopeLoc, - ParsedAttr::Form Form); - - void ParseSwiftNewTypeAttribute(IdentifierInfo &AttrName, - SourceLocation AttrNameLoc, - ParsedAttributes &Attrs, - SourceLocation *EndLoc, - IdentifierInfo *ScopeName, - SourceLocation ScopeLoc, - ParsedAttr::Form Form); - - void ParseTypeTagForDatatypeAttribute(IdentifierInfo &AttrName, - SourceLocation AttrNameLoc, - ParsedAttributes &Attrs, - SourceLocation *EndLoc, - IdentifierInfo *ScopeName, - SourceLocation ScopeLoc, - ParsedAttr::Form Form); - - void ParseAttributeWithTypeArg(IdentifierInfo &AttrName, - SourceLocation AttrNameLoc, - ParsedAttributes &Attrs, - IdentifierInfo *ScopeName, - SourceLocation ScopeLoc, - ParsedAttr::Form Form); - - void DistributeCLateParsedAttrs(Decl *Dcl, LateParsedAttrList *LateAttrs); - void ParseBoundsAttribute(IdentifierInfo &AttrName, - SourceLocation AttrNameLoc, ParsedAttributes &Attrs, - IdentifierInfo *ScopeName, SourceLocation ScopeLoc, - ParsedAttr::Form Form); - - void ParseTypeofSpecifier(DeclSpec &DS); + /// ParseDecltypeSpecifier - Parse a C++11 decltype specifier. + /// + /// \verbatim + /// 'decltype' ( expression ) + /// 'decltype' ( 'auto' ) [C++1y] + /// \endverbatim + /// SourceLocation ParseDecltypeSpecifier(DeclSpec &DS); void AnnotateExistingDecltypeSpecifier(const DeclSpec &DS, SourceLocation StartLoc, SourceLocation EndLoc); - void ParseAtomicSpecifier(DeclSpec &DS); - - ExprResult ParseAlignArgument(StringRef KWName, SourceLocation Start, - SourceLocation &EllipsisLoc, bool &IsType, - ParsedType &Ty); - void ParseAlignmentSpecifier(ParsedAttributes &Attrs, - SourceLocation *endLoc = nullptr); - ExprResult ParseExtIntegerArgument(); - - void ParsePtrauthQualifier(ParsedAttributes &Attrs); + /// isCXX11VirtSpecifier - Determine whether the given token is a C++11 + /// virt-specifier. 
+ /// + /// \verbatim + /// virt-specifier: + /// override + /// final + /// __final + /// \endverbatim VirtSpecifiers::Specifier isCXX11VirtSpecifier(const Token &Tok) const; VirtSpecifiers::Specifier isCXX11VirtSpecifier() const { return isCXX11VirtSpecifier(Tok); } + + /// ParseOptionalCXX11VirtSpecifierSeq - Parse a virt-specifier-seq. + /// + /// \verbatim + /// virt-specifier-seq: + /// virt-specifier + /// virt-specifier-seq virt-specifier + /// \endverbatim void ParseOptionalCXX11VirtSpecifierSeq(VirtSpecifiers &VS, bool IsInterface, SourceLocation FriendLoc); + /// isCXX11FinalKeyword - Determine whether the next token is a C++11 + /// 'final' or Microsoft 'sealed' contextual keyword. bool isCXX11FinalKeyword() const; - bool isCXX2CTriviallyRelocatableKeyword(Token Tok) const; - bool isCXX2CTriviallyRelocatableKeyword() const; - void ParseCXX2CTriviallyRelocatableSpecifier(SourceLocation &TRS); - - bool isCXX2CReplaceableKeyword(Token Tok) const; - bool isCXX2CReplaceableKeyword() const; - void ParseCXX2CReplaceableSpecifier(SourceLocation &MRS); - - bool isClassCompatibleKeyword(Token Tok) const; + /// isClassCompatibleKeyword - Determine whether the next token is a C++11 + /// 'final', a C++26 'trivially_relocatable_if_eligible', + /// 'replaceable_if_eligible', or Microsoft 'sealed' or 'abstract' contextual + /// keyword. bool isClassCompatibleKeyword() const; - /// DeclaratorScopeObj - RAII object used in Parser::ParseDirectDeclarator to - /// enter a new C++ declarator scope and exit it when the function is - /// finished. - class DeclaratorScopeObj { - Parser &P; - CXXScopeSpec &SS; - bool EnteredScope; - bool CreatedScope; - public: - DeclaratorScopeObj(Parser &p, CXXScopeSpec &ss) - : P(p), SS(ss), EnteredScope(false), CreatedScope(false) {} - - void EnterDeclaratorScope() { - assert(!EnteredScope && "Already entered the scope!"); - assert(SS.isSet() && "C++ scope was not set!"); - - CreatedScope = true; - P.EnterScope(0); // Not a decl scope. 
- - if (!P.Actions.ActOnCXXEnterDeclaratorScope(P.getCurScope(), SS)) - EnteredScope = true; - } - - ~DeclaratorScopeObj() { - if (EnteredScope) { - assert(SS.isSet() && "C++ scope was cleared ?"); - P.Actions.ActOnCXXExitDeclaratorScope(P.getCurScope(), SS); - } - if (CreatedScope) - P.ExitScope(); - } - }; - - /// ParseDeclarator - Parse and verify a newly-initialized declarator. - void ParseDeclarator(Declarator &D); - /// A function that parses a variant of direct-declarator. - typedef void (Parser::*DirectDeclParseFunction)(Declarator&); - void ParseDeclaratorInternal(Declarator &D, - DirectDeclParseFunction DirectDeclParser); - - enum AttrRequirements { - AR_NoAttributesParsed = 0, ///< No attributes are diagnosed. - AR_GNUAttributesParsedAndRejected = 1 << 0, ///< Diagnose GNU attributes. - AR_GNUAttributesParsed = 1 << 1, - AR_CXX11AttributesParsed = 1 << 2, - AR_DeclspecAttributesParsed = 1 << 3, - AR_AllAttributesParsed = AR_GNUAttributesParsed | - AR_CXX11AttributesParsed | - AR_DeclspecAttributesParsed, - AR_VendorAttributesParsed = AR_GNUAttributesParsed | - AR_DeclspecAttributesParsed - }; - - void ParseTypeQualifierListOpt( - DeclSpec &DS, unsigned AttrReqs = AR_AllAttributesParsed, - bool AtomicOrPtrauthAllowed = true, bool IdentifierRequired = false, - std::optional> CodeCompletionHandler = - std::nullopt); - void ParseDirectDeclarator(Declarator &D); - void ParseDecompositionDeclarator(Declarator &D); - void ParseParenDeclarator(Declarator &D); - void ParseFunctionDeclarator(Declarator &D, ParsedAttributes &FirstArgAttrs, - BalancedDelimiterTracker &Tracker, - bool IsAmbiguous, bool RequiresArg = false); - void InitCXXThisScopeForDeclaratorIfRelevant( - const Declarator &D, const DeclSpec &DS, - std::optional &ThisScope); - bool ParseRefQualifier(bool &RefQualifierIsLValueRef, - SourceLocation &RefQualifierLoc); - bool isFunctionDeclaratorIdentifierList(); - void ParseFunctionDeclaratorIdentifierList( - Declarator &D, - SmallVectorImpl 
&ParamInfo); - void ParseParameterDeclarationClause( - Declarator &D, ParsedAttributes &attrs, - SmallVectorImpl &ParamInfo, - SourceLocation &EllipsisLoc) { - return ParseParameterDeclarationClause( - D.getContext(), attrs, ParamInfo, EllipsisLoc, - D.getCXXScopeSpec().isSet() && - D.isFunctionDeclaratorAFunctionDeclaration()); - } - void ParseParameterDeclarationClause( - DeclaratorContext DeclaratorContext, ParsedAttributes &attrs, - SmallVectorImpl &ParamInfo, - SourceLocation &EllipsisLoc, bool IsACXXFunctionDeclaration = false); - - void ParseBracketDeclarator(Declarator &D); - void ParseMisplacedBracketDeclarator(Declarator &D); bool MaybeParseTypeTransformTypeSpecifier(DeclSpec &DS); DeclSpec::TST TypeTransformTokToDeclSpec(); - //===--------------------------------------------------------------------===// - // C++ 7: Declarations [dcl.dcl] - - CXX11AttributeKind - isCXX11AttributeSpecifier(bool Disambiguate = false, - bool OuterMightBeMessageSend = false); - void DiagnoseUnexpectedNamespace(NamedDecl *Context); + /// ParseNamespace - We know that the current token is a namespace keyword. + /// This may either be a top level namespace or a block-level namespace alias. + /// If there was an inline keyword, it has already been parsed. 
+ /// + /// \verbatim + /// namespace-definition: [C++: namespace.def] + /// named-namespace-definition + /// unnamed-namespace-definition + /// nested-namespace-definition + /// + /// named-namespace-definition: + /// 'inline'[opt] 'namespace' attributes[opt] identifier '{' + /// namespace-body '}' + /// + /// unnamed-namespace-definition: + /// 'inline'[opt] 'namespace' attributes[opt] '{' namespace-body '}' + /// + /// nested-namespace-definition: + /// 'namespace' enclosing-namespace-specifier '::' 'inline'[opt] + /// identifier '{' namespace-body '}' + /// + /// enclosing-namespace-specifier: + /// identifier + /// enclosing-namespace-specifier '::' 'inline'[opt] identifier + /// + /// namespace-alias-definition: [C++ 7.3.2: namespace.alias] + /// 'namespace' identifier '=' qualified-namespace-specifier ';' + /// \endverbatim + /// DeclGroupPtrTy ParseNamespace(DeclaratorContext Context, SourceLocation &DeclEnd, SourceLocation InlineLoc = SourceLocation()); @@ -3327,19 +3168,63 @@ class Parser : public CodeCompletionHandler { }; using InnerNamespaceInfoList = llvm::SmallVector; + /// ParseInnerNamespace - Parse the contents of a namespace. void ParseInnerNamespace(const InnerNamespaceInfoList &InnerNSs, unsigned int index, SourceLocation &InlineLoc, ParsedAttributes &attrs, BalancedDelimiterTracker &Tracker); + + /// ParseLinkage - We know that the current token is a string_literal + /// and just before that, that extern was seen. + /// + /// \verbatim + /// linkage-specification: [C++ 7.5p2: dcl.link] + /// 'extern' string-literal '{' declaration-seq[opt] '}' + /// 'extern' string-literal declaration + /// \endverbatim + /// Decl *ParseLinkage(ParsingDeclSpec &DS, DeclaratorContext Context); + + /// Parse a standard C++ Modules export-declaration. + /// + /// \verbatim + /// export-declaration: + /// 'export' declaration + /// 'export' '{' declaration-seq[opt] '}' + /// \endverbatim + /// + /// HLSL: Parse export function declaration. 
+ /// + /// \verbatim + /// export-function-declaration: + /// 'export' function-declaration + /// + /// export-declaration-group: + /// 'export' '{' function-declaration-seq[opt] '}' + /// \endverbatim + /// Decl *ParseExportDeclaration(); + + /// ParseUsingDirectiveOrDeclaration - Parse C++ using using-declaration or + /// using-directive. Assumes that current token is 'using'. DeclGroupPtrTy ParseUsingDirectiveOrDeclaration( DeclaratorContext Context, const ParsedTemplateInfo &TemplateInfo, SourceLocation &DeclEnd, ParsedAttributes &Attrs); - Decl *ParseUsingDirective(DeclaratorContext Context, - SourceLocation UsingLoc, - SourceLocation &DeclEnd, - ParsedAttributes &attrs); + + /// ParseUsingDirective - Parse C++ using-directive, assumes + /// that current token is 'namespace' and 'using' was already parsed. + /// + /// \verbatim + /// using-directive: [C++ 7.3.p4: namespace.udir] + /// 'using' 'namespace' ::[opt] nested-name-specifier[opt] + /// namespace-name ; + /// [GNU] using-directive: + /// 'using' 'namespace' ::[opt] nested-name-specifier[opt] + /// namespace-name attributes[opt] ; + /// \endverbatim + /// + Decl *ParseUsingDirective(DeclaratorContext Context, SourceLocation UsingLoc, + SourceLocation &DeclEnd, ParsedAttributes &attrs); struct UsingDeclarator { SourceLocation TypenameLoc; @@ -3354,7 +3239,40 @@ class Parser : public CodeCompletionHandler { } }; + /// Parse a using-declarator (or the identifier in a C++11 alias-declaration). + /// + /// \verbatim + /// using-declarator: + /// 'typename'[opt] nested-name-specifier unqualified-id + /// \endverbatim + /// bool ParseUsingDeclarator(DeclaratorContext Context, UsingDeclarator &D); + + /// ParseUsingDeclaration - Parse C++ using-declaration or alias-declaration. + /// Assumes that 'using' was already seen. 
+ /// + /// \verbatim + /// using-declaration: [C++ 7.3.p3: namespace.udecl] + /// 'using' using-declarator-list[opt] ; + /// + /// using-declarator-list: [C++1z] + /// using-declarator '...'[opt] + /// using-declarator-list ',' using-declarator '...'[opt] + /// + /// using-declarator-list: [C++98-14] + /// using-declarator + /// + /// alias-declaration: C++11 [dcl.dcl]p1 + /// 'using' identifier attribute-specifier-seq[opt] = type-id ; + /// + /// using-enum-declaration: [C++20, dcl.enum] + /// 'using' elaborated-enum-specifier ; + /// The terminal name of the elaborated-enum-specifier undergoes + /// type-only lookup + /// + /// elaborated-enum-specifier: + /// 'enum' nested-name-specifier[opt] identifier + /// \endverbatim DeclGroupPtrTy ParseUsingDeclaration(DeclaratorContext Context, const ParsedTemplateInfo &TemplateInfo, SourceLocation UsingLoc, @@ -3366,35 +3284,187 @@ class Parser : public CodeCompletionHandler { UsingDeclarator &D, SourceLocation &DeclEnd, AccessSpecifier AS, ParsedAttributes &Attrs, Decl **OwnedType = nullptr); + /// ParseStaticAssertDeclaration - Parse C++0x or C11 + /// static_assert-declaration. + /// + /// \verbatim + /// [C++0x] static_assert-declaration: + /// static_assert ( constant-expression , string-literal ) ; + /// + /// [C11] static_assert-declaration: + /// _Static_assert ( constant-expression , string-literal ) ; + /// \endverbatim + /// Decl *ParseStaticAssertDeclaration(SourceLocation &DeclEnd); + + /// ParseNamespaceAlias - Parse the part after the '=' in a namespace + /// alias definition. + /// Decl *ParseNamespaceAlias(SourceLocation NamespaceLoc, SourceLocation AliasLoc, IdentifierInfo *Alias, SourceLocation &DeclEnd); //===--------------------------------------------------------------------===// // C++ 9: classes [class] and C structs/unions. + + /// Determine whether the following tokens are valid after a type-specifier + /// which could be a standalone declaration. 
This will conservatively return + /// true if there's any doubt, and is appropriate for insert-';' fixits. bool isValidAfterTypeSpecifier(bool CouldBeBitfield); + + /// ParseClassSpecifier - Parse a C++ class-specifier [C++ class] or + /// elaborated-type-specifier [C++ dcl.type.elab]; we can't tell which + /// until we reach the start of a definition or see a token that + /// cannot start a definition. + /// + /// \verbatim + /// class-specifier: [C++ class] + /// class-head '{' member-specification[opt] '}' + /// class-head '{' member-specification[opt] '}' attributes[opt] + /// class-head: + /// class-key identifier[opt] base-clause[opt] + /// class-key nested-name-specifier identifier base-clause[opt] + /// class-key nested-name-specifier[opt] simple-template-id + /// base-clause[opt] + /// [GNU] class-key attributes[opt] identifier[opt] base-clause[opt] + /// [GNU] class-key attributes[opt] nested-name-specifier + /// identifier base-clause[opt] + /// [GNU] class-key attributes[opt] nested-name-specifier[opt] + /// simple-template-id base-clause[opt] + /// class-key: + /// 'class' + /// 'struct' + /// 'union' + /// + /// elaborated-type-specifier: [C++ dcl.type.elab] + /// class-key ::[opt] nested-name-specifier[opt] identifier + /// class-key ::[opt] nested-name-specifier[opt] 'template'[opt] + /// simple-template-id + /// + /// Note that the C++ class-specifier and elaborated-type-specifier, + /// together, subsume the C99 struct-or-union-specifier: + /// + /// struct-or-union-specifier: [C99 6.7.2.1] + /// struct-or-union identifier[opt] '{' struct-contents '}' + /// struct-or-union identifier + /// [GNU] struct-or-union attributes[opt] identifier[opt] '{' struct-contents + /// '}' attributes[opt] + /// [GNU] struct-or-union attributes[opt] identifier + /// struct-or-union: + /// 'struct' + /// 'union' + /// \endverbatim void ParseClassSpecifier(tok::TokenKind TagTokKind, SourceLocation TagLoc, DeclSpec &DS, ParsedTemplateInfo &TemplateInfo, AccessSpecifier 
AS, bool EnteringContext, DeclSpecContext DSC, ParsedAttributes &Attributes); void SkipCXXMemberSpecification(SourceLocation StartLoc, - SourceLocation AttrFixitLoc, - unsigned TagType, + SourceLocation AttrFixitLoc, unsigned TagType, Decl *TagDecl); - void ParseCXXMemberSpecification(SourceLocation StartLoc, - SourceLocation AttrFixitLoc, + + /// ParseCXXMemberSpecification - Parse the class definition. + /// + /// \verbatim + /// member-specification: + /// member-declaration member-specification[opt] + /// access-specifier ':' member-specification[opt] + /// \endverbatim + /// + void ParseCXXMemberSpecification(SourceLocation StartLoc, + SourceLocation AttrFixitLoc, ParsedAttributes &Attrs, unsigned TagType, Decl *TagDecl); + + /// ParseCXXMemberInitializer - Parse the brace-or-equal-initializer. + /// Also detect and reject any attempted defaulted/deleted function + /// definition. The location of the '=', if any, will be placed in EqualLoc. + /// + /// This does not check for a pure-specifier; that's handled elsewhere. + /// + /// \verbatim + /// brace-or-equal-initializer: + /// '=' initializer-expression + /// braced-init-list + /// + /// initializer-clause: + /// assignment-expression + /// braced-init-list + /// + /// defaulted/deleted function-definition: + /// '=' 'default' + /// '=' 'delete' + /// \endverbatim + /// + /// Prior to C++0x, the assignment-expression in an initializer-clause must + /// be a constant-expression. ExprResult ParseCXXMemberInitializer(Decl *D, bool IsFunction, SourceLocation &EqualLoc); - bool - ParseCXXMemberDeclaratorBeforeInitializer(Declarator &DeclaratorInfo, - VirtSpecifiers &VS, - ExprResult &BitfieldSize, - LateParsedAttrList &LateAttrs); - void MaybeParseAndDiagnoseDeclSpecAfterCXX11VirtSpecifierSeq(Declarator &D, - VirtSpecifiers &VS); + + /// Parse a C++ member-declarator up to, but not including, the optional + /// brace-or-equal-initializer or pure-specifier. 
+ bool ParseCXXMemberDeclaratorBeforeInitializer(Declarator &DeclaratorInfo, + VirtSpecifiers &VS, + ExprResult &BitfieldSize, + LateParsedAttrList &LateAttrs); + + /// Look for declaration specifiers possibly occurring after C++11 + /// virt-specifier-seq and diagnose them. + void + MaybeParseAndDiagnoseDeclSpecAfterCXX11VirtSpecifierSeq(Declarator &D, + VirtSpecifiers &VS); + + /// ParseCXXClassMemberDeclaration - Parse a C++ class member declaration. + /// + /// \verbatim + /// member-declaration: + /// decl-specifier-seq[opt] member-declarator-list[opt] ';' + /// function-definition ';'[opt] + /// [C++26] friend-type-declaration + /// ::[opt] nested-name-specifier template[opt] unqualified-id ';'[TODO] + /// using-declaration [TODO] + /// [C++0x] static_assert-declaration + /// template-declaration + /// [GNU] '__extension__' member-declaration + /// + /// member-declarator-list: + /// member-declarator + /// member-declarator-list ',' member-declarator + /// + /// member-declarator: + /// declarator virt-specifier-seq[opt] pure-specifier[opt] + /// [C++2a] declarator requires-clause + /// declarator constant-initializer[opt] + /// [C++11] declarator brace-or-equal-initializer[opt] + /// identifier[opt] ':' constant-expression + /// + /// virt-specifier-seq: + /// virt-specifier + /// virt-specifier-seq virt-specifier + /// + /// virt-specifier: + /// override + /// final + /// [MS] sealed + /// + /// pure-specifier: + /// '= 0' + /// + /// constant-initializer: + /// '=' constant-expression + /// + /// friend-type-declaration: + /// 'friend' friend-type-specifier-list ; + /// + /// friend-type-specifier-list: + /// friend-type-specifier ...[opt] + /// friend-type-specifier-list , friend-type-specifier ...[opt] + /// + /// friend-type-specifier: + /// simple-type-specifier + /// elaborated-type-specifier + /// typename-specifier + /// \endverbatim + /// DeclGroupPtrTy ParseCXXClassMemberDeclaration( AccessSpecifier AS, ParsedAttributes &Attr, ParsedTemplateInfo 
&TemplateInfo, @@ -3403,626 +3473,5444 @@ class Parser : public CodeCompletionHandler { ParseCXXClassMemberDeclarationWithPragmas(AccessSpecifier &AS, ParsedAttributes &AccessAttrs, DeclSpec::TST TagType, Decl *Tag); + + /// ParseConstructorInitializer - Parse a C++ constructor initializer, + /// which explicitly initializes the members or base classes of a + /// class (C++ [class.base.init]). For example, the three initializers + /// after the ':' in the Derived constructor below: + /// + /// @code + /// class Base { }; + /// class Derived : Base { + /// int x; + /// float f; + /// public: + /// Derived(float f) : Base(), x(17), f(f) { } + /// }; + /// @endcode + /// + /// \verbatim + /// [C++] ctor-initializer: + /// ':' mem-initializer-list + /// + /// [C++] mem-initializer-list: + /// mem-initializer ...[opt] + /// mem-initializer ...[opt] , mem-initializer-list + /// \endverbatim void ParseConstructorInitializer(Decl *ConstructorDecl); + + /// ParseMemInitializer - Parse a C++ member initializer, which is + /// part of a constructor initializer that explicitly initializes one + /// member or base class (C++ [class.base.init]). See + /// ParseConstructorInitializer for an example. + /// + /// \verbatim + /// [C++] mem-initializer: + /// mem-initializer-id '(' expression-list[opt] ')' + /// [C++0x] mem-initializer-id braced-init-list + /// + /// [C++] mem-initializer-id: + /// '::'[opt] nested-name-specifier[opt] class-name + /// identifier + /// \endverbatim MemInitResult ParseMemInitializer(Decl *ConstructorDecl); - void HandleMemberFunctionDeclDelays(Declarator& DeclaratorInfo, + + /// If the given declarator has any parts for which parsing has to be + /// delayed, e.g., default arguments or an exception-specification, create a + /// late-parsed method declaration record to handle the parsing at the end of + /// the class definition. 
+ void HandleMemberFunctionDeclDelays(Declarator &DeclaratorInfo, Decl *ThisDecl); //===--------------------------------------------------------------------===// // C++ 10: Derived classes [class.derived] + + /// ParseBaseTypeSpecifier - Parse a C++ base-type-specifier which is either a + /// class name or decltype-specifier. Note that we only check that the result + /// names a type; semantic analysis will need to verify that the type names a + /// class. The result is either a type or null, depending on whether a type + /// name was found. + /// + /// \verbatim + /// base-type-specifier: [C++11 class.derived] + /// class-or-decltype + /// class-or-decltype: [C++11 class.derived] + /// nested-name-specifier[opt] class-name + /// decltype-specifier + /// class-name: [C++ class.name] + /// identifier + /// simple-template-id + /// \endverbatim + /// + /// In C++98, instead of base-type-specifier, we have: + /// + /// \verbatim + /// ::[opt] nested-name-specifier[opt] class-name + /// \endverbatim TypeResult ParseBaseTypeSpecifier(SourceLocation &BaseLoc, SourceLocation &EndLocation); - void ParseBaseClause(Decl *ClassDecl); - BaseResult ParseBaseSpecifier(Decl *ClassDecl); - AccessSpecifier getAccessSpecifierIfPresent() const; - - bool ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS, - ParsedType ObjectType, - bool ObjectHadErrors, - SourceLocation TemplateKWLoc, - IdentifierInfo *Name, - SourceLocation NameLoc, - bool EnteringContext, - UnqualifiedId &Id, - bool AssumeTemplateId); - bool ParseUnqualifiedIdOperator(CXXScopeSpec &SS, bool EnteringContext, - ParsedType ObjectType, - UnqualifiedId &Result); - //===--------------------------------------------------------------------===// - // OpenMP: Directives and clauses. - /// Parse clauses for '#pragma omp declare simd'. - DeclGroupPtrTy ParseOMPDeclareSimdClauses(DeclGroupPtrTy Ptr, - CachedTokens &Toks, - SourceLocation Loc); + /// ParseBaseClause - Parse the base-clause of a C++ class [C++ + /// class.derived]. 
+ /// + /// \verbatim + /// base-clause : [C++ class.derived] + /// ':' base-specifier-list + /// base-specifier-list: + /// base-specifier '...'[opt] + /// base-specifier-list ',' base-specifier '...'[opt] + /// \endverbatim + void ParseBaseClause(Decl *ClassDecl); - /// Parse a property kind into \p TIProperty for the selector set \p Set and - /// selector \p Selector. - void parseOMPTraitPropertyKind(OMPTraitProperty &TIProperty, - llvm::omp::TraitSet Set, - llvm::omp::TraitSelector Selector, - llvm::StringMap &Seen); + /// ParseBaseSpecifier - Parse a C++ base-specifier. A base-specifier is + /// one entry in the base class list of a class specifier, for example: + /// class foo : public bar, virtual private baz { + /// 'public bar' and 'virtual private baz' are each base-specifiers. + /// + /// \verbatim + /// base-specifier: [C++ class.derived] + /// attribute-specifier-seq[opt] base-type-specifier + /// attribute-specifier-seq[opt] 'virtual' access-specifier[opt] + /// base-type-specifier + /// attribute-specifier-seq[opt] access-specifier 'virtual'[opt] + /// base-type-specifier + /// \endverbatim + BaseResult ParseBaseSpecifier(Decl *ClassDecl); - /// Parse a selector kind into \p TISelector for the selector set \p Set. - void parseOMPTraitSelectorKind(OMPTraitSelector &TISelector, - llvm::omp::TraitSet Set, - llvm::StringMap &Seen); + /// getAccessSpecifierIfPresent - Determine whether the next token is + /// a C++ access-specifier. + /// + /// \verbatim + /// access-specifier: [C++ class.derived] + /// 'private' + /// 'protected' + /// 'public' + /// \endverbatim + AccessSpecifier getAccessSpecifierIfPresent() const; - /// Parse a selector set kind into \p TISet. - void parseOMPTraitSetKind(OMPTraitSet &TISet, - llvm::StringMap &Seen); + bool isCXX2CTriviallyRelocatableKeyword(Token Tok) const; + bool isCXX2CTriviallyRelocatableKeyword() const; + void ParseCXX2CTriviallyRelocatableSpecifier(SourceLocation &TRS); - /// Parses an OpenMP context property. 
- void parseOMPContextProperty(OMPTraitSelector &TISelector, - llvm::omp::TraitSet Set, - llvm::StringMap &Seen); + bool isCXX2CReplaceableKeyword(Token Tok) const; + bool isCXX2CReplaceableKeyword() const; + void ParseCXX2CReplaceableSpecifier(SourceLocation &MRS); - /// Parses an OpenMP context selector. - void parseOMPContextSelector(OMPTraitSelector &TISelector, - llvm::omp::TraitSet Set, - llvm::StringMap &SeenSelectors); + /// 'final', a C++26 'trivially_relocatable_if_eligible', + /// 'replaceable_if_eligible', or Microsoft 'sealed' or 'abstract' contextual + /// keyword. + bool isClassCompatibleKeyword(Token Tok) const; - /// Parses an OpenMP context selector set. - void parseOMPContextSelectorSet(OMPTraitSet &TISet, - llvm::StringMap &SeenSets); + void ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs); - /// Parses OpenMP context selectors. - bool parseOMPContextSelectors(SourceLocation Loc, OMPTraitInfo &TI); + ///@} - /// Parse an 'append_args' clause for '#pragma omp declare variant'. - bool parseOpenMPAppendArgs(SmallVectorImpl &InteropInfos); + // + // + // ------------------------------------------------------------------------- + // + // - /// Parse a `match` clause for an '#pragma omp declare variant'. Return true - /// if there was an error. - bool parseOMPDeclareVariantMatchClause(SourceLocation Loc, OMPTraitInfo &TI, - OMPTraitInfo *ParentTI); + /// \name Expressions + /// Implementations are in ParseExpr.cpp + ///@{ - /// Parse clauses for '#pragma omp declare variant'. - void ParseOMPDeclareVariantClauses(DeclGroupPtrTy Ptr, CachedTokens &Toks, - SourceLocation Loc); +public: + friend class OffsetOfStateRAIIObject; - /// Parse 'omp [begin] assume[s]' directive. - void ParseOpenMPAssumesDirective(OpenMPDirectiveKind DKind, - SourceLocation Loc); + typedef Sema::FullExprArg FullExprArg; - /// Parse 'omp end assumes' directive. 
- void ParseOpenMPEndAssumesDirective(SourceLocation Loc); + //===--------------------------------------------------------------------===// + // C99 6.5: Expressions. - /// Parses clauses for directive. + /// Simple precedence-based parser for binary/ternary operators. /// - /// \param DKind Kind of current directive. - /// \param clauses for current directive. - /// \param start location for clauses of current directive - void ParseOpenMPClauses(OpenMPDirectiveKind DKind, - SmallVectorImpl &Clauses, - SourceLocation Loc); - - /// Parse clauses for '#pragma omp [begin] declare target'. - void ParseOMPDeclareTargetClauses(SemaOpenMP::DeclareTargetContextInfo &DTCI); - - /// Parse '#pragma omp end declare target'. - void ParseOMPEndDeclareTargetDirective(OpenMPDirectiveKind BeginDKind, - OpenMPDirectiveKind EndDKind, - SourceLocation Loc); - - /// Skip tokens until a `annot_pragma_openmp_end` was found. Emit a warning if - /// it is not the current token. - void skipUntilPragmaOpenMPEnd(OpenMPDirectiveKind DKind); - - /// Check the \p FoundKind against the \p ExpectedKind, if not issue an error - /// that the "end" matching the "begin" directive of kind \p BeginKind was not - /// found. Finally, if the expected kind was found or if \p SkipUntilOpenMPEnd - /// is set, skip ahead using the helper `skipUntilPragmaOpenMPEnd`. - void parseOMPEndDirective(OpenMPDirectiveKind BeginKind, - OpenMPDirectiveKind ExpectedKind, - OpenMPDirectiveKind FoundKind, - SourceLocation MatchingLoc, - SourceLocation FoundLoc, - bool SkipUntilOpenMPEnd); - - /// Parses declarative OpenMP directives. - DeclGroupPtrTy ParseOpenMPDeclarativeDirectiveWithExtDecl( - AccessSpecifier &AS, ParsedAttributes &Attrs, bool Delayed = false, - DeclSpec::TST TagType = DeclSpec::TST_unspecified, - Decl *TagDecl = nullptr); - /// Parse 'omp declare reduction' construct. 
- DeclGroupPtrTy ParseOpenMPDeclareReductionDirective(AccessSpecifier AS); - /// Parses initializer for provided omp_priv declaration inside the reduction - /// initializer. - void ParseOpenMPReductionInitializerForDecl(VarDecl *OmpPrivParm); - - /// Parses 'omp declare mapper' directive. - DeclGroupPtrTy ParseOpenMPDeclareMapperDirective(AccessSpecifier AS); - /// Parses variable declaration in 'omp declare mapper' directive. - TypeResult parseOpenMPDeclareMapperVarDecl(SourceRange &Range, - DeclarationName &Name, - AccessSpecifier AS = AS_none); - - /// Parses 'omp begin declare variant' directive. - bool ParseOpenMPDeclareBeginVariantDirective(SourceLocation Loc); - - /// Tries to parse cast part of OpenMP array shaping operation: - /// '[' expression ']' { '[' expression ']' } ')'. - bool tryParseOpenMPArrayShapingCastPart(); - - /// Parses simple list of variables. + /// Note: we diverge from the C99 grammar when parsing the + /// assignment-expression production. C99 specifies that the LHS of an + /// assignment operator should be parsed as a unary-expression, but + /// consistency dictates that it be a conditional-expession. In practice, the + /// important thing here is that the LHS of an assignment has to be an + /// l-value, which productions between unary-expression and + /// conditional-expression don't produce. Because we want consistency, we + /// parse the LHS as a conditional-expression, then check for l-value-ness in + /// semantic analysis stages. /// - /// \param Kind Kind of the directive. - /// \param Callback Callback function to be called for the list elements. - /// \param AllowScopeSpecifier true, if the variables can have fully - /// qualified names. 
+ /// \verbatim + /// pm-expression: [C++ 5.5] + /// cast-expression + /// pm-expression '.*' cast-expression + /// pm-expression '->*' cast-expression /// - bool ParseOpenMPSimpleVarList( - OpenMPDirectiveKind Kind, - const llvm::function_ref & - Callback, - bool AllowScopeSpecifier); - /// Parses declarative or executable directive. + /// multiplicative-expression: [C99 6.5.5] + /// Note: in C++, apply pm-expression instead of cast-expression + /// cast-expression + /// multiplicative-expression '*' cast-expression + /// multiplicative-expression '/' cast-expression + /// multiplicative-expression '%' cast-expression /// - /// \param StmtCtx The context in which we're parsing the directive. - /// \param ReadDirectiveWithinMetadirective true if directive is within a - /// metadirective and therefore ends on the closing paren. - StmtResult ParseOpenMPDeclarativeOrExecutableDirective( - ParsedStmtContext StmtCtx, bool ReadDirectiveWithinMetadirective = false); - - /// Parses executable directive. + /// additive-expression: [C99 6.5.6] + /// multiplicative-expression + /// additive-expression '+' multiplicative-expression + /// additive-expression '-' multiplicative-expression /// - /// \param StmtCtx The context in which we're parsing the directive. - /// \param DKind The kind of the executable directive. - /// \param Loc Source location of the beginning of the directive. - /// \param ReadDirectiveWithinMetadirective true if directive is within a - /// metadirective and therefore ends on the closing paren. - StmtResult - ParseOpenMPExecutableDirective(ParsedStmtContext StmtCtx, - OpenMPDirectiveKind DKind, SourceLocation Loc, - bool ReadDirectiveWithinMetadirective); - - /// Parses informational directive. + /// shift-expression: [C99 6.5.7] + /// additive-expression + /// shift-expression '<<' additive-expression + /// shift-expression '>>' additive-expression /// - /// \param StmtCtx The context in which we're parsing the directive. 
- /// \param DKind The kind of the informational directive. - /// \param Loc Source location of the beginning of the directive. - /// \param ReadDirectiveWithinMetadirective true if directive is within a - /// metadirective and therefore ends on the closing paren. - StmtResult ParseOpenMPInformationalDirective( - ParsedStmtContext StmtCtx, OpenMPDirectiveKind DKind, SourceLocation Loc, - bool ReadDirectiveWithinMetadirective); - - /// Parses clause of kind \a CKind for directive of a kind \a Kind. + /// compare-expression: [C++20 expr.spaceship] + /// shift-expression + /// compare-expression '<=>' shift-expression /// - /// \param DKind Kind of current directive. - /// \param CKind Kind of current clause. - /// \param FirstClause true, if this is the first clause of a kind \a CKind - /// in current directive. + /// relational-expression: [C99 6.5.8] + /// compare-expression + /// relational-expression '<' compare-expression + /// relational-expression '>' compare-expression + /// relational-expression '<=' compare-expression + /// relational-expression '>=' compare-expression /// - OMPClause *ParseOpenMPClause(OpenMPDirectiveKind DKind, - OpenMPClauseKind CKind, bool FirstClause); - /// Parses clause with a single expression of a kind \a Kind. + /// equality-expression: [C99 6.5.9] + /// relational-expression + /// equality-expression '==' relational-expression + /// equality-expression '!=' relational-expression /// - /// \param Kind Kind of current clause. - /// \param ParseOnly true to skip the clause's semantic actions and return - /// nullptr. + /// AND-expression: [C99 6.5.10] + /// equality-expression + /// AND-expression '&' equality-expression /// - OMPClause *ParseOpenMPSingleExprClause(OpenMPClauseKind Kind, - bool ParseOnly); - /// Parses simple clause of a kind \a Kind. + /// exclusive-OR-expression: [C99 6.5.11] + /// AND-expression + /// exclusive-OR-expression '^' AND-expression /// - /// \param Kind Kind of current clause. 
- /// \param ParseOnly true to skip the clause's semantic actions and return - /// nullptr. + /// inclusive-OR-expression: [C99 6.5.12] + /// exclusive-OR-expression + /// inclusive-OR-expression '|' exclusive-OR-expression /// - OMPClause *ParseOpenMPSimpleClause(OpenMPClauseKind Kind, bool ParseOnly); - /// Parses indirect clause - /// \param ParseOnly true to skip the clause's semantic actions and return - // false; - bool ParseOpenMPIndirectClause(SemaOpenMP::DeclareTargetContextInfo &DTCI, - bool ParseOnly); - /// Parses clause with a single expression and an additional argument - /// of a kind \a Kind. + /// logical-AND-expression: [C99 6.5.13] + /// inclusive-OR-expression + /// logical-AND-expression '&&' inclusive-OR-expression /// - /// \param DKind Directive kind. - /// \param Kind Kind of current clause. - /// \param ParseOnly true to skip the clause's semantic actions and return - /// nullptr. + /// logical-OR-expression: [C99 6.5.14] + /// logical-AND-expression + /// logical-OR-expression '||' logical-AND-expression /// - OMPClause *ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind, - OpenMPClauseKind Kind, - bool ParseOnly); + /// conditional-expression: [C99 6.5.15] + /// logical-OR-expression + /// logical-OR-expression '?' expression ':' conditional-expression + /// [GNU] logical-OR-expression '?' ':' conditional-expression + /// [C++] the third operand is an assignment-expression + /// + /// assignment-expression: [C99 6.5.16] + /// conditional-expression + /// unary-expression assignment-operator assignment-expression + /// [C++] throw-expression [C++ 15] + /// + /// assignment-operator: one of + /// = *= /= %= += -= <<= >>= &= ^= |= + /// + /// expression: [C99 6.5.17] + /// assignment-expression ...[opt] + /// expression ',' assignment-expression ...[opt] + /// \endverbatim + ExprResult + ParseExpression(TypeCastState isTypeCast = TypeCastState::NotTypeCast); - /// Parses the 'sizes' clause of a '#pragma omp tile' directive. 
- OMPClause *ParseOpenMPSizesClause(); + ExprResult ParseConstantExpressionInExprEvalContext( + TypeCastState isTypeCast = TypeCastState::NotTypeCast); + ExprResult ParseConstantExpression(); + ExprResult ParseArrayBoundExpression(); + ExprResult ParseCaseExpression(SourceLocation CaseLoc); - /// Parses the 'permutation' clause of a '#pragma omp interchange' directive. - OMPClause *ParseOpenMPPermutationClause(); + /// Parse a constraint-expression. + /// + /// \verbatim + /// constraint-expression: C++2a[temp.constr.decl]p1 + /// logical-or-expression + /// \endverbatim + ExprResult ParseConstraintExpression(); - /// Parses clause without any additional arguments. + /// \brief Parse a constraint-logical-and-expression. /// - /// \param Kind Kind of current clause. - /// \param ParseOnly true to skip the clause's semantic actions and return - /// nullptr. + /// \verbatim + /// C++2a[temp.constr.decl]p1 + /// constraint-logical-and-expression: + /// primary-expression + /// constraint-logical-and-expression '&&' primary-expression /// - OMPClause *ParseOpenMPClause(OpenMPClauseKind Kind, bool ParseOnly = false); - /// Parses clause with the list of variables of a kind \a Kind. + /// \endverbatim + ExprResult ParseConstraintLogicalAndExpression(bool IsTrailingRequiresClause); + + /// \brief Parse a constraint-logical-or-expression. /// - /// \param Kind Kind of current clause. - /// \param ParseOnly true to skip the clause's semantic actions and return - /// nullptr. + /// \verbatim + /// C++2a[temp.constr.decl]p1 + /// constraint-logical-or-expression: + /// constraint-logical-and-expression + /// constraint-logical-or-expression '||' + /// constraint-logical-and-expression /// - OMPClause *ParseOpenMPVarListClause(OpenMPDirectiveKind DKind, - OpenMPClauseKind Kind, bool ParseOnly); + /// \endverbatim + ExprResult ParseConstraintLogicalOrExpression(bool IsTrailingRequiresClause); - /// Parses a clause consisting of a list of expressions. 
+ /// Parse an expr that doesn't include (top-level) commas. + ExprResult ParseAssignmentExpression( + TypeCastState isTypeCast = TypeCastState::NotTypeCast); + + ExprResult ParseConditionalExpression(); + + /// ParseStringLiteralExpression - This handles the various token types that + /// form string literals, and also handles string concatenation [C99 5.1.1.2, + /// translation phase #6]. /// - /// \param Kind The clause to parse. - /// \param ClauseNameLoc [out] The location of the clause name. - /// \param OpenLoc [out] The location of '('. - /// \param CloseLoc [out] The location of ')'. - /// \param Exprs [out] The parsed expressions. - /// \param ReqIntConst If true, each expression must be an integer constant. + /// \verbatim + /// primary-expression: [C99 6.5.1] + /// string-literal + /// \endverbatim + ExprResult ParseStringLiteralExpression(bool AllowUserDefinedLiteral = false); + ExprResult ParseUnevaluatedStringLiteralExpression(); + +private: + /// Whether the '>' token acts as an operator or not. This will be + /// true except when we are parsing an expression within a C++ + /// template argument list, where the '>' closes the template + /// argument list. + bool GreaterThanIsOperator; + + // C++ type trait keywords that can be reverted to identifiers and still be + // used as type traits. + llvm::SmallDenseMap RevertibleTypeTraits; + + OffsetOfKind OffsetOfState = OffsetOfKind::Outside; + + /// The location of the expression statement that is being parsed right now. + /// Used to determine if an expression that is being parsed is a statement or + /// just a regular sub-expression. + SourceLocation ExprStatementTokLoc; + + /// Checks if the \p Level is valid for use in a fold expression. + bool isFoldOperator(prec::Level Level) const; + + /// Checks if the \p Kind is a valid operator for fold expressions. 
+ bool isFoldOperator(tok::TokenKind Kind) const; + + /// We have just started parsing the definition of a new class, + /// so push that class onto our stack of classes that is currently + /// being parsed. + Sema::ParsingClassState + PushParsingClass(Decl *TagOrTemplate, bool TopLevelClass, bool IsInterface); + + /// Deallocate the given parsed class and all of its nested + /// classes. + void DeallocateParsedClasses(ParsingClass *Class); + + /// Pop the top class of the stack of classes that are + /// currently being parsed. /// - /// \return Whether the clause was parsed successfully. - bool ParseOpenMPExprListClause(OpenMPClauseKind Kind, - SourceLocation &ClauseNameLoc, - SourceLocation &OpenLoc, - SourceLocation &CloseLoc, - SmallVectorImpl &Exprs, - bool ReqIntConst = false); + /// This routine should be called when we have finished parsing the + /// definition of a class, but have not yet popped the Scope + /// associated with the class's definition. + void PopParsingClass(Sema::ParsingClassState); - /// Parses and creates OpenMP 5.0 iterators expression: - /// = 'iterator' '(' { [ ] identifier = - /// }+ ')' - ExprResult ParseOpenMPIteratorsExpr(); + ExprResult ParseStringLiteralExpression(bool AllowUserDefinedLiteral, + bool Unevaluated); - /// Parses allocators and traits in the context of the uses_allocator clause. - /// Expected format: - /// '(' { [ '(' ')' ] }+ ')' - OMPClause *ParseOpenMPUsesAllocatorClause(OpenMPDirectiveKind DKind); + /// This routine is called when the '@' is seen and consumed. + /// Current token is an Identifier and is not a 'try'. This + /// routine is necessary to disambiguate \@try-statement from, + /// for example, \@encode-expression. + /// + ExprResult ParseExpressionWithLeadingAt(SourceLocation AtLoc); - /// Parses the 'interop' parts of the 'append_args' and 'init' clauses. 
- bool ParseOMPInteropInfo(OMPInteropInfo &InteropInfo, OpenMPClauseKind Kind); + /// This routine is called when a leading '__extension__' is seen and + /// consumed. This is necessary because the token gets consumed in the + /// process of disambiguating between an expression and a declaration. + ExprResult ParseExpressionWithLeadingExtension(SourceLocation ExtLoc); - /// Parses clause with an interop variable of kind \a Kind. + /// Parse a binary expression that starts with \p LHS and has a + /// precedence of at least \p MinPrec. + ExprResult ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec); + + bool isRevertibleTypeTrait(const IdentifierInfo *Id, + clang::tok::TokenKind *Kind = nullptr); + + /// Parse a cast-expression, or, if \p isUnaryExpression is true, parse + /// a unary-expression. /// - /// \param Kind Kind of current clause. - /// \param ParseOnly true to skip the clause's semantic actions and return - /// nullptr. - // - OMPClause *ParseOpenMPInteropClause(OpenMPClauseKind Kind, bool ParseOnly); + /// \p isAddressOfOperand exists because an id-expression that is the operand + /// of address-of gets special treatment due to member pointers. NotCastExpr + /// is set to true if the token is not the start of a cast-expression, and no + /// diagnostic is emitted in this case and no tokens are consumed. + /// + /// \verbatim + /// cast-expression: [C99 6.5.4] + /// unary-expression + /// '(' type-name ')' cast-expression + /// + /// unary-expression: [C99 6.5.3] + /// postfix-expression + /// '++' unary-expression + /// '--' unary-expression + /// [Coro] 'co_await' cast-expression + /// unary-operator cast-expression + /// 'sizeof' unary-expression + /// 'sizeof' '(' type-name ')' + /// [C++11] 'sizeof' '...' 
'(' identifier ')' + /// [GNU] '__alignof' unary-expression + /// [GNU] '__alignof' '(' type-name ')' + /// [C11] '_Alignof' '(' type-name ')' + /// [C++11] 'alignof' '(' type-id ')' + /// [C2y] '_Countof' unary-expression + /// [C2y] '_Countof' '(' type-name ')' + /// [GNU] '&&' identifier + /// [C++11] 'noexcept' '(' expression ')' [C++11 5.3.7] + /// [C++] new-expression + /// [C++] delete-expression + /// + /// unary-operator: one of + /// '&' '*' '+' '-' '~' '!' + /// [GNU] '__extension__' '__real' '__imag' + /// + /// primary-expression: [C99 6.5.1] + /// [C99] identifier + /// [C++] id-expression + /// constant + /// string-literal + /// [C++] boolean-literal [C++ 2.13.5] + /// [C++11] 'nullptr' [C++11 2.14.7] + /// [C++11] user-defined-literal + /// '(' expression ')' + /// [C11] generic-selection + /// [C++2a] requires-expression + /// '__func__' [C99 6.4.2.2] + /// [GNU] '__FUNCTION__' + /// [MS] '__FUNCDNAME__' + /// [MS] 'L__FUNCTION__' + /// [MS] '__FUNCSIG__' + /// [MS] 'L__FUNCSIG__' + /// [GNU] '__PRETTY_FUNCTION__' + /// [GNU] '(' compound-statement ')' + /// [GNU] '__builtin_va_arg' '(' assignment-expression ',' type-name ')' + /// [GNU] '__builtin_offsetof' '(' type-name ',' offsetof-member-designator')' + /// [GNU] '__builtin_choose_expr' '(' assign-expr ',' assign-expr ',' + /// assign-expr ')' + /// [GNU] '__builtin_FILE' '(' ')' + /// [CLANG] '__builtin_FILE_NAME' '(' ')' + /// [GNU] '__builtin_FUNCTION' '(' ')' + /// [MS] '__builtin_FUNCSIG' '(' ')' + /// [GNU] '__builtin_LINE' '(' ')' + /// [CLANG] '__builtin_COLUMN' '(' ')' + /// [GNU] '__builtin_source_location' '(' ')' + /// [GNU] '__builtin_types_compatible_p' '(' type-name ',' type-name ')' + /// [GNU] '__null' + /// [OBJC] '[' objc-message-expr ']' + /// [OBJC] '\@selector' '(' objc-selector-arg ')' + /// [OBJC] '\@protocol' '(' identifier ')' + /// [OBJC] '\@encode' '(' type-name ')' + /// [OBJC] objc-string-literal + /// [C++] simple-type-specifier '(' expression-list[opt] ')' [C++ 
5.2.3] + /// [C++11] simple-type-specifier braced-init-list [C++11 5.2.3] + /// [C++] typename-specifier '(' expression-list[opt] ')' [C++ 5.2.3] + /// [C++11] typename-specifier braced-init-list [C++11 5.2.3] + /// [C++] 'const_cast' '<' type-name '>' '(' expression ')' [C++ 5.2p1] + /// [C++] 'dynamic_cast' '<' type-name '>' '(' expression ')' [C++ 5.2p1] + /// [C++] 'reinterpret_cast' '<' type-name '>' '(' expression ')' [C++ 5.2p1] + /// [C++] 'static_cast' '<' type-name '>' '(' expression ')' [C++ 5.2p1] + /// [C++] 'typeid' '(' expression ')' [C++ 5.2p1] + /// [C++] 'typeid' '(' type-id ')' [C++ 5.2p1] + /// [C++] 'this' [C++ 9.3.2] + /// [G++] unary-type-trait '(' type-id ')' + /// [G++] binary-type-trait '(' type-id ',' type-id ')' [TODO] + /// [EMBT] array-type-trait '(' type-id ',' integer ')' + /// [clang] '^' block-literal + /// + /// constant: [C99 6.4.4] + /// integer-constant + /// floating-constant + /// enumeration-constant -> identifier + /// character-constant + /// + /// id-expression: [C++ 5.1] + /// unqualified-id + /// qualified-id + /// + /// unqualified-id: [C++ 5.1] + /// identifier + /// operator-function-id + /// conversion-function-id + /// '~' class-name + /// template-id + /// + /// new-expression: [C++ 5.3.4] + /// '::'[opt] 'new' new-placement[opt] new-type-id + /// new-initializer[opt] + /// '::'[opt] 'new' new-placement[opt] '(' type-id ')' + /// new-initializer[opt] + /// + /// delete-expression: [C++ 5.3.5] + /// '::'[opt] 'delete' cast-expression + /// '::'[opt] 'delete' '[' ']' cast-expression + /// + /// [GNU/Embarcadero] unary-type-trait: + /// '__is_arithmetic' + /// '__is_floating_point' + /// '__is_integral' + /// '__is_lvalue_expr' + /// '__is_rvalue_expr' + /// '__is_complete_type' + /// '__is_void' + /// '__is_array' + /// '__is_function' + /// '__is_reference' + /// '__is_lvalue_reference' + /// '__is_rvalue_reference' + /// '__is_fundamental' + /// '__is_object' + /// '__is_scalar' + /// '__is_compound' + /// 
'__is_pointer' + /// '__is_member_object_pointer' + /// '__is_member_function_pointer' + /// '__is_member_pointer' + /// '__is_const' + /// '__is_volatile' + /// '__is_trivial' + /// '__is_standard_layout' + /// '__is_signed' + /// '__is_unsigned' + /// + /// [GNU] unary-type-trait: + /// '__has_nothrow_assign' + /// '__has_nothrow_copy' + /// '__has_nothrow_constructor' + /// '__has_trivial_assign' [TODO] + /// '__has_trivial_copy' [TODO] + /// '__has_trivial_constructor' + /// '__has_trivial_destructor' + /// '__has_virtual_destructor' + /// '__is_abstract' [TODO] + /// '__is_class' + /// '__is_empty' [TODO] + /// '__is_enum' + /// '__is_final' + /// '__is_pod' + /// '__is_polymorphic' + /// '__is_sealed' [MS] + /// '__is_trivial' + /// '__is_union' + /// '__has_unique_object_representations' + /// + /// [Clang] unary-type-trait: + /// '__is_aggregate' + /// '__trivially_copyable' + /// + /// binary-type-trait: + /// [GNU] '__is_base_of' + /// [MS] '__is_convertible_to' + /// '__is_convertible' + /// '__is_same' + /// + /// [Embarcadero] array-type-trait: + /// '__array_rank' + /// '__array_extent' + /// + /// [Embarcadero] expression-trait: + /// '__is_lvalue_expr' + /// '__is_rvalue_expr' + /// \endverbatim + /// + ExprResult ParseCastExpression(CastParseKind ParseKind, + bool isAddressOfOperand, bool &NotCastExpr, + TypeCastState isTypeCast, + bool isVectorLiteral = false, + bool *NotPrimaryExpression = nullptr); + ExprResult + ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand = false, + TypeCastState isTypeCast = TypeCastState::NotTypeCast, + bool isVectorLiteral = false, + bool *NotPrimaryExpression = nullptr); - /// Parses a ompx_attribute clause + /// Returns true if the next token cannot start an expression. + bool isNotExpressionStart(); + + /// Returns true if the next token would start a postfix-expression + /// suffix. 
+ bool isPostfixExpressionSuffixStart() { + tok::TokenKind K = Tok.getKind(); + return (K == tok::l_square || K == tok::l_paren || K == tok::period || + K == tok::arrow || K == tok::plusplus || K == tok::minusminus); + } + + /// Once the leading part of a postfix-expression is parsed, this + /// method parses any suffixes that apply. /// - /// \param ParseOnly true to skip the clause's semantic actions and return - /// nullptr. - // - OMPClause *ParseOpenMPOMPXAttributesClause(bool ParseOnly); + /// \verbatim + /// postfix-expression: [C99 6.5.2] + /// primary-expression + /// postfix-expression '[' expression ']' + /// postfix-expression '[' braced-init-list ']' + /// postfix-expression '[' expression-list [opt] ']' [C++23 12.4.5] + /// postfix-expression '(' argument-expression-list[opt] ')' + /// postfix-expression '.' identifier + /// postfix-expression '->' identifier + /// postfix-expression '++' + /// postfix-expression '--' + /// '(' type-name ')' '{' initializer-list '}' + /// '(' type-name ')' '{' initializer-list ',' '}' + /// + /// argument-expression-list: [C99 6.5.2] + /// argument-expression ...[opt] + /// argument-expression-list ',' assignment-expression ...[opt] + /// \endverbatim + ExprResult ParsePostfixExpressionSuffix(ExprResult LHS); -public: - /// Parses simple expression in parens for single-expression clauses of OpenMP - /// constructs. - /// \param RLoc Returned location of right paren. - ExprResult ParseOpenMPParensExpr(StringRef ClauseName, SourceLocation &RLoc, - bool IsAddressOfOperand = false); + /// Parse a sizeof or alignof expression. + /// + /// \verbatim + /// unary-expression: [C99 6.5.3] + /// 'sizeof' unary-expression + /// 'sizeof' '(' type-name ')' + /// [C++11] 'sizeof' '...' 
'(' identifier ')' + /// [Clang] '__datasizeof' unary-expression + /// [Clang] '__datasizeof' '(' type-name ')' + /// [GNU] '__alignof' unary-expression + /// [GNU] '__alignof' '(' type-name ')' + /// [C11] '_Alignof' '(' type-name ')' + /// [C++11] 'alignof' '(' type-id ')' + /// [C2y] '_Countof' unary-expression + /// [C2y] '_Countof' '(' type-name ')' + /// \endverbatim + ExprResult ParseUnaryExprOrTypeTraitExpression(); - /// Parses a reserved locator like 'omp_all_memory'. - bool ParseOpenMPReservedLocator(OpenMPClauseKind Kind, - SemaOpenMP::OpenMPVarListDataTy &Data, - const LangOptions &LangOpts); - /// Parses clauses with list. - bool ParseOpenMPVarList(OpenMPDirectiveKind DKind, OpenMPClauseKind Kind, - SmallVectorImpl &Vars, - SemaOpenMP::OpenMPVarListDataTy &Data); - bool ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, - bool ObjectHadErrors, bool EnteringContext, - bool AllowDestructorName, bool AllowConstructorName, - bool AllowDeductionGuide, - SourceLocation *TemplateKWLoc, UnqualifiedId &Result); + /// ParseBuiltinPrimaryExpression + /// + /// \verbatim + /// primary-expression: [C99 6.5.1] + /// [GNU] '__builtin_va_arg' '(' assignment-expression ',' type-name ')' + /// [GNU] '__builtin_offsetof' '(' type-name ',' offsetof-member-designator')' + /// [GNU] '__builtin_choose_expr' '(' assign-expr ',' assign-expr ',' + /// assign-expr ')' + /// [GNU] '__builtin_types_compatible_p' '(' type-name ',' type-name ')' + /// [GNU] '__builtin_FILE' '(' ')' + /// [CLANG] '__builtin_FILE_NAME' '(' ')' + /// [GNU] '__builtin_FUNCTION' '(' ')' + /// [MS] '__builtin_FUNCSIG' '(' ')' + /// [GNU] '__builtin_LINE' '(' ')' + /// [CLANG] '__builtin_COLUMN' '(' ')' + /// [GNU] '__builtin_source_location' '(' ')' + /// [OCL] '__builtin_astype' '(' assignment-expression ',' type-name ')' + /// + /// [GNU] offsetof-member-designator: + /// [GNU] identifier + /// [GNU] offsetof-member-designator '.' 
identifier + /// [GNU] offsetof-member-designator '[' expression ']' + /// \endverbatim + ExprResult ParseBuiltinPrimaryExpression(); - /// Parses the mapper modifier in map, to, and from clauses. - bool parseMapperModifier(SemaOpenMP::OpenMPVarListDataTy &Data); - /// Parses map-type-modifiers in map clause. - /// map([ [map-type-modifier[,] [map-type-modifier[,] ...] map-type : ] list) - /// where, map-type-modifier ::= always | close | mapper(mapper-identifier) - bool parseMapTypeModifiers(SemaOpenMP::OpenMPVarListDataTy &Data); + /// Parse a __builtin_sycl_unique_stable_name expression. Accepts a type-id + /// as a parameter. + ExprResult ParseSYCLUniqueStableNameExpression(); - //===--------------------------------------------------------------------===// - // OpenACC Parsing. + /// ParseExprAfterUnaryExprOrTypeTrait - We parsed a typeof/sizeof/alignof/ + /// vec_step and we are at the start of an expression or a parenthesized + /// type-id. OpTok is the operand token (typeof/sizeof/alignof). Returns the + /// expression (isCastExpr == false) or the type (isCastExpr == true). 
+ /// + /// \verbatim + /// unary-expression: [C99 6.5.3] + /// 'sizeof' unary-expression + /// 'sizeof' '(' type-name ')' + /// [Clang] '__datasizeof' unary-expression + /// [Clang] '__datasizeof' '(' type-name ')' + /// [GNU] '__alignof' unary-expression + /// [GNU] '__alignof' '(' type-name ')' + /// [C11] '_Alignof' '(' type-name ')' + /// [C++0x] 'alignof' '(' type-id ')' + /// + /// [GNU] typeof-specifier: + /// typeof ( expressions ) + /// typeof ( type-name ) + /// [GNU/C++] typeof unary-expression + /// [C23] typeof-specifier: + /// typeof '(' typeof-specifier-argument ')' + /// typeof_unqual '(' typeof-specifier-argument ')' + /// + /// typeof-specifier-argument: + /// expression + /// type-name + /// + /// [OpenCL 1.1 6.11.12] vec_step built-in function: + /// vec_step ( expressions ) + /// vec_step ( type-name ) + /// \endverbatim + ExprResult ParseExprAfterUnaryExprOrTypeTrait(const Token &OpTok, + bool &isCastExpr, + ParsedType &CastTy, + SourceRange &CastRange); - /// Placeholder for now, should just ignore the directives after emitting a - /// diagnostic. Eventually will be split into a few functions to parse - /// different situations. -public: - DeclGroupPtrTy ParseOpenACCDirectiveDecl(AccessSpecifier &AS, - ParsedAttributes &Attrs, - DeclSpec::TST TagType, - Decl *TagDecl); - StmtResult ParseOpenACCDirectiveStmt(); + /// ParseExpressionList - Used for C/C++ (argument-)expression-list. 
+ /// + /// \verbatim + /// argument-expression-list: + /// assignment-expression + /// argument-expression-list , assignment-expression + /// + /// [C++] expression-list: + /// [C++] assignment-expression + /// [C++] expression-list , assignment-expression + /// + /// [C++0x] expression-list: + /// [C++0x] initializer-list + /// + /// [C++0x] initializer-list + /// [C++0x] initializer-clause ...[opt] + /// [C++0x] initializer-list , initializer-clause ...[opt] + /// + /// [C++0x] initializer-clause: + /// [C++0x] assignment-expression + /// [C++0x] braced-init-list + /// \endverbatim + bool ParseExpressionList(SmallVectorImpl &Exprs, + llvm::function_ref ExpressionStarts = + llvm::function_ref(), + bool FailImmediatelyOnInvalidExpr = false, + bool EarlyTypoCorrection = false); -private: - /// A struct to hold the information that got parsed by ParseOpenACCDirective, - /// so that the callers of it can use that to construct the appropriate AST - /// nodes. - struct OpenACCDirectiveParseInfo { - OpenACCDirectiveKind DirKind; - SourceLocation StartLoc; - SourceLocation DirLoc; - SourceLocation LParenLoc; - SourceLocation RParenLoc; - SourceLocation EndLoc; - SourceLocation MiscLoc; - OpenACCAtomicKind AtomicKind; - SmallVector Exprs; - SmallVector Clauses; - // TODO OpenACC: As we implement support for the Atomic, Routine, and Cache - // constructs, we likely want to put that information in here as well. - }; + /// ParseSimpleExpressionList - A simple comma-separated list of expressions, + /// used for misc language extensions. 
+ /// + /// \verbatim + /// simple-expression-list: + /// assignment-expression + /// simple-expression-list , assignment-expression + /// \endverbatim + bool ParseSimpleExpressionList(SmallVectorImpl &Exprs); - struct OpenACCWaitParseInfo { - bool Failed = false; - Expr *DevNumExpr = nullptr; - SourceLocation QueuesLoc; - SmallVector QueueIdExprs; + /// ParseParenExpression - This parses the unit that starts with a '(' token, + /// based on what is allowed by ExprType. The actual thing parsed is returned + /// in ExprType. If stopIfCastExpr is true, it will only return the parsed + /// type, not the parsed cast-expression. + /// + /// \verbatim + /// primary-expression: [C99 6.5.1] + /// '(' expression ')' + /// [GNU] '(' compound-statement ')' (if !ParenExprOnly) + /// postfix-expression: [C99 6.5.2] + /// '(' type-name ')' '{' initializer-list '}' + /// '(' type-name ')' '{' initializer-list ',' '}' + /// cast-expression: [C99 6.5.4] + /// '(' type-name ')' cast-expression + /// [ARC] bridged-cast-expression + /// [ARC] bridged-cast-expression: + /// (__bridge type-name) cast-expression + /// (__bridge_transfer type-name) cast-expression + /// (__bridge_retained type-name) cast-expression + /// fold-expression: [C++1z] + /// '(' cast-expression fold-operator '...' ')' + /// '(' '...' fold-operator cast-expression ')' + /// '(' cast-expression fold-operator '...' 
+ /// fold-operator cast-expression ')' + /// [OPENMP] Array shaping operation + /// '(' '[' expression ']' { '[' expression ']' } cast-expression + /// \endverbatim + ExprResult ParseParenExpression(ParenParseOption &ExprType, + bool stopIfCastExpr, bool isTypeCast, + ParsedType &CastTy, + SourceLocation &RParenLoc); - SmallVector getAllExprs() { - SmallVector Out; - Out.push_back(DevNumExpr); - llvm::append_range(Out, QueueIdExprs); - return Out; - } - }; - struct OpenACCCacheParseInfo { - bool Failed = false; - SourceLocation ReadOnlyLoc; - SmallVector Vars; - }; + /// ParseCompoundLiteralExpression - We have parsed the parenthesized + /// type-name and we are at the left brace. + /// + /// \verbatim + /// postfix-expression: [C99 6.5.2] + /// '(' type-name ')' '{' initializer-list '}' + /// '(' type-name ')' '{' initializer-list ',' '}' + /// \endverbatim + ExprResult ParseCompoundLiteralExpression(ParsedType Ty, + SourceLocation LParenLoc, + SourceLocation RParenLoc); - /// Represents the 'error' state of parsing an OpenACC Clause, and stores - /// whether we can continue parsing, or should give up on the directive. - enum class OpenACCParseCanContinue { Cannot = 0, Can = 1 }; + /// ParseGenericSelectionExpression - Parse a C11 generic-selection + /// [C11 6.5.1.1]. + /// + /// \verbatim + /// generic-selection: + /// _Generic ( assignment-expression , generic-assoc-list ) + /// generic-assoc-list: + /// generic-association + /// generic-assoc-list , generic-association + /// generic-association: + /// type-name : assignment-expression + /// default : assignment-expression + /// \endverbatim + /// + /// As an extension, Clang also accepts: + /// \verbatim + /// generic-selection: + /// _Generic ( type-name, generic-assoc-list ) + /// \endverbatim + ExprResult ParseGenericSelectionExpression(); - /// A type to represent the state of parsing an OpenACC Clause. 
Situations - /// that result in an OpenACCClause pointer are a success and can continue - /// parsing, however some other situations can also continue. - /// FIXME: This is better represented as a std::expected when we get C++23. - using OpenACCClauseParseResult = - llvm::PointerIntPair; + /// ParseObjCBoolLiteral - This handles the objective-c Boolean literals. + /// + /// '__objc_yes' + /// '__objc_no' + ExprResult ParseObjCBoolLiteral(); - OpenACCClauseParseResult OpenACCCanContinue(); - OpenACCClauseParseResult OpenACCCannotContinue(); - OpenACCClauseParseResult OpenACCSuccess(OpenACCClause *Clause); + /// Parse A C++1z fold-expression after the opening paren and optional + /// left-hand-side expression. + /// + /// \verbatim + /// fold-expression: + /// ( cast-expression fold-operator ... ) + /// ( ... fold-operator cast-expression ) + /// ( cast-expression fold-operator ... fold-operator cast-expression ) + /// \endverbatim + ExprResult ParseFoldExpression(ExprResult LHS, BalancedDelimiterTracker &T); - /// Parses the OpenACC directive (the entire pragma) including the clause - /// list, but does not produce the main AST node. - OpenACCDirectiveParseInfo ParseOpenACCDirective(); - /// Helper that parses an ID Expression based on the language options. - ExprResult ParseOpenACCIDExpression(); - /// Parses the variable list for the `cache` construct. - OpenACCCacheParseInfo ParseOpenACCCacheVarList(); - /// Parses the 'modifier-list' for copy, copyin, copyout, create. - OpenACCModifierKind tryParseModifierList(OpenACCClauseKind CK); + void injectEmbedTokens(); - using OpenACCVarParseResult = std::pair; - /// Parses a single variable in a variable list for OpenACC. - OpenACCVarParseResult ParseOpenACCVar(OpenACCDirectiveKind DK, - OpenACCClauseKind CK); - /// Parses the variable list for the variety of places that take a var-list. 
- llvm::SmallVector ParseOpenACCVarList(OpenACCDirectiveKind DK, - OpenACCClauseKind CK); - /// Parses any parameters for an OpenACC Clause, including required/optional - /// parens. + //===--------------------------------------------------------------------===// + // clang Expressions + + /// ParseBlockLiteralExpression - Parse a block literal, which roughly looks + /// like ^(int x){ return x+1; } + /// + /// \verbatim + /// block-literal: + /// [clang] '^' block-args[opt] compound-statement + /// [clang] '^' block-id compound-statement + /// [clang] block-args: + /// [clang] '(' parameter-list ')' + /// \endverbatim + ExprResult ParseBlockLiteralExpression(); // ^{...} + + /// Parse an assignment expression where part of an Objective-C message + /// send has already been parsed. + /// + /// In this case \p LBracLoc indicates the location of the '[' of the message + /// send, and either \p ReceiverName or \p ReceiverExpr is non-null indicating + /// the receiver of the message. + /// + /// Since this handles full assignment-expression's, it handles postfix + /// expressions and other binary operators for these expressions as well. + ExprResult ParseAssignmentExprWithObjCMessageExprStart( + SourceLocation LBracloc, SourceLocation SuperLoc, ParsedType ReceiverType, + Expr *ReceiverExpr); + + /// Return true if we know that we are definitely looking at a + /// decl-specifier, and isn't part of an expression such as a function-style + /// cast. Return false if it's not a decl-specifier, or we're not sure. + bool isKnownToBeDeclarationSpecifier() { + if (getLangOpts().CPlusPlus) + return isCXXDeclarationSpecifier(ImplicitTypenameContext::No) == + TPResult::True; + return isDeclarationSpecifier(ImplicitTypenameContext::No, true); + } + + /// Checks whether the current tokens form a type-id or an expression for the + /// purposes of use as the initial operand to a generic selection expression. 
+ /// This requires special handling in C++ because it accepts either a type or + /// an expression, and we need to disambiguate which is which. However, we + /// cannot use the same logic as we've used for sizeof expressions, because + /// that logic relies on the operator only accepting a single argument, + /// whereas _Generic accepts a list of arguments. + bool isTypeIdForGenericSelection() { + if (getLangOpts().CPlusPlus) { + bool isAmbiguous; + return isCXXTypeId(TentativeCXXTypeIdContext::AsGenericSelectionArgument, + isAmbiguous); + } + return isTypeSpecifierQualifier(); + } + + /// Checks if the current tokens form type-id or expression. + /// It is similar to isTypeIdInParens but does not suppose that type-id + /// is in parenthesis. + bool isTypeIdUnambiguously() { + if (getLangOpts().CPlusPlus) { + bool isAmbiguous; + return isCXXTypeId(TentativeCXXTypeIdContext::Unambiguous, isAmbiguous); + } + return isTypeSpecifierQualifier(); + } + + /// ParseBlockId - Parse a block-id, which roughly looks like int (int x). + /// + /// \verbatim + /// [clang] block-id: + /// [clang] specifier-qualifier-list block-declarator + /// \endverbatim + void ParseBlockId(SourceLocation CaretLoc); + + /// Parse availability query specification. 
+ /// + /// \verbatim + /// availability-spec: + /// '*' + /// identifier version-tuple + /// \endverbatim + std::optional ParseAvailabilitySpec(); + ExprResult ParseAvailabilityCheckExpr(SourceLocation StartLoc); + + /// Tries to parse cast part of OpenMP array shaping operation: + /// \verbatim + /// '[' expression ']' { '[' expression ']' } ')' + /// \endverbatim + bool tryParseOpenMPArrayShapingCastPart(); + + ExprResult ParseBuiltinPtrauthTypeDiscriminator(); + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name C++ Expressions + /// Implementations are in ParseExprCXX.cpp + ///@{ + +public: + /// Parse a C++ unqualified-id (or a C identifier), which describes the + /// name of an entity. + /// + /// \verbatim + /// unqualified-id: [C++ expr.prim.general] + /// identifier + /// operator-function-id + /// conversion-function-id + /// [C++0x] literal-operator-id [TODO] + /// ~ class-name + /// template-id + /// \endverbatim + /// + /// \param SS The nested-name-specifier that preceded this unqualified-id. If + /// non-empty, then we are parsing the unqualified-id of a qualified-id. + /// + /// \param ObjectType if this unqualified-id occurs within a member access + /// expression, the type of the base object whose member is being accessed. + /// + /// \param ObjectHadErrors if this unqualified-id occurs within a member + /// access expression, indicates whether the original subexpressions had any + /// errors. When true, diagnostics for missing 'template' keyword will be + /// suppressed. + /// + /// \param EnteringContext whether we are entering the scope of the + /// nested-name-specifier. + /// + /// \param AllowDestructorName whether we allow parsing of a destructor name. + /// + /// \param AllowConstructorName whether we allow parsing a constructor name. + /// + /// \param AllowDeductionGuide whether we allow parsing a deduction guide + /// name. 
+ /// + /// \param Result on a successful parse, contains the parsed unqualified-id. + /// + /// \returns true if parsing fails, false otherwise. + bool ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, + bool ObjectHadErrors, bool EnteringContext, + bool AllowDestructorName, bool AllowConstructorName, + bool AllowDeductionGuide, + SourceLocation *TemplateKWLoc, UnqualifiedId &Result); + +private: + /// ColonIsSacred - When this is false, we aggressively try to recover from + /// code like "foo : bar" as if it were a typo for "foo :: bar". This is not + /// safe in case statements and a few other things. This is managed by the + /// ColonProtectionRAIIObject RAII object. + bool ColonIsSacred; + + /// ParseCXXAmbiguousParenExpression - We have parsed the left paren of a + /// parenthesized ambiguous type-id. This uses tentative parsing to + /// disambiguate based on the context past the parens. + ExprResult ParseCXXAmbiguousParenExpression( + ParenParseOption &ExprType, ParsedType &CastTy, + BalancedDelimiterTracker &Tracker, ColonProtectionRAIIObject &ColonProt); + + //===--------------------------------------------------------------------===// + // C++ Expressions + ExprResult tryParseCXXIdExpression(CXXScopeSpec &SS, bool isAddressOfOperand, + Token &Replacement); + + ExprResult tryParseCXXPackIndexingExpression(ExprResult PackIdExpression); + ExprResult ParseCXXPackIndexingExpression(ExprResult PackIdExpression); + + /// ParseCXXIdExpression - Handle id-expression. 
+ /// + /// \verbatim + /// id-expression: + /// unqualified-id + /// qualified-id + /// + /// qualified-id: + /// '::'[opt] nested-name-specifier 'template'[opt] unqualified-id + /// '::' identifier + /// '::' operator-function-id + /// '::' template-id + /// + /// NOTE: The standard specifies that, for qualified-id, the parser does not + /// expect: + /// + /// '::' conversion-function-id + /// '::' '~' class-name + /// \endverbatim + /// + /// This may cause a slight inconsistency on diagnostics: + /// + /// class C {}; + /// namespace A {} + /// void f() { + /// :: A :: ~ C(); // Some Sema error about using destructor with a + /// // namespace. + /// :: ~ C(); // Some Parser error like 'unexpected ~'. + /// } + /// + /// We simplify the parser a bit and make it work like: + /// + /// \verbatim + /// qualified-id: + /// '::'[opt] nested-name-specifier 'template'[opt] unqualified-id + /// '::' unqualified-id + /// \endverbatim + /// + /// That way Sema can handle and report similar errors for namespaces and the + /// global scope. + /// + /// The isAddressOfOperand parameter indicates that this id-expression is a + /// direct operand of the address-of operator. This is, besides member + /// contexts, the only place where a qualified-id naming a non-static class + /// member may appear. + /// + ExprResult ParseCXXIdExpression(bool isAddressOfOperand = false); + + // Are the two tokens adjacent in the same source file? + bool areTokensAdjacent(const Token &A, const Token &B); + + // Check for '<::' which should be '< ::' instead of '[:' when following + // a template name. + void CheckForTemplateAndDigraph(Token &Next, ParsedType ObjectTypePtr, + bool EnteringContext, IdentifierInfo &II, + CXXScopeSpec &SS); + + /// Parse global scope or nested-name-specifier if present. + /// + /// Parses a C++ global scope specifier ('::') or nested-name-specifier (which + /// may be preceded by '::'). 
Note that this routine will not parse ::new or + /// ::delete; it will just leave them in the token stream. + /// + /// \verbatim + /// '::'[opt] nested-name-specifier + /// '::' + /// + /// nested-name-specifier: + /// type-name '::' + /// namespace-name '::' + /// nested-name-specifier identifier '::' + /// nested-name-specifier 'template'[opt] simple-template-id '::' + /// \endverbatim + /// + /// + /// \param SS the scope specifier that will be set to the parsed + /// nested-name-specifier (or empty) + /// + /// \param ObjectType if this nested-name-specifier is being parsed following + /// the "." or "->" of a member access expression, this parameter provides the + /// type of the object whose members are being accessed. + /// + /// \param ObjectHadErrors if this unqualified-id occurs within a member + /// access expression, indicates whether the original subexpressions had any + /// errors. When true, diagnostics for missing 'template' keyword will be + /// suppressed. + /// + /// \param EnteringContext whether we will be entering into the context of + /// the nested-name-specifier after parsing it. + /// + /// \param MayBePseudoDestructor When non-NULL, points to a flag that + /// indicates whether this nested-name-specifier may be part of a + /// pseudo-destructor name. In this case, the flag will be set false + /// if we don't actually end up parsing a destructor name. Moreover, + /// if we do end up determining that we are parsing a destructor name, + /// the last component of the nested-name-specifier is not parsed as + /// part of the scope specifier. + /// + /// \param IsTypename If \c true, this nested-name-specifier is known to be + /// part of a type name. This is used to improve error recovery. + /// + /// \param LastII When non-NULL, points to an IdentifierInfo* that will be + /// filled in with the leading identifier in the last component of the + /// nested-name-specifier, if any. 
+ /// + /// \param OnlyNamespace If true, only considers namespaces in lookup. + /// + /// + /// \returns true if there was an error parsing a scope specifier + bool ParseOptionalCXXScopeSpecifier( + CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHasErrors, + bool EnteringContext, bool *MayBePseudoDestructor = nullptr, + bool IsTypename = false, const IdentifierInfo **LastII = nullptr, + bool OnlyNamespace = false, bool InUsingDeclaration = false, + bool Disambiguation = false); + + //===--------------------------------------------------------------------===// + // C++11 5.1.2: Lambda expressions + + /// Result of tentatively parsing a lambda-introducer. + enum class LambdaIntroducerTentativeParse { + /// This appears to be a lambda-introducer, which has been fully parsed. + Success, + /// This is a lambda-introducer, but has not been fully parsed, and this + /// function needs to be called again to parse it. + Incomplete, + /// This is definitely an Objective-C message send expression, rather than + /// a lambda-introducer, attribute-specifier, or array designator. + MessageSend, + /// This is not a lambda-introducer. + Invalid, + }; + + /// ParseLambdaExpression - Parse a C++11 lambda expression. 
+ /// + /// \verbatim + /// lambda-expression: + /// lambda-introducer lambda-declarator compound-statement + /// lambda-introducer '<' template-parameter-list '>' + /// requires-clause[opt] lambda-declarator compound-statement + /// + /// lambda-introducer: + /// '[' lambda-capture[opt] ']' + /// + /// lambda-capture: + /// capture-default + /// capture-list + /// capture-default ',' capture-list + /// + /// capture-default: + /// '&' + /// '=' + /// + /// capture-list: + /// capture + /// capture-list ',' capture + /// + /// capture: + /// simple-capture + /// init-capture [C++1y] + /// + /// simple-capture: + /// identifier + /// '&' identifier + /// 'this' + /// + /// init-capture: [C++1y] + /// identifier initializer + /// '&' identifier initializer + /// + /// lambda-declarator: + /// lambda-specifiers [C++23] + /// '(' parameter-declaration-clause ')' lambda-specifiers + /// requires-clause[opt] + /// + /// lambda-specifiers: + /// decl-specifier-seq[opt] noexcept-specifier[opt] + /// attribute-specifier-seq[opt] trailing-return-type[opt] + /// \endverbatim + /// + ExprResult ParseLambdaExpression(); + + /// Use lookahead and potentially tentative parsing to determine if we are + /// looking at a C++11 lambda expression, and parse it if we are. + /// + /// If we are not looking at a lambda expression, returns ExprError(). + ExprResult TryParseLambdaExpression(); + + /// Parse a lambda introducer. + /// \param Intro A LambdaIntroducer filled in with information about the + /// contents of the lambda-introducer. + /// \param Tentative If non-null, we are disambiguating between a + /// lambda-introducer and some other construct. In this mode, we do not + /// produce any diagnostics or take any other irreversible action + /// unless we're sure that this is a lambda-expression. + /// \return \c true if parsing (or disambiguation) failed with a diagnostic + /// and the caller should bail out / recover. 
+ bool + ParseLambdaIntroducer(LambdaIntroducer &Intro, + LambdaIntroducerTentativeParse *Tentative = nullptr); + + /// ParseLambdaExpressionAfterIntroducer - Parse the rest of a lambda + /// expression. + ExprResult ParseLambdaExpressionAfterIntroducer(LambdaIntroducer &Intro); + + //===--------------------------------------------------------------------===// + // C++ 5.2p1: C++ Casts + + /// ParseCXXCasts - This handles the various ways to cast expressions to + /// another type. + /// + /// \verbatim + /// postfix-expression: [C++ 5.2p1] + /// 'dynamic_cast' '<' type-name '>' '(' expression ')' + /// 'static_cast' '<' type-name '>' '(' expression ')' + /// 'reinterpret_cast' '<' type-name '>' '(' expression ')' + /// 'const_cast' '<' type-name '>' '(' expression ')' + /// \endverbatim + /// + /// C++ for OpenCL s2.3.1 adds: + /// 'addrspace_cast' '<' type-name '>' '(' expression ')' + ExprResult ParseCXXCasts(); + + /// Parse a __builtin_bit_cast(T, E), used to implement C++2a std::bit_cast. + ExprResult ParseBuiltinBitCast(); + + //===--------------------------------------------------------------------===// + // C++ 5.2p1: C++ Type Identification + + /// ParseCXXTypeid - This handles the C++ typeid expression. + /// + /// \verbatim + /// postfix-expression: [C++ 5.2p1] + /// 'typeid' '(' expression ')' + /// 'typeid' '(' type-id ')' + /// \endverbatim + /// + ExprResult ParseCXXTypeid(); + + //===--------------------------------------------------------------------===// + // C++ : Microsoft __uuidof Expression + + /// ParseCXXUuidof - This handles the Microsoft C++ __uuidof expression. + /// + /// \verbatim + /// '__uuidof' '(' expression ')' + /// '__uuidof' '(' type-id ')' + /// \endverbatim + /// + ExprResult ParseCXXUuidof(); + + //===--------------------------------------------------------------------===// + // C++ 5.2.4: C++ Pseudo-Destructor Expressions + + /// Parse a C++ pseudo-destructor expression after the base, + /// . 
or -> operator, and nested-name-specifier have already been + /// parsed. We're handling this fragment of the grammar: + /// + /// \verbatim + /// postfix-expression: [C++2a expr.post] + /// postfix-expression . template[opt] id-expression + /// postfix-expression -> template[opt] id-expression + /// + /// id-expression: + /// qualified-id + /// unqualified-id + /// + /// qualified-id: + /// nested-name-specifier template[opt] unqualified-id + /// + /// nested-name-specifier: + /// type-name :: + /// decltype-specifier :: FIXME: not implemented, but probably only + /// allowed in C++ grammar by accident + /// nested-name-specifier identifier :: + /// nested-name-specifier template[opt] simple-template-id :: + /// [...] + /// + /// unqualified-id: + /// ~ type-name + /// ~ decltype-specifier + /// [...] + /// \endverbatim + /// + /// ... where the all but the last component of the nested-name-specifier + /// has already been parsed, and the base expression is not of a non-dependent + /// class type. + ExprResult ParseCXXPseudoDestructor(Expr *Base, SourceLocation OpLoc, + tok::TokenKind OpKind, CXXScopeSpec &SS, + ParsedType ObjectType); + + //===--------------------------------------------------------------------===// + // C++ 9.3.2: C++ 'this' pointer + + /// ParseCXXThis - This handles the C++ 'this' pointer. + /// + /// C++ 9.3.2: In the body of a non-static member function, the keyword this + /// is a non-lvalue expression whose value is the address of the object for + /// which the function is called. + ExprResult ParseCXXThis(); + + //===--------------------------------------------------------------------===// + // C++ 15: C++ Throw Expression + + /// ParseThrowExpression - This handles the C++ throw expression. 
+ /// + /// \verbatim + /// throw-expression: [C++ 15] + /// 'throw' assignment-expression[opt] + /// \endverbatim + ExprResult ParseThrowExpression(); + + //===--------------------------------------------------------------------===// + // C++ 2.13.5: C++ Boolean Literals + + /// ParseCXXBoolLiteral - This handles the C++ Boolean literals. + /// + /// \verbatim + /// boolean-literal: [C++ 2.13.5] + /// 'true' + /// 'false' + /// \endverbatim + ExprResult ParseCXXBoolLiteral(); + + //===--------------------------------------------------------------------===// + // C++ 5.2.3: Explicit type conversion (functional notation) + + /// ParseCXXTypeConstructExpression - Parse construction of a specified type. + /// Can be interpreted either as function-style casting ("int(x)") + /// or class type construction ("ClassType(x,y,z)") + /// or creation of a value-initialized type ("int()"). + /// See [C++ 5.2.3]. + /// + /// \verbatim + /// postfix-expression: [C++ 5.2p1] + /// simple-type-specifier '(' expression-list[opt] ')' + /// [C++0x] simple-type-specifier braced-init-list + /// typename-specifier '(' expression-list[opt] ')' + /// [C++0x] typename-specifier braced-init-list + /// \endverbatim + /// + /// In C++1z onwards, the type specifier can also be a template-name. + ExprResult ParseCXXTypeConstructExpression(const DeclSpec &DS); + + /// ParseCXXSimpleTypeSpecifier - [C++ 7.1.5.2] Simple type specifiers. + /// This should only be called when the current token is known to be part of + /// simple-type-specifier. 
+ /// + /// \verbatim + /// simple-type-specifier: + /// '::'[opt] nested-name-specifier[opt] type-name + /// '::'[opt] nested-name-specifier 'template' simple-template-id [TODO] + /// char + /// wchar_t + /// bool + /// short + /// int + /// long + /// signed + /// unsigned + /// float + /// double + /// void + /// [GNU] typeof-specifier + /// [C++0x] auto [TODO] + /// + /// type-name: + /// class-name + /// enum-name + /// typedef-name + /// \endverbatim + /// + void ParseCXXSimpleTypeSpecifier(DeclSpec &DS); + + /// ParseCXXTypeSpecifierSeq - Parse a C++ type-specifier-seq (C++ + /// [dcl.name]), which is a non-empty sequence of type-specifiers, + /// e.g., "const short int". Note that the DeclSpec is *not* finished + /// by parsing the type-specifier-seq, because these sequences are + /// typically followed by some form of declarator. Returns true and + /// emits diagnostics if this is not a type-specifier-seq, false + /// otherwise. + /// + /// \verbatim + /// type-specifier-seq: [C++ 8.1] + /// type-specifier type-specifier-seq[opt] + /// \endverbatim + /// + bool ParseCXXTypeSpecifierSeq( + DeclSpec &DS, DeclaratorContext Context = DeclaratorContext::TypeName); + + //===--------------------------------------------------------------------===// + // C++ 5.3.4 and 5.3.5: C++ new and delete + + /// ParseExpressionListOrTypeId - Parse either an expression-list or a + /// type-id. This ambiguity appears in the syntax of the C++ new operator. + /// + /// \verbatim + /// new-expression: + /// '::'[opt] 'new' new-placement[opt] '(' type-id ')' + /// new-initializer[opt] + /// + /// new-placement: + /// '(' expression-list ')' + /// \endverbatim + /// + bool ParseExpressionListOrTypeId(SmallVectorImpl &Exprs, + Declarator &D); + + /// ParseDirectNewDeclarator - Parses a direct-new-declarator. Intended to be + /// passed to ParseDeclaratorInternal. 
+ /// + /// \verbatim + /// direct-new-declarator: + /// '[' expression[opt] ']' + /// direct-new-declarator '[' constant-expression ']' + /// \endverbatim + /// + void ParseDirectNewDeclarator(Declarator &D); + + /// ParseCXXNewExpression - Parse a C++ new-expression. New is used to + /// allocate memory in a typesafe manner and call constructors. + /// + /// This method is called to parse the new expression after the optional :: + /// has been already parsed. If the :: was present, "UseGlobal" is true and + /// "Start" is its location. Otherwise, "Start" is the location of the 'new' + /// token. + /// + /// \verbatim + /// new-expression: + /// '::'[opt] 'new' new-placement[opt] new-type-id + /// new-initializer[opt] + /// '::'[opt] 'new' new-placement[opt] '(' type-id ')' + /// new-initializer[opt] + /// + /// new-placement: + /// '(' expression-list ')' + /// + /// new-type-id: + /// type-specifier-seq new-declarator[opt] + /// [GNU] attributes type-specifier-seq new-declarator[opt] + /// + /// new-declarator: + /// ptr-operator new-declarator[opt] + /// direct-new-declarator + /// + /// new-initializer: + /// '(' expression-list[opt] ')' + /// [C++0x] braced-init-list + /// \endverbatim + /// + ExprResult ParseCXXNewExpression(bool UseGlobal, SourceLocation Start); + + /// ParseCXXDeleteExpression - Parse a C++ delete-expression. Delete is used + /// to free memory allocated by new. + /// + /// This method is called to parse the 'delete' expression after the optional + /// '::' has been already parsed. If the '::' was present, "UseGlobal" is + /// true and "Start" is its location. Otherwise, "Start" is the location of + /// the 'delete' token. 
+ /// + /// \verbatim + /// delete-expression: + /// '::'[opt] 'delete' cast-expression + /// '::'[opt] 'delete' '[' ']' cast-expression + /// \endverbatim + ExprResult ParseCXXDeleteExpression(bool UseGlobal, SourceLocation Start); + + //===--------------------------------------------------------------------===// + // C++ if/switch/while/for condition expression. + + /// ParseCXXCondition - if/switch/while condition expression. + /// + /// \verbatim + /// condition: + /// expression + /// type-specifier-seq declarator '=' assignment-expression + /// [C++11] type-specifier-seq declarator '=' initializer-clause + /// [C++11] type-specifier-seq declarator braced-init-list + /// [Clang] type-specifier-seq ref-qualifier[opt] '[' identifier-list ']' + /// brace-or-equal-initializer + /// [GNU] type-specifier-seq declarator simple-asm-expr[opt] attributes[opt] + /// '=' assignment-expression + /// \endverbatim + /// + /// In C++1z, a condition may in some contexts be preceded by an + /// optional init-statement. This function will parse that too. + /// + /// \param InitStmt If non-null, an init-statement is permitted, and if + /// present will be parsed and stored here. + /// + /// \param Loc The location of the start of the statement that requires this + /// condition, e.g., the "for" in a for loop. + /// + /// \param MissingOK Whether an empty condition is acceptable here. Otherwise + /// it is considered an error to be recovered from. + /// + /// \param FRI If non-null, a for range declaration is permitted, and if + /// present will be parsed and stored here, and a null result will be + /// returned. + /// + /// \param EnterForConditionScope If true, enter a continue/break scope at the + /// appropriate moment for a 'for' loop. + /// + /// \returns The parsed condition. 
+ Sema::ConditionResult ParseCXXCondition(StmtResult *InitStmt, + SourceLocation Loc, + Sema::ConditionKind CK, + bool MissingOK, + ForRangeInfo *FRI = nullptr, + bool EnterForConditionScope = false); + DeclGroupPtrTy ParseAliasDeclarationInInitStatement(DeclaratorContext Context, + ParsedAttributes &Attrs); + + //===--------------------------------------------------------------------===// + // C++ Coroutines + + /// Parse the C++ Coroutines co_yield expression. + /// + /// \verbatim + /// co_yield-expression: + /// 'co_yield' assignment-expression[opt] + /// \endverbatim + ExprResult ParseCoyieldExpression(); + + //===--------------------------------------------------------------------===// + // C++ Concepts + + /// ParseRequiresExpression - Parse a C++2a requires-expression. + /// C++2a [expr.prim.req]p1 + /// A requires-expression provides a concise way to express requirements + /// on template arguments. A requirement is one that can be checked by + /// name lookup (6.4) or by checking properties of types and expressions. + /// + /// \verbatim + /// requires-expression: + /// 'requires' requirement-parameter-list[opt] requirement-body + /// + /// requirement-parameter-list: + /// '(' parameter-declaration-clause[opt] ')' + /// + /// requirement-body: + /// '{' requirement-seq '}' + /// + /// requirement-seq: + /// requirement + /// requirement-seq requirement + /// + /// requirement: + /// simple-requirement + /// type-requirement + /// compound-requirement + /// nested-requirement + /// \endverbatim + ExprResult ParseRequiresExpression(); + + /// isTypeIdInParens - Assumes that a '(' was parsed and now we want to know + /// whether the parens contain an expression or a type-id. + /// Returns true for a type-id and false for an expression. 
+ bool isTypeIdInParens(bool &isAmbiguous) { + if (getLangOpts().CPlusPlus) + return isCXXTypeId(TentativeCXXTypeIdContext::InParens, isAmbiguous); + isAmbiguous = false; + return isTypeSpecifierQualifier(); + } + bool isTypeIdInParens() { + bool isAmbiguous; + return isTypeIdInParens(isAmbiguous); + } + + /// Finish parsing a C++ unqualified-id that is a template-id of + /// some form. + /// + /// This routine is invoked when a '<' is encountered after an identifier or + /// operator-function-id is parsed by \c ParseUnqualifiedId() to determine + /// whether the unqualified-id is actually a template-id. This routine will + /// then parse the template arguments and form the appropriate template-id to + /// return to the caller. + /// + /// \param SS the nested-name-specifier that precedes this template-id, if + /// we're actually parsing a qualified-id. + /// + /// \param ObjectType if this unqualified-id occurs within a member access + /// expression, the type of the base object whose member is being accessed. + /// + /// \param ObjectHadErrors if this unqualified-id occurs within a member access + /// expression, indicates whether the original subexpressions had any errors. + /// + /// \param Name for constructor and destructor names, this is the actual + /// identifier that may be a template-name. + /// + /// \param NameLoc the location of the class-name in a constructor or + /// destructor. + /// + /// \param EnteringContext whether we're entering the scope of the + /// nested-name-specifier. + /// + /// \param Id as input, describes the template-name or operator-function-id + /// that precedes the '<'. If template arguments were parsed successfully, + /// will be updated with the template-id. + /// + /// \param AssumeTemplateId When true, this routine will assume that the name + /// refers to a template without performing name lookup to verify. + /// + /// \returns true if a parse error occurred, false otherwise. 
+ bool ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS, ParsedType ObjectType, + bool ObjectHadErrors, + SourceLocation TemplateKWLoc, + IdentifierInfo *Name, + SourceLocation NameLoc, + bool EnteringContext, UnqualifiedId &Id, + bool AssumeTemplateId); + + /// Parse an operator-function-id or conversion-function-id as part + /// of a C++ unqualified-id. + /// + /// This routine is responsible only for parsing the operator-function-id or + /// conversion-function-id; it does not handle template arguments in any way. + /// + /// \verbatim + /// operator-function-id: [C++ 13.5] + /// 'operator' operator + /// + /// operator: one of + /// new delete new[] delete[] + /// + - * / % ^ & | ~ + /// ! = < > += -= *= /= %= + /// ^= &= |= << >> >>= <<= == != + /// <= >= && || ++ -- , ->* -> + /// () [] <=> + /// + /// conversion-function-id: [C++ 12.3.2] + /// operator conversion-type-id + /// + /// conversion-type-id: + /// type-specifier-seq conversion-declarator[opt] + /// + /// conversion-declarator: + /// ptr-operator conversion-declarator[opt] + /// \endverbatim + /// + /// \param SS The nested-name-specifier that preceded this unqualified-id. If + /// non-empty, then we are parsing the unqualified-id of a qualified-id. + /// + /// \param EnteringContext whether we are entering the scope of the + /// nested-name-specifier. + /// + /// \param ObjectType if this unqualified-id occurs within a member access + /// expression, the type of the base object whose member is being accessed. + /// + /// \param Result on a successful parse, contains the parsed unqualified-id. + /// + /// \returns true if parsing fails, false otherwise. 
+ bool ParseUnqualifiedIdOperator(CXXScopeSpec &SS, bool EnteringContext, + ParsedType ObjectType, UnqualifiedId &Result); + + //===--------------------------------------------------------------------===// + // C++11/G++: Type Traits [Type-Traits.html in the GCC manual] + + /// Parse the built-in type-trait pseudo-functions that allow + /// implementation of the TR1/C++11 type traits templates. + /// + /// \verbatim + /// primary-expression: + /// unary-type-trait '(' type-id ')' + /// binary-type-trait '(' type-id ',' type-id ')' + /// type-trait '(' type-id-seq ')' + /// + /// type-id-seq: + /// type-id ...[opt] type-id-seq[opt] + /// \endverbatim + /// + ExprResult ParseTypeTrait(); + + //===--------------------------------------------------------------------===// + // Embarcadero: Array and Expression Traits + + /// ParseArrayTypeTrait - Parse the built-in array type-trait + /// pseudo-functions. + /// + /// \verbatim + /// primary-expression: + /// [Embarcadero] '__array_rank' '(' type-id ')' + /// [Embarcadero] '__array_extent' '(' type-id ',' expression ')' + /// \endverbatim + /// + ExprResult ParseArrayTypeTrait(); + + /// ParseExpressionTrait - Parse built-in expression-trait + /// pseudo-functions like __is_lvalue_expr( xxx ). 
+ /// + /// \verbatim + /// primary-expression: + /// [Embarcadero] expression-trait '(' expression ')' + /// \endverbatim + /// + ExprResult ParseExpressionTrait(); + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name HLSL Constructs + /// Implementations are in ParseHLSL.cpp + ///@{ + +private: + bool MaybeParseHLSLAnnotations(Declarator &D, + SourceLocation *EndLoc = nullptr, + bool CouldBeBitField = false) { + assert(getLangOpts().HLSL && "MaybeParseHLSLAnnotations is for HLSL only"); + if (Tok.is(tok::colon)) { + ParsedAttributes Attrs(AttrFactory); + ParseHLSLAnnotations(Attrs, EndLoc, CouldBeBitField); + D.takeAttributes(Attrs); + return true; + } + return false; + } + + void MaybeParseHLSLAnnotations(ParsedAttributes &Attrs, + SourceLocation *EndLoc = nullptr) { + assert(getLangOpts().HLSL && "MaybeParseHLSLAnnotations is for HLSL only"); + if (Tok.is(tok::colon)) + ParseHLSLAnnotations(Attrs, EndLoc); + } + + void ParseHLSLAnnotations(ParsedAttributes &Attrs, + SourceLocation *EndLoc = nullptr, + bool CouldBeBitField = false); + Decl *ParseHLSLBuffer(SourceLocation &DeclEnd); + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name Initializers + /// Implementations are in ParseInit.cpp + ///@{ + +private: + //===--------------------------------------------------------------------===// + // C99 6.7.8: Initialization. + + /// ParseInitializer + /// \verbatim + /// initializer: [C99 6.7.8] + /// assignment-expression + /// '{' ... + /// \endverbatim + ExprResult ParseInitializer() { + if (Tok.isNot(tok::l_brace)) + return ParseAssignmentExpression(); + return ParseBraceInitializer(); + } + + /// MayBeDesignationStart - Return true if the current token might be the + /// start of a designator. If we can tell it is impossible that it is a + /// designator, return false. 
+ bool MayBeDesignationStart(); + + /// ParseBraceInitializer - Called when parsing an initializer that has a + /// leading open brace. + /// + /// \verbatim + /// initializer: [C99 6.7.8] + /// '{' initializer-list '}' + /// '{' initializer-list ',' '}' + /// [C23] '{' '}' + /// + /// initializer-list: + /// designation[opt] initializer ...[opt] + /// initializer-list ',' designation[opt] initializer ...[opt] + /// \endverbatim + /// + ExprResult ParseBraceInitializer(); + + struct DesignatorCompletionInfo { + SmallVectorImpl &InitExprs; + QualType PreferredBaseType; + }; + + /// ParseInitializerWithPotentialDesignator - Parse the 'initializer' + /// production checking to see if the token stream starts with a designator. + /// + /// C99: + /// + /// \verbatim + /// designation: + /// designator-list '=' + /// [GNU] array-designator + /// [GNU] identifier ':' + /// + /// designator-list: + /// designator + /// designator-list designator + /// + /// designator: + /// array-designator + /// '.' identifier + /// + /// array-designator: + /// '[' constant-expression ']' + /// [GNU] '[' constant-expression '...' constant-expression ']' + /// \endverbatim + /// + /// C++20: + /// + /// \verbatim + /// designated-initializer-list: + /// designated-initializer-clause + /// designated-initializer-list ',' designated-initializer-clause + /// + /// designated-initializer-clause: + /// designator brace-or-equal-initializer + /// + /// designator: + /// '.' identifier + /// \endverbatim + /// + /// We allow the C99 syntax extensions in C++20, but do not allow the C++20 + /// extension (a braced-init-list after the designator with no '=') in C99. + /// + /// NOTE: [OBC] allows '[ objc-receiver objc-message-args ]' as an + /// initializer (because it is an expression). We need to consider this case + /// when parsing array designators. + /// + /// \p CodeCompleteCB is called with Designation parsed so far. 
+ ExprResult ParseInitializerWithPotentialDesignator(DesignatorCompletionInfo); + + ExprResult createEmbedExpr(); + + /// A SmallVector of expressions. + typedef SmallVector ExprVector; + + // Return true if a comma (or closing brace) is necessary after the + // __if_exists/if_not_exists statement. + bool ParseMicrosoftIfExistsBraceInitializer(ExprVector &InitExprs, + bool &InitExprsOk); + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name Objective-C Constructs + /// Implementations are in ParseObjc.cpp + ///@{ + +public: + friend class InMessageExpressionRAIIObject; + friend class ObjCDeclContextSwitch; + + ObjCContainerDecl *getObjCDeclContext() const { + return Actions.ObjC().getObjCDeclContext(); + } + + /// Retrieve the underscored keyword (_Nonnull, _Nullable) that corresponds + /// to the given nullability kind. + IdentifierInfo *getNullabilityKeyword(NullabilityKind nullability) { + return Actions.getNullabilityKeyword(nullability); + } + +private: + /// Objective-C contextual keywords. + IdentifierInfo *Ident_instancetype; + + /// Ident_super - IdentifierInfo for "super", to support fast + /// comparison. + IdentifierInfo *Ident_super; + + /// When true, we are directly inside an Objective-C message + /// send expression. + /// + /// This is managed by the \c InMessageExpressionRAIIObject class, and + /// should not be set directly. + bool InMessageExpression; + + /// True if we are within an Objective-C container while parsing C-like decls. + /// + /// This is necessary because Sema thinks we have left the container + /// to parse the C-like decls, meaning Actions.ObjC().getObjCDeclContext() + /// will be NULL. + bool ParsingInObjCContainer; + + /// Returns true if the current token is the identifier 'instancetype'. + /// + /// Should only be used in Objective-C language modes. 
+ bool isObjCInstancetype() { + assert(getLangOpts().ObjC); + if (Tok.isAnnotation()) + return false; + if (!Ident_instancetype) + Ident_instancetype = PP.getIdentifierInfo("instancetype"); + return Tok.getIdentifierInfo() == Ident_instancetype; + } + + /// ObjCDeclContextSwitch - An object used to switch context from + /// an objective-c decl context to its enclosing decl context and + /// back. + class ObjCDeclContextSwitch { + Parser &P; + ObjCContainerDecl *DC; + SaveAndRestore WithinObjCContainer; + + public: + explicit ObjCDeclContextSwitch(Parser &p) + : P(p), DC(p.getObjCDeclContext()), + WithinObjCContainer(P.ParsingInObjCContainer, DC != nullptr) { + if (DC) + P.Actions.ObjC().ActOnObjCTemporaryExitContainerContext(DC); + } + ~ObjCDeclContextSwitch() { + if (DC) + P.Actions.ObjC().ActOnObjCReenterContainerContext(DC); + } + }; + + void CheckNestedObjCContexts(SourceLocation AtLoc); + + void ParseLexedObjCMethodDefs(LexedMethod &LM, bool parseMethod); + + // Objective-C External Declarations + + /// Skips attributes after an Objective-C @ directive. Emits a diagnostic. 
+ void MaybeSkipAttributes(tok::ObjCKeywordKind Kind); + + /// ParseObjCAtDirectives - Handle parts of the external-declaration + /// production: + /// \verbatim + /// external-declaration: [C99 6.9] + /// [OBJC] objc-class-definition + /// [OBJC] objc-class-declaration + /// [OBJC] objc-alias-declaration + /// [OBJC] objc-protocol-definition + /// [OBJC] objc-method-definition + /// [OBJC] '@' 'end' + /// \endverbatim + DeclGroupPtrTy ParseObjCAtDirectives(ParsedAttributes &DeclAttrs, + ParsedAttributes &DeclSpecAttrs); + + /// + /// \verbatim + /// objc-class-declaration: + /// '@' 'class' objc-class-forward-decl (',' objc-class-forward-decl)* ';' + /// + /// objc-class-forward-decl: + /// identifier objc-type-parameter-list[opt] + /// \endverbatim + /// + DeclGroupPtrTy ParseObjCAtClassDeclaration(SourceLocation atLoc); + + /// + /// \verbatim + /// objc-interface: + /// objc-class-interface-attributes[opt] objc-class-interface + /// objc-category-interface + /// + /// objc-class-interface: + /// '@' 'interface' identifier objc-type-parameter-list[opt] + /// objc-superclass[opt] objc-protocol-refs[opt] + /// objc-class-instance-variables[opt] + /// objc-interface-decl-list + /// @end + /// + /// objc-category-interface: + /// '@' 'interface' identifier objc-type-parameter-list[opt] + /// '(' identifier[opt] ')' objc-protocol-refs[opt] + /// objc-interface-decl-list + /// @end + /// + /// objc-superclass: + /// ':' identifier objc-type-arguments[opt] + /// + /// objc-class-interface-attributes: + /// __attribute__((visibility("default"))) + /// __attribute__((visibility("hidden"))) + /// __attribute__((deprecated)) + /// __attribute__((unavailable)) + /// __attribute__((objc_exception)) - used by NSException on 64-bit + /// __attribute__((objc_root_class)) + /// \endverbatim + /// + Decl *ParseObjCAtInterfaceDeclaration(SourceLocation AtLoc, + ParsedAttributes &prefixAttrs); + + /// Class to handle popping type parameters when leaving the scope. 
+ class ObjCTypeParamListScope; + + /// Parse an objc-type-parameter-list. + ObjCTypeParamList *parseObjCTypeParamList(); + + /// Parse an Objective-C type parameter list, if present, or capture + /// the locations of the protocol identifiers for a list of protocol + /// references. + /// + /// \verbatim + /// objc-type-parameter-list: + /// '<' objc-type-parameter (',' objc-type-parameter)* '>' + /// + /// objc-type-parameter: + /// objc-type-parameter-variance? identifier objc-type-parameter-bound[opt] + /// + /// objc-type-parameter-bound: + /// ':' type-name + /// + /// objc-type-parameter-variance: + /// '__covariant' + /// '__contravariant' + /// \endverbatim + /// + /// \param lAngleLoc The location of the starting '<'. + /// + /// \param protocolIdents Will capture the list of identifiers, if the + /// angle brackets contain a list of protocol references rather than a + /// type parameter list. + /// + /// \param rAngleLoc The location of the ending '>'. + ObjCTypeParamList *parseObjCTypeParamListOrProtocolRefs( + ObjCTypeParamListScope &Scope, SourceLocation &lAngleLoc, + SmallVectorImpl &protocolIdents, SourceLocation &rAngleLoc, + bool mayBeProtocolList = true); + + void HelperActionsForIvarDeclarations(ObjCContainerDecl *interfaceDecl, + SourceLocation atLoc, + BalancedDelimiterTracker &T, + SmallVectorImpl &AllIvarDecls, + bool RBraceMissing); + + /// \verbatim + /// objc-class-instance-variables: + /// '{' objc-instance-variable-decl-list[opt] '}' + /// + /// objc-instance-variable-decl-list: + /// objc-visibility-spec + /// objc-instance-variable-decl ';' + /// ';' + /// objc-instance-variable-decl-list objc-visibility-spec + /// objc-instance-variable-decl-list objc-instance-variable-decl ';' + /// objc-instance-variable-decl-list static_assert-declaration + /// objc-instance-variable-decl-list ';' + /// + /// objc-visibility-spec: + /// @private + /// @protected + /// @public + /// @package [OBJC2] + /// + /// objc-instance-variable-decl: + /// 
struct-declaration + /// \endverbatim + /// + void ParseObjCClassInstanceVariables(ObjCContainerDecl *interfaceDecl, + tok::ObjCKeywordKind visibility, + SourceLocation atLoc); + + /// \verbatim + /// objc-protocol-refs: + /// '<' identifier-list '>' + /// \endverbatim + /// + bool ParseObjCProtocolReferences( + SmallVectorImpl &P, SmallVectorImpl &PLocs, + bool WarnOnDeclarations, bool ForObjCContainer, SourceLocation &LAngleLoc, + SourceLocation &EndProtoLoc, bool consumeLastToken); + + /// Parse the first angle-bracket-delimited clause for an + /// Objective-C object or object pointer type, which may be either + /// type arguments or protocol qualifiers. + /// + /// \verbatim + /// objc-type-arguments: + /// '<' type-name '...'[opt] (',' type-name '...'[opt])* '>' + /// \endverbatim + /// + void parseObjCTypeArgsOrProtocolQualifiers( + ParsedType baseType, SourceLocation &typeArgsLAngleLoc, + SmallVectorImpl &typeArgs, SourceLocation &typeArgsRAngleLoc, + SourceLocation &protocolLAngleLoc, SmallVectorImpl &protocols, + SmallVectorImpl &protocolLocs, + SourceLocation &protocolRAngleLoc, bool consumeLastToken, + bool warnOnIncompleteProtocols); + + /// Parse either Objective-C type arguments or protocol qualifiers; if the + /// former, also parse protocol qualifiers afterward. + void parseObjCTypeArgsAndProtocolQualifiers( + ParsedType baseType, SourceLocation &typeArgsLAngleLoc, + SmallVectorImpl &typeArgs, SourceLocation &typeArgsRAngleLoc, + SourceLocation &protocolLAngleLoc, SmallVectorImpl &protocols, + SmallVectorImpl &protocolLocs, + SourceLocation &protocolRAngleLoc, bool consumeLastToken); + + /// Parse a protocol qualifier type such as '', which is + /// an anachronistic way of writing 'id'. + TypeResult parseObjCProtocolQualifierType(SourceLocation &rAngleLoc); + + /// Parse Objective-C type arguments and protocol qualifiers, extending the + /// current type with the parsed result. 
+ TypeResult parseObjCTypeArgsAndProtocolQualifiers(SourceLocation loc, + ParsedType type, + bool consumeLastToken, + SourceLocation &endLoc); + + /// \verbatim + /// objc-interface-decl-list: + /// empty + /// objc-interface-decl-list objc-property-decl [OBJC2] + /// objc-interface-decl-list objc-method-requirement [OBJC2] + /// objc-interface-decl-list objc-method-proto ';' + /// objc-interface-decl-list declaration + /// objc-interface-decl-list ';' + /// + /// objc-method-requirement: [OBJC2] + /// @required + /// @optional + /// \endverbatim + /// + void ParseObjCInterfaceDeclList(tok::ObjCKeywordKind contextKey, Decl *CDecl); + + /// \verbatim + /// objc-protocol-declaration: + /// objc-protocol-definition + /// objc-protocol-forward-reference + /// + /// objc-protocol-definition: + /// \@protocol identifier + /// objc-protocol-refs[opt] + /// objc-interface-decl-list + /// \@end + /// + /// objc-protocol-forward-reference: + /// \@protocol identifier-list ';' + /// \endverbatim + /// + /// "\@protocol identifier ;" should be resolved as "\@protocol + /// identifier-list ;": objc-interface-decl-list may not start with a + /// semicolon in the first alternative if objc-protocol-refs are omitted. + DeclGroupPtrTy ParseObjCAtProtocolDeclaration(SourceLocation atLoc, + ParsedAttributes &prefixAttrs); + + struct ObjCImplParsingDataRAII { + Parser &P; + Decl *Dcl; + bool HasCFunction; + typedef SmallVector LateParsedObjCMethodContainer; + LateParsedObjCMethodContainer LateParsedObjCMethods; + + ObjCImplParsingDataRAII(Parser &parser, Decl *D) + : P(parser), Dcl(D), HasCFunction(false) { + P.CurParsedObjCImpl = this; + Finished = false; + } + ~ObjCImplParsingDataRAII(); + + void finish(SourceRange AtEnd); + bool isFinished() const { return Finished; } + + private: + bool Finished; + }; + ObjCImplParsingDataRAII *CurParsedObjCImpl; + + /// StashAwayMethodOrFunctionBodyTokens - Consume the tokens and store them + /// for later parsing. 
+ void StashAwayMethodOrFunctionBodyTokens(Decl *MDecl); + + /// \verbatim + /// objc-implementation: + /// objc-class-implementation-prologue + /// objc-category-implementation-prologue + /// + /// objc-class-implementation-prologue: + /// @implementation identifier objc-superclass[opt] + /// objc-class-instance-variables[opt] + /// + /// objc-category-implementation-prologue: + /// @implementation identifier ( identifier ) + /// \endverbatim + DeclGroupPtrTy ParseObjCAtImplementationDeclaration(SourceLocation AtLoc, + ParsedAttributes &Attrs); + DeclGroupPtrTy ParseObjCAtEndDeclaration(SourceRange atEnd); + + /// \verbatim + /// compatibility-alias-decl: + /// @compatibility_alias alias-name class-name ';' + /// \endverbatim + /// + Decl *ParseObjCAtAliasDeclaration(SourceLocation atLoc); + + /// \verbatim + /// property-synthesis: + /// @synthesize property-ivar-list ';' + /// + /// property-ivar-list: + /// property-ivar + /// property-ivar-list ',' property-ivar + /// + /// property-ivar: + /// identifier + /// identifier '=' identifier + /// \endverbatim + /// + Decl *ParseObjCPropertySynthesize(SourceLocation atLoc); + + /// \verbatim + /// property-dynamic: + /// @dynamic property-list + /// + /// property-list: + /// identifier + /// property-list ',' identifier + /// \endverbatim + /// + Decl *ParseObjCPropertyDynamic(SourceLocation atLoc); + + /// \verbatim + /// objc-selector: + /// identifier + /// one of + /// enum struct union if else while do for switch case default + /// break continue return goto asm sizeof typeof __alignof + /// unsigned long const short volatile signed restrict _Complex + /// in out inout bycopy byref oneway int char float double void _Bool + /// \endverbatim + /// + IdentifierInfo *ParseObjCSelectorPiece(SourceLocation &MethodLocation); + + IdentifierInfo *ObjCTypeQuals[llvm::to_underlying(ObjCTypeQual::NumQuals)]; + + /// \verbatim + /// objc-for-collection-in: 'in' + /// \endverbatim + /// + bool isTokIdentifier_in() const; + 
+ /// \verbatim + /// objc-type-name: + /// '(' objc-type-qualifiers[opt] type-name ')' + /// '(' objc-type-qualifiers[opt] ')' + /// \endverbatim + /// + ParsedType ParseObjCTypeName(ObjCDeclSpec &DS, DeclaratorContext Ctx, + ParsedAttributes *ParamAttrs); + + /// \verbatim + /// objc-method-proto: + /// objc-instance-method objc-method-decl objc-method-attributes[opt] + /// objc-class-method objc-method-decl objc-method-attributes[opt] + /// + /// objc-instance-method: '-' + /// objc-class-method: '+' + /// + /// objc-method-attributes: [OBJC2] + /// __attribute__((deprecated)) + /// \endverbatim + /// + Decl *ParseObjCMethodPrototype( + tok::ObjCKeywordKind MethodImplKind = tok::objc_not_keyword, + bool MethodDefinition = true); + + /// \verbatim + /// objc-method-decl: + /// objc-selector + /// objc-keyword-selector objc-parmlist[opt] + /// objc-type-name objc-selector + /// objc-type-name objc-keyword-selector objc-parmlist[opt] + /// + /// objc-keyword-selector: + /// objc-keyword-decl + /// objc-keyword-selector objc-keyword-decl + /// + /// objc-keyword-decl: + /// objc-selector ':' objc-type-name objc-keyword-attributes[opt] identifier + /// objc-selector ':' objc-keyword-attributes[opt] identifier + /// ':' objc-type-name objc-keyword-attributes[opt] identifier + /// ':' objc-keyword-attributes[opt] identifier + /// + /// objc-parmlist: + /// objc-parms objc-ellipsis[opt] + /// + /// objc-parms: + /// objc-parms , parameter-declaration + /// + /// objc-ellipsis: + /// , ... + /// + /// objc-keyword-attributes: [OBJC2] + /// __attribute__((unused)) + /// \endverbatim + /// + Decl *ParseObjCMethodDecl( + SourceLocation mLoc, tok::TokenKind mType, + tok::ObjCKeywordKind MethodImplKind = tok::objc_not_keyword, + bool MethodDefinition = true); + + /// Parse property attribute declarations. 
+ /// + /// \verbatim + /// property-attr-decl: '(' property-attrlist ')' + /// property-attrlist: + /// property-attribute + /// property-attrlist ',' property-attribute + /// property-attribute: + /// getter '=' identifier + /// setter '=' identifier ':' + /// direct + /// readonly + /// readwrite + /// assign + /// retain + /// copy + /// nonatomic + /// atomic + /// strong + /// weak + /// unsafe_unretained + /// nonnull + /// nullable + /// null_unspecified + /// null_resettable + /// class + /// \endverbatim + /// + void ParseObjCPropertyAttribute(ObjCDeclSpec &DS); + + /// \verbatim + /// objc-method-def: objc-method-proto ';'[opt] '{' body '}' + /// \endverbatim + /// + Decl *ParseObjCMethodDefinition(); + + //===--------------------------------------------------------------------===// + // Objective-C Expressions + ExprResult ParseObjCAtExpression(SourceLocation AtLocation); + ExprResult ParseObjCStringLiteral(SourceLocation AtLoc); + + /// ParseObjCCharacterLiteral - + /// \verbatim + /// objc-scalar-literal : '@' character-literal + /// ; + /// \endverbatim + ExprResult ParseObjCCharacterLiteral(SourceLocation AtLoc); + + /// ParseObjCNumericLiteral - + /// \verbatim + /// objc-scalar-literal : '@' scalar-literal + /// ; + /// scalar-literal : | numeric-constant /* any numeric constant. 
*/ + /// ; + /// \endverbatim + ExprResult ParseObjCNumericLiteral(SourceLocation AtLoc); + + /// ParseObjCBooleanLiteral - + /// \verbatim + /// objc-scalar-literal : '@' boolean-keyword + /// ; + /// boolean-keyword: 'true' | 'false' | '__objc_yes' | '__objc_no' + /// ; + /// \endverbatim + ExprResult ParseObjCBooleanLiteral(SourceLocation AtLoc, bool ArgValue); + + ExprResult ParseObjCArrayLiteral(SourceLocation AtLoc); + ExprResult ParseObjCDictionaryLiteral(SourceLocation AtLoc); + + /// ParseObjCBoxedExpr - + /// \verbatim + /// objc-box-expression: + /// @( assignment-expression ) + /// \endverbatim + ExprResult ParseObjCBoxedExpr(SourceLocation AtLoc); + + /// \verbatim + /// objc-encode-expression: + /// \@encode ( type-name ) + /// \endverbatim + ExprResult ParseObjCEncodeExpression(SourceLocation AtLoc); + + /// \verbatim + /// objc-selector-expression + /// @selector '(' '('[opt] objc-keyword-selector ')'[opt] ')' + /// \endverbatim + ExprResult ParseObjCSelectorExpression(SourceLocation AtLoc); + + /// \verbatim + /// objc-protocol-expression + /// \@protocol ( protocol-name ) + /// \endverbatim + ExprResult ParseObjCProtocolExpression(SourceLocation AtLoc); + + /// Determine whether the parser is currently referring to a an + /// Objective-C message send, using a simplified heuristic to avoid overhead. + /// + /// This routine will only return true for a subset of valid message-send + /// expressions. + bool isSimpleObjCMessageExpression(); + + /// \verbatim + /// objc-message-expr: + /// '[' objc-receiver objc-message-args ']' + /// + /// objc-receiver: [C] + /// 'super' + /// expression + /// class-name + /// type-name + /// \endverbatim + /// + ExprResult ParseObjCMessageExpression(); + + /// Parse the remainder of an Objective-C message following the + /// '[' objc-receiver. 
+ /// + /// This routine handles sends to super, class messages (sent to a + /// class name), and instance messages (sent to an object), and the + /// target is represented by \p SuperLoc, \p ReceiverType, or \p + /// ReceiverExpr, respectively. Only one of these parameters may have + /// a valid value. + /// + /// \param LBracLoc The location of the opening '['. + /// + /// \param SuperLoc If this is a send to 'super', the location of the + /// 'super' keyword that indicates a send to the superclass. + /// + /// \param ReceiverType If this is a class message, the type of the + /// class we are sending a message to. + /// + /// \param ReceiverExpr If this is an instance message, the expression + /// used to compute the receiver object. + /// + /// \verbatim + /// objc-message-args: + /// objc-selector + /// objc-keywordarg-list + /// + /// objc-keywordarg-list: + /// objc-keywordarg + /// objc-keywordarg-list objc-keywordarg + /// + /// objc-keywordarg: + /// selector-name[opt] ':' objc-keywordexpr + /// + /// objc-keywordexpr: + /// nonempty-expr-list + /// + /// nonempty-expr-list: + /// assignment-expression + /// nonempty-expr-list , assignment-expression + /// \endverbatim + /// + ExprResult ParseObjCMessageExpressionBody(SourceLocation LBracloc, + SourceLocation SuperLoc, + ParsedType ReceiverType, + Expr *ReceiverExpr); + + /// Parse the receiver of an Objective-C++ message send. + /// + /// This routine parses the receiver of a message send in + /// Objective-C++ either as a type or as an expression. Note that this + /// routine must not be called to parse a send to 'super', since it + /// has no way to return such a result. + /// + /// \param IsExpr Whether the receiver was parsed as an expression. + /// + /// \param TypeOrExpr If the receiver was parsed as an expression (\c + /// IsExpr is true), the parsed expression. If the receiver was parsed + /// as a type (\c IsExpr is false), the parsed type. 
+ /// + /// \returns True if an error occurred during parsing or semantic + /// analysis, in which case the arguments do not have valid + /// values. Otherwise, returns false for a successful parse. + /// + /// \verbatim + /// objc-receiver: [C++] + /// 'super' [not parsed here] + /// expression + /// simple-type-specifier + /// typename-specifier + /// \endverbatim + bool ParseObjCXXMessageReceiver(bool &IsExpr, void *&TypeOrExpr); + + //===--------------------------------------------------------------------===// + // Objective-C Statements + + enum class ParsedStmtContext; + + StmtResult ParseObjCAtStatement(SourceLocation atLoc, + ParsedStmtContext StmtCtx); + + /// \verbatim + /// objc-try-catch-statement: + /// @try compound-statement objc-catch-list[opt] + /// @try compound-statement objc-catch-list[opt] @finally compound-statement + /// + /// objc-catch-list: + /// @catch ( parameter-declaration ) compound-statement + /// objc-catch-list @catch ( catch-parameter-declaration ) compound-statement + /// catch-parameter-declaration: + /// parameter-declaration + /// '...' [OBJC2] + /// \endverbatim + /// + StmtResult ParseObjCTryStmt(SourceLocation atLoc); + + /// \verbatim + /// objc-throw-statement: + /// throw expression[opt]; + /// \endverbatim + /// + StmtResult ParseObjCThrowStmt(SourceLocation atLoc); + + /// \verbatim + /// objc-synchronized-statement: + /// @synchronized '(' expression ')' compound-statement + /// \endverbatim + /// + StmtResult ParseObjCSynchronizedStmt(SourceLocation atLoc); + + /// \verbatim + /// objc-autoreleasepool-statement: + /// @autoreleasepool compound-statement + /// \endverbatim + /// + StmtResult ParseObjCAutoreleasePoolStmt(SourceLocation atLoc); + + /// ParseObjCTypeQualifierList - This routine parses the objective-c's type + /// qualifier list and builds their bitmask representation in the input + /// argument. 
+ /// + /// \verbatim + /// objc-type-qualifiers: + /// objc-type-qualifier + /// objc-type-qualifiers objc-type-qualifier + /// + /// objc-type-qualifier: + /// 'in' + /// 'out' + /// 'inout' + /// 'oneway' + /// 'bycopy's + /// 'byref' + /// 'nonnull' + /// 'nullable' + /// 'null_unspecified' + /// \endverbatim + /// + void ParseObjCTypeQualifierList(ObjCDeclSpec &DS, DeclaratorContext Context); + + /// Determine whether we are currently at the start of an Objective-C + /// class message that appears to be missing the open bracket '['. + bool isStartOfObjCClassMessageMissingOpenBracket(); + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name OpenACC Constructs + /// Implementations are in ParseOpenACC.cpp + ///@{ + +public: + friend class ParsingOpenACCDirectiveRAII; + + /// Parse OpenACC directive on a declaration. + /// + /// Placeholder for now, should just ignore the directives after emitting a + /// diagnostic. Eventually will be split into a few functions to parse + /// different situations. + DeclGroupPtrTy ParseOpenACCDirectiveDecl(AccessSpecifier &AS, + ParsedAttributes &Attrs, + DeclSpec::TST TagType, + Decl *TagDecl); + + // Parse OpenACC Directive on a Statement. + StmtResult ParseOpenACCDirectiveStmt(); + +private: + /// Parsing OpenACC directive mode. + bool OpenACCDirectiveParsing = false; + + /// Currently parsing a situation where an OpenACC array section could be + /// legal, such as a 'var-list'. + bool AllowOpenACCArraySections = false; + + /// RAII object to set reset OpenACC parsing a context where Array Sections + /// are allowed. 
+ class OpenACCArraySectionRAII { + Parser &P; + + public: + OpenACCArraySectionRAII(Parser &P) : P(P) { + assert(!P.AllowOpenACCArraySections); + P.AllowOpenACCArraySections = true; + } + ~OpenACCArraySectionRAII() { + assert(P.AllowOpenACCArraySections); + P.AllowOpenACCArraySections = false; + } + }; + + /// A struct to hold the information that got parsed by ParseOpenACCDirective, + /// so that the callers of it can use that to construct the appropriate AST + /// nodes. + struct OpenACCDirectiveParseInfo { + OpenACCDirectiveKind DirKind; + SourceLocation StartLoc; + SourceLocation DirLoc; + SourceLocation LParenLoc; + SourceLocation RParenLoc; + SourceLocation EndLoc; + SourceLocation MiscLoc; + OpenACCAtomicKind AtomicKind; + SmallVector Exprs; + SmallVector Clauses; + // TODO OpenACC: As we implement support for the Atomic, Routine, and Cache + // constructs, we likely want to put that information in here as well. + }; + + struct OpenACCWaitParseInfo { + bool Failed = false; + Expr *DevNumExpr = nullptr; + SourceLocation QueuesLoc; + SmallVector QueueIdExprs; + + SmallVector getAllExprs() { + SmallVector Out; + Out.push_back(DevNumExpr); + llvm::append_range(Out, QueueIdExprs); + return Out; + } + }; + struct OpenACCCacheParseInfo { + bool Failed = false; + SourceLocation ReadOnlyLoc; + SmallVector Vars; + }; + + /// Represents the 'error' state of parsing an OpenACC Clause, and stores + /// whether we can continue parsing, or should give up on the directive. + enum class OpenACCParseCanContinue { Cannot = 0, Can = 1 }; + + /// A type to represent the state of parsing an OpenACC Clause. Situations + /// that result in an OpenACCClause pointer are a success and can continue + /// parsing, however some other situations can also continue. + /// FIXME: This is better represented as a std::expected when we get C++23. 
+ using OpenACCClauseParseResult = + llvm::PointerIntPair; + + OpenACCClauseParseResult OpenACCCanContinue(); + OpenACCClauseParseResult OpenACCCannotContinue(); + OpenACCClauseParseResult OpenACCSuccess(OpenACCClause *Clause); + + /// Parses the OpenACC directive (the entire pragma) including the clause + /// list, but does not produce the main AST node. + OpenACCDirectiveParseInfo ParseOpenACCDirective(); + /// Helper that parses an ID Expression based on the language options. + ExprResult ParseOpenACCIDExpression(); + + /// Parses the variable list for the `cache` construct. + /// + /// OpenACC 3.3, section 2.10: + /// In C and C++, the syntax of the cache directive is: + /// + /// #pragma acc cache ([readonly:]var-list) new-line + OpenACCCacheParseInfo ParseOpenACCCacheVarList(); + + /// Tries to parse the 'modifier-list' for a 'copy', 'copyin', 'copyout', or + /// 'create' clause. + OpenACCModifierKind tryParseModifierList(OpenACCClauseKind CK); + + using OpenACCVarParseResult = std::pair; + + /// Parses a single variable in a variable list for OpenACC. + /// + /// OpenACC 3.3, section 1.6: + /// In this spec, a 'var' (in italics) is one of the following: + /// - a variable name (a scalar, array, or composite variable name) + /// - a subarray specification with subscript ranges + /// - an array element + /// - a member of a composite variable + /// - a common block name between slashes (fortran only) + OpenACCVarParseResult ParseOpenACCVar(OpenACCDirectiveKind DK, + OpenACCClauseKind CK); + + /// Parses the variable list for the variety of places that take a var-list. + llvm::SmallVector ParseOpenACCVarList(OpenACCDirectiveKind DK, + OpenACCClauseKind CK); + + /// Parses any parameters for an OpenACC Clause, including required/optional + /// parens. + /// + /// The OpenACC Clause List is a comma or space-delimited list of clauses (see + /// the comment on ParseOpenACCClauseList). 
The concept of a 'clause' doesn't + /// really have its owner grammar and each individual one has its own + /// definition. However, they all are named with a single-identifier (or + /// auto/default!) token, followed in some cases by either braces or parens. OpenACCClauseParseResult ParseOpenACCClauseParams(ArrayRef ExistingClauses, OpenACCDirectiveKind DirKind, OpenACCClauseKind Kind, SourceLocation ClauseLoc); + /// Parses a single clause in a clause-list for OpenACC. Returns nullptr on /// error. OpenACCClauseParseResult ParseOpenACCClause(ArrayRef ExistingClauses, OpenACCDirectiveKind DirKind); + /// Parses the clause-list for an OpenACC directive. + /// + /// OpenACC 3.3, section 1.7: + /// To simplify the specification and convey appropriate constraint + /// information, a pqr-list is a comma-separated list of pdr items. The one + /// exception is a clause-list, which is a list of one or more clauses + /// optionally separated by commas. SmallVector ParseOpenACCClauseList(OpenACCDirectiveKind DirKind); + + /// OpenACC 3.3, section 2.16: + /// In this section and throughout the specification, the term wait-argument + /// means: + /// \verbatim + /// [ devnum : int-expr : ] [ queues : ] async-argument-list + /// \endverbatim OpenACCWaitParseInfo ParseOpenACCWaitArgument(SourceLocation Loc, bool IsDirective); + /// Parses the clause of the 'bind' argument, which can be a string literal or /// an identifier. std::variant ParseOpenACCBindClauseArgument(); - /// A type to represent the state of parsing after an attempt to parse an - /// OpenACC int-expr. This is useful to determine whether an int-expr list can - /// continue parsing after a failed int-expr. - using OpenACCIntExprParseResult = - std::pair; - /// Parses the clause kind of 'int-expr', which can be any integral - /// expression. 
- OpenACCIntExprParseResult ParseOpenACCIntExpr(OpenACCDirectiveKind DK, - OpenACCClauseKind CK, - SourceLocation Loc); - /// Parses the argument list for 'num_gangs', which allows up to 3 - /// 'int-expr's. - bool ParseOpenACCIntExprList(OpenACCDirectiveKind DK, OpenACCClauseKind CK, - SourceLocation Loc, - llvm::SmallVectorImpl &IntExprs); - /// Parses the 'device-type-list', which is a list of identifiers. - bool ParseOpenACCDeviceTypeList(llvm::SmallVector &Archs); - /// Parses the 'async-argument', which is an integral value with two - /// 'special' values that are likely negative (but come from Macros). - OpenACCIntExprParseResult ParseOpenACCAsyncArgument(OpenACCDirectiveKind DK, - OpenACCClauseKind CK, - SourceLocation Loc); + /// A type to represent the state of parsing after an attempt to parse an + /// OpenACC int-expr. This is useful to determine whether an int-expr list can + /// continue parsing after a failed int-expr. + using OpenACCIntExprParseResult = + std::pair; + /// Parses the clause kind of 'int-expr', which can be any integral + /// expression. + OpenACCIntExprParseResult ParseOpenACCIntExpr(OpenACCDirectiveKind DK, + OpenACCClauseKind CK, + SourceLocation Loc); + /// Parses the argument list for 'num_gangs', which allows up to 3 + /// 'int-expr's. + bool ParseOpenACCIntExprList(OpenACCDirectiveKind DK, OpenACCClauseKind CK, + SourceLocation Loc, + llvm::SmallVectorImpl &IntExprs); + + /// Parses the 'device-type-list', which is a list of identifiers. + /// + /// OpenACC 3.3 Section 2.4: + /// The argument to the device_type clause is a comma-separated list of one or + /// more device architecture name identifiers, or an asterisk. + /// + /// The syntax of the device_type clause is + /// device_type( * ) + /// device_type( device-type-list ) + /// + /// The device_type clause may be abbreviated to dtype. 
+ bool ParseOpenACCDeviceTypeList(llvm::SmallVector &Archs); + + /// Parses the 'async-argument', which is an integral value with two + /// 'special' values that are likely negative (but come from Macros). + /// + /// OpenACC 3.3 section 2.16: + /// In this section and throughout the specification, the term async-argument + /// means a nonnegative scalar integer expression (int for C or C++, integer + /// for Fortran), or one of the special values acc_async_noval or + /// acc_async_sync, as defined in the C header file and the Fortran openacc + /// module. The special values are negative values, so as not to conflict with + /// a user-specified nonnegative async-argument. + OpenACCIntExprParseResult ParseOpenACCAsyncArgument(OpenACCDirectiveKind DK, + OpenACCClauseKind CK, + SourceLocation Loc); + + /// Parses the 'size-expr', which is an integral value, or an asterisk. + /// Asterisk is represented by a OpenACCAsteriskSizeExpr + /// + /// OpenACC 3.3 Section 2.9: + /// size-expr is one of: + /// * + /// int-expr + /// Note that this is specified under 'gang-arg-list', but also applies to + /// 'tile' via reference. + ExprResult ParseOpenACCSizeExpr(OpenACCClauseKind CK); + + /// Parses a comma delimited list of 'size-expr's. + bool ParseOpenACCSizeExprList(OpenACCClauseKind CK, + llvm::SmallVectorImpl &SizeExprs); + + /// Parses a 'gang-arg-list', used for the 'gang' clause. + /// + /// OpenACC 3.3 Section 2.9: + /// + /// where gang-arg is one of: + /// \verbatim + /// [num:]int-expr + /// dim:int-expr + /// static:size-expr + /// \endverbatim + bool ParseOpenACCGangArgList(SourceLocation GangLoc, + llvm::SmallVectorImpl &GKs, + llvm::SmallVectorImpl &IntExprs); + + using OpenACCGangArgRes = std::pair; + /// Parses a 'gang-arg', used for the 'gang' clause. Returns a pair of the + /// ExprResult (which contains the validity of the expression), plus the gang + /// kind for the current argument. 
+ OpenACCGangArgRes ParseOpenACCGangArg(SourceLocation GangLoc); + /// Parses a 'condition' expr, ensuring it results in a + ExprResult ParseOpenACCConditionExpr(); + DeclGroupPtrTy + ParseOpenACCAfterRoutineDecl(AccessSpecifier &AS, ParsedAttributes &Attrs, + DeclSpec::TST TagType, Decl *TagDecl, + OpenACCDirectiveParseInfo &DirInfo); + StmtResult ParseOpenACCAfterRoutineStmt(OpenACCDirectiveParseInfo &DirInfo); + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name OpenMP Constructs + /// Implementations are in ParseOpenMP.cpp + ///@{ + +private: + friend class ParsingOpenMPDirectiveRAII; + + /// Parsing OpenMP directive mode. + bool OpenMPDirectiveParsing = false; + + /// Current kind of OpenMP clause + OpenMPClauseKind OMPClauseKind = llvm::omp::OMPC_unknown; + + void ReplayOpenMPAttributeTokens(CachedTokens &OpenMPTokens) { + // If parsing the attributes found an OpenMP directive, emit those tokens + // to the parse stream now. + if (!OpenMPTokens.empty()) { + PP.EnterToken(Tok, /*IsReinject*/ true); + PP.EnterTokenStream(OpenMPTokens, /*DisableMacroExpansion*/ true, + /*IsReinject*/ true); + ConsumeAnyToken(/*ConsumeCodeCompletionTok*/ true); + } + } + + //===--------------------------------------------------------------------===// + // OpenMP: Directives and clauses. + + /// Parse clauses for '#pragma omp declare simd'. + DeclGroupPtrTy ParseOMPDeclareSimdClauses(DeclGroupPtrTy Ptr, + CachedTokens &Toks, + SourceLocation Loc); + + /// Parse a property kind into \p TIProperty for the selector set \p Set and + /// selector \p Selector. + void parseOMPTraitPropertyKind(OMPTraitProperty &TIProperty, + llvm::omp::TraitSet Set, + llvm::omp::TraitSelector Selector, + llvm::StringMap &Seen); + + /// Parse a selector kind into \p TISelector for the selector set \p Set. 
+ void parseOMPTraitSelectorKind(OMPTraitSelector &TISelector, + llvm::omp::TraitSet Set, + llvm::StringMap &Seen); + + /// Parse a selector set kind into \p TISet. + void parseOMPTraitSetKind(OMPTraitSet &TISet, + llvm::StringMap &Seen); + + /// Parses an OpenMP context property. + void parseOMPContextProperty(OMPTraitSelector &TISelector, + llvm::omp::TraitSet Set, + llvm::StringMap &Seen); + + /// Parses an OpenMP context selector. + /// + /// \verbatim + /// ['('[] [, ]* ')'] + /// \endverbatim + void parseOMPContextSelector(OMPTraitSelector &TISelector, + llvm::omp::TraitSet Set, + llvm::StringMap &SeenSelectors); + + /// Parses an OpenMP context selector set. + /// + /// \verbatim + /// '=' '{' [, ]* '}' + /// \endverbatim + void parseOMPContextSelectorSet(OMPTraitSet &TISet, + llvm::StringMap &SeenSets); + + /// Parse OpenMP context selectors: + /// + /// \verbatim + /// [, ]* + /// \endverbatim + bool parseOMPContextSelectors(SourceLocation Loc, OMPTraitInfo &TI); + + /// Parse an 'append_args' clause for '#pragma omp declare variant'. + bool parseOpenMPAppendArgs(SmallVectorImpl &InteropInfos); + + /// Parse a `match` clause for an '#pragma omp declare variant'. Return true + /// if there was an error. + bool parseOMPDeclareVariantMatchClause(SourceLocation Loc, OMPTraitInfo &TI, + OMPTraitInfo *ParentTI); + + /// Parse clauses for '#pragma omp declare variant ( variant-func-id ) + /// clause'. + void ParseOMPDeclareVariantClauses(DeclGroupPtrTy Ptr, CachedTokens &Toks, + SourceLocation Loc); + + /// Parse 'omp [begin] assume[s]' directive. + /// + /// `omp assumes` or `omp begin/end assumes` [[,]]... 
+ /// where + /// + /// \verbatim + /// clause: + /// 'ext_IMPL_DEFINED' + /// 'absent' '(' directive-name [, directive-name]* ')' + /// 'contains' '(' directive-name [, directive-name]* ')' + /// 'holds' '(' scalar-expression ')' + /// 'no_openmp' + /// 'no_openmp_routines' + /// 'no_openmp_constructs' (OpenMP 6.0) + /// 'no_parallelism' + /// \endverbatim + /// + void ParseOpenMPAssumesDirective(OpenMPDirectiveKind DKind, + SourceLocation Loc); + + /// Parse 'omp end assumes' directive. + void ParseOpenMPEndAssumesDirective(SourceLocation Loc); + + /// Parses clauses for directive. + /// + /// \verbatim + /// [clause[ [,] clause] ... ] + /// + /// clauses: for error directive + /// 'at' '(' compilation | execution ')' + /// 'severity' '(' fatal | warning ')' + /// 'message' '(' msg-string ')' + /// .... + /// \endverbatim + /// + /// \param DKind Kind of current directive. + /// \param clauses for current directive. + /// \param start location for clauses of current directive + void ParseOpenMPClauses(OpenMPDirectiveKind DKind, + SmallVectorImpl &Clauses, + SourceLocation Loc); + + /// Parse clauses for '#pragma omp [begin] declare target'. + void ParseOMPDeclareTargetClauses(SemaOpenMP::DeclareTargetContextInfo &DTCI); + + /// Parse '#pragma omp end declare target'. + void ParseOMPEndDeclareTargetDirective(OpenMPDirectiveKind BeginDKind, + OpenMPDirectiveKind EndDKind, + SourceLocation Loc); + + /// Skip tokens until a `annot_pragma_openmp_end` was found. Emit a warning if + /// it is not the current token. + void skipUntilPragmaOpenMPEnd(OpenMPDirectiveKind DKind); + + /// Check the \p FoundKind against the \p ExpectedKind, if not issue an error + /// that the "end" matching the "begin" directive of kind \p BeginKind was not + /// found. Finally, if the expected kind was found or if \p SkipUntilOpenMPEnd + /// is set, skip ahead using the helper `skipUntilPragmaOpenMPEnd`. 
+ void parseOMPEndDirective(OpenMPDirectiveKind BeginKind, + OpenMPDirectiveKind ExpectedKind, + OpenMPDirectiveKind FoundKind, + SourceLocation MatchingLoc, SourceLocation FoundLoc, + bool SkipUntilOpenMPEnd); + + /// Parses declarative OpenMP directives. + /// + /// \verbatim + /// threadprivate-directive: + /// annot_pragma_openmp 'threadprivate' simple-variable-list + /// annot_pragma_openmp_end + /// + /// allocate-directive: + /// annot_pragma_openmp 'allocate' simple-variable-list [] + /// annot_pragma_openmp_end + /// + /// declare-reduction-directive: + /// annot_pragma_openmp 'declare' 'reduction' [...] + /// annot_pragma_openmp_end + /// + /// declare-mapper-directive: + /// annot_pragma_openmp 'declare' 'mapper' '(' [ ':'] + /// ')' [[[,] ] ... ] + /// annot_pragma_openmp_end + /// + /// declare-simd-directive: + /// annot_pragma_openmp 'declare simd' { [,]} + /// annot_pragma_openmp_end + /// + /// + /// requires directive: + /// annot_pragma_openmp 'requires' [[[,] ] ... ] + /// annot_pragma_openmp_end + /// + /// assumes directive: + /// annot_pragma_openmp 'assumes' [[[,] ] ... ] + /// annot_pragma_openmp_end + /// or + /// annot_pragma_openmp 'begin assumes' [[[,] ] ... ] + /// annot_pragma_openmp 'end assumes' + /// annot_pragma_openmp_end + /// \endverbatim + /// + DeclGroupPtrTy ParseOpenMPDeclarativeDirectiveWithExtDecl( + AccessSpecifier &AS, ParsedAttributes &Attrs, bool Delayed = false, + DeclSpec::TST TagType = DeclSpec::TST_unspecified, + Decl *TagDecl = nullptr); + + /// Parse 'omp declare reduction' construct. + /// + /// \verbatim + /// declare-reduction-directive: + /// annot_pragma_openmp 'declare' 'reduction' + /// '(' ':' {',' } ':' ')' + /// ['initializer' '(' ('omp_priv' '=' )| ')'] + /// annot_pragma_openmp_end + /// \endverbatim + /// is either a base language identifier or one of the + /// following operators: '+', '-', '*', '&', '|', '^', '&&' and '||'. 
+ /// + DeclGroupPtrTy ParseOpenMPDeclareReductionDirective(AccessSpecifier AS); + + /// Parses initializer for provided omp_priv declaration inside the reduction + /// initializer. + void ParseOpenMPReductionInitializerForDecl(VarDecl *OmpPrivParm); + + /// Parses 'omp declare mapper' directive. + /// + /// \verbatim + /// declare-mapper-directive: + /// annot_pragma_openmp 'declare' 'mapper' '(' [ ':'] + /// ')' [[[,] ] ... ] + /// annot_pragma_openmp_end + /// \endverbatim + /// and are base language identifiers. + /// + DeclGroupPtrTy ParseOpenMPDeclareMapperDirective(AccessSpecifier AS); + + /// Parses variable declaration in 'omp declare mapper' directive. + TypeResult parseOpenMPDeclareMapperVarDecl(SourceRange &Range, + DeclarationName &Name, + AccessSpecifier AS = AS_none); + + /// Parses simple list of variables. + /// + /// \verbatim + /// simple-variable-list: + /// '(' id-expression {, id-expression} ')' + /// \endverbatim + /// + /// \param Kind Kind of the directive. + /// \param Callback Callback function to be called for the list elements. + /// \param AllowScopeSpecifier true, if the variables can have fully + /// qualified names. + /// + bool ParseOpenMPSimpleVarList( + OpenMPDirectiveKind Kind, + const llvm::function_ref + &Callback, + bool AllowScopeSpecifier); + + /// Parses declarative or executable directive. + /// + /// \verbatim + /// threadprivate-directive: + /// annot_pragma_openmp 'threadprivate' simple-variable-list + /// annot_pragma_openmp_end + /// + /// allocate-directive: + /// annot_pragma_openmp 'allocate' simple-variable-list + /// annot_pragma_openmp_end + /// + /// declare-reduction-directive: + /// annot_pragma_openmp 'declare' 'reduction' '(' ':' + /// {',' } ':' ')' ['initializer' '(' + /// ('omp_priv' '=' |) ')'] + /// annot_pragma_openmp_end + /// + /// declare-mapper-directive: + /// annot_pragma_openmp 'declare' 'mapper' '(' [ ':'] + /// ')' [[[,] ] ... 
] + /// annot_pragma_openmp_end + /// + /// executable-directive: + /// annot_pragma_openmp 'parallel' | 'simd' | 'for' | 'sections' | + /// 'section' | 'single' | 'master' | 'critical' [ '(' ')' ] | + /// 'parallel for' | 'parallel sections' | 'parallel master' | 'task' + /// | 'taskyield' | 'barrier' | 'taskwait' | 'flush' | 'ordered' | + /// 'error' | 'atomic' | 'for simd' | 'parallel for simd' | 'target' | + /// 'target data' | 'taskgroup' | 'teams' | 'taskloop' | 'taskloop + /// simd' | 'master taskloop' | 'master taskloop simd' | 'parallel + /// master taskloop' | 'parallel master taskloop simd' | 'distribute' + /// | 'target enter data' | 'target exit data' | 'target parallel' | + /// 'target parallel for' | 'target update' | 'distribute parallel + /// for' | 'distribute paralle for simd' | 'distribute simd' | 'target + /// parallel for simd' | 'target simd' | 'teams distribute' | 'teams + /// distribute simd' | 'teams distribute parallel for simd' | 'teams + /// distribute parallel for' | 'target teams' | 'target teams + /// distribute' | 'target teams distribute parallel for' | 'target + /// teams distribute parallel for simd' | 'target teams distribute + /// simd' | 'masked' | 'parallel masked' {clause} + /// annot_pragma_openmp_end + /// \endverbatim + /// + /// + /// \param StmtCtx The context in which we're parsing the directive. + /// \param ReadDirectiveWithinMetadirective true if directive is within a + /// metadirective and therefore ends on the closing paren. + StmtResult ParseOpenMPDeclarativeOrExecutableDirective( + ParsedStmtContext StmtCtx, bool ReadDirectiveWithinMetadirective = false); + + /// Parses executable directive. + /// + /// \param StmtCtx The context in which we're parsing the directive. + /// \param DKind The kind of the executable directive. + /// \param Loc Source location of the beginning of the directive. 
+ /// \param ReadDirectiveWithinMetadirective true if directive is within a + /// metadirective and therefore ends on the closing paren. + StmtResult + ParseOpenMPExecutableDirective(ParsedStmtContext StmtCtx, + OpenMPDirectiveKind DKind, SourceLocation Loc, + bool ReadDirectiveWithinMetadirective); + + /// Parses informational directive. + /// + /// \param StmtCtx The context in which we're parsing the directive. + /// \param DKind The kind of the informational directive. + /// \param Loc Source location of the beginning of the directive. + /// \param ReadDirectiveWithinMetadirective true if directive is within a + /// metadirective and therefore ends on the closing paren. + StmtResult ParseOpenMPInformationalDirective( + ParsedStmtContext StmtCtx, OpenMPDirectiveKind DKind, SourceLocation Loc, + bool ReadDirectiveWithinMetadirective); + + /// Parses clause of kind \a CKind for directive of a kind \a Kind. + /// + /// \verbatim + /// clause: + /// if-clause | final-clause | num_threads-clause | safelen-clause | + /// default-clause | private-clause | firstprivate-clause | + /// shared-clause | linear-clause | aligned-clause | collapse-clause | + /// bind-clause | lastprivate-clause | reduction-clause | + /// proc_bind-clause | schedule-clause | copyin-clause | + /// copyprivate-clause | untied-clause | mergeable-clause | flush-clause + /// | read-clause | write-clause | update-clause | capture-clause | + /// seq_cst-clause | device-clause | simdlen-clause | threads-clause | + /// simd-clause | num_teams-clause | thread_limit-clause | + /// priority-clause | grainsize-clause | nogroup-clause | + /// num_tasks-clause | hint-clause | to-clause | from-clause | + /// is_device_ptr-clause | task_reduction-clause | in_reduction-clause | + /// allocator-clause | allocate-clause | acq_rel-clause | acquire-clause + /// | release-clause | relaxed-clause | depobj-clause | destroy-clause | + /// detach-clause | inclusive-clause | exclusive-clause | + /// uses_allocators-clause 
| use_device_addr-clause | has_device_addr + /// \endverbatim + /// + /// \param DKind Kind of current directive. + /// \param CKind Kind of current clause. + /// \param FirstClause true, if this is the first clause of a kind \a CKind + /// in current directive. + /// + OMPClause *ParseOpenMPClause(OpenMPDirectiveKind DKind, + OpenMPClauseKind CKind, bool FirstClause); + + /// Parses clause with a single expression of a kind \a Kind. + /// + /// Parsing of OpenMP clauses with single expressions like 'final', + /// 'collapse', 'safelen', 'num_threads', 'simdlen', 'num_teams', + /// 'thread_limit', 'simdlen', 'priority', 'grainsize', 'num_tasks', 'hint' or + /// 'detach'. + /// + /// \verbatim + /// final-clause: + /// 'final' '(' expression ')' + /// + /// num_threads-clause: + /// 'num_threads' '(' expression ')' + /// + /// safelen-clause: + /// 'safelen' '(' expression ')' + /// + /// simdlen-clause: + /// 'simdlen' '(' expression ')' + /// + /// collapse-clause: + /// 'collapse' '(' expression ')' + /// + /// priority-clause: + /// 'priority' '(' expression ')' + /// + /// grainsize-clause: + /// 'grainsize' '(' expression ')' + /// + /// num_tasks-clause: + /// 'num_tasks' '(' expression ')' + /// + /// hint-clause: + /// 'hint' '(' expression ')' + /// + /// allocator-clause: + /// 'allocator' '(' expression ')' + /// + /// detach-clause: + /// 'detach' '(' event-handler-expression ')' + /// + /// align-clause + /// 'align' '(' positive-integer-constant ')' + /// + /// holds-clause + /// 'holds' '(' expression ')' + /// \endverbatim + /// + /// \param Kind Kind of current clause. + /// \param ParseOnly true to skip the clause's semantic actions and return + /// nullptr. + /// + OMPClause *ParseOpenMPSingleExprClause(OpenMPClauseKind Kind, bool ParseOnly); + /// Parses simple clause like 'default' or 'proc_bind' of a kind \a Kind. 
+ /// + /// \verbatim + /// default-clause: + /// 'default' '(' 'none' | 'shared' | 'private' | 'firstprivate' ')' + /// + /// proc_bind-clause: + /// 'proc_bind' '(' 'master' | 'close' | 'spread' ')' + /// + /// bind-clause: + /// 'bind' '(' 'teams' | 'parallel' | 'thread' ')' + /// + /// update-clause: + /// 'update' '(' 'in' | 'out' | 'inout' | 'mutexinoutset' | + /// 'inoutset' ')' + /// \endverbatim + /// + /// \param Kind Kind of current clause. + /// \param ParseOnly true to skip the clause's semantic actions and return + /// nullptr. + /// + OMPClause *ParseOpenMPSimpleClause(OpenMPClauseKind Kind, bool ParseOnly); + + /// Parse indirect clause for '#pragma omp declare target' directive. + /// 'indirect' '[' '(' invoked-by-fptr ')' ']' + /// where invoked-by-fptr is a constant boolean expression that evaluates to + /// true or false at compile time. + /// \param ParseOnly true to skip the clause's semantic actions and return + /// false; + bool ParseOpenMPIndirectClause(SemaOpenMP::DeclareTargetContextInfo &DTCI, + bool ParseOnly); + /// Parses clause with a single expression and an additional argument + /// of a kind \a Kind like 'schedule' or 'dist_schedule'. + /// + /// \verbatim + /// schedule-clause: + /// 'schedule' '(' [ modifier [ ',' modifier ] ':' ] kind [',' expression ] + /// ')' + /// + /// if-clause: + /// 'if' '(' [ directive-name-modifier ':' ] expression ')' + /// + /// defaultmap: + /// 'defaultmap' '(' modifier [ ':' kind ] ')' + /// + /// device-clause: + /// 'device' '(' [ device-modifier ':' ] expression ')' + /// \endverbatim + /// + /// \param DKind Directive kind. + /// \param Kind Kind of current clause. + /// \param ParseOnly true to skip the clause's semantic actions and return + /// nullptr. + /// + OMPClause *ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind, + OpenMPClauseKind Kind, + bool ParseOnly); + + /// Parses the 'sizes' clause of a '#pragma omp tile' directive. 
+ OMPClause *ParseOpenMPSizesClause(); + + /// Parses the 'permutation' clause of a '#pragma omp interchange' directive. + OMPClause *ParseOpenMPPermutationClause(); + + /// Parses clause without any additional arguments like 'ordered'. + /// + /// \verbatim + /// ordered-clause: + /// 'ordered' + /// + /// nowait-clause: + /// 'nowait' + /// + /// untied-clause: + /// 'untied' + /// + /// mergeable-clause: + /// 'mergeable' + /// + /// read-clause: + /// 'read' + /// + /// threads-clause: + /// 'threads' + /// + /// simd-clause: + /// 'simd' + /// + /// nogroup-clause: + /// 'nogroup' + /// \endverbatim + /// + /// \param Kind Kind of current clause. + /// \param ParseOnly true to skip the clause's semantic actions and return + /// nullptr. + /// + OMPClause *ParseOpenMPClause(OpenMPClauseKind Kind, bool ParseOnly = false); + + /// Parses clause with the list of variables of a kind \a Kind: + /// 'private', 'firstprivate', 'lastprivate', + /// 'shared', 'copyin', 'copyprivate', 'flush', 'reduction', 'task_reduction', + /// 'in_reduction', 'nontemporal', 'exclusive' or 'inclusive'. 
+ /// + /// \verbatim + /// private-clause: + /// 'private' '(' list ')' + /// firstprivate-clause: + /// 'firstprivate' '(' list ')' + /// lastprivate-clause: + /// 'lastprivate' '(' list ')' + /// shared-clause: + /// 'shared' '(' list ')' + /// linear-clause: + /// 'linear' '(' linear-list [ ':' linear-step ] ')' + /// aligned-clause: + /// 'aligned' '(' list [ ':' alignment ] ')' + /// reduction-clause: + /// 'reduction' '(' [ modifier ',' ] reduction-identifier ':' list ')' + /// task_reduction-clause: + /// 'task_reduction' '(' reduction-identifier ':' list ')' + /// in_reduction-clause: + /// 'in_reduction' '(' reduction-identifier ':' list ')' + /// copyprivate-clause: + /// 'copyprivate' '(' list ')' + /// flush-clause: + /// 'flush' '(' list ')' + /// depend-clause: + /// 'depend' '(' in | out | inout : list | source ')' + /// map-clause: + /// 'map' '(' [ [ always [,] ] [ close [,] ] + /// [ mapper '(' mapper-identifier ')' [,] ] + /// to | from | tofrom | alloc | release | delete ':' ] list ')'; + /// to-clause: + /// 'to' '(' [ mapper '(' mapper-identifier ')' ':' ] list ')' + /// from-clause: + /// 'from' '(' [ mapper '(' mapper-identifier ')' ':' ] list ')' + /// use_device_ptr-clause: + /// 'use_device_ptr' '(' list ')' + /// use_device_addr-clause: + /// 'use_device_addr' '(' list ')' + /// is_device_ptr-clause: + /// 'is_device_ptr' '(' list ')' + /// has_device_addr-clause: + /// 'has_device_addr' '(' list ')' + /// allocate-clause: + /// 'allocate' '(' [ allocator ':' ] list ')' + /// As of OpenMP 5.1 there's also + /// 'allocate' '(' allocate-modifier: list ')' + /// where allocate-modifier is: 'allocator' '(' allocator ')' + /// nontemporal-clause: + /// 'nontemporal' '(' list ')' + /// inclusive-clause: + /// 'inclusive' '(' list ')' + /// exclusive-clause: + /// 'exclusive' '(' list ')' + /// \endverbatim + /// + /// For 'linear' clause linear-list may have the following forms: + /// list + /// modifier(list) + /// where modifier is 'val' 
(C) or 'ref', 'val' or 'uval'(C++). + /// + /// \param Kind Kind of current clause. + /// \param ParseOnly true to skip the clause's semantic actions and return + /// nullptr. + /// + OMPClause *ParseOpenMPVarListClause(OpenMPDirectiveKind DKind, + OpenMPClauseKind Kind, bool ParseOnly); + + /// Parses a clause consisting of a list of expressions. + /// + /// \param Kind The clause to parse. + /// \param ClauseNameLoc [out] The location of the clause name. + /// \param OpenLoc [out] The location of '('. + /// \param CloseLoc [out] The location of ')'. + /// \param Exprs [out] The parsed expressions. + /// \param ReqIntConst If true, each expression must be an integer constant. + /// + /// \return Whether the clause was parsed successfully. + bool ParseOpenMPExprListClause(OpenMPClauseKind Kind, + SourceLocation &ClauseNameLoc, + SourceLocation &OpenLoc, + SourceLocation &CloseLoc, + SmallVectorImpl &Exprs, + bool ReqIntConst = false); + + /// Parses simple expression in parens for single-expression clauses of OpenMP + /// constructs. + /// \verbatim + /// = 'iterator' '(' { [ ] identifier = + /// }+ ')' + /// \endverbatim + ExprResult ParseOpenMPIteratorsExpr(); + + /// Parses allocators and traits in the context of the uses_allocator clause. + /// Expected format: + /// \verbatim + /// '(' { [ '(' ')' ] }+ ')' + /// \endverbatim + OMPClause *ParseOpenMPUsesAllocatorClause(OpenMPDirectiveKind DKind); + + /// Parses the 'interop' parts of the 'append_args' and 'init' clauses. + bool ParseOMPInteropInfo(OMPInteropInfo &InteropInfo, OpenMPClauseKind Kind); + + /// Parses clause with an interop variable of kind \a Kind. + /// + /// \verbatim + /// init-clause: + /// init([interop-modifier, ]interop-type[[, interop-type] ... 
]:interop-var) + /// + /// destroy-clause: + /// destroy(interop-var) + /// + /// use-clause: + /// use(interop-var) + /// + /// interop-modifier: + /// prefer_type(preference-list) + /// + /// preference-list: + /// foreign-runtime-id [, foreign-runtime-id]... + /// + /// foreign-runtime-id: + /// | + /// + /// interop-type: + /// target | targetsync + /// \endverbatim + /// + /// \param Kind Kind of current clause. + /// \param ParseOnly true to skip the clause's semantic actions and return + /// nullptr. + // + OMPClause *ParseOpenMPInteropClause(OpenMPClauseKind Kind, bool ParseOnly); + + /// Parses a ompx_attribute clause + /// + /// \param ParseOnly true to skip the clause's semantic actions and return + /// nullptr. + // + OMPClause *ParseOpenMPOMPXAttributesClause(bool ParseOnly); + +public: + /// Parses simple expression in parens for single-expression clauses of OpenMP + /// constructs. + /// \param RLoc Returned location of right paren. + ExprResult ParseOpenMPParensExpr(StringRef ClauseName, SourceLocation &RLoc, + bool IsAddressOfOperand = false); + + /// Parses a reserved locator like 'omp_all_memory'. + bool ParseOpenMPReservedLocator(OpenMPClauseKind Kind, + SemaOpenMP::OpenMPVarListDataTy &Data, + const LangOptions &LangOpts); + /// Parses clauses with list. + bool ParseOpenMPVarList(OpenMPDirectiveKind DKind, OpenMPClauseKind Kind, + SmallVectorImpl &Vars, + SemaOpenMP::OpenMPVarListDataTy &Data); + + /// Parses the mapper modifier in map, to, and from clauses. + bool parseMapperModifier(SemaOpenMP::OpenMPVarListDataTy &Data); + + /// Parse map-type-modifiers in map clause. + /// map([ [map-type-modifier[,] [map-type-modifier[,] ...] [map-type] : ] list) + /// where, map-type-modifier ::= always | close | mapper(mapper-identifier) | + /// present + /// where, map-type ::= alloc | delete | from | release | to | tofrom + bool parseMapTypeModifiers(SemaOpenMP::OpenMPVarListDataTy &Data); + + /// Parses 'omp begin declare variant' directive. 
+ /// The syntax is: + /// \verbatim + /// { #pragma omp begin declare variant clause } + /// + /// { #pragma omp end declare variant } + /// \endverbatim + /// + bool ParseOpenMPDeclareBeginVariantDirective(SourceLocation Loc); + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name Pragmas + /// Implementations are in ParsePragma.cpp + ///@{ + +private: + std::unique_ptr AlignHandler; + std::unique_ptr GCCVisibilityHandler; + std::unique_ptr OptionsHandler; + std::unique_ptr PackHandler; + std::unique_ptr MSStructHandler; + std::unique_ptr UnusedHandler; + std::unique_ptr WeakHandler; + std::unique_ptr RedefineExtnameHandler; + std::unique_ptr FPContractHandler; + std::unique_ptr OpenCLExtensionHandler; + std::unique_ptr OpenMPHandler; + std::unique_ptr OpenACCHandler; + std::unique_ptr PCSectionHandler; + std::unique_ptr MSCommentHandler; + std::unique_ptr MSDetectMismatchHandler; + std::unique_ptr FPEvalMethodHandler; + std::unique_ptr FloatControlHandler; + std::unique_ptr MSPointersToMembers; + std::unique_ptr MSVtorDisp; + std::unique_ptr MSInitSeg; + std::unique_ptr MSDataSeg; + std::unique_ptr MSBSSSeg; + std::unique_ptr MSConstSeg; + std::unique_ptr MSCodeSeg; + std::unique_ptr MSSection; + std::unique_ptr MSStrictGuardStackCheck; + std::unique_ptr MSRuntimeChecks; + std::unique_ptr MSIntrinsic; + std::unique_ptr MSFunction; + std::unique_ptr MSOptimize; + std::unique_ptr MSFenvAccess; + std::unique_ptr MSAllocText; + std::unique_ptr CUDAForceHostDeviceHandler; + std::unique_ptr OptimizeHandler; + std::unique_ptr LoopHintHandler; + std::unique_ptr UnrollHintHandler; + std::unique_ptr NoUnrollHintHandler; + std::unique_ptr UnrollAndJamHintHandler; + std::unique_ptr NoUnrollAndJamHintHandler; + std::unique_ptr FPHandler; + std::unique_ptr STDCFenvAccessHandler; + std::unique_ptr STDCFenvRoundHandler; + std::unique_ptr STDCCXLIMITHandler; + std::unique_ptr STDCUnknownHandler; + 
std::unique_ptr AttributePragmaHandler; + std::unique_ptr MaxTokensHerePragmaHandler; + std::unique_ptr MaxTokensTotalPragmaHandler; + std::unique_ptr RISCVPragmaHandler; + + /// Initialize all pragma handlers. + void initializePragmaHandlers(); + + /// Destroy and reset all pragma handlers. + void resetPragmaHandlers(); + + /// Handle the annotation token produced for #pragma unused(...) + /// + /// Each annot_pragma_unused is followed by the argument token so e.g. + /// "#pragma unused(x,y)" becomes: + /// annot_pragma_unused 'x' annot_pragma_unused 'y' + void HandlePragmaUnused(); + + /// Handle the annotation token produced for + /// #pragma GCC visibility... + void HandlePragmaVisibility(); + + /// Handle the annotation token produced for + /// #pragma pack... + void HandlePragmaPack(); + + /// Handle the annotation token produced for + /// #pragma ms_struct... + void HandlePragmaMSStruct(); + + void HandlePragmaMSPointersToMembers(); + + void HandlePragmaMSVtorDisp(); + + void HandlePragmaMSPragma(); + bool HandlePragmaMSSection(StringRef PragmaName, + SourceLocation PragmaLocation); + bool HandlePragmaMSSegment(StringRef PragmaName, + SourceLocation PragmaLocation); + + // #pragma init_seg({ compiler | lib | user | "section-name" [, func-name]} ) + bool HandlePragmaMSInitSeg(StringRef PragmaName, + SourceLocation PragmaLocation); + + // #pragma strict_gs_check(pop) + // #pragma strict_gs_check(push, "on" | "off") + // #pragma strict_gs_check("on" | "off") + bool HandlePragmaMSStrictGuardStackCheck(StringRef PragmaName, + SourceLocation PragmaLocation); + bool HandlePragmaMSFunction(StringRef PragmaName, + SourceLocation PragmaLocation); + bool HandlePragmaMSAllocText(StringRef PragmaName, + SourceLocation PragmaLocation); + + // #pragma optimize("gsty", on|off) + bool HandlePragmaMSOptimize(StringRef PragmaName, + SourceLocation PragmaLocation); + + /// Handle the annotation token produced for + /// #pragma align... 
+ void HandlePragmaAlign(); + + /// Handle the annotation token produced for + /// #pragma clang __debug dump... + void HandlePragmaDump(); + + /// Handle the annotation token produced for + /// #pragma weak id... + void HandlePragmaWeak(); + + /// Handle the annotation token produced for + /// #pragma weak id = id... + void HandlePragmaWeakAlias(); + + /// Handle the annotation token produced for + /// #pragma redefine_extname... + void HandlePragmaRedefineExtname(); + + /// Handle the annotation token produced for + /// #pragma STDC FP_CONTRACT... + void HandlePragmaFPContract(); + + /// Handle the annotation token produced for + /// #pragma STDC FENV_ACCESS... + void HandlePragmaFEnvAccess(); + + /// Handle the annotation token produced for + /// #pragma STDC FENV_ROUND... + void HandlePragmaFEnvRound(); + + /// Handle the annotation token produced for + /// #pragma STDC CX_LIMITED_RANGE... + void HandlePragmaCXLimitedRange(); + + /// Handle the annotation token produced for + /// #pragma float_control + void HandlePragmaFloatControl(); + + /// \brief Handle the annotation token produced for + /// #pragma clang fp ... + void HandlePragmaFP(); + + /// Handle the annotation token produced for + /// #pragma OPENCL EXTENSION... + void HandlePragmaOpenCLExtension(); + + /// Handle the annotation token produced for + /// #pragma clang __debug captured + StmtResult HandlePragmaCaptured(); + + /// Handle the annotation token produced for + /// #pragma clang loop and #pragma unroll. + bool HandlePragmaLoopHint(LoopHint &Hint); + + bool ParsePragmaAttributeSubjectMatchRuleSet( + attr::ParsedSubjectMatchRuleSet &SubjectMatchRules, + SourceLocation &AnyLoc, SourceLocation &LastMatchRuleEndLoc); + + void HandlePragmaAttribute(); + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name Statements + /// Implementations are in ParseStmt.cpp + ///@{ + +public: + /// A SmallVector of statements. 
+ typedef SmallVector StmtVector; + + /// The location of the first statement inside an else that might + /// have a missleading indentation. If there is no + /// MisleadingIndentationChecker on an else active, this location is invalid. + SourceLocation MisleadingIndentationElseLoc; + + private: + + /// Flags describing a context in which we're parsing a statement. + enum class ParsedStmtContext { + /// This context permits declarations in language modes where declarations + /// are not statements. + AllowDeclarationsInC = 0x1, + /// This context permits standalone OpenMP directives. + AllowStandaloneOpenMPDirectives = 0x2, + /// This context is at the top level of a GNU statement expression. + InStmtExpr = 0x4, + + /// The context of a regular substatement. + SubStmt = 0, + /// The context of a compound-statement. + Compound = AllowDeclarationsInC | AllowStandaloneOpenMPDirectives, + + LLVM_MARK_AS_BITMASK_ENUM(InStmtExpr) + }; + + /// Act on an expression statement that might be the last statement in a + /// GNU statement expression. Checks whether we are actually at the end of + /// a statement expression and builds a suitable expression statement. + StmtResult handleExprStmt(ExprResult E, ParsedStmtContext StmtCtx); + + //===--------------------------------------------------------------------===// + // C99 6.8: Statements and Blocks. + + /// Parse a standalone statement (for instance, as the body of an 'if', + /// 'while', or 'for'). + StmtResult + ParseStatement(SourceLocation *TrailingElseLoc = nullptr, + ParsedStmtContext StmtCtx = ParsedStmtContext::SubStmt); + + /// ParseStatementOrDeclaration - Read 'statement' or 'declaration'. 
+ /// \verbatim + /// StatementOrDeclaration: + /// statement + /// declaration + /// + /// statement: + /// labeled-statement + /// compound-statement + /// expression-statement + /// selection-statement + /// iteration-statement + /// jump-statement + /// [C++] declaration-statement + /// [C++] try-block + /// [MS] seh-try-block + /// [OBC] objc-throw-statement + /// [OBC] objc-try-catch-statement + /// [OBC] objc-synchronized-statement + /// [GNU] asm-statement + /// [OMP] openmp-construct [TODO] + /// + /// labeled-statement: + /// identifier ':' statement + /// 'case' constant-expression ':' statement + /// 'default' ':' statement + /// + /// selection-statement: + /// if-statement + /// switch-statement + /// + /// iteration-statement: + /// while-statement + /// do-statement + /// for-statement + /// + /// expression-statement: + /// expression[opt] ';' + /// + /// jump-statement: + /// 'goto' identifier ';' + /// 'continue' ';' + /// 'break' ';' + /// 'return' expression[opt] ';' + /// [GNU] 'goto' '*' expression ';' + /// + /// [OBC] objc-throw-statement: + /// [OBC] '@' 'throw' expression ';' + /// [OBC] '@' 'throw' ';' + /// \endverbatim + /// + StmtResult + ParseStatementOrDeclaration(StmtVector &Stmts, ParsedStmtContext StmtCtx, + SourceLocation *TrailingElseLoc = nullptr); + + StmtResult ParseStatementOrDeclarationAfterAttributes( + StmtVector &Stmts, ParsedStmtContext StmtCtx, + SourceLocation *TrailingElseLoc, ParsedAttributes &DeclAttrs, + ParsedAttributes &DeclSpecAttrs); + + /// Parse an expression statement. + StmtResult ParseExprStatement(ParsedStmtContext StmtCtx); + + /// ParseLabeledStatement - We have an identifier and a ':' after it. 
+ /// + /// \verbatim + /// label: + /// identifier ':' + /// [GNU] identifier ':' attributes[opt] + /// + /// labeled-statement: + /// label statement + /// \endverbatim + /// + StmtResult ParseLabeledStatement(ParsedAttributes &Attrs, + ParsedStmtContext StmtCtx); + + /// ParseCaseStatement + /// \verbatim + /// labeled-statement: + /// 'case' constant-expression ':' statement + /// [GNU] 'case' constant-expression '...' constant-expression ':' statement + /// \endverbatim + /// + StmtResult ParseCaseStatement(ParsedStmtContext StmtCtx, + bool MissingCase = false, + ExprResult Expr = ExprResult()); + + /// ParseDefaultStatement + /// \verbatim + /// labeled-statement: + /// 'default' ':' statement + /// \endverbatim + /// Note that this does not parse the 'statement' at the end. + /// + StmtResult ParseDefaultStatement(ParsedStmtContext StmtCtx); + + StmtResult ParseCompoundStatement(bool isStmtExpr = false); + + /// ParseCompoundStatement - Parse a "{}" block. + /// + /// \verbatim + /// compound-statement: [C99 6.8.2] + /// { block-item-list[opt] } + /// [GNU] { label-declarations block-item-list } [TODO] + /// + /// block-item-list: + /// block-item + /// block-item-list block-item + /// + /// block-item: + /// declaration + /// [GNU] '__extension__' declaration + /// statement + /// + /// [GNU] label-declarations: + /// [GNU] label-declaration + /// [GNU] label-declarations label-declaration + /// + /// [GNU] label-declaration: + /// [GNU] '__label__' identifier-list ';' + /// \endverbatim + /// + StmtResult ParseCompoundStatement(bool isStmtExpr, unsigned ScopeFlags); + + /// Parse any pragmas at the start of the compound expression. We handle these + /// separately since some pragmas (FP_CONTRACT) must appear before any C + /// statement in the compound, but may be intermingled with other pragmas. 
+ void ParseCompoundStatementLeadingPragmas(); + + void DiagnoseLabelAtEndOfCompoundStatement(); + + /// Consume any extra semi-colons resulting in null statements, + /// returning true if any tok::semi were consumed. + bool ConsumeNullStmt(StmtVector &Stmts); + + /// ParseCompoundStatementBody - Parse a sequence of statements optionally + /// followed by a label and invoke the ActOnCompoundStmt action. This expects + /// the '{' to be the current token, and consume the '}' at the end of the + /// block. It does not manipulate the scope stack. + StmtResult ParseCompoundStatementBody(bool isStmtExpr = false); + + /// ParseParenExprOrCondition: + /// \verbatim + /// [C ] '(' expression ')' + /// [C++] '(' condition ')' + /// [C++1z] '(' init-statement[opt] condition ')' + /// \endverbatim + /// + /// This function parses and performs error recovery on the specified + /// condition or expression (depending on whether we're in C++ or C mode). + /// This function goes out of its way to recover well. It returns true if + /// there was a parser error (the right paren couldn't be found), which + /// indicates that the caller should try to recover harder. It returns false + /// if the condition is successfully parsed. Note that a successful parse can + /// still have semantic errors in the condition. Additionally, it will assign + /// the location of the outer-most '(' and ')', to LParenLoc and RParenLoc, + /// respectively. + bool ParseParenExprOrCondition(StmtResult *InitStmt, + Sema::ConditionResult &CondResult, + SourceLocation Loc, Sema::ConditionKind CK, + SourceLocation &LParenLoc, + SourceLocation &RParenLoc); + + /// ParseIfStatement + /// \verbatim + /// if-statement: [C99 6.8.4.1] + /// 'if' '(' expression ')' statement + /// 'if' '(' expression ')' statement 'else' statement + /// [C++] 'if' '(' condition ')' statement + /// [C++] 'if' '(' condition ')' statement 'else' statement + /// [C++23] 'if' '!' [opt] consteval compound-statement + /// [C++23] 'if' '!' 
[opt] consteval compound-statement 'else' statement + /// \endverbatim + /// + StmtResult ParseIfStatement(SourceLocation *TrailingElseLoc); + + /// ParseSwitchStatement + /// \verbatim + /// switch-statement: + /// 'switch' '(' expression ')' statement + /// [C++] 'switch' '(' condition ')' statement + /// \endverbatim + StmtResult ParseSwitchStatement(SourceLocation *TrailingElseLoc); + + /// ParseWhileStatement + /// \verbatim + /// while-statement: [C99 6.8.5.1] + /// 'while' '(' expression ')' statement + /// [C++] 'while' '(' condition ')' statement + /// \endverbatim + StmtResult ParseWhileStatement(SourceLocation *TrailingElseLoc); + + /// ParseDoStatement + /// \verbatim + /// do-statement: [C99 6.8.5.2] + /// 'do' statement 'while' '(' expression ')' ';' + /// \endverbatim + /// Note: this lets the caller parse the end ';'. + StmtResult ParseDoStatement(); + + /// ParseForStatement + /// \verbatim + /// for-statement: [C99 6.8.5.3] + /// 'for' '(' expr[opt] ';' expr[opt] ';' expr[opt] ')' statement + /// 'for' '(' declaration expr[opt] ';' expr[opt] ')' statement + /// [C++] 'for' '(' for-init-statement condition[opt] ';' expression[opt] ')' + /// [C++] statement + /// [C++0x] 'for' + /// 'co_await'[opt] [Coroutines] + /// '(' for-range-declaration ':' for-range-initializer ')' + /// statement + /// [OBJC2] 'for' '(' declaration 'in' expr ')' statement + /// [OBJC2] 'for' '(' expr 'in' expr ')' statement + /// + /// [C++] for-init-statement: + /// [C++] expression-statement + /// [C++] simple-declaration + /// [C++23] alias-declaration + /// + /// [C++0x] for-range-declaration: + /// [C++0x] attribute-specifier-seq[opt] type-specifier-seq declarator + /// [C++0x] for-range-initializer: + /// [C++0x] expression + /// [C++0x] braced-init-list [TODO] + /// \endverbatim + StmtResult ParseForStatement(SourceLocation *TrailingElseLoc); + + /// ParseGotoStatement + /// \verbatim + /// jump-statement: + /// 'goto' identifier ';' + /// [GNU] 'goto' '*' expression 
';' + /// \endverbatim + /// + /// Note: this lets the caller parse the end ';'. + /// + StmtResult ParseGotoStatement(); + + /// ParseContinueStatement + /// \verbatim + /// jump-statement: + /// 'continue' ';' + /// \endverbatim + /// + /// Note: this lets the caller parse the end ';'. + /// + StmtResult ParseContinueStatement(); + + /// ParseBreakStatement + /// \verbatim + /// jump-statement: + /// 'break' ';' + /// \endverbatim + /// + /// Note: this lets the caller parse the end ';'. + /// + StmtResult ParseBreakStatement(); + + /// ParseReturnStatement + /// \verbatim + /// jump-statement: + /// 'return' expression[opt] ';' + /// 'return' braced-init-list ';' + /// 'co_return' expression[opt] ';' + /// 'co_return' braced-init-list ';' + /// \endverbatim + StmtResult ParseReturnStatement(); + + StmtResult ParsePragmaLoopHint(StmtVector &Stmts, ParsedStmtContext StmtCtx, + SourceLocation *TrailingElseLoc, + ParsedAttributes &Attrs); + + void ParseMicrosoftIfExistsStatement(StmtVector &Stmts); + + //===--------------------------------------------------------------------===// + // C++ 6: Statements and Blocks + + /// ParseCXXTryBlock - Parse a C++ try-block. + /// + /// \verbatim + /// try-block: + /// 'try' compound-statement handler-seq + /// \endverbatim + /// + StmtResult ParseCXXTryBlock(); + + /// ParseCXXTryBlockCommon - Parse the common part of try-block and + /// function-try-block. 
+ /// + /// \verbatim + /// try-block: + /// 'try' compound-statement handler-seq + /// + /// function-try-block: + /// 'try' ctor-initializer[opt] compound-statement handler-seq + /// + /// handler-seq: + /// handler handler-seq[opt] + /// + /// [Borland] try-block: + /// 'try' compound-statement seh-except-block + /// 'try' compound-statement seh-finally-block + /// \endverbatim + /// + StmtResult ParseCXXTryBlockCommon(SourceLocation TryLoc, bool FnTry = false); + + /// ParseCXXCatchBlock - Parse a C++ catch block, called handler in the + /// standard + /// + /// \verbatim + /// handler: + /// 'catch' '(' exception-declaration ')' compound-statement + /// + /// exception-declaration: + /// attribute-specifier-seq[opt] type-specifier-seq declarator + /// attribute-specifier-seq[opt] type-specifier-seq abstract-declarator[opt] + /// '...' + /// \endverbatim + /// + StmtResult ParseCXXCatchBlock(bool FnCatch = false); + + //===--------------------------------------------------------------------===// + // MS: SEH Statements and Blocks + + /// ParseSEHTryBlockCommon + /// + /// \verbatim + /// seh-try-block: + /// '__try' compound-statement seh-handler + /// + /// seh-handler: + /// seh-except-block + /// seh-finally-block + /// \endverbatim + /// + StmtResult ParseSEHTryBlock(); + + /// ParseSEHExceptBlock - Handle __except + /// + /// \verbatim + /// seh-except-block: + /// '__except' '(' seh-filter-expression ')' compound-statement + /// \endverbatim + /// + StmtResult ParseSEHExceptBlock(SourceLocation Loc); + + /// ParseSEHFinallyBlock - Handle __finally + /// + /// \verbatim + /// seh-finally-block: + /// '__finally' compound-statement + /// \endverbatim + /// + StmtResult ParseSEHFinallyBlock(SourceLocation Loc); + + StmtResult ParseSEHLeaveStatement(); + + Decl *ParseFunctionStatementBody(Decl *Decl, ParseScope &BodyScope); + + /// ParseFunctionTryBlock - Parse a C++ function-try-block. 
+ /// + /// \verbatim + /// function-try-block: + /// 'try' ctor-initializer[opt] compound-statement handler-seq + /// \endverbatim + /// + Decl *ParseFunctionTryBlock(Decl *Decl, ParseScope &BodyScope); + + /// When in code-completion, skip parsing of the function/method body + /// unless the body contains the code-completion point. + /// + /// \returns true if the function body was skipped. + bool trySkippingFunctionBody(); + + /// isDeclarationStatement - Disambiguates between a declaration or an + /// expression statement, when parsing function bodies. + /// + /// \param DisambiguatingWithExpression - True to indicate that the purpose of + /// this check is to disambiguate between an expression and a declaration. + /// Returns true for declaration, false for expression. + bool isDeclarationStatement(bool DisambiguatingWithExpression = false) { + if (getLangOpts().CPlusPlus) + return isCXXDeclarationStatement(DisambiguatingWithExpression); + return isDeclarationSpecifier(ImplicitTypenameContext::No, true); + } + + /// isForInitDeclaration - Disambiguates between a declaration or an + /// expression in the context of the C 'clause-1' or the C++ + // 'for-init-statement' part of a 'for' statement. + /// Returns true for declaration, false for expression. + bool isForInitDeclaration() { + if (getLangOpts().OpenMP) + Actions.OpenMP().startOpenMPLoop(); + if (getLangOpts().CPlusPlus) + return Tok.is(tok::kw_using) || + isCXXSimpleDeclaration(/*AllowForRangeDecl=*/true); + return isDeclarationSpecifier(ImplicitTypenameContext::No, true); + } + + /// Determine whether this is a C++1z for-range-identifier. + bool isForRangeIdentifier(); + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name `inline asm` Statement + /// Implementations are in ParseStmtAsm.cpp + ///@{ + +public: + /// Parse an identifier in an MS-style inline assembly block. 
+ ExprResult ParseMSAsmIdentifier(llvm::SmallVectorImpl &LineToks, + unsigned &NumLineToksConsumed, + bool IsUnevaluated); + +private: + /// ParseAsmStatement - Parse a GNU extended asm statement. + /// \verbatim + /// asm-statement: + /// gnu-asm-statement + /// ms-asm-statement + /// + /// [GNU] gnu-asm-statement: + /// 'asm' asm-qualifier-list[opt] '(' asm-argument ')' ';' + /// + /// [GNU] asm-argument: + /// asm-string-literal + /// asm-string-literal ':' asm-operands[opt] + /// asm-string-literal ':' asm-operands[opt] ':' asm-operands[opt] + /// asm-string-literal ':' asm-operands[opt] ':' asm-operands[opt] + /// ':' asm-clobbers + /// + /// [GNU] asm-clobbers: + /// asm-string-literal + /// asm-clobbers ',' asm-string-literal + /// \endverbatim + /// + StmtResult ParseAsmStatement(bool &msAsm); + + /// ParseMicrosoftAsmStatement. When -fms-extensions/-fasm-blocks is enabled, + /// this routine is called to collect the tokens for an MS asm statement. + /// + /// \verbatim + /// [MS] ms-asm-statement: + /// ms-asm-block + /// ms-asm-block ms-asm-statement + /// + /// [MS] ms-asm-block: + /// '__asm' ms-asm-line '\n' + /// '__asm' '{' ms-asm-instruction-block[opt] '}' ';'[opt] + /// + /// [MS] ms-asm-instruction-block + /// ms-asm-line + /// ms-asm-line '\n' ms-asm-instruction-block + /// \endverbatim + /// + StmtResult ParseMicrosoftAsmStatement(SourceLocation AsmLoc); + + /// ParseAsmOperands - Parse the asm-operands production as used by + /// asm-statement, assuming the leading ':' token was eaten. + /// + /// \verbatim + /// [GNU] asm-operands: + /// asm-operand + /// asm-operands ',' asm-operand + /// + /// [GNU] asm-operand: + /// asm-string-literal '(' expression ')' + /// '[' identifier ']' asm-string-literal '(' expression ')' + /// \endverbatim + /// + // FIXME: Avoid unnecessary std::string trashing. 
+ bool ParseAsmOperandsOpt(SmallVectorImpl &Names, + SmallVectorImpl &Constraints, + SmallVectorImpl &Exprs); + + class GNUAsmQualifiers { + unsigned Qualifiers = AQ_unspecified; + + public: + enum AQ { + AQ_unspecified = 0, + AQ_volatile = 1, + AQ_inline = 2, + AQ_goto = 4, + }; + static const char *getQualifierName(AQ Qualifier); + bool setAsmQualifier(AQ Qualifier); + inline bool isVolatile() const { return Qualifiers & AQ_volatile; }; + inline bool isInline() const { return Qualifiers & AQ_inline; }; + inline bool isGoto() const { return Qualifiers & AQ_goto; } + }; + + // Determine if this is a GCC-style asm statement. + bool isGCCAsmStatement(const Token &TokAfterAsm) const; + + bool isGNUAsmQualifier(const Token &TokAfterAsm) const; + GNUAsmQualifiers::AQ getGNUAsmQualifier(const Token &Tok) const; + + /// parseGNUAsmQualifierListOpt - Parse a GNU extended asm qualifier list. + /// \verbatim + /// asm-qualifier: + /// volatile + /// inline + /// goto + /// + /// asm-qualifier-list: + /// asm-qualifier + /// asm-qualifier-list asm-qualifier + /// \endverbatim + bool parseGNUAsmQualifierListOpt(GNUAsmQualifiers &AQ); + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name C++ Templates + /// Implementations are in ParseTemplate.cpp + ///@{ + +public: + typedef SmallVector TemplateParameterLists; + + /// Re-enter a possible template scope, creating as many template parameter + /// scopes as necessary. + /// \return The number of template parameter scopes entered. + unsigned ReenterTemplateScopes(MultiParseScope &S, Decl *D); + +private: + /// The "depth" of the template parameters currently being parsed. + unsigned TemplateParameterDepth; + + /// RAII class that manages the template parameter depth. 
+ class TemplateParameterDepthRAII { + unsigned &Depth; + unsigned AddedLevels; + + public: + explicit TemplateParameterDepthRAII(unsigned &Depth) + : Depth(Depth), AddedLevels(0) {} + + ~TemplateParameterDepthRAII() { Depth -= AddedLevels; } + + void operator++() { + ++Depth; + ++AddedLevels; + } + void addDepth(unsigned D) { + Depth += D; + AddedLevels += D; + } + void setAddedDepth(unsigned D) { + Depth = Depth - AddedLevels + D; + AddedLevels = D; + } + + unsigned getDepth() const { return Depth; } + unsigned getOriginalDepth() const { return Depth - AddedLevels; } + }; + + /// Gathers and cleans up TemplateIdAnnotations when parsing of a + /// top-level declaration is finished. + SmallVector TemplateIds; + + /// Don't destroy template annotations in MaybeDestroyTemplateIds even if + /// we're at the end of a declaration. Instead, we defer the destruction until + /// after a top-level declaration. + /// Use DelayTemplateIdDestructionRAII rather than setting it directly. + bool DelayTemplateIdDestruction = false; + + void MaybeDestroyTemplateIds() { + if (DelayTemplateIdDestruction) + return; + if (!TemplateIds.empty() && + (Tok.is(tok::eof) || !PP.mightHavePendingAnnotationTokens())) + DestroyTemplateIds(); + } + void DestroyTemplateIds(); + + /// RAII object to destroy TemplateIdAnnotations where possible, from a + /// likely-good position during parsing. 
+ struct DestroyTemplateIdAnnotationsRAIIObj { + Parser &Self; + + DestroyTemplateIdAnnotationsRAIIObj(Parser &Self) : Self(Self) {} + ~DestroyTemplateIdAnnotationsRAIIObj() { Self.MaybeDestroyTemplateIds(); } + }; + + struct DelayTemplateIdDestructionRAII { + Parser &Self; + bool PrevDelayTemplateIdDestruction; + + DelayTemplateIdDestructionRAII(Parser &Self, + bool DelayTemplateIdDestruction) noexcept + : Self(Self), + PrevDelayTemplateIdDestruction(Self.DelayTemplateIdDestruction) { + Self.DelayTemplateIdDestruction = DelayTemplateIdDestruction; + } + + ~DelayTemplateIdDestructionRAII() noexcept { + Self.DelayTemplateIdDestruction = PrevDelayTemplateIdDestruction; + } + }; + + /// Identifiers which have been declared within a tentative parse. + SmallVector TentativelyDeclaredIdentifiers; + + /// Tracker for '<' tokens that might have been intended to be treated as an + /// angle bracket instead of a less-than comparison. + /// + /// This happens when the user intends to form a template-id, but typoes the + /// template-name or forgets a 'template' keyword for a dependent template + /// name. + /// + /// We track these locations from the point where we see a '<' with a + /// name-like expression on its left until we see a '>' or '>>' that might + /// match it. + struct AngleBracketTracker { + /// Flags used to rank candidate template names when there is more than one + /// '<' in a scope. + enum Priority : unsigned short { + /// A non-dependent name that is a potential typo for a template name. + PotentialTypo = 0x0, + /// A dependent name that might instantiate to a template-name. + DependentName = 0x2, + + /// A space appears before the '<' token. 
+ SpaceBeforeLess = 0x0,
+ /// No space before the '<' token
+ NoSpaceBeforeLess = 0x1,
+
+ LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue*/ DependentName)
+ };
+
+ struct Loc {
+ Expr *TemplateName;
+ SourceLocation LessLoc;
+ AngleBracketTracker::Priority Priority;
+ unsigned short ParenCount, BracketCount, BraceCount;
+
+ bool isActive(Parser &P) const {
+ return P.ParenCount == ParenCount && P.BracketCount == BracketCount &&
+ P.BraceCount == BraceCount;
+ }
+
+ bool isActiveOrNested(Parser &P) const {
+ return isActive(P) || P.ParenCount > ParenCount ||
+ P.BracketCount > BracketCount || P.BraceCount > BraceCount;
+ }
+ };
+
+ SmallVector<Loc, 8> Locs;
+
+ /// Add an expression that might have been intended to be a template name.
+ /// In the case of ambiguity, we arbitrarily select the innermost such
+ /// expression, for example in 'foo < bar < baz', 'bar' is the current
+ /// candidate. No attempt is made to track that 'foo' is also a candidate
+ /// for the case where we see a second suspicious '>' token.
+ void add(Parser &P, Expr *TemplateName, SourceLocation LessLoc,
+ Priority Prio) {
+ if (!Locs.empty() && Locs.back().isActive(P)) {
+ if (Locs.back().Priority <= Prio) {
+ Locs.back().TemplateName = TemplateName;
+ Locs.back().LessLoc = LessLoc;
+ Locs.back().Priority = Prio;
+ }
+ } else {
+ Locs.push_back({TemplateName, LessLoc, Prio, P.ParenCount,
+ P.BracketCount, P.BraceCount});
+ }
+ }
+
+ /// Mark the current potential missing template location as having been
+ /// handled (this happens if we pass a "corresponding" '>' or '>>' token
+ /// or leave a bracket scope).
+ void clear(Parser &P) {
+ while (!Locs.empty() && Locs.back().isActiveOrNested(P))
+ Locs.pop_back();
+ }
+
+ /// Get the current enclosing expression that might have been intended to be
+ /// a template name. 
+ Loc *getCurrent(Parser &P) { + if (!Locs.empty() && Locs.back().isActive(P)) + return &Locs.back(); + return nullptr; + } + }; + + AngleBracketTracker AngleBrackets; + + /// Contains information about any template-specific + /// information that has been parsed prior to parsing declaration + /// specifiers. + struct ParsedTemplateInfo { + ParsedTemplateInfo() + : Kind(ParsedTemplateKind::NonTemplate), TemplateParams(nullptr) {} + + ParsedTemplateInfo(TemplateParameterLists *TemplateParams, + bool isSpecialization, + bool lastParameterListWasEmpty = false) + : Kind(isSpecialization ? ParsedTemplateKind::ExplicitSpecialization + : ParsedTemplateKind::Template), + TemplateParams(TemplateParams), + LastParameterListWasEmpty(lastParameterListWasEmpty) {} + + explicit ParsedTemplateInfo(SourceLocation ExternLoc, + SourceLocation TemplateLoc) + : Kind(ParsedTemplateKind::ExplicitInstantiation), + TemplateParams(nullptr), ExternLoc(ExternLoc), + TemplateLoc(TemplateLoc), LastParameterListWasEmpty(false) {} + + ParsedTemplateKind Kind; + + /// The template parameter lists, for template declarations + /// and explicit specializations. + TemplateParameterLists *TemplateParams; + + /// The location of the 'extern' keyword, if any, for an explicit + /// instantiation + SourceLocation ExternLoc; + + /// The location of the 'template' keyword, for an explicit + /// instantiation. + SourceLocation TemplateLoc; + + /// Whether the last template parameter list was empty. + bool LastParameterListWasEmpty; + + SourceRange getSourceRange() const LLVM_READONLY; + }; + + /// Lex a delayed template function for late parsing. + void LexTemplateFunctionForLateParsing(CachedTokens &Toks); + + /// Late parse a C++ function template in Microsoft mode. 
+ void ParseLateTemplatedFuncDef(LateParsedTemplate &LPT); + + static void LateTemplateParserCallback(void *P, LateParsedTemplate &LPT); + + /// We've parsed something that could plausibly be intended to be a template + /// name (\p LHS) followed by a '<' token, and the following code can't + /// possibly be an expression. Determine if this is likely to be a template-id + /// and if so, diagnose it. + bool diagnoseUnknownTemplateId(ExprResult TemplateName, SourceLocation Less); + + void checkPotentialAngleBracket(ExprResult &PotentialTemplateName); + bool checkPotentialAngleBracketDelimiter(const AngleBracketTracker::Loc &, + const Token &OpToken); + bool checkPotentialAngleBracketDelimiter(const Token &OpToken) { + if (auto *Info = AngleBrackets.getCurrent(*this)) + return checkPotentialAngleBracketDelimiter(*Info, OpToken); + return false; + } + + //===--------------------------------------------------------------------===// + // C++ 14: Templates [temp] + + /// Parse a template declaration, explicit instantiation, or + /// explicit specialization. + DeclGroupPtrTy + ParseDeclarationStartingWithTemplate(DeclaratorContext Context, + SourceLocation &DeclEnd, + ParsedAttributes &AccessAttrs); + + /// Parse a template declaration or an explicit specialization. + /// + /// Template declarations include one or more template parameter lists + /// and either the function or class template declaration. Explicit + /// specializations contain one or more 'template < >' prefixes + /// followed by a (possibly templated) declaration. Since the + /// syntactic form of both features is nearly identical, we parse all + /// of the template headers together and let semantic analysis sort + /// the declarations from the explicit specializations. 
+ /// + /// \verbatim + /// template-declaration: [C++ temp] + /// 'export'[opt] 'template' '<' template-parameter-list '>' declaration + /// + /// template-declaration: [C++2a] + /// template-head declaration + /// template-head concept-definition + /// + /// TODO: requires-clause + /// template-head: [C++2a] + /// 'template' '<' template-parameter-list '>' + /// requires-clause[opt] + /// + /// explicit-specialization: [ C++ temp.expl.spec] + /// 'template' '<' '>' declaration + /// \endverbatim + DeclGroupPtrTy ParseTemplateDeclarationOrSpecialization( + DeclaratorContext Context, SourceLocation &DeclEnd, + ParsedAttributes &AccessAttrs, AccessSpecifier AS); + + clang::Parser::DeclGroupPtrTy ParseTemplateDeclarationOrSpecialization( + DeclaratorContext Context, SourceLocation &DeclEnd, AccessSpecifier AS); + + /// Parse a single declaration that declares a template, + /// template specialization, or explicit instantiation of a template. + /// + /// \param DeclEnd will receive the source location of the last token + /// within this declaration. + /// + /// \param AS the access specifier associated with this + /// declaration. Will be AS_none for namespace-scope declarations. + /// + /// \returns the new declaration. + DeclGroupPtrTy ParseDeclarationAfterTemplate( + DeclaratorContext Context, ParsedTemplateInfo &TemplateInfo, + ParsingDeclRAIIObject &DiagsFromParams, SourceLocation &DeclEnd, + ParsedAttributes &AccessAttrs, AccessSpecifier AS = AS_none); + + /// ParseTemplateParameters - Parses a template-parameter-list enclosed in + /// angle brackets. Depth is the depth of this template-parameter-list, which + /// is the number of template headers directly enclosing this template header. + /// TemplateParams is the current list of template parameters we're building. + /// The template parameter we parse will be added to this list. 
LAngleLoc and + /// RAngleLoc will receive the positions of the '<' and '>', respectively, + /// that enclose this template parameter list. + /// + /// \returns true if an error occurred, false otherwise. + bool ParseTemplateParameters(MultiParseScope &TemplateScopes, unsigned Depth, + SmallVectorImpl &TemplateParams, + SourceLocation &LAngleLoc, + SourceLocation &RAngleLoc); + + /// ParseTemplateParameterList - Parse a template parameter list. If + /// the parsing fails badly (i.e., closing bracket was left out), this + /// will try to put the token stream in a reasonable position (closing + /// a statement, etc.) and return false. + /// + /// \verbatim + /// template-parameter-list: [C++ temp] + /// template-parameter + /// template-parameter-list ',' template-parameter + /// \endverbatim + bool ParseTemplateParameterList(unsigned Depth, + SmallVectorImpl &TemplateParams); + + enum class TPResult; + + /// Determine whether the parser is at the start of a template + /// type parameter. + TPResult isStartOfTemplateTypeParameter(); + + /// ParseTemplateParameter - Parse a template-parameter (C++ [temp.param]). + /// + /// \verbatim + /// template-parameter: [C++ temp.param] + /// type-parameter + /// parameter-declaration + /// + /// type-parameter: (See below) + /// type-parameter-key ...[opt] identifier[opt] + /// type-parameter-key identifier[opt] = type-id + /// (C++2a) type-constraint ...[opt] identifier[opt] + /// (C++2a) type-constraint identifier[opt] = type-id + /// 'template' '<' template-parameter-list '>' type-parameter-key + /// ...[opt] identifier[opt] + /// 'template' '<' template-parameter-list '>' type-parameter-key + /// identifier[opt] '=' id-expression + /// + /// type-parameter-key: + /// class + /// typename + /// \endverbatim + /// + NamedDecl *ParseTemplateParameter(unsigned Depth, unsigned Position); + + /// ParseTypeParameter - Parse a template type parameter (C++ [temp.param]). 
+ /// Other kinds of template parameters are parsed in + /// ParseTemplateTemplateParameter and ParseNonTypeTemplateParameter. + /// + /// \verbatim + /// type-parameter: [C++ temp.param] + /// 'class' ...[opt][C++0x] identifier[opt] + /// 'class' identifier[opt] '=' type-id + /// 'typename' ...[opt][C++0x] identifier[opt] + /// 'typename' identifier[opt] '=' type-id + /// \endverbatim + NamedDecl *ParseTypeParameter(unsigned Depth, unsigned Position); + + /// ParseTemplateTemplateParameter - Handle the parsing of template + /// template parameters. + /// + /// \verbatim + /// type-parameter: [C++ temp.param] + /// template-head type-parameter-key ...[opt] identifier[opt] + /// template-head type-parameter-key identifier[opt] = id-expression + /// type-parameter-key: + /// 'class' + /// 'typename' [C++1z] + /// template-head: [C++2a] + /// 'template' '<' template-parameter-list '>' + /// requires-clause[opt] + /// \endverbatim + NamedDecl *ParseTemplateTemplateParameter(unsigned Depth, unsigned Position); + + /// ParseNonTypeTemplateParameter - Handle the parsing of non-type + /// template parameters (e.g., in "template class array;"). + /// + /// \verbatim + /// template-parameter: + /// ... + /// parameter-declaration + /// \endverbatim + NamedDecl *ParseNonTypeTemplateParameter(unsigned Depth, unsigned Position); + + /// Check whether the current token is a template-id annotation denoting a + /// type-constraint. + bool isTypeConstraintAnnotation(); + + /// Try parsing a type-constraint at the current location. + /// + /// \verbatim + /// type-constraint: + /// nested-name-specifier[opt] concept-name + /// nested-name-specifier[opt] concept-name + /// '<' template-argument-list[opt] '>'[opt] + /// \endverbatim + /// + /// \returns true if an error occurred, and false otherwise. 
+ bool TryAnnotateTypeConstraint(); + + void DiagnoseMisplacedEllipsis(SourceLocation EllipsisLoc, + SourceLocation CorrectLoc, + bool AlreadyHasEllipsis, + bool IdentifierHasName); + void DiagnoseMisplacedEllipsisInDeclarator(SourceLocation EllipsisLoc, + Declarator &D); + // C++ 14.3: Template arguments [temp.arg] + typedef SmallVector TemplateArgList; + + /// Parses a '>' at the end of a template list. + /// + /// If this function encounters '>>', '>>>', '>=', or '>>=', it tries + /// to determine if these tokens were supposed to be a '>' followed by + /// '>', '>>', '>=', or '>='. It emits an appropriate diagnostic if necessary. + /// + /// \param RAngleLoc the location of the consumed '>'. + /// + /// \param ConsumeLastToken if true, the '>' is consumed. + /// + /// \param ObjCGenericList if true, this is the '>' closing an Objective-C + /// type parameter or type argument list, rather than a C++ template parameter + /// or argument list. + /// + /// \returns true, if current token does not start with '>', false otherwise. + bool ParseGreaterThanInTemplateList(SourceLocation LAngleLoc, + SourceLocation &RAngleLoc, + bool ConsumeLastToken, + bool ObjCGenericList); + + /// Parses a template-id that after the template name has + /// already been parsed. + /// + /// This routine takes care of parsing the enclosed template argument + /// list ('<' template-parameter-list [opt] '>') and placing the + /// results into a form that can be transferred to semantic analysis. + /// + /// \param ConsumeLastToken if true, then we will consume the last + /// token that forms the template-id. Otherwise, we will leave the + /// last token in the stream (e.g., so that it can be replaced with an + /// annotation token). 
+ bool ParseTemplateIdAfterTemplateName(bool ConsumeLastToken, + SourceLocation &LAngleLoc, + TemplateArgList &TemplateArgs, + SourceLocation &RAngleLoc, + TemplateTy NameHint = nullptr); + + /// Replace the tokens that form a simple-template-id with an + /// annotation token containing the complete template-id. + /// + /// The first token in the stream must be the name of a template that + /// is followed by a '<'. This routine will parse the complete + /// simple-template-id and replace the tokens with a single annotation + /// token with one of two different kinds: if the template-id names a + /// type (and \p AllowTypeAnnotation is true), the annotation token is + /// a type annotation that includes the optional nested-name-specifier + /// (\p SS). Otherwise, the annotation token is a template-id + /// annotation that does not include the optional + /// nested-name-specifier. + /// + /// \param Template the declaration of the template named by the first + /// token (an identifier), as returned from \c Action::isTemplateName(). + /// + /// \param TNK the kind of template that \p Template + /// refers to, as returned from \c Action::isTemplateName(). + /// + /// \param SS if non-NULL, the nested-name-specifier that precedes + /// this template name. + /// + /// \param TemplateKWLoc if valid, specifies that this template-id + /// annotation was preceded by the 'template' keyword and gives the + /// location of that keyword. If invalid (the default), then this + /// template-id was not preceded by a 'template' keyword. + /// + /// \param AllowTypeAnnotation if true (the default), then a + /// simple-template-id that refers to a class template, template + /// template parameter, or other template that produces a type will be + /// replaced with a type annotation token. Otherwise, the + /// simple-template-id is always replaced with a template-id + /// annotation token. 
+ /// + /// \param TypeConstraint if true, then this is actually a type-constraint, + /// meaning that the template argument list can be omitted (and the template + /// in question must be a concept). + /// + /// If an unrecoverable parse error occurs and no annotation token can be + /// formed, this function returns true. + /// + bool AnnotateTemplateIdToken(TemplateTy Template, TemplateNameKind TNK, + CXXScopeSpec &SS, SourceLocation TemplateKWLoc, + UnqualifiedId &TemplateName, + bool AllowTypeAnnotation = true, + bool TypeConstraint = false); + + /// Replaces a template-id annotation token with a type + /// annotation token. + /// + /// If there was a failure when forming the type from the template-id, + /// a type annotation token will still be created, but will have a + /// NULL type pointer to signify an error. + /// + /// \param SS The scope specifier appearing before the template-id, if any. + /// + /// \param AllowImplicitTypename whether this is a context where T::type + /// denotes a dependent type. + /// \param IsClassName Is this template-id appearing in a context where we + /// know it names a class, such as in an elaborated-type-specifier or + /// base-specifier? ('typename' and 'template' are unneeded and disallowed + /// in those contexts.) + void + AnnotateTemplateIdTokenAsType(CXXScopeSpec &SS, + ImplicitTypenameContext AllowImplicitTypename, + bool IsClassName = false); + + /// ParseTemplateArgumentList - Parse a C++ template-argument-list + /// (C++ [temp.names]). Returns true if there was an error. + /// + /// \verbatim + /// template-argument-list: [C++ 14.2] + /// template-argument + /// template-argument-list ',' template-argument + /// \endverbatim + /// + /// \param Template is only used for code completion, and may be null. + bool ParseTemplateArgumentList(TemplateArgList &TemplateArgs, + TemplateTy Template, SourceLocation OpenLoc); + + /// Parse a C++ template template argument. 
+ ParsedTemplateArgument ParseTemplateTemplateArgument(); + + /// ParseTemplateArgument - Parse a C++ template argument (C++ [temp.names]). + /// + /// \verbatim + /// template-argument: [C++ 14.2] + /// constant-expression + /// type-id + /// id-expression + /// braced-init-list [C++26, DR] + /// \endverbatim + /// + ParsedTemplateArgument ParseTemplateArgument(); + + /// Parse a C++ explicit template instantiation + /// (C++ [temp.explicit]). + /// + /// \verbatim + /// explicit-instantiation: + /// 'extern' [opt] 'template' declaration + /// \endverbatim + /// + /// Note that the 'extern' is a GNU extension and C++11 feature. + DeclGroupPtrTy ParseExplicitInstantiation(DeclaratorContext Context, + SourceLocation ExternLoc, + SourceLocation TemplateLoc, + SourceLocation &DeclEnd, + ParsedAttributes &AccessAttrs, + AccessSpecifier AS = AS_none); + + /// \brief Parse a single declaration that declares a concept. + /// + /// \param DeclEnd will receive the source location of the last token + /// within this declaration. + /// + /// \returns the new declaration. + Decl *ParseConceptDefinition(const ParsedTemplateInfo &TemplateInfo, + SourceLocation &DeclEnd); + + ///@} + + // + // + // ------------------------------------------------------------------------- + // + // + + /// \name Tentative Parsing + /// Implementations are in ParseTentative.cpp + ///@{ + +private: + /// TentativeParsingAction - An object that is used as a kind of "tentative + /// parsing transaction". It gets instantiated to mark the token position and + /// after the token consumption is done, Commit() or Revert() is called to + /// either "commit the consumed tokens" or revert to the previously marked + /// token position. Example: + /// + /// TentativeParsingAction TPA(*this); + /// ConsumeToken(); + /// .... + /// TPA.Revert(); + /// + /// If the Unannotated parameter is true, any token annotations created + /// during the tentative parse are reverted. 
+ class TentativeParsingAction { + Parser &P; + PreferredTypeBuilder PrevPreferredType; + Token PrevTok; + size_t PrevTentativelyDeclaredIdentifierCount; + unsigned short PrevParenCount, PrevBracketCount, PrevBraceCount; + bool isActive; + + public: + explicit TentativeParsingAction(Parser &p, bool Unannotated = false) + : P(p), PrevPreferredType(P.PreferredType) { + PrevTok = P.Tok; + PrevTentativelyDeclaredIdentifierCount = + P.TentativelyDeclaredIdentifiers.size(); + PrevParenCount = P.ParenCount; + PrevBracketCount = P.BracketCount; + PrevBraceCount = P.BraceCount; + P.PP.EnableBacktrackAtThisPos(Unannotated); + isActive = true; + } + void Commit() { + assert(isActive && "Parsing action was finished!"); + P.TentativelyDeclaredIdentifiers.resize( + PrevTentativelyDeclaredIdentifierCount); + P.PP.CommitBacktrackedTokens(); + isActive = false; + } + void Revert() { + assert(isActive && "Parsing action was finished!"); + P.PP.Backtrack(); + P.PreferredType = PrevPreferredType; + P.Tok = PrevTok; + P.TentativelyDeclaredIdentifiers.resize( + PrevTentativelyDeclaredIdentifierCount); + P.ParenCount = PrevParenCount; + P.BracketCount = PrevBracketCount; + P.BraceCount = PrevBraceCount; + isActive = false; + } + ~TentativeParsingAction() { + assert(!isActive && "Forgot to call Commit or Revert!"); + } + }; + + /// A TentativeParsingAction that automatically reverts in its destructor. + /// Useful for disambiguation parses that will always be reverted. + class RevertingTentativeParsingAction + : private Parser::TentativeParsingAction { + public: + using TentativeParsingAction::TentativeParsingAction; + + ~RevertingTentativeParsingAction() { Revert(); } + }; + + /// isCXXDeclarationStatement - C++-specialized function that disambiguates + /// between a declaration or an expression statement, when parsing function + /// bodies. Returns true for declaration, false for expression. 
+ /// + /// \verbatim + /// declaration-statement: + /// block-declaration + /// + /// block-declaration: + /// simple-declaration + /// asm-definition + /// namespace-alias-definition + /// using-declaration + /// using-directive + /// [C++0x] static_assert-declaration + /// + /// asm-definition: + /// 'asm' '(' string-literal ')' ';' + /// + /// namespace-alias-definition: + /// 'namespace' identifier = qualified-namespace-specifier ';' + /// + /// using-declaration: + /// 'using' typename[opt] '::'[opt] nested-name-specifier + /// unqualified-id ';' + /// 'using' '::' unqualified-id ; + /// + /// using-directive: + /// 'using' 'namespace' '::'[opt] nested-name-specifier[opt] + /// namespace-name ';' + /// \endverbatim + /// + bool isCXXDeclarationStatement(bool DisambiguatingWithExpression = false); + + /// isCXXSimpleDeclaration - C++-specialized function that disambiguates + /// between a simple-declaration or an expression-statement. + /// If during the disambiguation process a parsing error is encountered, + /// the function returns true to let the declaration parsing code handle it. + /// Returns false if the statement is disambiguated as expression. + /// + /// \verbatim + /// simple-declaration: + /// decl-specifier-seq init-declarator-list[opt] ';' + /// decl-specifier-seq ref-qualifier[opt] '[' identifier-list ']' + /// brace-or-equal-initializer ';' [C++17] + /// \endverbatim + /// + /// (if AllowForRangeDecl specified) + /// for ( for-range-declaration : for-range-initializer ) statement + /// + /// \verbatim + /// for-range-declaration: + /// decl-specifier-seq declarator + /// decl-specifier-seq ref-qualifier[opt] '[' identifier-list ']' + /// \endverbatim + /// + /// In any of the above cases there can be a preceding + /// attribute-specifier-seq, but the caller is expected to handle that. 
+ bool isCXXSimpleDeclaration(bool AllowForRangeDecl); + + /// isCXXFunctionDeclarator - Disambiguates between a function declarator or + /// a constructor-style initializer, when parsing declaration statements. + /// Returns true for function declarator and false for constructor-style + /// initializer. Sets 'IsAmbiguous' to true to indicate that this declaration + /// might be a constructor-style initializer. + /// If during the disambiguation process a parsing error is encountered, + /// the function returns true to let the declaration parsing code handle it. + /// + /// '(' parameter-declaration-clause ')' cv-qualifier-seq[opt] + /// exception-specification[opt] + /// + bool isCXXFunctionDeclarator(bool *IsAmbiguous = nullptr, + ImplicitTypenameContext AllowImplicitTypename = + ImplicitTypenameContext::No); + + struct ConditionDeclarationOrInitStatementState; + enum class ConditionOrInitStatement { + Expression, ///< Disambiguated as an expression (either kind). + ConditionDecl, ///< Disambiguated as the declaration form of condition. + InitStmtDecl, ///< Disambiguated as a simple-declaration init-statement. + ForRangeDecl, ///< Disambiguated as a for-range declaration. + Error ///< Can't be any of the above! + }; + + /// Disambiguates between a declaration in a condition, a + /// simple-declaration in an init-statement, and an expression for + /// a condition of a if/switch statement. 
+ /// + /// \verbatim + /// condition: + /// expression + /// type-specifier-seq declarator '=' assignment-expression + /// [C++11] type-specifier-seq declarator '=' initializer-clause + /// [C++11] type-specifier-seq declarator braced-init-list + /// [GNU] type-specifier-seq declarator simple-asm-expr[opt] attributes[opt] + /// '=' assignment-expression + /// simple-declaration: + /// decl-specifier-seq init-declarator-list[opt] ';' + /// \endverbatim + /// + /// Note that, unlike isCXXSimpleDeclaration, we must disambiguate all the way + /// to the ';' to disambiguate cases like 'int(x))' (an expression) from + /// 'int(x);' (a simple-declaration in an init-statement). + ConditionOrInitStatement + isCXXConditionDeclarationOrInitStatement(bool CanBeInitStmt, + bool CanBeForRangeDecl); + + /// Determine whether the next set of tokens contains a type-id. + /// + /// The context parameter states what context we're parsing right + /// now, which affects how this routine copes with the token + /// following the type-id. If the context is + /// TentativeCXXTypeIdContext::InParens, we have already parsed the '(' and we + /// will cease lookahead when we hit the corresponding ')'. If the context is + /// TentativeCXXTypeIdContext::AsTemplateArgument, we've already parsed the + /// '<' or ',' before this template argument, and will cease lookahead when we + /// hit a + /// '>', '>>' (in C++0x), or ','; or, in C++0x, an ellipsis immediately + /// preceding such. Returns true for a type-id and false for an expression. + /// If during the disambiguation process a parsing error is encountered, + /// the function returns true to let the declaration parsing code handle it. 
+ /// + /// \verbatim + /// type-id: + /// type-specifier-seq abstract-declarator[opt] + /// \endverbatim + /// + bool isCXXTypeId(TentativeCXXTypeIdContext Context, bool &isAmbiguous); + + bool isCXXTypeId(TentativeCXXTypeIdContext Context) { + bool isAmbiguous; + return isCXXTypeId(Context, isAmbiguous); + } + + /// TPResult - Used as the result value for functions whose purpose is to + /// disambiguate C++ constructs by "tentatively parsing" them. + enum class TPResult { True, False, Ambiguous, Error }; + + /// Determine whether we could have an enum-base. + /// + /// \p AllowSemi If \c true, then allow a ';' after the enum-base; otherwise + /// only consider this to be an enum-base if the next token is a '{'. + /// + /// \return \c false if this cannot possibly be an enum base; \c true + /// otherwise. + bool isEnumBase(bool AllowSemi); + + /// isCXXDeclarationSpecifier - Returns TPResult::True if it is a declaration + /// specifier, TPResult::False if it is not, TPResult::Ambiguous if it could + /// be either a decl-specifier or a function-style cast, and TPResult::Error + /// if a parsing error was found and reported. + /// + /// Does not consume tokens. + /// + /// If InvalidAsDeclSpec is not null, some cases that would be ill-formed as + /// declaration specifiers but possibly valid as some other kind of construct + /// return TPResult::Ambiguous instead of TPResult::False. When this happens, + /// the intent is to keep trying to disambiguate, on the basis that we might + /// find a better reason to treat this construct as a declaration later on. + /// When this happens and the name could possibly be valid in some other + /// syntactic context, *InvalidAsDeclSpec is set to 'true'. 
The current cases + /// that trigger this are: + /// + /// * When parsing X::Y (with no 'typename') where X is dependent + /// * When parsing X where X is undeclared + /// + /// \verbatim + /// decl-specifier: + /// storage-class-specifier + /// type-specifier + /// function-specifier + /// 'friend' + /// 'typedef' + /// [C++11] 'constexpr' + /// [C++20] 'consteval' + /// [GNU] attributes declaration-specifiers[opt] + /// + /// storage-class-specifier: + /// 'register' + /// 'static' + /// 'extern' + /// 'mutable' + /// 'auto' + /// [GNU] '__thread' + /// [C++11] 'thread_local' + /// [C11] '_Thread_local' + /// + /// function-specifier: + /// 'inline' + /// 'virtual' + /// 'explicit' + /// + /// typedef-name: + /// identifier + /// + /// type-specifier: + /// simple-type-specifier + /// class-specifier + /// enum-specifier + /// elaborated-type-specifier + /// typename-specifier + /// cv-qualifier + /// + /// simple-type-specifier: + /// '::'[opt] nested-name-specifier[opt] type-name + /// '::'[opt] nested-name-specifier 'template' + /// simple-template-id [TODO] + /// 'char' + /// 'wchar_t' + /// 'bool' + /// 'short' + /// 'int' + /// 'long' + /// 'signed' + /// 'unsigned' + /// 'float' + /// 'double' + /// 'void' + /// [GNU] typeof-specifier + /// [GNU] '_Complex' + /// [C++11] 'auto' + /// [GNU] '__auto_type' + /// [C++11] 'decltype' ( expression ) + /// [C++1y] 'decltype' ( 'auto' ) + /// + /// type-name: + /// class-name + /// enum-name + /// typedef-name + /// + /// elaborated-type-specifier: + /// class-key '::'[opt] nested-name-specifier[opt] identifier + /// class-key '::'[opt] nested-name-specifier[opt] 'template'[opt] + /// simple-template-id + /// 'enum' '::'[opt] nested-name-specifier[opt] identifier + /// + /// enum-name: + /// identifier + /// + /// enum-specifier: + /// 'enum' identifier[opt] '{' enumerator-list[opt] '}' + /// 'enum' identifier[opt] '{' enumerator-list ',' '}' + /// + /// class-specifier: + /// class-head '{' 
member-specification[opt] '}' + /// + /// class-head: + /// class-key identifier[opt] base-clause[opt] + /// class-key nested-name-specifier identifier base-clause[opt] + /// class-key nested-name-specifier[opt] simple-template-id + /// base-clause[opt] + /// + /// class-key: + /// 'class' + /// 'struct' + /// 'union' + /// + /// cv-qualifier: + /// 'const' + /// 'volatile' + /// [GNU] restrict + /// \endverbatim + /// + TPResult + isCXXDeclarationSpecifier(ImplicitTypenameContext AllowImplicitTypename, + TPResult BracedCastResult = TPResult::False, + bool *InvalidAsDeclSpec = nullptr); + + /// Given that isCXXDeclarationSpecifier returns \c TPResult::True or + /// \c TPResult::Ambiguous, determine whether the decl-specifier would be + /// a type-specifier other than a cv-qualifier. + bool isCXXDeclarationSpecifierAType(); - /// Parses the 'size-expr', which is an integral value, or an asterisk. - /// Asterisk is represented by a OpenACCAsteriskSizeExpr - ExprResult ParseOpenACCSizeExpr(OpenACCClauseKind CK); - /// Parses a comma delimited list of 'size-expr's. - bool ParseOpenACCSizeExprList(OpenACCClauseKind CK, - llvm::SmallVectorImpl &SizeExprs); - /// Parses a 'gang-arg-list', used for the 'gang' clause. - bool ParseOpenACCGangArgList(SourceLocation GangLoc, - llvm::SmallVectorImpl &GKs, - llvm::SmallVectorImpl &IntExprs); + /// Determine whether we might be looking at the '<' template-argument-list + /// '>' of a template-id or simple-template-id, rather than a less-than + /// comparison. This will often fail and produce an ambiguity, but should + /// never be wrong if it returns True or False. + TPResult isTemplateArgumentList(unsigned TokensToSkip); - using OpenACCGangArgRes = std::pair; - /// Parses a 'gang-arg', used for the 'gang' clause. Returns a pair of the - /// ExprResult (which contains the validity of the expression), plus the gang - /// kind for the current argument. 
- OpenACCGangArgRes ParseOpenACCGangArg(SourceLocation GangLoc); - /// Parses a 'condition' expr, ensuring it results in a - ExprResult ParseOpenACCConditionExpr(); - DeclGroupPtrTy - ParseOpenACCAfterRoutineDecl(AccessSpecifier &AS, ParsedAttributes &Attrs, - DeclSpec::TST TagType, Decl *TagDecl, - OpenACCDirectiveParseInfo &DirInfo); - StmtResult ParseOpenACCAfterRoutineStmt(OpenACCDirectiveParseInfo &DirInfo); + /// Determine whether an '(' after an 'explicit' keyword is part of a C++20 + /// 'explicit(bool)' declaration, in earlier language modes where that is an + /// extension. + TPResult isExplicitBool(); -private: - //===--------------------------------------------------------------------===// - // C++ 14: Templates [temp] + /// Determine whether an identifier has been tentatively declared as a + /// non-type. Such tentative declarations should not be found to name a type + /// during a tentative parse, but also should not be annotated as a non-type. + bool isTentativelyDeclared(IdentifierInfo *II); - // C++ 14.1: Template Parameters [temp.param] - DeclGroupPtrTy - ParseDeclarationStartingWithTemplate(DeclaratorContext Context, - SourceLocation &DeclEnd, - ParsedAttributes &AccessAttrs); - DeclGroupPtrTy ParseTemplateDeclarationOrSpecialization( - DeclaratorContext Context, SourceLocation &DeclEnd, - ParsedAttributes &AccessAttrs, AccessSpecifier AS); - clang::Parser::DeclGroupPtrTy ParseTemplateDeclarationOrSpecialization( - DeclaratorContext Context, SourceLocation &DeclEnd, AccessSpecifier AS); - DeclGroupPtrTy ParseDeclarationAfterTemplate( - DeclaratorContext Context, ParsedTemplateInfo &TemplateInfo, - ParsingDeclRAIIObject &DiagsFromParams, SourceLocation &DeclEnd, - ParsedAttributes &AccessAttrs, AccessSpecifier AS = AS_none); - bool ParseTemplateParameters(MultiParseScope &TemplateScopes, unsigned Depth, - SmallVectorImpl &TemplateParams, - SourceLocation &LAngleLoc, - SourceLocation &RAngleLoc); - bool ParseTemplateParameterList(unsigned Depth, - 
SmallVectorImpl &TemplateParams); - TPResult isStartOfTemplateTypeParameter(); - NamedDecl *ParseTemplateParameter(unsigned Depth, unsigned Position); - NamedDecl *ParseTypeParameter(unsigned Depth, unsigned Position); - NamedDecl *ParseTemplateTemplateParameter(unsigned Depth, unsigned Position); - NamedDecl *ParseNonTypeTemplateParameter(unsigned Depth, unsigned Position); - bool isTypeConstraintAnnotation(); - bool TryAnnotateTypeConstraint(); - void DiagnoseMisplacedEllipsis(SourceLocation EllipsisLoc, - SourceLocation CorrectLoc, - bool AlreadyHasEllipsis, - bool IdentifierHasName); - void DiagnoseMisplacedEllipsisInDeclarator(SourceLocation EllipsisLoc, - Declarator &D); - // C++ 14.3: Template arguments [temp.arg] - typedef SmallVector TemplateArgList; + // "Tentative parsing" functions, used for disambiguation. If a parsing error + // is encountered they will return TPResult::Error. + // Returning TPResult::True/False indicates that the ambiguity was + // resolved and tentative parsing may stop. TPResult::Ambiguous indicates + // that more tentative parsing is necessary for disambiguation. + // They all consume tokens, so backtracking should be used after calling them. 
- bool ParseGreaterThanInTemplateList(SourceLocation LAngleLoc, - SourceLocation &RAngleLoc, - bool ConsumeLastToken, - bool ObjCGenericList); - bool ParseTemplateIdAfterTemplateName(bool ConsumeLastToken, - SourceLocation &LAngleLoc, - TemplateArgList &TemplateArgs, - SourceLocation &RAngleLoc, - TemplateTy NameHint = nullptr); + /// \verbatim + /// simple-declaration: + /// decl-specifier-seq init-declarator-list[opt] ';' + /// + /// (if AllowForRangeDecl specified) + /// for ( for-range-declaration : for-range-initializer ) statement + /// for-range-declaration: + /// attribute-specifier-seqopt type-specifier-seq declarator + /// \endverbatim + /// + TPResult TryParseSimpleDeclaration(bool AllowForRangeDecl); - bool AnnotateTemplateIdToken(TemplateTy Template, TemplateNameKind TNK, - CXXScopeSpec &SS, - SourceLocation TemplateKWLoc, - UnqualifiedId &TemplateName, - bool AllowTypeAnnotation = true, - bool TypeConstraint = false); - void - AnnotateTemplateIdTokenAsType(CXXScopeSpec &SS, - ImplicitTypenameContext AllowImplicitTypename, - bool IsClassName = false); - bool ParseTemplateArgumentList(TemplateArgList &TemplateArgs, - TemplateTy Template, SourceLocation OpenLoc); - ParsedTemplateArgument ParseTemplateTemplateArgument(); - ParsedTemplateArgument ParseTemplateArgument(); - DeclGroupPtrTy ParseExplicitInstantiation(DeclaratorContext Context, - SourceLocation ExternLoc, - SourceLocation TemplateLoc, - SourceLocation &DeclEnd, - ParsedAttributes &AccessAttrs, - AccessSpecifier AS = AS_none); - // C++2a: Template, concept definition [temp] - Decl * - ParseConceptDefinition(const ParsedTemplateInfo &TemplateInfo, - SourceLocation &DeclEnd); + /// \verbatim + /// [GNU] typeof-specifier: + /// 'typeof' '(' expressions ')' + /// 'typeof' '(' type-name ')' + /// \endverbatim + /// + TPResult TryParseTypeofSpecifier(); - /// Parse the given string as a type. 
+ /// [ObjC] protocol-qualifiers: + /// '<' identifier-list '>' + TPResult TryParseProtocolQualifiers(); + + TPResult TryParsePtrOperatorSeq(); + + /// \verbatim + /// operator-function-id: + /// 'operator' operator /// - /// This is a dangerous utility function currently employed only by API notes. - /// It is not a general entry-point for safely parsing types from strings. + /// operator: one of + /// new delete new[] delete[] + - * / % ^ [...] /// - /// \param TypeStr The string to be parsed as a type. - /// \param Context The name of the context in which this string is being - /// parsed, which will be used in diagnostics. - /// \param IncludeLoc The location at which this parse was triggered. - TypeResult ParseTypeFromString(StringRef TypeStr, StringRef Context, - SourceLocation IncludeLoc); + /// conversion-function-id: + /// 'operator' conversion-type-id + /// + /// conversion-type-id: + /// type-specifier-seq conversion-declarator[opt] + /// + /// conversion-declarator: + /// ptr-operator conversion-declarator[opt] + /// + /// literal-operator-id: + /// 'operator' string-literal identifier + /// 'operator' user-defined-string-literal + /// \endverbatim + TPResult TryParseOperatorId(); - //===--------------------------------------------------------------------===// - // Modules - DeclGroupPtrTy ParseModuleDecl(Sema::ModuleImportState &ImportState); - Decl *ParseModuleImport(SourceLocation AtLoc, - Sema::ModuleImportState &ImportState); - bool parseMisplacedModuleImport(); - bool tryParseMisplacedModuleImport() { - tok::TokenKind Kind = Tok.getKind(); - if (Kind == tok::annot_module_begin || Kind == tok::annot_module_end || - Kind == tok::annot_module_include) - return parseMisplacedModuleImport(); - return false; - } + /// Tentatively parse an init-declarator-list in order to disambiguate it from + /// an expression. 
+ /// + /// \verbatim + /// init-declarator-list: + /// init-declarator + /// init-declarator-list ',' init-declarator + /// + /// init-declarator: + /// declarator initializer[opt] + /// [GNU] declarator simple-asm-expr[opt] attributes[opt] initializer[opt] + /// + /// initializer: + /// brace-or-equal-initializer + /// '(' expression-list ')' + /// + /// brace-or-equal-initializer: + /// '=' initializer-clause + /// [C++11] braced-init-list + /// + /// initializer-clause: + /// assignment-expression + /// braced-init-list + /// + /// braced-init-list: + /// '{' initializer-list ','[opt] '}' + /// '{' '}' + /// \endverbatim + /// + TPResult TryParseInitDeclaratorList(bool MayHaveTrailingReturnType = false); - bool ParseModuleName(SourceLocation UseLoc, - SmallVectorImpl &Path, bool IsImport); + /// \verbatim + /// declarator: + /// direct-declarator + /// ptr-operator declarator + /// + /// direct-declarator: + /// declarator-id + /// direct-declarator '(' parameter-declaration-clause ')' + /// cv-qualifier-seq[opt] exception-specification[opt] + /// direct-declarator '[' constant-expression[opt] ']' + /// '(' declarator ')' + /// [GNU] '(' attributes declarator ')' + /// + /// abstract-declarator: + /// ptr-operator abstract-declarator[opt] + /// direct-abstract-declarator + /// + /// direct-abstract-declarator: + /// direct-abstract-declarator[opt] + /// '(' parameter-declaration-clause ')' cv-qualifier-seq[opt] + /// exception-specification[opt] + /// direct-abstract-declarator[opt] '[' constant-expression[opt] ']' + /// '(' abstract-declarator ')' + /// [C++0x] ... 
+ /// + /// ptr-operator: + /// '*' cv-qualifier-seq[opt] + /// '&' + /// [C++0x] '&&' [TODO] + /// '::'[opt] nested-name-specifier '*' cv-qualifier-seq[opt] + /// + /// cv-qualifier-seq: + /// cv-qualifier cv-qualifier-seq[opt] + /// + /// cv-qualifier: + /// 'const' + /// 'volatile' + /// + /// declarator-id: + /// '...'[opt] id-expression + /// + /// id-expression: + /// unqualified-id + /// qualified-id [TODO] + /// + /// unqualified-id: + /// identifier + /// operator-function-id + /// conversion-function-id + /// literal-operator-id + /// '~' class-name [TODO] + /// '~' decltype-specifier [TODO] + /// template-id [TODO] + /// \endverbatim + /// + TPResult TryParseDeclarator(bool mayBeAbstract, bool mayHaveIdentifier = true, + bool mayHaveDirectInit = false, + bool mayHaveTrailingReturnType = false); - //===--------------------------------------------------------------------===// - // C++11/G++: Type Traits [Type-Traits.html in the GCC manual] - ExprResult ParseTypeTrait(); + /// \verbatim + /// parameter-declaration-clause: + /// parameter-declaration-list[opt] '...'[opt] + /// parameter-declaration-list ',' '...' 
+ /// + /// parameter-declaration-list: + /// parameter-declaration + /// parameter-declaration-list ',' parameter-declaration + /// + /// parameter-declaration: + /// attribute-specifier-seq[opt] decl-specifier-seq declarator attributes[opt] + /// attribute-specifier-seq[opt] decl-specifier-seq declarator attributes[opt] + /// '=' assignment-expression + /// attribute-specifier-seq[opt] decl-specifier-seq abstract-declarator[opt] + /// attributes[opt] + /// attribute-specifier-seq[opt] decl-specifier-seq abstract-declarator[opt] + /// attributes[opt] '=' assignment-expression + /// \endverbatim + /// + TPResult TryParseParameterDeclarationClause( + bool *InvalidAsDeclaration = nullptr, bool VersusTemplateArg = false, + ImplicitTypenameContext AllowImplicitTypename = + ImplicitTypenameContext::No); - //===--------------------------------------------------------------------===// - // Embarcadero: Arary and Expression Traits - ExprResult ParseArrayTypeTrait(); - ExprResult ParseExpressionTrait(); + /// TryParseFunctionDeclarator - We parsed a '(' and we want to try to + /// continue parsing as a function declarator. If TryParseFunctionDeclarator + /// fully parsed the function declarator, it will return TPResult::Ambiguous, + /// otherwise it will return either False() or Error(). + /// + /// \verbatim + /// '(' parameter-declaration-clause ')' cv-qualifier-seq[opt] + /// exception-specification[opt] + /// + /// exception-specification: + /// 'throw' '(' type-id-list[opt] ')' + /// \endverbatim + /// + TPResult TryParseFunctionDeclarator(bool MayHaveTrailingReturnType = false); - ExprResult ParseBuiltinPtrauthTypeDiscriminator(); + // When parsing an identifier after an arrow it may be a member expression, + // in which case we should not annotate it as an independant expression + // so we just lookup that name, if it's not a type the construct is not + // a function declaration. 
+ bool NameAfterArrowIsNonType(); + + /// \verbatim + /// '[' constant-expression[opt] ']' + /// \endverbatim + /// + TPResult TryParseBracketDeclarator(); + + /// Try to consume a token sequence that we've already identified as + /// (potentially) starting a decl-specifier. + TPResult TryConsumeDeclarationSpecifier(); + + /// Try to skip a possibly empty sequence of 'attribute-specifier's without + /// full validation of the syntactic structure of attributes. + bool TrySkipAttributes(); //===--------------------------------------------------------------------===// - // Preprocessor code-completion pass-through - void CodeCompleteDirective(bool InConditional) override; - void CodeCompleteInConditionalExclusion() override; - void CodeCompleteMacroName(bool IsDefinition) override; - void CodeCompletePreprocessorExpression() override; - void CodeCompleteMacroArgument(IdentifierInfo *Macro, MacroInfo *MacroInfo, - unsigned ArgumentIndex) override; - void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled) override; - void CodeCompleteNaturalLanguage() override; + // C++ 7: Declarations [dcl.dcl] - class GNUAsmQualifiers { - unsigned Qualifiers = AQ_unspecified; + /// Returns true if this is a C++11 attribute-specifier. Per + /// C++11 [dcl.attr.grammar]p6, two consecutive left square bracket tokens + /// always introduce an attribute. In Objective-C++11, this rule does not + /// apply if either '[' begins a message-send. + /// + /// If Disambiguate is true, we try harder to determine whether a '[[' starts + /// an attribute-specifier, and return + /// CXX11AttributeKind::InvalidAttributeSpecifier if not. + /// + /// If OuterMightBeMessageSend is true, we assume the outer '[' is either an + /// Obj-C message send or the start of an attribute. Otherwise, we assume it + /// is not an Obj-C message send. 
+ /// + /// C++11 [dcl.attr.grammar]: + /// + /// \verbatim + /// attribute-specifier: + /// '[' '[' attribute-list ']' ']' + /// alignment-specifier + /// + /// attribute-list: + /// attribute[opt] + /// attribute-list ',' attribute[opt] + /// attribute '...' + /// attribute-list ',' attribute '...' + /// + /// attribute: + /// attribute-token attribute-argument-clause[opt] + /// + /// attribute-token: + /// identifier + /// identifier '::' identifier + /// + /// attribute-argument-clause: + /// '(' balanced-token-seq ')' + /// \endverbatim + CXX11AttributeKind + isCXX11AttributeSpecifier(bool Disambiguate = false, + bool OuterMightBeMessageSend = false); - public: - enum AQ { - AQ_unspecified = 0, - AQ_volatile = 1, - AQ_inline = 2, - AQ_goto = 4, - }; - static const char *getQualifierName(AQ Qualifier); - bool setAsmQualifier(AQ Qualifier); - inline bool isVolatile() const { return Qualifiers & AQ_volatile; }; - inline bool isInline() const { return Qualifiers & AQ_inline; }; - inline bool isGoto() const { return Qualifiers & AQ_goto; } - }; - bool isGCCAsmStatement(const Token &TokAfterAsm) const; - bool isGNUAsmQualifier(const Token &TokAfterAsm) const; - GNUAsmQualifiers::AQ getGNUAsmQualifier(const Token &Tok) const; - bool parseGNUAsmQualifierListOpt(GNUAsmQualifiers &AQ); + ///@} }; -} // end namespace clang +} // end namespace clang #endif diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index 58452e159821a..a70335bef9dd4 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -430,8 +430,15 @@ class Sema; if (!ReferenceBinding) { #ifndef NDEBUG auto Decay = [&](QualType T) { - return (T->isArrayType() || T->isFunctionType()) ? C.getDecayedType(T) - : T; + if (T->isArrayType() || T->isFunctionType()) + T = C.getDecayedType(T); + + // A function pointer type can be resolved to a member function type, + // which is still an identity conversion. 
+ if (auto *N = T->getAs(); + N && N->isMemberFunctionPointer()) + T = C.getDecayedType(N->getPointeeType()); + return T; }; // The types might differ if there is an array-to-pointer conversion // an function-to-pointer conversion, or lvalue-to-rvalue conversion. @@ -980,7 +987,8 @@ class Sema; /// Have we matched any packs on the parameter side, versus any non-packs on /// the argument side, in a context where the opposite matching is also /// allowed? - bool StrictPackMatch : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned StrictPackMatch : 1; /// True if the candidate was found using ADL. LLVM_PREFERRED_TYPE(CallExpr::ADLCallKind) @@ -996,7 +1004,8 @@ class Sema; /// FailureKind - The reason why this candidate is not viable. /// Actually an OverloadFailureKind. - unsigned char FailureKind; + LLVM_PREFERRED_TYPE(OverloadFailureKind) + unsigned FailureKind : 8; /// The number of call arguments that were explicitly provided, /// to be used while performing partial ordering of function templates. diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h index 6bf9ae8d074fb..94b247a689c2d 100644 --- a/clang/include/clang/Sema/ScopeInfo.h +++ b/clang/include/clang/Sema/ScopeInfo.h @@ -103,7 +103,7 @@ enum class FirstCoroutineStmtKind { CoReturn, CoAwait, CoYield }; /// currently being parsed. 
class FunctionScopeInfo { protected: - enum ScopeKind { + enum ScopeKind : uint8_t { SK_Function, SK_Block, SK_Lambda, diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 6ea7ee281e14d..5ec67087aeea4 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -12576,6 +12576,7 @@ class Sema final : public SemaBase { bool PartialOverloading, bool AggregateDeductionCandidate, bool PartialOrdering, QualType ObjectType, Expr::Classification ObjectClassification, + bool ForOverloadSetAddressResolution, llvm::function_ref)> CheckNonDependent); /// Deduce template arguments when taking the address of a function diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index e340547ff5f45..15182bb27bbdf 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -175,6 +175,8 @@ class SemaHLSL : public SemaBase { // buffer which will be created at the end of the translation unit. 
llvm::SmallVector DefaultCBufferDecls; + uint32_t ImplicitBindingNextOrderID = 0; + private: void collectResourceBindingsOnVarDecl(VarDecl *D); void collectResourceBindingsOnUserRecordDecl(const VarDecl *VD, @@ -182,6 +184,11 @@ class SemaHLSL : public SemaBase { void processExplicitBindingsOnDecl(VarDecl *D); void diagnoseAvailabilityViolations(TranslationUnitDecl *TU); + + bool initGlobalResourceDecl(VarDecl *VD); + uint32_t getNextImplicitBindingOrderID() { + return ImplicitBindingNextOrderID++; + } }; } // namespace clang diff --git a/clang/include/clang/Sema/SemaWasm.h b/clang/include/clang/Sema/SemaWasm.h index 8841fdff23035..2123e073516cb 100644 --- a/clang/include/clang/Sema/SemaWasm.h +++ b/clang/include/clang/Sema/SemaWasm.h @@ -29,6 +29,7 @@ class SemaWasm : public SemaBase { CallExpr *TheCall); bool BuiltinWasmRefNullExtern(CallExpr *TheCall); + bool BuiltinWasmRefIsNullExtern(CallExpr *TheCall); bool BuiltinWasmRefNullFunc(CallExpr *TheCall); bool BuiltinWasmTableGet(CallExpr *TheCall); bool BuiltinWasmTableSet(CallExpr *TheCall); diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def index fab19c76a22fe..90b80e5201aa8 100644 --- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def +++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def @@ -7,6 +7,9 @@ //===----------------------------------------------------------------------===// // // This file defines the analyzer options avaible with -analyzer-config. +// Note that clang/docs/tools/generate_analyzer_options_docs.py relies on the +// structure of this file, so if this file is refactored, then make sure to +// update that script as well. 
// //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h index 54c2fb8a60ca1..7d0c2d8658f35 100644 --- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h +++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h @@ -19,6 +19,7 @@ #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include #include #include @@ -269,7 +270,8 @@ class AnalyzerOptions { unsigned NoRetryExhausted : 1; /// Emit analyzer warnings as errors. - bool AnalyzerWerror : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned AnalyzerWerror : 1; /// The inlining stack depth limit. unsigned InlineMaxStackDepth; @@ -309,8 +311,7 @@ class AnalyzerOptions { return AnalyzerConfigCmdFlags; }(); - return !std::binary_search(AnalyzerConfigCmdFlags.begin(), - AnalyzerConfigCmdFlags.end(), Name); + return !llvm::binary_search(AnalyzerConfigCmdFlags, Name); } AnalyzerOptions() @@ -410,7 +411,7 @@ class AnalyzerOptions { // an alias to the new verbose filename option because this // closely mimics the behavior under the old option. ShouldWriteStableReportFilename || ShouldWriteVerboseReportFilename, - AnalyzerWerror, + static_cast(AnalyzerWerror), ShouldApplyFixIts, ShouldDisplayCheckerNameForText}; } diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h index 3754e25501635..e995151927c96 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h @@ -307,11 +307,9 @@ class ExplodedGraph { // Type definitions. using NodeVector = std::vector; - /// The roots of the simulation graph. 
Usually there will be only - /// one, but clients are free to establish multiple subgraphs within a single - /// SimulGraph. Moreover, these subgraphs can often merge when paths from - /// different roots reach the same state at the same program location. - NodeVector Roots; + /// The root of the simulation graph. Can be nullptr if the graph is empty or + /// if it was populated by `createUncachedNode()`. + ExplodedNode *Root = nullptr; /// The nodes in the simulation graph which have been /// specially marked as the endpoint of an abstract simulation path. @@ -345,31 +343,31 @@ class ExplodedGraph { ExplodedGraph(); ~ExplodedGraph(); - /// Retrieve the node associated with a (Location,State) pair, - /// where the 'Location' is a ProgramPoint in the CFG. If no node for - /// this pair exists, it is created. IsNew is set to true if - /// the node was freshly created. + /// Get the root node of the graph. This may return nullptr if the graph is + /// empty or under construction. + ExplodedNode *getRoot() const { return Root; } + + /// Retrieve the node associated with a (Location, State) pair, where the + /// 'Location' is a ProgramPoint in the CFG. If no node for this pair exists, + /// it is created. IsNew is set to true if the node was freshly created. ExplodedNode *getNode(const ProgramPoint &L, ProgramStateRef State, bool IsSink = false, bool* IsNew = nullptr); - /// Create a node for a (Location, State) pair, - /// but don't store it for deduplication later. This - /// is useful when copying an already completed - /// ExplodedGraph for further processing. + /// Create a node for a (Location, State) pair, but don't store it for + /// deduplication later. This is useful when copying some nodes from an + /// already completed ExplodedGraph for further processing. 
ExplodedNode *createUncachedNode(const ProgramPoint &L, ProgramStateRef State, int64_t Id, bool IsSink = false); - std::unique_ptr MakeEmptyGraph() const { - return std::make_unique(); - } - - /// addRoot - Add an untyped node to the set of roots. - ExplodedNode *addRoot(ExplodedNode *V) { - Roots.push_back(V); - return V; + /// Mark a node as the root of the graph. Calling this is an error if the + /// graph already has a root node. + void designateAsRoot(ExplodedNode *V) { + assert(V && "Cannot designate nullptr as root!"); + assert(!Root && "The graph already has a root, cannot designate another!"); + Root = V; } /// addEndOfPath - Add an untyped node to the set of EOP nodes. @@ -378,7 +376,6 @@ class ExplodedGraph { return V; } - unsigned num_roots() const { return Roots.size(); } unsigned num_eops() const { return EndNodes.size(); } bool empty() const { return NumNodes == 0; } @@ -389,8 +386,6 @@ class ExplodedGraph { // Iterators. using NodeTy = ExplodedNode; using AllNodesTy = llvm::FoldingSet; - using roots_iterator = NodeVector::iterator; - using const_roots_iterator = NodeVector::const_iterator; using eop_iterator = NodeVector::iterator; using const_eop_iterator = NodeVector::const_iterator; using node_iterator = AllNodesTy::iterator; @@ -400,14 +395,6 @@ class ExplodedGraph { llvm::iterator_range nodes() const { return Nodes; } - roots_iterator roots_begin() { return Roots.begin(); } - - roots_iterator roots_end() { return Roots.end(); } - - const_roots_iterator roots_begin() const { return Roots.begin(); } - - const_roots_iterator roots_end() const { return Roots.end(); } - eop_iterator eop_begin() { return EndNodes.begin(); } eop_iterator eop_end() { return EndNodes.end(); } @@ -508,9 +495,7 @@ namespace llvm { using ChildIteratorType = clang::ento::ExplodedNode::succ_iterator; using nodes_iterator = llvm::df_iterator; - static NodeRef getEntryNode(const GraphTy G) { - return *G->roots_begin(); - } + static NodeRef getEntryNode(const GraphTy G) { 
return G->getRoot(); } static bool predecessorOfTrivial(NodeRef N) { return N->succ_size() == 1 && N->getFirstSucc()->isTrivial(); diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h index 285194148d3d3..b8a4dcbc727a6 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h @@ -222,8 +222,8 @@ class ExprEngine { const Stmt *getStmt() const; const LocationContext *getRootLocationContext() const { - assert(G.roots_begin() != G.roots_end()); - return (*G.roots_begin())->getLocation().getLocationContext(); + assert(G.getRoot()); + return G.getRoot()->getLocation().getLocationContext(); } ConstCFGElementRef getCFGElementRef() const { diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h index d4052ef90de6e..3105dfa4dae55 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConstraintManager.h @@ -301,8 +301,10 @@ class SMTConstraintManager : public clang::ento::SimpleConstraintManager { llvm_unreachable("Unsupported expression to reason about!"); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Dumps SMT formula LLVM_DUMP_METHOD void dump() const { Solver->dump(); } +#endif protected: // Check whether a new model is satisfiable, and update the program state. 
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h index 9e7c98fdded17..86774ad5043dd 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h @@ -100,41 +100,7 @@ class SymbolConjured : public SymbolData { ConstCFGElementRef getCFGElementRef() const { return Elem; } // It might return null. - const Stmt *getStmt() const { - switch (Elem->getKind()) { - case CFGElement::Initializer: - return Elem->castAs().getInitializer()->getInit(); - case CFGElement::ScopeBegin: - return Elem->castAs().getTriggerStmt(); - case CFGElement::ScopeEnd: - return Elem->castAs().getTriggerStmt(); - case CFGElement::NewAllocator: - return Elem->castAs().getAllocatorExpr(); - case CFGElement::LifetimeEnds: - return Elem->castAs().getTriggerStmt(); - case CFGElement::LoopExit: - return Elem->castAs().getLoopStmt(); - case CFGElement::Statement: - return Elem->castAs().getStmt(); - case CFGElement::Constructor: - return Elem->castAs().getStmt(); - case CFGElement::CXXRecordTypedCall: - return Elem->castAs().getStmt(); - case CFGElement::AutomaticObjectDtor: - return Elem->castAs().getTriggerStmt(); - case CFGElement::DeleteDtor: - return Elem->castAs().getDeleteExpr(); - case CFGElement::BaseDtor: - return nullptr; - case CFGElement::MemberDtor: - return nullptr; - case CFGElement::TemporaryDtor: - return Elem->castAs().getBindTemporaryExpr(); - case CFGElement::CleanupFunction: - return nullptr; - } - return nullptr; - } + const Stmt *getStmt() const; unsigned getCount() const { return Count; } /// It might return null. 
diff --git a/clang/lib/AST/ASTConcept.cpp b/clang/lib/AST/ASTConcept.cpp index f7ee0fb3ee92d..c9adccdbc77ef 100644 --- a/clang/lib/AST/ASTConcept.cpp +++ b/clang/lib/AST/ASTConcept.cpp @@ -40,9 +40,8 @@ ASTConstraintSatisfaction::ASTConstraintSatisfaction( IsSatisfied{Satisfaction.IsSatisfied}, ContainsErrors{ Satisfaction.ContainsErrors} { for (unsigned I = 0; I < NumRecords; ++I) - CreateUnsatisfiedConstraintRecord( - C, Satisfaction.Details[I], - getTrailingObjects() + I); + CreateUnsatisfiedConstraintRecord(C, Satisfaction.Details[I], + getTrailingObjects() + I); } ASTConstraintSatisfaction::ASTConstraintSatisfaction( @@ -51,9 +50,8 @@ ASTConstraintSatisfaction::ASTConstraintSatisfaction( IsSatisfied{Satisfaction.IsSatisfied}, ContainsErrors{Satisfaction.ContainsErrors} { for (unsigned I = 0; I < NumRecords; ++I) - CreateUnsatisfiedConstraintRecord( - C, *(Satisfaction.begin() + I), - getTrailingObjects() + I); + CreateUnsatisfiedConstraintRecord(C, *(Satisfaction.begin() + I), + getTrailingObjects() + I); } ASTConstraintSatisfaction * diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp index 6cb09b0492ac9..a00d5801f054b 100644 --- a/clang/lib/AST/ASTDiagnostic.cpp +++ b/clang/lib/AST/ASTDiagnostic.cpp @@ -20,6 +20,8 @@ #include "clang/AST/TemplateBase.h" #include "clang/AST/Type.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" using namespace clang; @@ -2190,3 +2192,31 @@ static bool FormatTemplateTypeDiff(ASTContext &Context, QualType FromType, TD.DiffTemplate(); return TD.Emit(); } + +std::string clang::FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T) { + auto IsSingleCodeUnitCP = [](unsigned Value, QualType T) { + if (T->isChar8Type()) { + assert(Value <= 0xFF && "not a valid UTF-8 code unit"); + return Value <= 0x7F; + } + if (T->isChar16Type()) { + assert(Value <= 0xFFFF && "not a valid UTF-16 code unit"); + return 
llvm::IsSingleCodeUnitUTF16Codepoint(Value); + } + assert(T->isChar32Type()); + return llvm::IsSingleCodeUnitUTF32Codepoint(Value); + }; + llvm::SmallVector Str; + if (!IsSingleCodeUnitCP(Value, T)) { + llvm::raw_svector_ostream OS(Str); + OS << "<" << llvm::format_hex(Value, 1, /*Upper=*/true) << ">"; + return std::string(Str.begin(), Str.end()); + } + + char Buffer[UNI_MAX_UTF8_BYTES_PER_CODE_POINT]; + char *Ptr = Buffer; + [[maybe_unused]] bool Converted = llvm::ConvertCodePointToUTF8(Value, Ptr); + assert(Converted && "trying to encode invalid code unit"); + EscapeStringForDiagnostic(StringRef(Buffer, Ptr - Buffer), Str); + return std::string(Str.begin(), Str.end()); +} diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index c7fb5e8466686..5017c9b76e6d1 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -474,10 +474,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { return false; return this->emitDecayPtr(*FromT, *ToT, CE); } - - case CK_LValueToRValueBitCast: - return this->emitBuiltinBitCast(CE); - case CK_IntegralToBoolean: case CK_FixedPointToBoolean: { // HLSL uses this to cast to one-element vectors. @@ -735,6 +731,11 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { llvm_unreachable("Unhandled clang::CastKind enum"); } +template +bool Compiler::VisitBuiltinBitCastExpr(const BuiltinBitCastExpr *E) { + return this->emitBuiltinBitCast(E); +} + template bool Compiler::VisitIntegerLiteral(const IntegerLiteral *LE) { if (DiscardResult) @@ -2435,7 +2436,7 @@ bool Compiler::VisitStringLiteral(const StringLiteral *E) { // emitted. Read only the array length from the string literal. 
unsigned ArraySize = CAT->getZExtSize(); unsigned N = std::min(ArraySize, E->getLength()); - size_t CharWidth = E->getCharByteWidth(); + unsigned CharWidth = E->getCharByteWidth(); for (unsigned I = 0; I != N; ++I) { uint32_t CodeUnit = E->getCodeUnit(I); @@ -3477,6 +3478,8 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { if (PlacementDest) { if (!this->visit(PlacementDest)) return false; + if (!this->emitStartLifetime(E)) + return false; if (!this->emitGetLocal(SizeT, ArrayLen, E)) return false; if (!this->emitCheckNewTypeMismatchArray(SizeT, E, E)) @@ -3616,6 +3619,8 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { if (PlacementDest) { if (!this->visit(PlacementDest)) return false; + if (!this->emitStartLifetime(E)) + return false; if (!this->emitCheckNewTypeMismatch(E, E)) return false; } else { @@ -3883,7 +3888,7 @@ bool Compiler::VisitShuffleVectorExpr(const ShuffleVectorExpr *E) { return false; } for (unsigned I = 0; I != NumOutputElems; ++I) { - APSInt ShuffleIndex = E->getShuffleMaskIdx(Ctx.getASTContext(), I); + APSInt ShuffleIndex = E->getShuffleMaskIdx(I); assert(ShuffleIndex >= -1); if (ShuffleIndex == -1) return this->emitInvalidShuffleVectorIndex(I, E); @@ -4107,11 +4112,8 @@ template bool Compiler::visitBool(const Expr *E) { return true; // Convert pointers to bool. - if (T == PT_Ptr) { - if (!this->emitNull(*T, 0, nullptr, E)) - return false; - return this->emitNE(*T, E); - } + if (T == PT_Ptr) + return this->emitIsNonNullPtr(E); // Or Floats. if (T == PT_Float) diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index ec5bd637453c5..56a972f452af9 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -126,6 +126,7 @@ class Compiler : public ConstStmtVisitor, bool>, // Expressions. 
bool VisitCastExpr(const CastExpr *E); + bool VisitBuiltinBitCastExpr(const BuiltinBitCastExpr *E); bool VisitIntegerLiteral(const IntegerLiteral *E); bool VisitFloatingLiteral(const FloatingLiteral *E); bool VisitImaginaryLiteral(const ImaginaryLiteral *E); diff --git a/clang/lib/AST/ByteCode/Context.cpp b/clang/lib/AST/ByteCode/Context.cpp index dae94fc9829c7..c70a5259b77e2 100644 --- a/clang/lib/AST/ByteCode/Context.cpp +++ b/clang/lib/AST/ByteCode/Context.cpp @@ -37,6 +37,7 @@ bool Context::isPotentialConstantExpr(State &Parent, const FunctionDecl *FD) { Compiler(*this, *P).compileFunc( FD, const_cast(Func)); + ++EvalID; // And run it. if (!Run(Parent, Func)) return false; diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index bc860185fea21..74efc3c914504 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1373,6 +1373,10 @@ static bool checkConstructor(InterpState &S, CodePtr OpPC, const Function *Func, bool CheckDestructor(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { if (!CheckLive(S, OpPC, Ptr, AK_Destroy)) return false; + if (!CheckTemporary(S, OpPC, Ptr, AK_Destroy)) + return false; + if (!CheckRange(S, OpPC, Ptr, AK_Destroy)) + return false; // Can't call a dtor on a global variable. if (Ptr.block()->isStatic()) { diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 35d97167135f7..9f1a6302eb856 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1326,6 +1326,14 @@ static inline bool Kill(InterpState &S, CodePtr OpPC) { return true; } +static inline bool StartLifetime(InterpState &S, CodePtr OpPC) { + const auto &Ptr = S.Stk.peek(); + if (!CheckDummy(S, OpPC, Ptr, AK_Destroy)) + return false; + Ptr.startLifetime(); + return true; +} + /// 1) Pops the value from the stack. /// 2) Writes the value to the local variable with the /// given offset. 
@@ -1855,10 +1863,8 @@ template ::T> bool Init(InterpState &S, CodePtr OpPC) { const T &Value = S.Stk.pop(); const Pointer &Ptr = S.Stk.peek(); - if (!CheckInit(S, OpPC, Ptr)) { - assert(false); + if (!CheckInit(S, OpPC, Ptr)) return false; - } Ptr.activate(); Ptr.initialize(); new (&Ptr.deref()) T(Value); diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 65a9a0cdad022..9dddcced8ca38 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -395,10 +395,8 @@ def GetLocal : AccessOpcode { let HasCustomEval = 1; } // [] -> [Pointer] def SetLocal : AccessOpcode { let HasCustomEval = 1; } -def Kill : Opcode { - let Types = []; - let Args = []; -} +def Kill : Opcode; +def StartLifetime : Opcode; def CheckDecl : Opcode { let Args = [ArgVarDecl]; diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h index 19770aa3b97bc..479da09004685 100644 --- a/clang/lib/AST/ByteCode/Pointer.h +++ b/clang/lib/AST/ByteCode/Pointer.h @@ -722,6 +722,14 @@ class Pointer { getInlineDesc()->LifeState = Lifetime::Ended; } + void startLifetime() const { + if (!isBlockPointer()) + return; + if (asBlockPointer().Base < sizeof(InlineDescriptor)) + return; + getInlineDesc()->LifeState = Lifetime::Started; + } + /// Compare two pointers. 
ComparisonCategoryResult compare(const Pointer &Other) const { if (!hasSameBase(*this, Other)) diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 061fedb403ddd..8425e40567b27 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -4325,8 +4325,7 @@ DependentFunctionTemplateSpecializationInfo:: const ASTTemplateArgumentListInfo *TemplateArgsWritten) : NumCandidates(Candidates.size()), TemplateArgumentsAsWritten(TemplateArgsWritten) { - std::transform(Candidates.begin(), Candidates.end(), - getTrailingObjects(), + std::transform(Candidates.begin(), Candidates.end(), getTrailingObjects(), [](NamedDecl *ND) { return cast(ND->getUnderlyingDecl()); }); @@ -5380,7 +5379,7 @@ PragmaCommentDecl *PragmaCommentDecl::Create(const ASTContext &C, PragmaCommentDecl *PCD = new (C, DC, additionalSizeToAlloc(Arg.size() + 1)) PragmaCommentDecl(DC, CommentLoc, CommentKind); - memcpy(PCD->getTrailingObjects(), Arg.data(), Arg.size()); + memcpy(PCD->getTrailingObjects(), Arg.data(), Arg.size()); PCD->getTrailingObjects()[Arg.size()] = '\0'; return PCD; } @@ -5402,11 +5401,10 @@ PragmaDetectMismatchDecl::Create(const ASTContext &C, TranslationUnitDecl *DC, PragmaDetectMismatchDecl *PDMD = new (C, DC, additionalSizeToAlloc(ValueStart + Value.size() + 1)) PragmaDetectMismatchDecl(DC, Loc, ValueStart); - memcpy(PDMD->getTrailingObjects(), Name.data(), Name.size()); - PDMD->getTrailingObjects()[Name.size()] = '\0'; - memcpy(PDMD->getTrailingObjects() + ValueStart, Value.data(), - Value.size()); - PDMD->getTrailingObjects()[ValueStart + Value.size()] = '\0'; + memcpy(PDMD->getTrailingObjects(), Name.data(), Name.size()); + PDMD->getTrailingObjects()[Name.size()] = '\0'; + memcpy(PDMD->getTrailingObjects() + ValueStart, Value.data(), Value.size()); + PDMD->getTrailingObjects()[ValueStart + Value.size()] = '\0'; return PDMD; } @@ -5900,7 +5898,7 @@ ImportDecl::ImportDecl(DeclContext *DC, SourceLocation StartLoc, : Decl(Import, DC, StartLoc), 
ImportedModule(Imported), NextLocalImportAndComplete(nullptr, true) { assert(getNumModuleIdentifiers(Imported) == IdentifierLocs.size()); - auto *StoredLocs = getTrailingObjects(); + auto *StoredLocs = getTrailingObjects(); llvm::uninitialized_copy(IdentifierLocs, StoredLocs); } @@ -5908,7 +5906,7 @@ ImportDecl::ImportDecl(DeclContext *DC, SourceLocation StartLoc, Module *Imported, SourceLocation EndLoc) : Decl(Import, DC, StartLoc), ImportedModule(Imported), NextLocalImportAndComplete(nullptr, false) { - *getTrailingObjects() = EndLoc; + *getTrailingObjects() = EndLoc; } ImportDecl *ImportDecl::Create(ASTContext &C, DeclContext *DC, @@ -5939,14 +5937,12 @@ ArrayRef ImportDecl::getIdentifierLocs() const { if (!isImportComplete()) return {}; - const auto *StoredLocs = getTrailingObjects(); - return llvm::ArrayRef(StoredLocs, - getNumModuleIdentifiers(getImportedModule())); + return getTrailingObjects(getNumModuleIdentifiers(getImportedModule())); } SourceRange ImportDecl::getSourceRange() const { if (!isImportComplete()) - return SourceRange(getLocation(), *getTrailingObjects()); + return SourceRange(getLocation(), *getTrailingObjects()); return SourceRange(getLocation(), getIdentifierLocs().back()); } diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index f24ea815768a6..f1f31d8be78c9 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -3449,9 +3449,8 @@ UsingPackDecl *UsingPackDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID, size_t Extra = additionalSizeToAlloc(NumExpansions); auto *Result = new (C, ID, Extra) UsingPackDecl(nullptr, nullptr, {}); Result->NumExpansions = NumExpansions; - auto *Trail = Result->getTrailingObjects(); - for (unsigned I = 0; I != NumExpansions; ++I) - new (Trail + I) NamedDecl*(nullptr); + auto *Trail = Result->getTrailingObjects(); + std::uninitialized_fill_n(Trail, NumExpansions, nullptr); return Result; } @@ -3610,9 +3609,8 @@ DecompositionDecl 
*DecompositionDecl::CreateDeserialized(ASTContext &C, QualType(), nullptr, StorageClass(), {}); // Set up and clean out the bindings array. Result->NumBindings = NumBindings; - auto *Trail = Result->getTrailingObjects(); - for (unsigned I = 0; I != NumBindings; ++I) - new (Trail + I) BindingDecl*(nullptr); + auto *Trail = Result->getTrailingObjects(); + std::uninitialized_fill_n(Trail, NumBindings, nullptr); return Result; } diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 6857eef87de38..b951e68b0a1b8 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -755,7 +755,7 @@ void TemplateTypeParmDecl::setTypeConstraint( "call setTypeConstraint"); assert(!TypeConstraintInitialized && "TypeConstraint was already initialized!"); - new (getTrailingObjects()) + new (getTrailingObjects()) TypeConstraint(Loc, ImmediatelyDeclaredConstraint, ArgPackSubstIndex); TypeConstraintInitialized = true; } @@ -880,8 +880,7 @@ TemplateTemplateParmDecl::TemplateTemplateParmDecl( : TemplateDecl(TemplateTemplateParm, DC, L, Id, Params), TemplateParmPosition(D, P), Typename(Typename), ParameterPack(true), ExpandedParameterPack(true), NumExpandedParams(Expansions.size()) { - llvm::uninitialized_copy(Expansions, - getTrailingObjects()); + llvm::uninitialized_copy(Expansions, getTrailingObjects()); } TemplateTemplateParmDecl * @@ -939,7 +938,7 @@ void TemplateTemplateParmDecl::setDefaultArgument( //===----------------------------------------------------------------------===// TemplateArgumentList::TemplateArgumentList(ArrayRef Args) : NumArguments(Args.size()) { - llvm::uninitialized_copy(Args, getTrailingObjects()); + llvm::uninitialized_copy(Args, getTrailingObjects()); } TemplateArgumentList * @@ -1166,7 +1165,7 @@ ImplicitConceptSpecializationDecl::CreateDeserialized( void ImplicitConceptSpecializationDecl::setTemplateArguments( ArrayRef Converted) { assert(Converted.size() == NumTemplateArgs); - 
llvm::uninitialized_copy(Converted, getTrailingObjects()); + llvm::uninitialized_copy(Converted, getTrailingObjects()); } //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 500d43accb082..ca1fbdf7e652f 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -3314,7 +3314,11 @@ static bool HandleLValueBase(EvalInfo &Info, const Expr *E, LValue &Obj, return false; // Extract most-derived object and corresponding type. - DerivedDecl = D.MostDerivedType->getAsCXXRecordDecl(); + // FIXME: After implementing P2280R4 it became possible to get references + // here. We do MostDerivedType->getAsCXXRecordDecl() in several other + // locations and if we see crashes in those locations in the future + // it may make more sense to move this fix into Lvalue::set. + DerivedDecl = D.MostDerivedType.getNonReferenceType()->getAsCXXRecordDecl(); if (!CastToDerivedClass(Info, E, Obj, DerivedDecl, D.MostDerivedPathLength)) return false; @@ -5975,9 +5979,22 @@ static bool CheckConstexprFunction(EvalInfo &Info, SourceLocation CallLoc, Definition->hasAttr()))) return true; - if (Info.getLangOpts().CPlusPlus11) { - const FunctionDecl *DiagDecl = Definition ? Definition : Declaration; + const FunctionDecl *DiagDecl = Definition ? Definition : Declaration; + // Special note for the assert() macro, as the normal error message falsely + // implies we cannot use an assertion during constant evaluation. + if (CallLoc.isMacroID() && DiagDecl->getIdentifier()) { + // FIXME: Instead of checking for an implementation-defined function, + // check and evaluate the assert() macro. 
+ StringRef Name = DiagDecl->getName(); + bool AssertFailed = + Name == "__assert_rtn" || Name == "__assert_fail" || Name == "_wassert"; + if (AssertFailed) { + Info.FFDiag(CallLoc, diag::note_constexpr_assert_failed); + return false; + } + } + if (Info.getLangOpts().CPlusPlus11) { // If this function is not constexpr because it is an inherited // non-constexpr constructor, diagnose that directly. auto *CD = dyn_cast(DiagDecl); @@ -11558,7 +11575,7 @@ static bool handleVectorShuffle(EvalInfo &Info, const ShuffleVectorExpr *E, unsigned const TotalElementsInInputVector1 = VecVal1.getVectorLength(); unsigned const TotalElementsInInputVector2 = VecVal2.getVectorLength(); - APSInt IndexVal = E->getShuffleMaskIdx(Info.Ctx, EltNum); + APSInt IndexVal = E->getShuffleMaskIdx(EltNum); int64_t index = IndexVal.getExtValue(); // The spec says that -1 should be treated as undef for optimizations, // but in constexpr we'd have to produce an APValue::Indeterminate, diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp index 447545c733050..7283ff837b04e 100644 --- a/clang/lib/AST/OpenACCClause.cpp +++ b/clang/lib/AST/OpenACCClause.cpp @@ -114,7 +114,7 @@ OpenACCSelfClause::OpenACCSelfClause(SourceLocation BeginLoc, : OpenACCClauseWithParams(OpenACCClauseKind::Self, BeginLoc, LParenLoc, EndLoc), HasConditionExpr(std::nullopt), NumExprs(VarList.size()) { - llvm::uninitialized_copy(VarList, getTrailingObjects()); + llvm::uninitialized_copy(VarList, getTrailingObjects()); } OpenACCSelfClause::OpenACCSelfClause(SourceLocation BeginLoc, @@ -126,8 +126,7 @@ OpenACCSelfClause::OpenACCSelfClause(SourceLocation BeginLoc, assert((!ConditionExpr || ConditionExpr->isInstantiationDependent() || ConditionExpr->getType()->isScalarType()) && "Condition expression type not scalar/dependent"); - llvm::uninitialized_copy(ArrayRef(ConditionExpr), - getTrailingObjects()); + llvm::uninitialized_copy(ArrayRef(ConditionExpr), getTrailingObjects()); } OpenACCClause::child_range 
OpenACCClause::children() { @@ -166,8 +165,7 @@ OpenACCGangClause::OpenACCGangClause(SourceLocation BeginLoc, : OpenACCClauseWithExprs(OpenACCClauseKind::Gang, BeginLoc, LParenLoc, EndLoc) { assert(GangKinds.size() == IntExprs.size() && "Mismatch exprs/kind?"); - llvm::uninitialized_copy(IntExprs, getTrailingObjects()); - setExprs(getTrailingObjects(IntExprs.size())); + setExprs(getTrailingObjects(IntExprs.size()), IntExprs); llvm::uninitialized_copy(GangKinds, getTrailingObjects()); } diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index f7d1655f67ed1..19db338f760ba 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2189,7 +2189,10 @@ StmtProfiler::VisitCXXPseudoDestructorExpr(const CXXPseudoDestructorExpr *S) { void StmtProfiler::VisitOverloadExpr(const OverloadExpr *S) { VisitExpr(S); - VisitNestedNameSpecifier(S->getQualifier()); + if (S->getNumDecls() == 1) + VisitDecl(*S->decls_begin()); + else + VisitNestedNameSpecifier(S->getQualifier()); VisitName(S->getName(), /*TreatAsDecl*/ true); ID.AddBoolean(S->hasExplicitTemplateArgs()); if (S->hasExplicitTemplateArgs()) diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 82a8cc99cd265..a20bc3ffba823 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2193,6 +2193,20 @@ bool Type::isAnyCharacterType() const { } } +bool Type::isUnicodeCharacterType() const { + const auto *BT = dyn_cast(CanonicalType); + if (!BT) + return false; + switch (BT->getKind()) { + default: + return false; + case BuiltinType::Char8: + case BuiltinType::Char16: + case BuiltinType::Char32: + return true; + } +} + /// isSignedIntegerType - Return true if this is an integer type that is /// signed, according to C99 6.2.5p4 [char, signed char, short, int, long..], /// an enum decl which has a signed representation @@ -2833,6 +2847,11 @@ static bool isTriviallyCopyableTypeImpl(const QualType &type, if (CanonicalType->isScalarType() || 
CanonicalType->isVectorType()) return true; + // Mfloat8 type is a special case as it not scalar, but is still trivially + // copyable. + if (CanonicalType->isMFloat8Type()) + return true; + if (const auto *RT = CanonicalType->getAs()) { if (const auto *ClassDecl = dyn_cast(RT->getDecl())) { if (IsCopyConstructible) { diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index e2d940c0d39e9..f01ff4df84e6a 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringTable.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Path.h" #include @@ -74,19 +75,21 @@ enum DiagnosticClass { struct StaticDiagInfoRec { uint16_t DiagID; LLVM_PREFERRED_TYPE(diag::Severity) - uint8_t DefaultSeverity : 3; + uint16_t DefaultSeverity : 3; LLVM_PREFERRED_TYPE(DiagnosticClass) - uint8_t Class : 3; + uint16_t Class : 3; LLVM_PREFERRED_TYPE(DiagnosticIDs::SFINAEResponse) - uint8_t SFINAE : 2; - uint8_t Category : 6; + uint16_t SFINAE : 2; + LLVM_PREFERRED_TYPE(diag::DiagCategory) + uint16_t Category : 6; LLVM_PREFERRED_TYPE(bool) - uint8_t WarnNoWerror : 1; + uint16_t WarnNoWerror : 1; LLVM_PREFERRED_TYPE(bool) - uint8_t WarnShowInSystemHeader : 1; + uint16_t WarnShowInSystemHeader : 1; LLVM_PREFERRED_TYPE(bool) - uint8_t WarnShowInSystemMacro : 1; + uint16_t WarnShowInSystemMacro : 1; + LLVM_PREFERRED_TYPE(diag::Group) uint16_t OptionGroupIndex : 15; LLVM_PREFERRED_TYPE(bool) uint16_t Deferrable : 1; diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 3633bab6e0df9..e1f6c7b834dc7 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -794,12 +794,17 @@ AArch64TargetInfo::getTargetBuiltins() const { std::optional> AArch64TargetInfo::getVScaleRange(const LangOptions &LangOpts, - bool 
IsArmStreamingFunction) const { + bool IsArmStreamingFunction, + llvm::StringMap *FeatureMap) const { if (LangOpts.VScaleMin || LangOpts.VScaleMax) return std::pair( LangOpts.VScaleMin ? LangOpts.VScaleMin : 1, LangOpts.VScaleMax); - if (hasFeature("sve") || (IsArmStreamingFunction && hasFeature("sme"))) + if (hasFeature("sve") || (FeatureMap && (FeatureMap->lookup("sve")))) + return std::pair(1, 16); + + if (IsArmStreamingFunction && + (hasFeature("sme") || (FeatureMap && (FeatureMap->lookup("sme"))))) return std::pair(1, 16); return std::nullopt; diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 2fab88cfca901..6eeac69af20df 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -187,8 +187,8 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { llvm::SmallVector getTargetBuiltins() const override; std::optional> - getVScaleRange(const LangOptions &LangOpts, - bool IsArmStreamingFunction) const override; + getVScaleRange(const LangOptions &LangOpts, bool IsArmStreamingFunction, + llvm::StringMap *FeatureMap = nullptr) const override; bool doesFeatureAffectCodeGen(StringRef Name) const override; bool validateCpuSupports(StringRef FeatureStr) const override; bool hasFeature(StringRef Feature) const override; diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index c368200f3f739..9a935948882fd 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -310,7 +310,7 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions &Opts, // e.g. 
gfx10-1-generic -> gfx10_1_generic if (GPUKind >= llvm::AMDGPU::GK_AMDGCN_GENERIC_FIRST && GPUKind <= llvm::AMDGPU::GK_AMDGCN_GENERIC_LAST) { - std::replace(CanonName.begin(), CanonName.end(), '-', '_'); + llvm::replace(CanonName, '-', '_'); } Builder.defineMacro(Twine("__") + Twine(CanonName) + Twine("__")); @@ -329,7 +329,7 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions &Opts, auto Loc = OffloadArchFeatures.find(F); if (Loc != OffloadArchFeatures.end()) { std::string NewF = F.str(); - std::replace(NewF.begin(), NewF.end(), '-', '_'); + llvm::replace(NewF, '-', '_'); Builder.defineMacro(Twine("__amdgcn_feature_") + Twine(NewF) + Twine("__"), Loc->second ? "1" : "0"); diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index 425ad68bb9098..e6ef0ecc526ba 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -679,11 +679,17 @@ bool PPCTargetInfo::initFeatureMap( } } - if (!(ArchDefs & ArchDefinePwr8) && - llvm::is_contained(FeaturesVec, "+rop-protect")) { - // We can turn on ROP Protect on Power 8 and above. - Diags.Report(diag::err_opt_not_valid_with_opt) << "-mrop-protect" << CPU; - return false; + if (llvm::is_contained(FeaturesVec, "+rop-protect")) { + if (PointerWidth == 32) { + Diags.Report(diag::err_opt_not_valid_on_target) << "-mrop-protect"; + return false; + } + + if (!(ArchDefs & ArchDefinePwr8)) { + // We can turn on ROP Protect on Power 8 and above. 
+ Diags.Report(diag::err_opt_not_valid_with_opt) << "-mrop-protect" << CPU; + return false; + } } if (!(ArchDefs & ArchDefinePwr8) && diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index 390ef0f3ac884..a1a2437f288a0 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -332,7 +332,8 @@ bool RISCVTargetInfo::initFeatureMap( std::optional> RISCVTargetInfo::getVScaleRange(const LangOptions &LangOpts, - bool IsArmStreamingFunction) const { + bool IsArmStreamingFunction, + llvm::StringMap *FeatureMap) const { // RISCV::RVVBitsPerBlock is 64. unsigned VScaleMin = ISAInfo->getMinVLen() / llvm::RISCV::RVVBitsPerBlock; diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h index c26aa19080162..0b36c9d5d9cc8 100644 --- a/clang/lib/Basic/Targets/RISCV.h +++ b/clang/lib/Basic/Targets/RISCV.h @@ -99,8 +99,8 @@ class RISCVTargetInfo : public TargetInfo { const std::vector &FeaturesVec) const override; std::optional> - getVScaleRange(const LangOptions &LangOpts, - bool IsArmStreamingFunction) const override; + getVScaleRange(const LangOptions &LangOpts, bool IsArmStreamingFunction, + llvm::StringMap *FeatureMap = nullptr) const override; bool hasFeature(StringRef Feature) const override; diff --git a/clang/lib/Basic/Targets/SystemZ.cpp b/clang/lib/Basic/Targets/SystemZ.cpp index ce532b72a89d1..13b86234eed79 100644 --- a/clang/lib/Basic/Targets/SystemZ.cpp +++ b/clang/lib/Basic/Targets/SystemZ.cpp @@ -188,8 +188,8 @@ void SystemZTargetInfo::getTargetDefines(const LangOptions &Opts, std::string Str("0x"); unsigned int Librel = 0x40000000; Librel |= V.getMajor() << 24; - Librel |= (V.getMinor() ? V.getMinor().value() : 1) << 16; - Librel |= V.getSubminor() ? 
V.getSubminor().value() : 0; + Librel |= V.getMinor().value_or(1) << 16; + Librel |= V.getSubminor().value_or(0); Str += llvm::utohexstr(Librel); Builder.defineMacro("__TARGET_LIB__", Str.c_str()); diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp index 5c65a43641844..17bfa19f9fd63 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp @@ -44,14 +44,11 @@ CIRGenFunctionInfo::create(CanQualType resultType, cir::FuncType CIRGenTypes::getFunctionType(const CIRGenFunctionInfo &fi) { mlir::Type resultType = convertType(fi.getReturnType()); + SmallVector argTypes; + argTypes.reserve(fi.getNumRequiredArgs()); - SmallVector argTypes(fi.getNumRequiredArgs()); - - unsigned argNo = 0; - llvm::ArrayRef argInfos(fi.argInfoBegin(), - fi.getNumRequiredArgs()); - for (const auto &argInfo : argInfos) - argTypes[argNo++] = convertType(argInfo.type); + for (const CIRGenFunctionInfoArgInfo &argInfo : fi.requiredArguments()) + argTypes.push_back(convertType(argInfo.type)); return cir::FuncType::get(argTypes, (resultType ? resultType : builder.getVoidTy()), @@ -63,6 +60,35 @@ CIRGenCallee CIRGenCallee::prepareConcreteCallee(CIRGenFunction &cgf) const { return *this; } +/// Adds the formal parameters in FPT to the given prefix. If any parameter in +/// FPT has pass_object_size_attrs, then we'll add parameters for those, too. +/// TODO(cir): this should be shared with LLVM codegen +static void appendParameterTypes(const CIRGenTypes &cgt, + SmallVectorImpl &prefix, + CanQual fpt) { + assert(!cir::MissingFeatures::opCallExtParameterInfo()); + // Fast path: don't touch param info if we don't need to. 
+ if (!fpt->hasExtParameterInfos()) { + prefix.append(fpt->param_type_begin(), fpt->param_type_end()); + return; + } + + cgt.getCGModule().errorNYI("appendParameterTypes: hasExtParameterInfos"); +} + +/// Arrange the CIR function layout for a value of the given function type, on +/// top of any implicit parameters already stored. +static const CIRGenFunctionInfo & +arrangeCIRFunctionInfo(CIRGenTypes &cgt, SmallVectorImpl &prefix, + CanQual ftp) { + RequiredArgs required = + RequiredArgs::getFromProtoWithExtraSlots(ftp, prefix.size()); + assert(!cir::MissingFeatures::opCallExtParameterInfo()); + appendParameterTypes(cgt, prefix, ftp); + CanQualType resultType = ftp->getReturnType().getUnqualifiedType(); + return cgt.arrangeCIRFunctionInfo(resultType, prefix, required); +} + static const CIRGenFunctionInfo & arrangeFreeFunctionLikeCall(CIRGenTypes &cgt, CIRGenModule &cgm, const CallArgList &args, @@ -95,6 +121,34 @@ CIRGenTypes::arrangeFreeFunctionCall(const CallArgList &args, return arrangeFreeFunctionLikeCall(*this, cgm, args, fnType); } +/// Arrange the argument and result information for the declaration or +/// definition of the given function. +const CIRGenFunctionInfo & +CIRGenTypes::arrangeFunctionDeclaration(const FunctionDecl *fd) { + if (const auto *md = dyn_cast(fd)) { + if (md->isInstance()) { + cgm.errorNYI("arrangeFunctionDeclaration: instance method"); + } + } + + CanQualType funcTy = fd->getType()->getCanonicalTypeUnqualified(); + + assert(isa(funcTy)); + // TODO: setCUDAKernelCallingConvention + assert(!cir::MissingFeatures::cudaSupport()); + + // When declaring a function without a prototype, always use a non-variadic + // type. 
+ if (CanQual noProto = + funcTy.getAs()) { + assert(!cir::MissingFeatures::opCallCIRGenFuncInfoExtParamInfo()); + return arrangeCIRFunctionInfo(noProto->getReturnType(), std::nullopt, + RequiredArgs::All); + } + + return arrangeFreeFunctionType(funcTy.castAs()); +} + static cir::CIRCallOpInterface emitCallLikeOp(CIRGenFunction &cgf, mlir::Location callLoc, cir::FuncOp directFuncOp, @@ -112,13 +166,8 @@ emitCallLikeOp(CIRGenFunction &cgf, mlir::Location callLoc, const CIRGenFunctionInfo & CIRGenTypes::arrangeFreeFunctionType(CanQual fpt) { - SmallVector argTypes; - for (unsigned i = 0, e = fpt->getNumParams(); i != e; ++i) - argTypes.push_back(fpt->getParamType(i)); - RequiredArgs required = RequiredArgs::forPrototypePlus(fpt); - - CanQualType resultType = fpt->getReturnType().getUnqualifiedType(); - return arrangeCIRFunctionInfo(resultType, argTypes, required); + SmallVector argTypes; + return ::arrangeCIRFunctionInfo(*this, argTypes, fpt); } const CIRGenFunctionInfo & diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp index 498d7533c2204..61af33053dc0a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp @@ -322,6 +322,8 @@ void CIRGenFunction::emitDecl(const Decl &d) { case Decl::ObjCTypeParam: case Decl::Binding: case Decl::UnresolvedUsingIfExists: + case Decl::HLSLBuffer: + case Decl::HLSLRootSignature: llvm_unreachable("Declaration should not be in declstmts!"); case Decl::Function: // void X(); @@ -374,7 +376,6 @@ void CIRGenFunction::emitDecl(const Decl &d) { return; } case Decl::ImplicitConceptSpecialization: - case Decl::HLSLBuffer: case Decl::TopLevelStmt: case Decl::UsingPack: case Decl::Decomposition: // This could be moved to join Decl::Var diff --git a/clang/lib/CIR/CodeGen/CIRGenFunctionInfo.h b/clang/lib/CIR/CodeGen/CIRGenFunctionInfo.h index 0556408fb98d1..1e06599575fbd 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunctionInfo.h +++ 
b/clang/lib/CIR/CodeGen/CIRGenFunctionInfo.h @@ -47,19 +47,21 @@ class RequiredArgs { /// /// If FD is not null, this will consider pass_object_size params in FD. static RequiredArgs - forPrototypePlus(const clang::FunctionProtoType *prototype) { + getFromProtoWithExtraSlots(const clang::FunctionProtoType *prototype, + unsigned additional) { if (!prototype->isVariadic()) return All; if (prototype->hasExtParameterInfos()) llvm_unreachable("NYI"); - return RequiredArgs(prototype->getNumParams()); + return RequiredArgs(prototype->getNumParams() + additional); } static RequiredArgs - forPrototypePlus(clang::CanQual prototype) { - return forPrototypePlus(prototype.getTypePtr()); + getFromProtoWithExtraSlots(clang::CanQual prototype, + unsigned additional) { + return getFromProtoWithExtraSlots(prototype.getTypePtr(), additional); } unsigned getNumRequiredArgs() const { @@ -114,6 +116,14 @@ class CIRGenFunctionInfo final getReturnType().Profile(id); } + llvm::ArrayRef arguments() const { + return llvm::ArrayRef(argInfoBegin(), numArgs); + } + + llvm::ArrayRef requiredArguments() const { + return llvm::ArrayRef(argInfoBegin(), getNumRequiredArgs()); + } + CanQualType getReturnType() const { return getArgsBuffer()[0].type; } const_arg_iterator argInfoBegin() const { return getArgsBuffer() + 1; } diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index b4e27bc5fec6a..bd3aa37a92af6 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -249,21 +249,8 @@ void CIRGenModule::emitGlobalFunctionDefinition(clang::GlobalDecl gd, return; } - cir::FuncType funcType; - // TODO: Move this to arrangeFunctionDeclaration when it is - // implemented. - // When declaring a function without a prototype, always use a - // non-variadic type. 
- if (CanQual noProto = - funcDecl->getType() - ->getCanonicalTypeUnqualified() - .getAs()) { - const CIRGenFunctionInfo &fi = getTypes().arrangeCIRFunctionInfo( - noProto->getReturnType(), {}, RequiredArgs::All); - funcType = getTypes().getFunctionType(fi); - } else { - funcType = cast(convertType(funcDecl->getType())); - } + const CIRGenFunctionInfo &fi = getTypes().arrangeGlobalDeclaration(gd); + cir::FuncType funcType = getTypes().getFunctionType(fi); cir::FuncOp funcOp = dyn_cast_if_present(op); if (!funcOp || funcOp.getFunctionType() != funcType) { diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h index 5b3fb5527334a..9adbe6a497214 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h @@ -266,10 +266,12 @@ class OpenACCClauseCIREmitter final else operation.getAsyncOperandMutable().append( createIntExpr(clause.getIntExpr())); + } else if constexpr (isCombinedType) { + applyToComputeOp(clause); } else { // TODO: When we've implemented this for everything, switch this to an // unreachable. Combined constructs remain. Data, enter data, exit data, - // update, combined constructs remain. + // update constructs remain. 
return clauseNotImplemented(clause); } } diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp index 097d14b370940..dc8872122995c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp @@ -553,3 +553,17 @@ CIRGenTypes::arrangeCIRFunctionInfo(CanQualType returnType, return *fi; } + +const CIRGenFunctionInfo &CIRGenTypes::arrangeGlobalDeclaration(GlobalDecl gd) { + assert(!dyn_cast(gd.getDecl()) && + "This is reported as a FIXME in LLVM codegen"); + const auto *fd = cast(gd.getDecl()); + + if (isa(gd.getDecl()) || + isa(gd.getDecl())) { + cgm.errorNYI(SourceLocation(), + "arrangeGlobalDeclaration for C++ constructor or destructor"); + } + + return arrangeFunctionDeclaration(fd); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.h b/clang/lib/CIR/CodeGen/CIRGenTypes.h index ff8ce3f87f362..53e79c3641c40 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypes.h +++ b/clang/lib/CIR/CodeGen/CIRGenTypes.h @@ -117,6 +117,36 @@ class CIRGenTypes { // TODO: convert this comment to account for MLIR's equivalence mlir::Type convertTypeForMem(clang::QualType, bool forBitField = false); + /// Get the CIR function type for \arg Info. + cir::FuncType getFunctionType(const CIRGenFunctionInfo &info); + + // The arrangement methods are split into three families: + // - those meant to drive the signature and prologue/epilogue + // of a function declaration or definition, + // - those meant for the computation of the CIR type for an abstract + // appearance of a function, and + // - those meant for performing the CIR-generation of a call. + // They differ mainly in how they deal with optional (i.e. variadic) + // arguments, as well as unprototyped functions. + // + // Key points: + // - The CIRGenFunctionInfo for emitting a specific call site must include + // entries for the optional arguments. 
+ // - The function type used at the call site must reflect the formal + // signature + // of the declaration being called, or else the call will go away. + // - For the most part, unprototyped functions are called by casting to a + // formal signature inferred from the specific argument types used at the + // call-site. However, some targets (e.g. x86-64) screw with this for + // compatability reasons. + + const CIRGenFunctionInfo &arrangeGlobalDeclaration(GlobalDecl gd); + + /// Free functions are functions that are compatible with an ordinary C + /// function pointer type. + const CIRGenFunctionInfo & + arrangeFunctionDeclaration(const clang::FunctionDecl *fd); + /// Return whether a type can be zero-initialized (in the C++ sense) with an /// LLVM zeroinitializer. bool isZeroInitializable(clang::QualType ty); @@ -134,8 +164,6 @@ class CIRGenTypes { arrangeFreeFunctionType(CanQual fpt); const CIRGenFunctionInfo & arrangeFreeFunctionType(CanQual fnpt); - - cir::FuncType getFunctionType(const CIRGenFunctionInfo &fi); }; } // namespace clang::CIRGen diff --git a/clang/lib/CIR/CodeGen/CIRGenerator.cpp b/clang/lib/CIR/CodeGen/CIRGenerator.cpp index aa3864deb733c..40252ffecfba1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenerator.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenerator.cpp @@ -18,6 +18,7 @@ #include "clang/AST/DeclGroup.h" #include "clang/CIR/CIRGenerator.h" #include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/Dialect/OpenACC/RegisterOpenACCExtensions.h" using namespace cir; using namespace clang; @@ -38,6 +39,12 @@ void CIRGenerator::Initialize(ASTContext &astContext) { mlirContext = std::make_unique(); mlirContext->loadDialect(); mlirContext->getOrLoadDialect(); + + // Register extensions to integrate CIR types with OpenACC. 
+ mlir::DialectRegistry registry; + cir::acc::registerOpenACCExtensions(registry); + mlirContext->appendDialectRegistry(registry); + cgm = std::make_unique( *mlirContext.get(), astContext, codeGenOpts, diags); } diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt index 7a701c3c0b82b..8f5796e59d3bb 100644 --- a/clang/lib/CIR/CodeGen/CMakeLists.txt +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt @@ -35,6 +35,7 @@ add_clang_library(clangCIR clangBasic clangLex ${dialect_libs} + CIROpenACCSupport MLIRCIR MLIRCIRInterfaces ) diff --git a/clang/lib/CIR/Dialect/CMakeLists.txt b/clang/lib/CIR/Dialect/CMakeLists.txt index 9f57627c321fb..c825a61b2779b 100644 --- a/clang/lib/CIR/Dialect/CMakeLists.txt +++ b/clang/lib/CIR/Dialect/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(IR) +add_subdirectory(OpenACC) add_subdirectory(Transforms) diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 779114e09d834..9c80c48fa4039 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -22,6 +22,7 @@ #include "clang/CIR/Dialect/IR/CIROpsDialect.cpp.inc" #include "clang/CIR/Dialect/IR/CIROpsEnums.cpp.inc" #include "clang/CIR/MissingFeatures.h" +#include using namespace mlir; using namespace cir; @@ -962,6 +963,101 @@ bool cir::SwitchOp::isSimpleForm(llvm::SmallVectorImpl &cases) { }); } +//===----------------------------------------------------------------------===// +// SwitchFlatOp +//===----------------------------------------------------------------------===// + +void cir::SwitchFlatOp::build(OpBuilder &builder, OperationState &result, + Value value, Block *defaultDestination, + ValueRange defaultOperands, + ArrayRef caseValues, + BlockRange caseDestinations, + ArrayRef caseOperands) { + + std::vector caseValuesAttrs; + for (const APInt &val : caseValues) + caseValuesAttrs.push_back(cir::IntAttr::get(value.getType(), val)); + mlir::ArrayAttr attrs = 
ArrayAttr::get(builder.getContext(), caseValuesAttrs); + + build(builder, result, value, defaultOperands, caseOperands, attrs, + defaultDestination, caseDestinations); +} + +/// ::= `[` (case (`,` case )* )? `]` +/// ::= integer `:` bb-id (`(` ssa-use-and-type-list `)`)? +static ParseResult parseSwitchFlatOpCases( + OpAsmParser &parser, Type flagType, mlir::ArrayAttr &caseValues, + SmallVectorImpl &caseDestinations, + SmallVectorImpl> + &caseOperands, + SmallVectorImpl> &caseOperandTypes) { + if (failed(parser.parseLSquare())) + return failure(); + if (succeeded(parser.parseOptionalRSquare())) + return success(); + llvm::SmallVector values; + + auto parseCase = [&]() { + int64_t value = 0; + if (failed(parser.parseInteger(value))) + return failure(); + + values.push_back(cir::IntAttr::get(flagType, value)); + + Block *destination; + llvm::SmallVector operands; + llvm::SmallVector operandTypes; + if (parser.parseColon() || parser.parseSuccessor(destination)) + return failure(); + if (!parser.parseOptionalLParen()) { + if (parser.parseOperandList(operands, OpAsmParser::Delimiter::None, + /*allowResultNumber=*/false) || + parser.parseColonTypeList(operandTypes) || parser.parseRParen()) + return failure(); + } + caseDestinations.push_back(destination); + caseOperands.emplace_back(operands); + caseOperandTypes.emplace_back(operandTypes); + return success(); + }; + if (failed(parser.parseCommaSeparatedList(parseCase))) + return failure(); + + caseValues = ArrayAttr::get(flagType.getContext(), values); + + return parser.parseRSquare(); +} + +static void printSwitchFlatOpCases(OpAsmPrinter &p, cir::SwitchFlatOp op, + Type flagType, mlir::ArrayAttr caseValues, + SuccessorRange caseDestinations, + OperandRangeRange caseOperands, + const TypeRangeRange &caseOperandTypes) { + p << '['; + p.printNewline(); + if (!caseValues) { + p << ']'; + return; + } + + size_t index = 0; + llvm::interleave( + llvm::zip(caseValues, caseDestinations), + [&](auto i) { + p << " "; + 
mlir::Attribute a = std::get<0>(i); + p << mlir::cast(a).getValue(); + p << ": "; + p.printSuccessorAndUseList(std::get<1>(i), caseOperands[index++]); + }, + [&] { + p << ','; + p.printNewline(); + }); + p.printNewline(); + p << ']'; +} + //===----------------------------------------------------------------------===// // GlobalOp //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Dialect/OpenACC/CIROpenACCTypeInterfaces.cpp b/clang/lib/CIR/Dialect/OpenACC/CIROpenACCTypeInterfaces.cpp new file mode 100644 index 0000000000000..de8dd9c55ee32 --- /dev/null +++ b/clang/lib/CIR/Dialect/OpenACC/CIROpenACCTypeInterfaces.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of external dialect interfaces for CIR. +// +//===----------------------------------------------------------------------===// + +#include "clang/CIR/Dialect/OpenACC/CIROpenACCTypeInterfaces.h" +#include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" + +namespace cir::acc { + +mlir::Type getBaseType(mlir::Value varPtr) { + mlir::Operation *op = varPtr.getDefiningOp(); + assert(op && "Expected a defining operation"); + + // This is the variable definition we're looking for. + if (auto allocaOp = mlir::dyn_cast(*op)) + return allocaOp.getAllocaType(); + + // Look through casts to the source pointer. + if (auto castOp = mlir::dyn_cast(*op)) + return getBaseType(castOp.getSrc()); + + // Follow the source of ptr strides. 
+ if (auto ptrStrideOp = mlir::dyn_cast(*op)) + return getBaseType(ptrStrideOp.getBase()); + + if (auto getMemberOp = mlir::dyn_cast(*op)) + return getBaseType(getMemberOp.getAddr()); + + return mlir::cast(varPtr.getType()).getPointee(); +} + +template <> +mlir::acc::VariableTypeCategory +OpenACCPointerLikeModel::getPointeeTypeCategory( + mlir::Type pointer, mlir::TypedValue varPtr, + mlir::Type varType) const { + mlir::Type eleTy = getBaseType(varPtr); + + if (auto mappableTy = mlir::dyn_cast(eleTy)) + return mappableTy.getTypeCategory(varPtr); + + if (isAnyIntegerOrFloatingPointType(eleTy) || + mlir::isa(eleTy) || mlir::isa(eleTy)) + return mlir::acc::VariableTypeCategory::scalar; + if (mlir::isa(eleTy)) + return mlir::acc::VariableTypeCategory::array; + if (mlir::isa(eleTy)) + return mlir::acc::VariableTypeCategory::composite; + if (mlir::isa(eleTy) || mlir::isa(eleTy)) + return mlir::acc::VariableTypeCategory::nonscalar; + + // Without further checking, this type cannot be categorized. + return mlir::acc::VariableTypeCategory::uncategorized; +} + +} // namespace cir::acc diff --git a/clang/lib/CIR/Dialect/OpenACC/CMakeLists.txt b/clang/lib/CIR/Dialect/OpenACC/CMakeLists.txt new file mode 100644 index 0000000000000..de27f4cb27c59 --- /dev/null +++ b/clang/lib/CIR/Dialect/OpenACC/CMakeLists.txt @@ -0,0 +1,12 @@ +add_clang_library(CIROpenACCSupport + CIROpenACCTypeInterfaces.cpp + RegisterOpenACCExtensions.cpp + + DEPENDS + MLIRCIRTypeConstraintsIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRCIR + MLIROpenACCDialect + ) diff --git a/clang/lib/CIR/Dialect/OpenACC/RegisterOpenACCExtensions.cpp b/clang/lib/CIR/Dialect/OpenACC/RegisterOpenACCExtensions.cpp new file mode 100644 index 0000000000000..3dfe7aeb6b1d6 --- /dev/null +++ b/clang/lib/CIR/Dialect/OpenACC/RegisterOpenACCExtensions.cpp @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Registration for OpenACC extensions as applied to CIR dialect. +// +//===----------------------------------------------------------------------===// + +#include "clang/CIR/Dialect/OpenACC/RegisterOpenACCExtensions.h" +#include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" +#include "clang/CIR/Dialect/OpenACC/CIROpenACCTypeInterfaces.h" + +namespace cir::acc { + +void registerOpenACCExtensions(mlir::DialectRegistry ®istry) { + registry.addExtension(+[](mlir::MLIRContext *ctx, cir::CIRDialect *dialect) { + cir::PointerType::attachInterface< + OpenACCPointerLikeModel>(*ctx); + }); +} + +} // namespace cir::acc diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp index 798bc0dab9384..fb000adee04c6 100644 --- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp +++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp @@ -84,6 +84,19 @@ struct RemoveEmptyScope : public OpRewritePattern { } }; +struct RemoveEmptySwitch : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(SwitchOp op, + PatternRewriter &rewriter) const final { + if (!(op.getBody().empty() || isa(op.getBody().front().front()))) + return failure(); + + rewriter.eraseOp(op); + return success(); + } +}; + //===----------------------------------------------------------------------===// // CIRCanonicalizePass //===----------------------------------------------------------------------===// @@ -127,8 +140,8 @@ void CIRCanonicalizePass::runOnOperation() { assert(!cir::MissingFeatures::callOp()); // CastOp, UnaryOp and VecExtractOp are here to perform a manual `fold` in // applyOpPatternsGreedily. 
- if (isa( - op)) + if (isa(op)) ops.push_back(op); }); diff --git a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp index 4a936d33b022a..26e5c0572f12e 100644 --- a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp +++ b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp @@ -171,6 +171,233 @@ class CIRScopeOpFlattening : public mlir::OpRewritePattern { } }; +class CIRSwitchOpFlattening : public mlir::OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + inline void rewriteYieldOp(mlir::PatternRewriter &rewriter, + cir::YieldOp yieldOp, + mlir::Block *destination) const { + rewriter.setInsertionPoint(yieldOp); + rewriter.replaceOpWithNewOp(yieldOp, yieldOp.getOperands(), + destination); + } + + // Return the new defaultDestination block. + Block *condBrToRangeDestination(cir::SwitchOp op, + mlir::PatternRewriter &rewriter, + mlir::Block *rangeDestination, + mlir::Block *defaultDestination, + const APInt &lowerBound, + const APInt &upperBound) const { + assert(lowerBound.sle(upperBound) && "Invalid range"); + mlir::Block *resBlock = rewriter.createBlock(defaultDestination); + cir::IntType sIntType = cir::IntType::get(op.getContext(), 32, true); + cir::IntType uIntType = cir::IntType::get(op.getContext(), 32, false); + + cir::ConstantOp rangeLength = rewriter.create( + op.getLoc(), cir::IntAttr::get(sIntType, upperBound - lowerBound)); + + cir::ConstantOp lowerBoundValue = rewriter.create( + op.getLoc(), cir::IntAttr::get(sIntType, lowerBound)); + cir::BinOp diffValue = + rewriter.create(op.getLoc(), sIntType, cir::BinOpKind::Sub, + op.getCondition(), lowerBoundValue); + + // Use unsigned comparison to check if the condition is in the range. 
+ cir::CastOp uDiffValue = rewriter.create( + op.getLoc(), uIntType, CastKind::integral, diffValue); + cir::CastOp uRangeLength = rewriter.create( + op.getLoc(), uIntType, CastKind::integral, rangeLength); + + cir::CmpOp cmpResult = rewriter.create( + op.getLoc(), cir::BoolType::get(op.getContext()), cir::CmpOpKind::le, + uDiffValue, uRangeLength); + rewriter.create(op.getLoc(), cmpResult, rangeDestination, + defaultDestination); + return resBlock; + } + + mlir::LogicalResult + matchAndRewrite(cir::SwitchOp op, + mlir::PatternRewriter &rewriter) const override { + llvm::SmallVector cases; + op.collectCases(cases); + + // Empty switch statement: just erase it. + if (cases.empty()) { + rewriter.eraseOp(op); + return mlir::success(); + } + + // Create exit block from the next node of cir.switch op. + mlir::Block *exitBlock = rewriter.splitBlock( + rewriter.getBlock(), op->getNextNode()->getIterator()); + + // We lower cir.switch op in the following process: + // 1. Inline the region from the switch op after switch op. + // 2. Traverse each cir.case op: + // a. Record the entry block, block arguments and condition for every + // case. b. Inline the case region after the case op. + // 3. Replace the empty cir.switch.op with the new cir.switchflat op by the + // recorded block and conditions. + + // inline everything from switch body between the switch op and the exit + // block. + { + cir::YieldOp switchYield = nullptr; + // Clear switch operation. 
+ for (mlir::Block &block : + llvm::make_early_inc_range(op.getBody().getBlocks())) + if (auto yieldOp = dyn_cast(block.getTerminator())) + switchYield = yieldOp; + + assert(!op.getBody().empty()); + mlir::Block *originalBlock = op->getBlock(); + mlir::Block *swopBlock = + rewriter.splitBlock(originalBlock, op->getIterator()); + rewriter.inlineRegionBefore(op.getBody(), exitBlock); + + if (switchYield) + rewriteYieldOp(rewriter, switchYield, exitBlock); + + rewriter.setInsertionPointToEnd(originalBlock); + rewriter.create(op.getLoc(), swopBlock); + } + + // Allocate required data structures (disconsider default case in + // vectors). + llvm::SmallVector caseValues; + llvm::SmallVector caseDestinations; + llvm::SmallVector caseOperands; + + llvm::SmallVector> rangeValues; + llvm::SmallVector rangeDestinations; + llvm::SmallVector rangeOperands; + + // Initialize default case as optional. + mlir::Block *defaultDestination = exitBlock; + mlir::ValueRange defaultOperands = exitBlock->getArguments(); + + // Digest the case statements values and bodies. + for (cir::CaseOp caseOp : cases) { + mlir::Region ®ion = caseOp.getCaseRegion(); + + // Found default case: save destination and operands. + switch (caseOp.getKind()) { + case cir::CaseOpKind::Default: + defaultDestination = ®ion.front(); + defaultOperands = defaultDestination->getArguments(); + break; + case cir::CaseOpKind::Range: + assert(caseOp.getValue().size() == 2 && + "Case range should have 2 case value"); + rangeValues.push_back( + {cast(caseOp.getValue()[0]).getValue(), + cast(caseOp.getValue()[1]).getValue()}); + rangeDestinations.push_back(®ion.front()); + rangeOperands.push_back(rangeDestinations.back()->getArguments()); + break; + case cir::CaseOpKind::Anyof: + case cir::CaseOpKind::Equal: + // AnyOf cases kind can have multiple values, hence the loop below. 
+ for (const mlir::Attribute &value : caseOp.getValue()) { + caseValues.push_back(cast(value).getValue()); + caseDestinations.push_back(®ion.front()); + caseOperands.push_back(caseDestinations.back()->getArguments()); + } + break; + } + + // Handle break statements. + walkRegionSkipping( + region, [&](mlir::Operation *op) { + if (!isa(op)) + return mlir::WalkResult::advance(); + + lowerTerminator(op, exitBlock, rewriter); + return mlir::WalkResult::skip(); + }); + + // Track fallthrough in cases. + for (mlir::Block &blk : region.getBlocks()) { + if (blk.getNumSuccessors()) + continue; + + if (auto yieldOp = dyn_cast(blk.getTerminator())) { + mlir::Operation *nextOp = caseOp->getNextNode(); + assert(nextOp && "caseOp is not expected to be the last op"); + mlir::Block *oldBlock = nextOp->getBlock(); + mlir::Block *newBlock = + rewriter.splitBlock(oldBlock, nextOp->getIterator()); + rewriter.setInsertionPointToEnd(oldBlock); + rewriter.create(nextOp->getLoc(), mlir::ValueRange(), + newBlock); + rewriteYieldOp(rewriter, yieldOp, newBlock); + } + } + + mlir::Block *oldBlock = caseOp->getBlock(); + mlir::Block *newBlock = + rewriter.splitBlock(oldBlock, caseOp->getIterator()); + + mlir::Block &entryBlock = caseOp.getCaseRegion().front(); + rewriter.inlineRegionBefore(caseOp.getCaseRegion(), newBlock); + + // Create a branch to the entry of the inlined region. + rewriter.setInsertionPointToEnd(oldBlock); + rewriter.create(caseOp.getLoc(), &entryBlock); + } + + // Remove all cases since we've inlined the regions. + for (cir::CaseOp caseOp : cases) { + mlir::Block *caseBlock = caseOp->getBlock(); + // Erase the block with no predecessors here to make the generated code + // simpler a little bit. 
+ if (caseBlock->hasNoPredecessors()) + rewriter.eraseBlock(caseBlock); + else + rewriter.eraseOp(caseOp); + } + + for (auto [rangeVal, operand, destination] : + llvm::zip(rangeValues, rangeOperands, rangeDestinations)) { + APInt lowerBound = rangeVal.first; + APInt upperBound = rangeVal.second; + + // The case range is unreachable, skip it. + if (lowerBound.sgt(upperBound)) + continue; + + // If range is small, add multiple switch instruction cases. + // This magical number is from the original CGStmt code. + constexpr int kSmallRangeThreshold = 64; + if ((upperBound - lowerBound) + .ult(llvm::APInt(32, kSmallRangeThreshold))) { + for (APInt iValue = lowerBound; iValue.sle(upperBound); ++iValue) { + caseValues.push_back(iValue); + caseOperands.push_back(operand); + caseDestinations.push_back(destination); + } + continue; + } + + defaultDestination = + condBrToRangeDestination(op, rewriter, destination, + defaultDestination, lowerBound, upperBound); + defaultOperands = operand; + } + + // Set switch op to branch to the newly created blocks. 
+ rewriter.setInsertionPoint(op); + rewriter.replaceOpWithNewOp( + op, op.getCondition(), defaultDestination, defaultOperands, caseValues, + caseDestinations, caseOperands); + + return mlir::success(); + } +}; + class CIRLoopOpInterfaceFlattening : public mlir::OpInterfaceRewritePattern { public: @@ -306,9 +533,10 @@ class CIRTernaryOpFlattening : public mlir::OpRewritePattern { }; void populateFlattenCFGPatterns(RewritePatternSet &patterns) { - patterns.add( - patterns.getContext()); + patterns + .add( + patterns.getContext()); } void CIRFlattenCFGPass::runOnOperation() { @@ -321,7 +549,7 @@ void CIRFlattenCFGPass::runOnOperation() { assert(!cir::MissingFeatures::ifOp()); assert(!cir::MissingFeatures::switchOp()); assert(!cir::MissingFeatures::tryOp()); - if (isa(op)) + if (isa(op)) ops.push_back(op); }); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 3c85bb4b6b41d..c8eac87f6cdff 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -54,11 +54,11 @@ namespace direct { namespace { /// If the given type is a vector type, return the vector's element type. /// Otherwise return the given type unchanged. -// TODO(cir): Return the vector element type once we have support for vectors -// instead of the identity type. 
mlir::Type elementTypeIfVector(mlir::Type type) { - assert(!cir::MissingFeatures::vectorType()); - return type; + return llvm::TypeSwitch(type) + .Case( + [](auto p) { return p.getElementType(); }) + .Default([](mlir::Type p) { return p; }); } } // namespace diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp index 40627d6ffd3c9..adb353d568cd2 100644 --- a/clang/lib/CodeGen/CGBlocks.cpp +++ b/clang/lib/CodeGen/CGBlocks.cpp @@ -126,7 +126,7 @@ static std::string getBlockDescriptorName(const CGBlockInfo &BlockInfo, CGM.getContext().getObjCEncodingForBlock(BlockInfo.getBlockExpr()); /// Replace occurrences of '@' with '\1'. '@' is reserved on ELF platforms /// as a separator between symbol name and symbol version. - std::replace(TypeAtEncoding.begin(), TypeAtEncoding.end(), '@', '\1'); + llvm::replace(TypeAtEncoding, '@', '\1'); } Name += "e" + llvm::to_string(TypeAtEncoding.size()) + "_" + TypeAtEncoding; Name += "l" + CGM.getObjCRuntime().getRCBlockLayoutStr(CGM, BlockInfo); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 45e0f69c46902..48cfbda12b2ac 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -909,7 +909,8 @@ class StructFieldAccess bool AddrOfSeen = false; public: - const ArraySubscriptExpr *ASE = nullptr; + const Expr *ArrayIndex = nullptr; + QualType ArrayElementTy; const Expr *VisitMemberExpr(const MemberExpr *E) { if (AddrOfSeen && E->getType()->isArrayType()) @@ -919,12 +920,13 @@ class StructFieldAccess } const Expr *VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { - if (ASE) + if (ArrayIndex) // We don't support multiple subscripts. return nullptr; AddrOfSeen = false; // '&ptr->array[idx]' is okay. 
- ASE = E; + ArrayIndex = E->getIdx(); + ArrayElementTy = E->getBase()->getType(); return Visit(E->getBase()); } const Expr *VisitCastExpr(const CastExpr *E) { @@ -1016,12 +1018,10 @@ GetFieldOffset(ASTContext &Ctx, const RecordDecl *RD, const FieldDecl *FD) { return std::nullopt; } -llvm::Value * -CodeGenFunction::emitCountedByMemberSize(const Expr *E, llvm::Value *EmittedE, - unsigned Type, - llvm::IntegerType *ResType) { - ASTContext &Ctx = getContext(); - +llvm::Value *CodeGenFunction::emitCountedBySize(const Expr *E, + llvm::Value *EmittedE, + unsigned Type, + llvm::IntegerType *ResType) { // Note: If the whole struct is specificed in the __bdos (i.e. Visitor // returns a DeclRefExpr). The calculation of the whole size of the structure // with a flexible array member can be done in two ways: @@ -1040,14 +1040,224 @@ CodeGenFunction::emitCountedByMemberSize(const Expr *E, llvm::Value *EmittedE, // GCC does for consistency's sake. StructFieldAccess Visitor; - const MemberExpr *ME = dyn_cast_if_present(Visitor.Visit(E)); + E = Visitor.Visit(E); + if (!E) + return nullptr; + + const Expr *Idx = Visitor.ArrayIndex; + if (Idx) { + if (Idx->HasSideEffects(getContext())) + // We can't have side-effects. + return getDefaultBuiltinObjectSizeResult(Type, ResType); + + if (const auto *IL = dyn_cast(Idx)) { + int64_t Val = IL->getValue().getSExtValue(); + if (Val < 0) + return getDefaultBuiltinObjectSizeResult(Type, ResType); + + // The index is 0, so we don't need to take it into account. + if (Val == 0) + Idx = nullptr; + } + } + + // __counted_by on either a flexible array member or a pointer into a struct + // with a flexible array member. + if (const auto *ME = dyn_cast(E)) + return emitCountedByMemberSize(ME, Idx, EmittedE, Visitor.ArrayElementTy, + Type, ResType); + + // __counted_by on a pointer in a struct. 
+ if (const auto *ICE = dyn_cast(E); + ICE && ICE->getCastKind() == CK_LValueToRValue) + return emitCountedByPointerSize(ICE, Idx, EmittedE, Visitor.ArrayElementTy, + Type, ResType); + + return nullptr; +} + +static llvm::Value *EmitPositiveResultOrZero(CodeGenFunction &CGF, + llvm::Value *Res, + llvm::Value *Index, + llvm::IntegerType *ResType, + bool IsSigned) { + // cmp = (array_size >= 0) + Value *Cmp = CGF.Builder.CreateIsNotNeg(Res); + if (Index) + // cmp = (cmp && index >= 0) + Cmp = CGF.Builder.CreateAnd(CGF.Builder.CreateIsNotNeg(Index), Cmp); + + // return cmp ? result : 0 + return CGF.Builder.CreateSelect(Cmp, Res, + ConstantInt::get(ResType, 0, IsSigned)); +} + +static std::pair +GetCountFieldAndIndex(CodeGenFunction &CGF, const MemberExpr *ME, + const FieldDecl *ArrayFD, const FieldDecl *CountFD, + const Expr *Idx, llvm::IntegerType *ResType, + bool IsSigned) { + // count = ptr->count; + Value *Count = CGF.EmitLoadOfCountedByField(ME, ArrayFD, CountFD); + if (!Count) + return std::make_pair(nullptr, nullptr); + Count = CGF.Builder.CreateIntCast(Count, ResType, IsSigned, "count"); + + // index = ptr->index; + Value *Index = nullptr; + if (Idx) { + bool IdxSigned = Idx->getType()->isSignedIntegerType(); + Index = CGF.EmitScalarExpr(Idx); + Index = CGF.Builder.CreateIntCast(Index, ResType, IdxSigned, "index"); + } + + return std::make_pair(Count, Index); +} + +llvm::Value *CodeGenFunction::emitCountedByPointerSize( + const ImplicitCastExpr *E, const Expr *Idx, llvm::Value *EmittedE, + QualType CastedArrayElementTy, unsigned Type, llvm::IntegerType *ResType) { + assert(E->getCastKind() == CK_LValueToRValue && + "must be an LValue to RValue cast"); + + const MemberExpr *ME = dyn_cast(E->getSubExpr()); if (!ME) return nullptr; + const auto *ArrayBaseFD = dyn_cast(ME->getMemberDecl()); + if (!ArrayBaseFD || !ArrayBaseFD->getType()->isPointerType() || + !ArrayBaseFD->getType()->isCountAttributedType()) + return nullptr; + + // Get the 'count' FieldDecl. 
+ const FieldDecl *CountFD = ArrayBaseFD->findCountedByField(); + if (!CountFD) + // Can't find the field referenced by the "counted_by" attribute. + return nullptr; + + // Calculate the array's object size using these formulae. (Note: if the + // calculation is negative, we return 0.): + // + // struct p; + // struct s { + // /* ... */ + // struct p **array __attribute__((counted_by(count))); + // int count; + // }; + // + // 1) 'ptr->array': + // + // count = ptr->count; + // + // array_element_size = sizeof (*ptr->array); + // array_size = count * array_element_size; + // + // result = array_size; + // + // cmp = (result >= 0) + // return cmp ? result : 0; + // + // 2) '&((cast) ptr->array)[idx]': + // + // count = ptr->count; + // index = idx; + // + // array_element_size = sizeof (*ptr->array); + // array_size = count * array_element_size; + // + // casted_array_element_size = sizeof (*((cast) ptr->array)); + // + // index_size = index * casted_array_element_size; + // result = array_size - index_size; + // + // cmp = (result >= 0) + // if (index) + // cmp = (cmp && index > 0) + // return cmp ? result : 0; + + auto GetElementBaseSize = [&](QualType ElementTy) { + CharUnits ElementSize = + getContext().getTypeSizeInChars(ElementTy->getPointeeType()); + + if (ElementSize.isZero()) { + // This might be a __sized_by on a 'void *', which counts bytes, not + // elements. + auto *CAT = ElementTy->getAs(); + if (!CAT || (CAT->getKind() != CountAttributedType::SizedBy && + CAT->getKind() != CountAttributedType::SizedByOrNull)) + // Okay, not sure what it is now. + // FIXME: Should this be an assert? + return std::optional(); + + ElementSize = CharUnits::One(); + } + + return std::optional(ElementSize); + }; + + // Get the sizes of the original array element and the casted array element, + // if different. 
+ std::optional ArrayElementBaseSize = + GetElementBaseSize(ArrayBaseFD->getType()); + if (!ArrayElementBaseSize) + return nullptr; + + std::optional CastedArrayElementBaseSize = ArrayElementBaseSize; + if (!CastedArrayElementTy.isNull() && CastedArrayElementTy->isPointerType()) { + CastedArrayElementBaseSize = GetElementBaseSize(CastedArrayElementTy); + if (!CastedArrayElementBaseSize) + return nullptr; + } + + bool IsSigned = CountFD->getType()->isSignedIntegerType(); + + // count = ptr->count; + // index = ptr->index; + Value *Count, *Index; + std::tie(Count, Index) = GetCountFieldAndIndex( + *this, ME, ArrayBaseFD, CountFD, Idx, ResType, IsSigned); + if (!Count) + return nullptr; + + // array_element_size = sizeof (*ptr->array) + auto *ArrayElementSize = llvm::ConstantInt::get( + ResType, ArrayElementBaseSize->getQuantity(), IsSigned); + + // casted_array_element_size = sizeof (*((cast) ptr->array)); + auto *CastedArrayElementSize = llvm::ConstantInt::get( + ResType, CastedArrayElementBaseSize->getQuantity(), IsSigned); + + // array_size = count * array_element_size; + Value *ArraySize = Builder.CreateMul(Count, ArrayElementSize, "array_size", + !IsSigned, IsSigned); + + // Option (1) 'ptr->array' + // result = array_size + Value *Result = ArraySize; + + if (Idx) { // Option (2) '&((cast) ptr->array)[idx]' + // index_size = index * casted_array_element_size; + Value *IndexSize = Builder.CreateMul(Index, CastedArrayElementSize, + "index_size", !IsSigned, IsSigned); + + // result = result - index_size; + Result = + Builder.CreateSub(Result, IndexSize, "result", !IsSigned, IsSigned); + } + + return EmitPositiveResultOrZero(*this, Result, Index, ResType, IsSigned); +} + +llvm::Value *CodeGenFunction::emitCountedByMemberSize( + const MemberExpr *ME, const Expr *Idx, llvm::Value *EmittedE, + QualType CastedArrayElementTy, unsigned Type, llvm::IntegerType *ResType) { const auto *FD = dyn_cast(ME->getMemberDecl()); if (!FD) return nullptr; + // Find the flexible array 
member and check that it has the __counted_by + // attribute. + ASTContext &Ctx = getContext(); const RecordDecl *RD = FD->getDeclContext()->getOuterLexicalRecordContext(); const FieldDecl *FlexibleArrayMemberFD = nullptr; @@ -1062,32 +1272,14 @@ CodeGenFunction::emitCountedByMemberSize(const Expr *E, llvm::Value *EmittedE, !FlexibleArrayMemberFD->getType()->isCountAttributedType()) return nullptr; + // Get the 'count' FieldDecl. const FieldDecl *CountFD = FlexibleArrayMemberFD->findCountedByField(); if (!CountFD) // Can't find the field referenced by the "counted_by" attribute. return nullptr; - const Expr *Idx = nullptr; - if (Visitor.ASE) { - Idx = Visitor.ASE->getIdx(); - - if (Idx->HasSideEffects(Ctx)) - // We can't have side-effects. - return getDefaultBuiltinObjectSizeResult(Type, ResType); - - if (const auto *IL = dyn_cast(Idx)) { - int64_t Val = IL->getValue().getSExtValue(); - if (Val < 0) - return getDefaultBuiltinObjectSizeResult(Type, ResType); - - // The index is 0, so we don't need to take it into account. - if (Val == 0) - Idx = nullptr; - } - } - - // Calculate the flexible array member's object size using these formulae - // (note: if the calculation is negative, we return 0.): + // Calculate the flexible array member's object size using these formulae. + // (Note: if the calculation is negative, we return 0.): // // struct p; // struct s { @@ -1100,76 +1292,84 @@ CodeGenFunction::emitCountedByMemberSize(const Expr *E, llvm::Value *EmittedE, // // count = ptr->count; // - // flexible_array_member_base_size = sizeof (*ptr->array); + // flexible_array_member_element_size = sizeof (*ptr->array); // flexible_array_member_size = - // count * flexible_array_member_base_size; + // count * flexible_array_member_element_size; + // + // result = flexible_array_member_size; // - // if (flexible_array_member_size < 0) - // return 0; - // return flexible_array_member_size; + // cmp = (result >= 0) + // return cmp ? 
result : 0; // - // 2) '&ptr->array[idx]': + // 2) '&((cast) ptr->array)[idx]': // // count = ptr->count; // index = idx; // - // flexible_array_member_base_size = sizeof (*ptr->array); + // flexible_array_member_element_size = sizeof (*ptr->array); // flexible_array_member_size = - // count * flexible_array_member_base_size; + // count * flexible_array_member_element_size; // - // index_size = index * flexible_array_member_base_size; + // casted_flexible_array_member_element_size = + // sizeof (*((cast) ptr->array)); + // index_size = index * casted_flexible_array_member_element_size; // - // if (flexible_array_member_size < 0 || index < 0) - // return 0; - // return flexible_array_member_size - index_size; + // result = flexible_array_member_size - index_size; + // + // cmp = (result >= 0) + // if (index != 0) + // cmp = (cmp && index >= 0) + // return cmp ? result : 0; // // 3) '&ptr->field': // // count = ptr->count; // sizeof_struct = sizeof (struct s); // - // flexible_array_member_base_size = sizeof (*ptr->array); + // flexible_array_member_element_size = sizeof (*ptr->array); // flexible_array_member_size = - // count * flexible_array_member_base_size; + // count * flexible_array_member_element_size; // // field_offset = offsetof (struct s, field); // offset_diff = sizeof_struct - field_offset; // - // if (flexible_array_member_size < 0) - // return 0; - // return offset_diff + flexible_array_member_size; + // result = offset_diff + flexible_array_member_size; + // + // cmp = (result >= 0) + // return cmp ? 
result : 0; // - // 4) '&ptr->field_array[idx]': + // 4) '&((cast) ptr->field_array)[idx]': // // count = ptr->count; // index = idx; // sizeof_struct = sizeof (struct s); // - // flexible_array_member_base_size = sizeof (*ptr->array); + // flexible_array_member_element_size = sizeof (*ptr->array); // flexible_array_member_size = - // count * flexible_array_member_base_size; + // count * flexible_array_member_element_size; // - // field_base_size = sizeof (*ptr->field_array); + // casted_field_element_size = sizeof (*((cast) ptr->field_array)); // field_offset = offsetof (struct s, field) - // field_offset += index * field_base_size; + // field_offset += index * casted_field_element_size; // // offset_diff = sizeof_struct - field_offset; // - // if (flexible_array_member_size < 0 || index < 0) - // return 0; - // return offset_diff + flexible_array_member_size; + // result = offset_diff + flexible_array_member_size; + // + // cmp = (result >= 0) + // if (index != 0) + // cmp = (cmp && index >= 0) + // return cmp ? result : 0; - QualType CountTy = CountFD->getType(); - bool IsSigned = CountTy->isSignedIntegerType(); + bool IsSigned = CountFD->getType()->isSignedIntegerType(); QualType FlexibleArrayMemberTy = FlexibleArrayMemberFD->getType(); - QualType FieldTy = FD->getType(); // Explicit cast because otherwise the CharWidth will promote an i32's into - // u64's leading to overflows.. + // u64's leading to overflows. 
int64_t CharWidth = static_cast(CGM.getContext().getCharWidth()); - // size_t field_offset = offsetof (struct s, field); + // field_offset = offsetof (struct s, field); Value *FieldOffset = nullptr; if (FlexibleArrayMemberFD != FD) { std::optional Offset = GetFieldOffset(Ctx, RD, FD); @@ -1179,81 +1379,90 @@ CodeGenFunction::emitCountedByMemberSize(const Expr *E, llvm::Value *EmittedE, llvm::ConstantInt::get(ResType, *Offset / CharWidth, IsSigned); } - // size_t count = (size_t) ptr->count; - Value *Count = EmitLoadOfCountedByField(ME, FlexibleArrayMemberFD, CountFD); + // count = ptr->count; + // index = ptr->index; + Value *Count, *Index; + std::tie(Count, Index) = GetCountFieldAndIndex( + *this, ME, FlexibleArrayMemberFD, CountFD, Idx, ResType, IsSigned); if (!Count) return nullptr; - Count = Builder.CreateIntCast(Count, ResType, IsSigned, "count"); - // size_t index = (size_t) ptr->index; - Value *Index = nullptr; - if (Idx) { - bool IdxSigned = Idx->getType()->isSignedIntegerType(); - Index = EmitScalarExpr(Idx); - Index = Builder.CreateIntCast(Index, ResType, IdxSigned, "index"); - } - - // size_t flexible_array_member_base_size = sizeof (*ptr->array); + // flexible_array_member_element_size = sizeof (*ptr->array); const ArrayType *ArrayTy = Ctx.getAsArrayType(FlexibleArrayMemberTy); CharUnits BaseSize = Ctx.getTypeSizeInChars(ArrayTy->getElementType()); - auto *FlexibleArrayMemberBaseSize = + auto *FlexibleArrayMemberElementSize = llvm::ConstantInt::get(ResType, BaseSize.getQuantity(), IsSigned); - // size_t flexible_array_member_size = - // count * flexible_array_member_base_size; + // flexible_array_member_size = count * flexible_array_member_element_size; Value *FlexibleArrayMemberSize = - Builder.CreateMul(Count, FlexibleArrayMemberBaseSize, + Builder.CreateMul(Count, FlexibleArrayMemberElementSize, "flexible_array_member_size", !IsSigned, IsSigned); - Value *Res = nullptr; + Value *Result = nullptr; if (FlexibleArrayMemberFD == FD) { - if (Idx) { // 
Option (2) '&ptr->array[idx]' - // size_t index_size = index * flexible_array_member_base_size; - Value *IndexSize = Builder.CreateMul(FlexibleArrayMemberBaseSize, Index, - "index_size", !IsSigned, IsSigned); - - // return flexible_array_member_size - index_size; - Res = Builder.CreateSub(FlexibleArrayMemberSize, IndexSize, "result", - !IsSigned, IsSigned); + if (Idx) { // Option (2) '&((cast) ptr->array)[idx]' + // casted_flexible_array_member_element_size = + // sizeof (*((cast) ptr->array)); + llvm::ConstantInt *CastedFlexibleArrayMemberElementSize = + FlexibleArrayMemberElementSize; + if (!CastedArrayElementTy.isNull() && + CastedArrayElementTy->isPointerType()) { + CharUnits BaseSize = + Ctx.getTypeSizeInChars(CastedArrayElementTy->getPointeeType()); + CastedFlexibleArrayMemberElementSize = + llvm::ConstantInt::get(ResType, BaseSize.getQuantity(), IsSigned); + } + + // index_size = index * casted_flexible_array_member_element_size; + Value *IndexSize = + Builder.CreateMul(Index, CastedFlexibleArrayMemberElementSize, + "index_size", !IsSigned, IsSigned); + + // result = flexible_array_member_size - index_size; + Result = Builder.CreateSub(FlexibleArrayMemberSize, IndexSize, "result", + !IsSigned, IsSigned); } else { // Option (1) 'ptr->array' - // return flexible_array_member_size; - Res = FlexibleArrayMemberSize; + // result = flexible_array_member_size; + Result = FlexibleArrayMemberSize; } } else { - // size_t sizeof_struct = sizeof (struct s); + // sizeof_struct = sizeof (struct s); llvm::StructType *StructTy = getTypes().getCGRecordLayout(RD).getLLVMType(); const llvm::DataLayout &Layout = CGM.getDataLayout(); TypeSize Size = Layout.getTypeSizeInBits(StructTy); Value *SizeofStruct = llvm::ConstantInt::get(ResType, Size.getKnownMinValue() / CharWidth); - if (Idx) { // Option (4) '&ptr->field_array[idx]' - // size_t field_base_size = sizeof (*ptr->field_array); - const ArrayType *ArrayTy = Ctx.getAsArrayType(FieldTy); - CharUnits BaseSize = 
Ctx.getTypeSizeInChars(ArrayTy->getElementType()); - auto *FieldBaseSize = + if (Idx) { // Option (4) '&((cast) ptr->field_array)[idx]' + // casted_field_element_size = sizeof (*((cast) ptr->field_array)); + CharUnits BaseSize; + if (!CastedArrayElementTy.isNull() && + CastedArrayElementTy->isPointerType()) { + BaseSize = + Ctx.getTypeSizeInChars(CastedArrayElementTy->getPointeeType()); + } else { + const ArrayType *ArrayTy = Ctx.getAsArrayType(FD->getType()); + BaseSize = Ctx.getTypeSizeInChars(ArrayTy->getElementType()); + } + + llvm::ConstantInt *CastedFieldElementSize = llvm::ConstantInt::get(ResType, BaseSize.getQuantity(), IsSigned); - // field_offset += index * field_base_size; - Value *Mul = Builder.CreateMul(Index, FieldBaseSize, "field_offset", - !IsSigned, IsSigned); + // field_offset += index * casted_field_element_size; + Value *Mul = Builder.CreateMul(Index, CastedFieldElementSize, + "field_offset", !IsSigned, IsSigned); FieldOffset = Builder.CreateAdd(FieldOffset, Mul); } // Option (3) '&ptr->field', and Option (4) continuation. - - // size_t offset_diff = flexible_array_member_offset - field_offset; + // offset_diff = flexible_array_member_offset - field_offset; Value *OffsetDiff = Builder.CreateSub(SizeofStruct, FieldOffset, "offset_diff", !IsSigned, IsSigned); - // return offset_diff + flexible_array_member_size; - Res = Builder.CreateAdd(FlexibleArrayMemberSize, OffsetDiff, "result"); + // result = offset_diff + flexible_array_member_size; + Result = Builder.CreateAdd(FlexibleArrayMemberSize, OffsetDiff, "result"); } - Value *Cmp = Builder.CreateIsNotNeg(Res); - if (Idx) - Cmp = Builder.CreateAnd(Builder.CreateIsNotNeg(Index), Cmp); - - return Builder.CreateSelect(Cmp, Res, ConstantInt::get(ResType, 0, IsSigned)); + return EmitPositiveResultOrZero(*this, Result, Index, ResType, IsSigned); } /// Returns a Value corresponding to the size of the given expression. 
@@ -1301,7 +1510,7 @@ CodeGenFunction::emitBuiltinObjectSize(const Expr *E, unsigned Type, if (IsDynamic) // Emit special code for a flexible array member with the "counted_by" // attribute. - if (Value *V = emitCountedByMemberSize(E, Ptr, Type, ResType)) + if (Value *V = emitCountedBySize(E, Ptr, Type, ResType)) return V; Function *F = @@ -5906,8 +6115,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_thread_pointer: { if (!getContext().getTargetInfo().isTLSSupported()) CGM.ErrorUnsupported(E, "__builtin_thread_pointer"); - // Fall through - it's already mapped to the intrinsic by ClangBuiltin. - break; + + return RValue::get(Builder.CreateIntrinsic(llvm::Intrinsic::thread_pointer, + {GlobalsInt8PtrTy}, {})); } case Builtin::BI__builtin_os_log_format: return emitBuiltinOSLogFormat(*E); diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index aa1909443e8cd..bcd579454413e 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1366,19 +1366,23 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty, // If we are casting a fixed i8 vector to a scalable i1 predicate // vector, use a vector insert and bitcast the result. 
if (ScalableDstTy->getElementType()->isIntegerTy(1) && - ScalableDstTy->getElementCount().isKnownMultipleOf(8) && FixedSrcTy->getElementType()->isIntegerTy(8)) { ScalableDstTy = llvm::ScalableVectorType::get( FixedSrcTy->getElementType(), - ScalableDstTy->getElementCount().getKnownMinValue() / 8); + llvm::divideCeil( + ScalableDstTy->getElementCount().getKnownMinValue(), 8)); } if (ScalableDstTy->getElementType() == FixedSrcTy->getElementType()) { auto *Load = CGF.Builder.CreateLoad(Src); auto *PoisonVec = llvm::PoisonValue::get(ScalableDstTy); llvm::Value *Result = CGF.Builder.CreateInsertVector( ScalableDstTy, PoisonVec, Load, uint64_t(0), "cast.scalable"); - if (ScalableDstTy != Ty) - Result = CGF.Builder.CreateBitCast(Result, Ty); + ScalableDstTy = cast( + llvm::VectorType::getWithSizeAndScalar(ScalableDstTy, Ty)); + if (Result->getType() != ScalableDstTy) + Result = CGF.Builder.CreateBitCast(Result, ScalableDstTy); + if (Result->getType() != Ty) + Result = CGF.Builder.CreateExtractVector(Ty, Result, uint64_t(0)); return Result; } } @@ -1476,8 +1480,14 @@ CoerceScalableToFixed(CodeGenFunction &CGF, llvm::FixedVectorType *ToTy, // If we are casting a scalable i1 predicate vector to a fixed i8 // vector, first bitcast the source. 
if (FromTy->getElementType()->isIntegerTy(1) && - FromTy->getElementCount().isKnownMultipleOf(8) && ToTy->getElementType() == CGF.Builder.getInt8Ty()) { + if (!FromTy->getElementCount().isKnownMultipleOf(8)) { + FromTy = llvm::ScalableVectorType::get( + FromTy->getElementType(), + llvm::alignTo<8>(FromTy->getElementCount().getKnownMinValue())); + llvm::Value *ZeroVec = llvm::Constant::getNullValue(FromTy); + V = CGF.Builder.CreateInsertVector(FromTy, ZeroVec, V, uint64_t(0)); + } FromTy = llvm::ScalableVectorType::get( ToTy->getElementType(), FromTy->getElementCount().getKnownMinValue() / 8); diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index befbfc64a680c..7710b1aee6f28 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -2042,6 +2042,8 @@ void CodeGenFunction::EmitCXXAggrConstructorCall(const CXXConstructorDecl *ctor, cur->addIncoming(arrayBegin, entryBB); // Inside the loop body, emit the constructor call on the array element. + if (CGM.shouldEmitConvergenceTokens()) + ConvergenceTokenStack.push_back(emitConvergenceLoopToken(loopBB)); // The alignment of the base, adjusted by the size of a single element, // provides a conservative estimate of the alignment of every element. @@ -2101,6 +2103,9 @@ void CodeGenFunction::EmitCXXAggrConstructorCall(const CXXConstructorDecl *ctor, // Patch the earlier check to skip over the loop. if (zeroCheckBranch) zeroCheckBranch->setSuccessor(0, contBB); + if (CGM.shouldEmitConvergenceTokens()) + ConvergenceTokenStack.pop_back(); + EmitBlock(contBB); } @@ -2779,6 +2784,29 @@ void CodeGenFunction::EmitTypeMetadataCodeForVCall(const CXXRecordDecl *RD, } } +/// Converts the CFITypeCheckKind into SanitizerKind::SanitizerOrdinal and +/// llvm::SanitizerStatKind. 
+static std::pair +SanitizerInfoFromCFICheckKind(CodeGenFunction::CFITypeCheckKind TCK) { + switch (TCK) { + case CodeGenFunction::CFITCK_VCall: + return std::make_pair(SanitizerKind::SO_CFIVCall, llvm::SanStat_CFI_VCall); + case CodeGenFunction::CFITCK_NVCall: + return std::make_pair(SanitizerKind::SO_CFINVCall, + llvm::SanStat_CFI_NVCall); + case CodeGenFunction::CFITCK_DerivedCast: + return std::make_pair(SanitizerKind::SO_CFIDerivedCast, + llvm::SanStat_CFI_DerivedCast); + case CodeGenFunction::CFITCK_UnrelatedCast: + return std::make_pair(SanitizerKind::SO_CFIUnrelatedCast, + llvm::SanStat_CFI_UnrelatedCast); + case CodeGenFunction::CFITCK_ICall: + case CodeGenFunction::CFITCK_NVMFCall: + case CodeGenFunction::CFITCK_VMFCall: + llvm_unreachable("unexpected sanitizer kind"); + } +} + void CodeGenFunction::EmitVTablePtrCheckForCall(const CXXRecordDecl *RD, llvm::Value *VTable, CFITypeCheckKind TCK, @@ -2842,30 +2870,7 @@ void CodeGenFunction::EmitVTablePtrCheck(const CXXRecordDecl *RD, !CGM.HasHiddenLTOVisibility(RD)) return; - SanitizerKind::SanitizerOrdinal M; - llvm::SanitizerStatKind SSK; - switch (TCK) { - case CFITCK_VCall: - M = SanitizerKind::SO_CFIVCall; - SSK = llvm::SanStat_CFI_VCall; - break; - case CFITCK_NVCall: - M = SanitizerKind::SO_CFINVCall; - SSK = llvm::SanStat_CFI_NVCall; - break; - case CFITCK_DerivedCast: - M = SanitizerKind::SO_CFIDerivedCast; - SSK = llvm::SanStat_CFI_DerivedCast; - break; - case CFITCK_UnrelatedCast: - M = SanitizerKind::SO_CFIUnrelatedCast; - SSK = llvm::SanStat_CFI_UnrelatedCast; - break; - case CFITCK_ICall: - case CFITCK_NVMFCall: - case CFITCK_VMFCall: - llvm_unreachable("unexpected sanitizer kind"); - } + auto [M, SSK] = SanitizerInfoFromCFICheckKind(TCK); std::string TypeName = RD->getQualifiedNameAsString(); if (getContext().getNoSanitizeList().containsType( diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 0d03923951a16..37a5678aa61d5 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ 
b/clang/lib/CodeGen/CGExpr.cpp @@ -1226,16 +1226,8 @@ void CodeGenFunction::EmitBoundsCheckImpl(const Expr *E, llvm::Value *Bound, SanitizerScope SanScope(this); - llvm::DILocation *CheckDI = Builder.getCurrentDebugLocation(); auto CheckKind = SanitizerKind::SO_ArrayBounds; - // TODO: deprecate ClArrayBoundsPseudoFn - if ((ClArrayBoundsPseudoFn || - CGM.getCodeGenOpts().SanitizeAnnotateDebugInfo.has(CheckKind)) && - CheckDI) { - CheckDI = getDebugInfo()->CreateSyntheticInlineAt( - Builder.getCurrentDebugLocation(), "__ubsan_check_array_bounds"); - } - ApplyDebugLocation ApplyTrapDI(*this, CheckDI); + ApplyDebugLocation ApplyTrapDI(*this, SanitizerAnnotateDebugInfo(CheckKind)); bool IndexSigned = IndexType->isSignedIntegerOrEnumerationType(); llvm::Value *IndexVal = Builder.CreateIntCast(Index, SizeTy, IndexSigned); @@ -1252,6 +1244,35 @@ void CodeGenFunction::EmitBoundsCheckImpl(const Expr *E, llvm::Value *Bound, StaticData, Index); } +llvm::DILocation *CodeGenFunction::SanitizerAnnotateDebugInfo( + SanitizerKind::SanitizerOrdinal CheckKindOrdinal) { + std::string Label; + switch (CheckKindOrdinal) { +#define SANITIZER(NAME, ID) \ + case SanitizerKind::SO_##ID: \ + Label = "__ubsan_check_" NAME; \ + break; +#include "clang/Basic/Sanitizers.def" + default: + llvm_unreachable("unexpected sanitizer kind"); + } + + // Sanitize label + for (unsigned int i = 0; i < Label.length(); i++) + if (!std::isalpha(Label[i])) + Label[i] = '_'; + + llvm::DILocation *CheckDI = Builder.getCurrentDebugLocation(); + // TODO: deprecate ClArrayBoundsPseudoFn + if (((ClArrayBoundsPseudoFn && + CheckKindOrdinal == SanitizerKind::SO_ArrayBounds) || + CGM.getCodeGenOpts().SanitizeAnnotateDebugInfo.has(CheckKindOrdinal)) && + CheckDI) + CheckDI = getDebugInfo()->CreateSyntheticInlineAt(CheckDI, Label); + + return CheckDI; +} + CodeGenFunction::ComplexPairTy CodeGenFunction:: EmitComplexPrePostIncDec(const UnaryOperator *E, LValue LV, bool isInc, bool isPre) { @@ -4274,6 +4295,24 @@ static 
Address emitArraySubscriptGEP(CodeGenFunction &CGF, Address addr, return Address(eltPtr, CGF.ConvertTypeForMem(eltType), eltAlign); } +namespace { + +/// StructFieldAccess is a simple visitor class to grab the first l-value to +/// r-value cast Expr. +struct StructFieldAccess + : public ConstStmtVisitor { + const Expr *VisitCastExpr(const CastExpr *E) { + if (E->getCastKind() == CK_LValueToRValue) + return E; + return Visit(E->getSubExpr()); + } + const Expr *VisitParenExpr(const ParenExpr *E) { + return Visit(E->getSubExpr()); + } +}; + +} // end anonymous namespace + /// The offset of a field from the beginning of the record. static bool getFieldOffsetInBits(CodeGenFunction &CGF, const RecordDecl *RD, const FieldDecl *Field, int64_t &Offset) { @@ -4329,6 +4368,60 @@ static std::optional getOffsetDifferenceInBits(CodeGenFunction &CGF, return std::make_optional(FD1Offset - FD2Offset); } +/// EmitCountedByBoundsChecking - If the array being accessed has a "counted_by" +/// attribute, generate bounds checking code. The "count" field is at the top +/// level of the struct or in an anonymous struct, that's also at the top level. +/// Future expansions may allow the "count" to reside at any place in the +/// struct, but the value of "counted_by" will be a "simple" path to the count, +/// i.e. "a.b.count", so we shouldn't need the full force of EmitLValue or +/// similar to emit the correct GEP. 
+void CodeGenFunction::EmitCountedByBoundsChecking( + const Expr *E, llvm::Value *Idx, Address Addr, QualType IdxTy, + QualType ArrayTy, bool Accessed, bool FlexibleArray) { + const auto *ME = dyn_cast(E->IgnoreImpCasts()); + if (!ME || !ME->getMemberDecl()->getType()->isCountAttributedType()) + return; + + const LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel = + getLangOpts().getStrictFlexArraysLevel(); + if (FlexibleArray && + !ME->isFlexibleArrayMemberLike(getContext(), StrictFlexArraysLevel)) + return; + + const FieldDecl *FD = cast(ME->getMemberDecl()); + const FieldDecl *CountFD = FD->findCountedByField(); + if (!CountFD) + return; + + if (std::optional Diff = + getOffsetDifferenceInBits(*this, CountFD, FD)) { + if (!Addr.isValid()) { + // An invalid Address indicates we're checking a pointer array access. + // Emit the checked L-Value here. + LValue LV = EmitCheckedLValue(E, TCK_MemberAccess); + Addr = LV.getAddress(); + } + + // FIXME: The 'static_cast' is necessary, otherwise the result turns into a + // uint64_t, which messes things up if we have a negative offset difference. + Diff = *Diff / static_cast(CGM.getContext().getCharWidth()); + + // Create a GEP with the byte offset between the counted object and the + // count and use that to load the count value. + Addr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, Int8PtrTy, Int8Ty); + + llvm::Type *CountTy = ConvertType(CountFD->getType()); + llvm::Value *Res = + Builder.CreateInBoundsGEP(Int8Ty, Addr.emitRawPointer(*this), + Builder.getInt32(*Diff), ".counted_by.gep"); + Res = Builder.CreateAlignedLoad(CountTy, Res, getIntAlign(), + ".counted_by.load"); + + // Now emit the bounds checking. + EmitBoundsCheckImpl(E, Res, Idx, IdxTy, ArrayTy, Accessed); + } +} + LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, bool Accessed) { // The index must always be an integer, which is not an aggregate. 
Emit it @@ -4455,46 +4548,10 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, ArrayLV = EmitLValue(Array); auto *Idx = EmitIdxAfterBase(/*Promote*/true); - if (SanOpts.has(SanitizerKind::ArrayBounds)) { - // If the array being accessed has a "counted_by" attribute, generate - // bounds checking code. The "count" field is at the top level of the - // struct or in an anonymous struct, that's also at the top level. Future - // expansions may allow the "count" to reside at any place in the struct, - // but the value of "counted_by" will be a "simple" path to the count, - // i.e. "a.b.count", so we shouldn't need the full force of EmitLValue or - // similar to emit the correct GEP. - const LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel = - getLangOpts().getStrictFlexArraysLevel(); - - if (const auto *ME = dyn_cast(Array); - ME && - ME->isFlexibleArrayMemberLike(getContext(), StrictFlexArraysLevel) && - ME->getMemberDecl()->getType()->isCountAttributedType()) { - const FieldDecl *FAMDecl = cast(ME->getMemberDecl()); - if (const FieldDecl *CountFD = FAMDecl->findCountedByField()) { - if (std::optional Diff = - getOffsetDifferenceInBits(*this, CountFD, FAMDecl)) { - CharUnits OffsetDiff = CGM.getContext().toCharUnitsFromBits(*Diff); - - // Create a GEP with a byte offset between the FAM and count and - // use that to load the count value. - Addr = Builder.CreatePointerBitCastOrAddrSpaceCast( - ArrayLV.getAddress(), Int8PtrTy, Int8Ty); - - llvm::Type *CountTy = ConvertType(CountFD->getType()); - llvm::Value *Res = Builder.CreateInBoundsGEP( - Int8Ty, Addr.emitRawPointer(*this), - Builder.getInt32(OffsetDiff.getQuantity()), ".counted_by.gep"); - Res = Builder.CreateAlignedLoad(CountTy, Res, getIntAlign(), - ".counted_by.load"); - - // Now emit the bounds checking. 
- EmitBoundsCheckImpl(E, Res, Idx, E->getIdx()->getType(), - Array->getType(), Accessed); - } - } - } - } + if (SanOpts.has(SanitizerKind::ArrayBounds)) + EmitCountedByBoundsChecking(Array, Idx, ArrayLV.getAddress(), + E->getIdx()->getType(), Array->getType(), + Accessed, /*FlexibleArray=*/true); // Propagate the alignment from the array itself to the result. QualType arrayType = Array->getType(); @@ -4506,12 +4563,25 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, EltTBAAInfo = CGM.getTBAAInfoForSubobject(ArrayLV, E->getType()); } else { // The base must be a pointer; emit it with an estimate of its alignment. - Addr = EmitPointerWithAlignment(E->getBase(), &EltBaseInfo, &EltTBAAInfo); + Address BaseAddr = + EmitPointerWithAlignment(E->getBase(), &EltBaseInfo, &EltTBAAInfo); auto *Idx = EmitIdxAfterBase(/*Promote*/true); QualType ptrType = E->getBase()->getType(); - Addr = emitArraySubscriptGEP( - *this, Addr, Idx, E->getType(), !getLangOpts().PointerOverflowDefined, - SignedIndices, E->getExprLoc(), &ptrType, E->getBase()); + Addr = emitArraySubscriptGEP(*this, BaseAddr, Idx, E->getType(), + !getLangOpts().PointerOverflowDefined, + SignedIndices, E->getExprLoc(), &ptrType, + E->getBase()); + + if (SanOpts.has(SanitizerKind::ArrayBounds)) { + StructFieldAccess Visitor; + const Expr *Base = Visitor.Visit(E->getBase()); + + if (const auto *CE = dyn_cast_if_present(Base); + CE && CE->getCastKind() == CK_LValueToRValue) + EmitCountedByBoundsChecking(CE, Idx, Address::invalid(), + E->getIdx()->getType(), ptrType, Accessed, + /*FlexibleArray=*/false); + } } LValue LV = MakeAddrLValue(Addr, E->getType(), EltBaseInfo, EltTBAAInfo); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 6765008c99c4a..5d618658bc615 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -1906,7 +1906,7 @@ Value *ScalarExprEmitter::VisitShuffleVectorExpr(ShuffleVectorExpr *E) { SmallVector 
Indices; for (unsigned i = 2; i < E->getNumSubExprs(); ++i) { - llvm::APSInt Idx = E->getShuffleMaskIdx(CGF.getContext(), i-2); + llvm::APSInt Idx = E->getShuffleMaskIdx(i - 2); // Check for -1 and output it as undef in the IR. if (Idx.isSigned() && Idx.isAllOnes()) Indices.push_back(-1); @@ -2491,18 +2491,22 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) { // If we are casting a fixed i8 vector to a scalable i1 predicate // vector, use a vector insert and bitcast the result. if (ScalableDstTy->getElementType()->isIntegerTy(1) && - ScalableDstTy->getElementCount().isKnownMultipleOf(8) && FixedSrcTy->getElementType()->isIntegerTy(8)) { ScalableDstTy = llvm::ScalableVectorType::get( FixedSrcTy->getElementType(), - ScalableDstTy->getElementCount().getKnownMinValue() / 8); + llvm::divideCeil( + ScalableDstTy->getElementCount().getKnownMinValue(), 8)); } if (FixedSrcTy->getElementType() == ScalableDstTy->getElementType()) { llvm::Value *PoisonVec = llvm::PoisonValue::get(ScalableDstTy); llvm::Value *Result = Builder.CreateInsertVector( ScalableDstTy, PoisonVec, Src, uint64_t(0), "cast.scalable"); + ScalableDstTy = cast( + llvm::VectorType::getWithSizeAndScalar(ScalableDstTy, DstTy)); + if (Result->getType() != ScalableDstTy) + Result = Builder.CreateBitCast(Result, ScalableDstTy); if (Result->getType() != DstTy) - Result = Builder.CreateBitCast(Result, DstTy); + Result = Builder.CreateExtractVector(DstTy, Result, uint64_t(0)); return Result; } } @@ -2516,8 +2520,17 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) { // If we are casting a scalable i1 predicate vector to a fixed i8 // vector, bitcast the source and use a vector extract. 
if (ScalableSrcTy->getElementType()->isIntegerTy(1) && - ScalableSrcTy->getElementCount().isKnownMultipleOf(8) && FixedDstTy->getElementType()->isIntegerTy(8)) { + if (!ScalableSrcTy->getElementCount().isKnownMultipleOf(8)) { + ScalableSrcTy = llvm::ScalableVectorType::get( + ScalableSrcTy->getElementType(), + llvm::alignTo<8>( + ScalableSrcTy->getElementCount().getKnownMinValue())); + llvm::Value *ZeroVec = llvm::Constant::getNullValue(ScalableSrcTy); + Src = Builder.CreateInsertVector(ScalableSrcTy, ZeroVec, Src, + uint64_t(0)); + } + ScalableSrcTy = llvm::ScalableVectorType::get( FixedDstTy->getElementType(), ScalableSrcTy->getElementCount().getKnownMinValue() / 8); diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 5d93df34c66b2..d4a0714da07b3 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -303,6 +303,21 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, HandleTy, CGM.getHLSLRuntime().getCreateHandleFromBindingIntrinsic(), ArrayRef{SpaceOp, RegisterOp, RangeOp, IndexOp, NonUniform}); } + case Builtin::BI__builtin_hlsl_resource_handlefromimplicitbinding: { + llvm::Type *HandleTy = CGM.getTypes().ConvertType(E->getType()); + Value *SpaceOp = EmitScalarExpr(E->getArg(1)); + Value *RangeOp = EmitScalarExpr(E->getArg(2)); + Value *IndexOp = EmitScalarExpr(E->getArg(3)); + Value *OrderID = EmitScalarExpr(E->getArg(4)); + // FIXME: NonUniformResourceIndex bit is not yet implemented + // (llvm/llvm-project#135452) + Value *NonUniform = + llvm::ConstantInt::get(llvm::Type::getInt1Ty(getLLVMContext()), false); + return Builder.CreateIntrinsic( + HandleTy, + CGM.getHLSLRuntime().getCreateHandleFromImplicitBindingIntrinsic(), + ArrayRef{OrderID, SpaceOp, RangeOp, IndexOp, NonUniform}); + } case Builtin::BI__builtin_hlsl_all: { Value *Op0 = EmitScalarExpr(E->getArg(0)); return Builder.CreateIntrinsic( diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp 
b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 0eb4bb062e02e..a708b3aea129d 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -23,6 +23,7 @@ #include "clang/AST/Type.h" #include "clang/Basic/TargetOptions.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LLVMContext.h" @@ -42,8 +43,8 @@ using namespace llvm; using llvm::hlsl::CBufferRowSizeInBytes; static void initializeBufferFromBinding(CodeGenModule &CGM, - llvm::GlobalVariable *GV, unsigned Slot, - unsigned Space); + llvm::GlobalVariable *GV, + HLSLResourceBindingAttr *RBA); namespace { @@ -68,6 +69,20 @@ void addDxilValVersion(StringRef ValVersionStr, llvm::Module &M) { DXILValMD->addOperand(Val); } +void addRootSignature(ArrayRef Elements, + llvm::Function *Fn, llvm::Module &M) { + auto &Ctx = M.getContext(); + + llvm::hlsl::rootsig::MetadataBuilder Builder(Ctx, Elements); + MDNode *RootSignature = Builder.BuildRootSignature(); + MDNode *FnPairing = + MDNode::get(Ctx, {ValueAsMetadata::get(Fn), RootSignature}); + + StringRef RootSignatureValKey = "dx.rootsignatures"; + auto *RootSignatureValMD = M.getOrInsertNamedMetadata(RootSignatureValKey); + RootSignatureValMD->addOperand(FnPairing); +} + } // namespace llvm::Type * @@ -257,13 +272,10 @@ void CGHLSLRuntime::addBuffer(const HLSLBufferDecl *BufDecl) { emitBufferGlobalsAndMetadata(BufDecl, BufGV); // Initialize cbuffer from binding (implicit or explicit) - const HLSLResourceBindingAttr *RBA = - BufDecl->getAttr(); - // FIXME: handle implicit binding if no binding attribute is found - // (llvm/llvm-project#110722) - if (RBA && RBA->hasRegisterSlot()) - initializeBufferFromBinding(CGM, BufGV, RBA->getSlotNumber(), - RBA->getSpaceNumber()); + HLSLResourceBindingAttr *RBA = BufDecl->getAttr(); + assert(RBA && + "cbuffer/tbuffer should always have resource binding attribute"); + initializeBufferFromBinding(CGM, BufGV, 
RBA); } llvm::TargetExtType * @@ -423,6 +435,13 @@ void CGHLSLRuntime::emitEntryFunction(const FunctionDecl *FD, // FIXME: Handle codegen for return type semantics. // See: https://github.com/llvm/llvm-project/issues/57875 B.CreateRetVoid(); + + // Add and identify root signature to function, if applicable + for (const Attr *Attr : FD->getAttrs()) { + if (const auto *RSAttr = dyn_cast(Attr)) + addRootSignature(RSAttr->getSignatureDecl()->getRootElements(), EntryFn, + M); + } } void CGHLSLRuntime::setHLSLFunctionAttributes(const FunctionDecl *FD, @@ -539,19 +558,29 @@ static void initializeBuffer(CodeGenModule &CGM, llvm::GlobalVariable *GV, } static void initializeBufferFromBinding(CodeGenModule &CGM, - llvm::GlobalVariable *GV, unsigned Slot, - unsigned Space) { + llvm::GlobalVariable *GV, + HLSLResourceBindingAttr *RBA) { llvm::Type *Int1Ty = llvm::Type::getInt1Ty(CGM.getLLVMContext()); - llvm::Value *Args[] = { - llvm::ConstantInt::get(CGM.IntTy, Space), /* reg_space */ - llvm::ConstantInt::get(CGM.IntTy, Slot), /* lower_bound */ - llvm::ConstantInt::get(CGM.IntTy, 1), /* range_size */ - llvm::ConstantInt::get(CGM.IntTy, 0), /* index */ - llvm::ConstantInt::get(Int1Ty, false) /* non-uniform */ - }; - initializeBuffer(CGM, GV, - CGM.getHLSLRuntime().getCreateHandleFromBindingIntrinsic(), - Args); + auto *NonUniform = llvm::ConstantInt::get(Int1Ty, false); + auto *Index = llvm::ConstantInt::get(CGM.IntTy, 0); + auto *RangeSize = llvm::ConstantInt::get(CGM.IntTy, 1); + auto *Space = + llvm::ConstantInt::get(CGM.IntTy, RBA ? 
RBA->getSpaceNumber() : 0); + + if (RBA->hasRegisterSlot()) { + auto *RegSlot = llvm::ConstantInt::get(CGM.IntTy, RBA->getSlotNumber()); + Intrinsic::ID Intr = + CGM.getHLSLRuntime().getCreateHandleFromBindingIntrinsic(); + initializeBuffer(CGM, GV, Intr, + {Space, RegSlot, RangeSize, Index, NonUniform}); + } else { + auto *OrderID = + llvm::ConstantInt::get(CGM.IntTy, RBA->getImplicitBindingOrderID()); + Intrinsic::ID Intr = + CGM.getHLSLRuntime().getCreateHandleFromImplicitBindingIntrinsic(); + initializeBuffer(CGM, GV, Intr, + {OrderID, Space, RangeSize, Index, NonUniform}); + } } llvm::Instruction *CGHLSLRuntime::getConvergenceToken(BasicBlock &BB) { diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 4d6db3f5d9f3e..e40864d8ed854 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -119,6 +119,8 @@ class CGHLSLRuntime { resource_getpointer) GENERATE_HLSL_INTRINSIC_FUNCTION(CreateHandleFromBinding, resource_handlefrombinding) + GENERATE_HLSL_INTRINSIC_FUNCTION(CreateHandleFromImplicitBinding, + resource_handlefromimplicitbinding) GENERATE_HLSL_INTRINSIC_FUNCTION(BufferUpdateCounter, resource_updatecounter) GENERATE_HLSL_INTRINSIC_FUNCTION(GroupMemoryBarrierWithGroupSync, group_memory_barrier_with_group_sync) diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index e03539d1ccdd0..3fc837c12a925 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -1454,10 +1454,10 @@ class CGObjCGNUstep2 : public CGObjCGNUstep { // character that is not a valid type encoding character (and, being // non-printable, never will be!) if (CGM.getTriple().isOSBinFormatELF()) - std::replace(MangledTypes.begin(), MangledTypes.end(), '@', '\1'); + llvm::replace(MangledTypes, '@', '\1'); // = in dll exported names causes lld to fail when linking on Windows. 
if (CGM.getTriple().isOSWindows()) - std::replace(MangledTypes.begin(), MangledTypes.end(), '=', '\2'); + llvm::replace(MangledTypes, '=', '\2'); return MangledTypes; } llvm::Constant *GetTypeString(llvm::StringRef TypeEncoding) { diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index d773cdd505ff4..ac40aab97820d 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -1115,9 +1115,15 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, Fn->removeFnAttr("zero-call-used-regs"); // Add vscale_range attribute if appropriate. + llvm::StringMap FeatureMap; + bool IsArmStreaming = false; + if (FD) { + getContext().getFunctionFeatureMap(FeatureMap, FD); + IsArmStreaming = IsArmStreamingFunction(FD, true); + } std::optional> VScaleRange = - getContext().getTargetInfo().getVScaleRange( - getLangOpts(), FD ? IsArmStreamingFunction(FD, true) : false); + getContext().getTargetInfo().getVScaleRange(getLangOpts(), IsArmStreaming, + &FeatureMap); if (VScaleRange) { CurFn->addFnAttr(llvm::Attribute::getWithVScaleRangeArgs( getLLVMContext(), VScaleRange->first, VScaleRange->second)); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index c0bc3825f0188..7104303cba50e 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -2816,6 +2816,11 @@ class CodeGenFunction : public CodeGenTypeCache { void emitStoresForInitAfterBZero(llvm::Constant *Init, Address Loc, bool isVolatile, bool IsAutoInit); + /// Returns debug info, with additional annotation if enabled by + /// CGM.getCodeGenOpts().SanitizeAnnotateDebugInfo[CheckKindOrdinal]. + llvm::DILocation * + SanitizerAnnotateDebugInfo(SanitizerKind::SanitizerOrdinal CheckKindOrdinal); + public: // Captures all the allocas created during the scope of its RAII object. 
struct AllocaTrackerRAII { @@ -3369,6 +3374,13 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitLoadOfCountedByField(const Expr *Base, const FieldDecl *FD, const FieldDecl *CountDecl); + // Emit bounds checking for flexible array and pointer members with the + // counted_by attribute. + void EmitCountedByBoundsChecking(const Expr *E, llvm::Value *Idx, + Address Addr, QualType IdxTy, + QualType ArrayTy, bool Accessed, + bool FlexibleArray); + llvm::Value *EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, bool isInc, bool isPre); ComplexPairTy EmitComplexPrePostIncDec(const UnaryOperator *E, LValue LV, @@ -5422,10 +5434,21 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::IntegerType *ResType, llvm::Value *EmittedE, bool IsDynamic); - llvm::Value *emitCountedByMemberSize(const Expr *E, llvm::Value *EmittedE, + llvm::Value *emitCountedBySize(const Expr *E, llvm::Value *EmittedE, + unsigned Type, llvm::IntegerType *ResType); + + llvm::Value *emitCountedByMemberSize(const MemberExpr *E, const Expr *Idx, + llvm::Value *EmittedE, + QualType CastedArrayElementTy, unsigned Type, llvm::IntegerType *ResType); + llvm::Value *emitCountedByPointerSize(const ImplicitCastExpr *E, + const Expr *Idx, llvm::Value *EmittedE, + QualType CastedArrayElementTy, + unsigned Type, + llvm::IntegerType *ResType); + void emitZeroOrPatternForAutoVarInit(QualType type, const VarDecl &D, Address Loc); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 50041f883cfe5..16e010adbeb5f 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1689,7 +1689,7 @@ static bool shouldAssumeDSOLocal(const CodeGenModule &CGM, const llvm::Triple &TT = CGM.getTriple(); const auto &CGOpts = CGM.getCodeGenOpts(); - if (TT.isWindowsGNUEnvironment()) { + if (TT.isOSCygMing()) { // In MinGW, variables without DLLImport can still be automatically // imported from a DLL by the linker; don't mark 
variables that // potentially could come from another DLL as DSO local. diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index d1b292f23c2d2..843733ba6604d 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -108,9 +108,6 @@ llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T) { MT->getNumRows() * MT->getNumColumns()); } - if (T->isMFloat8Type()) - return llvm::Type::getInt8Ty(getLLVMContext()); - llvm::Type *R = ConvertType(T); // Check for the boolean vector case. diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index 70b53be7e77a3..8826085c596da 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -3754,7 +3754,7 @@ static bool ShouldUseExternalRTTIDescriptor(CodeGenModule &CGM, bool IsDLLImport = RD->hasAttr(); // Don't import the RTTI but emit it locally. - if (CGM.getTriple().isWindowsGNUEnvironment()) + if (CGM.getTriple().isOSCygMing()) return false; if (CGM.getVTables().isVTableExternal(RD)) { @@ -4041,10 +4041,7 @@ static llvm::GlobalVariable::LinkageTypes getTypeInfoLinkage(CodeGenModule &CGM, return llvm::GlobalValue::ExternalLinkage; // MinGW always uses LinkOnceODRLinkage for type info. 
if (RD->isDynamicClass() && - !CGM.getContext() - .getTargetInfo() - .getTriple() - .isWindowsGNUEnvironment()) + !CGM.getContext().getTargetInfo().getTriple().isOSCygMing()) return CGM.getVTableLinkage(RD); } diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index d37e68508373c..1cf8f6819b75a 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -2624,22 +2624,26 @@ static bool HasExtraNeonArgument(unsigned BuiltinID) { case NEON::BI__builtin_neon_vget_lane_bf16: case NEON::BI__builtin_neon_vget_lane_i32: case NEON::BI__builtin_neon_vget_lane_i64: + case NEON::BI__builtin_neon_vget_lane_mf8: case NEON::BI__builtin_neon_vget_lane_f32: case NEON::BI__builtin_neon_vgetq_lane_i8: case NEON::BI__builtin_neon_vgetq_lane_i16: case NEON::BI__builtin_neon_vgetq_lane_bf16: case NEON::BI__builtin_neon_vgetq_lane_i32: case NEON::BI__builtin_neon_vgetq_lane_i64: + case NEON::BI__builtin_neon_vgetq_lane_mf8: case NEON::BI__builtin_neon_vgetq_lane_f32: case NEON::BI__builtin_neon_vduph_lane_bf16: case NEON::BI__builtin_neon_vduph_laneq_bf16: case NEON::BI__builtin_neon_vset_lane_i8: + case NEON::BI__builtin_neon_vset_lane_mf8: case NEON::BI__builtin_neon_vset_lane_i16: case NEON::BI__builtin_neon_vset_lane_bf16: case NEON::BI__builtin_neon_vset_lane_i32: case NEON::BI__builtin_neon_vset_lane_i64: case NEON::BI__builtin_neon_vset_lane_f32: case NEON::BI__builtin_neon_vsetq_lane_i8: + case NEON::BI__builtin_neon_vsetq_lane_mf8: case NEON::BI__builtin_neon_vsetq_lane_i16: case NEON::BI__builtin_neon_vsetq_lane_bf16: case NEON::BI__builtin_neon_vsetq_lane_i32: @@ -4179,9 +4183,17 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E, unsigned IntrinsicID, bool IsZExtReturn) { QualType LangPTy = E->getArg(1)->getType(); - llvm::Type *MemEltTy = CGM.getTypes().ConvertTypeForMem( + llvm::Type *MemEltTy = CGM.getTypes().ConvertType( LangPTy->castAs()->getPointeeType()); + // Mfloat8 
 types is stored as a vector, so extra work + to extract scalar element type is necessary. + if (MemEltTy->isVectorTy()) { + assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) && + "Only <1 x i8> expected"); + MemEltTy = cast(MemEltTy)->getElementType(); + } + // The vector type that is returned may be different from the // eventual type loaded from memory. auto VectorTy = cast(ReturnTy); @@ -4226,9 +4238,17 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E, SmallVectorImpl &Ops, unsigned IntrinsicID) { QualType LangPTy = E->getArg(1)->getType(); - llvm::Type *MemEltTy = CGM.getTypes().ConvertTypeForMem( + llvm::Type *MemEltTy = CGM.getTypes().ConvertType( LangPTy->castAs()->getPointeeType()); + // Mfloat8 type is stored as a vector, so extra work + to extract scalar element type is necessary. + if (MemEltTy->isVectorTy()) { + assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) && + "Only <1 x i8> expected"); + MemEltTy = cast(MemEltTy)->getElementType(); + } + // The vector type that is stored may be different from the // eventual type stored to memory. auto VectorTy = cast(Ops.back()->getType()); @@ -6162,6 +6182,13 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 1)); Ops.push_back(EmitScalarExpr(E->getArg(2))); return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); + case NEON::BI__builtin_neon_vset_lane_mf8: + case NEON::BI__builtin_neon_vsetq_lane_mf8: + Ops.push_back(EmitScalarExpr(E->getArg(2))); + // The input vector type needs a cast to scalar type. + Ops[0] = + Builder.CreateBitCast(Ops[0], llvm::Type::getInt8Ty(getLLVMContext())); + return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); case NEON::BI__builtin_neon_vsetq_lane_f64: // The vector type needs a cast for the v2f64 variant.
Ops[1] = @@ -6181,6 +6208,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 16)); return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), "vgetq_lane"); + case NEON::BI__builtin_neon_vget_lane_mf8: + case NEON::BI__builtin_neon_vdupb_lane_mf8: + case NEON::BI__builtin_neon_vgetq_lane_mf8: + case NEON::BI__builtin_neon_vdupb_laneq_mf8: + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vget_lane"); case NEON::BI__builtin_neon_vget_lane_i16: case NEON::BI__builtin_neon_vduph_lane_i16: Ops[0] = @@ -7630,6 +7663,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd"); } + case NEON::BI__builtin_neon_vluti2_laneq_mf8: case NEON::BI__builtin_neon_vluti2_laneq_bf16: case NEON::BI__builtin_neon_vluti2_laneq_f16: case NEON::BI__builtin_neon_vluti2_laneq_p16: @@ -7645,6 +7679,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, /*isQuad*/ false)); return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq"); } + case NEON::BI__builtin_neon_vluti2q_laneq_mf8: case NEON::BI__builtin_neon_vluti2q_laneq_bf16: case NEON::BI__builtin_neon_vluti2q_laneq_f16: case NEON::BI__builtin_neon_vluti2q_laneq_p16: @@ -7660,6 +7695,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, /*isQuad*/ true)); return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq"); } + case NEON::BI__builtin_neon_vluti2_lane_mf8: case NEON::BI__builtin_neon_vluti2_lane_bf16: case NEON::BI__builtin_neon_vluti2_lane_f16: case NEON::BI__builtin_neon_vluti2_lane_p16: @@ -7675,6 +7711,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, /*isQuad*/ false)); return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane"); } + case NEON::BI__builtin_neon_vluti2q_lane_mf8: case NEON::BI__builtin_neon_vluti2q_lane_bf16: case 
NEON::BI__builtin_neon_vluti2q_lane_f16: case NEON::BI__builtin_neon_vluti2q_lane_p16: @@ -7690,12 +7727,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, /*isQuad*/ true)); return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane"); } + case NEON::BI__builtin_neon_vluti4q_lane_mf8: case NEON::BI__builtin_neon_vluti4q_lane_p8: case NEON::BI__builtin_neon_vluti4q_lane_s8: case NEON::BI__builtin_neon_vluti4q_lane_u8: { Int = Intrinsic::aarch64_neon_vluti4q_lane; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane"); } + case NEON::BI__builtin_neon_vluti4q_laneq_mf8: case NEON::BI__builtin_neon_vluti4q_laneq_p8: case NEON::BI__builtin_neon_vluti4q_laneq_s8: case NEON::BI__builtin_neon_vluti4q_laneq_u8: { diff --git a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp index 3335239b0b6c2..0cd4f3c935e92 100644 --- a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp @@ -357,6 +357,12 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID, return Store; } + // Zihintpause + case RISCV::BI__builtin_riscv_pause: { + llvm::Function *Fn = CGM.getIntrinsic(llvm::Intrinsic::riscv_pause); + return Builder.CreateCall(Fn, {}); + } + // XCValu case RISCV::BI__builtin_riscv_cv_alu_addN: ID = Intrinsic::riscv_cv_alu_addN; diff --git a/clang/lib/CodeGen/TargetBuiltins/WebAssembly.cpp b/clang/lib/CodeGen/TargetBuiltins/WebAssembly.cpp index 698f43215a1be..b7fd70e855d40 100644 --- a/clang/lib/CodeGen/TargetBuiltins/WebAssembly.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/WebAssembly.cpp @@ -209,6 +209,11 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_ref_null_extern); return Builder.CreateCall(Callee); } + case WebAssembly::BI__builtin_wasm_ref_is_null_extern: { + Value *Src = EmitScalarExpr(E->getArg(0)); + Function *Callee = 
CGM.getIntrinsic(Intrinsic::wasm_ref_is_null_extern); + return Builder.CreateCall(Callee, {Src}); + } case WebAssembly::BI__builtin_wasm_ref_null_func: { Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_ref_null_func); return Builder.CreateCall(Callee); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 192f97996f69e..a5a0393ad7912 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -6208,7 +6208,7 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, if (is_style_windows(llvm::sys::path::Style::native)) { // BoundArch may contains ':', which is invalid in file names on Windows, // therefore replace it with '%'. - std::replace(BoundArch.begin(), BoundArch.end(), ':', '@'); + llvm::replace(BoundArch, ':', '@'); } llvm::PrettyStackTraceString CrashInfo("Computing output path"); diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp index e7a737796925e..3dfd51ee2365a 100644 --- a/clang/lib/Driver/OffloadBundler.cpp +++ b/clang/lib/Driver/OffloadBundler.cpp @@ -1906,8 +1906,7 @@ Error OffloadBundler::UnbundleArchive() { .str(); // Replace ':' in optional target feature list with '_' to ensure // cross-platform validity. - std::replace(OutputBundleName.begin(), OutputBundleName.end(), ':', - '_'); + llvm::replace(OutputBundleName, ':', '_'); std::unique_ptr MemBuf = MemoryBuffer::getMemBufferCopy( DataStream.str(), OutputBundleName); diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 85c4a754f93c5..eb4718909c951 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -160,6 +160,10 @@ static std::string describeSanitizeArg(const llvm::opt::Arg *A, /// Sanitizers set. static std::string toString(const clang::SanitizerSet &Sanitizers); +/// Produce a string containing comma-separated names of sanitizers and +/// sanitizer groups in \p Sanitizers set. 
+static std::string toStringWithGroups(const clang::SanitizerSet &Sanitizers); + /// Return true if an execute-only target disallows data access to code /// sections. static bool isExecuteOnlyTarget(const llvm::Triple &Triple, @@ -289,7 +293,7 @@ parseSanitizeArgs(const Driver &D, const llvm::opt::ArgList &Args, SanitizerSet SetToDiagnose; SetToDiagnose.Mask |= KindsToDiagnose; D.Diag(diag::err_drv_unsupported_option_argument) - << Arg->getSpelling() << toString(SetToDiagnose); + << Arg->getSpelling() << toStringWithGroups(SetToDiagnose); DiagnosedAlwaysOutViolations |= KindsToDiagnose; } } @@ -305,7 +309,7 @@ parseSanitizeArgs(const Driver &D, const llvm::opt::ArgList &Args, SanitizerSet SetToDiagnose; SetToDiagnose.Mask |= KindsToDiagnose; D.Diag(diag::err_drv_unsupported_option_argument) - << Arg->getSpelling() << toString(SetToDiagnose); + << Arg->getSpelling() << toStringWithGroups(SetToDiagnose); DiagnosedAlwaysInViolations |= KindsToDiagnose; } } @@ -1200,6 +1204,19 @@ static std::string toString(const clang::SanitizerMaskCutoffs &Cutoffs) { return llvm::join(Res, ","); } +static std::string toStringWithGroups(const clang::SanitizerSet &Sanitizers) { + std::string Res; +#define SANITIZER(NAME, ID) \ + if (Sanitizers.has(SanitizerKind::ID)) { \ + if (!Res.empty()) \ + Res += ","; \ + Res += NAME; \ + } +#define SANITIZER_GROUP(NAME, ID, ALIAS) SANITIZER(NAME, ID##Group) +#include "clang/Basic/Sanitizers.def" + return Res; +} + static void addSpecialCaseListOpt(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, const char *SCLOptFlag, diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 35ca019795ddc..998106cf52d66 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -221,7 +221,7 @@ RocmInstallationDetector::getInstallationPathCandidates() { std::string VerStr = DirName.drop_front(strlen("rocm-")).str(); // The ROCm directory name follows the format of 
// rocm-{major}.{minor}.{subMinor}[-{build}] - std::replace(VerStr.begin(), VerStr.end(), '-', '.'); + llvm::replace(VerStr, '-', '.'); V.tryParse(VerStr); return V; }; diff --git a/clang/lib/Driver/ToolChains/Haiku.cpp b/clang/lib/Driver/ToolChains/Haiku.cpp index 0e55a71280aff..a6f9582a66662 100644 --- a/clang/lib/Driver/ToolChains/Haiku.cpp +++ b/clang/lib/Driver/ToolChains/Haiku.cpp @@ -273,6 +273,8 @@ void Haiku::AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, "/boot/system/develop/headers/gnu")); addSystemInclude(DriverArgs, CC1Args, concat(D.SysRoot, "/boot/system/develop/headers/posix")); + addSystemInclude(DriverArgs, CC1Args, concat(D.SysRoot, + "/boot/system/develop/headers/gcc/include")); addSystemInclude(DriverArgs, CC1Args, concat(D.SysRoot, "/boot/system/develop/headers")); } diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 20b5352b83a9e..7e32d2084d5ab 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3091,9 +3091,8 @@ class ObjCHeaderStyleGuesser : public TokenAnalyzer { FormatTok->isOneOf(tok::numeric_constant, tok::l_square, tok::l_brace))) || (FormatTok->Tok.isAnyIdentifier() && - std::binary_search(std::begin(FoundationIdentifiers), - std::end(FoundationIdentifiers), - FormatTok->TokenText)) || + llvm::binary_search(FoundationIdentifiers, + FormatTok->TokenText)) || FormatTok->is(TT_ObjCStringLiteral) || FormatTok->isOneOf(Keywords.kw_NS_CLOSED_ENUM, Keywords.kw_NS_ENUM, Keywords.kw_NS_ERROR_ENUM, diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index 1d49d787f9cc9..e0867f6dcce06 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -45,8 +45,7 @@ bool FormatToken::isTypeName(const LangOptions &LangOpts) const { if (is(TT_TypeName) || Tok.isSimpleTypeSpecifier(LangOpts)) return true; return (LangOpts.CXXOperatorNames || LangOpts.C11) && is(tok::identifier) && - std::binary_search(CppNonKeywordTypes.begin(), - 
CppNonKeywordTypes.end(), TokenText); + llvm::binary_search(CppNonKeywordTypes, TokenText); } bool FormatToken::isTypeOrIdentifier(const LangOptions &LangOpts) const { diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 394512978b521..fd48e425a5c21 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3112,9 +3112,11 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args, if (const Arg *A = Args.getLastArg(OPT_code_completion_at)) { Opts.CodeCompletionAt = ParsedSourceLocation::FromString(A->getValue()); - if (Opts.CodeCompletionAt.FileName.empty()) + if (Opts.CodeCompletionAt.FileName.empty()) { Diags.Report(diag::err_drv_invalid_value) - << A->getAsString(Args) << A->getValue(); + << A->getAsString(Args) << A->getValue(); + Diags.Report(diag::note_command_line_code_loc_requirement); + } } Opts.Plugins = Args.getAllArgValues(OPT_load); diff --git a/clang/lib/Frontend/DiagnosticRenderer.cpp b/clang/lib/Frontend/DiagnosticRenderer.cpp index b11806637efda..3b120abbc3a7a 100644 --- a/clang/lib/Frontend/DiagnosticRenderer.cpp +++ b/clang/lib/Frontend/DiagnosticRenderer.cpp @@ -272,8 +272,7 @@ retrieveMacroLocation(SourceLocation Loc, FileID MacroFileID, if (SM->isMacroArgExpansion(Loc)) { // Only look at the immediate spelling location of this macro argument if // the other location in the source range is also present in that expansion. 
- if (std::binary_search(CommonArgExpansions.begin(), - CommonArgExpansions.end(), MacroFileID)) + if (llvm::binary_search(CommonArgExpansions, MacroFileID)) MacroRange = CharSourceRange(SM->getImmediateSpellingLoc(Loc), IsTokenRange); MacroArgRange = SM->getImmediateExpansionRange(Loc); diff --git a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp index 6de19d689988e..89fda3e839cb9 100644 --- a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp +++ b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp @@ -482,7 +482,7 @@ static bool ParseDirective(StringRef S, ExpectedData *ED, SourceManager &SM, // What's left in DToken is the actual prefix. That might not be a -verify // prefix even if there is only one -verify prefix (for example, the full // DToken is foo-bar-warning, but foo is the only -verify prefix). - if (!std::binary_search(Prefixes.begin(), Prefixes.end(), DToken)) + if (!llvm::binary_search(Prefixes, DToken)) continue; if (NoDiag) { diff --git a/clang/lib/Headers/__clang_hip_cmath.h b/clang/lib/Headers/__clang_hip_cmath.h index 7d982ad9af7ee..8dbde4291fff5 100644 --- a/clang/lib/Headers/__clang_hip_cmath.h +++ b/clang/lib/Headers/__clang_hip_cmath.h @@ -448,7 +448,7 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {}; } // namespace __hip // __HIP_OVERLOAD1 is used to resolve function calls with integer argument to -// avoid compilation error due to ambibuity. e.g. floor(5) is resolved with +// avoid compilation error due to ambiguity. e.g. floor(5) is resolved with // floor(double). #define __HIP_OVERLOAD1(__retty, __fn) \ template \ @@ -459,17 +459,18 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {}; } // __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double -// or integer argument to avoid compilation error due to ambibuity. e.g. +// or integer argument to avoid compilation error due to ambiguity. e.g. // max(5.0f, 6.0) is resolved with max(double, double). 
#if __cplusplus >= 201103L #define __HIP_OVERLOAD2(__retty, __fn) \ template \ - __DEVICE__ __CONSTEXPR__ typename __hip_enable_if< \ - __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, \ - typename __hip::__promote<__T1, __T2>::type>::type \ - __fn(__T1 __x, __T2 __y) { \ - typedef typename __hip::__promote<__T1, __T2>::type __result_type; \ - return __fn((__result_type)__x, (__result_type)__y); \ + __DEVICE__ __CONSTEXPR__ \ + typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && \ + __hip::is_arithmetic<__T2>::value, \ + __retty>::type \ + __fn(__T1 __x, __T2 __y) { \ + typedef typename __hip::__promote<__T1, __T2>::type __arg_type; \ + return __fn((__arg_type)__x, (__arg_type)__y); \ } #else #define __HIP_OVERLOAD2(__retty, __fn) \ diff --git a/clang/lib/Headers/opencl-c-base.h b/clang/lib/Headers/opencl-c-base.h index b6bcf32c09c08..2b7f5043e09e4 100644 --- a/clang/lib/Headers/opencl-c-base.h +++ b/clang/lib/Headers/opencl-c-base.h @@ -47,6 +47,7 @@ #define __opencl_c_ext_fp32_local_atomic_min_max 1 #define __opencl_c_ext_image_raw10_raw12 1 #define __opencl_c_ext_image_unorm_int_2_101010 1 +#define __opencl_c_ext_image_unsigned_10x6_12x4_14x2 1 #define cl_khr_kernel_clock 1 #define __opencl_c_kernel_clock_scope_device 1 #define __opencl_c_kernel_clock_scope_work_group 1 @@ -490,6 +491,14 @@ typedef enum memory_order #ifdef __opencl_c_ext_image_unorm_int_2_101010 #define CLK_UNORM_INT_2_101010_EXT 0x10E5 #endif // __opencl_c_ext_image_unorm_int_2_101010 +#ifdef __opencl_c_ext_image_unsigned_10x6_12x4_14x2 +#define CLK_UNSIGNED_INT10X6_EXT 0x10E6 +#define CLK_UNSIGNED_INT12X4_EXT 0x10E7 +#define CLK_UNSIGNED_INT14X2_EXT 0x10E8 +#define CLK_UNORM_10X6_EXT 0x10E1 +#define CLK_UNORM_12X4_EXT 0x10E9 +#define CLK_UNORM_14X2_EXT 0x10EA +#endif // __opencl_c_ext_image_unsigned_10x6_12x4_14x2 // Channel order, numbering must be aligned with cl_channel_order in cl.h // diff --git a/clang/lib/Lex/PPDirectives.cpp 
b/clang/lib/Lex/PPDirectives.cpp index c2bab9118234c..b2a8459d6b9cc 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -148,8 +148,7 @@ static bool isFeatureTestMacro(StringRef MacroName) { "__STDCPP_WANT_MATH_SPEC_FUNCS__", "__STDC_FORMAT_MACROS", }; - return std::binary_search(std::begin(ReservedMacro), std::end(ReservedMacro), - MacroName); + return llvm::binary_search(ReservedMacro, MacroName); } static bool isLanguageDefinedBuiltin(const SourceManager &SourceMgr, diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 9ea7b95622c76..4c2dbbe881b48 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -759,6 +759,8 @@ void Preprocessor::HandlePoisonedIdentifier(Token & Identifier) { void Preprocessor::updateOutOfDateIdentifier(const IdentifierInfo &II) const { assert(II.isOutOfDate() && "not out of date"); + assert(getExternalSource() && + "getExternalSource() should not return nullptr"); getExternalSource()->updateOutOfDateIdentifier(II); } diff --git a/clang/lib/Parse/ParseCXXInlineMethods.cpp b/clang/lib/Parse/ParseCXXInlineMethods.cpp index 342d46770c656..e215c64cccd11 100644 --- a/clang/lib/Parse/ParseCXXInlineMethods.cpp +++ b/clang/lib/Parse/ParseCXXInlineMethods.cpp @@ -21,7 +21,6 @@ using namespace clang; -/// Parse the optional ("message") part of a deleted-function-body. StringLiteral *Parser::ParseCXXDeletedFunctionMessage() { if (!Tok.is(tok::l_paren)) return nullptr; @@ -48,9 +47,6 @@ StringLiteral *Parser::ParseCXXDeletedFunctionMessage() { return Message; } -/// If we've encountered '= delete' in a context where it is ill-formed, such -/// as in the declaration of a non-function, also skip the ("message") part if -/// it is present to avoid issuing further diagnostics. 
void Parser::SkipDeletedFunctionBody() { if (!Tok.is(tok::l_paren)) return; @@ -64,9 +60,6 @@ void Parser::SkipDeletedFunctionBody() { BT.consumeClose(); } -/// ParseCXXInlineMethodDef - We parsed and verified that the specified -/// Declarator is a well formed C++ inline method definition. Now lex its body -/// and store its tokens for parsing after the C++ class is complete. NamedDecl *Parser::ParseCXXInlineMethodDef( AccessSpecifier AS, const ParsedAttributesView &AccessAttrs, ParsingDeclarator &D, const ParsedTemplateInfo &TemplateInfo, @@ -238,10 +231,6 @@ NamedDecl *Parser::ParseCXXInlineMethodDef( return FnD; } -/// ParseCXXNonStaticMemberInitializer - We parsed and verified that the -/// specified Declarator is a well formed C++ non-static data member -/// declaration. Now lex its initializer and store its tokens for parsing -/// after the class is complete. void Parser::ParseCXXNonStaticMemberInitializer(Decl *VarD) { assert(Tok.isOneOf(tok::l_brace, tok::equal) && "Current token not a '{' or '='!"); @@ -333,8 +322,6 @@ void Parser::LateParsedPragma::ParseLexedPragmas() { Self->ParseLexedPragma(*this); } -/// Utility to re-enter a possibly-templated scope while parsing its -/// late-parsed components. struct Parser::ReenterTemplateScopeRAII { Parser &P; MultiParseScope Scopes; @@ -349,7 +336,6 @@ struct Parser::ReenterTemplateScopeRAII { } }; -/// Utility to re-enter a class scope while parsing its late-parsed components. struct Parser::ReenterClassScopeRAII : ReenterTemplateScopeRAII { ParsingClass &Class; @@ -375,10 +361,6 @@ struct Parser::ReenterClassScopeRAII : ReenterTemplateScopeRAII { } }; -/// ParseLexedMethodDeclarations - We finished parsing the member -/// specification of a top (non-nested) C++ class. Now go over the -/// stack of method declarations with some parts for which parsing was -/// delayed (such as default arguments) and parse them. 
void Parser::ParseLexedMethodDeclarations(ParsingClass &Class) { ReenterClassScopeRAII InClassScope(*this, Class); @@ -583,9 +565,6 @@ void Parser::ParseLexedMethodDeclaration(LateParsedMethodDeclaration &LM) { Actions.ActOnFinishDelayedCXXMethodDeclaration(getCurScope(), LM.Method); } -/// ParseLexedMethodDefs - We finished parsing the member specification of a top -/// (non-nested) C++ class. Now go over the stack of lexed methods that were -/// collected during its parsing and parse them all. void Parser::ParseLexedMethodDefs(ParsingClass &Class) { ReenterClassScopeRAII InClassScope(*this, Class); @@ -664,9 +643,6 @@ void Parser::ParseLexedMethodDef(LexedMethod &LM) { ParseFunctionStatementBody(LM.D, FnScope); } -/// ParseLexedMemberInitializers - We finished parsing the member specification -/// of a top (non-nested) C++ class. Now go over the stack of lexed data member -/// initializers that were collected during its parsing and parse them all. void Parser::ParseLexedMemberInitializers(ParsingClass &Class) { ReenterClassScopeRAII InClassScope(*this, Class); @@ -734,8 +710,6 @@ void Parser::ParseLexedMemberInitializer(LateParsedMemberInitializer &MI) { ConsumeAnyToken(); } -/// Wrapper class which calls ParseLexedAttribute, after setting up the -/// scope appropriately. void Parser::ParseLexedAttributes(ParsingClass &Class) { ReenterClassScopeRAII InClassScope(*this, Class); @@ -743,7 +717,6 @@ void Parser::ParseLexedAttributes(ParsingClass &Class) { LateD->ParseLexedAttributes(); } -/// Parse all attributes in LAs, and attach them to Decl D. void Parser::ParseLexedAttributeList(LateParsedAttrList &LAs, Decl *D, bool EnterScope, bool OnDefinition) { assert(LAs.parseSoon() && @@ -757,11 +730,6 @@ void Parser::ParseLexedAttributeList(LateParsedAttrList &LAs, Decl *D, LAs.clear(); } -/// Finish parsing an attribute for which parsing was delayed. -/// This will be called at the end of parsing a class declaration -/// for each LateParsedAttribute. 
We consume the saved tokens and -/// create an attribute with the arguments filled in. We add this -/// to the Attribute list for the decl. void Parser::ParseLexedAttribute(LateParsedAttribute &LA, bool EnterScope, bool OnDefinition) { // Create a fake EOF so that attribute parsing won't go off the end of the @@ -865,12 +833,6 @@ void Parser::ParseLexedPragma(LateParsedPragma &LP) { } } -/// ConsumeAndStoreUntil - Consume and store the token at the passed token -/// container until the token 'T' is reached (which gets -/// consumed/stored too, if ConsumeFinalToken). -/// If StopAtSemi is true, then we will stop early at a ';' character. -/// Returns true if token 'T1' or 'T2' was found. -/// NOTE: This is a specialized version of Parser::SkipUntil. bool Parser::ConsumeAndStoreUntil(tok::TokenKind T1, tok::TokenKind T2, CachedTokens &Toks, bool StopAtSemi, bool ConsumeFinalToken) { @@ -953,12 +915,6 @@ bool Parser::ConsumeAndStoreUntil(tok::TokenKind T1, tok::TokenKind T2, } } -/// Consume tokens and store them in the passed token container until -/// we've passed the try keyword and constructor initializers and have consumed -/// the opening brace of the function body. The opening brace will be consumed -/// if and only if there was no error. -/// -/// \return True on error. bool Parser::ConsumeAndStoreFunctionPrologue(CachedTokens &Toks) { if (Tok.is(tok::kw_try)) { Toks.push_back(Tok); @@ -1170,8 +1126,6 @@ bool Parser::ConsumeAndStoreFunctionPrologue(CachedTokens &Toks) { } } -/// Consume and store tokens from the '?' to the ':' in a conditional -/// expression. bool Parser::ConsumeAndStoreConditional(CachedTokens &Toks) { // Consume '?'. 
assert(Tok.is(tok::question)); @@ -1195,12 +1149,6 @@ bool Parser::ConsumeAndStoreConditional(CachedTokens &Toks) { return true; } -/// ConsumeAndStoreInitializer - Consume and store the token at the passed token -/// container until the end of the current initializer expression (either a -/// default argument or an in-class initializer for a non-static data member). -/// -/// Returns \c true if we reached the end of something initializer-shaped, -/// \c false if we bailed out. bool Parser::ConsumeAndStoreInitializer(CachedTokens &Toks, CachedInitKind CIK) { // We always want this function to consume at least one token if not at EOF. diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 4fe3565687905..7a87cd2e340cc 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -44,11 +44,6 @@ using namespace clang; // C99 6.7: Declarations. //===----------------------------------------------------------------------===// -/// ParseTypeName -/// type-name: [C99 6.7.6] -/// specifier-qualifier-list abstract-declarator[opt] -/// -/// Called type-id in C++. TypeResult Parser::ParseTypeName(SourceRange *Range, DeclaratorContext Context, AccessSpecifier AS, Decl **OwnedType, ParsedAttributes *Attrs) { @@ -148,20 +143,6 @@ void Parser::ParseAttributes(unsigned WhichAttrKinds, ParsedAttributes &Attrs, } while (MoreToParse); } -/// ParseSingleGNUAttribute - Parse a single GNU attribute. 
-/// -/// [GNU] attrib: -/// empty -/// attrib-name -/// attrib-name '(' identifier ')' -/// attrib-name '(' identifier ',' nonempty-expr-list ')' -/// attrib-name '(' argument-expression-list [C99 6.5.2] ')' -/// -/// [GNU] attrib-name: -/// identifier -/// typespec -/// typequal -/// storageclass bool Parser::ParseSingleGNUAttribute(ParsedAttributes &Attrs, SourceLocation &EndLoc, LateParsedAttrList *LateAttrs, @@ -228,47 +209,6 @@ bool Parser::ParseSingleGNUAttribute(ParsedAttributes &Attrs, return false; } -/// ParseGNUAttributes - Parse a non-empty attributes list. -/// -/// [GNU] attributes: -/// attribute -/// attributes attribute -/// -/// [GNU] attribute: -/// '__attribute__' '(' '(' attribute-list ')' ')' -/// -/// [GNU] attribute-list: -/// attrib -/// attribute_list ',' attrib -/// -/// [GNU] attrib: -/// empty -/// attrib-name -/// attrib-name '(' identifier ')' -/// attrib-name '(' identifier ',' nonempty-expr-list ')' -/// attrib-name '(' argument-expression-list [C99 6.5.2] ')' -/// -/// [GNU] attrib-name: -/// identifier -/// typespec -/// typequal -/// storageclass -/// -/// Whether an attribute takes an 'identifier' is determined by the -/// attrib-name. GCC's behavior here is not worth imitating: -/// -/// * In C mode, if the attribute argument list starts with an identifier -/// followed by a ',' or an ')', and the identifier doesn't resolve to -/// a type, it is parsed as an identifier. If the attribute actually -/// wanted an expression, it's out of luck (but it turns out that no -/// attributes work that way, because C constant expressions are very -/// limited). -/// * In C++ mode, if the attribute argument list starts with an identifier, -/// and the attribute *wants* an identifier, it is parsed as an identifier. -/// At block scope, any additional tokens between the identifier and the -/// ',' or ')' are ignored, otherwise they produce a parse error. -/// -/// We follow the C++ model, but don't allow junk after the identifier. 
void Parser::ParseGNUAttributes(ParsedAttributes &Attrs, LateParsedAttrList *LateAttrs, Declarator *D) { assert(Tok.is(tok::kw___attribute) && "Not a GNU attribute list!"); @@ -697,8 +637,6 @@ unsigned Parser::ParseAttributeArgsCommon( return static_cast(ArgExprs.size() + !TheParsedType.get().isNull()); } -/// Parse the arguments to a parameterized GNU attribute or -/// a C++11 attribute in "gnu" namespace. void Parser::ParseGNUAttributeArgs( IdentifierInfo *AttrName, SourceLocation AttrNameLoc, ParsedAttributes &Attrs, SourceLocation *EndLoc, IdentifierInfo *ScopeName, @@ -949,12 +887,6 @@ bool Parser::ParseMicrosoftDeclSpecArgs(IdentifierInfo *AttrName, return true; } -/// [MS] decl-specifier: -/// __declspec ( extended-decl-modifier-seq ) -/// -/// [MS] extended-decl-modifier-seq: -/// extended-decl-modifier[opt] -/// extended-decl-modifier extended-decl-modifier-seq void Parser::ParseMicrosoftDeclSpecs(ParsedAttributes &Attrs) { assert(getLangOpts().DeclSpecKeyword && "__declspec keyword is not enabled"); assert(Tok.is(tok::kw___declspec) && "Not a declspec!"); @@ -1186,14 +1118,6 @@ static bool VersionNumberSeparator(const char Separator) { return (Separator == '.' || Separator == '_'); } -/// Parse a version number. -/// -/// version: -/// simple-integer -/// simple-integer '.' simple-integer -/// simple-integer '_' simple-integer -/// simple-integer '.' simple-integer '.' simple-integer -/// simple-integer '_' simple-integer '_' simple-integer VersionTuple Parser::ParseVersionTuple(SourceRange &Range) { Range = SourceRange(Tok.getLocation(), Tok.getEndLoc()); @@ -1305,31 +1229,6 @@ VersionTuple Parser::ParseVersionTuple(SourceRange &Range) { return VersionTuple(Major, Minor, Subminor); } -/// Parse the contents of the "availability" attribute. 
-/// -/// availability-attribute: -/// 'availability' '(' platform ',' opt-strict version-arg-list, -/// opt-replacement, opt-message')' -/// -/// platform: -/// identifier -/// -/// opt-strict: -/// 'strict' ',' -/// -/// version-arg-list: -/// version-arg -/// version-arg ',' version-arg-list -/// -/// version-arg: -/// 'introduced' '=' version -/// 'deprecated' '=' version -/// 'obsoleted' = version -/// 'unavailable' -/// opt-replacement: -/// 'replacement' '=' -/// opt-message: -/// 'message' '=' void Parser::ParseAvailabilityAttribute( IdentifierInfo &Availability, SourceLocation AvailabilityLoc, ParsedAttributes &attrs, SourceLocation *endLoc, IdentifierInfo *ScopeName, @@ -1555,20 +1454,6 @@ void Parser::ParseAvailabilityAttribute( StrictLoc, ReplacementExpr.get(), EnvironmentLoc); } -/// Parse the contents of the "external_source_symbol" attribute. -/// -/// external-source-symbol-attribute: -/// 'external_source_symbol' '(' keyword-arg-list ')' -/// -/// keyword-arg-list: -/// keyword-arg -/// keyword-arg ',' keyword-arg-list -/// -/// keyword-arg: -/// 'language' '=' -/// 'defined_in' '=' -/// 'USR' '=' -/// 'generated_declaration' void Parser::ParseExternalSourceSymbolAttribute( IdentifierInfo &ExternalSourceSymbol, SourceLocation Loc, ParsedAttributes &Attrs, SourceLocation *EndLoc, IdentifierInfo *ScopeName, @@ -1687,17 +1572,6 @@ void Parser::ParseExternalSourceSymbolAttribute( ScopeName, ScopeLoc, Args, std::size(Args), Form); } -/// Parse the contents of the "objc_bridge_related" attribute. 
-/// objc_bridge_related '(' related_class ',' opt-class_method ',' opt-instance_method ')' -/// related_class: -/// Identifier -/// -/// opt-class_method: -/// Identifier: | -/// -/// opt-instance_method: -/// Identifier | -/// void Parser::ParseObjCBridgeRelatedAttribute( IdentifierInfo &ObjCBridgeRelated, SourceLocation ObjCBridgeRelatedLoc, ParsedAttributes &Attrs, SourceLocation *EndLoc, IdentifierInfo *ScopeName, @@ -1867,14 +1741,6 @@ void Parser::ParseTypeTagForDatatypeAttribute( *EndLoc = T.getCloseLocation(); } -/// DiagnoseProhibitedCXX11Attribute - We have found the opening square brackets -/// of a C++11 attribute-specifier in a location where an attribute is not -/// permitted. By C++11 [dcl.attr.grammar]p6, this is ill-formed. Diagnose this -/// situation. -/// -/// \return \c true if we skipped an attribute-like chunk of tokens, \c false if -/// this doesn't appear to actually be an attribute-specifier, and the caller -/// should try to parse it. bool Parser::DiagnoseProhibitedCXX11Attribute() { assert(Tok.is(tok::l_square) && NextToken().is(tok::l_square)); @@ -1901,10 +1767,6 @@ bool Parser::DiagnoseProhibitedCXX11Attribute() { llvm_unreachable("All cases handled above."); } -/// We have found the opening square brackets of a C++11 -/// attribute-specifier in a location where an attribute is not permitted, but -/// we know where the attributes ought to be written. Parse them anyway, and -/// provide a fixit moving them to the right place. void Parser::DiagnoseMisplacedCXX11Attribute(ParsedAttributes &Attrs, SourceLocation CorrectLocation) { assert((Tok.is(tok::l_square) && NextToken().is(tok::l_square)) || @@ -1997,13 +1859,6 @@ void Parser::DiagnoseCXX11AttributeExtension(ParsedAttributes &Attrs) { } } -// Usually, `__attribute__((attrib)) class Foo {} var` means that attribute -// applies to var, not the type Foo. -// As an exception to the rule, __declspec(align(...)) before the -// class-key affects the type instead of the variable. 
-// Also, Microsoft-style [attributes] seem to affect the type instead of the -// variable. -// This function moves attributes that should apply to the type off DS to Attrs. void Parser::stripTypeAttributesOffDeclSpec(ParsedAttributes &Attrs, DeclSpec &DS, TagUseKind TUK) { if (TUK == TagUseKind::Reference) @@ -2024,22 +1879,6 @@ void Parser::stripTypeAttributesOffDeclSpec(ParsedAttributes &Attrs, } } -/// ParseDeclaration - Parse a full 'declaration', which consists of -/// declaration-specifiers, some number of declarators, and a semicolon. -/// 'Context' should be a DeclaratorContext value. This returns the -/// location of the semicolon in DeclEnd. -/// -/// declaration: [C99 6.7] -/// block-declaration -> -/// simple-declaration -/// others [FIXME] -/// [C++] template-declaration -/// [C++] namespace-definition -/// [C++] using-directive -/// [C++] using-declaration -/// [C++11/C11] static_assert-declaration -/// others... [FIXME] -/// Parser::DeclGroupPtrTy Parser::ParseDeclaration(DeclaratorContext Context, SourceLocation &DeclEnd, ParsedAttributes &DeclAttrs, @@ -2097,27 +1936,6 @@ Parser::DeclGroupPtrTy Parser::ParseDeclaration(DeclaratorContext Context, return Actions.ConvertDeclToDeclGroup(SingleDecl); } -/// simple-declaration: [C99 6.7: declaration] [C++ 7p1: dcl.dcl] -/// declaration-specifiers init-declarator-list[opt] ';' -/// [C++11] attribute-specifier-seq decl-specifier-seq[opt] -/// init-declarator-list ';' -///[C90/C++]init-declarator-list ';' [TODO] -/// [OMP] threadprivate-directive -/// [OMP] allocate-directive [TODO] -/// -/// for-range-declaration: [C++11 6.5p1: stmt.ranged] -/// attribute-specifier-seq[opt] type-specifier-seq declarator -/// -/// If RequireSemi is false, this does not check for a ';' at the end of the -/// declaration. If it is true, it checks for and eats it. -/// -/// If FRI is non-null, we might be parsing a for-range-declaration instead -/// of a simple-declaration. 
If we find that we are, we also parse the -/// for-range-initializer, and place it here. -/// -/// DeclSpecStart is used when decl-specifiers are parsed before parsing -/// the Declaration. The SourceLocation for this Decl is set to -/// DeclSpecStart if DeclSpecStart is non-null. Parser::DeclGroupPtrTy Parser::ParseSimpleDeclaration( DeclaratorContext Context, SourceLocation &DeclEnd, ParsedAttributes &DeclAttrs, ParsedAttributes &DeclSpecAttrs, @@ -2168,8 +1986,6 @@ Parser::DeclGroupPtrTy Parser::ParseSimpleDeclaration( return ParseDeclGroup(DS, Context, DeclAttrs, TemplateInfo, &DeclEnd, FRI); } -/// Returns true if this might be the start of a declarator, or a common typo -/// for a declarator. bool Parser::MightBeDeclarator(DeclaratorContext Context) { switch (Tok.getKind()) { case tok::annot_cxxscope: @@ -2234,9 +2050,6 @@ bool Parser::MightBeDeclarator(DeclaratorContext Context) { } } -/// Skip until we reach something which seems like a sensible place to pick -/// up parsing after a malformed declaration. This will sometimes stop sooner -/// than SkipUntil(tok::r_brace) would, but will never stop later. void Parser::SkipMalformedDecl() { while (true) { switch (Tok.getKind()) { @@ -2316,9 +2129,6 @@ void Parser::SkipMalformedDecl() { } } -/// ParseDeclGroup - Having concluded that this is either a function -/// definition or a group of object declarations, actually parse the -/// result. Parser::DeclGroupPtrTy Parser::ParseDeclGroup(ParsingDeclSpec &DS, DeclaratorContext Context, ParsedAttributes &Attrs, @@ -2634,8 +2444,6 @@ Parser::DeclGroupPtrTy Parser::ParseDeclGroup(ParsingDeclSpec &DS, return Actions.FinalizeDeclaratorGroup(getCurScope(), DS, DeclsInGroup); } -/// Parse an optional simple-asm-expr and attributes, and attach them to a -/// declarator. Returns true on an error. bool Parser::ParseAsmAttributesAfterDeclarator(Declarator &D) { // If a simple-asm-expr is present, parse it. 
if (Tok.is(tok::kw_asm)) { @@ -2654,28 +2462,6 @@ bool Parser::ParseAsmAttributesAfterDeclarator(Declarator &D) { return false; } -/// Parse 'declaration' after parsing 'declaration-specifiers -/// declarator'. This method parses the remainder of the declaration -/// (including any attributes or initializer, among other things) and -/// finalizes the declaration. -/// -/// init-declarator: [C99 6.7] -/// declarator -/// declarator '=' initializer -/// [GNU] declarator simple-asm-expr[opt] attributes[opt] -/// [GNU] declarator simple-asm-expr[opt] attributes[opt] '=' initializer -/// [C++] declarator initializer[opt] -/// -/// [C++] initializer: -/// [C++] '=' initializer-clause -/// [C++] '(' expression-list ')' -/// [C++0x] '=' 'default' [TODO] -/// [C++0x] '=' 'delete' -/// [C++0x] braced-init-list -/// -/// According to the standard grammar, =default and =delete are function -/// definitions, but that definitely doesn't fit with the parser here. -/// Decl *Parser::ParseDeclarationAfterDeclarator( Declarator &D, const ParsedTemplateInfo &TemplateInfo) { if (ParseAsmAttributesAfterDeclarator(D)) @@ -2941,12 +2727,6 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes( return OuterDecl ? OuterDecl : ThisDecl; } -/// ParseSpecifierQualifierList -/// specifier-qualifier-list: -/// type-specifier specifier-qualifier-list[opt] -/// type-qualifier specifier-qualifier-list[opt] -/// [GNU] attributes specifier-qualifier-list[opt] -/// void Parser::ParseSpecifierQualifierList( DeclSpec &DS, ImplicitTypenameContext AllowImplicitTypename, AccessSpecifier AS, DeclSpecContext DSC) { @@ -3023,15 +2803,6 @@ static bool isValidAfterIdentifierInDeclarator(const Token &T) { tok::colon); } -/// ParseImplicitInt - This method is called when we have an non-typename -/// identifier in a declspec (which normally terminates the decl spec) when -/// the declspec has no type specifier. In this case, the declspec is either -/// malformed or is "implicit int" (in K&R and C89). 
-/// -/// This method handles diagnosing this prettily and returns false if the -/// declspec is done being processed. If it recovers and thinks there may be -/// other pieces of declspec after it, it returns true. -/// bool Parser::ParseImplicitInt(DeclSpec &DS, CXXScopeSpec *SS, ParsedTemplateInfo &TemplateInfo, AccessSpecifier AS, DeclSpecContext DSC, @@ -3258,11 +3029,6 @@ bool Parser::ParseImplicitInt(DeclSpec &DS, CXXScopeSpec *SS, return true; } -/// Determine the declaration specifier context from the declarator -/// context. -/// -/// \param Context the declarator context, which is one of the -/// DeclaratorContext enumerator values. Parser::DeclSpecContext Parser::getDeclSpecContextFromDeclaratorContext(DeclaratorContext Context) { switch (Context) { @@ -3312,12 +3078,6 @@ Parser::getDeclSpecContextFromDeclaratorContext(DeclaratorContext Context) { llvm_unreachable("Missing DeclaratorContext case"); } -/// ParseAlignArgument - Parse the argument to an alignment-specifier. -/// -/// [C11] type-id -/// [C11] constant-expression -/// [C++0x] type-id ...[opt] -/// [C++0x] assignment-expression ...[opt] ExprResult Parser::ParseAlignArgument(StringRef KWName, SourceLocation Start, SourceLocation &EllipsisLoc, bool &IsType, ParsedType &TypeResult) { @@ -3341,14 +3101,6 @@ ExprResult Parser::ParseAlignArgument(StringRef KWName, SourceLocation Start, return ER; } -/// ParseAlignmentSpecifier - Parse an alignment-specifier, and add the -/// attribute to Attrs. 
-/// -/// alignment-specifier: -/// [C11] '_Alignas' '(' type-id ')' -/// [C11] '_Alignas' '(' constant-expression ')' -/// [C++11] 'alignas' '(' type-id ...[opt] ')' -/// [C++11] 'alignas' '(' assignment-expression ...[opt] ')' void Parser::ParseAlignmentSpecifier(ParsedAttributes &Attrs, SourceLocation *EndLoc) { assert(Tok.isOneOf(tok::kw_alignas, tok::kw__Alignas) && @@ -3401,10 +3153,6 @@ void Parser::DistributeCLateParsedAttrs(Decl *Dcl, } } -/// type-qualifier: -/// ('__ptrauth') '(' constant-expression -/// (',' constant-expression)[opt] -/// (',' constant-expression)[opt] ')' void Parser::ParsePtrauthQualifier(ParsedAttributes &Attrs) { assert(Tok.is(tok::kw___ptrauth)); @@ -3440,8 +3188,6 @@ void Parser::ParsePtrauthQualifier(ParsedAttributes &Attrs) { /*IsRegularKeywordAttribute=*/false)); } -/// Bounds attributes (e.g., counted_by): -/// AttrName '(' expression ')' void Parser::ParseBoundsAttribute(IdentifierInfo &AttrName, SourceLocation AttrNameLoc, ParsedAttributes &Attrs, @@ -3508,13 +3254,6 @@ ExprResult Parser::ParseExtIntegerArgument() { return ER; } -/// Determine whether we're looking at something that might be a declarator -/// in a simple-declaration. If it can't possibly be a declarator, maybe -/// diagnose a missing semicolon after a prior tag definition in the decl -/// specifier. -/// -/// \return \c true if an error occurred and this can't be any kind of -/// declaration. 
bool Parser::DiagnoseMissingSemiAfterTagDefinition(DeclSpec &DS, AccessSpecifier AS, DeclSpecContext DSContext, @@ -3619,33 +3358,6 @@ Parser::DiagnoseMissingSemiAfterTagDefinition(DeclSpec &DS, AccessSpecifier AS, return false; } -/// ParseDeclarationSpecifiers -/// declaration-specifiers: [C99 6.7] -/// storage-class-specifier declaration-specifiers[opt] -/// type-specifier declaration-specifiers[opt] -/// [C99] function-specifier declaration-specifiers[opt] -/// [C11] alignment-specifier declaration-specifiers[opt] -/// [GNU] attributes declaration-specifiers[opt] -/// [Clang] '__module_private__' declaration-specifiers[opt] -/// [ObjC1] '__kindof' declaration-specifiers[opt] -/// -/// storage-class-specifier: [C99 6.7.1] -/// 'typedef' -/// 'extern' -/// 'static' -/// 'auto' -/// 'register' -/// [C++] 'mutable' -/// [C++11] 'thread_local' -/// [C11] '_Thread_local' -/// [GNU] '__thread' -/// function-specifier: [C99 6.7.4] -/// [C99] 'inline' -/// [C++] 'virtual' -/// [C++] 'explicit' -/// [OpenCL] '__kernel' -/// 'friend': [C++ dcl.friend] -/// 'constexpr': [C++0x dcl.constexpr] void Parser::ParseDeclarationSpecifiers( DeclSpec &DS, ParsedTemplateInfo &TemplateInfo, AccessSpecifier AS, DeclSpecContext DSContext, LateParsedAttrList *LateAttrs, @@ -4024,13 +3736,14 @@ void Parser::ParseDeclarationSpecifiers( } case tok::kw___is_signed: - // GNU libstdc++ 4.4 uses __is_signed as an identifier, but Clang - // typically treats it as a trait. If we see __is_signed as it appears - // in libstdc++, e.g., + // HACK: before 2022-12, libstdc++ uses __is_signed as an identifier, + // but Clang typically treats it as a trait. + // If we see __is_signed as it appears in libstdc++, e.g., // // static const bool __is_signed; // // then treat __is_signed as an identifier rather than as a keyword. + // This was fixed by libstdc++ in December 2022. 
if (DS.getTypeSpecType() == TST_bool && DS.getTypeQualifiers() == DeclSpec::TQ_const && DS.getStorageClassSpec() == DeclSpec::SCS_static) @@ -4975,27 +4688,6 @@ static void DiagnoseCountAttributedTypeInUnnamedAnon(ParsingDeclSpec &DS, } } -/// ParseStructDeclaration - Parse a struct declaration without the terminating -/// semicolon. -/// -/// Note that a struct declaration refers to a declaration in a struct, -/// not to the declaration of a struct. -/// -/// struct-declaration: -/// [C23] attributes-specifier-seq[opt] -/// specifier-qualifier-list struct-declarator-list -/// [GNU] __extension__ struct-declaration -/// [GNU] specifier-qualifier-list -/// struct-declarator-list: -/// struct-declarator -/// struct-declarator-list ',' struct-declarator -/// [GNU] struct-declarator-list ',' attributes[opt] struct-declarator -/// struct-declarator: -/// declarator -/// [GNU] declarator attributes[opt] -/// declarator[opt] ':' constant-expression -/// [GNU] declarator[opt] ':' constant-expression attributes[opt] -/// void Parser::ParseStructDeclaration( ParsingDeclSpec &DS, llvm::function_ref FieldsCallback, @@ -5099,11 +4791,6 @@ void Parser::ParseLexedCAttributeList(LateParsedAttrList &LAs, bool EnterScope, LAs.clear(); } -/// Finish parsing an attribute for which parsing was delayed. -/// This will be called at the end of parsing a class declaration -/// for each LateParsedAttribute. We consume the saved tokens and -/// create an attribute with the arguments filled in. We add this -/// to the Attribute list for the decl. 
void Parser::ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope, ParsedAttributes *OutAttrs) { // Create a fake EOF so that attribute parsing won't go off the end of the @@ -5153,16 +4840,6 @@ void Parser::ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope, } } -/// ParseStructUnionBody -/// struct-contents: -/// struct-declaration-list -/// [EXT] empty -/// [GNU] "struct-declaration-list" without terminating ';' -/// struct-declaration-list: -/// struct-declaration -/// struct-declaration-list struct-declaration -/// [OBC] '@' 'defs' '(' class-name ')' -/// void Parser::ParseStructUnionBody(SourceLocation RecordLoc, DeclSpec::TST TagType, RecordDecl *TagDecl) { PrettyDeclStackTraceEntry CrashInfo(Actions.Context, TagDecl, RecordLoc, @@ -5299,36 +4976,6 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc, Actions.ActOnTagFinishDefinition(getCurScope(), TagDecl, T.getRange()); } -/// ParseEnumSpecifier -/// enum-specifier: [C99 6.7.2.2] -/// 'enum' identifier[opt] '{' enumerator-list '}' -///[C99/C++]'enum' identifier[opt] '{' enumerator-list ',' '}' -/// [GNU] 'enum' attributes[opt] identifier[opt] '{' enumerator-list ',' [opt] -/// '}' attributes[opt] -/// [MS] 'enum' __declspec[opt] identifier[opt] '{' enumerator-list ',' [opt] -/// '}' -/// 'enum' identifier -/// [GNU] 'enum' attributes[opt] identifier -/// -/// [C++11] enum-head '{' enumerator-list[opt] '}' -/// [C++11] enum-head '{' enumerator-list ',' '}' -/// -/// enum-head: [C++11] -/// enum-key attribute-specifier-seq[opt] identifier[opt] enum-base[opt] -/// enum-key attribute-specifier-seq[opt] nested-name-specifier -/// identifier enum-base[opt] -/// -/// enum-key: [C++11] -/// 'enum' -/// 'enum' 'class' -/// 'enum' 'struct' -/// -/// enum-base: [C++11] -/// ':' type-specifier-seq -/// -/// [C++] elaborated-type-specifier: -/// [C++] 'enum' nested-name-specifier[opt] identifier -/// void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS, const 
ParsedTemplateInfo &TemplateInfo, AccessSpecifier AS, DeclSpecContext DSC) { @@ -5712,16 +5359,6 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS, Diag(StartLoc, DiagID) << PrevSpec; } -/// ParseEnumBody - Parse a {} enclosed enumerator-list. -/// enumerator-list: -/// enumerator -/// enumerator-list ',' enumerator -/// enumerator: -/// enumeration-constant attributes[opt] -/// enumeration-constant attributes[opt] '=' constant-expression -/// enumeration-constant: -/// identifier -/// void Parser::ParseEnumBody(SourceLocation StartLoc, Decl *EnumDecl, SkipBodyInfo *SkipBody) { // Enter the scope of the enum body and start the definition. @@ -5860,9 +5497,6 @@ void Parser::ParseEnumBody(SourceLocation StartLoc, Decl *EnumDecl, } } -/// isKnownToBeTypeSpecifier - Return true if we know that the specified token -/// is definitely a type-specifier. Return false if it isn't part of a type -/// specifier or if we're not sure. bool Parser::isKnownToBeTypeSpecifier(const Token &Tok) const { switch (Tok.getKind()) { default: return false; @@ -5918,8 +5552,6 @@ bool Parser::isKnownToBeTypeSpecifier(const Token &Tok) const { } } -/// isTypeSpecifierQualifier - Return true if the current token could be the -/// start of a specifier-qualifier-list. bool Parser::isTypeSpecifierQualifier() { switch (Tok.getKind()) { default: return false; @@ -6098,13 +5730,6 @@ Parser::DeclGroupPtrTy Parser::ParseTopLevelStmtDecl() { return Actions.BuildDeclaratorGroup(DeclsInGroup); } -/// isDeclarationSpecifier() - Return true if the current token is part of a -/// declaration specifier. -/// -/// \param AllowImplicitTypename whether this is a context where T::type [T -/// dependent] can appear. -/// \param DisambiguatingWithExpression True to indicate that the purpose of -/// this check is to disambiguate between an expression and a declaration. 
bool Parser::isDeclarationSpecifier( ImplicitTypenameContext AllowImplicitTypename, bool DisambiguatingWithExpression) { @@ -6498,18 +6123,6 @@ bool Parser::isConstructorDeclarator(bool IsUnqualified, bool DeductionGuide, return IsConstructor; } -/// ParseTypeQualifierListOpt -/// type-qualifier-list: [C99 6.7.5] -/// type-qualifier -/// [vendor] attributes -/// [ only if AttrReqs & AR_VendorAttributesParsed ] -/// type-qualifier-list type-qualifier -/// [vendor] type-qualifier-list attributes -/// [ only if AttrReqs & AR_VendorAttributesParsed ] -/// [C++0x] attribute-specifier[opt] is allowed before cv-qualifier-seq -/// [ only if AttReqs & AR_CXX11AttributesParsed ] -/// Note: vendor can be GNU, MS, etc and can be explicitly controlled via -/// AttrRequirements bitmask values. void Parser::ParseTypeQualifierListOpt( DeclSpec &DS, unsigned AttrReqs, bool AtomicOrPtrauthAllowed, bool IdentifierRequired, @@ -6679,7 +6292,6 @@ void Parser::ParseTypeQualifierListOpt( } } -/// ParseDeclarator - Parse and verify a newly-initialized declarator. void Parser::ParseDeclarator(Declarator &D) { /// This implements the 'declarator' production in the C grammar, then checks /// for well-formedness and issues diagnostics. @@ -6727,31 +6339,6 @@ static bool isPipeDeclarator(const Declarator &D) { return false; } -/// ParseDeclaratorInternal - Parse a C or C++ declarator. The direct-declarator -/// is parsed by the function passed to it. Pass null, and the direct-declarator -/// isn't parsed at all, making this function effectively parse the C++ -/// ptr-operator production. -/// -/// If the grammar of this construct is extended, matching changes must also be -/// made to TryParseDeclarator and MightBeDeclarator, and possibly to -/// isConstructorDeclarator. 
-/// -/// declarator: [C99 6.7.5] [C++ 8p4, dcl.decl] -/// [C] pointer[opt] direct-declarator -/// [C++] direct-declarator -/// [C++] ptr-operator declarator -/// -/// pointer: [C99 6.7.5] -/// '*' type-qualifier-list[opt] -/// '*' type-qualifier-list[opt] pointer -/// -/// ptr-operator: -/// '*' cv-qualifier-seq[opt] -/// '&' -/// [C++0x] '&&' -/// [GNU] '&' restrict[opt] attributes[opt] -/// [GNU?] '&&' restrict[opt] attributes[opt] -/// '::'[opt] nested-name-specifier '*' cv-qualifier-seq[opt] void Parser::ParseDeclaratorInternal(Declarator &D, DirectDeclParseFunction DirectDeclParser) { if (Diags.hasAllExtensionsSilenced()) @@ -6953,52 +6540,6 @@ static SourceLocation getMissingDeclaratorIdLoc(Declarator &D, return Loc; } -/// ParseDirectDeclarator -/// direct-declarator: [C99 6.7.5] -/// [C99] identifier -/// '(' declarator ')' -/// [GNU] '(' attributes declarator ')' -/// [C90] direct-declarator '[' constant-expression[opt] ']' -/// [C99] direct-declarator '[' type-qual-list[opt] assignment-expr[opt] ']' -/// [C99] direct-declarator '[' 'static' type-qual-list[opt] assign-expr ']' -/// [C99] direct-declarator '[' type-qual-list 'static' assignment-expr ']' -/// [C99] direct-declarator '[' type-qual-list[opt] '*' ']' -/// [C++11] direct-declarator '[' constant-expression[opt] ']' -/// attribute-specifier-seq[opt] -/// direct-declarator '(' parameter-type-list ')' -/// direct-declarator '(' identifier-list[opt] ')' -/// [GNU] direct-declarator '(' parameter-forward-declarations -/// parameter-type-list[opt] ')' -/// [C++] direct-declarator '(' parameter-declaration-clause ')' -/// cv-qualifier-seq[opt] exception-specification[opt] -/// [C++11] direct-declarator '(' parameter-declaration-clause ')' -/// attribute-specifier-seq[opt] cv-qualifier-seq[opt] -/// ref-qualifier[opt] exception-specification[opt] -/// [C++] declarator-id -/// [C++11] declarator-id attribute-specifier-seq[opt] -/// -/// declarator-id: [C++ 8] -/// '...'[opt] id-expression -/// '::'[opt] 
nested-name-specifier[opt] type-name -/// -/// id-expression: [C++ 5.1] -/// unqualified-id -/// qualified-id -/// -/// unqualified-id: [C++ 5.1] -/// identifier -/// operator-function-id -/// conversion-function-id -/// '~' class-name -/// template-id -/// -/// C++17 adds the following, which we also handle here: -/// -/// simple-declaration: -/// '[' identifier-list ']' brace-or-equal-initializer ';' -/// -/// Note, any additional constructs added here may need corresponding changes -/// in isConstructorDeclarator. void Parser::ParseDirectDeclarator(Declarator &D) { DeclaratorScopeObj DeclScopeObj(*this, D.getCXXScopeSpec()); @@ -7473,19 +7014,6 @@ void Parser::ParseDecompositionDeclarator(Declarator &D) { T.getCloseLocation()); } -/// ParseParenDeclarator - We parsed the declarator D up to a paren. This is -/// only called before the identifier, so these are most likely just grouping -/// parens for precedence. If we find that these are actually function -/// parameter parens in an abstract-declarator, we call ParseFunctionDeclarator. -/// -/// direct-declarator: -/// '(' declarator ')' -/// [GNU] '(' attributes declarator ')' -/// direct-declarator '(' parameter-type-list ')' -/// direct-declarator '(' identifier-list[opt] ')' -/// [GNU] direct-declarator '(' parameter-forward-declarations -/// parameter-type-list[opt] ')' -/// void Parser::ParseParenDeclarator(Declarator &D) { BalancedDelimiterTracker T(*this, tok::l_paren); T.consumeOpen(); @@ -7628,26 +7156,6 @@ void Parser::InitCXXThisScopeForDeclaratorIfRelevant( IsCXX11MemberFunction); } -/// ParseFunctionDeclarator - We are after the identifier and have parsed the -/// declarator D up to a paren, which indicates that we are parsing function -/// arguments. -/// -/// If FirstArgAttrs is non-null, then the caller parsed those attributes -/// immediately after the open paren - they will be applied to the DeclSpec -/// of the first parameter. 
-/// -/// If RequiresArg is true, then the first argument of the function is required -/// to be present and required to not be an identifier list. -/// -/// For C++, after the parameter-list, it also parses the cv-qualifier-seq[opt], -/// (C++11) ref-qualifier[opt], exception-specification[opt], -/// (C++11) attribute-specifier-seq[opt], (C++11) trailing-return-type[opt] and -/// (C++2a) the trailing requires-clause. -/// -/// [C++11] exception-specification: -/// dynamic-exception-specification -/// noexcept-specification -/// void Parser::ParseFunctionDeclarator(Declarator &D, ParsedAttributes &FirstArgAttrs, BalancedDelimiterTracker &Tracker, @@ -7844,8 +7352,6 @@ void Parser::ParseFunctionDeclarator(Declarator &D, std::move(FnAttrs), EndLoc); } -/// ParseRefQualifier - Parses a member function ref-qualifier. Returns -/// true if a ref-qualifier is found. bool Parser::ParseRefQualifier(bool &RefQualifierIsLValueRef, SourceLocation &RefQualifierLoc) { if (Tok.isOneOf(tok::amp, tok::ampamp)) { @@ -7860,11 +7366,6 @@ bool Parser::ParseRefQualifier(bool &RefQualifierIsLValueRef, return false; } -/// isFunctionDeclaratorIdentifierList - This parameter list may have an -/// identifier list form for a K&R-style function: void foo(a,b,c) -/// -/// Note that identifier-lists are only allowed for normal declarators, not for -/// abstract-declarators. bool Parser::isFunctionDeclaratorIdentifierList() { return !getLangOpts().requiresStrictPrototypes() && Tok.is(tok::identifier) @@ -7888,15 +7389,6 @@ bool Parser::isFunctionDeclaratorIdentifierList() { (NextToken().is(tok::comma) || NextToken().is(tok::r_paren))); } -/// ParseFunctionDeclaratorIdentifierList - While parsing a function declarator -/// we found a K&R-style identifier list instead of a typed parameter list. -/// -/// After returning, ParamInfo will hold the parsed parameters. 
-/// -/// identifier-list: [C99 6.7.5] -/// identifier -/// identifier-list ',' identifier -/// void Parser::ParseFunctionDeclaratorIdentifierList( Declarator &D, SmallVectorImpl &ParamInfo) { @@ -7946,38 +7438,6 @@ void Parser::ParseFunctionDeclaratorIdentifierList( } while (TryConsumeToken(tok::comma)); } -/// ParseParameterDeclarationClause - Parse a (possibly empty) parameter-list -/// after the opening parenthesis. This function will not parse a K&R-style -/// identifier list. -/// -/// DeclContext is the context of the declarator being parsed. If FirstArgAttrs -/// is non-null, then the caller parsed those attributes immediately after the -/// open paren - they will be applied to the DeclSpec of the first parameter. -/// -/// After returning, ParamInfo will hold the parsed parameters. EllipsisLoc will -/// be the location of the ellipsis, if any was parsed. -/// -/// parameter-type-list: [C99 6.7.5] -/// parameter-list -/// parameter-list ',' '...' -/// [C++] parameter-list '...' -/// -/// parameter-list: [C99 6.7.5] -/// parameter-declaration -/// parameter-list ',' parameter-declaration -/// -/// parameter-declaration: [C99 6.7.5] -/// declaration-specifiers declarator -/// [C++] declaration-specifiers declarator '=' assignment-expression -/// [C++11] initializer-clause -/// [GNU] declaration-specifiers declarator attributes -/// declaration-specifiers abstract-declarator[opt] -/// [C++] declaration-specifiers abstract-declarator[opt] -/// '=' assignment-expression -/// [GNU] declaration-specifiers abstract-declarator[opt] attributes -/// [C++11] attribute-specifier-seq parameter-declaration -/// [C++2b] attribute-specifier-seq 'this' parameter-declaration -/// void Parser::ParseParameterDeclarationClause( DeclaratorContext DeclaratorCtx, ParsedAttributes &FirstArgAttrs, SmallVectorImpl &ParamInfo, @@ -8256,13 +7716,6 @@ void Parser::ParseParameterDeclarationClause( } while (TryConsumeToken(tok::comma)); } -/// [C90] direct-declarator '[' 
constant-expression[opt] ']' -/// [C99] direct-declarator '[' type-qual-list[opt] assignment-expr[opt] ']' -/// [C99] direct-declarator '[' 'static' type-qual-list[opt] assign-expr ']' -/// [C99] direct-declarator '[' type-qual-list 'static' assignment-expr ']' -/// [C99] direct-declarator '[' type-qual-list[opt] '*' ']' -/// [C++11] direct-declarator '[' constant-expression[opt] ']' -/// attribute-specifier-seq[opt] void Parser::ParseBracketDeclarator(Declarator &D) { if (CheckProhibitedCXX11Attribute()) return; @@ -8378,7 +7831,6 @@ void Parser::ParseBracketDeclarator(Declarator &D) { std::move(DS.getAttributes()), T.getCloseLocation()); } -/// Diagnose brackets before an identifier. void Parser::ParseMisplacedBracketDeclarator(Declarator &D) { assert(Tok.is(tok::l_square) && "Missing opening bracket"); assert(!D.mayOmitIdentifier() && "Declarator cannot omit identifier"); @@ -8467,18 +7919,6 @@ void Parser::ParseMisplacedBracketDeclarator(Declarator &D) { } } -/// [GNU] typeof-specifier: -/// typeof ( expressions ) -/// typeof ( type-name ) -/// [GNU/C++] typeof unary-expression -/// [C23] typeof-specifier: -/// typeof '(' typeof-specifier-argument ')' -/// typeof_unqual '(' typeof-specifier-argument ')' -/// -/// typeof-specifier-argument: -/// expression -/// type-name -/// void Parser::ParseTypeofSpecifier(DeclSpec &DS) { assert(Tok.isOneOf(tok::kw_typeof, tok::kw_typeof_unqual) && "Not a typeof specifier"); @@ -8552,9 +7992,6 @@ void Parser::ParseTypeofSpecifier(DeclSpec &DS) { Diag(StartLoc, DiagID) << PrevSpec; } -/// [C11] atomic-specifier: -/// _Atomic ( type-name ) -/// void Parser::ParseAtomicSpecifier(DeclSpec &DS) { assert(Tok.is(tok::kw__Atomic) && NextToken().is(tok::l_paren) && "Not an atomic specifier"); @@ -8587,8 +8024,6 @@ void Parser::ParseAtomicSpecifier(DeclSpec &DS) { Diag(StartLoc, DiagID) << PrevSpec; } -/// TryAltiVecVectorTokenOutOfLine - Out of line body that should only be called -/// from TryAltiVecVectorToken. 
bool Parser::TryAltiVecVectorTokenOutOfLine() { Token Next = NextToken(); switch (Next.getKind()) { diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index f1216331877ba..316bc30edf1f0 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -36,33 +36,6 @@ using namespace clang; -/// ParseNamespace - We know that the current token is a namespace keyword. This -/// may either be a top level namespace or a block-level namespace alias. If -/// there was an inline keyword, it has already been parsed. -/// -/// namespace-definition: [C++: namespace.def] -/// named-namespace-definition -/// unnamed-namespace-definition -/// nested-namespace-definition -/// -/// named-namespace-definition: -/// 'inline'[opt] 'namespace' attributes[opt] identifier '{' -/// namespace-body '}' -/// -/// unnamed-namespace-definition: -/// 'inline'[opt] 'namespace' attributes[opt] '{' namespace-body '}' -/// -/// nested-namespace-definition: -/// 'namespace' enclosing-namespace-specifier '::' 'inline'[opt] -/// identifier '{' namespace-body '}' -/// -/// enclosing-namespace-specifier: -/// identifier -/// enclosing-namespace-specifier '::' 'inline'[opt] identifier -/// -/// namespace-alias-definition: [C++ 7.3.2: namespace.alias] -/// 'namespace' identifier '=' qualified-namespace-specifier ';' -/// Parser::DeclGroupPtrTy Parser::ParseNamespace(DeclaratorContext Context, SourceLocation &DeclEnd, SourceLocation InlineLoc) { @@ -252,7 +225,6 @@ Parser::DeclGroupPtrTy Parser::ParseNamespace(DeclaratorContext Context, ImplicitUsingDirectiveDecl); } -/// ParseInnerNamespace - Parse the contents of a namespace. 
void Parser::ParseInnerNamespace(const InnerNamespaceInfoList &InnerNSs, unsigned int index, SourceLocation &InlineLoc, ParsedAttributes &attrs, @@ -291,9 +263,6 @@ void Parser::ParseInnerNamespace(const InnerNamespaceInfoList &InnerNSs, Actions.ActOnFinishNamespaceDef(NamespcDecl, Tracker.getCloseLocation()); } -/// ParseNamespaceAlias - Parse the part after the '=' in a namespace -/// alias definition. -/// Decl *Parser::ParseNamespaceAlias(SourceLocation NamespaceLoc, SourceLocation AliasLoc, IdentifierInfo *Alias, @@ -345,13 +314,6 @@ Decl *Parser::ParseNamespaceAlias(SourceLocation NamespaceLoc, Alias, SS, IdentLoc, Ident); } -/// ParseLinkage - We know that the current token is a string_literal -/// and just before that, that extern was seen. -/// -/// linkage-specification: [C++ 7.5p2: dcl.link] -/// 'extern' string-literal '{' declaration-seq[opt] '}' -/// 'extern' string-literal declaration -/// Decl *Parser::ParseLinkage(ParsingDeclSpec &DS, DeclaratorContext Context) { assert(isTokenStringLiteral() && "Not a string literal!"); ExprResult Lang = ParseUnevaluatedStringLiteralExpression(); @@ -436,20 +398,6 @@ Decl *Parser::ParseLinkage(ParsingDeclSpec &DS, DeclaratorContext Context) { : nullptr; } -/// Parse a standard C++ Modules export-declaration. -/// -/// export-declaration: -/// 'export' declaration -/// 'export' '{' declaration-seq[opt] '}' -/// -/// HLSL: Parse export function declaration. -/// -/// export-function-declaration: -/// 'export' function-declaration -/// -/// export-declaration-group: -/// 'export' '{' function-declaration-seq[opt] '}' -/// Decl *Parser::ParseExportDeclaration() { assert(Tok.is(tok::kw_export)); SourceLocation ExportLoc = ConsumeToken(); @@ -494,8 +442,6 @@ Decl *Parser::ParseExportDeclaration() { T.getCloseLocation()); } -/// ParseUsingDirectiveOrDeclaration - Parse C++ using using-declaration or -/// using-directive. Assumes that current token is 'using'. 
Parser::DeclGroupPtrTy Parser::ParseUsingDirectiveOrDeclaration( DeclaratorContext Context, const ParsedTemplateInfo &TemplateInfo, SourceLocation &DeclEnd, ParsedAttributes &Attrs) { @@ -536,16 +482,6 @@ Parser::DeclGroupPtrTy Parser::ParseUsingDirectiveOrDeclaration( AS_none); } -/// ParseUsingDirective - Parse C++ using-directive, assumes -/// that current token is 'namespace' and 'using' was already parsed. -/// -/// using-directive: [C++ 7.3.p4: namespace.udir] -/// 'using' 'namespace' ::[opt] nested-name-specifier[opt] -/// namespace-name ; -/// [GNU] using-directive: -/// 'using' 'namespace' ::[opt] nested-name-specifier[opt] -/// namespace-name attributes[opt] ; -/// Decl *Parser::ParseUsingDirective(DeclaratorContext Context, SourceLocation UsingLoc, SourceLocation &DeclEnd, @@ -612,11 +548,6 @@ Decl *Parser::ParseUsingDirective(DeclaratorContext Context, IdentLoc, NamespcName, attrs); } -/// Parse a using-declarator (or the identifier in a C++11 alias-declaration). -/// -/// using-declarator: -/// 'typename'[opt] nested-name-specifier unqualified-id -/// bool Parser::ParseUsingDeclarator(DeclaratorContext Context, UsingDeclarator &D) { D.clear(); @@ -687,29 +618,6 @@ bool Parser::ParseUsingDeclarator(DeclaratorContext Context, return false; } -/// ParseUsingDeclaration - Parse C++ using-declaration or alias-declaration. -/// Assumes that 'using' was already seen. 
-/// -/// using-declaration: [C++ 7.3.p3: namespace.udecl] -/// 'using' using-declarator-list[opt] ; -/// -/// using-declarator-list: [C++1z] -/// using-declarator '...'[opt] -/// using-declarator-list ',' using-declarator '...'[opt] -/// -/// using-declarator-list: [C++98-14] -/// using-declarator -/// -/// alias-declaration: C++11 [dcl.dcl]p1 -/// 'using' identifier attribute-specifier-seq[opt] = type-id ; -/// -/// using-enum-declaration: [C++20, dcl.enum] -/// 'using' elaborated-enum-specifier ; -/// The terminal name of the elaborated-enum-specifier undergoes -/// type-only lookup -/// -/// elaborated-enum-specifier: -/// 'enum' nested-name-specifier[opt] identifier Parser::DeclGroupPtrTy Parser::ParseUsingDeclaration( DeclaratorContext Context, const ParsedTemplateInfo &TemplateInfo, SourceLocation UsingLoc, SourceLocation &DeclEnd, @@ -1014,14 +922,6 @@ static FixItHint getStaticAssertNoMessageFixIt(const Expr *AssertExpr, return FixItHint::CreateInsertion(EndExprLoc, ", \"\""); } -/// ParseStaticAssertDeclaration - Parse C++0x or C11 static_assert-declaration. -/// -/// [C++0x] static_assert-declaration: -/// static_assert ( constant-expression , string-literal ) ; -/// -/// [C11] static_assert-declaration: -/// _Static_assert ( constant-expression , string-literal ) ; -/// Decl *Parser::ParseStaticAssertDeclaration(SourceLocation &DeclEnd) { assert(Tok.isOneOf(tok::kw_static_assert, tok::kw__Static_assert) && "Not a static_assert declaration"); @@ -1123,11 +1023,6 @@ Decl *Parser::ParseStaticAssertDeclaration(SourceLocation &DeclEnd) { T.getCloseLocation()); } -/// ParseDecltypeSpecifier - Parse a C++11 decltype specifier. 
-/// -/// 'decltype' ( expression ) -/// 'decltype' ( 'auto' ) [C++1y] -/// SourceLocation Parser::ParseDecltypeSpecifier(DeclSpec &DS) { assert(Tok.isOneOf(tok::kw_decltype, tok::annot_decltype) && "Not a decltype specifier"); @@ -1393,24 +1288,6 @@ bool Parser::MaybeParseTypeTransformTypeSpecifier(DeclSpec &DS) { return true; } -/// ParseBaseTypeSpecifier - Parse a C++ base-type-specifier which is either a -/// class name or decltype-specifier. Note that we only check that the result -/// names a type; semantic analysis will need to verify that the type names a -/// class. The result is either a type or null, depending on whether a type -/// name was found. -/// -/// base-type-specifier: [C++11 class.derived] -/// class-or-decltype -/// class-or-decltype: [C++11 class.derived] -/// nested-name-specifier[opt] class-name -/// decltype-specifier -/// class-name: [C++ class.name] -/// identifier -/// simple-template-id -/// -/// In C++98, instead of base-type-specifier, we have: -/// -/// ::[opt] nested-name-specifier[opt] class-name TypeResult Parser::ParseBaseTypeSpecifier(SourceLocation &BaseLoc, SourceLocation &EndLocation) { // Ignore attempts to use typename @@ -1571,9 +1448,6 @@ void Parser::ParseNullabilityClassAttributes(ParsedAttributes &attrs) { } } -/// Determine whether the following tokens are valid after a type-specifier -/// which could be a standalone declaration. This will conservatively return -/// true if there's any doubt, and is appropriate for insert-';' fixits. bool Parser::isValidAfterTypeSpecifier(bool CouldBeBitfield) { // This switch enumerates the valid "follow" set for type-specifiers. 
switch (Tok.getKind()) { @@ -1674,46 +1548,6 @@ bool Parser::isValidAfterTypeSpecifier(bool CouldBeBitfield) { return false; } -/// ParseClassSpecifier - Parse a C++ class-specifier [C++ class] or -/// elaborated-type-specifier [C++ dcl.type.elab]; we can't tell which -/// until we reach the start of a definition or see a token that -/// cannot start a definition. -/// -/// class-specifier: [C++ class] -/// class-head '{' member-specification[opt] '}' -/// class-head '{' member-specification[opt] '}' attributes[opt] -/// class-head: -/// class-key identifier[opt] base-clause[opt] -/// class-key nested-name-specifier identifier base-clause[opt] -/// class-key nested-name-specifier[opt] simple-template-id -/// base-clause[opt] -/// [GNU] class-key attributes[opt] identifier[opt] base-clause[opt] -/// [GNU] class-key attributes[opt] nested-name-specifier -/// identifier base-clause[opt] -/// [GNU] class-key attributes[opt] nested-name-specifier[opt] -/// simple-template-id base-clause[opt] -/// class-key: -/// 'class' -/// 'struct' -/// 'union' -/// -/// elaborated-type-specifier: [C++ dcl.type.elab] -/// class-key ::[opt] nested-name-specifier[opt] identifier -/// class-key ::[opt] nested-name-specifier[opt] 'template'[opt] -/// simple-template-id -/// -/// Note that the C++ class-specifier and elaborated-type-specifier, -/// together, subsume the C99 struct-or-union-specifier: -/// -/// struct-or-union-specifier: [C99 6.7.2.1] -/// struct-or-union identifier[opt] '{' struct-contents '}' -/// struct-or-union identifier -/// [GNU] struct-or-union attributes[opt] identifier[opt] '{' struct-contents -/// '}' attributes[opt] -/// [GNU] struct-or-union attributes[opt] identifier -/// struct-or-union: -/// 'struct' -/// 'union' void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind, SourceLocation StartLoc, DeclSpec &DS, ParsedTemplateInfo &TemplateInfo, @@ -2412,13 +2246,6 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind, } } -/// ParseBaseClause - Parse 
the base-clause of a C++ class [C++ class.derived]. -/// -/// base-clause : [C++ class.derived] -/// ':' base-specifier-list -/// base-specifier-list: -/// base-specifier '...'[opt] -/// base-specifier-list ',' base-specifier '...'[opt] void Parser::ParseBaseClause(Decl *ClassDecl) { assert(Tok.is(tok::colon) && "Not a base clause"); ConsumeToken(); @@ -2448,17 +2275,6 @@ void Parser::ParseBaseClause(Decl *ClassDecl) { Actions.ActOnBaseSpecifiers(ClassDecl, BaseInfo); } -/// ParseBaseSpecifier - Parse a C++ base-specifier. A base-specifier is -/// one entry in the base class list of a class specifier, for example: -/// class foo : public bar, virtual private baz { -/// 'public bar' and 'virtual private baz' are each base-specifiers. -/// -/// base-specifier: [C++ class.derived] -/// attribute-specifier-seq[opt] base-type-specifier -/// attribute-specifier-seq[opt] 'virtual' access-specifier[opt] -/// base-type-specifier -/// attribute-specifier-seq[opt] access-specifier 'virtual'[opt] -/// base-type-specifier BaseResult Parser::ParseBaseSpecifier(Decl *ClassDecl) { bool IsVirtual = false; SourceLocation StartLoc = Tok.getLocation(); @@ -2532,13 +2348,6 @@ BaseResult Parser::ParseBaseSpecifier(Decl *ClassDecl) { EllipsisLoc); } -/// getAccessSpecifierIfPresent - Determine whether the next token is -/// a C++ access-specifier. -/// -/// access-specifier: [C++ class.derived] -/// 'private' -/// 'protected' -/// 'public' AccessSpecifier Parser::getAccessSpecifierIfPresent() const { switch (Tok.getKind()) { default: @@ -2552,10 +2361,6 @@ AccessSpecifier Parser::getAccessSpecifierIfPresent() const { } } -/// If the given declarator has any parts for which parsing has to be -/// delayed, e.g., default arguments or an exception-specification, create a -/// late-parsed method declaration record to handle the parsing at the end of -/// the class definition. 
void Parser::HandleMemberFunctionDeclDelays(Declarator &DeclaratorInfo, Decl *ThisDecl) { DeclaratorChunk::FunctionTypeInfo &FTI = DeclaratorInfo.getFunctionTypeInfo(); @@ -2597,13 +2402,6 @@ void Parser::HandleMemberFunctionDeclDelays(Declarator &DeclaratorInfo, } } -/// isCXX11VirtSpecifier - Determine whether the given token is a C++11 -/// virt-specifier. -/// -/// virt-specifier: -/// override -/// final -/// __final VirtSpecifiers::Specifier Parser::isCXX11VirtSpecifier(const Token &Tok) const { if (!getLangOpts().CPlusPlus || Tok.isNot(tok::identifier)) return VirtSpecifiers::VS_None; @@ -2640,11 +2438,6 @@ VirtSpecifiers::Specifier Parser::isCXX11VirtSpecifier(const Token &Tok) const { return VirtSpecifiers::VS_None; } -/// ParseOptionalCXX11VirtSpecifierSeq - Parse a virt-specifier-seq. -/// -/// virt-specifier-seq: -/// virt-specifier -/// virt-specifier-seq virt-specifier void Parser::ParseOptionalCXX11VirtSpecifierSeq(VirtSpecifiers &VS, bool IsInterface, SourceLocation FriendLoc) { @@ -2690,8 +2483,6 @@ void Parser::ParseOptionalCXX11VirtSpecifierSeq(VirtSpecifiers &VS, } } -/// isCXX11FinalKeyword - Determine whether the next token is a C++11 -/// 'final' or Microsoft 'sealed' contextual keyword. bool Parser::isCXX11FinalKeyword() const { VirtSpecifiers::Specifier Specifier = isCXX11VirtSpecifier(); return Specifier == VirtSpecifiers::VS_Final || @@ -2751,10 +2542,6 @@ void Parser::ParseCXX2CReplaceableSpecifier(SourceLocation &MRS) { MRS = ConsumeToken(); } -/// isClassCompatibleKeyword - Determine whether the next token is a C++11 -/// 'final', a C++26 'trivially_relocatable_if_eligible', -/// 'replaceable_if_eligible', or Microsoft 'sealed' or 'abstract' contextual -/// keyword. 
bool Parser::isClassCompatibleKeyword(Token Tok) const { if (isCXX2CTriviallyRelocatableKeyword(Tok) || isCXX2CReplaceableKeyword(Tok)) return true; @@ -2856,8 +2643,6 @@ bool Parser::ParseCXXMemberDeclaratorBeforeInitializer( return false; } -/// Look for declaration specifiers possibly occurring after C++11 -/// virt-specifier-seq and diagnose them. void Parser::MaybeParseAndDiagnoseDeclSpecAfterCXX11VirtSpecifierSeq( Declarator &D, VirtSpecifiers &VS) { DeclSpec DS(AttrFactory); @@ -2911,56 +2696,6 @@ void Parser::MaybeParseAndDiagnoseDeclSpecAfterCXX11VirtSpecifierSeq( } } -/// ParseCXXClassMemberDeclaration - Parse a C++ class member declaration. -/// -/// member-declaration: -/// decl-specifier-seq[opt] member-declarator-list[opt] ';' -/// function-definition ';'[opt] -/// [C++26] friend-type-declaration -/// ::[opt] nested-name-specifier template[opt] unqualified-id ';'[TODO] -/// using-declaration [TODO] -/// [C++0x] static_assert-declaration -/// template-declaration -/// [GNU] '__extension__' member-declaration -/// -/// member-declarator-list: -/// member-declarator -/// member-declarator-list ',' member-declarator -/// -/// member-declarator: -/// declarator virt-specifier-seq[opt] pure-specifier[opt] -/// [C++2a] declarator requires-clause -/// declarator constant-initializer[opt] -/// [C++11] declarator brace-or-equal-initializer[opt] -/// identifier[opt] ':' constant-expression -/// -/// virt-specifier-seq: -/// virt-specifier -/// virt-specifier-seq virt-specifier -/// -/// virt-specifier: -/// override -/// final -/// [MS] sealed -/// -/// pure-specifier: -/// '= 0' -/// -/// constant-initializer: -/// '=' constant-expression -/// -/// friend-type-declaration: -/// 'friend' friend-type-specifier-list ; -/// -/// friend-type-specifier-list: -/// friend-type-specifier ...[opt] -/// friend-type-specifier-list , friend-type-specifier ...[opt] -/// -/// friend-type-specifier: -/// simple-type-specifier -/// elaborated-type-specifier -/// 
typename-specifier -/// Parser::DeclGroupPtrTy Parser::ParseCXXClassMemberDeclaration( AccessSpecifier AS, ParsedAttributes &AccessAttrs, ParsedTemplateInfo &TemplateInfo, ParsingDeclRAIIObject *TemplateDiags) { @@ -3585,26 +3320,6 @@ Parser::DeclGroupPtrTy Parser::ParseCXXClassMemberDeclaration( return Actions.FinalizeDeclaratorGroup(getCurScope(), DS, DeclsInGroup); } -/// ParseCXXMemberInitializer - Parse the brace-or-equal-initializer. -/// Also detect and reject any attempted defaulted/deleted function definition. -/// The location of the '=', if any, will be placed in EqualLoc. -/// -/// This does not check for a pure-specifier; that's handled elsewhere. -/// -/// brace-or-equal-initializer: -/// '=' initializer-expression -/// braced-init-list -/// -/// initializer-clause: -/// assignment-expression -/// braced-init-list -/// -/// defaulted/deleted function-definition: -/// '=' 'default' -/// '=' 'delete' -/// -/// Prior to C++0x, the assignment-expression in an initializer-clause must -/// be a constant-expression. ExprResult Parser::ParseCXXMemberInitializer(Decl *D, bool IsFunction, SourceLocation &EqualLoc) { assert(Tok.isOneOf(tok::equal, tok::l_brace) && @@ -3824,12 +3539,6 @@ Parser::DeclGroupPtrTy Parser::ParseCXXClassMemberDeclarationWithPragmas( } } -/// ParseCXXMemberSpecification - Parse the class definition. -/// -/// member-specification: -/// member-declaration member-specification[opt] -/// access-specifier ':' member-specification[opt] -/// void Parser::ParseCXXMemberSpecification(SourceLocation RecordLoc, SourceLocation AttrFixitLoc, ParsedAttributes &Attrs, @@ -4123,27 +3832,6 @@ void Parser::DiagnoseUnexpectedNamespace(NamedDecl *D) { Tok.setKind(tok::r_brace); } -/// ParseConstructorInitializer - Parse a C++ constructor initializer, -/// which explicitly initializes the members or base classes of a -/// class (C++ [class.base.init]). 
For example, the three initializers -/// after the ':' in the Derived constructor below: -/// -/// @code -/// class Base { }; -/// class Derived : Base { -/// int x; -/// float f; -/// public: -/// Derived(float f) : Base(), x(17), f(f) { } -/// }; -/// @endcode -/// -/// [C++] ctor-initializer: -/// ':' mem-initializer-list -/// -/// [C++] mem-initializer-list: -/// mem-initializer ...[opt] -/// mem-initializer ...[opt] , mem-initializer-list void Parser::ParseConstructorInitializer(Decl *ConstructorDecl) { assert(Tok.is(tok::colon) && "Constructor initializer always starts with ':'"); @@ -4195,18 +3883,6 @@ void Parser::ParseConstructorInitializer(Decl *ConstructorDecl) { AnyErrors); } -/// ParseMemInitializer - Parse a C++ member initializer, which is -/// part of a constructor initializer that explicitly initializes one -/// member or base class (C++ [class.base.init]). See -/// ParseConstructorInitializer for an example. -/// -/// [C++] mem-initializer: -/// mem-initializer-id '(' expression-list[opt] ')' -/// [C++0x] mem-initializer-id braced-init-list -/// -/// [C++] mem-initializer-id: -/// '::'[opt] nested-name-specifier[opt] class-name -/// identifier MemInitResult Parser::ParseMemInitializer(Decl *ConstructorDecl) { // parse '::'[opt] nested-name-specifier[opt] CXXScopeSpec SS; @@ -4318,15 +3994,6 @@ MemInitResult Parser::ParseMemInitializer(Decl *ConstructorDecl) { return Diag(Tok, diag::err_expected) << tok::l_paren; } -/// Parse a C++ exception-specification if present (C++0x [except.spec]). 
-/// -/// exception-specification: -/// dynamic-exception-specification -/// noexcept-specification -/// -/// noexcept-specification: -/// 'noexcept' -/// 'noexcept' '(' constant-expression ')' ExceptionSpecificationType Parser::tryParseExceptionSpecification( bool Delayed, SourceRange &SpecificationRange, SmallVectorImpl &DynamicExceptions, @@ -4446,17 +4113,6 @@ static void diagnoseDynamicExceptionSpecification(Parser &P, SourceRange Range, } } -/// ParseDynamicExceptionSpecification - Parse a C++ -/// dynamic-exception-specification (C++ [except.spec]). -/// -/// dynamic-exception-specification: -/// 'throw' '(' type-id-list [opt] ')' -/// [MS] 'throw' '(' '...' ')' -/// -/// type-id-list: -/// type-id ... [opt] -/// type-id-list ',' type-id ... [opt] -/// ExceptionSpecificationType Parser::ParseDynamicExceptionSpecification( SourceRange &SpecificationRange, SmallVectorImpl &Exceptions, SmallVectorImpl &Ranges) { @@ -4513,8 +4169,6 @@ ExceptionSpecificationType Parser::ParseDynamicExceptionSpecification( return Exceptions.empty() ? EST_DynamicNone : EST_Dynamic; } -/// ParseTrailingReturnType - Parse a trailing return type on a new-style -/// function declaration. TypeResult Parser::ParseTrailingReturnType(SourceRange &Range, bool MayBeFollowedByDirectInit) { assert(Tok.is(tok::arrow) && "expected arrow"); @@ -4526,7 +4180,6 @@ TypeResult Parser::ParseTrailingReturnType(SourceRange &Range, : DeclaratorContext::TrailingReturn); } -/// Parse a requires-clause as part of a function declaration. void Parser::ParseTrailingRequiresClause(Declarator &D) { assert(Tok.is(tok::kw_requires) && "expected requires"); @@ -4600,9 +4253,6 @@ void Parser::ParseTrailingRequiresClause(Declarator &D) { } } -/// We have just started parsing the definition of a new class, -/// so push that class onto our stack of classes that is currently -/// being parsed. 
Sema::ParsingClassState Parser::PushParsingClass(Decl *ClassDecl, bool NonNestedClass, bool IsInterface) { @@ -4612,20 +4262,12 @@ Sema::ParsingClassState Parser::PushParsingClass(Decl *ClassDecl, return Actions.PushParsingClass(); } -/// Deallocate the given parsed class and all of its nested -/// classes. void Parser::DeallocateParsedClasses(Parser::ParsingClass *Class) { for (unsigned I = 0, N = Class->LateParsedDeclarations.size(); I != N; ++I) delete Class->LateParsedDeclarations[I]; delete Class; } -/// Pop the top class of the stack of classes that are -/// currently being parsed. -/// -/// This routine should be called when we have finished parsing the -/// definition of a class, but have not yet popped the Scope -/// associated with the class's definition. void Parser::PopParsingClass(Sema::ParsingClassState state) { assert(!ClassStack.empty() && "Mismatched push/pop for class parsing"); @@ -4659,15 +4301,6 @@ void Parser::PopParsingClass(Sema::ParsingClassState state) { new LateParsedClass(this, Victim)); } -/// Try to parse an 'identifier' which appears within an attribute-token. -/// -/// \return the parsed identifier on success, and 0 if the next token is not an -/// attribute-token. -/// -/// C++11 [dcl.attr.grammar]p3: -/// If a keyword or an alternative token that satisfies the syntactic -/// requirements of an identifier is contained in an attribute-token, -/// it is considered an identifier. IdentifierInfo *Parser::TryParseCXX11AttributeIdentifier( SourceLocation &Loc, SemaCodeCompletion::AttributeCompletion Completion, const IdentifierInfo *Scope) { @@ -4821,7 +4454,6 @@ static bool IsBuiltInOrStandardCXX11Attribute(IdentifierInfo *AttrName, } } -/// Parse the argument to C++23's [[assume()]] attribute. 
bool Parser::ParseCXXAssumeAttributeArg(ParsedAttributes &Attrs, IdentifierInfo *AttrName, SourceLocation AttrNameLoc, @@ -4877,20 +4509,6 @@ bool Parser::ParseCXXAssumeAttributeArg(ParsedAttributes &Attrs, return false; } -/// ParseCXX11AttributeArgs -- Parse a C++11 attribute-argument-clause. -/// -/// [C++11] attribute-argument-clause: -/// '(' balanced-token-seq ')' -/// -/// [C++11] balanced-token-seq: -/// balanced-token -/// balanced-token-seq balanced-token -/// -/// [C++11] balanced-token: -/// '(' balanced-token-seq ')' -/// '[' balanced-token-seq ']' -/// '{' balanced-token-seq '}' -/// any token but '(', ')', '[', ']', '{', or '}' bool Parser::ParseCXX11AttributeArgs( IdentifierInfo *AttrName, SourceLocation AttrNameLoc, ParsedAttributes &Attrs, SourceLocation *EndLoc, IdentifierInfo *ScopeName, @@ -4989,30 +4607,6 @@ bool Parser::ParseCXX11AttributeArgs( return true; } -/// Parse a C++11 or C23 attribute-specifier. -/// -/// [C++11] attribute-specifier: -/// '[' '[' attribute-list ']' ']' -/// alignment-specifier -/// -/// [C++11] attribute-list: -/// attribute[opt] -/// attribute-list ',' attribute[opt] -/// attribute '...' -/// attribute-list ',' attribute '...' -/// -/// [C++11] attribute: -/// attribute-token attribute-argument-clause[opt] -/// -/// [C++11] attribute-token: -/// identifier -/// attribute-scoped-token -/// -/// [C++11] attribute-scoped-token: -/// attribute-namespace '::' identifier -/// -/// [C++11] attribute-namespace: -/// identifier void Parser::ParseCXX11AttributeSpecifierInternal(ParsedAttributes &Attrs, CachedTokens &OpenMPTokens, SourceLocation *EndLoc) { @@ -5167,10 +4761,6 @@ void Parser::ParseCXX11AttributeSpecifierInternal(ParsedAttributes &Attrs, SkipUntil(tok::r_square); } -/// ParseCXX11Attributes - Parse a C++11 or C23 attribute-specifier-seq. 
-/// -/// attribute-specifier-seq: -/// attribute-specifier-seq[opt] attribute-specifier void Parser::ParseCXX11Attributes(ParsedAttributes &Attrs) { SourceLocation StartLoc = Tok.getLocation(); SourceLocation EndLoc = StartLoc; @@ -5228,7 +4818,6 @@ SourceLocation Parser::SkipCXX11Attributes() { return EndLoc; } -/// Parse uuid() attribute when it appears in a [] Microsoft attribute. void Parser::ParseMicrosoftUuidAttributeArgs(ParsedAttributes &Attrs) { assert(Tok.is(tok::identifier) && "Not a Microsoft attribute list"); IdentifierInfo *UuidIdent = Tok.getIdentifierInfo(); @@ -5397,14 +4986,6 @@ void Parser::ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs) { ParsedAttr::Form::Microsoft()); } -/// ParseMicrosoftAttributes - Parse Microsoft attributes [Attr] -/// -/// [MS] ms-attribute: -/// '[' token-seq ']' -/// -/// [MS] ms-attribute-seq: -/// ms-attribute[opt] -/// ms-attribute ms-attribute-seq void Parser::ParseMicrosoftAttributes(ParsedAttributes &Attrs) { assert(Tok.is(tok::l_square) && "Not a Microsoft attribute list"); diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 4b5d677f4ba87..11cfbbe790418 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -43,113 +43,17 @@ #include using namespace clang; -/// Simple precedence-based parser for binary/ternary operators. -/// -/// Note: we diverge from the C99 grammar when parsing the assignment-expression -/// production. C99 specifies that the LHS of an assignment operator should be -/// parsed as a unary-expression, but consistency dictates that it be a -/// conditional-expession. In practice, the important thing here is that the -/// LHS of an assignment has to be an l-value, which productions between -/// unary-expression and conditional-expression don't produce. Because we want -/// consistency, we parse the LHS as a conditional-expression, then check for -/// l-value-ness in semantic analysis stages. 
-/// -/// \verbatim -/// pm-expression: [C++ 5.5] -/// cast-expression -/// pm-expression '.*' cast-expression -/// pm-expression '->*' cast-expression -/// -/// multiplicative-expression: [C99 6.5.5] -/// Note: in C++, apply pm-expression instead of cast-expression -/// cast-expression -/// multiplicative-expression '*' cast-expression -/// multiplicative-expression '/' cast-expression -/// multiplicative-expression '%' cast-expression -/// -/// additive-expression: [C99 6.5.6] -/// multiplicative-expression -/// additive-expression '+' multiplicative-expression -/// additive-expression '-' multiplicative-expression -/// -/// shift-expression: [C99 6.5.7] -/// additive-expression -/// shift-expression '<<' additive-expression -/// shift-expression '>>' additive-expression -/// -/// compare-expression: [C++20 expr.spaceship] -/// shift-expression -/// compare-expression '<=>' shift-expression -/// -/// relational-expression: [C99 6.5.8] -/// compare-expression -/// relational-expression '<' compare-expression -/// relational-expression '>' compare-expression -/// relational-expression '<=' compare-expression -/// relational-expression '>=' compare-expression -/// -/// equality-expression: [C99 6.5.9] -/// relational-expression -/// equality-expression '==' relational-expression -/// equality-expression '!=' relational-expression -/// -/// AND-expression: [C99 6.5.10] -/// equality-expression -/// AND-expression '&' equality-expression -/// -/// exclusive-OR-expression: [C99 6.5.11] -/// AND-expression -/// exclusive-OR-expression '^' AND-expression -/// -/// inclusive-OR-expression: [C99 6.5.12] -/// exclusive-OR-expression -/// inclusive-OR-expression '|' exclusive-OR-expression -/// -/// logical-AND-expression: [C99 6.5.13] -/// inclusive-OR-expression -/// logical-AND-expression '&&' inclusive-OR-expression -/// -/// logical-OR-expression: [C99 6.5.14] -/// logical-AND-expression -/// logical-OR-expression '||' logical-AND-expression -/// -/// 
conditional-expression: [C99 6.5.15] -/// logical-OR-expression -/// logical-OR-expression '?' expression ':' conditional-expression -/// [GNU] logical-OR-expression '?' ':' conditional-expression -/// [C++] the third operand is an assignment-expression -/// -/// assignment-expression: [C99 6.5.16] -/// conditional-expression -/// unary-expression assignment-operator assignment-expression -/// [C++] throw-expression [C++ 15] -/// -/// assignment-operator: one of -/// = *= /= %= += -= <<= >>= &= ^= |= -/// -/// expression: [C99 6.5.17] -/// assignment-expression ...[opt] -/// expression ',' assignment-expression ...[opt] -/// \endverbatim ExprResult Parser::ParseExpression(TypeCastState isTypeCast) { ExprResult LHS(ParseAssignmentExpression(isTypeCast)); return ParseRHSOfBinaryExpression(LHS, prec::Comma); } -/// This routine is called when the '@' is seen and consumed. -/// Current token is an Identifier and is not a 'try'. This -/// routine is necessary to disambiguate \@try-statement from, -/// for example, \@encode-expression. -/// ExprResult Parser::ParseExpressionWithLeadingAt(SourceLocation AtLoc) { ExprResult LHS(ParseObjCAtExpression(AtLoc)); return ParseRHSOfBinaryExpression(LHS, prec::Comma); } -/// This routine is called when a leading '__extension__' is seen and -/// consumed. This is necessary because the token gets consumed in the -/// process of disambiguating between an expression and a declaration. ExprResult Parser::ParseExpressionWithLeadingExtension(SourceLocation ExtLoc) { ExprResult LHS(true); @@ -167,7 +71,6 @@ Parser::ParseExpressionWithLeadingExtension(SourceLocation ExtLoc) { return ParseRHSOfBinaryExpression(LHS, prec::Comma); } -/// Parse an expr that doesn't include (top-level) commas. 
ExprResult Parser::ParseAssignmentExpression(TypeCastState isTypeCast) { if (Tok.is(tok::code_completion)) { cutOffParsing(); @@ -201,15 +104,6 @@ ExprResult Parser::ParseConditionalExpression() { return ParseRHSOfBinaryExpression(LHS, prec::Conditional); } -/// Parse an assignment expression where part of an Objective-C message -/// send has already been parsed. -/// -/// In this case \p LBracLoc indicates the location of the '[' of the message -/// send, and either \p ReceiverName or \p ReceiverExpr is non-null indicating -/// the receiver of the message. -/// -/// Since this handles full assignment-expression's, it handles postfix -/// expressions and other binary operators for these expressions as well. ExprResult Parser::ParseAssignmentExprWithObjCMessageExprStart(SourceLocation LBracLoc, SourceLocation SuperLoc, @@ -281,12 +175,6 @@ ExprResult Parser::ParseCaseExpression(SourceLocation CaseLoc) { return Actions.ActOnCaseExpr(CaseLoc, Res); } -/// Parse a constraint-expression. -/// -/// \verbatim -/// constraint-expression: C++2a[temp.constr.decl]p1 -/// logical-or-expression -/// \endverbatim ExprResult Parser::ParseConstraintExpression() { EnterExpressionEvaluationContext ConstantEvaluated( Actions, Sema::ExpressionEvaluationContext::Unevaluated); @@ -299,15 +187,6 @@ ExprResult Parser::ParseConstraintExpression() { return Res; } -/// \brief Parse a constraint-logical-and-expression. -/// -/// \verbatim -/// C++2a[temp.constr.decl]p1 -/// constraint-logical-and-expression: -/// primary-expression -/// constraint-logical-and-expression '&&' primary-expression -/// -/// \endverbatim ExprResult Parser::ParseConstraintLogicalAndExpression(bool IsTrailingRequiresClause) { EnterExpressionEvaluationContext ConstantEvaluated( @@ -390,16 +269,6 @@ Parser::ParseConstraintLogicalAndExpression(bool IsTrailingRequiresClause) { return LHS; } -/// \brief Parse a constraint-logical-or-expression. 
-/// -/// \verbatim -/// C++2a[temp.constr.decl]p1 -/// constraint-logical-or-expression: -/// constraint-logical-and-expression -/// constraint-logical-or-expression '||' -/// constraint-logical-and-expression -/// -/// \endverbatim ExprResult Parser::ParseConstraintLogicalOrExpression(bool IsTrailingRequiresClause) { ExprResult LHS(ParseConstraintLogicalAndExpression(IsTrailingRequiresClause)); @@ -445,8 +314,6 @@ bool Parser::isFoldOperator(tok::TokenKind Kind) const { return isFoldOperator(getBinOpPrecedence(Kind, GreaterThanIsOperator, true)); } -/// Parse a binary expression that starts with \p LHS and has a -/// precedence of at least \p MinPrec. ExprResult Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) { prec::Level NextTokPrec = getBinOpPrecedence(Tok.getKind(), @@ -717,12 +584,6 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) { } } -/// Parse a cast-expression, unary-expression or primary-expression, based -/// on \p ExprType. -/// -/// \p isAddressOfOperand exists because an id-expression that is the -/// operand of address-of gets special treatment due to member pointers. -/// ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand, TypeCastState isTypeCast, @@ -881,191 +742,6 @@ ExprResult Parser::ParseBuiltinPtrauthTypeDiscriminator() { /*isType=*/true, Ty.get().getAsOpaquePtr(), SourceRange(Loc, EndLoc)); } -/// Parse a cast-expression, or, if \pisUnaryExpression is true, parse -/// a unary-expression. -/// -/// \p isAddressOfOperand exists because an id-expression that is the operand -/// of address-of gets special treatment due to member pointers. NotCastExpr -/// is set to true if the token is not the start of a cast-expression, and no -/// diagnostic is emitted in this case and no tokens are consumed. 
-/// -/// \verbatim -/// cast-expression: [C99 6.5.4] -/// unary-expression -/// '(' type-name ')' cast-expression -/// -/// unary-expression: [C99 6.5.3] -/// postfix-expression -/// '++' unary-expression -/// '--' unary-expression -/// [Coro] 'co_await' cast-expression -/// unary-operator cast-expression -/// 'sizeof' unary-expression -/// 'sizeof' '(' type-name ')' -/// [C++11] 'sizeof' '...' '(' identifier ')' -/// [GNU] '__alignof' unary-expression -/// [GNU] '__alignof' '(' type-name ')' -/// [C11] '_Alignof' '(' type-name ')' -/// [C++11] 'alignof' '(' type-id ')' -/// [C2y] '_Countof' unary-expression -/// [C2y] '_Countof' '(' type-name ')' -/// [GNU] '&&' identifier -/// [C++11] 'noexcept' '(' expression ')' [C++11 5.3.7] -/// [C++] new-expression -/// [C++] delete-expression -/// -/// unary-operator: one of -/// '&' '*' '+' '-' '~' '!' -/// [GNU] '__extension__' '__real' '__imag' -/// -/// primary-expression: [C99 6.5.1] -/// [C99] identifier -/// [C++] id-expression -/// constant -/// string-literal -/// [C++] boolean-literal [C++ 2.13.5] -/// [C++11] 'nullptr' [C++11 2.14.7] -/// [C++11] user-defined-literal -/// '(' expression ')' -/// [C11] generic-selection -/// [C++2a] requires-expression -/// '__func__' [C99 6.4.2.2] -/// [GNU] '__FUNCTION__' -/// [MS] '__FUNCDNAME__' -/// [MS] 'L__FUNCTION__' -/// [MS] '__FUNCSIG__' -/// [MS] 'L__FUNCSIG__' -/// [GNU] '__PRETTY_FUNCTION__' -/// [GNU] '(' compound-statement ')' -/// [GNU] '__builtin_va_arg' '(' assignment-expression ',' type-name ')' -/// [GNU] '__builtin_offsetof' '(' type-name ',' offsetof-member-designator')' -/// [GNU] '__builtin_choose_expr' '(' assign-expr ',' assign-expr ',' -/// assign-expr ')' -/// [GNU] '__builtin_FILE' '(' ')' -/// [CLANG] '__builtin_FILE_NAME' '(' ')' -/// [GNU] '__builtin_FUNCTION' '(' ')' -/// [MS] '__builtin_FUNCSIG' '(' ')' -/// [GNU] '__builtin_LINE' '(' ')' -/// [CLANG] '__builtin_COLUMN' '(' ')' -/// [GNU] '__builtin_source_location' '(' ')' -/// [GNU] 
'__builtin_types_compatible_p' '(' type-name ',' type-name ')' -/// [GNU] '__null' -/// [OBJC] '[' objc-message-expr ']' -/// [OBJC] '\@selector' '(' objc-selector-arg ')' -/// [OBJC] '\@protocol' '(' identifier ')' -/// [OBJC] '\@encode' '(' type-name ')' -/// [OBJC] objc-string-literal -/// [C++] simple-type-specifier '(' expression-list[opt] ')' [C++ 5.2.3] -/// [C++11] simple-type-specifier braced-init-list [C++11 5.2.3] -/// [C++] typename-specifier '(' expression-list[opt] ')' [C++ 5.2.3] -/// [C++11] typename-specifier braced-init-list [C++11 5.2.3] -/// [C++] 'const_cast' '<' type-name '>' '(' expression ')' [C++ 5.2p1] -/// [C++] 'dynamic_cast' '<' type-name '>' '(' expression ')' [C++ 5.2p1] -/// [C++] 'reinterpret_cast' '<' type-name '>' '(' expression ')' [C++ 5.2p1] -/// [C++] 'static_cast' '<' type-name '>' '(' expression ')' [C++ 5.2p1] -/// [C++] 'typeid' '(' expression ')' [C++ 5.2p1] -/// [C++] 'typeid' '(' type-id ')' [C++ 5.2p1] -/// [C++] 'this' [C++ 9.3.2] -/// [G++] unary-type-trait '(' type-id ')' -/// [G++] binary-type-trait '(' type-id ',' type-id ')' [TODO] -/// [EMBT] array-type-trait '(' type-id ',' integer ')' -/// [clang] '^' block-literal -/// -/// constant: [C99 6.4.4] -/// integer-constant -/// floating-constant -/// enumeration-constant -> identifier -/// character-constant -/// -/// id-expression: [C++ 5.1] -/// unqualified-id -/// qualified-id -/// -/// unqualified-id: [C++ 5.1] -/// identifier -/// operator-function-id -/// conversion-function-id -/// '~' class-name -/// template-id -/// -/// new-expression: [C++ 5.3.4] -/// '::'[opt] 'new' new-placement[opt] new-type-id -/// new-initializer[opt] -/// '::'[opt] 'new' new-placement[opt] '(' type-id ')' -/// new-initializer[opt] -/// -/// delete-expression: [C++ 5.3.5] -/// '::'[opt] 'delete' cast-expression -/// '::'[opt] 'delete' '[' ']' cast-expression -/// -/// [GNU/Embarcadero] unary-type-trait: -/// '__is_arithmetic' -/// '__is_floating_point' -/// '__is_integral' -/// 
'__is_lvalue_expr' -/// '__is_rvalue_expr' -/// '__is_complete_type' -/// '__is_void' -/// '__is_array' -/// '__is_function' -/// '__is_reference' -/// '__is_lvalue_reference' -/// '__is_rvalue_reference' -/// '__is_fundamental' -/// '__is_object' -/// '__is_scalar' -/// '__is_compound' -/// '__is_pointer' -/// '__is_member_object_pointer' -/// '__is_member_function_pointer' -/// '__is_member_pointer' -/// '__is_const' -/// '__is_volatile' -/// '__is_trivial' -/// '__is_standard_layout' -/// '__is_signed' -/// '__is_unsigned' -/// -/// [GNU] unary-type-trait: -/// '__has_nothrow_assign' -/// '__has_nothrow_copy' -/// '__has_nothrow_constructor' -/// '__has_trivial_assign' [TODO] -/// '__has_trivial_copy' [TODO] -/// '__has_trivial_constructor' -/// '__has_trivial_destructor' -/// '__has_virtual_destructor' -/// '__is_abstract' [TODO] -/// '__is_class' -/// '__is_empty' [TODO] -/// '__is_enum' -/// '__is_final' -/// '__is_pod' -/// '__is_polymorphic' -/// '__is_sealed' [MS] -/// '__is_trivial' -/// '__is_union' -/// '__has_unique_object_representations' -/// -/// [Clang] unary-type-trait: -/// '__is_aggregate' -/// '__trivially_copyable' -/// -/// binary-type-trait: -/// [GNU] '__is_base_of' -/// [MS] '__is_convertible_to' -/// '__is_convertible' -/// '__is_same' -/// -/// [Embarcadero] array-type-trait: -/// '__array_rank' -/// '__array_extent' -/// -/// [Embarcadero] expression-trait: -/// '__is_lvalue_expr' -/// '__is_rvalue_expr' -/// \endverbatim -/// ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand, bool &NotCastExpr, @@ -1985,27 +1661,6 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, return Res; } -/// Once the leading part of a postfix-expression is parsed, this -/// method parses any suffixes that apply. 
-/// -/// \verbatim -/// postfix-expression: [C99 6.5.2] -/// primary-expression -/// postfix-expression '[' expression ']' -/// postfix-expression '[' braced-init-list ']' -/// postfix-expression '[' expression-list [opt] ']' [C++23 12.4.5] -/// postfix-expression '(' argument-expression-list[opt] ')' -/// postfix-expression '.' identifier -/// postfix-expression '->' identifier -/// postfix-expression '++' -/// postfix-expression '--' -/// '(' type-name ')' '{' initializer-list '}' -/// '(' type-name ')' '{' initializer-list ',' '}' -/// -/// argument-expression-list: [C99 6.5.2] -/// argument-expression ...[opt] -/// argument-expression-list ',' assignment-expression ...[opt] -/// \endverbatim ExprResult Parser::ParsePostfixExpressionSuffix(ExprResult LHS) { // Now that the primary-expression piece of the postfix-expression has been @@ -2428,38 +2083,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) { } } -/// ParseExprAfterUnaryExprOrTypeTrait - We parsed a typeof/sizeof/alignof/ -/// vec_step and we are at the start of an expression or a parenthesized -/// type-id. OpTok is the operand token (typeof/sizeof/alignof). Returns the -/// expression (isCastExpr == false) or the type (isCastExpr == true). 
-/// -/// \verbatim -/// unary-expression: [C99 6.5.3] -/// 'sizeof' unary-expression -/// 'sizeof' '(' type-name ')' -/// [Clang] '__datasizeof' unary-expression -/// [Clang] '__datasizeof' '(' type-name ')' -/// [GNU] '__alignof' unary-expression -/// [GNU] '__alignof' '(' type-name ')' -/// [C11] '_Alignof' '(' type-name ')' -/// [C++0x] 'alignof' '(' type-id ')' -/// -/// [GNU] typeof-specifier: -/// typeof ( expressions ) -/// typeof ( type-name ) -/// [GNU/C++] typeof unary-expression -/// [C23] typeof-specifier: -/// typeof '(' typeof-specifier-argument ')' -/// typeof_unqual '(' typeof-specifier-argument ')' -/// -/// typeof-specifier-argument: -/// expression -/// type-name -/// -/// [OpenCL 1.1 6.11.12] vec_step built-in function: -/// vec_step ( expressions ) -/// vec_step ( type-name ) -/// \endverbatim ExprResult Parser::ParseExprAfterUnaryExprOrTypeTrait(const Token &OpTok, bool &isCastExpr, @@ -2560,8 +2183,6 @@ Parser::ParseExprAfterUnaryExprOrTypeTrait(const Token &OpTok, return Operand; } -/// Parse a __builtin_sycl_unique_stable_name expression. Accepts a type-id as -/// a parameter. ExprResult Parser::ParseSYCLUniqueStableNameExpression() { assert(Tok.is(tok::kw___builtin_sycl_unique_stable_name) && "Not __builtin_sycl_unique_stable_name"); @@ -2588,22 +2209,6 @@ ExprResult Parser::ParseSYCLUniqueStableNameExpression() { OpLoc, T.getOpenLocation(), T.getCloseLocation(), Ty.get()); } -/// Parse a sizeof or alignof expression. -/// -/// \verbatim -/// unary-expression: [C99 6.5.3] -/// 'sizeof' unary-expression -/// 'sizeof' '(' type-name ')' -/// [C++11] 'sizeof' '...' 
'(' identifier ')' -/// [Clang] '__datasizeof' unary-expression -/// [Clang] '__datasizeof' '(' type-name ')' -/// [GNU] '__alignof' unary-expression -/// [GNU] '__alignof' '(' type-name ')' -/// [C11] '_Alignof' '(' type-name ')' -/// [C++11] 'alignof' '(' type-id ')' -/// [C2y] '_Countof' unary-expression -/// [C2y] '_Countof' '(' type-name ')' -/// \endverbatim ExprResult Parser::ParseUnaryExprOrTypeTraitExpression() { assert(Tok.isOneOf(tok::kw_sizeof, tok::kw___datasizeof, tok::kw___alignof, tok::kw_alignof, tok::kw__Alignof, tok::kw_vec_step, @@ -2731,29 +2336,6 @@ ExprResult Parser::ParseUnaryExprOrTypeTraitExpression() { return Operand; } -/// ParseBuiltinPrimaryExpression -/// -/// \verbatim -/// primary-expression: [C99 6.5.1] -/// [GNU] '__builtin_va_arg' '(' assignment-expression ',' type-name ')' -/// [GNU] '__builtin_offsetof' '(' type-name ',' offsetof-member-designator')' -/// [GNU] '__builtin_choose_expr' '(' assign-expr ',' assign-expr ',' -/// assign-expr ')' -/// [GNU] '__builtin_types_compatible_p' '(' type-name ',' type-name ')' -/// [GNU] '__builtin_FILE' '(' ')' -/// [CLANG] '__builtin_FILE_NAME' '(' ')' -/// [GNU] '__builtin_FUNCTION' '(' ')' -/// [MS] '__builtin_FUNCSIG' '(' ')' -/// [GNU] '__builtin_LINE' '(' ')' -/// [CLANG] '__builtin_COLUMN' '(' ')' -/// [GNU] '__builtin_source_location' '(' ')' -/// [OCL] '__builtin_astype' '(' assignment-expression ',' type-name ')' -/// -/// [GNU] offsetof-member-designator: -/// [GNU] identifier -/// [GNU] offsetof-member-designator '.' identifier -/// [GNU] offsetof-member-designator '[' expression ']' -/// \endverbatim ExprResult Parser::ParseBuiltinPrimaryExpression() { ExprResult Res; const IdentifierInfo *BuiltinII = Tok.getIdentifierInfo(); @@ -3049,33 +2631,6 @@ bool Parser::tryParseOpenMPArrayShapingCastPart() { return !ErrorFound; } -/// ParseParenExpression - This parses the unit that starts with a '(' token, -/// based on what is allowed by ExprType. 
The actual thing parsed is returned -/// in ExprType. If stopIfCastExpr is true, it will only return the parsed type, -/// not the parsed cast-expression. -/// -/// \verbatim -/// primary-expression: [C99 6.5.1] -/// '(' expression ')' -/// [GNU] '(' compound-statement ')' (if !ParenExprOnly) -/// postfix-expression: [C99 6.5.2] -/// '(' type-name ')' '{' initializer-list '}' -/// '(' type-name ')' '{' initializer-list ',' '}' -/// cast-expression: [C99 6.5.4] -/// '(' type-name ')' cast-expression -/// [ARC] bridged-cast-expression -/// [ARC] bridged-cast-expression: -/// (__bridge type-name) cast-expression -/// (__bridge_transfer type-name) cast-expression -/// (__bridge_retained type-name) cast-expression -/// fold-expression: [C++1z] -/// '(' cast-expression fold-operator '...' ')' -/// '(' '...' fold-operator cast-expression ')' -/// '(' cast-expression fold-operator '...' -/// fold-operator cast-expression ')' -/// [OPENMP] Array shaping operation -/// '(' '[' expression ']' { '[' expression ']' } cast-expression -/// \endverbatim ExprResult Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr, bool isTypeCast, ParsedType &CastTy, @@ -3421,14 +2976,6 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr, return Result; } -/// ParseCompoundLiteralExpression - We have parsed the parenthesized type-name -/// and we are at the left brace. -/// -/// \verbatim -/// postfix-expression: [C99 6.5.2] -/// '(' type-name ')' '{' initializer-list '}' -/// '(' type-name ')' '{' initializer-list ',' '}' -/// \endverbatim ExprResult Parser::ParseCompoundLiteralExpression(ParsedType Ty, SourceLocation LParenLoc, @@ -3443,14 +2990,6 @@ Parser::ParseCompoundLiteralExpression(ParsedType Ty, return Result; } -/// ParseStringLiteralExpression - This handles the various token types that -/// form string literals, and also handles string concatenation [C99 5.1.1.2, -/// translation phase #6]. 
-/// -/// \verbatim -/// primary-expression: [C99 6.5.1] -/// string-literal -/// \verbatim ExprResult Parser::ParseStringLiteralExpression(bool AllowUserDefinedLiteral) { return ParseStringLiteralExpression(AllowUserDefinedLiteral, /*Unevaluated=*/false); @@ -3487,25 +3026,6 @@ ExprResult Parser::ParseStringLiteralExpression(bool AllowUserDefinedLiteral, : nullptr); } -/// ParseGenericSelectionExpression - Parse a C11 generic-selection -/// [C11 6.5.1.1]. -/// -/// \verbatim -/// generic-selection: -/// _Generic ( assignment-expression , generic-assoc-list ) -/// generic-assoc-list: -/// generic-association -/// generic-assoc-list , generic-association -/// generic-association: -/// type-name : assignment-expression -/// default : assignment-expression -/// \endverbatim -/// -/// As an extension, Clang also accepts: -/// \verbatim -/// generic-selection: -/// _Generic ( type-name, generic-assoc-list ) -/// \endverbatim ExprResult Parser::ParseGenericSelectionExpression() { assert(Tok.is(tok::kw__Generic) && "_Generic keyword expected"); @@ -3604,14 +3124,6 @@ ExprResult Parser::ParseGenericSelectionExpression() { ExprOrTy, Types, Exprs); } -/// Parse A C++1z fold-expression after the opening paren and optional -/// left-hand-side expression. -/// -/// \verbatim -/// fold-expression: -/// ( cast-expression fold-operator ... ) -/// ( ... fold-operator cast-expression ) -/// ( cast-expression fold-operator ... fold-operator cast-expression ) ExprResult Parser::ParseFoldExpression(ExprResult LHS, BalancedDelimiterTracker &T) { if (LHS.isInvalid()) { @@ -3683,28 +3195,6 @@ void Parser::injectEmbedTokens() { ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true); } -/// ParseExpressionList - Used for C/C++ (argument-)expression-list. 
-/// -/// \verbatim -/// argument-expression-list: -/// assignment-expression -/// argument-expression-list , assignment-expression -/// -/// [C++] expression-list: -/// [C++] assignment-expression -/// [C++] expression-list , assignment-expression -/// -/// [C++0x] expression-list: -/// [C++0x] initializer-list -/// -/// [C++0x] initializer-list -/// [C++0x] initializer-clause ...[opt] -/// [C++0x] initializer-list , initializer-clause ...[opt] -/// -/// [C++0x] initializer-clause: -/// [C++0x] assignment-expression -/// [C++0x] braced-init-list -/// \endverbatim bool Parser::ParseExpressionList(SmallVectorImpl &Exprs, llvm::function_ref ExpressionStarts, bool FailImmediatelyOnInvalidExpr, @@ -3763,14 +3253,6 @@ bool Parser::ParseExpressionList(SmallVectorImpl &Exprs, return SawError; } -/// ParseSimpleExpressionList - A simple comma-separated list of expressions, -/// used for misc language extensions. -/// -/// \verbatim -/// simple-expression-list: -/// assignment-expression -/// simple-expression-list , assignment-expression -/// \endverbatim bool Parser::ParseSimpleExpressionList(SmallVectorImpl &Exprs) { while (true) { ExprResult Expr = ParseAssignmentExpression(); @@ -3791,12 +3273,6 @@ bool Parser::ParseSimpleExpressionList(SmallVectorImpl &Exprs) { } } -/// ParseBlockId - Parse a block-id, which roughly looks like int (int x). 
-/// -/// \verbatim -/// [clang] block-id: -/// [clang] specifier-qualifier-list block-declarator -/// \endverbatim void Parser::ParseBlockId(SourceLocation CaretLoc) { if (Tok.is(tok::code_completion)) { cutOffParsing(); @@ -3821,16 +3297,6 @@ void Parser::ParseBlockId(SourceLocation CaretLoc) { Actions.ActOnBlockArguments(CaretLoc, DeclaratorInfo, getCurScope()); } -/// ParseBlockLiteralExpression - Parse a block literal, which roughly looks -/// like ^(int x){ return x+1; } -/// -/// \verbatim -/// block-literal: -/// [clang] '^' block-args[opt] compound-statement -/// [clang] '^' block-id compound-statement -/// [clang] block-args: -/// [clang] '(' parameter-list ')' -/// \endverbatim ExprResult Parser::ParseBlockLiteralExpression() { assert(Tok.is(tok::caret) && "block literal starts with ^"); SourceLocation CaretLoc = ConsumeToken(); @@ -3929,10 +3395,6 @@ ExprResult Parser::ParseBlockLiteralExpression() { return Result; } -/// ParseObjCBoolLiteral - This handles the objective-c Boolean literals. -/// -/// '__objc_yes' -/// '__objc_no' ExprResult Parser::ParseObjCBoolLiteral() { tok::TokenKind Kind = Tok.getKind(); return Actions.ObjC().ActOnObjCBoolLiteral(ConsumeToken(), Kind); @@ -3978,11 +3440,6 @@ static bool CheckAvailabilitySpecList(Parser &P, return !Valid; } -/// Parse availability query specification. -/// -/// availability-spec: -/// '*' -/// identifier version-tuple std::optional Parser::ParseAvailabilitySpec() { if (Tok.is(tok::star)) { return AvailabilitySpec(ConsumeToken()); diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index 546c228a30513..d95260829e4a0 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -46,7 +46,6 @@ static int SelectDigraphErrorMessage(tok::TokenKind Kind) { } } -// Are the two tokens adjacent in the same source file? 
bool Parser::areTokensAdjacent(const Token &First, const Token &Second) { SourceManager &SM = PP.getSourceManager(); SourceLocation FirstLoc = SM.getSpellingLoc(First.getLocation()); @@ -82,8 +81,6 @@ static void FixDigraph(Parser &P, Preprocessor &PP, Token &DigraphToken, PP.EnterToken(DigraphToken, /*IsReinject*/ true); } -// Check for '<::' which should be '< ::' instead of '[:' when following -// a template name. void Parser::CheckForTemplateAndDigraph(Token &Next, ParsedType ObjectType, bool EnteringContext, IdentifierInfo &II, CXXScopeSpec &SS) { @@ -107,55 +104,6 @@ void Parser::CheckForTemplateAndDigraph(Token &Next, ParsedType ObjectType, /*AtDigraph*/false); } -/// Parse global scope or nested-name-specifier if present. -/// -/// Parses a C++ global scope specifier ('::') or nested-name-specifier (which -/// may be preceded by '::'). Note that this routine will not parse ::new or -/// ::delete; it will just leave them in the token stream. -/// -/// '::'[opt] nested-name-specifier -/// '::' -/// -/// nested-name-specifier: -/// type-name '::' -/// namespace-name '::' -/// nested-name-specifier identifier '::' -/// nested-name-specifier 'template'[opt] simple-template-id '::' -/// -/// -/// \param SS the scope specifier that will be set to the parsed -/// nested-name-specifier (or empty) -/// -/// \param ObjectType if this nested-name-specifier is being parsed following -/// the "." or "->" of a member access expression, this parameter provides the -/// type of the object whose members are being accessed. -/// -/// \param ObjectHadErrors if this unqualified-id occurs within a member access -/// expression, indicates whether the original subexpressions had any errors. -/// When true, diagnostics for missing 'template' keyword will be supressed. -/// -/// \param EnteringContext whether we will be entering into the context of -/// the nested-name-specifier after parsing it. 
-/// -/// \param MayBePseudoDestructor When non-NULL, points to a flag that -/// indicates whether this nested-name-specifier may be part of a -/// pseudo-destructor name. In this case, the flag will be set false -/// if we don't actually end up parsing a destructor name. Moreover, -/// if we do end up determining that we are parsing a destructor name, -/// the last component of the nested-name-specifier is not parsed as -/// part of the scope specifier. -/// -/// \param IsTypename If \c true, this nested-name-specifier is known to be -/// part of a type name. This is used to improve error recovery. -/// -/// \param LastII When non-NULL, points to an IdentifierInfo* that will be -/// filled in with the leading identifier in the last component of the -/// nested-name-specifier, if any. -/// -/// \param OnlyNamespace If true, only considers namespaces in lookup. -/// -/// -/// \returns true if there was an error parsing a scope specifier bool Parser::ParseOptionalCXXScopeSpecifier( CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors, bool EnteringContext, bool *MayBePseudoDestructor, bool IsTypename, @@ -707,48 +655,6 @@ Parser::tryParseCXXPackIndexingExpression(ExprResult PackIdExpression) { return E; } -/// ParseCXXIdExpression - Handle id-expression. -/// -/// id-expression: -/// unqualified-id -/// qualified-id -/// -/// qualified-id: -/// '::'[opt] nested-name-specifier 'template'[opt] unqualified-id -/// '::' identifier -/// '::' operator-function-id -/// '::' template-id -/// -/// NOTE: The standard specifies that, for qualified-id, the parser does not -/// expect: -/// -/// '::' conversion-function-id -/// '::' '~' class-name -/// -/// This may cause a slight inconsistency on diagnostics: -/// -/// class C {}; -/// namespace A {} -/// void f() { -/// :: A :: ~ C(); // Some Sema error about using destructor with a -/// // namespace. -/// :: ~ C(); // Some Parser error like 'unexpected ~'. 
-/// } -/// -/// We simplify the parser a bit and make it work like: -/// -/// qualified-id: -/// '::'[opt] nested-name-specifier 'template'[opt] unqualified-id -/// '::' unqualified-id -/// -/// That way Sema can handle and report similar errors for namespaces and the -/// global scope. -/// -/// The isAddressOfOperand parameter indicates that this id-expression is a -/// direct operand of the address-of operator. This is, besides member contexts, -/// the only place where a qualified-id naming a non-static class member may -/// appear. -/// ExprResult Parser::ParseCXXIdExpression(bool isAddressOfOperand) { // qualified-id: // '::'[opt] nested-name-specifier 'template'[opt] unqualified-id @@ -773,51 +679,6 @@ ExprResult Parser::ParseCXXIdExpression(bool isAddressOfOperand) { return Result; } -/// ParseLambdaExpression - Parse a C++11 lambda expression. -/// -/// lambda-expression: -/// lambda-introducer lambda-declarator compound-statement -/// lambda-introducer '<' template-parameter-list '>' -/// requires-clause[opt] lambda-declarator compound-statement -/// -/// lambda-introducer: -/// '[' lambda-capture[opt] ']' -/// -/// lambda-capture: -/// capture-default -/// capture-list -/// capture-default ',' capture-list -/// -/// capture-default: -/// '&' -/// '=' -/// -/// capture-list: -/// capture -/// capture-list ',' capture -/// -/// capture: -/// simple-capture -/// init-capture [C++1y] -/// -/// simple-capture: -/// identifier -/// '&' identifier -/// 'this' -/// -/// init-capture: [C++1y] -/// identifier initializer -/// '&' identifier initializer -/// -/// lambda-declarator: -/// lambda-specifiers [C++23] -/// '(' parameter-declaration-clause ')' lambda-specifiers -/// requires-clause[opt] -/// -/// lambda-specifiers: -/// decl-specifier-seq[opt] noexcept-specifier[opt] -/// attribute-specifier-seq[opt] trailing-return-type[opt] -/// ExprResult Parser::ParseLambdaExpression() { // Parse lambda-introducer. 
LambdaIntroducer Intro; @@ -831,10 +692,6 @@ ExprResult Parser::ParseLambdaExpression() { return ParseLambdaExpressionAfterIntroducer(Intro); } -/// Use lookahead and potentially tentative parsing to determine if we are -/// looking at a C++11 lambda expression, and parse it if we are. -/// -/// If we are not looking at a lambda expression, returns ExprError(). ExprResult Parser::TryParseLambdaExpression() { assert(getLangOpts().CPlusPlus && Tok.is(tok::l_square) && "Not at the start of a possible lambda expression."); @@ -899,15 +756,6 @@ ExprResult Parser::TryParseLambdaExpression() { return ParseLambdaExpressionAfterIntroducer(Intro); } -/// Parse a lambda introducer. -/// \param Intro A LambdaIntroducer filled in with information about the -/// contents of the lambda-introducer. -/// \param Tentative If non-null, we are disambiguating between a -/// lambda-introducer and some other construct. In this mode, we do not -/// produce any diagnostics or take any other irreversible action unless -/// we're sure that this is a lambda-expression. -/// \return \c true if parsing (or disambiguation) failed with a diagnostic and -/// the caller should bail out / recover. bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro, LambdaIntroducerTentativeParse *Tentative) { if (Tentative) @@ -1353,8 +1201,6 @@ static void DiagnoseStaticSpecifierRestrictions(Parser &P, } } -/// ParseLambdaExpressionAfterIntroducer - Parse the rest of a lambda -/// expression. ExprResult Parser::ParseLambdaExpressionAfterIntroducer( LambdaIntroducer &Intro) { SourceLocation LambdaBeginLoc = Intro.Range.getBegin(); @@ -1648,17 +1494,6 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( return ExprError(); } -/// ParseCXXCasts - This handles the various ways to cast expressions to another -/// type. 
-/// -/// postfix-expression: [C++ 5.2p1] -/// 'dynamic_cast' '<' type-name '>' '(' expression ')' -/// 'static_cast' '<' type-name '>' '(' expression ')' -/// 'reinterpret_cast' '<' type-name '>' '(' expression ')' -/// 'const_cast' '<' type-name '>' '(' expression ')' -/// -/// C++ for OpenCL s2.3.1 adds: -/// 'addrspace_cast' '<' type-name '>' '(' expression ')' ExprResult Parser::ParseCXXCasts() { tok::TokenKind Kind = Tok.getKind(); const char *CastName = nullptr; // For error messages @@ -1721,12 +1556,6 @@ ExprResult Parser::ParseCXXCasts() { return Result; } -/// ParseCXXTypeid - This handles the C++ typeid expression. -/// -/// postfix-expression: [C++ 5.2p1] -/// 'typeid' '(' expression ')' -/// 'typeid' '(' type-id ')' -/// ExprResult Parser::ParseCXXTypeid() { assert(Tok.is(tok::kw_typeid) && "Not 'typeid'!"); @@ -1789,11 +1618,6 @@ ExprResult Parser::ParseCXXTypeid() { return Result; } -/// ParseCXXUuidof - This handles the Microsoft C++ __uuidof expression. -/// -/// '__uuidof' '(' expression ')' -/// '__uuidof' '(' type-id ')' -/// ExprResult Parser::ParseCXXUuidof() { assert(Tok.is(tok::kw___uuidof) && "Not '__uuidof'!"); @@ -1838,37 +1662,6 @@ ExprResult Parser::ParseCXXUuidof() { return Result; } -/// Parse a C++ pseudo-destructor expression after the base, -/// . or -> operator, and nested-name-specifier have already been -/// parsed. We're handling this fragment of the grammar: -/// -/// postfix-expression: [C++2a expr.post] -/// postfix-expression . template[opt] id-expression -/// postfix-expression -> template[opt] id-expression -/// -/// id-expression: -/// qualified-id -/// unqualified-id -/// -/// qualified-id: -/// nested-name-specifier template[opt] unqualified-id -/// -/// nested-name-specifier: -/// type-name :: -/// decltype-specifier :: FIXME: not implemented, but probably only -/// allowed in C++ grammar by accident -/// nested-name-specifier identifier :: -/// nested-name-specifier template[opt] simple-template-id :: -/// [...] 
-/// -/// unqualified-id: -/// ~ type-name -/// ~ decltype-specifier -/// [...] -/// -/// ... where the all but the last component of the nested-name-specifier -/// has already been parsed, and the base expression is not of a non-dependent -/// class type. ExprResult Parser::ParseCXXPseudoDestructor(Expr *Base, SourceLocation OpLoc, tok::TokenKind OpKind, @@ -1950,20 +1743,11 @@ Parser::ParseCXXPseudoDestructor(Expr *Base, SourceLocation OpLoc, SecondTypeName); } -/// ParseCXXBoolLiteral - This handles the C++ Boolean literals. -/// -/// boolean-literal: [C++ 2.13.5] -/// 'true' -/// 'false' ExprResult Parser::ParseCXXBoolLiteral() { tok::TokenKind Kind = Tok.getKind(); return Actions.ActOnCXXBoolLiteral(ConsumeToken(), Kind); } -/// ParseThrowExpression - This handles the C++ throw expression. -/// -/// throw-expression: [C++ 15] -/// 'throw' assignment-expression[opt] ExprResult Parser::ParseThrowExpression() { assert(Tok.is(tok::kw_throw) && "Not throw!"); SourceLocation ThrowLoc = ConsumeToken(); // Eat the throw token. @@ -1987,10 +1771,6 @@ ExprResult Parser::ParseThrowExpression() { } } -/// Parse the C++ Coroutines co_yield expression. -/// -/// co_yield-expression: -/// 'co_yield' assignment-expression[opt] ExprResult Parser::ParseCoyieldExpression() { assert(Tok.is(tok::kw_co_yield) && "Not co_yield!"); @@ -2002,30 +1782,12 @@ ExprResult Parser::ParseCoyieldExpression() { return Expr; } -/// ParseCXXThis - This handles the C++ 'this' pointer. -/// -/// C++ 9.3.2: In the body of a non-static member function, the keyword this is -/// a non-lvalue expression whose value is the address of the object for which -/// the function is called. ExprResult Parser::ParseCXXThis() { assert(Tok.is(tok::kw_this) && "Not 'this'!"); SourceLocation ThisLoc = ConsumeToken(); return Actions.ActOnCXXThis(ThisLoc); } -/// ParseCXXTypeConstructExpression - Parse construction of a specified type. 
-/// Can be interpreted either as function-style casting ("int(x)") -/// or class type construction ("ClassType(x,y,z)") -/// or creation of a value-initialized type ("int()"). -/// See [C++ 5.2.3]. -/// -/// postfix-expression: [C++ 5.2p1] -/// simple-type-specifier '(' expression-list[opt] ')' -/// [C++0x] simple-type-specifier braced-init-list -/// typename-specifier '(' expression-list[opt] ')' -/// [C++0x] typename-specifier braced-init-list -/// -/// In C++1z onwards, the type specifier can also be a template-name. ExprResult Parser::ParseCXXTypeConstructExpression(const DeclSpec &DS) { Declarator DeclaratorInfo(DS, ParsedAttributesView::none(), @@ -2111,37 +1873,6 @@ Parser::ParseAliasDeclarationInInitStatement(DeclaratorContext Context, return DG; } -/// ParseCXXCondition - if/switch/while condition expression. -/// -/// condition: -/// expression -/// type-specifier-seq declarator '=' assignment-expression -/// [C++11] type-specifier-seq declarator '=' initializer-clause -/// [C++11] type-specifier-seq declarator braced-init-list -/// [Clang] type-specifier-seq ref-qualifier[opt] '[' identifier-list ']' -/// brace-or-equal-initializer -/// [GNU] type-specifier-seq declarator simple-asm-expr[opt] attributes[opt] -/// '=' assignment-expression -/// -/// In C++1z, a condition may in some contexts be preceded by an -/// optional init-statement. This function will parse that too. -/// -/// \param InitStmt If non-null, an init-statement is permitted, and if present -/// will be parsed and stored here. -/// -/// \param Loc The location of the start of the statement that requires this -/// condition, e.g., the "for" in a for loop. -/// -/// \param MissingOK Whether an empty condition is acceptable here. Otherwise -/// it is considered an error to be recovered from. -/// -/// \param FRI If non-null, a for range declaration is permitted, and if -/// present will be parsed and stored here, and a null result will be returned. 
-/// -/// \param EnterForConditionScope If true, enter a continue/break scope at the -/// appropriate moment for a 'for' loop. -/// -/// \returns The parsed condition. Sema::ConditionResult Parser::ParseCXXCondition(StmtResult *InitStmt, SourceLocation Loc, Sema::ConditionKind CK, bool MissingOK, @@ -2330,32 +2061,6 @@ Parser::ParseCXXCondition(StmtResult *InitStmt, SourceLocation Loc, return Actions.ActOnConditionVariable(DeclOut, Loc, CK); } -/// ParseCXXSimpleTypeSpecifier - [C++ 7.1.5.2] Simple type specifiers. -/// This should only be called when the current token is known to be part of -/// simple-type-specifier. -/// -/// simple-type-specifier: -/// '::'[opt] nested-name-specifier[opt] type-name -/// '::'[opt] nested-name-specifier 'template' simple-template-id [TODO] -/// char -/// wchar_t -/// bool -/// short -/// int -/// long -/// signed -/// unsigned -/// float -/// double -/// void -/// [GNU] typeof-specifier -/// [C++0x] auto [TODO] -/// -/// type-name: -/// class-name -/// enum-name -/// typedef-name -/// void Parser::ParseCXXSimpleTypeSpecifier(DeclSpec &DS) { DS.SetRangeStart(Tok.getLocation()); const char *PrevSpec; @@ -2507,17 +2212,6 @@ void Parser::ParseCXXSimpleTypeSpecifier(DeclSpec &DS) { DS.Finish(Actions, Policy); } -/// ParseCXXTypeSpecifierSeq - Parse a C++ type-specifier-seq (C++ -/// [dcl.name]), which is a non-empty sequence of type-specifiers, -/// e.g., "const short int". Note that the DeclSpec is *not* finished -/// by parsing the type-specifier-seq, because these sequences are -/// typically followed by some form of declarator. Returns true and -/// emits diagnostics if this is not a type-specifier-seq, false -/// otherwise. 
-/// -/// type-specifier-seq: [C++ 8.1] -/// type-specifier type-specifier-seq[opt] -/// bool Parser::ParseCXXTypeSpecifierSeq(DeclSpec &DS, DeclaratorContext Context) { ParseSpecifierQualifierList(DS, AS_none, getDeclSpecContextFromDeclaratorContext(Context)); @@ -2525,41 +2219,6 @@ bool Parser::ParseCXXTypeSpecifierSeq(DeclSpec &DS, DeclaratorContext Context) { return false; } -/// Finish parsing a C++ unqualified-id that is a template-id of -/// some form. -/// -/// This routine is invoked when a '<' is encountered after an identifier or -/// operator-function-id is parsed by \c ParseUnqualifiedId() to determine -/// whether the unqualified-id is actually a template-id. This routine will -/// then parse the template arguments and form the appropriate template-id to -/// return to the caller. -/// -/// \param SS the nested-name-specifier that precedes this template-id, if -/// we're actually parsing a qualified-id. -/// -/// \param ObjectType if this unqualified-id occurs within a member access -/// expression, the type of the base object whose member is being accessed. -/// -/// \param ObjectHadErrors this unqualified-id occurs within a member access -/// expression, indicates whether the original subexpressions had any errors. -/// -/// \param Name for constructor and destructor names, this is the actual -/// identifier that may be a template-name. -/// -/// \param NameLoc the location of the class-name in a constructor or -/// destructor. -/// -/// \param EnteringContext whether we're entering the scope of the -/// nested-name-specifier. -/// -/// \param Id as input, describes the template-name or operator-function-id -/// that precedes the '<'. If template arguments were parsed successfully, -/// will be updated with the template-id. -/// -/// \param AssumeTemplateId When true, this routine will assume that the name -/// refers to a template without performing name lookup to verify. -/// -/// \returns true if a parse error occurred, false otherwise. 
bool Parser::ParseUnqualifiedIdTemplateId( CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors, SourceLocation TemplateKWLoc, IdentifierInfo *Name, SourceLocation NameLoc, @@ -2715,46 +2374,6 @@ bool Parser::ParseUnqualifiedIdTemplateId( return false; } -/// Parse an operator-function-id or conversion-function-id as part -/// of a C++ unqualified-id. -/// -/// This routine is responsible only for parsing the operator-function-id or -/// conversion-function-id; it does not handle template arguments in any way. -/// -/// \code -/// operator-function-id: [C++ 13.5] -/// 'operator' operator -/// -/// operator: one of -/// new delete new[] delete[] -/// + - * / % ^ & | ~ -/// ! = < > += -= *= /= %= -/// ^= &= |= << >> >>= <<= == != -/// <= >= && || ++ -- , ->* -> -/// () [] <=> -/// -/// conversion-function-id: [C++ 12.3.2] -/// operator conversion-type-id -/// -/// conversion-type-id: -/// type-specifier-seq conversion-declarator[opt] -/// -/// conversion-declarator: -/// ptr-operator conversion-declarator[opt] -/// \endcode -/// -/// \param SS The nested-name-specifier that preceded this unqualified-id. If -/// non-empty, then we are parsing the unqualified-id of a qualified-id. -/// -/// \param EnteringContext whether we are entering the scope of the -/// nested-name-specifier. -/// -/// \param ObjectType if this unqualified-id occurs within a member access -/// expression, the type of the base object whose member is being accessed. -/// -/// \param Result on a successful parse, contains the parsed unqualified-id. -/// -/// \returns true if parsing fails, false otherwise. bool Parser::ParseUnqualifiedIdOperator(CXXScopeSpec &SS, bool EnteringContext, ParsedType ObjectType, UnqualifiedId &Result) { @@ -2957,42 +2576,6 @@ bool Parser::ParseUnqualifiedIdOperator(CXXScopeSpec &SS, bool EnteringContext, return false; } -/// Parse a C++ unqualified-id (or a C identifier), which describes the -/// name of an entity. 
-/// -/// \code -/// unqualified-id: [C++ expr.prim.general] -/// identifier -/// operator-function-id -/// conversion-function-id -/// [C++0x] literal-operator-id [TODO] -/// ~ class-name -/// template-id -/// -/// \endcode -/// -/// \param SS The nested-name-specifier that preceded this unqualified-id. If -/// non-empty, then we are parsing the unqualified-id of a qualified-id. -/// -/// \param ObjectType if this unqualified-id occurs within a member access -/// expression, the type of the base object whose member is being accessed. -/// -/// \param ObjectHadErrors if this unqualified-id occurs within a member access -/// expression, indicates whether the original subexpressions had any errors. -/// When true, diagnostics for missing 'template' keyword will be supressed. -/// -/// \param EnteringContext whether we are entering the scope of the -/// nested-name-specifier. -/// -/// \param AllowDestructorName whether we allow parsing of a destructor name. -/// -/// \param AllowConstructorName whether we allow parsing a constructor name. -/// -/// \param AllowDeductionGuide whether we allow parsing a deduction guide name. -/// -/// \param Result on a successful parse, contains the parsed unqualified-id. -/// -/// \returns true if parsing fails, false otherwise. bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors, bool EnteringContext, bool AllowDestructorName, @@ -3281,34 +2864,6 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, } } -/// ParseCXXNewExpression - Parse a C++ new-expression. New is used to allocate -/// memory in a typesafe manner and call constructors. -/// -/// This method is called to parse the new expression after the optional :: has -/// been already parsed. If the :: was present, "UseGlobal" is true and "Start" -/// is its location. Otherwise, "Start" is the location of the 'new' token. 
-/// -/// new-expression: -/// '::'[opt] 'new' new-placement[opt] new-type-id -/// new-initializer[opt] -/// '::'[opt] 'new' new-placement[opt] '(' type-id ')' -/// new-initializer[opt] -/// -/// new-placement: -/// '(' expression-list ')' -/// -/// new-type-id: -/// type-specifier-seq new-declarator[opt] -/// [GNU] attributes type-specifier-seq new-declarator[opt] -/// -/// new-declarator: -/// ptr-operator new-declarator[opt] -/// direct-new-declarator -/// -/// new-initializer: -/// '(' expression-list[opt] ')' -/// [C++0x] braced-init-list -/// ExprResult Parser::ParseCXXNewExpression(bool UseGlobal, SourceLocation Start) { assert(Tok.is(tok::kw_new) && "expected 'new' token"); @@ -3441,13 +2996,6 @@ Parser::ParseCXXNewExpression(bool UseGlobal, SourceLocation Start) { TypeIdParens, DeclaratorInfo, Initializer.get()); } -/// ParseDirectNewDeclarator - Parses a direct-new-declarator. Intended to be -/// passed to ParseDeclaratorInternal. -/// -/// direct-new-declarator: -/// '[' expression[opt] ']' -/// direct-new-declarator '[' constant-expression ']' -/// void Parser::ParseDirectNewDeclarator(Declarator &D) { // Parse the array dimensions. bool First = true; @@ -3486,16 +3034,6 @@ void Parser::ParseDirectNewDeclarator(Declarator &D) { } } -/// ParseExpressionListOrTypeId - Parse either an expression-list or a type-id. -/// This ambiguity appears in the syntax of the C++ new operator. -/// -/// new-expression: -/// '::'[opt] 'new' new-placement[opt] '(' type-id ')' -/// new-initializer[opt] -/// -/// new-placement: -/// '(' expression-list ')' -/// bool Parser::ParseExpressionListOrTypeId( SmallVectorImpl &PlacementArgs, Declarator &D) { @@ -3511,17 +3049,6 @@ bool Parser::ParseExpressionListOrTypeId( return ParseExpressionList(PlacementArgs); } -/// ParseCXXDeleteExpression - Parse a C++ delete-expression. Delete is used -/// to free memory allocated by new. 
-/// -/// This method is called to parse the 'delete' expression after the optional -/// '::' has been already parsed. If the '::' was present, "UseGlobal" is true -/// and "Start" is its location. Otherwise, "Start" is the location of the -/// 'delete' token. -/// -/// delete-expression: -/// '::'[opt] 'delete' cast-expression -/// '::'[opt] 'delete' '[' ']' cast-expression ExprResult Parser::ParseCXXDeleteExpression(bool UseGlobal, SourceLocation Start) { assert(Tok.is(tok::kw_delete) && "Expected 'delete' keyword"); @@ -3605,30 +3132,6 @@ Parser::ParseCXXDeleteExpression(bool UseGlobal, SourceLocation Start) { return Actions.ActOnCXXDelete(Start, UseGlobal, ArrayDelete, Operand.get()); } -/// ParseRequiresExpression - Parse a C++2a requires-expression. -/// C++2a [expr.prim.req]p1 -/// A requires-expression provides a concise way to express requirements on -/// template arguments. A requirement is one that can be checked by name -/// lookup (6.4) or by checking properties of types and expressions. -/// -/// requires-expression: -/// 'requires' requirement-parameter-list[opt] requirement-body -/// -/// requirement-parameter-list: -/// '(' parameter-declaration-clause[opt] ')' -/// -/// requirement-body: -/// '{' requirement-seq '}' -/// -/// requirement-seq: -/// requirement -/// requirement-seq requirement -/// -/// requirement: -/// simple-requirement -/// type-requirement -/// compound-requirement -/// nested-requirement ExprResult Parser::ParseRequiresExpression() { assert(Tok.is(tok::kw_requires) && "Expected 'requires' keyword"); SourceLocation RequiresKWLoc = ConsumeToken(); // Consume 'requires' @@ -3955,17 +3458,6 @@ static ExpressionTrait ExpressionTraitFromTokKind(tok::TokenKind kind) { } } -/// Parse the built-in type-trait pseudo-functions that allow -/// implementation of the TR1/C++11 type traits templates. 
-/// -/// primary-expression: -/// unary-type-trait '(' type-id ')' -/// binary-type-trait '(' type-id ',' type-id ')' -/// type-trait '(' type-id-seq ')' -/// -/// type-id-seq: -/// type-id ...[opt] type-id-seq[opt] -/// ExprResult Parser::ParseTypeTrait() { tok::TokenKind Kind = Tok.getKind(); @@ -4008,13 +3500,6 @@ ExprResult Parser::ParseTypeTrait() { return Actions.ActOnTypeTrait(TypeTraitFromTokKind(Kind), Loc, Args, EndLoc); } -/// ParseArrayTypeTrait - Parse the built-in array type-trait -/// pseudo-functions. -/// -/// primary-expression: -/// [Embarcadero] '__array_rank' '(' type-id ')' -/// [Embarcadero] '__array_extent' '(' type-id ',' expression ')' -/// ExprResult Parser::ParseArrayTypeTrait() { ArrayTypeTrait ATT = ArrayTypeTraitFromTokKind(Tok.getKind()); SourceLocation Loc = ConsumeToken(); @@ -4056,12 +3541,6 @@ ExprResult Parser::ParseArrayTypeTrait() { llvm_unreachable("Invalid ArrayTypeTrait!"); } -/// ParseExpressionTrait - Parse built-in expression-trait -/// pseudo-functions like __is_lvalue_expr( xxx ). -/// -/// primary-expression: -/// [Embarcadero] expression-trait '(' expression ')' -/// ExprResult Parser::ParseExpressionTrait() { ExpressionTrait ET = ExpressionTraitFromTokKind(Tok.getKind()); SourceLocation Loc = ConsumeToken(); @@ -4078,10 +3557,6 @@ ExprResult Parser::ParseExpressionTrait() { T.getCloseLocation()); } - -/// ParseCXXAmbiguousParenExpression - We have parsed the left paren of a -/// parenthesized ambiguous type-id. This uses tentative parsing to disambiguate -/// based on the context past the parens. ExprResult Parser::ParseCXXAmbiguousParenExpression(ParenParseOption &ExprType, ParsedType &CastTy, @@ -4235,7 +3710,6 @@ Parser::ParseCXXAmbiguousParenExpression(ParenParseOption &ExprType, return Result; } -/// Parse a __builtin_bit_cast(T, E). 
ExprResult Parser::ParseBuiltinBitCast() { SourceLocation KWLoc = ConsumeToken(); diff --git a/clang/lib/Parse/ParseInit.cpp b/clang/lib/Parse/ParseInit.cpp index a4ac692f429f7..af32b5d3b3ca0 100644 --- a/clang/lib/Parse/ParseInit.cpp +++ b/clang/lib/Parse/ParseInit.cpp @@ -24,10 +24,6 @@ #include "llvm/ADT/SmallString.h" using namespace clang; - -/// MayBeDesignationStart - Return true if the current token might be the start -/// of a designator. If we can tell it is impossible that it is a designator, -/// return false. bool Parser::MayBeDesignationStart() { switch (Tok.getKind()) { default: @@ -119,48 +115,6 @@ static void CheckArrayDesignatorSyntax(Parser &P, SourceLocation Loc, P.Diag(Loc, diag::err_expected_equal_designator); } -/// ParseInitializerWithPotentialDesignator - Parse the 'initializer' production -/// checking to see if the token stream starts with a designator. -/// -/// C99: -/// -/// designation: -/// designator-list '=' -/// [GNU] array-designator -/// [GNU] identifier ':' -/// -/// designator-list: -/// designator -/// designator-list designator -/// -/// designator: -/// array-designator -/// '.' identifier -/// -/// array-designator: -/// '[' constant-expression ']' -/// [GNU] '[' constant-expression '...' constant-expression ']' -/// -/// C++20: -/// -/// designated-initializer-list: -/// designated-initializer-clause -/// designated-initializer-list ',' designated-initializer-clause -/// -/// designated-initializer-clause: -/// designator brace-or-equal-initializer -/// -/// designator: -/// '.' identifier -/// -/// We allow the C99 syntax extensions in C++20, but do not allow the C++20 -/// extension (a braced-init-list after the designator with no '=') in C99. -/// -/// NOTE: [OBC] allows '[ objc-receiver objc-message-args ]' as an -/// initializer (because it is an expression). We need to consider this case -/// when parsing array designators. -/// -/// \p CodeCompleteCB is called with Designation parsed so far. 
ExprResult Parser::ParseInitializerWithPotentialDesignator( DesignatorCompletionInfo DesignatorCompletion) { // If this is the old-style GNU extension: @@ -456,18 +410,6 @@ ExprResult Parser::createEmbedExpr() { return Res; } -/// ParseBraceInitializer - Called when parsing an initializer that has a -/// leading open brace. -/// -/// initializer: [C99 6.7.8] -/// '{' initializer-list '}' -/// '{' initializer-list ',' '}' -/// [C23] '{' '}' -/// -/// initializer-list: -/// designation[opt] initializer ...[opt] -/// initializer-list ',' designation[opt] initializer ...[opt] -/// ExprResult Parser::ParseBraceInitializer() { InMessageExpressionRAIIObject InMessage(*this, false); @@ -578,9 +520,6 @@ ExprResult Parser::ParseBraceInitializer() { return ExprError(); // an error occurred. } - -// Return true if a comma (or closing brace) is necessary after the -// __if_exists/if_not_exists statement. bool Parser::ParseMicrosoftIfExistsBraceInitializer(ExprVector &InitExprs, bool &InitExprsOk) { bool trailingComma = false; diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp index 6496b4fba54f2..ed1a301686bc6 100644 --- a/clang/lib/Parse/ParseObjc.cpp +++ b/clang/lib/Parse/ParseObjc.cpp @@ -29,7 +29,6 @@ using namespace clang; -/// Skips attributes after an Objective-C @ directive. Emits a diagnostic. 
void Parser::MaybeSkipAttributes(tok::ObjCKeywordKind Kind) { ParsedAttributes attrs(AttrFactory); if (Tok.is(tok::kw___attribute)) { @@ -42,14 +41,6 @@ void Parser::MaybeSkipAttributes(tok::ObjCKeywordKind Kind) { } } -/// ParseObjCAtDirectives - Handle parts of the external-declaration production: -/// external-declaration: [C99 6.9] -/// [OBJC] objc-class-definition -/// [OBJC] objc-class-declaration -/// [OBJC] objc-alias-declaration -/// [OBJC] objc-protocol-definition -/// [OBJC] objc-method-definition -/// [OBJC] '@' 'end' Parser::DeclGroupPtrTy Parser::ParseObjCAtDirectives(ParsedAttributes &DeclAttrs, ParsedAttributes &DeclSpecAttrs) { @@ -115,7 +106,6 @@ Parser::ParseObjCAtDirectives(ParsedAttributes &DeclAttrs, return Actions.ConvertDeclToDeclGroup(SingleDecl); } -/// Class to handle popping type parameters when leaving the scope. class Parser::ObjCTypeParamListScope { Sema &Actions; Scope *S; @@ -141,13 +131,6 @@ class Parser::ObjCTypeParamListScope { } }; -/// -/// objc-class-declaration: -/// '@' 'class' objc-class-forward-decl (',' objc-class-forward-decl)* ';' -/// -/// objc-class-forward-decl: -/// identifier objc-type-parameter-list[opt] -/// Parser::DeclGroupPtrTy Parser::ParseObjCAtClassDeclaration(SourceLocation atLoc) { ConsumeToken(); // the identifier "class" @@ -206,35 +189,6 @@ void Parser::CheckNestedObjCContexts(SourceLocation AtLoc) Diag(Decl->getBeginLoc(), diag::note_objc_container_start) << (int)ock; } -/// -/// objc-interface: -/// objc-class-interface-attributes[opt] objc-class-interface -/// objc-category-interface -/// -/// objc-class-interface: -/// '@' 'interface' identifier objc-type-parameter-list[opt] -/// objc-superclass[opt] objc-protocol-refs[opt] -/// objc-class-instance-variables[opt] -/// objc-interface-decl-list -/// @end -/// -/// objc-category-interface: -/// '@' 'interface' identifier objc-type-parameter-list[opt] -/// '(' identifier[opt] ')' objc-protocol-refs[opt] -/// objc-interface-decl-list -/// @end -/// -/// 
objc-superclass: -/// ':' identifier objc-type-arguments[opt] -/// -/// objc-class-interface-attributes: -/// __attribute__((visibility("default"))) -/// __attribute__((visibility("hidden"))) -/// __attribute__((deprecated)) -/// __attribute__((unavailable)) -/// __attribute__((objc_exception)) - used by NSException on 64-bit -/// __attribute__((objc_root_class)) -/// Decl *Parser::ParseObjCAtInterfaceDeclaration(SourceLocation AtLoc, ParsedAttributes &attrs) { assert(Tok.isObjCAtKeyword(tok::objc_interface) && @@ -434,30 +388,6 @@ static void addContextSensitiveTypeNullability(Parser &P, } } -/// Parse an Objective-C type parameter list, if present, or capture -/// the locations of the protocol identifiers for a list of protocol -/// references. -/// -/// objc-type-parameter-list: -/// '<' objc-type-parameter (',' objc-type-parameter)* '>' -/// -/// objc-type-parameter: -/// objc-type-parameter-variance? identifier objc-type-parameter-bound[opt] -/// -/// objc-type-parameter-bound: -/// ':' type-name -/// -/// objc-type-parameter-variance: -/// '__covariant' -/// '__contravariant' -/// -/// \param lAngleLoc The location of the starting '<'. -/// -/// \param protocolIdents Will capture the list of identifiers, if the -/// angle brackets contain a list of protocol references rather than a -/// type parameter list. -/// -/// \param rAngleLoc The location of the ending '>'. ObjCTypeParamList *Parser::parseObjCTypeParamListOrProtocolRefs( ObjCTypeParamListScope &Scope, SourceLocation &lAngleLoc, SmallVectorImpl &protocolIdents, SourceLocation &rAngleLoc, @@ -605,7 +535,6 @@ ObjCTypeParamList *Parser::parseObjCTypeParamListOrProtocolRefs( return invalid ? nullptr : list; } -/// Parse an objc-type-parameter-list. 
ObjCTypeParamList *Parser::parseObjCTypeParamList() { SourceLocation lAngleLoc; SmallVector protocolIdents; @@ -630,18 +559,6 @@ static bool isTopLevelObjCKeyword(tok::ObjCKeywordKind DirectiveKind) { } } -/// objc-interface-decl-list: -/// empty -/// objc-interface-decl-list objc-property-decl [OBJC2] -/// objc-interface-decl-list objc-method-requirement [OBJC2] -/// objc-interface-decl-list objc-method-proto ';' -/// objc-interface-decl-list declaration -/// objc-interface-decl-list ';' -/// -/// objc-method-requirement: [OBJC2] -/// @required -/// @optional -/// void Parser::ParseObjCInterfaceDeclList(tok::ObjCKeywordKind contextKey, Decl *CDecl) { SmallVector allMethods; @@ -870,32 +787,6 @@ static void diagnoseRedundantPropertyNullability(Parser &P, << SourceRange(DS.getNullabilityLoc()); } -/// Parse property attribute declarations. -/// -/// property-attr-decl: '(' property-attrlist ')' -/// property-attrlist: -/// property-attribute -/// property-attrlist ',' property-attribute -/// property-attribute: -/// getter '=' identifier -/// setter '=' identifier ':' -/// direct -/// readonly -/// readwrite -/// assign -/// retain -/// copy -/// nonatomic -/// atomic -/// strong -/// weak -/// unsafe_unretained -/// nonnull -/// nullable -/// null_unspecified -/// null_resettable -/// class -/// void Parser::ParseObjCPropertyAttribute(ObjCDeclSpec &DS) { assert(Tok.getKind() == tok::l_paren); BalancedDelimiterTracker T(*this, tok::l_paren); @@ -1033,16 +924,6 @@ void Parser::ParseObjCPropertyAttribute(ObjCDeclSpec &DS) { T.consumeClose(); } -/// objc-method-proto: -/// objc-instance-method objc-method-decl objc-method-attributes[opt] -/// objc-class-method objc-method-decl objc-method-attributes[opt] -/// -/// objc-instance-method: '-' -/// objc-class-method: '+' -/// -/// objc-method-attributes: [OBJC2] -/// __attribute__((deprecated)) -/// Decl *Parser::ParseObjCMethodPrototype(tok::ObjCKeywordKind MethodImplKind, bool MethodDefinition) { 
assert(Tok.isOneOf(tok::minus, tok::plus) && "expected +/-"); @@ -1056,19 +937,9 @@ Decl *Parser::ParseObjCMethodPrototype(tok::ObjCKeywordKind MethodImplKind, return MDecl; } -/// objc-selector: -/// identifier -/// one of -/// enum struct union if else while do for switch case default -/// break continue return goto asm sizeof typeof __alignof -/// unsigned long const short volatile signed restrict _Complex -/// in out inout bycopy byref oneway int char float double void _Bool -/// IdentifierInfo *Parser::ParseObjCSelectorPiece(SourceLocation &SelectorLoc) { switch (Tok.getKind()) { - default: - return nullptr; case tok::colon: // Empty selector piece uses the location of the ':'. SelectorLoc = Tok.getLocation(); @@ -1094,84 +965,18 @@ IdentifierInfo *Parser::ParseObjCSelectorPiece(SourceLocation &SelectorLoc) { return nullptr; } - case tok::identifier: - case tok::kw_asm: - case tok::kw_auto: - case tok::kw_bool: - case tok::kw_break: - case tok::kw_case: - case tok::kw_catch: - case tok::kw_char: - case tok::kw_class: - case tok::kw_const: - case tok::kw_const_cast: - case tok::kw_continue: - case tok::kw_default: - case tok::kw_delete: - case tok::kw_do: - case tok::kw_double: - case tok::kw_dynamic_cast: - case tok::kw_else: - case tok::kw_enum: - case tok::kw_explicit: - case tok::kw_export: - case tok::kw_extern: - case tok::kw_false: - case tok::kw_float: - case tok::kw_for: - case tok::kw_friend: - case tok::kw_goto: - case tok::kw_if: - case tok::kw_inline: - case tok::kw_int: - case tok::kw_long: - case tok::kw_mutable: - case tok::kw_namespace: - case tok::kw_new: - case tok::kw_operator: - case tok::kw_private: - case tok::kw_protected: - case tok::kw_public: - case tok::kw_register: - case tok::kw_reinterpret_cast: - case tok::kw_restrict: - case tok::kw_return: - case tok::kw_short: - case tok::kw_signed: - case tok::kw_sizeof: - case tok::kw_static: - case tok::kw_static_cast: - case tok::kw_struct: - case tok::kw_switch: - case tok::kw_template: - 
case tok::kw_this: - case tok::kw_throw: - case tok::kw_true: - case tok::kw_try: - case tok::kw_typedef: - case tok::kw_typeid: - case tok::kw_typename: - case tok::kw_typeof: - case tok::kw_union: - case tok::kw_unsigned: - case tok::kw_using: - case tok::kw_virtual: - case tok::kw_void: - case tok::kw_volatile: - case tok::kw_wchar_t: - case tok::kw_while: - case tok::kw__Bool: - case tok::kw__Complex: - case tok::kw___alignof: - case tok::kw___auto_type: + case tok::kw___attribute: + return nullptr; + + default: IdentifierInfo *II = Tok.getIdentifierInfo(); + if (!II) + return nullptr; SelectorLoc = ConsumeToken(); return II; } } -/// objc-for-collection-in: 'in' -/// bool Parser::isTokIdentifier_in() const { // FIXME: May have to do additional look-ahead to only allow for // valid tokens following an 'in'; such as an identifier, unary operators, @@ -1181,25 +986,6 @@ bool Parser::isTokIdentifier_in() const { ObjCTypeQuals[llvm::to_underlying(ObjCTypeQual::in)]); } -/// ParseObjCTypeQualifierList - This routine parses the objective-c's type -/// qualifier list and builds their bitmask representation in the input -/// argument. 
-/// -/// objc-type-qualifiers: -/// objc-type-qualifier -/// objc-type-qualifiers objc-type-qualifier -/// -/// objc-type-qualifier: -/// 'in' -/// 'out' -/// 'inout' -/// 'oneway' -/// 'bycopy' -/// 'byref' -/// 'nonnull' -/// 'nullable' -/// 'null_unspecified' -/// void Parser::ParseObjCTypeQualifierList(ObjCDeclSpec &DS, DeclaratorContext Context) { assert(Context == DeclaratorContext::ObjCParameter || @@ -1309,10 +1095,6 @@ static void takeDeclAttributes(ParsedAttributes &attrs, takeDeclAttributes(attrs, D.getTypeObject(i).getAttrs()); } -/// objc-type-name: -/// '(' objc-type-qualifiers[opt] type-name ')' -/// '(' objc-type-qualifiers[opt] ')' -/// ParsedType Parser::ParseObjCTypeName(ObjCDeclSpec &DS, DeclaratorContext context, ParsedAttributes *paramAttrs) { @@ -1379,34 +1161,6 @@ ParsedType Parser::ParseObjCTypeName(ObjCDeclSpec &DS, return Ty; } -/// objc-method-decl: -/// objc-selector -/// objc-keyword-selector objc-parmlist[opt] -/// objc-type-name objc-selector -/// objc-type-name objc-keyword-selector objc-parmlist[opt] -/// -/// objc-keyword-selector: -/// objc-keyword-decl -/// objc-keyword-selector objc-keyword-decl -/// -/// objc-keyword-decl: -/// objc-selector ':' objc-type-name objc-keyword-attributes[opt] identifier -/// objc-selector ':' objc-keyword-attributes[opt] identifier -/// ':' objc-type-name objc-keyword-attributes[opt] identifier -/// ':' objc-keyword-attributes[opt] identifier -/// -/// objc-parmlist: -/// objc-parms objc-ellipsis[opt] -/// -/// objc-parms: -/// objc-parms , parameter-declaration -/// -/// objc-ellipsis: -/// , ... 
-/// -/// objc-keyword-attributes: [OBJC2] -/// __attribute__((unused)) -/// Decl *Parser::ParseObjCMethodDecl(SourceLocation mLoc, tok::TokenKind mType, tok::ObjCKeywordKind MethodImplKind, @@ -1601,9 +1355,6 @@ Decl *Parser::ParseObjCMethodDecl(SourceLocation mLoc, return Result; } -/// objc-protocol-refs: -/// '<' identifier-list '>' -/// bool Parser:: ParseObjCProtocolReferences(SmallVectorImpl &Protocols, SmallVectorImpl &ProtocolLocs, @@ -1668,11 +1419,6 @@ TypeResult Parser::parseObjCProtocolQualifierType(SourceLocation &rAngleLoc) { return result; } -/// Parse Objective-C type arguments or protocol qualifiers. -/// -/// objc-type-arguments: -/// '<' type-name '...'[opt] (',' type-name '...'[opt])* '>' -/// void Parser::parseObjCTypeArgsOrProtocolQualifiers( ParsedType baseType, SourceLocation &typeArgsLAngleLoc, @@ -1950,27 +1696,6 @@ void Parser::HelperActionsForIvarDeclarations( ParsedAttributesView()); } -/// objc-class-instance-variables: -/// '{' objc-instance-variable-decl-list[opt] '}' -/// -/// objc-instance-variable-decl-list: -/// objc-visibility-spec -/// objc-instance-variable-decl ';' -/// ';' -/// objc-instance-variable-decl-list objc-visibility-spec -/// objc-instance-variable-decl-list objc-instance-variable-decl ';' -/// objc-instance-variable-decl-list static_assert-declaration -/// objc-instance-variable-decl-list ';' -/// -/// objc-visibility-spec: -/// @private -/// @protected -/// @public -/// @package [OBJC2] -/// -/// objc-instance-variable-decl: -/// struct-declaration -/// void Parser::ParseObjCClassInstanceVariables(ObjCContainerDecl *interfaceDecl, tok::ObjCKeywordKind visibility, SourceLocation atLoc) { @@ -2070,22 +1795,6 @@ void Parser::ParseObjCClassInstanceVariables(ObjCContainerDecl *interfaceDecl, T, AllIvarDecls, false); } -/// objc-protocol-declaration: -/// objc-protocol-definition -/// objc-protocol-forward-reference -/// -/// objc-protocol-definition: -/// \@protocol identifier -/// objc-protocol-refs[opt] -/// 
objc-interface-decl-list -/// \@end -/// -/// objc-protocol-forward-reference: -/// \@protocol identifier-list ';' -/// -/// "\@protocol identifier ;" should be resolved as "\@protocol -/// identifier-list ;": objc-interface-decl-list may not start with a -/// semicolon in the first alternative if objc-protocol-refs are omitted. Parser::DeclGroupPtrTy Parser::ParseObjCAtProtocolDeclaration(SourceLocation AtLoc, ParsedAttributes &attrs) { @@ -2171,16 +1880,6 @@ Parser::ParseObjCAtProtocolDeclaration(SourceLocation AtLoc, return Actions.ConvertDeclToDeclGroup(ProtoType); } -/// objc-implementation: -/// objc-class-implementation-prologue -/// objc-category-implementation-prologue -/// -/// objc-class-implementation-prologue: -/// @implementation identifier objc-superclass[opt] -/// objc-class-instance-variables[opt] -/// -/// objc-category-implementation-prologue: -/// @implementation identifier ( identifier ) Parser::DeclGroupPtrTy Parser::ParseObjCAtImplementationDeclaration(SourceLocation AtLoc, ParsedAttributes &Attrs) { @@ -2366,9 +2065,6 @@ void Parser::ObjCImplParsingDataRAII::finish(SourceRange AtEnd) { Finished = true; } -/// compatibility-alias-decl: -/// @compatibility_alias alias-name class-name ';' -/// Decl *Parser::ParseObjCAtAliasDeclaration(SourceLocation atLoc) { assert(Tok.isObjCAtKeyword(tok::objc_compatibility_alias) && "ParseObjCAtAliasDeclaration(): Expected @compatibility_alias"); @@ -2386,17 +2082,6 @@ Decl *Parser::ParseObjCAtAliasDeclaration(SourceLocation atLoc) { classId, classLoc); } -/// property-synthesis: -/// @synthesize property-ivar-list ';' -/// -/// property-ivar-list: -/// property-ivar -/// property-ivar-list ',' property-ivar -/// -/// property-ivar: -/// identifier -/// identifier '=' identifier -/// Decl *Parser::ParseObjCPropertySynthesize(SourceLocation atLoc) { assert(Tok.isObjCAtKeyword(tok::objc_synthesize) && "ParseObjCPropertySynthesize(): Expected '@synthesize'"); @@ -2445,13 +2130,6 @@ Decl 
*Parser::ParseObjCPropertySynthesize(SourceLocation atLoc) { return nullptr; } -/// property-dynamic: -/// @dynamic property-list -/// -/// property-list: -/// identifier -/// property-list ',' identifier -/// Decl *Parser::ParseObjCPropertyDynamic(SourceLocation atLoc) { assert(Tok.isObjCAtKeyword(tok::objc_dynamic) && "ParseObjCPropertyDynamic(): Expected '@dynamic'"); @@ -2510,9 +2188,6 @@ Decl *Parser::ParseObjCPropertyDynamic(SourceLocation atLoc) { return nullptr; } -/// objc-throw-statement: -/// throw expression[opt]; -/// StmtResult Parser::ParseObjCThrowStmt(SourceLocation atLoc) { ExprResult Res; ConsumeToken(); // consume throw @@ -2528,9 +2203,6 @@ StmtResult Parser::ParseObjCThrowStmt(SourceLocation atLoc) { return Actions.ObjC().ActOnObjCAtThrowStmt(atLoc, Res.get(), getCurScope()); } -/// objc-synchronized-statement: -/// @synchronized '(' expression ')' compound-statement -/// StmtResult Parser::ParseObjCSynchronizedStmt(SourceLocation atLoc) { ConsumeToken(); // consume synchronized @@ -2582,17 +2254,6 @@ Parser::ParseObjCSynchronizedStmt(SourceLocation atLoc) { body.get()); } -/// objc-try-catch-statement: -/// @try compound-statement objc-catch-list[opt] -/// @try compound-statement objc-catch-list[opt] @finally compound-statement -/// -/// objc-catch-list: -/// @catch ( parameter-declaration ) compound-statement -/// objc-catch-list @catch ( catch-parameter-declaration ) compound-statement -/// catch-parameter-declaration: -/// parameter-declaration -/// '...' 
[OBJC2] -/// StmtResult Parser::ParseObjCTryStmt(SourceLocation atLoc) { bool catch_or_finally_seen = false; @@ -2709,9 +2370,6 @@ StmtResult Parser::ParseObjCTryStmt(SourceLocation atLoc) { FinallyStmt.get()); } -/// objc-autoreleasepool-statement: -/// @autoreleasepool compound-statement -/// StmtResult Parser::ParseObjCAutoreleasePoolStmt(SourceLocation atLoc) { ConsumeToken(); // consume autoreleasepool @@ -2732,8 +2390,6 @@ Parser::ParseObjCAutoreleasePoolStmt(SourceLocation atLoc) { AutoreleasePoolBody.get()); } -/// StashAwayMethodOrFunctionBodyTokens - Consume the tokens and store them -/// for later parsing. void Parser::StashAwayMethodOrFunctionBodyTokens(Decl *MDecl) { if (SkipFunctionBodies && (!MDecl || Actions.canSkipFunctionBody(MDecl)) && trySkippingFunctionBody()) { @@ -2776,8 +2432,6 @@ void Parser::StashAwayMethodOrFunctionBodyTokens(Decl *MDecl) { } } -/// objc-method-def: objc-method-proto ';'[opt] '{' body '}' -/// Decl *Parser::ParseObjCMethodDefinition() { Decl *MDecl = ParseObjCMethodPrototype(); @@ -2966,28 +2620,6 @@ ExprResult Parser::ParseObjCAtExpression(SourceLocation AtLoc) { } } -/// Parse the receiver of an Objective-C++ message send. -/// -/// This routine parses the receiver of a message send in -/// Objective-C++ either as a type or as an expression. Note that this -/// routine must not be called to parse a send to 'super', since it -/// has no way to return such a result. -/// -/// \param IsExpr Whether the receiver was parsed as an expression. -/// -/// \param TypeOrExpr If the receiver was parsed as an expression (\c -/// IsExpr is true), the parsed expression. If the receiver was parsed -/// as a type (\c IsExpr is false), the parsed type. -/// -/// \returns True if an error occurred during parsing or semantic -/// analysis, in which case the arguments do not have valid -/// values. Otherwise, returns false for a successful parse. 
-/// -/// objc-receiver: [C++] -/// 'super' [not parsed here] -/// expression -/// simple-type-specifier -/// typename-specifier bool Parser::ParseObjCXXMessageReceiver(bool &IsExpr, void *&TypeOrExpr) { InMessageExpressionRAIIObject InMessage(*this, true); @@ -3057,11 +2689,6 @@ bool Parser::ParseObjCXXMessageReceiver(bool &IsExpr, void *&TypeOrExpr) { return false; } -/// Determine whether the parser is currently referring to a an -/// Objective-C message send, using a simplified heuristic to avoid overhead. -/// -/// This routine will only return true for a subset of valid message-send -/// expressions. bool Parser::isSimpleObjCMessageExpression() { assert(Tok.is(tok::l_square) && getLangOpts().ObjC && "Incorrect start for isSimpleObjCMessageExpression"); @@ -3098,15 +2725,6 @@ bool Parser::isStartOfObjCClassMessageMissingOpenBracket() { return false; } -/// objc-message-expr: -/// '[' objc-receiver objc-message-args ']' -/// -/// objc-receiver: [C] -/// 'super' -/// expression -/// class-name -/// type-name -/// ExprResult Parser::ParseObjCMessageExpression() { assert(Tok.is(tok::l_square) && "'[' expected"); SourceLocation LBracLoc = ConsumeBracket(); // consume '[' @@ -3202,44 +2820,6 @@ ExprResult Parser::ParseObjCMessageExpression() { Res.get()); } -/// Parse the remainder of an Objective-C message following the -/// '[' objc-receiver. -/// -/// This routine handles sends to super, class messages (sent to a -/// class name), and instance messages (sent to an object), and the -/// target is represented by \p SuperLoc, \p ReceiverType, or \p -/// ReceiverExpr, respectively. Only one of these parameters may have -/// a valid value. -/// -/// \param LBracLoc The location of the opening '['. -/// -/// \param SuperLoc If this is a send to 'super', the location of the -/// 'super' keyword that indicates a send to the superclass. -/// -/// \param ReceiverType If this is a class message, the type of the -/// class we are sending a message to. 
-/// -/// \param ReceiverExpr If this is an instance message, the expression -/// used to compute the receiver object. -/// -/// objc-message-args: -/// objc-selector -/// objc-keywordarg-list -/// -/// objc-keywordarg-list: -/// objc-keywordarg -/// objc-keywordarg-list objc-keywordarg -/// -/// objc-keywordarg: -/// selector-name[opt] ':' objc-keywordexpr -/// -/// objc-keywordexpr: -/// nonempty-expr-list -/// -/// nonempty-expr-list: -/// assignment-expression -/// nonempty-expr-list , assignment-expression -/// ExprResult Parser::ParseObjCMessageExpressionBody(SourceLocation LBracLoc, SourceLocation SuperLoc, @@ -3437,20 +3017,12 @@ ExprResult Parser::ParseObjCStringLiteral(SourceLocation AtLoc) { return Actions.ObjC().ParseObjCStringLiteral(AtLocs.data(), AtStrings); } -/// ParseObjCBooleanLiteral - -/// objc-scalar-literal : '@' boolean-keyword -/// ; -/// boolean-keyword: 'true' | 'false' | '__objc_yes' | '__objc_no' -/// ; ExprResult Parser::ParseObjCBooleanLiteral(SourceLocation AtLoc, bool ArgValue) { SourceLocation EndLoc = ConsumeToken(); // consume the keyword. return Actions.ObjC().ActOnObjCBoolLiteral(AtLoc, EndLoc, ArgValue); } -/// ParseObjCCharacterLiteral - -/// objc-scalar-literal : '@' character-literal -/// ; ExprResult Parser::ParseObjCCharacterLiteral(SourceLocation AtLoc) { ExprResult Lit(Actions.ActOnCharacterConstant(Tok)); if (Lit.isInvalid()) { @@ -3460,11 +3032,6 @@ ExprResult Parser::ParseObjCCharacterLiteral(SourceLocation AtLoc) { return Actions.ObjC().BuildObjCNumericLiteral(AtLoc, Lit.get()); } -/// ParseObjCNumericLiteral - -/// objc-scalar-literal : '@' scalar-literal -/// ; -/// scalar-literal : | numeric-constant /* any numeric constant. 
*/ -/// ; ExprResult Parser::ParseObjCNumericLiteral(SourceLocation AtLoc) { ExprResult Lit(Actions.ActOnNumericConstant(Tok)); if (Lit.isInvalid()) { @@ -3474,9 +3041,6 @@ ExprResult Parser::ParseObjCNumericLiteral(SourceLocation AtLoc) { return Actions.ObjC().BuildObjCNumericLiteral(AtLoc, Lit.get()); } -/// ParseObjCBoxedExpr - -/// objc-box-expression: -/// @( assignment-expression ) ExprResult Parser::ParseObjCBoxedExpr(SourceLocation AtLoc) { if (Tok.isNot(tok::l_paren)) @@ -3608,8 +3172,6 @@ ExprResult Parser::ParseObjCDictionaryLiteral(SourceLocation AtLoc) { Elements); } -/// objc-encode-expression: -/// \@encode ( type-name ) ExprResult Parser::ParseObjCEncodeExpression(SourceLocation AtLoc) { assert(Tok.isObjCAtKeyword(tok::objc_encode) && "Not an @encode expression!"); @@ -3633,8 +3195,6 @@ Parser::ParseObjCEncodeExpression(SourceLocation AtLoc) { AtLoc, EncLoc, T.getOpenLocation(), Ty.get(), T.getCloseLocation()); } -/// objc-protocol-expression -/// \@protocol ( protocol-name ) ExprResult Parser::ParseObjCProtocolExpression(SourceLocation AtLoc) { SourceLocation ProtoLoc = ConsumeToken(); @@ -3658,8 +3218,6 @@ Parser::ParseObjCProtocolExpression(SourceLocation AtLoc) { T.getCloseLocation()); } -/// objc-selector-expression -/// @selector '(' '('[opt] objc-keyword-selector ')'[opt] ')' ExprResult Parser::ParseObjCSelectorExpression(SourceLocation AtLoc) { SourceLocation SelectorLoc = ConsumeToken(); diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index e1da86a3a72fd..e2c2463200892 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -665,8 +665,6 @@ ExprResult Parser::ParseOpenACCConditionExpr() { return R.isInvalid() ? ExprError() : R.get().second; } -// Tries to parse the 'modifier-list' for a 'copy', 'copyin', 'copyout', or -// 'create' clause. 
OpenACCModifierKind Parser::tryParseModifierList(OpenACCClauseKind CK) { // Use the tentative parsing to decide whether we are a comma-delmited list of // identifers ending in a colon so we can do an actual parse with diagnostics. @@ -729,11 +727,6 @@ OpenACCModifierKind Parser::tryParseModifierList(OpenACCClauseKind CK) { return CurModList; } -// OpenACC 3.3, section 1.7: -// To simplify the specification and convey appropriate constraint information, -// a pqr-list is a comma-separated list of pdr items. The one exception is a -// clause-list, which is a list of one or more clauses optionally separated by -// commas. SmallVector Parser::ParseOpenACCClauseList(OpenACCDirectiveKind DirKind) { SmallVector Clauses; @@ -807,15 +800,6 @@ bool Parser::ParseOpenACCIntExprList(OpenACCDirectiveKind DK, return false; } -/// OpenACC 3.3 Section 2.4: -/// The argument to the device_type clause is a comma-separated list of one or -/// more device architecture name identifiers, or an asterisk. -/// -/// The syntax of the device_type clause is -/// device_type( * ) -/// device_type( device-type-list ) -/// -/// The device_type clause may be abbreviated to dtype. bool Parser::ParseOpenACCDeviceTypeList( llvm::SmallVector &Archs) { @@ -841,12 +825,6 @@ bool Parser::ParseOpenACCDeviceTypeList( return false; } -/// OpenACC 3.3 Section 2.9: -/// size-expr is one of: -// * -// int-expr -// Note that this is specified under 'gang-arg-list', but also applies to 'tile' -// via reference. ExprResult Parser::ParseOpenACCSizeExpr(OpenACCClauseKind CK) { // The size-expr ends up being ambiguous when only looking at the current // token, as it could be a deref of a variable/expression. 
@@ -895,12 +873,6 @@ bool Parser::ParseOpenACCSizeExprList( return false; } -/// OpenACC 3.3 Section 2.9: -/// -/// where gang-arg is one of: -/// [num:]int-expr -/// dim:int-expr -/// static:size-expr Parser::OpenACCGangArgRes Parser::ParseOpenACCGangArg(SourceLocation GangLoc) { if (isOpenACCSpecialToken(OpenACCSpecialTokenKind::Static, getCurToken()) && @@ -967,11 +939,6 @@ bool Parser::ParseOpenACCGangArgList( return false; } -// The OpenACC Clause List is a comma or space-delimited list of clauses (see -// the comment on ParseOpenACCClauseList). The concept of a 'clause' doesn't -// really have its owner grammar and each individual one has its own definition. -// However, they all are named with a single-identifier (or auto/default!) -// token, followed in some cases by either braces or parens. Parser::OpenACCClauseParseResult Parser::ParseOpenACCClause(ArrayRef ExistingClauses, OpenACCDirectiveKind DirKind) { @@ -1278,23 +1245,12 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams( Actions.OpenACC().ActOnClause(ExistingClauses, ParsedClause)); } -/// OpenACC 3.3 section 2.16: -/// In this section and throughout the specification, the term async-argument -/// means a nonnegative scalar integer expression (int for C or C++, integer for -/// Fortran), or one of the special values acc_async_noval or acc_async_sync, as -/// defined in the C header file and the Fortran openacc module. The special -/// values are negative values, so as not to conflict with a user-specified -/// nonnegative async-argument. 
Parser::OpenACCIntExprParseResult Parser::ParseOpenACCAsyncArgument(OpenACCDirectiveKind DK, OpenACCClauseKind CK, SourceLocation Loc) { return ParseOpenACCIntExpr(DK, CK, Loc); } -/// OpenACC 3.3, section 2.16: -/// In this section and throughout the specification, the term wait-argument -/// means: -/// [ devnum : int-expr : ] [ queues : ] async-argument-list Parser::OpenACCWaitParseInfo Parser::ParseOpenACCWaitArgument(SourceLocation Loc, bool IsDirective) { OpenACCWaitParseInfo Result; @@ -1442,13 +1398,6 @@ Parser::ParseOpenACCBindClauseArgument() { return cast(Res.get()); } -/// OpenACC 3.3, section 1.6: -/// In this spec, a 'var' (in italics) is one of the following: -/// - a variable name (a scalar, array, or composite variable name) -/// - a subarray specification with subscript ranges -/// - an array element -/// - a member of a composite variable -/// - a common block name between slashes (fortran only) Parser::OpenACCVarParseResult Parser::ParseOpenACCVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK) { OpenACCArraySectionRAII ArraySections(*this); @@ -1493,10 +1442,6 @@ llvm::SmallVector Parser::ParseOpenACCVarList(OpenACCDirectiveKind DK, return Vars; } -/// OpenACC 3.3, section 2.10: -/// In C and C++, the syntax of the cache directive is: -/// -/// #pragma acc cache ([readonly:]var-list) new-line Parser::OpenACCCacheParseInfo Parser::ParseOpenACCCacheVarList() { // If this is the end of the line, just return 'false' and count on the close // paren diagnostic to catch the issue. @@ -1678,7 +1623,6 @@ Parser::ParseOpenACCAfterRoutineStmt(OpenACCDirectiveParseInfo &DirInfo) { DirInfo.RParenLoc, DirInfo.Clauses, DirInfo.EndLoc, NextStmt.get()); } -// Parse OpenACC directive on a declaration. 
Parser::DeclGroupPtrTy Parser::ParseOpenACCDirectiveDecl(AccessSpecifier &AS, ParsedAttributes &Attrs, DeclSpec::TST TagType, Decl *TagDecl) { @@ -1700,7 +1644,6 @@ Parser::ParseOpenACCDirectiveDecl(AccessSpecifier &AS, ParsedAttributes &Attrs, DirInfo.RParenLoc, DirInfo.EndLoc, DirInfo.Clauses)); } -// Parse OpenACC Directive on a Statement. StmtResult Parser::ParseOpenACCDirectiveStmt() { assert(Tok.is(tok::annot_pragma_openacc) && "expected OpenACC Start Token"); diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 85838feae77d3..cfffcdb01a514 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -276,16 +276,6 @@ static DeclarationName parseOpenMPReductionId(Parser &P) { : DeclNames.getCXXOperatorName(OOK); } -/// Parse 'omp declare reduction' construct. -/// -/// declare-reduction-directive: -/// annot_pragma_openmp 'declare' 'reduction' -/// '(' ':' {',' } ':' ')' -/// ['initializer' '(' ('omp_priv' '=' )| ')'] -/// annot_pragma_openmp_end -/// is either a base language identifier or one of the following -/// operators: '+', '-', '*', '&', '|', '^', '&&' and '||'. -/// Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { unsigned OMPVersion = Actions.getLangOpts().OpenMP; @@ -530,14 +520,6 @@ void Parser::ParseOpenMPReductionInitializerForDecl(VarDecl *OmpPrivParm) { } } -/// Parses 'omp declare mapper' directive. -/// -/// declare-mapper-directive: -/// annot_pragma_openmp 'declare' 'mapper' '(' [ ':'] -/// ')' [[[,] ] ... ] -/// annot_pragma_openmp_end -/// and are base language identifiers. -/// Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) { bool IsCorrect = true; @@ -868,7 +850,6 @@ static bool parseDeclareSimdClauses( return IsError; } -/// Parse clauses for '#pragma omp declare simd'. 
Parser::DeclGroupPtrTy Parser::ParseOMPDeclareSimdClauses(Parser::DeclGroupPtrTy Ptr, CachedTokens &Toks, SourceLocation Loc) { @@ -1207,9 +1188,6 @@ static ExprResult parseContextScore(Parser &P) { return ScoreExpr; } -/// Parses an OpenMP context selector. -/// -/// ['('[] [, ]* ')'] void Parser::parseOMPContextSelector( OMPTraitSelector &TISelector, llvm::omp::TraitSet Set, llvm::StringMap &SeenSelectors) { @@ -1379,9 +1357,6 @@ void Parser::parseOMPTraitSetKind(OMPTraitSet &TISet, << CONTEXT_SELECTOR_SET_LVL << listOpenMPContextTraitSets(); } -/// Parses an OpenMP context selector set. -/// -/// '=' '{' [, ]* '}' void Parser::parseOMPContextSelectorSet( OMPTraitSet &TISet, llvm::StringMap &SeenSets) { auto OuterBC = BraceCount; @@ -1456,9 +1431,6 @@ void Parser::parseOMPContextSelectorSet( } } -/// Parse OpenMP context selectors: -/// -/// [, ]* bool Parser::parseOMPContextSelectors(SourceLocation Loc, OMPTraitInfo &TI) { llvm::StringMap SeenSets; do { @@ -1471,7 +1443,6 @@ bool Parser::parseOMPContextSelectors(SourceLocation Loc, OMPTraitInfo &TI) { return false; } -/// Parse clauses for '#pragma omp declare variant ( variant-func-id ) clause'. void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, CachedTokens &Toks, SourceLocation Loc) { @@ -1721,13 +1692,6 @@ bool Parser::parseOMPDeclareVariantMatchClause(SourceLocation Loc, return false; } -/// [clause[ [,] clause] ... ] -/// -/// clauses: for error directive -/// 'at' '(' compilation | execution ')' -/// 'severity' '(' fatal | warning ')' -/// 'message' '(' msg-string ')' -/// .... void Parser::ParseOpenMPClauses(OpenMPDirectiveKind DKind, SmallVectorImpl &Clauses, SourceLocation Loc) { @@ -1755,19 +1719,6 @@ void Parser::ParseOpenMPClauses(OpenMPDirectiveKind DKind, } } -/// `omp assumes` or `omp begin/end assumes` [[,]]... 
-/// where -/// -/// clause: -/// 'ext_IMPL_DEFINED' -/// 'absent' '(' directive-name [, directive-name]* ')' -/// 'contains' '(' directive-name [, directive-name]* ')' -/// 'holds' '(' scalar-expression ')' -/// 'no_openmp' -/// 'no_openmp_routines' -/// 'no_openmp_constructs' (OpenMP 6.0) -/// 'no_parallelism' -/// void Parser::ParseOpenMPAssumesDirective(OpenMPDirectiveKind DKind, SourceLocation Loc) { SmallVector Assumptions; @@ -2104,42 +2055,6 @@ void Parser::ParseOMPEndDeclareTargetDirective(OpenMPDirectiveKind BeginDKind, ConsumeAnnotationToken(); } -/// Parsing of declarative OpenMP directives. -/// -/// threadprivate-directive: -/// annot_pragma_openmp 'threadprivate' simple-variable-list -/// annot_pragma_openmp_end -/// -/// allocate-directive: -/// annot_pragma_openmp 'allocate' simple-variable-list [] -/// annot_pragma_openmp_end -/// -/// declare-reduction-directive: -/// annot_pragma_openmp 'declare' 'reduction' [...] -/// annot_pragma_openmp_end -/// -/// declare-mapper-directive: -/// annot_pragma_openmp 'declare' 'mapper' '(' [ ':'] -/// ')' [[[,] ] ... ] -/// annot_pragma_openmp_end -/// -/// declare-simd-directive: -/// annot_pragma_openmp 'declare simd' { [,]} -/// annot_pragma_openmp_end -/// -/// -/// requires directive: -/// annot_pragma_openmp 'requires' [[[,] ] ... ] -/// annot_pragma_openmp_end -/// -/// assumes directive: -/// annot_pragma_openmp 'assumes' [[[,] ] ... ] -/// annot_pragma_openmp_end -/// or -/// annot_pragma_openmp 'begin assumes' [[[,] ] ... ] -/// annot_pragma_openmp 'end assumes' -/// annot_pragma_openmp_end -/// Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( AccessSpecifier &AS, ParsedAttributes &Attrs, bool Delayed, DeclSpec::TST TagType, Decl *Tag) { @@ -2682,46 +2597,6 @@ StmtResult Parser::ParseOpenMPInformationalDirective( return Directive; } -/// Parsing of declarative or executable OpenMP directives. 
-/// -/// threadprivate-directive: -/// annot_pragma_openmp 'threadprivate' simple-variable-list -/// annot_pragma_openmp_end -/// -/// allocate-directive: -/// annot_pragma_openmp 'allocate' simple-variable-list -/// annot_pragma_openmp_end -/// -/// declare-reduction-directive: -/// annot_pragma_openmp 'declare' 'reduction' '(' ':' -/// {',' } ':' ')' ['initializer' '(' -/// ('omp_priv' '=' |) ')'] -/// annot_pragma_openmp_end -/// -/// declare-mapper-directive: -/// annot_pragma_openmp 'declare' 'mapper' '(' [ ':'] -/// ')' [[[,] ] ... ] -/// annot_pragma_openmp_end -/// -/// executable-directive: -/// annot_pragma_openmp 'parallel' | 'simd' | 'for' | 'sections' | -/// 'section' | 'single' | 'master' | 'critical' [ '(' ')' ] | -/// 'parallel for' | 'parallel sections' | 'parallel master' | 'task' | -/// 'taskyield' | 'barrier' | 'taskwait' | 'flush' | 'ordered' | 'error' -/// | 'atomic' | 'for simd' | 'parallel for simd' | 'target' | 'target -/// data' | 'taskgroup' | 'teams' | 'taskloop' | 'taskloop simd' | -/// 'master taskloop' | 'master taskloop simd' | 'parallel master -/// taskloop' | 'parallel master taskloop simd' | 'distribute' | 'target -/// enter data' | 'target exit data' | 'target parallel' | 'target -/// parallel for' | 'target update' | 'distribute parallel for' | -/// 'distribute paralle for simd' | 'distribute simd' | 'target parallel -/// for simd' | 'target simd' | 'teams distribute' | 'teams distribute -/// simd' | 'teams distribute parallel for simd' | 'teams distribute -/// parallel for' | 'target teams' | 'target teams distribute' | 'target -/// teams distribute parallel for' | 'target teams distribute parallel -/// for simd' | 'target teams distribute simd' | 'masked' | -/// 'parallel masked' {clause} annot_pragma_openmp_end -/// StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( ParsedStmtContext StmtCtx, bool ReadDirectiveWithinMetadirective) { if (!ReadDirectiveWithinMetadirective) @@ -2738,6 +2613,10 @@ StmtResult 
Parser::ParseOpenMPDeclarativeOrExecutableDirective( Diag(Tok, diag::err_omp_unknown_directive); return StmtError(); } + if (!(getDirectiveLanguages(DKind) & SourceLanguage::C)) { + // Treat directives that are not allowed in C/C++ as unknown. + DKind = OMPD_unknown; + } StmtResult Directive = StmtError(); @@ -2786,8 +2665,12 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( ? OMPC_unknown : getOpenMPClauseKind(PP.getSpelling(Tok)); // Check if the clause is unrecognized. - if (CKind == OMPC_unknown) + if (CKind == OMPC_unknown) { Diag(Tok, diag::err_omp_expected_clause) << "metadirective"; + TPA.Revert(); + SkipUntil(tok::annot_pragma_openmp_end); + return Directive; + } if (getLangOpts().OpenMP < 52 && CKind == OMPC_otherwise) Diag(Tok, diag::err_omp_unexpected_clause) << getOpenMPClauseName(CKind) << "metadirective"; @@ -2798,8 +2681,11 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( // Parse '('. if (T.expectAndConsume(diag::err_expected_lparen_after, - getOpenMPClauseName(CKind).data())) + getOpenMPClauseName(CKind).data())) { + TPA.Revert(); + SkipUntil(tok::annot_pragma_openmp_end); return Directive; + } OMPTraitInfo &TI = Actions.getASTContext().getNewOMPTraitInfo(); if (CKind == OMPC_when) { @@ -3082,10 +2968,6 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( return Directive; } -// Parses simple list: -// simple-variable-list: -// '(' id-expression {, id-expression} ')' -// bool Parser::ParseOpenMPSimpleVarList( OpenMPDirectiveKind Kind, const llvm::function_ref @@ -3222,26 +3104,6 @@ OMPClause *Parser::ParseOpenMPUsesAllocatorClause(OpenMPDirectiveKind DKind) { Loc, T.getOpenLocation(), T.getCloseLocation(), Data); } -/// Parsing of OpenMP clauses. 
-/// -/// clause: -/// if-clause | final-clause | num_threads-clause | safelen-clause | -/// default-clause | private-clause | firstprivate-clause | shared-clause -/// | linear-clause | aligned-clause | collapse-clause | bind-clause | -/// lastprivate-clause | reduction-clause | proc_bind-clause | -/// schedule-clause | copyin-clause | copyprivate-clause | untied-clause | -/// mergeable-clause | flush-clause | read-clause | write-clause | -/// update-clause | capture-clause | seq_cst-clause | device-clause | -/// simdlen-clause | threads-clause | simd-clause | num_teams-clause | -/// thread_limit-clause | priority-clause | grainsize-clause | -/// nogroup-clause | num_tasks-clause | hint-clause | to-clause | -/// from-clause | is_device_ptr-clause | task_reduction-clause | -/// in_reduction-clause | allocator-clause | allocate-clause | -/// acq_rel-clause | acquire-clause | release-clause | relaxed-clause | -/// depobj-clause | destroy-clause | detach-clause | inclusive-clause | -/// exclusive-clause | uses_allocators-clause | use_device_addr-clause | -/// has_device_addr -/// OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, OpenMPClauseKind CKind, bool FirstClause) { OMPClauseKind = CKind; @@ -3638,50 +3500,6 @@ ExprResult Parser::ParseOpenMPParensExpr(StringRef ClauseName, return Val; } -/// Parsing of OpenMP clauses with single expressions like 'final', -/// 'collapse', 'safelen', 'num_threads', 'simdlen', 'num_teams', -/// 'thread_limit', 'simdlen', 'priority', 'grainsize', 'num_tasks', 'hint' or -/// 'detach'. 
-/// -/// final-clause: -/// 'final' '(' expression ')' -/// -/// num_threads-clause: -/// 'num_threads' '(' expression ')' -/// -/// safelen-clause: -/// 'safelen' '(' expression ')' -/// -/// simdlen-clause: -/// 'simdlen' '(' expression ')' -/// -/// collapse-clause: -/// 'collapse' '(' expression ')' -/// -/// priority-clause: -/// 'priority' '(' expression ')' -/// -/// grainsize-clause: -/// 'grainsize' '(' expression ')' -/// -/// num_tasks-clause: -/// 'num_tasks' '(' expression ')' -/// -/// hint-clause: -/// 'hint' '(' expression ')' -/// -/// allocator-clause: -/// 'allocator' '(' expression ')' -/// -/// detach-clause: -/// 'detach' '(' event-handler-expression ')' -/// -/// align-clause -/// 'align' '(' positive-integer-constant ')' -/// -/// holds-clause -/// 'holds' '(' expression ')' -/// OMPClause *Parser::ParseOpenMPSingleExprClause(OpenMPClauseKind Kind, bool ParseOnly) { SourceLocation Loc = ConsumeToken(); @@ -3699,10 +3517,6 @@ OMPClause *Parser::ParseOpenMPSingleExprClause(OpenMPClauseKind Kind, LLoc, RLoc); } -/// Parse indirect clause for '#pragma omp declare target' directive. -/// 'indirect' '[' '(' invoked-by-fptr ')' ']' -/// where invoked-by-fptr is a constant boolean expression that evaluates to -/// true or false at compile time. bool Parser::ParseOpenMPIndirectClause( SemaOpenMP::DeclareTargetContextInfo &DTCI, bool ParseOnly) { SourceLocation Loc = ConsumeToken(); @@ -3740,8 +3554,6 @@ bool Parser::ParseOpenMPIndirectClause( return false; } -/// Parses a comma-separated list of interop-types and a prefer_type list. -/// bool Parser::ParseOMPInteropInfo(OMPInteropInfo &InteropInfo, OpenMPClauseKind Kind) { const Token &Tok = getCurToken(); @@ -3824,29 +3636,6 @@ bool Parser::ParseOMPInteropInfo(OMPInteropInfo &InteropInfo, return HasError; } -/// Parsing of OpenMP clauses that use an interop-var. -/// -/// init-clause: -/// init([interop-modifier, ]interop-type[[, interop-type] ... 
]:interop-var) -/// -/// destroy-clause: -/// destroy(interop-var) -/// -/// use-clause: -/// use(interop-var) -/// -/// interop-modifier: -/// prefer_type(preference-list) -/// -/// preference-list: -/// foreign-runtime-id [, foreign-runtime-id]... -/// -/// foreign-runtime-id: -/// | -/// -/// interop-type: -/// target | targetsync -/// OMPClause *Parser::ParseOpenMPInteropClause(OpenMPClauseKind Kind, bool ParseOnly) { SourceLocation Loc = ConsumeToken(); @@ -3950,21 +3739,6 @@ OMPClause *Parser::ParseOpenMPOMPXAttributesClause(bool ParseOnly) { Attrs, Loc, T.getOpenLocation(), T.getCloseLocation()); } -/// Parsing of simple OpenMP clauses like 'default' or 'proc_bind'. -/// -/// default-clause: -/// 'default' '(' 'none' | 'shared' | 'private' | 'firstprivate' ')' -/// -/// proc_bind-clause: -/// 'proc_bind' '(' 'master' | 'close' | 'spread' ')' -/// -/// bind-clause: -/// 'bind' '(' 'teams' | 'parallel' | 'thread' ')' -/// -/// update-clause: -/// 'update' '(' 'in' | 'out' | 'inout' | 'mutexinoutset' | -/// 'inoutset' ')' -/// OMPClause *Parser::ParseOpenMPSimpleClause(OpenMPClauseKind Kind, bool ParseOnly) { std::optional Val = parseOpenMPSimpleClause(*this, Kind); @@ -3986,32 +3760,6 @@ OMPClause *Parser::ParseOpenMPSimpleClause(OpenMPClauseKind Kind, Kind, Val->Type, Val->TypeLoc, Val->LOpen, Val->Loc, Val->RLoc); } -/// Parsing of OpenMP clauses like 'ordered'. 
-/// -/// ordered-clause: -/// 'ordered' -/// -/// nowait-clause: -/// 'nowait' -/// -/// untied-clause: -/// 'untied' -/// -/// mergeable-clause: -/// 'mergeable' -/// -/// read-clause: -/// 'read' -/// -/// threads-clause: -/// 'threads' -/// -/// simd-clause: -/// 'simd' -/// -/// nogroup-clause: -/// 'nogroup' -/// OMPClause *Parser::ParseOpenMPClause(OpenMPClauseKind Kind, bool ParseOnly) { SourceLocation Loc = Tok.getLocation(); ConsumeAnyToken(); @@ -4021,22 +3769,6 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPClauseKind Kind, bool ParseOnly) { return Actions.OpenMP().ActOnOpenMPClause(Kind, Loc, Tok.getLocation()); } -/// Parsing of OpenMP clauses with single expressions and some additional -/// argument like 'schedule' or 'dist_schedule'. -/// -/// schedule-clause: -/// 'schedule' '(' [ modifier [ ',' modifier ] ':' ] kind [',' expression ] -/// ')' -/// -/// if-clause: -/// 'if' '(' [ directive-name-modifier ':' ] expression ')' -/// -/// defaultmap: -/// 'defaultmap' '(' modifier [ ':' kind ] ')' -/// -/// device-clause: -/// 'device' '(' [ device-modifier ':' ] expression ')' -/// OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind, OpenMPClauseKind Kind, bool ParseOnly) { @@ -4343,7 +4075,6 @@ static OpenMPMapModifierKind isMapModifier(Parser &P) { return TypeModifier; } -/// Parse the mapper modifier in map, to, and from clauses. bool Parser::parseMapperModifier(SemaOpenMP::OpenMPVarListDataTy &Data) { // Parse '('. BalancedDelimiterTracker T(*this, tok::l_paren, tok::colon); @@ -4374,11 +4105,6 @@ bool Parser::parseMapperModifier(SemaOpenMP::OpenMPVarListDataTy &Data) { static OpenMPMapClauseKind isMapType(Parser &P); -/// Parse map-type-modifiers in map clause. -/// map([ [map-type-modifier[,] [map-type-modifier[,] ...] 
[map-type] : ] list) -/// where, map-type-modifier ::= always | close | mapper(mapper-identifier) | -/// present -/// where, map-type ::= alloc | delete | from | release | to | tofrom bool Parser::parseMapTypeModifiers(SemaOpenMP::OpenMPVarListDataTy &Data) { bool HasMapType = false; SourceLocation PreMapLoc = Tok.getLocation(); @@ -4506,8 +4232,6 @@ static void parseMapType(Parser &P, SemaOpenMP::OpenMPVarListDataTy &Data) { P.ConsumeToken(); } -/// Parses simple expression in parens for single-expression clauses of OpenMP -/// constructs. ExprResult Parser::ParseOpenMPIteratorsExpr() { assert(Tok.is(tok::identifier) && PP.getSpelling(Tok) == "iterator" && "Expected 'iterator' token."); @@ -4745,7 +4469,6 @@ parseOpenMPAllocateClauseModifiers(Parser &P, OpenMPClauseKind Kind, return Tail; } -/// Parses clauses with list. bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, OpenMPClauseKind Kind, SmallVectorImpl &Vars, @@ -5237,66 +4960,6 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, InvalidReductionId || IsInvalidMapperModifier || InvalidIterator; } -/// Parsing of OpenMP clause 'private', 'firstprivate', 'lastprivate', -/// 'shared', 'copyin', 'copyprivate', 'flush', 'reduction', 'task_reduction', -/// 'in_reduction', 'nontemporal', 'exclusive' or 'inclusive'. 
-/// -/// private-clause: -/// 'private' '(' list ')' -/// firstprivate-clause: -/// 'firstprivate' '(' list ')' -/// lastprivate-clause: -/// 'lastprivate' '(' list ')' -/// shared-clause: -/// 'shared' '(' list ')' -/// linear-clause: -/// 'linear' '(' linear-list [ ':' linear-step ] ')' -/// aligned-clause: -/// 'aligned' '(' list [ ':' alignment ] ')' -/// reduction-clause: -/// 'reduction' '(' [ modifier ',' ] reduction-identifier ':' list ')' -/// task_reduction-clause: -/// 'task_reduction' '(' reduction-identifier ':' list ')' -/// in_reduction-clause: -/// 'in_reduction' '(' reduction-identifier ':' list ')' -/// copyprivate-clause: -/// 'copyprivate' '(' list ')' -/// flush-clause: -/// 'flush' '(' list ')' -/// depend-clause: -/// 'depend' '(' in | out | inout : list | source ')' -/// map-clause: -/// 'map' '(' [ [ always [,] ] [ close [,] ] -/// [ mapper '(' mapper-identifier ')' [,] ] -/// to | from | tofrom | alloc | release | delete ':' ] list ')'; -/// to-clause: -/// 'to' '(' [ mapper '(' mapper-identifier ')' ':' ] list ')' -/// from-clause: -/// 'from' '(' [ mapper '(' mapper-identifier ')' ':' ] list ')' -/// use_device_ptr-clause: -/// 'use_device_ptr' '(' list ')' -/// use_device_addr-clause: -/// 'use_device_addr' '(' list ')' -/// is_device_ptr-clause: -/// 'is_device_ptr' '(' list ')' -/// has_device_addr-clause: -/// 'has_device_addr' '(' list ')' -/// allocate-clause: -/// 'allocate' '(' [ allocator ':' ] list ')' -/// As of OpenMP 5.1 there's also -/// 'allocate' '(' allocate-modifier: list ')' -/// where allocate-modifier is: 'allocator' '(' allocator ')' -/// nontemporal-clause: -/// 'nontemporal' '(' list ')' -/// inclusive-clause: -/// 'inclusive' '(' list ')' -/// exclusive-clause: -/// 'exclusive' '(' list ')' -/// -/// For 'linear' clause linear-list may have the following forms: -/// list -/// modifier(list) -/// where modifier is 'val' (C) or 'ref', 'val' or 'uval'(C++). 
OMPClause *Parser::ParseOpenMPVarListClause(OpenMPDirectiveKind DKind, OpenMPClauseKind Kind, bool ParseOnly) { diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp index 69f3568cfdba7..3d46d02b72128 100644 --- a/clang/lib/Parse/ParsePragma.cpp +++ b/clang/lib/Parse/ParsePragma.cpp @@ -704,11 +704,6 @@ void Parser::resetPragmaHandlers() { } } -/// Handle the annotation token produced for #pragma unused(...) -/// -/// Each annot_pragma_unused is followed by the argument token so e.g. -/// "#pragma unused(x,y)" becomes: -/// annot_pragma_unused 'x' annot_pragma_unused 'y' void Parser::HandlePragmaUnused() { assert(Tok.is(tok::annot_pragma_unused)); SourceLocation UnusedLoc = ConsumeAnnotationToken(); @@ -1226,7 +1221,6 @@ bool Parser::HandlePragmaMSSegment(StringRef PragmaName, return true; } -// #pragma init_seg({ compiler | lib | user | "section-name" [, func-name]} ) bool Parser::HandlePragmaMSInitSeg(StringRef PragmaName, SourceLocation PragmaLocation) { if (getTargetInfo().getTriple().getEnvironment() != llvm::Triple::MSVC) { @@ -1288,9 +1282,6 @@ bool Parser::HandlePragmaMSInitSeg(StringRef PragmaName, return true; } -// #pragma strict_gs_check(pop) -// #pragma strict_gs_check(push, "on" | "off") -// #pragma strict_gs_check("on" | "off") bool Parser::HandlePragmaMSStrictGuardStackCheck( StringRef PragmaName, SourceLocation PragmaLocation) { if (ExpectAndConsume(tok::l_paren, diag::warn_pragma_expected_lparen, @@ -3856,7 +3847,6 @@ bool Parser::HandlePragmaMSFunction(StringRef PragmaName, return true; } -// #pragma optimize("gsty", on|off) bool Parser::HandlePragmaMSOptimize(StringRef PragmaName, SourceLocation PragmaLocation) { Token FirstTok = Tok; diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index 2794a5834dce9..c788723023c8b 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -36,8 +36,6 @@ using namespace clang; // C99 6.8: Statements and Blocks. 
//===----------------------------------------------------------------------===// -/// Parse a standalone statement (for instance, as the body of an 'if', -/// 'while', or 'for'). StmtResult Parser::ParseStatement(SourceLocation *TrailingElseLoc, ParsedStmtContext StmtCtx) { StmtResult Res; @@ -52,55 +50,6 @@ StmtResult Parser::ParseStatement(SourceLocation *TrailingElseLoc, return Res; } -/// ParseStatementOrDeclaration - Read 'statement' or 'declaration'. -/// StatementOrDeclaration: -/// statement -/// declaration -/// -/// statement: -/// labeled-statement -/// compound-statement -/// expression-statement -/// selection-statement -/// iteration-statement -/// jump-statement -/// [C++] declaration-statement -/// [C++] try-block -/// [MS] seh-try-block -/// [OBC] objc-throw-statement -/// [OBC] objc-try-catch-statement -/// [OBC] objc-synchronized-statement -/// [GNU] asm-statement -/// [OMP] openmp-construct [TODO] -/// -/// labeled-statement: -/// identifier ':' statement -/// 'case' constant-expression ':' statement -/// 'default' ':' statement -/// -/// selection-statement: -/// if-statement -/// switch-statement -/// -/// iteration-statement: -/// while-statement -/// do-statement -/// for-statement -/// -/// expression-statement: -/// expression[opt] ';' -/// -/// jump-statement: -/// 'goto' identifier ';' -/// 'continue' ';' -/// 'break' ';' -/// 'return' expression[opt] ';' -/// [GNU] 'goto' '*' expression ';' -/// -/// [OBC] objc-throw-statement: -/// [OBC] '@' 'throw' expression ';' -/// [OBC] '@' 'throw' ';' -/// StmtResult Parser::ParseStatementOrDeclaration(StmtVector &Stmts, ParsedStmtContext StmtCtx, @@ -554,7 +503,6 @@ StmtResult Parser::ParseStatementOrDeclarationAfterAttributes( return Res; } -/// Parse an expression statement. StmtResult Parser::ParseExprStatement(ParsedStmtContext StmtCtx) { // If a case keyword is missing, this is where it should be inserted. 
Token OldToken = Tok; @@ -599,15 +547,6 @@ StmtResult Parser::ParseExprStatement(ParsedStmtContext StmtCtx) { return R; } -/// ParseSEHTryBlockCommon -/// -/// seh-try-block: -/// '__try' compound-statement seh-handler -/// -/// seh-handler: -/// seh-except-block -/// seh-finally-block -/// StmtResult Parser::ParseSEHTryBlock() { assert(Tok.is(tok::kw___try) && "Expected '__try'"); SourceLocation TryLoc = ConsumeToken(); @@ -642,11 +581,6 @@ StmtResult Parser::ParseSEHTryBlock() { Handler.get()); } -/// ParseSEHExceptBlock - Handle __except -/// -/// seh-except-block: -/// '__except' '(' seh-filter-expression ')' compound-statement -/// StmtResult Parser::ParseSEHExceptBlock(SourceLocation ExceptLoc) { PoisonIdentifierRAIIObject raii(Ident__exception_code, false), raii2(Ident___exception_code, false), @@ -694,11 +628,6 @@ StmtResult Parser::ParseSEHExceptBlock(SourceLocation ExceptLoc) { return Actions.ActOnSEHExceptBlock(ExceptLoc, FilterExpr.get(), Block.get()); } -/// ParseSEHFinallyBlock - Handle __finally -/// -/// seh-finally-block: -/// '__finally' compound-statement -/// StmtResult Parser::ParseSEHFinallyBlock(SourceLocation FinallyLoc) { PoisonIdentifierRAIIObject raii(Ident__abnormal_termination, false), raii2(Ident___abnormal_termination, false), @@ -741,15 +670,6 @@ static void DiagnoseLabelFollowedByDecl(Parser &P, const Stmt *SubStmt) { } } -/// ParseLabeledStatement - We have an identifier and a ':' after it. -/// -/// label: -/// identifier ':' -/// [GNU] identifier ':' attributes[opt] -/// -/// labeled-statement: -/// label statement -/// StmtResult Parser::ParseLabeledStatement(ParsedAttributes &Attrs, ParsedStmtContext StmtCtx) { assert(Tok.is(tok::identifier) && Tok.getIdentifierInfo() && @@ -818,11 +738,6 @@ StmtResult Parser::ParseLabeledStatement(ParsedAttributes &Attrs, SubStmt.get()); } -/// ParseCaseStatement -/// labeled-statement: -/// 'case' constant-expression ':' statement -/// [GNU] 'case' constant-expression '...' 
constant-expression ':' statement -/// StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx, bool MissingCase, ExprResult Expr) { assert((MissingCase || Tok.is(tok::kw_case)) && "Not a case stmt!"); @@ -972,11 +887,6 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx, return TopLevelCase; } -/// ParseDefaultStatement -/// labeled-statement: -/// 'default' ':' statement -/// Note that this does not parse the 'statement' at the end. -/// StmtResult Parser::ParseDefaultStatement(ParsedStmtContext StmtCtx) { assert(Tok.is(tok::kw_default) && "Not a default stmt!"); @@ -1027,28 +937,6 @@ StmtResult Parser::ParseCompoundStatement(bool isStmtExpr) { Scope::DeclScope | Scope::CompoundStmtScope); } -/// ParseCompoundStatement - Parse a "{}" block. -/// -/// compound-statement: [C99 6.8.2] -/// { block-item-list[opt] } -/// [GNU] { label-declarations block-item-list } [TODO] -/// -/// block-item-list: -/// block-item -/// block-item-list block-item -/// -/// block-item: -/// declaration -/// [GNU] '__extension__' declaration -/// statement -/// -/// [GNU] label-declarations: -/// [GNU] label-declaration -/// [GNU] label-declarations label-declaration -/// -/// [GNU] label-declaration: -/// [GNU] '__label__' identifier-list ';' -/// StmtResult Parser::ParseCompoundStatement(bool isStmtExpr, unsigned ScopeFlags) { assert(Tok.is(tok::l_brace) && "Not a compound stmt!"); @@ -1065,9 +953,6 @@ StmtResult Parser::ParseCompoundStatement(bool isStmtExpr, return R; } -/// Parse any pragmas at the start of the compound expression. We handle these -/// separately since some pragmas (FP_CONTRACT) must appear before any C -/// statement in the compound, but may be intermingled with other pragmas. 
void Parser::ParseCompoundStatementLeadingPragmas() { bool checkForPragmas = true; while (checkForPragmas) { @@ -1147,8 +1032,6 @@ void Parser::DiagnoseLabelAtEndOfCompoundStatement() { } } -/// Consume any extra semi-colons resulting in null statements, -/// returning true if any tok::semi were consumed. bool Parser::ConsumeNullStmt(StmtVector &Stmts) { if (!Tok.is(tok::semi)) return false; @@ -1196,10 +1079,6 @@ StmtResult Parser::handleExprStmt(ExprResult E, ParsedStmtContext StmtCtx) { return Actions.ActOnExprStmt(E, /*DiscardedValue=*/!IsStmtExprResult); } -/// ParseCompoundStatementBody - Parse a sequence of statements optionally -/// followed by a label and invoke the ActOnCompoundStmt action. This expects -/// the '{' to be the current token, and consume the '}' at the end of the -/// block. It does not manipulate the scope stack. StmtResult Parser::ParseCompoundStatementBody(bool isStmtExpr) { PrettyStackTraceLoc CrashInfo(PP.getSourceManager(), Tok.getLocation(), @@ -1353,20 +1232,6 @@ StmtResult Parser::ParseCompoundStatementBody(bool isStmtExpr) { Stmts, isStmtExpr); } -/// ParseParenExprOrCondition: -/// [C ] '(' expression ')' -/// [C++] '(' condition ')' -/// [C++1z] '(' init-statement[opt] condition ')' -/// -/// This function parses and performs error recovery on the specified condition -/// or expression (depending on whether we're in C++ or C mode). This function -/// goes out of its way to recover well. It returns true if there was a parser -/// error (the right paren couldn't be found), which indicates that the caller -/// should try to recover harder. It returns false if the condition is -/// successfully parsed. Note that a successful parse can still have semantic -/// errors in the condition. -/// Additionally, it will assign the location of the outer-most '(' and ')', -/// to LParenLoc and RParenLoc, respectively. 
bool Parser::ParseParenExprOrCondition(StmtResult *InitStmt, Sema::ConditionResult &Cond, SourceLocation Loc, @@ -1521,15 +1386,6 @@ struct MisleadingIndentationChecker { } -/// ParseIfStatement -/// if-statement: [C99 6.8.4.1] -/// 'if' '(' expression ')' statement -/// 'if' '(' expression ')' statement 'else' statement -/// [C++] 'if' '(' condition ')' statement -/// [C++] 'if' '(' condition ')' statement 'else' statement -/// [C++23] 'if' '!' [opt] consteval compound-statement -/// [C++23] 'if' '!' [opt] consteval compound-statement 'else' statement -/// StmtResult Parser::ParseIfStatement(SourceLocation *TrailingElseLoc) { assert(Tok.is(tok::kw_if) && "Not an if stmt!"); SourceLocation IfLoc = ConsumeToken(); // eat the 'if'. @@ -1753,10 +1609,6 @@ StmtResult Parser::ParseIfStatement(SourceLocation *TrailingElseLoc) { ThenStmt.get(), ElseLoc, ElseStmt.get()); } -/// ParseSwitchStatement -/// switch-statement: -/// 'switch' '(' expression ')' statement -/// [C++] 'switch' '(' condition ')' statement StmtResult Parser::ParseSwitchStatement(SourceLocation *TrailingElseLoc) { assert(Tok.is(tok::kw_switch) && "Not a switch stmt!"); SourceLocation SwitchLoc = ConsumeToken(); // eat the 'switch'. 
@@ -1840,10 +1692,6 @@ StmtResult Parser::ParseSwitchStatement(SourceLocation *TrailingElseLoc) { return Actions.ActOnFinishSwitchStmt(SwitchLoc, Switch.get(), Body.get()); } -/// ParseWhileStatement -/// while-statement: [C99 6.8.5.1] -/// 'while' '(' expression ')' statement -/// [C++] 'while' '(' condition ')' statement StmtResult Parser::ParseWhileStatement(SourceLocation *TrailingElseLoc) { assert(Tok.is(tok::kw_while) && "Not a while stmt!"); SourceLocation WhileLoc = Tok.getLocation(); @@ -1920,10 +1768,6 @@ StmtResult Parser::ParseWhileStatement(SourceLocation *TrailingElseLoc) { return Actions.ActOnWhileStmt(WhileLoc, LParen, Cond, RParen, Body.get()); } -/// ParseDoStatement -/// do-statement: [C99 6.8.5.2] -/// 'do' statement 'while' '(' expression ')' ';' -/// Note: this lets the caller parse the end ';'. StmtResult Parser::ParseDoStatement() { assert(Tok.is(tok::kw_do) && "Not a do stmt!"); SourceLocation DoLoc = ConsumeToken(); // eat the 'do'. @@ -2025,29 +1869,6 @@ bool Parser::isForRangeIdentifier() { return false; } -/// ParseForStatement -/// for-statement: [C99 6.8.5.3] -/// 'for' '(' expr[opt] ';' expr[opt] ';' expr[opt] ')' statement -/// 'for' '(' declaration expr[opt] ';' expr[opt] ')' statement -/// [C++] 'for' '(' for-init-statement condition[opt] ';' expression[opt] ')' -/// [C++] statement -/// [C++0x] 'for' -/// 'co_await'[opt] [Coroutines] -/// '(' for-range-declaration ':' for-range-initializer ')' -/// statement -/// [OBJC2] 'for' '(' declaration 'in' expr ')' statement -/// [OBJC2] 'for' '(' expr 'in' expr ')' statement -/// -/// [C++] for-init-statement: -/// [C++] expression-statement -/// [C++] simple-declaration -/// [C++23] alias-declaration -/// -/// [C++0x] for-range-declaration: -/// [C++0x] attribute-specifier-seq[opt] type-specifier-seq declarator -/// [C++0x] for-range-initializer: -/// [C++0x] expression -/// [C++0x] braced-init-list [TODO] StmtResult Parser::ParseForStatement(SourceLocation *TrailingElseLoc) { 
assert(Tok.is(tok::kw_for) && "Not a for stmt!"); SourceLocation ForLoc = ConsumeToken(); // eat the 'for'. @@ -2434,13 +2255,6 @@ StmtResult Parser::ParseForStatement(SourceLocation *TrailingElseLoc) { Body.get()); } -/// ParseGotoStatement -/// jump-statement: -/// 'goto' identifier ';' -/// [GNU] 'goto' '*' expression ';' -/// -/// Note: this lets the caller parse the end ';'. -/// StmtResult Parser::ParseGotoStatement() { assert(Tok.is(tok::kw_goto) && "Not a goto stmt!"); SourceLocation GotoLoc = ConsumeToken(); // eat the 'goto'. @@ -2469,34 +2283,16 @@ StmtResult Parser::ParseGotoStatement() { return Res; } -/// ParseContinueStatement -/// jump-statement: -/// 'continue' ';' -/// -/// Note: this lets the caller parse the end ';'. -/// StmtResult Parser::ParseContinueStatement() { SourceLocation ContinueLoc = ConsumeToken(); // eat the 'continue'. return Actions.ActOnContinueStmt(ContinueLoc, getCurScope()); } -/// ParseBreakStatement -/// jump-statement: -/// 'break' ';' -/// -/// Note: this lets the caller parse the end ';'. -/// StmtResult Parser::ParseBreakStatement() { SourceLocation BreakLoc = ConsumeToken(); // eat the 'break'. return Actions.ActOnBreakStmt(BreakLoc, getCurScope()); } -/// ParseReturnStatement -/// jump-statement: -/// 'return' expression[opt] ';' -/// 'return' braced-init-list ';' -/// 'co_return' expression[opt] ';' -/// 'co_return' braced-init-list ';' StmtResult Parser::ParseReturnStatement() { assert((Tok.is(tok::kw_return) || Tok.is(tok::kw_co_return)) && "Not a return stmt!"); @@ -2602,11 +2398,6 @@ Decl *Parser::ParseFunctionStatementBody(Decl *Decl, ParseScope &BodyScope) { return Actions.ActOnFinishFunctionBody(Decl, FnBody.get()); } -/// ParseFunctionTryBlock - Parse a C++ function-try-block. 
-/// -/// function-try-block: -/// 'try' ctor-initializer[opt] compound-statement handler-seq -/// Decl *Parser::ParseFunctionTryBlock(Decl *Decl, ParseScope &BodyScope) { assert(Tok.is(tok::kw_try) && "Expected 'try'"); SourceLocation TryLoc = ConsumeToken(); @@ -2679,11 +2470,6 @@ bool Parser::trySkippingFunctionBody() { return true; } -/// ParseCXXTryBlock - Parse a C++ try-block. -/// -/// try-block: -/// 'try' compound-statement handler-seq -/// StmtResult Parser::ParseCXXTryBlock() { assert(Tok.is(tok::kw_try) && "Expected 'try'"); @@ -2691,22 +2477,6 @@ StmtResult Parser::ParseCXXTryBlock() { return ParseCXXTryBlockCommon(TryLoc); } -/// ParseCXXTryBlockCommon - Parse the common part of try-block and -/// function-try-block. -/// -/// try-block: -/// 'try' compound-statement handler-seq -/// -/// function-try-block: -/// 'try' ctor-initializer[opt] compound-statement handler-seq -/// -/// handler-seq: -/// handler handler-seq[opt] -/// -/// [Borland] try-block: -/// 'try' compound-statement seh-except-block -/// 'try' compound-statement seh-finally-block -/// StmtResult Parser::ParseCXXTryBlockCommon(SourceLocation TryLoc, bool FnTry) { if (Tok.isNot(tok::l_brace)) return StmtError(Diag(Tok, diag::err_expected) << tok::l_brace); @@ -2764,16 +2534,6 @@ StmtResult Parser::ParseCXXTryBlockCommon(SourceLocation TryLoc, bool FnTry) { } } -/// ParseCXXCatchBlock - Parse a C++ catch block, called handler in the standard -/// -/// handler: -/// 'catch' '(' exception-declaration ')' compound-statement -/// -/// exception-declaration: -/// attribute-specifier-seq[opt] type-specifier-seq declarator -/// attribute-specifier-seq[opt] type-specifier-seq abstract-declarator[opt] -/// '...' 
-/// StmtResult Parser::ParseCXXCatchBlock(bool FnCatch) { assert(Tok.is(tok::kw_catch) && "Expected 'catch'"); diff --git a/clang/lib/Parse/ParseStmtAsm.cpp b/clang/lib/Parse/ParseStmtAsm.cpp index 264e8c73799c8..40983e5db6d5a 100644 --- a/clang/lib/Parse/ParseStmtAsm.cpp +++ b/clang/lib/Parse/ParseStmtAsm.cpp @@ -196,7 +196,6 @@ void ClangAsmParserCallback::handleDiagnostic(const llvm::SMDiagnostic &D) { TheParser.Diag(Loc, diag::err_inline_ms_asm_parsing) << D.getMessage(); } -/// Parse an identifier in an MS-style inline assembly block. ExprResult Parser::ParseMSAsmIdentifier(llvm::SmallVectorImpl &LineToks, unsigned &NumLineToksConsumed, bool IsUnevaluatedContext) { @@ -351,7 +350,6 @@ static bool buildMSAsmString(Preprocessor &PP, SourceLocation AsmLoc, return false; } -// Determine if this is a GCC-style asm statement. bool Parser::isGCCAsmStatement(const Token &TokAfterAsm) const { return TokAfterAsm.is(tok::l_paren) || isGNUAsmQualifier(TokAfterAsm); } @@ -360,21 +358,6 @@ bool Parser::isGNUAsmQualifier(const Token &TokAfterAsm) const { return getGNUAsmQualifier(TokAfterAsm) != GNUAsmQualifiers::AQ_unspecified; } -/// ParseMicrosoftAsmStatement. When -fms-extensions/-fasm-blocks is enabled, -/// this routine is called to collect the tokens for an MS asm statement. -/// -/// [MS] ms-asm-statement: -/// ms-asm-block -/// ms-asm-block ms-asm-statement -/// -/// [MS] ms-asm-block: -/// '__asm' ms-asm-line '\n' -/// '__asm' '{' ms-asm-instruction-block[opt] '}' ';'[opt] -/// -/// [MS] ms-asm-instruction-block -/// ms-asm-line -/// ms-asm-line '\n' ms-asm-instruction-block -/// StmtResult Parser::ParseMicrosoftAsmStatement(SourceLocation AsmLoc) { SourceManager &SrcMgr = PP.getSourceManager(); SourceLocation EndLoc = AsmLoc; @@ -671,15 +654,6 @@ StmtResult Parser::ParseMicrosoftAsmStatement(SourceLocation AsmLoc) { ClobberRefs, Exprs, EndLoc); } -/// parseGNUAsmQualifierListOpt - Parse a GNU extended asm qualifier list. 
-/// asm-qualifier: -/// volatile -/// inline -/// goto -/// -/// asm-qualifier-list: -/// asm-qualifier -/// asm-qualifier-list asm-qualifier bool Parser::parseGNUAsmQualifierListOpt(GNUAsmQualifiers &AQ) { while (true) { const GNUAsmQualifiers::AQ A = getGNUAsmQualifier(Tok); @@ -699,25 +673,6 @@ bool Parser::parseGNUAsmQualifierListOpt(GNUAsmQualifiers &AQ) { return false; } -/// ParseAsmStatement - Parse a GNU extended asm statement. -/// asm-statement: -/// gnu-asm-statement -/// ms-asm-statement -/// -/// [GNU] gnu-asm-statement: -/// 'asm' asm-qualifier-list[opt] '(' asm-argument ')' ';' -/// -/// [GNU] asm-argument: -/// asm-string-literal -/// asm-string-literal ':' asm-operands[opt] -/// asm-string-literal ':' asm-operands[opt] ':' asm-operands[opt] -/// asm-string-literal ':' asm-operands[opt] ':' asm-operands[opt] -/// ':' asm-clobbers -/// -/// [GNU] asm-clobbers: -/// asm-string-literal -/// asm-clobbers ',' asm-string-literal -/// StmtResult Parser::ParseAsmStatement(bool &msAsm) { assert(Tok.is(tok::kw_asm) && "Not an asm stmt"); SourceLocation AsmLoc = ConsumeToken(); @@ -868,19 +823,6 @@ StmtResult Parser::ParseAsmStatement(bool &msAsm) { T.getCloseLocation()); } -/// ParseAsmOperands - Parse the asm-operands production as used by -/// asm-statement, assuming the leading ':' token was eaten. -/// -/// [GNU] asm-operands: -/// asm-operand -/// asm-operands ',' asm-operand -/// -/// [GNU] asm-operand: -/// asm-string-literal '(' expression ')' -/// '[' identifier ']' asm-string-literal '(' expression ')' -/// -// -// FIXME: Avoid unnecessary std::string trashing. 
bool Parser::ParseAsmOperandsOpt(SmallVectorImpl &Names, SmallVectorImpl &Constraints, SmallVectorImpl &Exprs) { diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp index dbe5e94747c67..88a0079483d9b 100644 --- a/clang/lib/Parse/ParseTemplate.cpp +++ b/clang/lib/Parse/ParseTemplate.cpp @@ -24,9 +24,6 @@ #include "llvm/Support/TimeProfiler.h" using namespace clang; -/// Re-enter a possible template scope, creating as many template parameter -/// scopes as necessary. -/// \return The number of template parameter scopes entered. unsigned Parser::ReenterTemplateScopes(MultiParseScope &S, Decl *D) { return Actions.ActOnReenterTemplateScope(D, [&] { S.Enter(Scope::TemplateParamScope); @@ -34,8 +31,6 @@ unsigned Parser::ReenterTemplateScopes(MultiParseScope &S, Decl *D) { }); } -/// Parse a template declaration, explicit instantiation, or -/// explicit specialization. Parser::DeclGroupPtrTy Parser::ParseDeclarationStartingWithTemplate(DeclaratorContext Context, SourceLocation &DeclEnd, @@ -51,30 +46,6 @@ Parser::ParseDeclarationStartingWithTemplate(DeclaratorContext Context, AccessSpecifier::AS_none); } -/// Parse a template declaration or an explicit specialization. -/// -/// Template declarations include one or more template parameter lists -/// and either the function or class template declaration. Explicit -/// specializations contain one or more 'template < >' prefixes -/// followed by a (possibly templated) declaration. Since the -/// syntactic form of both features is nearly identical, we parse all -/// of the template headers together and let semantic analysis sort -/// the declarations from the explicit specializations. 
-/// -/// template-declaration: [C++ temp] -/// 'export'[opt] 'template' '<' template-parameter-list '>' declaration -/// -/// template-declaration: [C++2a] -/// template-head declaration -/// template-head concept-definition -/// -/// TODO: requires-clause -/// template-head: [C++2a] -/// 'template' '<' template-parameter-list '>' -/// requires-clause[opt] -/// -/// explicit-specialization: [ C++ temp.expl.spec] -/// 'template' '<' '>' declaration Parser::DeclGroupPtrTy Parser::ParseTemplateDeclarationOrSpecialization( DeclaratorContext Context, SourceLocation &DeclEnd, ParsedAttributes &AccessAttrs, AccessSpecifier AS) { @@ -186,16 +157,6 @@ Parser::DeclGroupPtrTy Parser::ParseTemplateDeclarationOrSpecialization( AS); } -/// Parse a single declaration that declares a template, -/// template specialization, or explicit instantiation of a template. -/// -/// \param DeclEnd will receive the source location of the last token -/// within this declaration. -/// -/// \param AS the access specifier associated with this -/// declaration. Will be AS_none for namespace-scope declarations. -/// -/// \returns the new declaration. Parser::DeclGroupPtrTy Parser::ParseDeclarationAfterTemplate( DeclaratorContext Context, ParsedTemplateInfo &TemplateInfo, ParsingDeclRAIIObject &DiagsFromTParams, SourceLocation &DeclEnd, @@ -269,12 +230,6 @@ Parser::DeclGroupPtrTy Parser::ParseDeclarationAfterTemplate( return ParseDeclGroup(DS, Context, DeclAttrs, TemplateInfo, &DeclEnd); } -/// \brief Parse a single declaration that declares a concept. -/// -/// \param DeclEnd will receive the source location of the last token -/// within this declaration. -/// -/// \returns the new declaration. Decl * Parser::ParseConceptDefinition(const ParsedTemplateInfo &TemplateInfo, SourceLocation &DeclEnd) { @@ -363,15 +318,6 @@ Parser::ParseConceptDefinition(const ParsedTemplateInfo &TemplateInfo, Attrs); } -/// ParseTemplateParameters - Parses a template-parameter-list enclosed in -/// angle brackets. 
Depth is the depth of this template-parameter-list, which -/// is the number of template headers directly enclosing this template header. -/// TemplateParams is the current list of template parameters we're building. -/// The template parameter we parse will be added to this list. LAngleLoc and -/// RAngleLoc will receive the positions of the '<' and '>', respectively, -/// that enclose this template parameter list. -/// -/// \returns true if an error occurred, false otherwise. bool Parser::ParseTemplateParameters( MultiParseScope &TemplateScopes, unsigned Depth, SmallVectorImpl &TemplateParams, SourceLocation &LAngleLoc, @@ -406,14 +352,6 @@ bool Parser::ParseTemplateParameters( return false; } -/// ParseTemplateParameterList - Parse a template parameter list. If -/// the parsing fails badly (i.e., closing bracket was left out), this -/// will try to put the token stream in a reasonable position (closing -/// a statement, etc.) and return false. -/// -/// template-parameter-list: [C++ temp] -/// template-parameter -/// template-parameter-list ',' template-parameter bool Parser::ParseTemplateParameterList(const unsigned Depth, SmallVectorImpl &TemplateParams) { @@ -448,8 +386,6 @@ Parser::ParseTemplateParameterList(const unsigned Depth, return true; } -/// Determine whether the parser is at the start of a template -/// type parameter. Parser::TPResult Parser::isStartOfTemplateTypeParameter() { if (Tok.is(tok::kw_class)) { // "class" may be the start of an elaborated-type-specifier or a @@ -531,26 +467,6 @@ Parser::TPResult Parser::isStartOfTemplateTypeParameter() { } } -/// ParseTemplateParameter - Parse a template-parameter (C++ [temp.param]). 
-/// -/// template-parameter: [C++ temp.param] -/// type-parameter -/// parameter-declaration -/// -/// type-parameter: (See below) -/// type-parameter-key ...[opt] identifier[opt] -/// type-parameter-key identifier[opt] = type-id -/// (C++2a) type-constraint ...[opt] identifier[opt] -/// (C++2a) type-constraint identifier[opt] = type-id -/// 'template' '<' template-parameter-list '>' type-parameter-key -/// ...[opt] identifier[opt] -/// 'template' '<' template-parameter-list '>' type-parameter-key -/// identifier[opt] '=' id-expression -/// -/// type-parameter-key: -/// class -/// typename -/// NamedDecl *Parser::ParseTemplateParameter(unsigned Depth, unsigned Position) { switch (isStartOfTemplateTypeParameter()) { @@ -608,8 +524,6 @@ NamedDecl *Parser::ParseTemplateParameter(unsigned Depth, unsigned Position) { return ParseNonTypeTemplateParameter(Depth, Position); } -/// Check whether the current token is a template-id annotation denoting a -/// type-constraint. bool Parser::isTypeConstraintAnnotation() { const Token &T = Tok.is(tok::annot_cxxscope) ? NextToken() : Tok; if (T.isNot(tok::annot_template_id)) @@ -619,14 +533,6 @@ bool Parser::isTypeConstraintAnnotation() { return ExistingAnnot->Kind == TNK_Concept_template; } -/// Try parsing a type-constraint at the current location. -/// -/// type-constraint: -/// nested-name-specifier[opt] concept-name -/// nested-name-specifier[opt] concept-name -/// '<' template-argument-list[opt] '>'[opt] -/// -/// \returns true if an error occurred, and false otherwise. bool Parser::TryAnnotateTypeConstraint() { if (!getLangOpts().CPlusPlus20) return false; @@ -685,15 +591,6 @@ bool Parser::TryAnnotateTypeConstraint() { return false; } -/// ParseTypeParameter - Parse a template type parameter (C++ [temp.param]). -/// Other kinds of template parameters are parsed in -/// ParseTemplateTemplateParameter and ParseNonTypeTemplateParameter. 
-/// -/// type-parameter: [C++ temp.param] -/// 'class' ...[opt][C++0x] identifier[opt] -/// 'class' identifier[opt] '=' type-id -/// 'typename' ...[opt][C++0x] identifier[opt] -/// 'typename' identifier[opt] '=' type-id NamedDecl *Parser::ParseTypeParameter(unsigned Depth, unsigned Position) { assert((Tok.isOneOf(tok::kw_class, tok::kw_typename) || isTypeConstraintAnnotation()) && @@ -790,18 +687,6 @@ NamedDecl *Parser::ParseTypeParameter(unsigned Depth, unsigned Position) { return NewDecl; } -/// ParseTemplateTemplateParameter - Handle the parsing of template -/// template parameters. -/// -/// type-parameter: [C++ temp.param] -/// template-head type-parameter-key ...[opt] identifier[opt] -/// template-head type-parameter-key identifier[opt] = id-expression -/// type-parameter-key: -/// 'class' -/// 'typename' [C++1z] -/// template-head: [C++2a] -/// 'template' '<' template-parameter-list '>' -/// requires-clause[opt] NamedDecl *Parser::ParseTemplateTemplateParameter(unsigned Depth, unsigned Position) { assert(Tok.is(tok::kw_template) && "Expected 'template' keyword"); @@ -914,12 +799,6 @@ NamedDecl *Parser::ParseTemplateTemplateParameter(unsigned Depth, ParamName, NameLoc, Depth, Position, EqualLoc, DefaultArg); } -/// ParseNonTypeTemplateParameter - Handle the parsing of non-type -/// template parameters (e.g., in "template class array;"). -/// -/// template-parameter: -/// ... -/// parameter-declaration NamedDecl * Parser::ParseNonTypeTemplateParameter(unsigned Depth, unsigned Position) { // Parse the declaration-specifiers (i.e., the type). @@ -1004,21 +883,6 @@ void Parser::DiagnoseMisplacedEllipsisInDeclarator(SourceLocation EllipsisLoc, AlreadyHasEllipsis, D.hasName()); } -/// Parses a '>' at the end of a template list. -/// -/// If this function encounters '>>', '>>>', '>=', or '>>=', it tries -/// to determine if these tokens were supposed to be a '>' followed by -/// '>', '>>', '>=', or '>='. It emits an appropriate diagnostic if necessary. 
-/// -/// \param RAngleLoc the location of the consumed '>'. -/// -/// \param ConsumeLastToken if true, the '>' is consumed. -/// -/// \param ObjCGenericList if true, this is the '>' closing an Objective-C -/// type parameter or type argument list, rather than a C++ template parameter -/// or argument list. -/// -/// \returns true, if current token does not start with '>', false otherwise. bool Parser::ParseGreaterThanInTemplateList(SourceLocation LAngleLoc, SourceLocation &RAngleLoc, bool ConsumeLastToken, @@ -1177,17 +1041,6 @@ bool Parser::ParseGreaterThanInTemplateList(SourceLocation LAngleLoc, return false; } -/// Parses a template-id that after the template name has -/// already been parsed. -/// -/// This routine takes care of parsing the enclosed template argument -/// list ('<' template-parameter-list [opt] '>') and placing the -/// results into a form that can be transferred to semantic analysis. -/// -/// \param ConsumeLastToken if true, then we will consume the last -/// token that forms the template-id. Otherwise, we will leave the -/// last token in the stream (e.g., so that it can be replaced with an -/// annotation token). bool Parser::ParseTemplateIdAfterTemplateName(bool ConsumeLastToken, SourceLocation &LAngleLoc, TemplateArgList &TemplateArgs, @@ -1222,47 +1075,6 @@ bool Parser::ParseTemplateIdAfterTemplateName(bool ConsumeLastToken, Invalid; } -/// Replace the tokens that form a simple-template-id with an -/// annotation token containing the complete template-id. -/// -/// The first token in the stream must be the name of a template that -/// is followed by a '<'. This routine will parse the complete -/// simple-template-id and replace the tokens with a single annotation -/// token with one of two different kinds: if the template-id names a -/// type (and \p AllowTypeAnnotation is true), the annotation token is -/// a type annotation that includes the optional nested-name-specifier -/// (\p SS). 
Otherwise, the annotation token is a template-id -/// annotation that does not include the optional -/// nested-name-specifier. -/// -/// \param Template the declaration of the template named by the first -/// token (an identifier), as returned from \c Action::isTemplateName(). -/// -/// \param TNK the kind of template that \p Template -/// refers to, as returned from \c Action::isTemplateName(). -/// -/// \param SS if non-NULL, the nested-name-specifier that precedes -/// this template name. -/// -/// \param TemplateKWLoc if valid, specifies that this template-id -/// annotation was preceded by the 'template' keyword and gives the -/// location of that keyword. If invalid (the default), then this -/// template-id was not preceded by a 'template' keyword. -/// -/// \param AllowTypeAnnotation if true (the default), then a -/// simple-template-id that refers to a class template, template -/// template parameter, or other template that produces a type will be -/// replaced with a type annotation token. Otherwise, the -/// simple-template-id is always replaced with a template-id -/// annotation token. -/// -/// \param TypeConstraint if true, then this is actually a type-constraint, -/// meaning that the template argument list can be omitted (and the template in -/// question must be a concept). -/// -/// If an unrecoverable parse error occurs and no annotation token can be -/// formed, this function returns true. -/// bool Parser::AnnotateTemplateIdToken(TemplateTy Template, TemplateNameKind TNK, CXXScopeSpec &SS, SourceLocation TemplateKWLoc, @@ -1350,21 +1162,6 @@ bool Parser::AnnotateTemplateIdToken(TemplateTy Template, TemplateNameKind TNK, return false; } -/// Replaces a template-id annotation token with a type -/// annotation token. -/// -/// If there was a failure when forming the type from the template-id, -/// a type annotation token will still be created, but will have a -/// NULL type pointer to signify an error. 
-/// -/// \param SS The scope specifier appearing before the template-id, if any. -/// -/// \param AllowImplicitTypename whether this is a context where T::type -/// denotes a dependent type. -/// \param IsClassName Is this template-id appearing in a context where we -/// know it names a class, such as in an elaborated-type-specifier or -/// base-specifier? ('typename' and 'template' are unneeded and disallowed -/// in those contexts.) void Parser::AnnotateTemplateIdTokenAsType( CXXScopeSpec &SS, ImplicitTypenameContext AllowImplicitTypename, bool IsClassName) { @@ -1405,7 +1202,6 @@ static bool isEndOfTemplateArgument(Token Tok) { tok::greatergreatergreater); } -/// Parse a C++ template template argument. ParsedTemplateArgument Parser::ParseTemplateTemplateArgument() { if (!Tok.is(tok::identifier) && !Tok.is(tok::coloncolon) && !Tok.is(tok::annot_cxxscope)) @@ -1483,14 +1279,6 @@ ParsedTemplateArgument Parser::ParseTemplateTemplateArgument() { return Result; } -/// ParseTemplateArgument - Parse a C++ template argument (C++ [temp.names]). -/// -/// template-argument: [C++ 14.2] -/// constant-expression -/// type-id -/// id-expression -/// braced-init-list [C++26, DR] -/// ParsedTemplateArgument Parser::ParseTemplateArgument() { // C++ [temp.arg]p2: // In a template-argument, an ambiguity between a type-id and an @@ -1543,14 +1331,6 @@ ParsedTemplateArgument Parser::ParseTemplateArgument() { ExprArg.get(), Loc); } -/// ParseTemplateArgumentList - Parse a C++ template-argument-list -/// (C++ [temp.names]). Returns true if there was an error. -/// -/// template-argument-list: [C++ 14.2] -/// template-argument -/// template-argument-list ',' template-argument -/// -/// \param Template is only used for code completion, and may be null. 
bool Parser::ParseTemplateArgumentList(TemplateArgList &TemplateArgs, TemplateTy Template, SourceLocation OpenLoc) { @@ -1588,13 +1368,6 @@ bool Parser::ParseTemplateArgumentList(TemplateArgList &TemplateArgs, return false; } -/// Parse a C++ explicit template instantiation -/// (C++ [temp.explicit]). -/// -/// explicit-instantiation: -/// 'extern' [opt] 'template' declaration -/// -/// Note that the 'extern' is a GNU extension and C++11 feature. Parser::DeclGroupPtrTy Parser::ParseExplicitInstantiation( DeclaratorContext Context, SourceLocation ExternLoc, SourceLocation TemplateLoc, SourceLocation &DeclEnd, @@ -1622,7 +1395,6 @@ void Parser::LateTemplateParserCallback(void *P, LateParsedTemplate &LPT) { ((Parser *)P)->ParseLateTemplatedFuncDef(LPT); } -/// Late parse a C++ function template in Microsoft mode. void Parser::ParseLateTemplatedFuncDef(LateParsedTemplate &LPT) { if (!LPT.D) return; @@ -1706,7 +1478,6 @@ void Parser::ParseLateTemplatedFuncDef(LateParsedTemplate &LPT) { } } -/// Lex a delayed template function for late parsing. void Parser::LexTemplateFunctionForLateParsing(CachedTokens &Toks) { tok::TokenKind kind = Tok.getKind(); if (!ConsumeAndStoreFunctionPrologue(Toks)) { @@ -1723,10 +1494,6 @@ void Parser::LexTemplateFunctionForLateParsing(CachedTokens &Toks) { } } -/// We've parsed something that could plausibly be intended to be a template -/// name (\p LHS) followed by a '<' token, and the following code can't possibly -/// be an expression. Determine if this is likely to be a template-id and if so, -/// diagnose it. bool Parser::diagnoseUnknownTemplateId(ExprResult LHS, SourceLocation Less) { TentativeParsingAction TPA(*this); // FIXME: We could look at the token sequence in a lot more detail here. 
diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp index fcd76c75c9bfb..cc02ee51618aa 100644 --- a/clang/lib/Parse/ParseTentative.cpp +++ b/clang/lib/Parse/ParseTentative.cpp @@ -16,36 +16,6 @@ #include "clang/Sema/ParsedTemplate.h" using namespace clang; -/// isCXXDeclarationStatement - C++-specialized function that disambiguates -/// between a declaration or an expression statement, when parsing function -/// bodies. Returns true for declaration, false for expression. -/// -/// declaration-statement: -/// block-declaration -/// -/// block-declaration: -/// simple-declaration -/// asm-definition -/// namespace-alias-definition -/// using-declaration -/// using-directive -/// [C++0x] static_assert-declaration -/// -/// asm-definition: -/// 'asm' '(' string-literal ')' ';' -/// -/// namespace-alias-definition: -/// 'namespace' identifier = qualified-namespace-specifier ';' -/// -/// using-declaration: -/// 'using' typename[opt] '::'[opt] nested-name-specifier -/// unqualified-id ';' -/// 'using' '::' unqualified-id ; -/// -/// using-directive: -/// 'using' 'namespace' '::'[opt] nested-name-specifier[opt] -/// namespace-name ';' -/// bool Parser::isCXXDeclarationStatement( bool DisambiguatingWithExpression /*=false*/) { assert(getLangOpts().CPlusPlus && "Must be called for C++ only."); @@ -113,26 +83,6 @@ bool Parser::isCXXDeclarationStatement( } } -/// isCXXSimpleDeclaration - C++-specialized function that disambiguates -/// between a simple-declaration or an expression-statement. -/// If during the disambiguation process a parsing error is encountered, -/// the function returns true to let the declaration parsing code handle it. -/// Returns false if the statement is disambiguated as expression. 
-/// -/// simple-declaration: -/// decl-specifier-seq init-declarator-list[opt] ';' -/// decl-specifier-seq ref-qualifier[opt] '[' identifier-list ']' -/// brace-or-equal-initializer ';' [C++17] -/// -/// (if AllowForRangeDecl specified) -/// for ( for-range-declaration : for-range-initializer ) statement -/// -/// for-range-declaration: -/// decl-specifier-seq declarator -/// decl-specifier-seq ref-qualifier[opt] '[' identifier-list ']' -/// -/// In any of the above cases there can be a preceding attribute-specifier-seq, -/// but the caller is expected to handle that. bool Parser::isCXXSimpleDeclaration(bool AllowForRangeDecl) { // C++ 6.8p1: // There is an ambiguity in the grammar involving expression-statements and @@ -197,8 +147,6 @@ bool Parser::isCXXSimpleDeclaration(bool AllowForRangeDecl) { return TPR == TPResult::True; } -/// Try to consume a token sequence that we've already identified as -/// (potentially) starting a decl-specifier. Parser::TPResult Parser::TryConsumeDeclarationSpecifier() { switch (Tok.getKind()) { case tok::kw__Atomic: @@ -265,14 +213,6 @@ Parser::TPResult Parser::TryConsumeDeclarationSpecifier() { return TPResult::Ambiguous; } -/// simple-declaration: -/// decl-specifier-seq init-declarator-list[opt] ';' -/// -/// (if AllowForRangeDecl specified) -/// for ( for-range-declaration : for-range-initializer ) statement -/// for-range-declaration: -/// attribute-specifier-seqopt type-specifier-seq declarator -/// Parser::TPResult Parser::TryParseSimpleDeclaration(bool AllowForRangeDecl) { bool DeclSpecifierIsAuto = Tok.is(tok::kw_auto); if (TryConsumeDeclarationSpecifier() == TPResult::Error) @@ -301,33 +241,6 @@ Parser::TPResult Parser::TryParseSimpleDeclaration(bool AllowForRangeDecl) { return TPResult::Ambiguous; } -/// Tentatively parse an init-declarator-list in order to disambiguate it from -/// an expression. 
-/// -/// init-declarator-list: -/// init-declarator -/// init-declarator-list ',' init-declarator -/// -/// init-declarator: -/// declarator initializer[opt] -/// [GNU] declarator simple-asm-expr[opt] attributes[opt] initializer[opt] -/// -/// initializer: -/// brace-or-equal-initializer -/// '(' expression-list ')' -/// -/// brace-or-equal-initializer: -/// '=' initializer-clause -/// [C++11] braced-init-list -/// -/// initializer-clause: -/// assignment-expression -/// braced-init-list -/// -/// braced-init-list: -/// '{' initializer-list ','[opt] '}' -/// '{' '}' -/// Parser::TPResult Parser::TryParseInitDeclaratorList(bool MayHaveTrailingReturnType) { while (true) { @@ -519,23 +432,6 @@ bool Parser::isEnumBase(bool AllowSemi) { return R != TPResult::False; } -/// Disambiguates between a declaration in a condition, a -/// simple-declaration in an init-statement, and an expression for -/// a condition of a if/switch statement. -/// -/// condition: -/// expression -/// type-specifier-seq declarator '=' assignment-expression -/// [C++11] type-specifier-seq declarator '=' initializer-clause -/// [C++11] type-specifier-seq declarator braced-init-list -/// [GNU] type-specifier-seq declarator simple-asm-expr[opt] attributes[opt] -/// '=' assignment-expression -/// simple-declaration: -/// decl-specifier-seq init-declarator-list[opt] ';' -/// -/// Note that, unlike isCXXSimpleDeclaration, we must disambiguate all the way -/// to the ';' to disambiguate cases like 'int(x))' (an expression) from -/// 'int(x);' (a simple-declaration in an init-statement). Parser::ConditionOrInitStatement Parser::isCXXConditionDeclarationOrInitStatement(bool CanBeInitStatement, bool CanBeForRangeDecl) { @@ -607,23 +503,6 @@ Parser::isCXXConditionDeclarationOrInitStatement(bool CanBeInitStatement, return ConditionOrInitStatement::Expression; } -/// Determine whether the next set of tokens contains a type-id. 
-/// -/// The context parameter states what context we're parsing right -/// now, which affects how this routine copes with the token -/// following the type-id. If the context is -/// TentativeCXXTypeIdContext::InParens, we have already parsed the '(' and we -/// will cease lookahead when we hit the corresponding ')'. If the context is -/// TentativeCXXTypeIdContext::AsTemplateArgument, we've already parsed the '<' -/// or ',' before this template argument, and will cease lookahead when we hit a -/// '>', '>>' (in C++0x), or ','; or, in C++0x, an ellipsis immediately -/// preceding such. Returns true for a type-id and false for an expression. -/// If during the disambiguation process a parsing error is encountered, -/// the function returns true to let the declaration parsing code handle it. -/// -/// type-id: -/// type-specifier-seq abstract-declarator[opt] -/// bool Parser::isCXXTypeId(TentativeCXXTypeIdContext Context, bool &isAmbiguous) { isAmbiguous = false; @@ -704,40 +583,6 @@ bool Parser::isCXXTypeId(TentativeCXXTypeIdContext Context, bool &isAmbiguous) { return TPR == TPResult::True; } -/// Returns true if this is a C++11 attribute-specifier. Per -/// C++11 [dcl.attr.grammar]p6, two consecutive left square bracket tokens -/// always introduce an attribute. In Objective-C++11, this rule does not -/// apply if either '[' begins a message-send. -/// -/// If Disambiguate is true, we try harder to determine whether a '[[' starts -/// an attribute-specifier, and return -/// CXX11AttributeKind::InvalidAttributeSpecifier if not. -/// -/// If OuterMightBeMessageSend is true, we assume the outer '[' is either an -/// Obj-C message send or the start of an attribute. Otherwise, we assume it -/// is not an Obj-C message send. -/// -/// C++11 [dcl.attr.grammar]: -/// -/// attribute-specifier: -/// '[' '[' attribute-list ']' ']' -/// alignment-specifier -/// -/// attribute-list: -/// attribute[opt] -/// attribute-list ',' attribute[opt] -/// attribute '...' 
-/// attribute-list ',' attribute '...' -/// -/// attribute: -/// attribute-token attribute-argument-clause[opt] -/// -/// attribute-token: -/// identifier -/// identifier '::' identifier -/// -/// attribute-argument-clause: -/// '(' balanced-token-seq ')' CXX11AttributeKind Parser::isCXX11AttributeSpecifier(bool Disambiguate, bool OuterMightBeMessageSend) { @@ -941,24 +786,6 @@ Parser::TPResult Parser::TryParsePtrOperatorSeq() { } } -/// operator-function-id: -/// 'operator' operator -/// -/// operator: one of -/// new delete new[] delete[] + - * / % ^ [...] -/// -/// conversion-function-id: -/// 'operator' conversion-type-id -/// -/// conversion-type-id: -/// type-specifier-seq conversion-declarator[opt] -/// -/// conversion-declarator: -/// ptr-operator conversion-declarator[opt] -/// -/// literal-operator-id: -/// 'operator' string-literal identifier -/// 'operator' user-defined-string-literal Parser::TPResult Parser::TryParseOperatorId() { assert(Tok.is(tok::kw_operator)); ConsumeToken(); @@ -1035,59 +862,6 @@ Parser::TPResult Parser::TryParseOperatorId() { return TryParsePtrOperatorSeq(); } -/// declarator: -/// direct-declarator -/// ptr-operator declarator -/// -/// direct-declarator: -/// declarator-id -/// direct-declarator '(' parameter-declaration-clause ')' -/// cv-qualifier-seq[opt] exception-specification[opt] -/// direct-declarator '[' constant-expression[opt] ']' -/// '(' declarator ')' -/// [GNU] '(' attributes declarator ')' -/// -/// abstract-declarator: -/// ptr-operator abstract-declarator[opt] -/// direct-abstract-declarator -/// -/// direct-abstract-declarator: -/// direct-abstract-declarator[opt] -/// '(' parameter-declaration-clause ')' cv-qualifier-seq[opt] -/// exception-specification[opt] -/// direct-abstract-declarator[opt] '[' constant-expression[opt] ']' -/// '(' abstract-declarator ')' -/// [C++0x] ... 
-/// -/// ptr-operator: -/// '*' cv-qualifier-seq[opt] -/// '&' -/// [C++0x] '&&' [TODO] -/// '::'[opt] nested-name-specifier '*' cv-qualifier-seq[opt] -/// -/// cv-qualifier-seq: -/// cv-qualifier cv-qualifier-seq[opt] -/// -/// cv-qualifier: -/// 'const' -/// 'volatile' -/// -/// declarator-id: -/// '...'[opt] id-expression -/// -/// id-expression: -/// unqualified-id -/// qualified-id [TODO] -/// -/// unqualified-id: -/// identifier -/// operator-function-id -/// conversion-function-id -/// literal-operator-id -/// '~' class-name [TODO] -/// '~' decltype-specifier [TODO] -/// template-id [TODO] -/// Parser::TPResult Parser::TryParseDeclarator(bool mayBeAbstract, bool mayHaveIdentifier, bool mayHaveDirectInit, @@ -1222,118 +996,7 @@ class TentativeParseCCC final : public CorrectionCandidateCallback { } }; } -/// isCXXDeclarationSpecifier - Returns TPResult::True if it is a declaration -/// specifier, TPResult::False if it is not, TPResult::Ambiguous if it could -/// be either a decl-specifier or a function-style cast, and TPResult::Error -/// if a parsing error was found and reported. -/// -/// If InvalidAsDeclSpec is not null, some cases that would be ill-formed as -/// declaration specifiers but possibly valid as some other kind of construct -/// return TPResult::Ambiguous instead of TPResult::False. When this happens, -/// the intent is to keep trying to disambiguate, on the basis that we might -/// find a better reason to treat this construct as a declaration later on. -/// When this happens and the name could possibly be valid in some other -/// syntactic context, *InvalidAsDeclSpec is set to 'true'. 
The current cases -/// that trigger this are: -/// -/// * When parsing X::Y (with no 'typename') where X is dependent -/// * When parsing X where X is undeclared -/// -/// decl-specifier: -/// storage-class-specifier -/// type-specifier -/// function-specifier -/// 'friend' -/// 'typedef' -/// [C++11] 'constexpr' -/// [C++20] 'consteval' -/// [GNU] attributes declaration-specifiers[opt] -/// -/// storage-class-specifier: -/// 'register' -/// 'static' -/// 'extern' -/// 'mutable' -/// 'auto' -/// [GNU] '__thread' -/// [C++11] 'thread_local' -/// [C11] '_Thread_local' -/// -/// function-specifier: -/// 'inline' -/// 'virtual' -/// 'explicit' -/// -/// typedef-name: -/// identifier -/// -/// type-specifier: -/// simple-type-specifier -/// class-specifier -/// enum-specifier -/// elaborated-type-specifier -/// typename-specifier -/// cv-qualifier -/// -/// simple-type-specifier: -/// '::'[opt] nested-name-specifier[opt] type-name -/// '::'[opt] nested-name-specifier 'template' -/// simple-template-id [TODO] -/// 'char' -/// 'wchar_t' -/// 'bool' -/// 'short' -/// 'int' -/// 'long' -/// 'signed' -/// 'unsigned' -/// 'float' -/// 'double' -/// 'void' -/// [GNU] typeof-specifier -/// [GNU] '_Complex' -/// [C++11] 'auto' -/// [GNU] '__auto_type' -/// [C++11] 'decltype' ( expression ) -/// [C++1y] 'decltype' ( 'auto' ) -/// -/// type-name: -/// class-name -/// enum-name -/// typedef-name -/// -/// elaborated-type-specifier: -/// class-key '::'[opt] nested-name-specifier[opt] identifier -/// class-key '::'[opt] nested-name-specifier[opt] 'template'[opt] -/// simple-template-id -/// 'enum' '::'[opt] nested-name-specifier[opt] identifier -/// -/// enum-name: -/// identifier -/// -/// enum-specifier: -/// 'enum' identifier[opt] '{' enumerator-list[opt] '}' -/// 'enum' identifier[opt] '{' enumerator-list ',' '}' -/// -/// class-specifier: -/// class-head '{' member-specification[opt] '}' -/// -/// class-head: -/// class-key identifier[opt] base-clause[opt] -/// class-key 
nested-name-specifier identifier base-clause[opt] -/// class-key nested-name-specifier[opt] simple-template-id -/// base-clause[opt] -/// -/// class-key: -/// 'class' -/// 'struct' -/// 'union' -/// -/// cv-qualifier: -/// 'const' -/// 'volatile' -/// [GNU] restrict -/// + Parser::TPResult Parser::isCXXDeclarationSpecifier(ImplicitTypenameContext AllowImplicitTypename, Parser::TPResult BracedCastResult, @@ -1956,10 +1619,6 @@ bool Parser::isCXXDeclarationSpecifierAType() { } } -/// [GNU] typeof-specifier: -/// 'typeof' '(' expressions ')' -/// 'typeof' '(' type-name ')' -/// Parser::TPResult Parser::TryParseTypeofSpecifier() { assert(Tok.is(tok::kw_typeof) && "Expected 'typeof'!"); ConsumeToken(); @@ -1973,8 +1632,6 @@ Parser::TPResult Parser::TryParseTypeofSpecifier() { return TPResult::Ambiguous; } -/// [ObjC] protocol-qualifiers: -//// '<' identifier-list '>' Parser::TPResult Parser::TryParseProtocolQualifiers() { assert(Tok.is(tok::less) && "Expected '<' for qualifier list"); ConsumeToken(); @@ -1997,16 +1654,6 @@ Parser::TPResult Parser::TryParseProtocolQualifiers() { return TPResult::Error; } -/// isCXXFunctionDeclarator - Disambiguates between a function declarator or -/// a constructor-style initializer, when parsing declaration statements. -/// Returns true for function declarator and false for constructor-style -/// initializer. -/// If during the disambiguation process a parsing error is encountered, -/// the function returns true to let the declaration parsing code handle it. -/// -/// '(' parameter-declaration-clause ')' cv-qualifier-seq[opt] -/// exception-specification[opt] -/// bool Parser::isCXXFunctionDeclarator( bool *IsAmbiguous, ImplicitTypenameContext AllowImplicitTypename) { @@ -2052,23 +1699,6 @@ bool Parser::isCXXFunctionDeclarator( return TPR != TPResult::False; } -/// parameter-declaration-clause: -/// parameter-declaration-list[opt] '...'[opt] -/// parameter-declaration-list ',' '...' 
-/// -/// parameter-declaration-list: -/// parameter-declaration -/// parameter-declaration-list ',' parameter-declaration -/// -/// parameter-declaration: -/// attribute-specifier-seq[opt] decl-specifier-seq declarator attributes[opt] -/// attribute-specifier-seq[opt] decl-specifier-seq declarator attributes[opt] -/// '=' assignment-expression -/// attribute-specifier-seq[opt] decl-specifier-seq abstract-declarator[opt] -/// attributes[opt] -/// attribute-specifier-seq[opt] decl-specifier-seq abstract-declarator[opt] -/// attributes[opt] '=' assignment-expression -/// Parser::TPResult Parser::TryParseParameterDeclarationClause( bool *InvalidAsDeclaration, bool VersusTemplateArgument, ImplicitTypenameContext AllowImplicitTypename) { @@ -2182,18 +1812,6 @@ Parser::TPResult Parser::TryParseParameterDeclarationClause( return TPResult::Ambiguous; } -/// TryParseFunctionDeclarator - We parsed a '(' and we want to try to continue -/// parsing as a function declarator. -/// If TryParseFunctionDeclarator fully parsed the function declarator, it will -/// return TPResult::Ambiguous, otherwise it will return either False() or -/// Error(). -/// -/// '(' parameter-declaration-clause ')' cv-qualifier-seq[opt] -/// exception-specification[opt] -/// -/// exception-specification: -/// 'throw' '(' type-id-list[opt] ')' -/// Parser::TPResult Parser::TryParseFunctionDeclarator(bool MayHaveTrailingReturnType) { // The '(' is already parsed. @@ -2259,10 +1877,6 @@ Parser::TryParseFunctionDeclarator(bool MayHaveTrailingReturnType) { return TPResult::Ambiguous; } -// When parsing an identifier after an arrow it may be a member expression, -// in which case we should not annotate it as an independant expression -// so we just lookup that name, if it's not a type the construct is not -// a function declaration. 
bool Parser::NameAfterArrowIsNonType() { assert(Tok.is(tok::identifier)); Token Next = NextToken(); @@ -2286,8 +1900,6 @@ bool Parser::NameAfterArrowIsNonType() { return false; } -/// '[' constant-expression[opt] ']' -/// Parser::TPResult Parser::TryParseBracketDeclarator() { ConsumeBracket(); @@ -2308,10 +1920,6 @@ Parser::TPResult Parser::TryParseBracketDeclarator() { return TPResult::Ambiguous; } -/// Determine whether we might be looking at the '<' template-argument-list '>' -/// of a template-id or simple-template-id, rather than a less-than comparison. -/// This will often fail and produce an ambiguity, but should never be wrong -/// if it returns True or False. Parser::TPResult Parser::isTemplateArgumentList(unsigned TokensToSkip) { if (!TokensToSkip) { if (Tok.isNot(tok::less)) @@ -2359,8 +1967,6 @@ Parser::TPResult Parser::isTemplateArgumentList(unsigned TokensToSkip) { return TPResult::False; } -/// Determine whether we might be looking at the '(' of a C++20 explicit(bool) -/// in an earlier language mode. 
Parser::TPResult Parser::isExplicitBool() { assert(Tok.is(tok::l_paren) && "expected to be looking at a '(' token"); diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 9117971ce212f..55a768580d393 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -59,8 +59,8 @@ Parser::Parser(Preprocessor &pp, Sema &actions, bool skipFunctionBodies) PreferredType(&actions.getASTContext(), pp.isCodeCompletionEnabled()), Actions(actions), Diags(PP.getDiagnostics()), StackHandler(Diags), GreaterThanIsOperator(true), ColonIsSacred(false), - InMessageExpression(false), TemplateParameterDepth(0), - ParsingInObjCContainer(false) { + InMessageExpression(false), ParsingInObjCContainer(false), + TemplateParameterDepth(0) { SkipFunctionBodies = pp.isCodeCompletionEnabled() || skipFunctionBodies; Tok.startToken(); Tok.setKind(tok::eof); @@ -101,12 +101,6 @@ DiagnosticBuilder Parser::DiagCompat(const Token &Tok, unsigned CompatDiagId) { return DiagCompat(Tok.getLocation(), CompatDiagId); } -/// Emits a diagnostic suggesting parentheses surrounding a -/// given range. -/// -/// \param Loc The location where we'll emit the diagnostic. -/// \param DK The kind of diagnostic to emit. -/// \param ParenRange Source range enclosing code that should be parenthesized. void Parser::SuggestParentheses(SourceLocation Loc, unsigned DK, SourceRange ParenRange) { SourceLocation EndLoc = PP.getLocForEndOfToken(ParenRange.getEnd()); @@ -291,14 +285,6 @@ static bool HasFlagsSet(Parser::SkipUntilFlags L, Parser::SkipUntilFlags R) { return (static_cast(L) & static_cast(R)) != 0; } -/// SkipUntil - Read tokens until we get to the specified token, then consume -/// it (unless no flag StopBeforeMatch). Because we cannot guarantee that the -/// token will ever occur, this skips to the next token, or to some likely -/// good stopping point. If StopAtSemi is true, skipping will stop at a ';' -/// character. 
-/// -/// If SkipUntil finds the specified token, it returns true, otherwise it -/// returns false. bool Parser::SkipUntil(ArrayRef Toks, SkipUntilFlags Flags) { // We always want this function to skip at least one token if the first token // isn't T and if not at EOF. @@ -432,7 +418,6 @@ bool Parser::SkipUntil(ArrayRef Toks, SkipUntilFlags Flags) { // Scope manipulation //===----------------------------------------------------------------------===// -/// EnterScope - Start a new scope. void Parser::EnterScope(unsigned ScopeFlags) { if (NumCachedScopes) { Scope *N = ScopeCache[--NumCachedScopes]; @@ -443,7 +428,6 @@ void Parser::EnterScope(unsigned ScopeFlags) { } } -/// ExitScope - Pop a scope off the scope stack. void Parser::ExitScope() { assert(getCurScope() && "Scope imbalance!"); @@ -460,8 +444,6 @@ void Parser::ExitScope() { ScopeCache[NumCachedScopes++] = OldScope; } -/// Set the flags for the current scope to ScopeFlags. If ManageFlags is false, -/// this object does nothing. Parser::ParseScopeFlags::ParseScopeFlags(Parser *Self, unsigned ScopeFlags, bool ManageFlags) : CurScope(ManageFlags ? Self->getCurScope() : nullptr) { @@ -471,8 +453,6 @@ Parser::ParseScopeFlags::ParseScopeFlags(Parser *Self, unsigned ScopeFlags, } } -/// Restore the flags for the current scope to what they were before this -/// object overrode them. Parser::ParseScopeFlags::~ParseScopeFlags() { if (CurScope) CurScope->setFlags(OldFlags); @@ -501,8 +481,6 @@ Parser::~Parser() { DestroyTemplateIds(); } -/// Initialize - Warm up the parser. -/// void Parser::Initialize() { // Create the translation unit scope. Install it as the current scope. assert(getCurScope() == nullptr && "A scope is already active?"); @@ -614,16 +592,6 @@ void Parser::DestroyTemplateIds() { TemplateIds.clear(); } -/// Parse the first top-level declaration in a translation unit. 
-/// -/// translation-unit: -/// [C] external-declaration -/// [C] translation-unit external-declaration -/// [C++] top-level-declaration-seq[opt] -/// [C++20] global-module-fragment[opt] module-declaration -/// top-level-declaration-seq[opt] private-module-fragment[opt] -/// -/// Note that in C, it is an error if there is no first declaration. bool Parser::ParseFirstTopLevelDecl(DeclGroupPtrTy &Result, Sema::ModuleImportState &ImportState) { Actions.ActOnStartOfTranslationUnit(); @@ -645,12 +613,6 @@ bool Parser::ParseFirstTopLevelDecl(DeclGroupPtrTy &Result, return NoTopLevelDecls; } -/// ParseTopLevelDecl - Parse one top-level declaration, return whatever the -/// action tells us to. This returns true if the EOF was encountered. -/// -/// top-level-declaration: -/// declaration -/// [C++20] module-import-declaration bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, Sema::ModuleImportState &ImportState) { DestroyTemplateIdAnnotationsRAIIObj CleanupRAII(*this); @@ -799,35 +761,6 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, return false; } -/// ParseExternalDeclaration: -/// -/// The `Attrs` that are passed in are C++11 attributes and appertain to the -/// declaration. 
-/// -/// external-declaration: [C99 6.9], declaration: [C++ dcl.dcl] -/// function-definition -/// declaration -/// [GNU] asm-definition -/// [GNU] __extension__ external-declaration -/// [OBJC] objc-class-definition -/// [OBJC] objc-class-declaration -/// [OBJC] objc-alias-declaration -/// [OBJC] objc-protocol-definition -/// [OBJC] objc-method-definition -/// [OBJC] @end -/// [C++] linkage-specification -/// [GNU] asm-definition: -/// simple-asm-expr ';' -/// [C++11] empty-declaration -/// [C++11] attribute-declaration -/// -/// [C++11] empty-declaration: -/// ';' -/// -/// [C++0x/GNU] 'extern' 'template' declaration -/// -/// [C++20] module-import-declaration -/// Parser::DeclGroupPtrTy Parser::ParseExternalDeclaration(ParsedAttributes &Attrs, ParsedAttributes &DeclSpecAttrs, @@ -1105,8 +1038,6 @@ Parser::ParseExternalDeclaration(ParsedAttributes &Attrs, return Actions.ConvertDeclToDeclGroup(SingleDecl); } -/// Determine whether the current token, if it occurs after a -/// declarator, continues a declaration or declaration list. bool Parser::isDeclarationAfterDeclarator() { // Check for '= delete' or '= default' if (getLangOpts().CPlusPlus && Tok.is(tok::equal)) { @@ -1124,8 +1055,6 @@ bool Parser::isDeclarationAfterDeclarator() { Tok.is(tok::l_paren)); // int X(0) -> not a function def [C++] } -/// Determine whether the current token, if it occurs after a -/// declarator, indicates the start of a function definition. bool Parser::isStartOfFunctionDefinition(const ParsingDeclarator &Declarator) { assert(Declarator.isFunctionDeclarator() && "Isn't a function declarator"); if (Tok.is(tok::l_brace)) // int X() {} @@ -1145,22 +1074,6 @@ bool Parser::isStartOfFunctionDefinition(const ParsingDeclarator &Declarator) { Tok.is(tok::kw_try); // X() try { ... } } -/// Parse either a function-definition or a declaration. We can't tell which -/// we have until we read up to the compound-statement in function-definition. 
-/// TemplateParams, if non-NULL, provides the template parameters when we're -/// parsing a C++ template-declaration. -/// -/// function-definition: [C99 6.9.1] -/// decl-specs declarator declaration-list[opt] compound-statement -/// [C90] function-definition: [C99 6.7.1] - implicit int result -/// [C90] decl-specs[opt] declarator declaration-list[opt] compound-statement -/// -/// declaration: [C99 6.7] -/// declaration-specifiers init-declarator-list[opt] ';' -/// [!C99] init-declarator-list ';' [TODO: warn in c99 mode] -/// [OMP] threadprivate-directive -/// [OMP] allocate-directive [TODO] -/// Parser::DeclGroupPtrTy Parser::ParseDeclOrFunctionDefInternal( ParsedAttributes &Attrs, ParsedAttributes &DeclSpecAttrs, ParsingDeclSpec &DS, AccessSpecifier AS) { @@ -1298,20 +1211,6 @@ Parser::DeclGroupPtrTy Parser::ParseDeclarationOrFunctionDefinition( } } -/// ParseFunctionDefinition - We parsed and verified that the specified -/// Declarator is well formed. If this is a K&R-style function, read the -/// parameters declaration-list, then start the compound-statement. -/// -/// function-definition: [C99 6.9.1] -/// decl-specs declarator declaration-list[opt] compound-statement -/// [C90] function-definition: [C99 6.7.1] - implicit int result -/// [C90] decl-specs[opt] declarator declaration-list[opt] compound-statement -/// [C++] function-definition: [C++ 8.4] -/// decl-specifier-seq[opt] declarator ctor-initializer[opt] -/// function-body -/// [C++] function-definition: [C++ 8.4] -/// decl-specifier-seq[opt] declarator function-try-block -/// Decl *Parser::ParseFunctionDefinition(ParsingDeclarator &D, const ParsedTemplateInfo &TemplateInfo, LateParsedAttrList *LateParsedAttrs) { @@ -1573,8 +1472,6 @@ void Parser::SkipFunctionBody() { } } -/// ParseKNRParamDeclarations - Parse 'declaration-list[opt]' which provides -/// types for a function with a K&R-style identifier list for arguments. 
void Parser::ParseKNRParamDeclarations(Declarator &D) { // We know that the top-level of this declarator is a function. DeclaratorChunk::FunctionTypeInfo &FTI = D.getFunctionTypeInfo(); @@ -1688,16 +1585,6 @@ void Parser::ParseKNRParamDeclarations(Declarator &D) { Actions.ActOnFinishKNRParamDeclarations(getCurScope(), D, Tok.getLocation()); } - -/// ParseAsmStringLiteral - This is just a normal string-literal, but is not -/// allowed to be a wide string, and is not subject to character translation. -/// Unlike GCC, we also diagnose an empty string literal when parsing for an -/// asm label as opposed to an asm statement, because such a construct does not -/// behave well. -/// -/// [GNU] asm-string-literal: -/// string-literal -/// ExprResult Parser::ParseAsmStringLiteral(bool ForAsmLabel) { ExprResult AsmString; @@ -1735,11 +1622,6 @@ ExprResult Parser::ParseAsmStringLiteral(bool ForAsmLabel) { return Actions.ActOnGCCAsmStmtString(AsmString.get(), ForAsmLabel); } -/// ParseSimpleAsm -/// -/// [GNU] simple-asm-expr: -/// 'asm' '(' asm-string-literal ')' -/// ExprResult Parser::ParseSimpleAsm(bool ForAsmLabel, SourceLocation *EndLoc) { assert(Tok.is(tok::kw_asm) && "Not an asm!"); SourceLocation Loc = ConsumeToken(); @@ -1776,9 +1658,6 @@ ExprResult Parser::ParseSimpleAsm(bool ForAsmLabel, SourceLocation *EndLoc) { return Result; } -/// Get the TemplateIdAnnotation from the token and put it in the -/// cleanup pool so that it gets destroyed when parsing the current top level -/// declaration is finished. TemplateIdAnnotation *Parser::takeTemplateIdAnnotation(const Token &tok) { assert(tok.is(tok::annot_template_id) && "Expected template-id token"); TemplateIdAnnotation * @@ -1804,16 +1683,6 @@ void Parser::AnnotateScopeToken(CXXScopeSpec &SS, bool IsNewAnnotation) { PP.AnnotateCachedTokens(Tok); } -/// Attempt to classify the name at the current token position. 
This may -/// form a type, scope or primary expression annotation, or replace the token -/// with a typo-corrected keyword. This is only appropriate when the current -/// name must refer to an entity which has already been declared. -/// -/// \param CCC Indicates how to perform typo-correction for this name. If NULL, -/// no typo correction will be performed. -/// \param AllowImplicitTypename Whether we are in a context where a dependent -/// nested-name-specifier without typename is treated as a type (e.g. -/// T::type). AnnotatedNameKind Parser::TryAnnotateName(CorrectionCandidateCallback *CCC, ImplicitTypenameContext AllowImplicitTypename) { @@ -2016,28 +1885,6 @@ bool Parser::TryKeywordIdentFallback(bool DisableKeyword) { return true; } -/// TryAnnotateTypeOrScopeToken - If the current token position is on a -/// typename (possibly qualified in C++) or a C++ scope specifier not followed -/// by a typename, TryAnnotateTypeOrScopeToken will replace one or more tokens -/// with a single annotation token representing the typename or C++ scope -/// respectively. -/// This simplifies handling of C++ scope specifiers and allows efficient -/// backtracking without the need to re-parse and resolve nested-names and -/// typenames. -/// It will mainly be called when we expect to treat identifiers as typenames -/// (if they are typenames). For example, in C we do not expect identifiers -/// inside expressions to be treated as typenames so it will not be called -/// for expressions in C. -/// The benefit for C/ObjC is that a typename will be annotated and -/// Actions.getTypeName will not be needed to be called again (e.g. getTypeName -/// will not be called twice, once to check whether we have a declaration -/// specifier, and another one to get the actual type inside -/// ParseDeclarationSpecifiers). -/// -/// This returns true if an error occurred. 
-/// -/// Note that this routine emits an error if you call it with ::new or ::delete -/// as the current tokens, so only call it in contexts where these are invalid. bool Parser::TryAnnotateTypeOrScopeToken( ImplicitTypenameContext AllowImplicitTypename) { assert((Tok.is(tok::identifier) || Tok.is(tok::coloncolon) || @@ -2164,9 +2011,6 @@ bool Parser::TryAnnotateTypeOrScopeToken( AllowImplicitTypename); } -/// Try to annotate a type or scope token, having already parsed an -/// optional scope specifier. \p IsNewScope should be \c true unless the scope -/// specifier was extracted from an existing tok::annot_cxxscope annotation. bool Parser::TryAnnotateTypeOrScopeTokenAfterScopeSpec( CXXScopeSpec &SS, bool IsNewScope, ImplicitTypenameContext AllowImplicitTypename) { @@ -2283,12 +2127,6 @@ bool Parser::TryAnnotateTypeOrScopeTokenAfterScopeSpec( return false; } -/// TryAnnotateScopeToken - Like TryAnnotateTypeOrScopeToken but only -/// annotates C++ scope specifiers and template-ids. This returns -/// true if there was an error that could not be recovered from. -/// -/// Note that this routine emits an error if you call it with ::new or ::delete -/// as the current tokens, so only call it in contexts where these are invalid. bool Parser::TryAnnotateCXXScopeToken(bool EnteringContext) { assert(getLangOpts().CPlusPlus && "Call sites of this function should be guarded by checking for C++"); @@ -2496,19 +2334,6 @@ void Parser::ParseMicrosoftIfExistsExternalDeclaration() { Braces.consumeClose(); } -/// Parse a declaration beginning with the 'module' keyword or C++20 -/// context-sensitive keyword (optionally preceded by 'export'). 
-/// -/// module-declaration: [C++20] -/// 'export'[opt] 'module' module-name attribute-specifier-seq[opt] ';' -/// -/// global-module-fragment: [C++2a] -/// 'module' ';' top-level-declaration-seq[opt] -/// module-declaration: [C++2a] -/// 'export'[opt] 'module' module-name module-partition[opt] -/// attribute-specifier-seq[opt] ';' -/// private-module-fragment: [C++2a] -/// 'module' ':' 'private' ';' top-level-declaration-seq[opt] Parser::DeclGroupPtrTy Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { SourceLocation StartLoc = Tok.getLocation(); @@ -2590,21 +2415,6 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { ImportState); } -/// Parse a module import declaration. This is essentially the same for -/// Objective-C and C++20 except for the leading '@' (in ObjC) and the -/// trailing optional attributes (in C++). -/// -/// [ObjC] @import declaration: -/// '@' 'import' module-name ';' -/// [ModTS] module-import-declaration: -/// 'import' module-name attribute-specifier-seq[opt] ';' -/// [C++20] module-import-declaration: -/// 'export'[opt] 'import' module-name -/// attribute-specifier-seq[opt] ';' -/// 'export'[opt] 'import' module-partition -/// attribute-specifier-seq[opt] ';' -/// 'export'[opt] 'import' header-name -/// attribute-specifier-seq[opt] ';' Decl *Parser::ParseModuleImport(SourceLocation AtLoc, Sema::ModuleImportState &ImportState) { SourceLocation StartLoc = AtLoc.isInvalid() ? Tok.getLocation() : AtLoc; @@ -2730,13 +2540,6 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, return Import.get(); } -/// Parse a C++ / Objective-C module name (both forms use the same -/// grammar). -/// -/// module-name: -/// module-name-qualifier[opt] identifier -/// module-name-qualifier: -/// module-name-qualifier[opt] identifier '.' 
bool Parser::ParseModuleName(SourceLocation UseLoc, SmallVectorImpl &Path, bool IsImport) { @@ -2765,10 +2568,6 @@ bool Parser::ParseModuleName(SourceLocation UseLoc, } } -/// Try recover parser when module annotation appears where it must not -/// be found. -/// \returns false if the recover was successful and parsing may be continued, or -/// true if parser must bail out to top level and handle the token there. bool Parser::parseMisplacedModuleImport() { while (true) { switch (Tok.getKind()) { diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp index 35364a4d6f2ac..7a561638aebbc 100644 --- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp +++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp @@ -668,6 +668,26 @@ BuiltinTypeDeclBuilder::addHandleConstructorFromBinding() { .finalize(); } +BuiltinTypeDeclBuilder & +BuiltinTypeDeclBuilder::addHandleConstructorFromImplicitBinding() { + if (Record->isCompleteDefinition()) + return *this; + + using PH = BuiltinTypeMethodBuilder::PlaceHolder; + ASTContext &AST = SemaRef.getASTContext(); + QualType HandleType = getResourceHandleField()->getType(); + + return BuiltinTypeMethodBuilder(*this, "", AST.VoidTy, false, true) + .addParam("spaceNo", AST.UnsignedIntTy) + .addParam("range", AST.IntTy) + .addParam("index", AST.UnsignedIntTy) + .addParam("orderId", AST.UnsignedIntTy) + .callBuiltin("__builtin_hlsl_resource_handlefromimplicitbinding", + HandleType, PH::Handle, PH::_0, PH::_1, PH::_2, PH::_3) + .assign(PH::Handle, PH::LastStmt) + .finalize(); +} + BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addArraySubscriptOperators() { ASTContext &AST = Record->getASTContext(); DeclarationName Subscript = diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h index db617dc53c899..a52e2938104c7 100644 --- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h +++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h @@ -76,9 +76,10 @@ class 
BuiltinTypeDeclBuilder { AccessSpecifier Access = AccessSpecifier::AS_private); BuiltinTypeDeclBuilder &addArraySubscriptOperators(); - // Builtin types methods + // Builtin types constructors BuiltinTypeDeclBuilder &addDefaultHandleConstructor(); BuiltinTypeDeclBuilder &addHandleConstructorFromBinding(); + BuiltinTypeDeclBuilder &addHandleConstructorFromImplicitBinding(); // Builtin types methods BuiltinTypeDeclBuilder &addLoadMethods(); diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index f09232a9db4da..38bde7c28e946 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -132,7 +132,8 @@ static BuiltinTypeDeclBuilder setupBufferType(CXXRecordDecl *Decl, Sema &S, return BuiltinTypeDeclBuilder(S, Decl) .addHandleMember(RC, IsROV, RawBuffer) .addDefaultHandleConstructor() - .addHandleConstructorFromBinding(); + .addHandleConstructorFromBinding() + .addHandleConstructorFromImplicitBinding(); } // This function is responsible for constructing the constraint expression for diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp index a0cfabbc71998..146cd78195663 100644 --- a/clang/lib/Sema/SemaARM.cpp +++ b/clang/lib/Sema/SemaARM.cpp @@ -389,7 +389,7 @@ bool SemaARM::CheckImmediateArg(CallExpr *TheCall, unsigned CheckTy, if (SemaRef.BuiltinConstantArg(TheCall, ArgIdx, Imm)) return true; - if (std::find(Set.begin(), Set.end(), Imm.getSExtValue()) == Set.end()) + if (!llvm::is_contained(Set, Imm.getSExtValue())) return Diag(TheCall->getBeginLoc(), ErrDiag) << Arg->getSourceRange(); return false; }; diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 55121b90fa167..84b84de28c511 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -14,6 +14,7 @@ #include "CheckExprLifetime.h" #include "clang/AST/APValue.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/ASTDiagnostic.h" #include 
"clang/AST/Attr.h" #include "clang/AST/AttrIterator.h" #include "clang/AST/CharUnits.h" @@ -5342,29 +5343,29 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) { } for (unsigned i = 2; i < TheCall->getNumArgs(); i++) { - if (TheCall->getArg(i)->isTypeDependent() || - TheCall->getArg(i)->isValueDependent()) + Expr *Arg = TheCall->getArg(i); + if (Arg->isTypeDependent() || Arg->isValueDependent()) continue; std::optional Result; - if (!(Result = TheCall->getArg(i)->getIntegerConstantExpr(Context))) + if (!(Result = Arg->getIntegerConstantExpr(Context))) return ExprError(Diag(TheCall->getBeginLoc(), diag::err_shufflevector_nonconstant_argument) - << TheCall->getArg(i)->getSourceRange()); + << Arg->getSourceRange()); // Allow -1 which will be translated to undef in the IR. if (Result->isSigned() && Result->isAllOnes()) - continue; - - if (Result->getActiveBits() > 64 || - Result->getZExtValue() >= numElements * 2) + ; + else if (Result->getActiveBits() > 64 || + Result->getZExtValue() >= numElements * 2) return ExprError(Diag(TheCall->getBeginLoc(), diag::err_shufflevector_argument_too_large) - << TheCall->getArg(i)->getSourceRange()); - } + << Arg->getSourceRange()); - SmallVector exprs; + TheCall->setArg(i, ConstantExpr::Create(Context, Arg, APValue(*Result))); + } + SmallVector exprs; for (unsigned i = 0, e = TheCall->getNumArgs(); i != e; i++) { exprs.push_back(TheCall->getArg(i)); TheCall->setArg(i, nullptr); @@ -11871,6 +11872,47 @@ static void DiagnoseIntInBoolContext(Sema &S, Expr *E) { } } +static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source, + const Type *Target, Expr *E, + QualType T, + SourceLocation CC) { + assert(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType() && + Source != Target); + Expr::EvalResult Result; + if (E->EvaluateAsInt(Result, S.getASTContext(), Expr::SE_AllowSideEffects, + S.isConstantEvaluatedContext())) { + llvm::APSInt Value(32); + Value = Result.Val.getInt(); + bool IsASCII = 
Value <= 0x7F; + bool IsBMP = Value <= 0xD7FF || (Value >= 0xE000 && Value <= 0xFFFF); + bool ConversionPreservesSemantics = + IsASCII || (!Source->isChar8Type() && !Target->isChar8Type() && IsBMP); + + if (!ConversionPreservesSemantics) { + auto IsSingleCodeUnitCP = [](const QualType &T, + const llvm::APSInt &Value) { + if (T->isChar8Type()) + return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue()); + if (T->isChar16Type()) + return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue()); + assert(T->isChar32Type()); + return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue()); + }; + + S.Diag(CC, diag::warn_impcast_unicode_char_type_constant) + << E->getType() << T + << IsSingleCodeUnitCP(E->getType().getUnqualifiedType(), Value) + << FormatUTFCodeUnitAsCodepoint(Value.getExtValue(), E->getType()); + } + } else { + bool LosesPrecision = S.getASTContext().getIntWidth(E->getType()) > + S.getASTContext().getIntWidth(T); + DiagnoseImpCast(S, E, T, CC, + LosesPrecision ? diag::warn_impcast_unicode_precision + : diag::warn_impcast_unicode_char_type); + } +} + void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC, bool *ICContext, bool IsListInit) { if (E->isTypeDependent() || E->isValueDependent()) return; @@ -12208,6 +12250,11 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC, DiscardMisalignedMemberAddress(Target, E); + if (Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType()) { + DiagnoseMixedUnicodeImplicitConversion(*this, Source, Target, E, T, CC); + return; + } + if (Target->isBooleanType()) DiagnoseIntInBoolContext(*this, E); diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 15b9c97489e7f..543bd450c554e 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -1999,8 +1999,9 @@ FormulaType SubsumptionChecker::Normalize(const NormalizedConstraint &NC) { }); if (NC.getCompoundKind() == FormulaType::Kind) { + auto SizeLeft = 
Left.size(); Res = std::move(Left); - Res.reserve(Left.size() + Right.size()); + Res.reserve(SizeLeft + Right.size()); std::for_each(std::make_move_iterator(Right.begin()), std::make_move_iterator(Right.end()), Add); return Res; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 152f3f340cd50..6dae243b520f0 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -16093,8 +16093,11 @@ void Sema::computeNRVO(Stmt *Body, FunctionScopeInfo *Scope) { for (unsigned I = 0, E = Scope->Returns.size(); I != E; ++I) { if (const VarDecl *NRVOCandidate = Returns[I]->getNRVOCandidate()) { - if (!NRVOCandidate->isNRVOVariable()) + if (!NRVOCandidate->isNRVOVariable()) { + Diag(Returns[I]->getRetValue()->getExprLoc(), + diag::warn_not_eliding_copy_on_return); Returns[I]->setNRVOCandidate(nullptr); + } } } } @@ -19399,9 +19402,9 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl, // Verify that all the fields are okay. SmallVector RecFields; - + const FieldDecl *PreviousField = nullptr; for (ArrayRef::iterator i = Fields.begin(), end = Fields.end(); - i != end; ++i) { + i != end; PreviousField = cast(*i), ++i) { FieldDecl *FD = cast(*i); // Get the type for the field. 
@@ -19617,6 +19620,29 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl, if (Record && FD->getType().isVolatileQualified()) Record->setHasVolatileMember(true); + bool ReportMSBitfieldStoragePacking = + Record && PreviousField && + !Diags.isIgnored(diag::warn_ms_bitfield_mismatched_storage_packing, + Record->getLocation()); + auto IsNonDependentBitField = [](const FieldDecl *FD) { + return FD->isBitField() && !FD->getType()->isDependentType(); + }; + + if (ReportMSBitfieldStoragePacking && IsNonDependentBitField(FD) && + IsNonDependentBitField(PreviousField)) { + CharUnits FDStorageSize = Context.getTypeSizeInChars(FD->getType()); + CharUnits PreviousFieldStorageSize = + Context.getTypeSizeInChars(PreviousField->getType()); + if (FDStorageSize != PreviousFieldStorageSize) { + Diag(FD->getLocation(), + diag::warn_ms_bitfield_mismatched_storage_packing) + << FD << FD->getType() << FDStorageSize.getQuantity() + << PreviousFieldStorageSize.getQuantity(); + Diag(PreviousField->getLocation(), + diag::note_ms_bitfield_mismatched_storage_size_previous) + << PreviousField << PreviousField->getType(); + } + } // Keep track of the number of named members. if (FD->getIdentifier()) ++NumNamedMembers; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index cbccb567e2adf..1bd9056cad812 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -6274,7 +6274,7 @@ static void ReferenceDllExportedMembers(Sema &S, CXXRecordDecl *Class) { } } MarkingDllexportedContext(S, Class, ClassAttr->getLocation()); - if (S.Context.getTargetInfo().getTriple().isWindowsGNUEnvironment()) + if (S.Context.getTargetInfo().getTriple().isOSCygMing()) S.MarkVTableUsed(Class->getLocation(), Class, true); for (Decl *Member : Class->decls()) { @@ -6576,7 +6576,7 @@ void Sema::checkClassLevelDLLAttribute(CXXRecordDecl *Class) { // declarations, except in MinGW mode. 
if (ClassExported && !ClassAttr->isInherited() && TSK == TSK_ExplicitInstantiationDeclaration && - !Context.getTargetInfo().getTriple().isWindowsGNUEnvironment()) { + !Context.getTargetInfo().getTriple().isOSCygMing()) { Class->dropAttr(); return; } @@ -8651,6 +8651,18 @@ class DefaultedComparisonAnalyzer assert(Best->BuiltinParamTypes[2].isNull() && "invalid builtin comparison"); + // FIXME: If the type we deduced is a vector type, we mark the + // comparison as deleted because we don't yet support this. + if (isa(T)) { + if (Diagnose == ExplainDeleted) { + S.Diag(FD->getLocation(), + diag::note_defaulted_comparison_vector_types) + << FD; + S.Diag(Subobj.Decl->getLocation(), diag::note_declared_at); + } + return Result::deleted(); + } + if (NeedsDeducing) { std::optional Cat = getComparisonCategoryForBuiltinCmp(T); @@ -18740,7 +18752,7 @@ NamedDecl *Sema::ActOnFriendFunctionDecl(Scope *S, Declarator &D, // a template-id, the function name is not unqualified because these is // no name. While the wording requires some reading in-between the // lines, GCC, MSVC, and EDG all consider a friend function - // specialization definitions // to be de facto explicit specialization + // specialization definitions to be de facto explicit specialization // and diagnose them as such. 
} else if (isTemplateId) { Diag(NameInfo.getBeginLoc(), diag::err_friend_specialization_def); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 2c81f7c583eb6..91e63c7cb8677 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -15,6 +15,7 @@ #include "UsedDeclVisitor.h" #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/ASTDiagnostic.h" #include "clang/AST/ASTLambda.h" #include "clang/AST/ASTMutationListener.h" #include "clang/AST/CXXInheritance.h" @@ -1568,6 +1569,79 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, Expr *RHS, } } +static void CheckUnicodeArithmeticConversions(Sema &SemaRef, Expr *LHS, + Expr *RHS, SourceLocation Loc, + ArithConvKind ACK) { + QualType LHSType = LHS->getType().getUnqualifiedType(); + QualType RHSType = RHS->getType().getUnqualifiedType(); + + if (!SemaRef.getLangOpts().CPlusPlus || !LHSType->isUnicodeCharacterType() || + !RHSType->isUnicodeCharacterType()) + return; + + if (ACK == ArithConvKind::Comparison) { + if (SemaRef.getASTContext().hasSameType(LHSType, RHSType)) + return; + + auto IsSingleCodeUnitCP = [](const QualType &T, const llvm::APSInt &Value) { + if (T->isChar8Type()) + return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue()); + if (T->isChar16Type()) + return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue()); + assert(T->isChar32Type()); + return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue()); + }; + + Expr::EvalResult LHSRes, RHSRes; + bool LHSSuccess = LHS->EvaluateAsInt(LHSRes, SemaRef.getASTContext(), + Expr::SE_AllowSideEffects, + SemaRef.isConstantEvaluatedContext()); + bool RHSuccess = RHS->EvaluateAsInt(RHSRes, SemaRef.getASTContext(), + Expr::SE_AllowSideEffects, + SemaRef.isConstantEvaluatedContext()); + + // Don't warn if the one known value is a representable + // in the type of both expressions. + if (LHSSuccess != RHSuccess) { + Expr::EvalResult &Res = LHSSuccess ? 
LHSRes : RHSRes; + if (IsSingleCodeUnitCP(LHSType, Res.Val.getInt()) && + IsSingleCodeUnitCP(RHSType, Res.Val.getInt())) + return; + } + + if (!LHSSuccess || !RHSuccess) { + SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types) + << LHS->getSourceRange() << RHS->getSourceRange() << LHSType + << RHSType; + return; + } + + llvm::APSInt LHSValue(32); + LHSValue = LHSRes.Val.getInt(); + llvm::APSInt RHSValue(32); + RHSValue = RHSRes.Val.getInt(); + + bool LHSSafe = IsSingleCodeUnitCP(LHSType, LHSValue); + bool RHSSafe = IsSingleCodeUnitCP(RHSType, RHSValue); + if (LHSSafe && RHSSafe) + return; + + SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant) + << LHS->getSourceRange() << RHS->getSourceRange() << LHSType << RHSType + << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType) + << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType); + return; + } + + if (SemaRef.getASTContext().hasSameType(LHSType, RHSType)) + return; + + SemaRef.Diag(Loc, diag::warn_arith_conv_mixed_unicode_types) + << LHS->getSourceRange() << RHS->getSourceRange() << ACK << LHSType + << RHSType; + return; +} + /// UsualArithmeticConversions - Performs various conversions that are common to /// binary operators (C99 6.3.1.8). If both operands aren't arithmetic, this /// routine returns the first non-arithmetic type found. The client is @@ -1575,8 +1649,11 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, Expr *RHS, QualType Sema::UsualArithmeticConversions(ExprResult &LHS, ExprResult &RHS, SourceLocation Loc, ArithConvKind ACK) { + checkEnumArithmeticConversions(LHS.get(), RHS.get(), Loc, ACK); + CheckUnicodeArithmeticConversions(*this, LHS.get(), RHS.get(), Loc, ACK); + if (ACK != ArithConvKind::CompAssign) { LHS = UsualUnaryConversions(LHS.get()); if (LHS.isInvalid()) @@ -7241,10 +7318,20 @@ Sema::BuildCompoundLiteralExpr(SourceLocation LParenLoc, TypeSourceInfo *TInfo, ? 
VK_PRValue : VK_LValue; + // C99 6.5.2.5 + // "If the compound literal occurs outside the body of a function, the + // initializer list shall consist of constant expressions." if (IsFileScope) if (auto ILE = dyn_cast(LiteralExpr)) for (unsigned i = 0, j = ILE->getNumInits(); i != j; i++) { Expr *Init = ILE->getInit(i); + if (!Init->isTypeDependent() && !Init->isValueDependent() && + !Init->isConstantInitializer(Context, /*IsForRef=*/false)) { + Diag(Init->getExprLoc(), diag::err_init_element_not_constant) + << Init->getSourceBitField(); + return ExprError(); + } + ILE->setInit(i, ConstantExpr::Create(Context, Init)); } diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp index 053414ff7a1a7..39c162c3b835d 100644 --- a/clang/lib/Sema/SemaExprMember.cpp +++ b/clang/lib/Sema/SemaExprMember.cpp @@ -1385,7 +1385,7 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R, // lvalue. Because this is inherently unsafe as an atomic operation, the // warning defaults to an error. if (const auto *ATy = BaseType->getAs()) { - S.DiagRuntimeBehavior(OpLoc, nullptr, + S.DiagRuntimeBehavior(OpLoc, BaseExpr.get(), S.PDiag(diag::warn_atomic_member_access)); BaseType = ATy->getValueType().getUnqualifiedType(); BaseExpr = ImplicitCastExpr::Create( diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 744ec439b2393..c4dcfd3eb8f99 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -537,6 +537,18 @@ void createHostLayoutStructForBuffer(Sema &S, HLSLBufferDecl *BufDecl) { BufDecl->addLayoutStruct(LS); } +static void addImplicitBindingAttrToBuffer(Sema &S, HLSLBufferDecl *BufDecl, + uint32_t ImplicitBindingOrderID) { + RegisterType RT = + BufDecl->isCBuffer() ? 
RegisterType::CBuffer : RegisterType::SRV; + auto *Attr = + HLSLResourceBindingAttr::CreateImplicit(S.getASTContext(), "", "0", {}); + std::optional RegSlot; + Attr->setBinding(RT, RegSlot, 0); + Attr->setImplicitBindingOrderID(ImplicitBindingOrderID); + BufDecl->addAttr(Attr); +} + // Handle end of cbuffer/tbuffer declaration void SemaHLSL::ActOnFinishBuffer(Decl *Dcl, SourceLocation RBrace) { auto *BufDecl = cast(Dcl); @@ -547,9 +559,17 @@ void SemaHLSL::ActOnFinishBuffer(Decl *Dcl, SourceLocation RBrace) { // create buffer layout struct createHostLayoutStructForBuffer(SemaRef, BufDecl); - if (std::none_of(Dcl->attr_begin(), Dcl->attr_end(), - [](Attr *A) { return isa(A); })) + HLSLResourceBindingAttr *RBA = Dcl->getAttr(); + if (!RBA || !RBA->hasRegisterSlot()) { SemaRef.Diag(Dcl->getLocation(), diag::warn_hlsl_implicit_binding); + // Use HLSLResourceBindingAttr to transfer implicit binding order_ID + // to codegen. If it does not exist, create an implicit attribute. + uint32_t OrderID = getNextImplicitBindingOrderID(); + if (RBA) + RBA->setImplicitBindingOrderID(OrderID); + else + addImplicitBindingAttrToBuffer(SemaRef, BufDecl, OrderID); + } SemaRef.PopDeclContext(); } @@ -959,7 +979,7 @@ void SemaHLSL::handleRootSignatureAttr(Decl *D, const ParsedAttr &AL) { IdentifierInfo *Ident = AL.getArgAsIdent(0)->getIdentifierInfo(); if (auto *RS = D->getAttr()) { - if (RS->getSignature() != Ident) { + if (RS->getSignatureIdent() != Ident) { Diag(AL.getLoc(), diag::err_disallowed_duplicate_attribute) << RS; return; } @@ -970,10 +990,11 @@ void SemaHLSL::handleRootSignatureAttr(Decl *D, const ParsedAttr &AL) { LookupResult R(SemaRef, Ident, SourceLocation(), Sema::LookupOrdinaryName); if (SemaRef.LookupQualifiedName(R, D->getDeclContext())) - if (isa(R.getFoundDecl())) { + if (auto *SignatureDecl = + dyn_cast(R.getFoundDecl())) { // Perform validation of constructs here - D->addAttr(::new (getASTContext()) - RootSignatureAttr(getASTContext(), AL, Ident)); + 
D->addAttr(::new (getASTContext()) RootSignatureAttr( + getASTContext(), AL, Ident, SignatureDecl)); } } @@ -1999,6 +2020,8 @@ void SemaHLSL::ActOnEndOfTranslationUnit(TranslationUnitDecl *TU) { HLSLBufferDecl *DefaultCBuffer = HLSLBufferDecl::CreateDefaultCBuffer( SemaRef.getASTContext(), SemaRef.getCurLexicalContext(), DefaultCBufferDecls); + addImplicitBindingAttrToBuffer(SemaRef, DefaultCBuffer, + getNextImplicitBindingOrderID()); SemaRef.getCurLexicalContext()->addDecl(DefaultCBuffer); createHostLayoutStructForBuffer(SemaRef, DefaultCBuffer); @@ -2454,6 +2477,20 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { TheCall->setType(ResourceTy); break; } + case Builtin::BI__builtin_hlsl_resource_handlefromimplicitbinding: { + ASTContext &AST = SemaRef.getASTContext(); + if (SemaRef.checkArgCount(TheCall, 5) || + CheckResourceHandle(&SemaRef, TheCall, 0) || + CheckArgTypeMatches(&SemaRef, TheCall->getArg(1), AST.UnsignedIntTy) || + CheckArgTypeMatches(&SemaRef, TheCall->getArg(2), AST.IntTy) || + CheckArgTypeMatches(&SemaRef, TheCall->getArg(3), AST.UnsignedIntTy) || + CheckArgTypeMatches(&SemaRef, TheCall->getArg(4), AST.UnsignedIntTy)) + return true; + // use the type of the handle (arg0) as a return type + QualType ResourceTy = TheCall->getArg(0)->getType(); + TheCall->setType(ResourceTy); + break; + } case Builtin::BI__builtin_hlsl_and: case Builtin::BI__builtin_hlsl_or: { if (SemaRef.checkArgCount(TheCall, 2)) @@ -3285,8 +3322,10 @@ static bool initVarDeclWithCtor(Sema &S, VarDecl *VD, VD->getLocation(), SourceLocation(), SourceLocation()); InitializationSequence InitSeq(S, Entity, Kind, Args); - ExprResult Init = InitSeq.Perform(S, Entity, Kind, Args); + if (InitSeq.Failed()) + return false; + ExprResult Init = InitSeq.Perform(S, Entity, Kind, Args); if (!Init.get()) return false; @@ -3296,27 +3335,42 @@ static bool initVarDeclWithCtor(Sema &S, VarDecl *VD, return true; } -static bool initGlobalResourceDecl(Sema &S, VarDecl 
*VD) { +bool SemaHLSL::initGlobalResourceDecl(VarDecl *VD) { + std::optional RegisterSlot; + uint32_t SpaceNo = 0; HLSLResourceBindingAttr *RBA = VD->getAttr(); - if (!RBA || !RBA->hasRegisterSlot()) - // FIXME: add support for implicit binding (llvm/llvm-project#110722) - return false; + if (RBA) { + if (RBA->hasRegisterSlot()) + RegisterSlot = RBA->getSlotNumber(); + SpaceNo = RBA->getSpaceNumber(); + } - ASTContext &AST = S.getASTContext(); + ASTContext &AST = SemaRef.getASTContext(); uint64_t UIntTySize = AST.getTypeSize(AST.UnsignedIntTy); uint64_t IntTySize = AST.getTypeSize(AST.IntTy); - Expr *Args[] = { - IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, RBA->getSlotNumber()), - AST.UnsignedIntTy, SourceLocation()), - IntegerLiteral::Create(AST, - llvm::APInt(UIntTySize, RBA->getSpaceNumber()), - AST.UnsignedIntTy, SourceLocation()), - IntegerLiteral::Create(AST, llvm::APInt(IntTySize, 1), AST.IntTy, - SourceLocation()), - IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, 0), AST.UnsignedIntTy, - SourceLocation())}; - - return initVarDeclWithCtor(S, VD, Args); + IntegerLiteral *RangeSize = IntegerLiteral::Create( + AST, llvm::APInt(IntTySize, 1), AST.IntTy, SourceLocation()); + IntegerLiteral *Index = IntegerLiteral::Create( + AST, llvm::APInt(UIntTySize, 0), AST.UnsignedIntTy, SourceLocation()); + IntegerLiteral *Space = + IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, SpaceNo), + AST.UnsignedIntTy, SourceLocation()); + + // resource with explicit binding + if (RegisterSlot.has_value()) { + IntegerLiteral *RegSlot = IntegerLiteral::Create( + AST, llvm::APInt(UIntTySize, RegisterSlot.value()), AST.UnsignedIntTy, + SourceLocation()); + Expr *Args[] = {RegSlot, Space, RangeSize, Index}; + return initVarDeclWithCtor(SemaRef, VD, Args); + } + + // resource with implicit binding + IntegerLiteral *OrderId = IntegerLiteral::Create( + AST, llvm::APInt(UIntTySize, getNextImplicitBindingOrderID()), + AST.UnsignedIntTy, SourceLocation()); + Expr *Args[] = 
{Space, RangeSize, Index, OrderId}; + return initVarDeclWithCtor(SemaRef, VD, Args); } // Returns true if the initialization has been handled. @@ -3334,8 +3388,9 @@ bool SemaHLSL::ActOnUninitializedVarDecl(VarDecl *VD) { // FIXME: We currectly support only simple resources - no arrays of resources // or resources in user defined structs. // (llvm/llvm-project#133835, llvm/llvm-project#133837) - if (VD->getType()->isHLSLResourceRecord()) - return initGlobalResourceDecl(SemaRef, VD); + // Initialize resources at the global scope + if (VD->hasGlobalStorage() && VD->getType()->isHLSLResourceRecord()) + return initGlobalResourceDecl(VD); return false; } diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index e5670dab03cb0..adce0a15bc320 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -637,11 +637,11 @@ ExprResult InitListChecker::PerformEmptyInit(SourceLocation Loc, } InitializationSequence InitSeq(SemaRef, Entity, Kind, SubInit); - // libstdc++4.6 marks the vector default constructor as explicit in - // _GLIBCXX_DEBUG mode, so recover using the C++03 logic in that case. - // stlport does so too. Look for std::__debug for libstdc++, and for - // std:: for stlport. This is effectively a compiler-side implementation of - // LWG2193. + // HACK: libstdc++ prior to 4.9 marks the vector default constructor + // as explicit in _GLIBCXX_DEBUG mode, so recover using the C++03 logic + // in that case. stlport does so too. + // Look for std::__debug for libstdc++, and for std:: for stlport. + // This is effectively a compiler-side implementation of LWG2193. 
if (!InitSeq && EmptyInitList && InitSeq.getFailureKind() == InitializationSequence::FK_ExplicitConstructor) { OverloadCandidateSet::iterator Best; @@ -1976,6 +1976,8 @@ void InitListChecker::CheckVectorType(const InitializedEntity &Entity, typeCode = "s"; else if (elementType->isUnsignedIntegerType()) typeCode = "u"; + else if (elementType->isMFloat8Type()) + typeCode = "mf"; else llvm_unreachable("Invalid element type!"); @@ -6240,24 +6242,6 @@ static void TryUserDefinedConversion(Sema &S, } } -/// An egregious hack for compatibility with libstdc++-4.2: in , -/// a function with a pointer return type contains a 'return false;' statement. -/// In C++11, 'false' is not a null pointer, so this breaks the build of any -/// code using that header. -/// -/// Work around this by treating 'return false;' as zero-initializing the result -/// if it's used in a pointer-returning function in a system header. -static bool isLibstdcxxPointerReturnFalseHack(Sema &S, - const InitializedEntity &Entity, - const Expr *Init) { - return S.getLangOpts().CPlusPlus11 && - Entity.getKind() == InitializedEntity::EK_Result && - Entity.getType()->isPointerType() && - isa(Init) && - !cast(Init)->getValue() && - S.getSourceManager().isInSystemHeader(Init->getExprLoc()); -} - /// The non-zero enum values here are indexes into diagnostic alternatives. 
enum InvalidICRKind { IIK_okay, IIK_nonlocal, IIK_nonscalar }; @@ -6943,12 +6927,10 @@ void InitializationSequence::InitializeFrom(Sema &S, AddPassByIndirectCopyRestoreStep(DestType, ShouldCopy); } else if (ICS.isBad()) { - if (isLibstdcxxPointerReturnFalseHack(S, Entity, Initializer)) - AddZeroInitializationStep(Entity.getType()); - else if (DeclAccessPair Found; - Initializer->getType() == Context.OverloadTy && - !S.ResolveAddressOfOverloadedFunction(Initializer, DestType, - /*Complain=*/false, Found)) + if (DeclAccessPair Found; + Initializer->getType() == Context.OverloadTy && + !S.ResolveAddressOfOverloadedFunction(Initializer, DestType, + /*Complain=*/false, Found)) SetFailed(InitializationSequence::FK_AddressOfOverloadFailed); else if (Initializer->getType()->isFunctionType() && isExprAnUnaddressableFunction(S, Initializer)) diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp index 6187e0e719bb1..88bd963a738ce 100644 --- a/clang/lib/Sema/SemaOpenACCClause.cpp +++ b/clang/lib/Sema/SemaOpenACCClause.cpp @@ -1332,6 +1332,12 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorClause( break; case OpenACCDirectiveKind::ParallelLoop: break; + case OpenACCDirectiveKind::Invalid: + // This can happen when the directive was not recognized, but we continued + // anyway. Since there is a lot of stuff that can happen (including + // 'allow anything' in the parallel loop case), just skip all checking and + // continue. + break; } } @@ -1369,6 +1375,14 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitWorkerClause( switch (Clause.getDirectiveKind()) { default: llvm_unreachable("Invalid directive kind for this clause"); + case OpenACCDirectiveKind::Invalid: + // This can happen in cases where the directive was not recognized but we + // continued anyway. 
Kernels allows kind of any integer argument, so we + can assume it is that (rather than marking the argument invalid like + with parallel/serial/routine), and just continue as if nothing + happened. We'll skip the 'kernels' checking vs num-workers, since this + MIGHT be something else. + break; + case OpenACCDirectiveKind::Loop: switch (SemaRef.getActiveComputeConstructInfo().Kind) { case OpenACCDirectiveKind::Invalid: @@ -2037,6 +2051,12 @@ SemaOpenACC::CheckGangExpr(ArrayRef ExistingClauses, default: llvm_unreachable("Non compute construct in active compute construct?"); } + case OpenACCDirectiveKind::Invalid: + // This can happen in cases where the directive was not recognized but + // we continued anyway. Since the validity checking is all-over the place + // (it can be a star/integer, or a constant expr depending on the tag), we + // just give up and return an ExprError here. + return ExprError(); default: llvm_unreachable("Invalid directive kind for a Gang clause"); } diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index e20a41c10ccaa..23304e12f8c31 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -7846,6 +7846,8 @@ static void AddMethodTemplateCandidateImmediately( MethodTmpl, ExplicitTemplateArgs, Args, Specialization, Info, PartialOverloading, /*AggregateDeductionCandidate=*/false, /*PartialOrdering=*/false, ObjectType, ObjectClassification, + CandidateSet.getKind() == + clang::OverloadCandidateSet::CSK_AddressOfOverloadSet, [&](ArrayRef ParamTypes) { return S.CheckNonDependentConversions( MethodTmpl, ParamTypes, Args, CandidateSet, Conversions, @@ -7960,6 +7962,8 @@ static void AddTemplateOverloadCandidateImmediately( /*PartialOrdering=*/false, /*ObjectType=*/QualType(), /*ObjectClassification=*/Expr::Classification(), + CandidateSet.getKind() == + OverloadCandidateSet::CSK_AddressOfOverloadSet, [&](ArrayRef ParamTypes) { return S.CheckNonDependentConversions( 
FunctionTemplate, ParamTypes, Args, CandidateSet, Conversions, diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 7940340064eda..14f9d1d03c5ed 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4353,6 +4353,38 @@ struct PartialSpecMatchResult { VarTemplatePartialSpecializationDecl *Partial; TemplateArgumentList *Args; }; + +// HACK 2025-05-13: workaround std::format_kind since libstdc++ 15.1 (2025-04) +// See GH139067 / https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120190 +static bool IsLibstdcxxStdFormatKind(Preprocessor &PP, VarDecl *Var) { + if (Var->getName() != "format_kind" || + !Var->getDeclContext()->isStdNamespace()) + return false; + + MacroInfo *MacroGLIBCXX = + PP.getMacroInfo(PP.getIdentifierInfo("__GLIBCXX__")); + + if (!MacroGLIBCXX || MacroGLIBCXX->getNumTokens() != 1) + return false; + + const Token &RevisionDateTok = MacroGLIBCXX->getReplacementToken(0); + bool Invalid = false; + std::string RevisionDate = PP.getSpelling(RevisionDateTok, &Invalid); + StringRef FixDate = "30251231"; + + if (Invalid) + return false; + + // The format of the revision date is in compressed ISO date format. + // See https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_macros.html + // So we can use string comparison. + // + // Checking old versions of libstdc++ is not needed because 15.1 is the first + // release in which users can access std::format_kind. + // + // FIXME: Correct FixDate once the issue is fixed. 
+ return RevisionDate.size() == 8 && RevisionDate <= FixDate; +} } // end anonymous namespace DeclResult @@ -4384,6 +4416,8 @@ Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc, if (VarDecl *Var = Template->getTemplatedDecl(); ParsingInitForAutoVars.count(Var) && + // See comments on this function definition + !IsLibstdcxxStdFormatKind(PP, Var) && llvm::equal( CTAI.CanonicalConverted, Template->getTemplateParameters()->getInjectedTemplateArgs(Context), @@ -8987,8 +9021,7 @@ Sema::CheckSpecializationInstantiationRedecl(SourceLocation NewLoc, // The declaration itself has not actually been instantiated, so it is // still okay to specialize it. StripImplicitInstantiation( - PrevDecl, - Context.getTargetInfo().getTriple().isWindowsGNUEnvironment()); + PrevDecl, Context.getTargetInfo().getTriple().isOSCygMing()); return false; } // Fall through @@ -9383,7 +9416,10 @@ bool Sema::CheckFunctionTemplateSpecialization( // Mark the prior declaration as an explicit specialization, so that later // clients know that this is an explicit specialization. - if (!isFriend) { + // A dependent friend specialization which has a definition should be treated + // as explicit specialization, despite being invalid. + if (FunctionDecl *InstFrom = FD->getInstantiatedFromMemberFunction(); + !isFriend || (InstFrom && InstFrom->getDependentSpecializationInfo())) { // Since explicit specializations do not inherit '=delete' from their // primary function template - check if the 'specialization' that was // implicitly generated (during template argument deduction for partial @@ -9855,7 +9891,7 @@ DeclResult Sema::ActOnExplicitInstantiation( : TSK_ExplicitInstantiationDeclaration; if (TSK == TSK_ExplicitInstantiationDeclaration && - !Context.getTargetInfo().getTriple().isWindowsGNUEnvironment()) { + !Context.getTargetInfo().getTriple().isOSCygMing()) { // Check for dllexport class template instantiation declarations, // except for MinGW mode. 
for (const ParsedAttr &AL : Attr) { @@ -9920,7 +9956,7 @@ DeclResult Sema::ActOnExplicitInstantiation( = PrevDecl ? PrevDecl->getTemplateSpecializationKind() : TSK_Undeclared; if (TSK == TSK_ExplicitInstantiationDefinition && PrevDecl != nullptr && - Context.getTargetInfo().getTriple().isWindowsGNUEnvironment()) { + Context.getTargetInfo().getTriple().isOSCygMing()) { // Check for dllexport class template instantiation definitions in MinGW // mode, if a previous declaration of the instantiation was seen. for (const ParsedAttr &AL : Attr) { @@ -10088,7 +10124,7 @@ DeclResult Sema::ActOnExplicitInstantiation( // In MinGW mode, export the template instantiation if the declaration // was marked dllexport. if (PrevDecl_TSK == TSK_ExplicitInstantiationDeclaration && - Context.getTargetInfo().getTriple().isWindowsGNUEnvironment() && + Context.getTargetInfo().getTriple().isOSCygMing() && PrevDecl->hasAttr()) { dllExportImportClassTemplateSpecialization(*this, Def); } @@ -11370,7 +11406,12 @@ class ExplicitSpecializationVisibilityChecker { template void checkImpl(SpecDecl *Spec) { bool IsHiddenExplicitSpecialization = false; - if (Spec->getTemplateSpecializationKind() == TSK_ExplicitSpecialization) { + TemplateSpecializationKind SpecKind = Spec->getTemplateSpecializationKind(); + // Some invalid friend declarations are written as specializations but are + // instantiated implicitly. + if constexpr (std::is_same_v) + SpecKind = Spec->getTemplateSpecializationKindForInstantiation(); + if (SpecKind == TSK_ExplicitSpecialization) { IsHiddenExplicitSpecialization = Spec->getMemberSpecializationInfo() ? 
!CheckMemberSpecialization(Spec) : !CheckExplicitSpecialization(Spec); diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 5dc06ebc2a235..217d57d67f067 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -4432,6 +4432,7 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( bool PartialOverloading, bool AggregateDeductionCandidate, bool PartialOrdering, QualType ObjectType, Expr::Classification ObjectClassification, + bool ForOverloadSetAddressResolution, llvm::function_ref)> CheckNonDependent) { if (FunctionTemplate->isInvalidDecl()) return TemplateDeductionResult::Invalid; @@ -4440,7 +4441,15 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( unsigned NumParams = Function->getNumParams(); bool HasExplicitObject = false; int ExplicitObjectOffset = 0; - if (Function->hasCXXExplicitFunctionObjectParameter()) { + + // [C++26] [over.call.func]p3 + // If the primary-expression is the address of an overload set, + // the argument list is the same as the expression-list in the call. + // Otherwise, the argument list is the expression-list in the call augmented + // by the addition of an implied object argument as in a qualified function + // call. 
+ if (!ForOverloadSetAddressResolution && + Function->hasCXXExplicitFunctionObjectParameter()) { HasExplicitObject = true; ExplicitObjectOffset = 1; } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 01065f22b34a8..d4f99c1fa16f6 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -5756,8 +5756,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation, RebuildTypeSourceInfoForDefaultSpecialMembers(); SetDeclDefaulted(Function, PatternDecl->getLocation()); } else { - NamedDecl *ND = Function; - DeclContext *DC = ND->getLexicalDeclContext(); + DeclContext *DC = Function->getLexicalDeclContext(); std::optional> Innermost; if (auto *Primary = Function->getPrimaryTemplate(); Primary && diff --git a/clang/lib/Sema/SemaWasm.cpp b/clang/lib/Sema/SemaWasm.cpp index 3362e1d717a6c..6faea24a46b09 100644 --- a/clang/lib/Sema/SemaWasm.cpp +++ b/clang/lib/Sema/SemaWasm.cpp @@ -54,12 +54,27 @@ static bool CheckWasmBuiltinArgIsInteger(Sema &S, CallExpr *E, bool SemaWasm::BuiltinWasmRefNullExtern(CallExpr *TheCall) { if (SemaRef.checkArgCount(TheCall, /*DesiredArgCount=*/0)) return true; - TheCall->setType(getASTContext().getWebAssemblyExternrefType()); return false; } +bool SemaWasm::BuiltinWasmRefIsNullExtern(CallExpr *TheCall) { + if (SemaRef.checkArgCount(TheCall, 1)) { + return true; + } + + Expr *ArgExpr = TheCall->getArg(0); + if (!ArgExpr->getType().isWebAssemblyExternrefType()) { + SemaRef.Diag(ArgExpr->getBeginLoc(), + diag::err_wasm_builtin_arg_must_be_externref_type) + << 1 << ArgExpr->getSourceRange(); + return true; + } + + return false; +} + bool SemaWasm::BuiltinWasmRefNullFunc(CallExpr *TheCall) { ASTContext &Context = getASTContext(); if (SemaRef.checkArgCount(TheCall, /*DesiredArgCount=*/0)) @@ -220,6 +235,8 @@ bool SemaWasm::CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI, return 
BuiltinWasmRefNullExtern(TheCall); case WebAssembly::BI__builtin_wasm_ref_null_func: return BuiltinWasmRefNullFunc(TheCall); + case WebAssembly::BI__builtin_wasm_ref_is_null_extern: + return BuiltinWasmRefIsNullExtern(TheCall); case WebAssembly::BI__builtin_wasm_table_get: return BuiltinWasmTableGet(TheCall); case WebAssembly::BI__builtin_wasm_table_set: diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 7de510c85bfed..1b3d3c22aa9f5 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -1774,6 +1774,29 @@ struct InputFileEntry { uint32_t ContentHash[2]; InputFileEntry(FileEntryRef File) : File(File) {} + + void trySetContentHash( + Preprocessor &PP, + llvm::function_ref()> GetMemBuff) { + ContentHash[0] = 0; + ContentHash[1] = 0; + + if (!PP.getHeaderSearchInfo() + .getHeaderSearchOpts() + .ValidateASTInputFilesContent) + return; + + auto MemBuff = GetMemBuff(); + if (!MemBuff) { + PP.Diag(SourceLocation(), diag::err_module_unable_to_hash_content) + << File.getName(); + return; + } + + uint64_t Hash = xxh3_64bits(MemBuff->getBuffer()); + ContentHash[0] = uint32_t(Hash); + ContentHash[1] = uint32_t(Hash >> 32); + } }; } // namespace @@ -1848,25 +1871,41 @@ void ASTWriter::WriteInputFiles(SourceManager &SourceMgr) { !IsSLocFileEntryAffecting[IncludeFileID.ID]; Entry.IsModuleMap = isModuleMap(File.getFileCharacteristic()); - uint64_t ContentHash = 0; - if (PP->getHeaderSearchInfo() - .getHeaderSearchOpts() - .ValidateASTInputFilesContent) { - auto MemBuff = Cache->getBufferIfLoaded(); - if (MemBuff) - ContentHash = xxh3_64bits(MemBuff->getBuffer()); - else - PP->Diag(SourceLocation(), diag::err_module_unable_to_hash_content) - << Entry.File.getName(); - } - Entry.ContentHash[0] = uint32_t(ContentHash); - Entry.ContentHash[1] = uint32_t(ContentHash >> 32); + Entry.trySetContentHash(*PP, [&] { return Cache->getBufferIfLoaded(); }); + if (Entry.IsSystemFile) 
SystemFiles.push_back(Entry); else UserFiles.push_back(Entry); } + // FIXME: Make providing input files not in the SourceManager more flexible. + // The SDKSettings.json file is necessary for correct evaluation of + // availability annotations. + StringRef Sysroot = PP->getHeaderSearchInfo().getHeaderSearchOpts().Sysroot; + if (!Sysroot.empty()) { + SmallString<128> SDKSettingsJSON = Sysroot; + llvm::sys::path::append(SDKSettingsJSON, "SDKSettings.json"); + FileManager &FM = PP->getFileManager(); + if (auto FE = FM.getOptionalFileRef(SDKSettingsJSON)) { + InputFileEntry Entry(*FE); + Entry.IsSystemFile = true; + Entry.IsTransient = false; + Entry.BufferOverridden = false; + Entry.IsTopLevel = true; + Entry.IsModuleMap = false; + std::unique_ptr MB; + Entry.trySetContentHash(*PP, [&]() -> std::optional { + if (auto MBOrErr = FM.getBufferForFile(Entry.File)) { + MB = std::move(*MBOrErr); + return MB->getMemBufferRef(); + } + return std::nullopt; + }); + SystemFiles.push_back(Entry); + } + } + // User files go at the front, system files at the back. auto SortedFiles = llvm::concat(std::move(UserFiles), std::move(SystemFiles)); diff --git a/clang/lib/StaticAnalyzer/Checkers/AnalyzerStatsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/AnalyzerStatsChecker.cpp index d030e69a2a6e0..0aaa32faefa39 100644 --- a/clang/lib/StaticAnalyzer/Checkers/AnalyzerStatsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/AnalyzerStatsChecker.cpp @@ -45,9 +45,7 @@ void AnalyzerStatsChecker::checkEndAnalysis(ExplodedGraph &G, const SourceManager &SM = B.getSourceManager(); llvm::SmallPtrSet reachable; - // Root node should have the location context of the top most function. 
- const ExplodedNode *GraphRoot = *G.roots_begin(); - const LocationContext *LC = GraphRoot->getLocation().getLocationContext(); + const LocationContext *LC = Eng.getRootLocationContext(); const Decl *D = LC->getDecl(); diff --git a/clang/lib/StaticAnalyzer/Core/BugReporter.cpp b/clang/lib/StaticAnalyzer/Core/BugReporter.cpp index 28b96f2717210..d5bc3ac2962d5 100644 --- a/clang/lib/StaticAnalyzer/Core/BugReporter.cpp +++ b/clang/lib/StaticAnalyzer/Core/BugReporter.cpp @@ -2660,8 +2660,7 @@ BugPathGetter::BugPathGetter(const ExplodedGraph *OriginalGraph, // Perform a forward BFS to find all the shortest paths. std::queue WS; - assert(TrimmedGraph->num_roots() == 1); - WS.push(*TrimmedGraph->roots_begin()); + WS.push(TrimmedGraph->getRoot()); unsigned Priority = 0; while (!WS.empty()) { @@ -2722,7 +2721,9 @@ BugPathInfo *BugPathGetter::getNextBugPath() { // Are we at the final node? if (OrigN->pred_empty()) { - GNew->addRoot(NewN); + assert(OrigN == TrimmedGraph->getRoot() && + "There should be only one root!"); + GNew->designateAsRoot(NewN); break; } diff --git a/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp b/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp index 8ba304b3af0ca..2e6631f2f620c 100644 --- a/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp @@ -87,8 +87,9 @@ void CoreEngine::setBlockCounter(BlockCounter C) { /// ExecuteWorkList - Run the worklist algorithm for a maximum number of steps. bool CoreEngine::ExecuteWorkList(const LocationContext *L, unsigned MaxSteps, ProgramStateRef InitState) { - if (G.num_roots() == 0) { // Initialize the analysis by constructing - // the root if none exists. + if (G.empty()) { + assert(!G.getRoot() && "empty graph must not have a root node"); + // Initialize the analysis by constructing the root if there are no nodes. 
const CFGBlock *Entry = &(L->getCFG()->getEntry()); @@ -117,7 +118,7 @@ bool CoreEngine::ExecuteWorkList(const LocationContext *L, unsigned MaxSteps, bool IsNew; ExplodedNode *Node = G.getNode(StartLoc, InitState, false, &IsNew); assert(IsNew); - G.addRoot(Node); + G.designateAsRoot(Node); NodeBuilderContext BuilderCtx(*this, StartLoc.getDst(), Node); ExplodedNodeSet DstBegin; @@ -548,15 +549,11 @@ void CoreEngine::HandleVirtualBaseBranch(const CFGBlock *B, void CoreEngine::generateNode(const ProgramPoint &Loc, ProgramStateRef State, ExplodedNode *Pred) { + assert(Pred); bool IsNew; ExplodedNode *Node = G.getNode(Loc, State, false, &IsNew); - if (Pred) - Node->addPredecessor(Pred, G); // Link 'Node' with its predecessor. - else { - assert(IsNew); - G.addRoot(Node); // 'Node' has no predecessor. Make it a root. - } + Node->addPredecessor(Pred, G); // Link 'Node' with its predecessor. // Only add 'Node' to the worklist if it was freshly generated. if (IsNew) WList->enqueue(Node); diff --git a/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp b/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp index 7b2cccce93cfe..098922d94061f 100644 --- a/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp @@ -442,6 +442,10 @@ std::unique_ptr ExplodedGraph::trim(ArrayRef Sinks, InterExplodedGraphMap *ForwardMap, InterExplodedGraphMap *InverseMap) const { + // FIXME: The two-pass algorithm of this function (which was introduced in + // 2008) is terribly overcomplicated and should be replaced by a single + // (backward) pass. + if (Nodes.empty()) return nullptr; @@ -467,8 +471,9 @@ ExplodedGraph::trim(ArrayRef Sinks, if (!Pass1.insert(N).second) continue; - // If this is a root enqueue it to the second worklist. + // If this is the root enqueue it to the second worklist. 
if (N->Preds.empty()) { + assert(N == getRoot() && "Found non-root node with no predecessors!"); WL2.push_back(N); continue; } @@ -477,12 +482,14 @@ ExplodedGraph::trim(ArrayRef Sinks, WL1.append(N->Preds.begin(), N->Preds.end()); } - // We didn't hit a root? Return with a null pointer for the new graph. + // We didn't hit the root? Return with a null pointer for the new graph. if (WL2.empty()) return nullptr; + assert(WL2.size() == 1 && "There must be only one root!"); + // Create an empty graph. - std::unique_ptr G = MakeEmptyGraph(); + std::unique_ptr G = std::make_unique(); // ===- Pass 2 (forward DFS to construct the new graph) -=== while (!WL2.empty()) { @@ -503,9 +510,11 @@ ExplodedGraph::trim(ArrayRef Sinks, // Also record the reverse mapping from the new node to the old node. if (InverseMap) (*InverseMap)[NewN] = N; - // If this node is a root, designate it as such in the graph. - if (N->Preds.empty()) - G->addRoot(NewN); + // If this node is the root, designate it as such in the graph. + if (N->Preds.empty()) { + assert(N == getRoot()); + G->designateAsRoot(NewN); + } // In the case that some of the intended predecessors of NewN have already // been created, we should hook them up as predecessors. 
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index f71441a3bb49b..ebad83dad0c8f 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -2529,7 +2529,7 @@ static const LocationContext *getInlinedLocationContext(ExplodedNode *Node, ExplodedGraph &G) { const LocationContext *CalleeLC = Node->getLocation().getLocationContext(); const LocationContext *RootLC = - (*G.roots_begin())->getLocation().getLocationContext(); + G.getRoot()->getLocation().getLocationContext(); if (CalleeLC->getStackFrame() == RootLC->getStackFrame()) return nullptr; diff --git a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp index a6ade661d04a2..a469df4ca7160 100644 --- a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp @@ -80,6 +80,49 @@ void UnarySymExpr::dumpToStream(raw_ostream &os) const { os << ')'; } +const Stmt *SymbolConjured::getStmt() const { + // Sometimes the CFG element is invalid, avoid dereferencing it. 
+ if (Elem.getParent() == nullptr || + Elem.getIndexInBlock() >= Elem.getParent()->size()) + return nullptr; + switch (Elem->getKind()) { + case CFGElement::Initializer: + if (const auto *Init = Elem->castAs().getInitializer()) { + return Init->getInit(); + } + return nullptr; + case CFGElement::ScopeBegin: + return Elem->castAs().getTriggerStmt(); + case CFGElement::ScopeEnd: + return Elem->castAs().getTriggerStmt(); + case CFGElement::NewAllocator: + return Elem->castAs().getAllocatorExpr(); + case CFGElement::LifetimeEnds: + return Elem->castAs().getTriggerStmt(); + case CFGElement::LoopExit: + return Elem->castAs().getLoopStmt(); + case CFGElement::Statement: + return Elem->castAs().getStmt(); + case CFGElement::Constructor: + return Elem->castAs().getStmt(); + case CFGElement::CXXRecordTypedCall: + return Elem->castAs().getStmt(); + case CFGElement::AutomaticObjectDtor: + return Elem->castAs().getTriggerStmt(); + case CFGElement::DeleteDtor: + return Elem->castAs().getDeleteExpr(); + case CFGElement::BaseDtor: + return nullptr; + case CFGElement::MemberDtor: + return nullptr; + case CFGElement::TemporaryDtor: + return Elem->castAs().getBindTemporaryExpr(); + case CFGElement::CleanupFunction: + return nullptr; + } + return nullptr; +} + void SymbolConjured::dumpToStream(raw_ostream &os) const { os << getKindStr() << getSymbolID() << '{' << T << ", LC" << LCtx->getID(); if (auto *S = getStmt()) diff --git a/clang/test/AST/ByteCode/builtin-bit-cast.cpp b/clang/test/AST/ByteCode/builtin-bit-cast.cpp index 187f180afd3da..3c5e89d7d5a74 100644 --- a/clang/test/AST/ByteCode/builtin-bit-cast.cpp +++ b/clang/test/AST/ByteCode/builtin-bit-cast.cpp @@ -503,6 +503,16 @@ namespace OversizedBitField { #endif } +namespace Discarded { + enum my_byte : unsigned char {}; + struct pad { + char a; + int b; + }; + constexpr int bad_my_byte = (__builtin_bit_cast(my_byte[8], pad{1, 2}), 0); // both-error {{must be initialized by a constant expression}} \ + // both-note 
{{indeterminate value can only initialize an object of type 'unsigned char' or 'std::byte';}} +} + typedef bool bool9 __attribute__((ext_vector_type(9))); // both-error@+2 {{constexpr variable 'bad_bool9_to_short' must be initialized by a constant expression}} // both-note@+1 {{bit_cast involving type 'bool __attribute__((ext_vector_type(9)))' (vector of 9 'bool' values) is not allowed in a constant expression; element size 1 * element count 9 is not a multiple of the byte size 8}} diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp index 8fb19fcfcd3fe..0b2234ef83298 100644 --- a/clang/test/AST/ByteCode/cxx20.cpp +++ b/clang/test/AST/ByteCode/cxx20.cpp @@ -997,3 +997,21 @@ namespace NastyChar { template constexpr auto to_nasty_char() { return t; } constexpr auto result = to_nasty_char<"12345">(); } + +namespace TempDtor { + struct A { + int n; + }; + constexpr A &&a_ref = A(); // both-note {{temporary created here}} + constexpr void destroy_extern_2() { // both-error {{never produces a constant expression}} + a_ref.~A(); // both-note {{destruction of temporary is not allowed in a constant expression outside the expression that created the temporary}} + } +} + +namespace OnePastEndDtor { + struct A {int n; }; + constexpr void destroy_past_end() { // both-error {{never produces a constant expression}} + A a; + (&a+1)->~A(); // both-note {{destruction of dereferenced one-past-the-end pointer}} + } +} diff --git a/clang/test/AST/ByteCode/lifetimes26.cpp b/clang/test/AST/ByteCode/lifetimes26.cpp new file mode 100644 index 0000000000000..a5203ae77bc13 --- /dev/null +++ b/clang/test/AST/ByteCode/lifetimes26.cpp @@ -0,0 +1,49 @@ +// RUN: %clang_cc1 -verify=expected,both -std=c++26 %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -verify=ref,both -std=c++26 %s + +// both-no-diagnostics + +namespace std { + struct type_info; + struct destroying_delete_t { + explicit destroying_delete_t() = default; + } inline constexpr 
destroying_delete{}; + struct nothrow_t { + explicit nothrow_t() = default; + } inline constexpr nothrow{}; + using size_t = decltype(sizeof(0)); + enum class align_val_t : size_t {}; +}; + +constexpr void *operator new(std::size_t, void *p) { return p; } +namespace std { + template constexpr T *construct(T *p) { return new (p) T; } + template constexpr void destroy(T *p) { p->~T(); } +} + +constexpr bool foo() { + using T = bool; + bool b = true; + b.~T(); + new (&b) bool(false); + return b; +} +static_assert(!foo()); + +struct S {}; +constexpr bool foo2() { + S s; + s.~S(); + new (&s) S{}; + return true; +} +static_assert(foo2()); + +constexpr void destroy_pointer() { + using T = int*; + T p; + p.~T(); + std::construct(&p); +} +static_assert((destroy_pointer(), true)); + diff --git a/clang/test/AST/HLSL/ByteAddressBuffers-AST.hlsl b/clang/test/AST/HLSL/ByteAddressBuffers-AST.hlsl index 5fba939d29cfe..8b9aa99a5314e 100644 --- a/clang/test/AST/HLSL/ByteAddressBuffers-AST.hlsl +++ b/clang/test/AST/HLSL/ByteAddressBuffers-AST.hlsl @@ -78,5 +78,27 @@ RESOURCE Buffer; // CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'index' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr +// Constructor from implicit binding + +// CHECK: CXXConstructorDecl {{.*}} [[RESOURCE]] 'void (unsigned int, int, unsigned int, unsigned int)' inline +// CHECK-NEXT: ParmVarDecl {{.*}} spaceNo 'unsigned int' +// CHECK-NEXT: ParmVarDecl {{.*}} range 'int' +// CHECK-NEXT: ParmVarDecl {{.*}} index 'unsigned int' +// CHECK-NEXT: ParmVarDecl {{.*}} orderId 'unsigned int' +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-NEXT: BinaryOperator {{.*}} '=' +// CHECK-NEXT: MemberExpr {{.*}} lvalue .__handle +// CHECK-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]' lvalue implicit this +// CHECK-NEXT: CallExpr {{.*}} '__hlsl_resource_t +// CHECK-NEXT: ImplicitCastExpr {{.*}} +// CHECK-NEXT: DeclRefExpr {{.*}} '' Function {{.*}} '__builtin_hlsl_resource_handlefromimplicitbinding' +// CHECK-NEXT: 
MemberExpr {{.*}} lvalue .__handle +// CHECK-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]' lvalue implicit this +// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'spaceNo' 'unsigned int' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' ParmVar {{.*}} 'range' 'int' +// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'index' 'unsigned int' +// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'orderId' 'unsigned int' +// CHECK-NEXT: AlwaysInlineAttr + // CHECK-NOSUBSCRIPT-NOT: CXXMethodDecl {{.*}} operator[] 'const element_type &(unsigned int) const' // CHECK-NOSUBSCRIPT-NOT: CXXMethodDecl {{.*}} operator[] 'element_type &(unsigned int)' diff --git a/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl index 63265a0003582..f8659313ff19c 100644 --- a/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl @@ -125,6 +125,28 @@ RESOURCE Buffer; // CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'index' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr +// Constructor from implicit binding + +// CHECK: CXXConstructorDecl {{.*}} [[RESOURCE]] 'void (unsigned int, int, unsigned int, unsigned int)' inline +// CHECK-NEXT: ParmVarDecl {{.*}} spaceNo 'unsigned int' +// CHECK-NEXT: ParmVarDecl {{.*}} range 'int' +// CHECK-NEXT: ParmVarDecl {{.*}} index 'unsigned int' +// CHECK-NEXT: ParmVarDecl {{.*}} orderId 'unsigned int' +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-NEXT: BinaryOperator {{.*}} '=' +// CHECK-NEXT: MemberExpr {{.*}} lvalue .__handle +// CHECK-NEXT: CXXThisExpr {{.*}} '[[RESOURCE]]' lvalue implicit this +// CHECK-NEXT: CallExpr {{.*}} '__hlsl_resource_t +// CHECK-NEXT: ImplicitCastExpr {{.*}} +// CHECK-NEXT: DeclRefExpr {{.*}} '' Function {{.*}} '__builtin_hlsl_resource_handlefromimplicitbinding' +// CHECK-NEXT: MemberExpr {{.*}} lvalue .__handle +// CHECK-NEXT: CXXThisExpr {{.*}} '[[RESOURCE]]' lvalue implicit this +// CHECK-NEXT: 
DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'spaceNo' 'unsigned int' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' ParmVar {{.*}} 'range' 'int' +// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'index' 'unsigned int' +// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'orderId' 'unsigned int' +// CHECK-NEXT: AlwaysInlineAttr + // Subscript operators // CHECK-SUBSCRIPT: CXXMethodDecl {{.*}} operator[] 'const hlsl_device element_type &(unsigned int) const' diff --git a/clang/test/AST/HLSL/TypedBuffers-AST.hlsl b/clang/test/AST/HLSL/TypedBuffers-AST.hlsl index 6074c1e8bcdd2..dad1ef17a1f86 100644 --- a/clang/test/AST/HLSL/TypedBuffers-AST.hlsl +++ b/clang/test/AST/HLSL/TypedBuffers-AST.hlsl @@ -92,6 +92,28 @@ RESOURCE Buffer; // CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'index' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr +// Constructor from implicit binding + +// CHECK: CXXConstructorDecl {{.*}} [[RESOURCE]] 'void (unsigned int, int, unsigned int, unsigned int)' inline +// CHECK-NEXT: ParmVarDecl {{.*}} spaceNo 'unsigned int' +// CHECK-NEXT: ParmVarDecl {{.*}} range 'int' +// CHECK-NEXT: ParmVarDecl {{.*}} index 'unsigned int' +// CHECK-NEXT: ParmVarDecl {{.*}} orderId 'unsigned int' +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-NEXT: BinaryOperator {{.*}} '=' +// CHECK-NEXT: MemberExpr {{.*}} lvalue .__handle +// CHECK-NEXT: CXXThisExpr {{.*}} '[[RESOURCE]]' lvalue implicit this +// CHECK-NEXT: CallExpr {{.*}} '__hlsl_resource_t +// CHECK-NEXT: ImplicitCastExpr {{.*}} +// CHECK-NEXT: DeclRefExpr {{.*}} '' Function {{.*}} '__builtin_hlsl_resource_handlefromimplicitbinding' +// CHECK-NEXT: MemberExpr {{.*}} lvalue .__handle +// CHECK-NEXT: CXXThisExpr {{.*}} '[[RESOURCE]]' lvalue implicit this +// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'spaceNo' 'unsigned int' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' ParmVar {{.*}} 'range' 'int' +// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 
'index' 'unsigned int' +// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'orderId' 'unsigned int' +// CHECK-NEXT: AlwaysInlineAttr + // Subsctript operators // CHECK: CXXMethodDecl {{.*}} operator[] 'const hlsl_device element_type &(unsigned int) const' diff --git a/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl b/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl index 4cca9cc742c07..0cc72fd370633 100644 --- a/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl +++ b/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl @@ -21,6 +21,7 @@ cbuffer A { // AST: HLSLBufferDecl {{.*}} line:11:9 cbuffer A // AST-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// AST-NEXT: HLSLResourceBindingAttr {{.*}} Implicit "" "0" // AST-NEXT: FullComment // AST-NEXT: ParagraphComment // AST-NEXT: TextComment {{.*}} Text=" CBuffer decl." diff --git a/clang/test/AST/HLSL/packoffset.hlsl b/clang/test/AST/HLSL/packoffset.hlsl index 4fe8aed5cb31a..dc86000a87ee3 100644 --- a/clang/test/AST/HLSL/packoffset.hlsl +++ b/clang/test/AST/HLSL/packoffset.hlsl @@ -5,6 +5,7 @@ cbuffer A { // CHECK-NEXT:-HLSLResourceClassAttr {{.*}} Implicit CBuffer + // CHECK-NEXT: HLSLResourceBindingAttr {{.*}} Implicit "" "0" // CHECK-NEXT: VarDecl {{.*}} A1 'hlsl_constant float4' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 0 float4 A1 : packoffset(c); diff --git a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl index 9c73f587b7210..7fb06f8d3524a 100644 --- a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl +++ b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl @@ -16,12 +16,14 @@ float foo() { // Make sure cbuffer/tbuffer works for PCH. 
// CHECK: HLSLBufferDecl {{.*}} line:{{[0-9]+}}:9 imported cbuffer A // CHECK-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK-NEXT: HLSLResourceBindingAttr {{.*}} Implicit "" "0" // CHECK-NEXT: VarDecl 0x[[A:[0-9a-f]+]] {{.*}} imported used a 'hlsl_constant float' // CHECK-NEXT: CXXRecordDecl {{.*}} imported implicit struct __cblayout_A definition // CHECK: FieldDecl {{.*}} imported a 'float' // CHECK: HLSLBufferDecl {{.*}} line:{{[0-9]+}}:9 imported tbuffer B // CHECK-NEXT: HLSLResourceClassAttr {{.*}} Implicit SRV +// CHECK-NEXT: HLSLResourceBindingAttr {{.*}} Implicit "" "0" // CHECK-NEXT: VarDecl 0x[[B:[0-9a-f]+]] {{.*}} imported used b 'hlsl_constant float' // CHECK-NEXT: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} imported implicit struct __cblayout_B definition // CHECK: FieldDecl 0x{{[0-9a-f]+}} {{.*}} imported b 'float' diff --git a/clang/test/Analysis/ftime-trace-no-init.cpp b/clang/test/Analysis/ftime-trace-no-init.cpp new file mode 100644 index 0000000000000..7fb289b19da78 --- /dev/null +++ b/clang/test/Analysis/ftime-trace-no-init.cpp @@ -0,0 +1,5 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,apiModeling %s -ftime-trace=%t.raw.json -verify +// expected-no-diagnostics + +// GitHub issue 139779 +struct {} a; // no-crash diff --git a/clang/test/Analysis/generate_analyzer_options_docs.test b/clang/test/Analysis/generate_analyzer_options_docs.test new file mode 100644 index 0000000000000..0c95346504ae3 --- /dev/null +++ b/clang/test/Analysis/generate_analyzer_options_docs.test @@ -0,0 +1,14 @@ +The documentation of analyzer options is generated by a script that parses +AnalyzerOptions.def. 
The following line validates that this script +"understands" everything in its input files: + +RUN: %python %src_dir/docs/tools/generate_analyzer_options_docs.py \ +RUN: --options-def %src_include_dir/clang/StaticAnalyzer/Core/AnalyzerOptions.def \ +RUN: --template %src_dir/docs/analyzer/user-docs/Options.rst.in \ +RUN: --out %t.rst + +Moreover, verify that the documentation (e.g. this fragment of the +documentation of the "mode" option) can be found in the output file: + +RUN: FileCheck --input-file=%t.rst %s +CHECK: Controls the high-level analyzer mode diff --git a/clang/test/CIR/CodeGen/switch_flat_op.cpp b/clang/test/CIR/CodeGen/switch_flat_op.cpp new file mode 100644 index 0000000000000..a9fc095025eb0 --- /dev/null +++ b/clang/test/CIR/CodeGen/switch_flat_op.cpp @@ -0,0 +1,77 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: cir-opt --mlir-print-ir-before=cir-flatten-cfg --cir-flatten-cfg %t.cir -o %t.flattened.before.cir 2> %t.before +// RUN: FileCheck --input-file=%t.before %s --check-prefix=BEFORE +// RUN: cir-opt --mlir-print-ir-after=cir-flatten-cfg --cir-flatten-cfg %t.cir -o %t.flattened.after.cir 2> %t.after +// RUN: FileCheck --input-file=%t.after %s --check-prefix=AFTER + +void swf(int a) { + switch (int b = 3; a) { + case 3: + b = b * 2; + break; + case 4 ... 
5: + b = b * 3; + break; + default: + break; + } + +} + +// BEFORE: cir.func @_Z3swfi +// BEFORE: %[[VAR_B:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] {alignment = 4 : i64} +// BEFORE: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i +// BEFORE: cir.switch (%[[COND:.*]] : !s32i) { +// BEFORE: cir.case(equal, [#cir.int<3> : !s32i]) { +// BEFORE: %[[LOAD_B_EQ:.*]] = cir.load %[[VAR_B]] : !cir.ptr, !s32i +// BEFORE: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i +// BEFORE: %[[MUL_EQ:.*]] = cir.binop(mul, %[[LOAD_B_EQ]], %[[CONST_2]]) nsw : !s32i +// BEFORE: cir.store %[[MUL_EQ]], %[[VAR_B]] : !s32i, !cir.ptr +// BEFORE: cir.break +// BEFORE: } +// BEFORE: cir.case(range, [#cir.int<4> : !s32i, #cir.int<5> : !s32i]) { +// BEFORE: %[[LOAD_B_RANGE:.*]] = cir.load %[[VAR_B]] : !cir.ptr, !s32i +// BEFORE: %[[CONST_3_RANGE:.*]] = cir.const #cir.int<3> : !s32i +// BEFORE: %[[MUL_RANGE:.*]] = cir.binop(mul, %[[LOAD_B_RANGE]], %[[CONST_3_RANGE]]) nsw : !s32i +// BEFORE: cir.store %[[MUL_RANGE]], %[[VAR_B]] : !s32i, !cir.ptr +// BEFORE: cir.break +// BEFORE: } +// BEFORE: cir.case(default, []) { +// BEFORE: cir.break +// BEFORE: } +// BEFORE: cir.yield +// BEFORE: } +// BEFORE: } +// BEFORE: cir.return + +// AFTER: cir.func @_Z3swfi +// AFTER: %[[VAR_A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] {alignment = 4 : i64} +// AFTER: cir.store %arg0, %[[VAR_A]] : !s32i, !cir.ptr +// AFTER: %[[VAR_B:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] {alignment = 4 : i64} +// AFTER: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i +// AFTER: cir.store %[[CONST_3]], %[[VAR_B]] : !s32i, !cir.ptr +// AFTER: cir.switch.flat %[[COND:.*]] : !s32i, ^bb[[#BB6:]] [ +// AFTER: 3: ^bb[[#BB4:]], +// AFTER: 4: ^bb[[#BB5:]], +// AFTER: 5: ^bb[[#BB5:]] +// AFTER: ] +// AFTER: ^bb[[#BB4]]: +// AFTER: %[[LOAD_B_EQ:.*]] = cir.load %[[VAR_B]] : !cir.ptr, !s32i +// AFTER: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i +// AFTER: %[[MUL_EQ:.*]] = cir.binop(mul, %[[LOAD_B_EQ]], %[[CONST_2]]) nsw : !s32i 
+// AFTER: cir.store %[[MUL_EQ]], %[[VAR_B]] : !s32i, !cir.ptr +// AFTER: cir.br ^bb[[#BB7:]] +// AFTER: ^bb[[#BB5]]: +// AFTER: %[[LOAD_B_RANGE:.*]] = cir.load %[[VAR_B]] : !cir.ptr, !s32i +// AFTER: %[[CONST_3_AGAIN:.*]] = cir.const #cir.int<3> : !s32i +// AFTER: %[[MUL_RANGE:.*]] = cir.binop(mul, %[[LOAD_B_RANGE]], %[[CONST_3_AGAIN]]) nsw : !s32i +// AFTER: cir.store %[[MUL_RANGE]], %[[VAR_B]] : !s32i, !cir.ptr +// AFTER: cir.br ^bb[[#BB7]] +// AFTER: ^bb[[#BB6]]: +// AFTER: cir.br ^bb[[#BB7]] +// AFTER: ^bb[[#BB7]]: +// AFTER: cir.br ^bb[[#BB8:]] +// AFTER: ^bb[[#BB8]]: +// AFTER: cir.return +// AFTER: } + diff --git a/clang/test/CIR/CodeGenOpenACC/combined.cpp b/clang/test/CIR/CodeGenOpenACC/combined.cpp index 350e5f8efc2bd..d55ce762ce6f1 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined.cpp @@ -947,4 +947,67 @@ extern "C" void acc_combined(int N, int cond) { // CHECK-NEXT: acc.yield // CHECK-NEXT: } loc +#pragma acc parallel loop async + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.parallel combined(loop) async { + // CHECK-NEXT: acc.loop combined(parallel) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial loop async(cond) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.serial combined(loop) async(%[[CONV_CAST]] : si32) { + // CHECK-NEXT: acc.loop combined(serial) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc kernels loop async device_type(nvidia, radeon) async + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.kernels combined(loop) async([#acc.device_type, #acc.device_type, #acc.device_type]) { + // CHECK-NEXT: acc.loop combined(kernels) { + // CHECK: acc.yield + // CHECK-NEXT: } 
loc + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc parallel loop async(3) device_type(nvidia, radeon) async(cond) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.parallel combined(loop) async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type]) { + // CHECK-NEXT: acc.loop combined(parallel) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial loop async device_type(nvidia, radeon) async(cond) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.serial combined(loop) async([#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type]) { + // CHECK-NEXT: acc.loop combined(serial) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc kernels loop async(3) device_type(nvidia, radeon) async + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: acc.kernels combined(loop) async([#acc.device_type, #acc.device_type], %[[THREE_CAST]] : si32) { + // CHECK-NEXT: acc.loop combined(kernels) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc } diff --git 
a/clang/test/CIR/CodeGenOpenACC/data.c b/clang/test/CIR/CodeGenOpenACC/data.c index 7887df6503f08..5c8d32dfd2b73 100644 --- a/clang/test/CIR/CodeGenOpenACC/data.c +++ b/clang/test/CIR/CodeGenOpenACC/data.c @@ -37,9 +37,9 @@ void acc_data(int cond) { #pragma acc data default(none) async {} - // CHECK-NEXT: acc.data { + // CHECK-NEXT: acc.data async { // CHECK-NEXT: acc.terminator - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type], defaultAttr = #acc} + // CHECK-NEXT: } attributes {defaultAttr = #acc} #pragma acc data default(none) async(cond) {} @@ -51,9 +51,9 @@ void acc_data(int cond) { #pragma acc data default(none) async device_type(nvidia, radeon) async {} - // CHECK-NEXT: acc.data { + // CHECK-NEXT: acc.data async([#acc.device_type, #acc.device_type, #acc.device_type]) { // CHECK-NEXT: acc.terminator - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type, #acc.device_type, #acc.device_type], defaultAttr = #acc} + // CHECK-NEXT: } attributes {defaultAttr = #acc} #pragma acc data default(none) async(3) device_type(nvidia, radeon) async(cond) {} @@ -69,17 +69,17 @@ void acc_data(int cond) { {} // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 - // CHECK-NEXT: acc.data async(%[[CONV_CAST]] : si32 [#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type]) { + // CHECK-NEXT: acc.data async([#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type]) { // CHECK-NEXT: acc.terminator - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type], defaultAttr = #acc} + // CHECK-NEXT: } attributes {defaultAttr = #acc} #pragma acc data default(none) async(3) device_type(nvidia, radeon) async {} // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 - // CHECK-NEXT: 
acc.data async(%[[THREE_CAST]] : si32) { + // CHECK-NEXT: acc.data async([#acc.device_type, #acc.device_type], %[[THREE_CAST]] : si32) { // CHECK-NEXT: acc.terminator - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type, #acc.device_type], defaultAttr = #acc} + // CHECK-NEXT: } attributes {defaultAttr = #acc} #pragma acc data default(none) if(cond) {} diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c index 7175e342c39bd..500bec875cf6b 100644 --- a/clang/test/CIR/CodeGenOpenACC/kernels.c +++ b/clang/test/CIR/CodeGenOpenACC/kernels.c @@ -212,9 +212,9 @@ void acc_kernels(int cond) { #pragma acc kernels async {} - // CHECK-NEXT: acc.kernels { + // CHECK-NEXT: acc.kernels async { // CHECK-NEXT: acc.terminator - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type]} + // CHECK-NEXT: } loc #pragma acc kernels async(cond) {} @@ -226,9 +226,9 @@ void acc_kernels(int cond) { #pragma acc kernels async device_type(nvidia, radeon) async {} - // CHECK-NEXT: acc.kernels { + // CHECK-NEXT: acc.kernels async([#acc.device_type, #acc.device_type, #acc.device_type]) { // CHECK-NEXT: acc.terminator - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type, #acc.device_type, #acc.device_type]} + // CHECK-NEXT: } loc #pragma acc kernels async(3) device_type(nvidia, radeon) async(cond) {} @@ -244,17 +244,17 @@ void acc_kernels(int cond) { {} // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 - // CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32 [#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type]) { + // CHECK-NEXT: acc.kernels async([#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type]) { // CHECK-NEXT: acc.terminator - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type]} + // CHECK-NEXT: } loc #pragma acc kernels async(3) 
device_type(nvidia, radeon) async {} // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 - // CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32) { + // CHECK-NEXT: acc.kernels async([#acc.device_type, #acc.device_type], %[[THREE_CAST]] : si32) { // CHECK-NEXT: acc.terminator - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type, #acc.device_type]} + // CHECK-NEXT: } loc #pragma acc kernels num_gangs(1) {} diff --git a/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp b/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp index b3299c0b4c137..95b04a314ad8e 100644 --- a/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp +++ b/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp @@ -13,7 +13,7 @@ void HelloWorld(int *A, int *B, int *C, int N) { // expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Clause: private}} #pragma acc parallel loop private(A) for(int i = 0; i <5; ++i); - // expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Clause: async}} -#pragma acc parallel loop async + // expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Clause: reduction}} +#pragma acc parallel loop reduction(+:A) for(int i = 0; i <5; ++i); } diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c b/clang/test/CIR/CodeGenOpenACC/parallel.c index c9208566bf2b9..54b3cd024123f 100644 --- a/clang/test/CIR/CodeGenOpenACC/parallel.c +++ b/clang/test/CIR/CodeGenOpenACC/parallel.c @@ -211,9 +211,9 @@ void acc_parallel(int cond) { #pragma acc parallel async {} - // CHECK-NEXT: acc.parallel { + // CHECK-NEXT: acc.parallel async { // CHECK-NEXT: acc.yield - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type]} + // CHECK-NEXT: } loc #pragma acc parallel async(cond) {} @@ -225,9 +225,9 @@ void acc_parallel(int cond) { #pragma acc parallel async device_type(nvidia, radeon) async {} - // 
CHECK-NEXT: acc.parallel { + // CHECK-NEXT: acc.parallel async([#acc.device_type, #acc.device_type, #acc.device_type]) { // CHECK-NEXT: acc.yield - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type, #acc.device_type, #acc.device_type]} + // CHECK-NEXT: } loc #pragma acc parallel async(3) device_type(nvidia, radeon) async(cond) {} @@ -243,17 +243,17 @@ void acc_parallel(int cond) { {} // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 - // CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32 [#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type]) { + // CHECK-NEXT: acc.parallel async([#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type]) { // CHECK-NEXT: acc.yield - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type]} + // CHECK-NEXT: } loc #pragma acc parallel async(3) device_type(nvidia, radeon) async {} // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 - // CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32) { + // CHECK-NEXT: acc.parallel async([#acc.device_type, #acc.device_type], %[[THREE_CAST]] : si32) { // CHECK-NEXT: acc.yield - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type, #acc.device_type]} + // CHECK-NEXT: } loc #pragma acc parallel num_gangs(1) {} diff --git a/clang/test/CIR/CodeGenOpenACC/serial.c b/clang/test/CIR/CodeGenOpenACC/serial.c index 88a49a95f87d7..90756b04d8f52 100644 --- a/clang/test/CIR/CodeGenOpenACC/serial.c +++ b/clang/test/CIR/CodeGenOpenACC/serial.c @@ -108,9 +108,9 @@ void acc_serial(int cond) { #pragma acc serial async {} - // CHECK-NEXT: acc.serial { + // CHECK-NEXT: acc.serial async { // CHECK-NEXT: acc.yield - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type]} + // CHECK-NEXT: } loc #pragma 
acc serial async(cond) {} @@ -122,9 +122,9 @@ void acc_serial(int cond) { #pragma acc serial async device_type(nvidia, radeon) async {} - // CHECK-NEXT: acc.serial { + // CHECK-NEXT: acc.serial async([#acc.device_type, #acc.device_type, #acc.device_type]) { // CHECK-NEXT: acc.yield - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type, #acc.device_type, #acc.device_type]} + // CHECK-NEXT: } loc #pragma acc serial async(3) device_type(nvidia, radeon) async(cond) {} @@ -140,17 +140,17 @@ void acc_serial(int cond) { {} // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 - // CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32 [#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type]) { + // CHECK-NEXT: acc.serial async([#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type], %[[CONV_CAST]] : si32 [#acc.device_type]) { // CHECK-NEXT: acc.yield - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type]} + // CHECK-NEXT: } loc #pragma acc serial async(3) device_type(nvidia, radeon) async {} // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 - // CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32) { + // CHECK-NEXT: acc.serial async([#acc.device_type, #acc.device_type], %[[THREE_CAST]] : si32) { // CHECK-NEXT: acc.yield - // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type, #acc.device_type]} + // CHECK-NEXT: } loc #pragma acc serial wait {} diff --git a/clang/test/CIR/CodeGenOpenACC/wait.c b/clang/test/CIR/CodeGenOpenACC/wait.c index 569846a91ab8a..c8e345d4b9a0e 100644 --- a/clang/test/CIR/CodeGenOpenACC/wait.c +++ b/clang/test/CIR/CodeGenOpenACC/wait.c @@ -15,7 +15,7 @@ void acc_wait(int cond) { // CHECK-NEXT: acc.wait if(%[[CONV_CAST]]) #pragma acc wait async - // CHECK-NEXT: acc.wait attributes {async} 
+ // CHECK-NEXT: acc.wait async loc #pragma acc wait async(cond) // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr, !s32i @@ -32,7 +32,7 @@ void acc_wait(int cond) { // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32 // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32 - // CHECK-NEXT: acc.wait(%[[ONE_CAST]], %[[TWO_CAST]] : si32, si32) attributes {async} + // CHECK-NEXT: acc.wait(%[[ONE_CAST]], %[[TWO_CAST]] : si32, si32) async loc #pragma acc wait(queues:1) if (cond) @@ -71,7 +71,7 @@ void acc_wait(int cond) { // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32 // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 - // CHECK-NEXT: acc.wait(%[[TWO_CAST]], %[[THREE_CAST]] : si32, si32) wait_devnum(%[[ONE_CAST]] : si32) attributes {async} + // CHECK-NEXT: acc.wait(%[[TWO_CAST]], %[[THREE_CAST]] : si32, si32) async wait_devnum(%[[ONE_CAST]] : si32) loc // CHECK-NEXT: cir.return } diff --git a/clang/test/CIR/IR/switch-flat.cir b/clang/test/CIR/IR/switch-flat.cir new file mode 100644 index 0000000000000..8c11a74484d39 --- /dev/null +++ b/clang/test/CIR/IR/switch-flat.cir @@ -0,0 +1,68 @@ +// RUN: cir-opt %s | FileCheck %s +!s32i = !cir.int + +cir.func @FlatSwitchWithoutDefault(%arg0: !s32i) { + cir.switch.flat %arg0 : !s32i, ^bb2 [ + 1: ^bb1 + ] + ^bb1: + cir.br ^bb2 + ^bb2: + cir.return +} + +// CHECK: cir.switch.flat %arg0 : !s32i, ^bb2 [ +// CHECK-NEXT: 1: ^bb1 +// CHECK-NEXT: ] +// CHECK-NEXT: ^bb1: +// CHECK-NEXT: cir.br ^bb2 +// CHECK-NEXT: ^bb2: +// CHECK-NEXT: cir.return + +cir.func @FlatSwitchWithDefault(%arg0: !s32i) { + cir.switch.flat %arg0 : !s32i, ^bb2 [ + 1: ^bb1 + ] + ^bb1: + cir.br ^bb3 + ^bb2: + cir.br 
^bb3 + ^bb3: + cir.return +} + +// CHECK: cir.switch.flat %arg0 : !s32i, ^bb2 [ +// CHECK-NEXT: 1: ^bb1 +// CHECK-NEXT: ] +// CHECK-NEXT: ^bb1: +// CHECK-NEXT: cir.br ^bb3 +// CHECK-NEXT: ^bb2: +// CHECK-NEXT: cir.br ^bb3 +// CHECK-NEXT: ^bb3: +// CHECK-NEXT: cir.return + +cir.func @switchWithOperands(%arg0: !s32i, %arg1: !s32i, %arg2: !s32i) { + cir.switch.flat %arg0 : !s32i, ^bb3 [ + 0: ^bb1(%arg1, %arg2 : !s32i, !s32i), + 1: ^bb2(%arg2, %arg1 : !s32i, !s32i) + ] +^bb1: + cir.br ^bb3 + +^bb2: + cir.br ^bb3 + +^bb3: + cir.return +} + +// CHECK: cir.switch.flat %arg0 : !s32i, ^bb3 [ +// CHECK-NEXT: 0: ^bb1(%arg1, %arg2 : !s32i, !s32i), +// CHECK-NEXT: 1: ^bb2(%arg2, %arg1 : !s32i, !s32i) +// CHECK-NEXT: ] +// CHECK-NEXT: ^bb1: +// CHECK-NEXT: cir.br ^bb3 +// CHECK-NEXT: ^bb2: +// CHECK-NEXT: cir.br ^bb3 +// CHECK-NEXT: ^bb3: +// CHECK-NEXT: cir.return diff --git a/clang/test/CIR/Transforms/switch.cir b/clang/test/CIR/Transforms/switch.cir new file mode 100644 index 0000000000000..00b462a6075c9 --- /dev/null +++ b/clang/test/CIR/Transforms/switch.cir @@ -0,0 +1,318 @@ +// RUN: cir-opt %s -cir-flatten-cfg -o - | FileCheck %s + +!s8i = !cir.int +!s32i = !cir.int +!s64i = !cir.int + +module { + cir.func @shouldFlatSwitchWithDefault(%arg0: !s8i) { + cir.switch (%arg0 : !s8i) { + cir.case (equal, [#cir.int<1> : !s8i]) { + cir.break + } + cir.case (default, []) { + cir.break + } + cir.yield + } + cir.return + } +// CHECK: cir.func @shouldFlatSwitchWithDefault(%arg0: !s8i) { +// CHECK: cir.switch.flat %arg0 : !s8i, ^bb[[#DEFAULT:]] [ +// CHECK: 1: ^bb[[#CASE1:]] +// CHECK: ] +// CHECK: ^bb[[#CASE1]]: +// CHECK: cir.br ^bb[[#EXIT:]] +// CHECK: ^bb[[#DEFAULT]]: +// CHECK: cir.br ^bb[[#EXIT]] +// CHECK: ^bb[[#EXIT]]: +// CHECK: cir.return +// CHECK: } + + cir.func @shouldFlatSwitchWithoutDefault(%arg0: !s32i) { + cir.switch (%arg0 : !s32i) { + cir.case (equal, [#cir.int<1> : !s32i]) { + cir.break + } + cir.yield + } + cir.return + } +// CHECK: cir.func 
@shouldFlatSwitchWithoutDefault(%arg0: !s32i) { +// CHECK: cir.switch.flat %arg0 : !s32i, ^bb[[#EXIT:]] [ +// CHECK: 1: ^bb[[#CASE1:]] +// CHECK: ] +// CHECK: ^bb[[#CASE1]]: +// CHECK: cir.br ^bb[[#EXIT]] +// CHECK: ^bb[[#EXIT]]: +// CHECK: cir.return +// CHECK: } + + + cir.func @shouldFlatSwitchWithImplicitFallthrough(%arg0: !s64i) { + cir.switch (%arg0 : !s64i) { + cir.case (anyof, [#cir.int<1> : !s64i, #cir.int<2> : !s64i]) { + cir.break + } + cir.yield + } + cir.return + } +// CHECK: cir.func @shouldFlatSwitchWithImplicitFallthrough(%arg0: !s64i) { +// CHECK: cir.switch.flat %arg0 : !s64i, ^bb[[#EXIT:]] [ +// CHECK: 1: ^bb[[#CASE1N2:]], +// CHECK: 2: ^bb[[#CASE1N2]] +// CHECK: ] +// CHECK: ^bb[[#CASE1N2]]: +// CHECK: cir.br ^bb[[#EXIT]] +// CHECK: ^bb[[#EXIT]]: +// CHECK: cir.return +// CHECK: } + + + + cir.func @shouldFlatSwitchWithExplicitFallthrough(%arg0: !s64i) { + cir.switch (%arg0 : !s64i) { + cir.case (equal, [#cir.int<1> : !s64i]) { // case 1 has its own region + cir.yield // fallthrough to case 2 + } + cir.case (equal, [#cir.int<2> : !s64i]) { + cir.break + } + cir.yield + } + cir.return + } +// CHECK: cir.func @shouldFlatSwitchWithExplicitFallthrough(%arg0: !s64i) { +// CHECK: cir.switch.flat %arg0 : !s64i, ^bb[[#EXIT:]] [ +// CHECK: 1: ^bb[[#CASE1:]], +// CHECK: 2: ^bb[[#CASE2:]] +// CHECK: ] +// CHECK: ^bb[[#CASE1]]: +// CHECK: cir.br ^bb[[#CASE2]] +// CHECK: ^bb[[#CASE2]]: +// CHECK: cir.br ^bb[[#EXIT]] +// CHECK: ^bb[[#EXIT]]: +// CHECK: cir.return +// CHECK: } + + cir.func @shouldFlatSwitchWithFallthroughToExit(%arg0: !s64i) { + cir.switch (%arg0 : !s64i) { + cir.case (equal, [#cir.int<1> : !s64i]) { + cir.yield // fallthrough to exit + } + cir.yield + } + cir.return + } +// CHECK: cir.func @shouldFlatSwitchWithFallthroughToExit(%arg0: !s64i) { +// CHECK: cir.switch.flat %arg0 : !s64i, ^bb[[#EXIT:]] [ +// CHECK: 1: ^bb[[#CASE1:]] +// CHECK: ] +// CHECK: ^bb[[#CASE1]]: +// CHECK: cir.br ^bb[[#EXIT]] +// CHECK: ^bb[[#EXIT]]: +// CHECK: cir.return 
+// CHECK: } + + cir.func @shouldDropEmptySwitch(%arg0: !s64i) { + cir.switch (%arg0 : !s64i) { + cir.yield + } + // CHECK-NOT: llvm.switch + cir.return + } +// CHECK: cir.func @shouldDropEmptySwitch(%arg0: !s64i) +// CHECK-NOT: cir.switch.flat + + + cir.func @shouldFlatMultiBlockCase(%arg0: !s32i) { + %0 = cir.alloca !s32i, !cir.ptr, ["a", init] {alignment = 4 : i64} + cir.store %arg0, %0 : !s32i, !cir.ptr + cir.scope { + %1 = cir.load %0 : !cir.ptr, !s32i + cir.switch (%1 : !s32i) { + cir.case (equal, [#cir.int<3> : !s32i]) { + cir.return + ^bb1: // no predecessors + cir.break + } + cir.yield + } + } + cir.return + } + +// CHECK: cir.func @shouldFlatMultiBlockCase(%arg0: !s32i) { +// CHECK: %0 = cir.alloca !s32i, !cir.ptr, ["a", init] {alignment = 4 : i64} +// CHECK: cir.store %arg0, %0 : !s32i, !cir.ptr +// CHECK: cir.br ^bb1 +// CHECK: ^bb1: // pred: ^bb0 +// CHECK: %1 = cir.load %0 : !cir.ptr, !s32i +// CHECK: cir.switch.flat %1 : !s32i, ^bb[[#DEFAULT:]] [ +// CHECK: 3: ^bb[[#BB1:]] +// CHECK: ] +// CHECK: ^bb[[#BB1]]: +// CHECK: cir.return +// CHECK: ^bb[[#DEFAULT]]: +// CHECK: cir.br ^bb[[#RET_BB:]] +// CHECK: ^bb[[#RET_BB]]: // pred: ^bb[[#DEFAULT]] +// CHECK: cir.return +// CHECK: } + + + cir.func @shouldFlatNestedBreak(%arg0: !s32i, %arg1: !s32i) -> !s32i { + %0 = cir.alloca !s32i, !cir.ptr, ["x", init] {alignment = 4 : i64} + %1 = cir.alloca !s32i, !cir.ptr, ["y", init] {alignment = 4 : i64} + %2 = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} + cir.store %arg0, %0 : !s32i, !cir.ptr + cir.store %arg1, %1 : !s32i, !cir.ptr + cir.scope { + %5 = cir.load %0 : !cir.ptr, !s32i + cir.switch (%5 : !s32i) { + cir.case (equal, [#cir.int<0> : !s32i]) { + cir.scope { + %6 = cir.load %1 : !cir.ptr, !s32i + %7 = cir.const #cir.int<0> : !s32i + %8 = cir.cmp(ge, %6, %7) : !s32i, !cir.bool + cir.if %8 { + cir.break + } + } + cir.break + } + cir.yield + } + } + %3 = cir.const #cir.int<3> : !s32i + cir.store %3, %2 : !s32i, !cir.ptr + %4 = cir.load %2 : 
!cir.ptr, !s32i + cir.return %4 : !s32i + } +// CHECK: cir.func @shouldFlatNestedBreak(%arg0: !s32i, %arg1: !s32i) -> !s32i { +// CHECK: cir.switch.flat %[[COND:.*]] : !s32i, ^bb[[#DEFAULT_BB:]] [ +// CHECK: 0: ^bb[[#BB1:]] +// CHECK: ] +// CHECK: ^bb[[#BB1]]: +// CHECK: cir.br ^bb[[#COND_BB:]] +// CHECK: ^bb[[#COND_BB]]: +// CHECK: cir.brcond {{%.*}} ^bb[[#TRUE_BB:]], ^bb[[#FALSE_BB:]] +// CHECK: ^bb[[#TRUE_BB]]: +// CHECK: cir.br ^bb[[#DEFAULT_BB]] +// CHECK: ^bb[[#FALSE_BB]]: +// CHECK: cir.br ^bb[[#PRED_BB:]] +// CHECK: ^bb[[#PRED_BB]]: +// CHECK: cir.br ^bb[[#DEFAULT_BB]] +// CHECK: ^bb[[#DEFAULT_BB]]: +// CHECK: cir.br ^bb[[#RET_BB:]] +// CHECK: ^bb[[#RET_BB]]: +// CHECK: cir.return +// CHECK: } + + + cir.func @flatCaseRange(%arg0: !s32i) -> !s32i { + %0 = cir.alloca !s32i, !cir.ptr, ["x", init] {alignment = 4 : i64} + %1 = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} + %2 = cir.alloca !s32i, !cir.ptr, ["y", init] {alignment = 4 : i64} + cir.store %arg0, %0 : !s32i, !cir.ptr + %3 = cir.const #cir.int<0> : !s32i + cir.store %3, %2 : !s32i, !cir.ptr + cir.scope { + %6 = cir.load %0 : !cir.ptr, !s32i + cir.switch (%6 : !s32i) { + cir.case (equal, [#cir.int<-100> : !s32i]) { + %7 = cir.const #cir.int<1> : !s32i + cir.store %7, %2 : !s32i, !cir.ptr + cir.break + } + cir.case (range, [#cir.int<1> : !s32i, #cir.int<100> : !s32i]) { + %7 = cir.const #cir.int<2> : !s32i + cir.store %7, %2 : !s32i, !cir.ptr + cir.break + } + cir.case (default, []) { + %7 = cir.const #cir.int<3> : !s32i + cir.store %7, %2 : !s32i, !cir.ptr + cir.break + } + cir.yield + } + } + %4 = cir.load %2 : !cir.ptr, !s32i + cir.store %4, %1 : !s32i, !cir.ptr + %5 = cir.load %1 : !cir.ptr, !s32i + cir.return %5 : !s32i + } +// CHECK: cir.func @flatCaseRange(%arg0: !s32i) -> !s32i { +// CHECK: cir.switch.flat %[[X:[0-9]+]] : !s32i, ^[[JUDGE_RANGE:bb[0-9]+]] [ +// CHECK-NEXT: -100: ^[[CASE_EQUAL:bb[0-9]+]] +// CHECK-NEXT: ] +// CHECK-NEXT: ^[[UNRACHABLE_BB:.+]]: // no predecessors 
+// CHECK-NEXT: cir.br ^[[CASE_EQUAL]] +// CHECK-NEXT: ^[[CASE_EQUAL]]: +// CHECK-NEXT: cir.int<1> +// CHECK-NEXT: cir.store +// CHECK-NEXT: cir.br ^[[EPILOG:bb[0-9]+]] +// CHECK-NEXT: ^[[CASE_RANGE:bb[0-9]+]]: +// CHECK-NEXT: cir.int<2> +// CHECK-NEXT: cir.store +// CHECK-NEXT: cir.br ^[[EPILOG]] +// CHECK-NEXT: ^[[JUDGE_RANGE]]: +// CHECK-NEXT: %[[RANGE:[0-9]+]] = cir.const #cir.int<99> +// CHECK-NEXT: %[[LOWER_BOUND:[0-9]+]] = cir.const #cir.int<1> +// CHECK-NEXT: %[[DIFF:[0-9]+]] = cir.binop(sub, %[[X]], %[[LOWER_BOUND]]) +// CHECK-NEXT: %[[U_DIFF:[0-9]+]] = cir.cast(integral, %[[DIFF]] : !s32i), !u32i +// CHECK-NEXT: %[[U_RANGE:[0-9]+]] = cir.cast(integral, %[[RANGE]] : !s32i), !u32i +// CHECK-NEXT: %[[CMP_RESULT:[0-9]+]] = cir.cmp(le, %[[U_DIFF]], %[[U_RANGE]]) +// CHECK-NEXT: cir.brcond %[[CMP_RESULT]] ^[[CASE_RANGE]], ^[[CASE_DEFAULT:bb[0-9]+]] +// CHECK-NEXT: ^[[CASE_DEFAULT]]: +// CHECK-NEXT: cir.int<3> +// CHECK-NEXT: cir.store +// CHECK-NEXT: cir.br ^[[EPILOG]] +// CHECK-NEXT: ^[[EPILOG]]: +// CHECK-NEXT: cir.br ^[[EPILOG_END:bb[0-9]+]] +// CHECK-NEXT: ^[[EPILOG_END]]: +// CHECK: cir.return +// CHECK: } + + cir.func @_Z8bigRangei(%arg0: !s32i) { + %0 = cir.alloca !s32i, !cir.ptr, ["a", init] {alignment = 4 : i64} + cir.store %arg0, %0 : !s32i, !cir.ptr + cir.scope { + %1 = cir.load %0 : !cir.ptr, !s32i + cir.switch (%1 : !s32i) { + cir.case(range, [#cir.int<3> : !s32i, #cir.int<100> : !s32i]) { + cir.break + } + cir.case(default, []) { + cir.break + } + cir.yield + } + } + cir.return + } + +// CHECK: cir.func @_Z8bigRangei(%arg0: !s32i) { +// CHECK: cir.switch.flat %[[COND:.*]] : !s32i, ^bb[[#RANGE_BR:]] [ +// CHECK: ] +// CHECK: ^bb[[#NO_PRED_BB:]]: // no predecessors +// CHECK: cir.br ^bb[[#DEFAULT_BB:]] +// CHECK: ^bb[[#DEFAULT_BB]]: // 2 preds: ^bb[[#NO_PRED_BB]], ^bb[[#RANGE_BR]] +// CHECK: cir.br ^bb[[#EXIT:]] +// CHECK: ^bb[[#RANGE_BR]]: // pred: ^bb[[#BB2:]] +// CHECK: %[[CONST97:.*]] = cir.const #cir.int<97> : !s32i +// CHECK: %[[CONST3:.*]] = 
cir.const #cir.int<3> : !s32i +// CHECK: %[[SUB:.*]] = cir.binop(sub, %[[COND]], %[[CONST3]]) : !s32i +// CHECK: %[[CAST1:.*]] = cir.cast(integral, %[[SUB]] : !s32i), !u32i +// CHECK: %[[CAST2:.*]] = cir.cast(integral, %[[CONST97]] : !s32i), !u32i +// CHECK: %[[CMP:.*]] = cir.cmp(le, %[[CAST1]], %[[CAST2]]) : !u32i, !cir.bool +// CHECK: cir.brcond %7 ^bb[[#DEFAULT_BB]], ^bb[[#RANGE_BB:]] +// CHECK: ^bb[[#RANGE_BB]]: // pred: ^bb[[#RANGE_BR]] +// CHECK: cir.br ^bb[[#EXIT]] +// CHECK: ^bb[[#EXIT]]: // 2 preds: ^bb[[#DEFAULT_BB]], ^bb[[#RANGE_BB]] +// CHECK: cir.br ^bb[[#RET_BB:]] +// CHECK: ^bb[[#RET_BB]]: // pred: ^bb[[#EXIT]] +// CHECK: cir.return +// CHECK: } +} diff --git a/clang/test/CodeCompletion/source-loc-zero.cpp b/clang/test/CodeCompletion/source-loc-zero.cpp new file mode 100644 index 0000000000000..a428c1534ffde --- /dev/null +++ b/clang/test/CodeCompletion/source-loc-zero.cpp @@ -0,0 +1,11 @@ +// Regression test for #139375 +// Clang uses 1-based indexing for source locations given from the command-line. +// Verify that Clang rejects 0 as an invalid value for line or column number. + +// RUN: not %clang_cc1 -fsyntax-only -code-completion-at=%s:0:1 %s -o - 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-DIAG %s +// RUN: not %clang_cc1 -fsyntax-only -code-completion-at=%s:1:0 %s -o - 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-DIAG %s + +// CHECK-DIAG: error: invalid value '{{.*}}' in '-code-completion-at={{.*}}' +// CHECK-NEXT: hint: -code-completion-at=:: requires and to be integers greater than zero diff --git a/clang/test/CodeGen/AArch64/cpu-supports-target.c b/clang/test/CodeGen/AArch64/cpu-supports-target.c index a39ffd4e4a74d..9b551a0714e74 100644 --- a/clang/test/CodeGen/AArch64/cpu-supports-target.c +++ b/clang/test/CodeGen/AArch64/cpu-supports-target.c @@ -220,7 +220,7 @@ int test_versions() { //. 
// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } -// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" } +// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone vscale_range(1,16) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" } //. // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} diff --git a/clang/test/CodeGen/AArch64/fp8-init-list.c b/clang/test/CodeGen/AArch64/fp8-init-list.c index 872ee4f8a3d42..8b4b31a71c46a 100644 --- a/clang/test/CodeGen/AArch64/fp8-init-list.c +++ b/clang/test/CodeGen/AArch64/fp8-init-list.c @@ -12,14 +12,14 @@ // CHECK-LABEL: define dso_local <8 x i8> @vector_init_test( // CHECK-SAME: <1 x i8> [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <1 x i8> [[X]], <1 x i8> poison, <8 x i32> zeroinitializer -// CHECK-NEXT: ret <8 x i8> [[VECINIT7]] +// CHECK-NEXT: [[VECINIT14:%.*]] = shufflevector <1 x i8> [[X]], <1 x i8> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[VECINIT14]] // // CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z16vector_init_testu6__mfp8( // CHECK-CXX-SAME: <1 x i8> [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[VECINIT7:%.*]] = shufflevector <1 x i8> [[X]], <1 x i8> poison, <8 x i32> zeroinitializer -// CHECK-CXX-NEXT: ret <8 x i8> [[VECINIT7]] +// CHECK-CXX-NEXT: [[VECINIT14:%.*]] = shufflevector <1 x i8> [[X]], <1 x i8> poison, <8 x i32> zeroinitializer +// CHECK-CXX-NEXT: ret <8 x i8> 
[[VECINIT14]] // mfloat8x8_t vector_init_test(__mfp8 x) { return (mfloat8x8_t) {x, x, x, x, x, x, x, x}; @@ -34,15 +34,13 @@ struct S s; // CHECK-LABEL: define dso_local void @f( // CHECK-SAME: <1 x i8> [[X:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[X]], i64 0 -// CHECK-NEXT: store i8 [[TMP0]], ptr @s, align 1, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: store <1 x i8> [[X]], ptr @s, align 1, !tbaa [[TBAA2:![0-9]+]] // CHECK-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z1fu6__mfp8( // CHECK-CXX-SAME: <1 x i8> [[X:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[X]], i64 0 -// CHECK-CXX-NEXT: store i8 [[TMP0]], ptr @s, align 1, !tbaa [[TBAA2:![0-9]+]] +// CHECK-CXX-NEXT: store <1 x i8> [[X]], ptr @s, align 1, !tbaa [[TBAA2:![0-9]+]] // CHECK-CXX-NEXT: ret void // void f(__mfp8 x) { diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c new file mode 100644 index 0000000000000..fdc861836baf7 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c @@ -0,0 +1,1158 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +#include + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,sroa | FileCheck %s + +// REQUIRES: aarch64-registered-target + +// CHECK-LABEL: define dso_local <8 x i8> @test_vset_lane_mf8( +// CHECK-SAME: <1 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[A]] to i8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]] +// 
+mfloat8x8_t test_vset_lane_mf8(mfloat8_t a, mfloat8x8_t b) { + return vset_lane_mf8(a, b, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vsetq_lane_mf8( +// CHECK-SAME: <1 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[A]] to i8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]] +// +mfloat8x16_t test_vsetq_lane_mf8(mfloat8_t a, mfloat8x16_t b) { + return vsetq_lane_mf8(a, b, 15); +} + + +// CHECK-LABEL: define dso_local <1 x i8> @test_vget_lane_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: ret <1 x i8> [[TMP0]] +// +mfloat8_t test_vget_lane_mf8(mfloat8x8_t a) { + return vget_lane_mf8(a, 7); +} + +// CHECK-LABEL: define dso_local <1 x i8> @test_vdupb_lane_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: ret <1 x i8> [[TMP0]] +// +mfloat8_t test_vdupb_lane_mf8(mfloat8x8_t a) { + return vdupb_lane_mf8(a, 7); +} + +// CHECK-LABEL: define dso_local <1 x i8> @test_vgetq_lane_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i32 15 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: ret <1 x i8> [[TMP0]] +// +mfloat8_t test_vgetq_lane_mf8(mfloat8x16_t a) { + return vgetq_lane_mf8(a, 15); +} + +// CHECK-LABEL: define dso_local <1 x i8> @test_vdupb_laneq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement 
<16 x i8> [[A]], i32 15 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: ret <1 x i8> [[TMP0]] +// +mfloat8_t test_vdupb_laneq_mf8(mfloat8x16_t a) { + return vdupb_laneq_mf8(a, 15); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vcreate_mf8( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// +mfloat8x8_t test_vcreate_mf8(uint64_t a) { + return vcreate_mf8(a); +} + + +// CHECK-LABEL: define dso_local <8 x i8> @test_vdup_n_mf8( +// CHECK-SAME: <1 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT_I:%.*]] = shufflevector <8 x i8> [[VEXT_I]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VEXT1_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> [[VEXT1_I]], <8 x i32> +// CHECK-NEXT: [[VEXT3_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT4_I:%.*]] = shufflevector <8 x i8> [[VECINIT2_I]], <8 x i8> [[VEXT3_I]], <8 x i32> +// CHECK-NEXT: [[VEXT5_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT6_I:%.*]] = shufflevector <8 x i8> [[VECINIT4_I]], <8 x i8> [[VEXT5_I]], <8 x i32> +// CHECK-NEXT: [[VEXT7_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT6_I]], <8 x i8> [[VEXT7_I]], <8 x i32> +// CHECK-NEXT: [[VEXT9_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT10_I:%.*]] = shufflevector <8 x i8> [[VECINIT8_I]], <8 x i8> [[VEXT9_I]], <8 x i32> +// CHECK-NEXT: [[VEXT11_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT12_I:%.*]] = 
shufflevector <8 x i8> [[VECINIT10_I]], <8 x i8> [[VEXT11_I]], <8 x i32> +// CHECK-NEXT: [[VEXT13_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT14_I:%.*]] = shufflevector <8 x i8> [[VECINIT12_I]], <8 x i8> [[VEXT13_I]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[VECINIT14_I]] +// +mfloat8x8_t test_vdup_n_mf8(mfloat8_t a) { + return vdup_n_mf8(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vdupq_n_mf8( +// CHECK-SAME: <1 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT_I:%.*]] = shufflevector <16 x i8> [[VEXT_I]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VEXT1_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> [[VEXT1_I]], <16 x i32> +// CHECK-NEXT: [[VEXT3_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT4_I:%.*]] = shufflevector <16 x i8> [[VECINIT2_I]], <16 x i8> [[VEXT3_I]], <16 x i32> +// CHECK-NEXT: [[VEXT5_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT6_I:%.*]] = shufflevector <16 x i8> [[VECINIT4_I]], <16 x i8> [[VEXT5_I]], <16 x i32> +// CHECK-NEXT: [[VEXT7_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT8_I:%.*]] = shufflevector <16 x i8> [[VECINIT6_I]], <16 x i8> [[VEXT7_I]], <16 x i32> +// CHECK-NEXT: [[VEXT9_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT10_I:%.*]] = shufflevector <16 x i8> [[VECINIT8_I]], <16 x i8> [[VEXT9_I]], <16 x i32> +// CHECK-NEXT: [[VEXT11_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT12_I:%.*]] = shufflevector <16 x i8> [[VECINIT10_I]], <16 x i8> [[VEXT11_I]], <16 x i32> +// CHECK-NEXT: [[VEXT13_I:%.*]] = 
shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT14_I:%.*]] = shufflevector <16 x i8> [[VECINIT12_I]], <16 x i8> [[VEXT13_I]], <16 x i32> +// CHECK-NEXT: [[VEXT15_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT16_I:%.*]] = shufflevector <16 x i8> [[VECINIT14_I]], <16 x i8> [[VEXT15_I]], <16 x i32> +// CHECK-NEXT: [[VEXT17_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT18_I:%.*]] = shufflevector <16 x i8> [[VECINIT16_I]], <16 x i8> [[VEXT17_I]], <16 x i32> +// CHECK-NEXT: [[VEXT19_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT20_I:%.*]] = shufflevector <16 x i8> [[VECINIT18_I]], <16 x i8> [[VEXT19_I]], <16 x i32> +// CHECK-NEXT: [[VEXT21_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT22_I:%.*]] = shufflevector <16 x i8> [[VECINIT20_I]], <16 x i8> [[VEXT21_I]], <16 x i32> +// CHECK-NEXT: [[VEXT23_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT24_I:%.*]] = shufflevector <16 x i8> [[VECINIT22_I]], <16 x i8> [[VEXT23_I]], <16 x i32> +// CHECK-NEXT: [[VEXT25_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT26_I:%.*]] = shufflevector <16 x i8> [[VECINIT24_I]], <16 x i8> [[VEXT25_I]], <16 x i32> +// CHECK-NEXT: [[VEXT27_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT28_I:%.*]] = shufflevector <16 x i8> [[VECINIT26_I]], <16 x i8> [[VEXT27_I]], <16 x i32> +// CHECK-NEXT: [[VEXT29_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT30_I:%.*]] = shufflevector <16 x i8> [[VECINIT28_I]], <16 x i8> [[VEXT29_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[VECINIT30_I]] +// +mfloat8x16_t test_vdupq_n_mf8(mfloat8_t a) { + return vdupq_n_mf8(a); +} + +// CHECK-LABEL: define dso_local <8 x i8> 
@test_vmov_n_mf8( +// CHECK-SAME: <1 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT_I:%.*]] = shufflevector <8 x i8> [[VEXT_I]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VEXT1_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> [[VEXT1_I]], <8 x i32> +// CHECK-NEXT: [[VEXT3_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT4_I:%.*]] = shufflevector <8 x i8> [[VECINIT2_I]], <8 x i8> [[VEXT3_I]], <8 x i32> +// CHECK-NEXT: [[VEXT5_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT6_I:%.*]] = shufflevector <8 x i8> [[VECINIT4_I]], <8 x i8> [[VEXT5_I]], <8 x i32> +// CHECK-NEXT: [[VEXT7_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT6_I]], <8 x i8> [[VEXT7_I]], <8 x i32> +// CHECK-NEXT: [[VEXT9_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT10_I:%.*]] = shufflevector <8 x i8> [[VECINIT8_I]], <8 x i8> [[VEXT9_I]], <8 x i32> +// CHECK-NEXT: [[VEXT11_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT12_I:%.*]] = shufflevector <8 x i8> [[VECINIT10_I]], <8 x i8> [[VEXT11_I]], <8 x i32> +// CHECK-NEXT: [[VEXT13_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT14_I:%.*]] = shufflevector <8 x i8> [[VECINIT12_I]], <8 x i8> [[VEXT13_I]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[VECINIT14_I]] +// +mfloat8x8_t test_vmov_n_mf8(mfloat8_t a) { + return vmov_n_mf8(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vmovq_n_mf8( +// CHECK-SAME: <1 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT_I:%.*]] = shufflevector <1 
x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT_I:%.*]] = shufflevector <16 x i8> [[VEXT_I]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VEXT1_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> [[VEXT1_I]], <16 x i32> +// CHECK-NEXT: [[VEXT3_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT4_I:%.*]] = shufflevector <16 x i8> [[VECINIT2_I]], <16 x i8> [[VEXT3_I]], <16 x i32> +// CHECK-NEXT: [[VEXT5_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT6_I:%.*]] = shufflevector <16 x i8> [[VECINIT4_I]], <16 x i8> [[VEXT5_I]], <16 x i32> +// CHECK-NEXT: [[VEXT7_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT8_I:%.*]] = shufflevector <16 x i8> [[VECINIT6_I]], <16 x i8> [[VEXT7_I]], <16 x i32> +// CHECK-NEXT: [[VEXT9_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT10_I:%.*]] = shufflevector <16 x i8> [[VECINIT8_I]], <16 x i8> [[VEXT9_I]], <16 x i32> +// CHECK-NEXT: [[VEXT11_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT12_I:%.*]] = shufflevector <16 x i8> [[VECINIT10_I]], <16 x i8> [[VEXT11_I]], <16 x i32> +// CHECK-NEXT: [[VEXT13_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT14_I:%.*]] = shufflevector <16 x i8> [[VECINIT12_I]], <16 x i8> [[VEXT13_I]], <16 x i32> +// CHECK-NEXT: [[VEXT15_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT16_I:%.*]] = shufflevector <16 x i8> [[VECINIT14_I]], <16 x i8> [[VEXT15_I]], <16 x i32> +// CHECK-NEXT: [[VEXT17_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT18_I:%.*]] = shufflevector <16 x i8> [[VECINIT16_I]], <16 x i8> [[VEXT17_I]], <16 x i32> +// 
CHECK-NEXT: [[VEXT19_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT20_I:%.*]] = shufflevector <16 x i8> [[VECINIT18_I]], <16 x i8> [[VEXT19_I]], <16 x i32> +// CHECK-NEXT: [[VEXT21_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT22_I:%.*]] = shufflevector <16 x i8> [[VECINIT20_I]], <16 x i8> [[VEXT21_I]], <16 x i32> +// CHECK-NEXT: [[VEXT23_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT24_I:%.*]] = shufflevector <16 x i8> [[VECINIT22_I]], <16 x i8> [[VEXT23_I]], <16 x i32> +// CHECK-NEXT: [[VEXT25_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT26_I:%.*]] = shufflevector <16 x i8> [[VECINIT24_I]], <16 x i8> [[VEXT25_I]], <16 x i32> +// CHECK-NEXT: [[VEXT27_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT28_I:%.*]] = shufflevector <16 x i8> [[VECINIT26_I]], <16 x i8> [[VEXT27_I]], <16 x i32> +// CHECK-NEXT: [[VEXT29_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT30_I:%.*]] = shufflevector <16 x i8> [[VECINIT28_I]], <16 x i8> [[VEXT29_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[VECINIT30_I]] +// +mfloat8x16_t test_vmovq_n_mf8(mfloat8_t a) { + return vmovq_n_mf8(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vcombine_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vcombine_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vcombine_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vget_high_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x 
i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vget_high_mf8(mfloat8x16_t a) { + return vget_high_mf8(a); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vget_low_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vget_low_mf8(mfloat8x16_t a) { + return vget_low_mf8(a); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbl1_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL11_I]] +// +mfloat8x8_t test_vtbl1_mf8(mfloat8x8_t a, uint8x8_t b) { + return vtbl1_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbl2_mf8( +// CHECK-SAME: [2 x <8 x i8>] alignstack(8) [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <8 x i8>] [[DOTFCA_0_INSERT]], <8 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[DOTFCA_1_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[DOTFCA_1_INSERT]], 1 +// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_1_INSERT_FCA_0_EXTRACT]], <8 x i8> [[DOTFCA_1_INSERT_FCA_1_EXTRACT]], <16 x 
i32> +// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL13_I]] +// +mfloat8x8_t test_vtbl2_mf8(mfloat8x8x2_t a, uint8x8_t b) { + return vtbl2_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbl3_mf8( +// CHECK-SAME: [3 x <8 x i8>] alignstack(8) [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[A_COERCE]], 2 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <8 x i8>] [[DOTFCA_0_INSERT]], <8 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <8 x i8>] [[DOTFCA_1_INSERT]], <8 x i8> [[A_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[DOTFCA_2_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[DOTFCA_2_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[DOTFCA_2_INSERT]], 2 +// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_2_INSERT_FCA_0_EXTRACT]], <8 x i8> [[DOTFCA_2_INSERT_FCA_1_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_2_INSERT_FCA_2_EXTRACT]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL26_I]] +// +mfloat8x8_t test_vtbl3_mf8(mfloat8x8x3_t a, uint8x8_t b) { + return vtbl3_mf8(a, b); +} + +// CHECK-LABEL: 
define dso_local <8 x i8> @test_vtbl4_mf8( +// CHECK-SAME: [4 x <8 x i8>] alignstack(8) [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[A_COERCE]], 2 +// CHECK-NEXT: [[A_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[A_COERCE]], 3 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <8 x i8>] [[DOTFCA_0_INSERT]], <8 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <8 x i8>] [[DOTFCA_1_INSERT]], <8 x i8> [[A_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <8 x i8>] [[DOTFCA_2_INSERT]], <8 x i8> [[A_COERCE_FCA_3_EXTRACT]], 3 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 3 +// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_3_INSERT_FCA_0_EXTRACT]], <8 x i8> [[DOTFCA_3_INSERT_FCA_1_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_3_INSERT_FCA_2_EXTRACT]], <8 x i8> [[DOTFCA_3_INSERT_FCA_3_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL28_I]] +// +mfloat8x8_t 
test_vtbl4_mf8(mfloat8x8x4_t a, uint8x8_t b) { + return vtbl4_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbx1_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]]) +// CHECK-NEXT: [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], splat (i8 8) +// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]] +// CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], splat (i8 -1) +// CHECK-NEXT: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]] +// CHECK-NEXT: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <8 x i8> [[VTBX_I]] +// +mfloat8x8_t test_vtbx1_mf8(mfloat8x8_t a, mfloat8x8_t b, uint8x8_t c) { + return vtbx1_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbx2_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <8 x i8>] [[DOTFCA_0_INSERT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[DOTFCA_1_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[DOTFCA_1_INSERT]], 1 +// CHECK-NEXT: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_1_INSERT_FCA_0_EXTRACT]], <8 x i8> 
[[DOTFCA_1_INSERT_FCA_1_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX13_I]] +// +mfloat8x8_t test_vtbx2_mf8(mfloat8x8_t a, mfloat8x8x2_t b, uint8x8_t c) { + return vtbx2_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbx3_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <8 x i8>] [[DOTFCA_0_INSERT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <8 x i8>] [[DOTFCA_1_INSERT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[DOTFCA_2_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[DOTFCA_2_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[DOTFCA_2_INSERT]], 2 +// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_2_INSERT_FCA_0_EXTRACT]], <8 x i8> [[DOTFCA_2_INSERT_FCA_1_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_2_INSERT_FCA_2_EXTRACT]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]]) +// CHECK-NEXT: [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], 
splat (i8 24) +// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]] +// CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], splat (i8 -1) +// CHECK-NEXT: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL26_I]] +// CHECK-NEXT: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <8 x i8> [[VTBX_I]] +// +mfloat8x8_t test_vtbx3_mf8(mfloat8x8_t a, mfloat8x8x3_t b, uint8x8_t c) { + return vtbx3_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbx4_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <8 x i8>] [[DOTFCA_0_INSERT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <8 x i8>] [[DOTFCA_1_INSERT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <8 x i8>] [[DOTFCA_2_INSERT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], 3 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[DOTFCA_3_INSERT]], 3 +// CHECK-NEXT: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_3_INSERT_FCA_0_EXTRACT]], <8 x i8> [[DOTFCA_3_INSERT_FCA_1_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_3_INSERT_FCA_2_EXTRACT]], <8 x i8> [[DOTFCA_3_INSERT_FCA_3_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX28_I]] +// +mfloat8x8_t test_vtbx4_mf8(mfloat8x8_t a, mfloat8x8x4_t b, uint8x8_t c) { + return vtbx4_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vext_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[VEXT]] +// +mfloat8x8_t test_vext_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vext_mf8(a, b, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vextq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[VEXT]] +// +mfloat8x16_t test_vextq_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vextq_mf8(a, b, 7); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev64_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vrev64_mf8(mfloat8x8_t a) { + return vrev64_mf8(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev64q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> +// 
CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vrev64q_mf8(mfloat8x16_t a) { + return vrev64q_mf8(a); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev32_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vrev32_mf8(mfloat8x8_t a) { + return vrev32_mf8(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev32q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vrev32q_mf8(mfloat8x16_t a) { + return vrev32q_mf8(a); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev16_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vrev16_mf8(mfloat8x8_t a) { + return vrev16_mf8(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev16q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vrev16q_mf8(mfloat8x16_t a) { + return vrev16q_mf8(a); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_mf8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> [[V2:%.*]], <8 x i8> [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1) +// CHECK-NEXT: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], [[V3]] +// CHECK-NEXT: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], 
[[VBSL1_I]] +// CHECK-NEXT: ret <8 x i8> [[VBSL2_I]] +// +mfloat8x8_t test_vbsl_mf8(uint8x8_t v1, mfloat8x8_t v2, mfloat8x8_t v3) { + return vbsl_mf8(v1, v2, v3); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_mf8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> [[V2:%.*]], <16 x i8> [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1) +// CHECK-NEXT: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], [[V3]] +// CHECK-NEXT: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: ret <16 x i8> [[VBSL2_I]] +// +mfloat8x16_t test_vbslq_mf8(uint8x16_t v1, mfloat8x16_t v2, mfloat8x16_t v3) { + return vbslq_mf8(v1, v2, v3); +} + +// CHECK-LABEL: define dso_local %struct.mfloat8x8x2_t @test_vtrn_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T:%.*]] poison, <8 x i8> [[VTRN_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VTRN1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// 
+mfloat8x8x2_t test_vtrn_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vtrn_mf8(a, b); +} + +// CHECK-LABEL: define dso_local %struct.mfloat8x16x2_t @test_vtrnq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T:%.*]] poison, <16 x i8> [[VTRN_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VTRN1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT]] +// +mfloat8x16x2_t test_vtrnq_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vtrnq_mf8(a, b); +} + +// CHECK-LABEL: define dso_local %struct.mfloat8x8x2_t @test_vzip_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T:%.*]] poison, <8 x i8> [[VZIP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> 
[[VZIP1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// +mfloat8x8x2_t test_vzip_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vzip_mf8(a, b); +} + +// CHECK-LABEL: define dso_local %struct.mfloat8x16x2_t @test_vzipq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T:%.*]] poison, <16 x i8> [[VZIP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VZIP1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT]] +// +mfloat8x16x2_t test_vzipq_mf8(mfloat8x16_t a, mfloat8x16_t b) { + 
return vzipq_mf8(a, b); +} + +// CHECK-LABEL: define dso_local %struct.mfloat8x8x2_t @test_vuzp_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T:%.*]] poison, <8 x i8> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// +mfloat8x8x2_t test_vuzp_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vuzp_mf8(a, b); +} + +// CHECK-LABEL: define dso_local %struct.mfloat8x16x2_t @test_vuzpq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T:%.*]] poison, <16 x i8> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue 
[[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT]] +// +mfloat8x16x2_t test_vuzpq_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vuzpq_mf8(a, b); +} + +// CHECK-LABEL: define dso_local void @test_vcopy_lane_mf8( +// CHECK-SAME: <8 x i8> [[ARG_I8X8:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[ARG_I8X8]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i8> [[TMP0]] to i8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[ARG_I8X8]], i8 [[TMP1]], i32 0 +// CHECK-NEXT: ret void +// +void test_vcopy_lane_mf8(mfloat8x8_t arg_i8x8) { + vcopy_lane_mf8(arg_i8x8, 0, arg_i8x8, 0); +} + +// CHECK-LABEL: define dso_local void @test_vcopyq_lane_mf8( +// CHECK-SAME: <8 x i8> [[ARG_I8X8:%.*]], <16 x i8> [[ARG_I8X16:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[ARG_I8X8]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i8> [[TMP0]] to i8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[ARG_I8X16]], i8 [[TMP1]], i32 0 +// CHECK-NEXT: ret void +// +void test_vcopyq_lane_mf8(mfloat8x8_t arg_i8x8, mfloat8x16_t arg_i8x16) { + vcopyq_lane_mf8(arg_i8x16, 0, arg_i8x8, 0); +} + +// CHECK-LABEL: define dso_local void @test_vcopy_laneq_mf8( +// CHECK-SAME: <8 x i8> [[ARG_I8X8:%.*]], <16 x i8> 
[[ARG_I8X16:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[ARG_I8X16]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i8> [[TMP0]] to i8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[ARG_I8X8]], i8 [[TMP1]], i32 0 +// CHECK-NEXT: ret void +// +void test_vcopy_laneq_mf8(mfloat8x8_t arg_i8x8, mfloat8x16_t arg_i8x16) { + vcopy_laneq_mf8(arg_i8x8, 0, arg_i8x16, 0); +} + +// CHECK-LABEL: define dso_local void @test_vcopyq_laneq_mf8( +// CHECK-SAME: <16 x i8> [[ARG_I8X16:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[ARG_I8X16]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i8> [[TMP0]] to i8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[ARG_I8X16]], i8 [[TMP1]], i32 0 +// CHECK-NEXT: ret void +// +void test_vcopyq_laneq_mf8(mfloat8x16_t arg_i8x16) { + vcopyq_laneq_mf8(arg_i8x16, 0, arg_i8x16, 0); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vdup_lane_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// +mfloat8x8_t test_vdup_lane_mf8(mfloat8x8_t a) { + return vdup_lane_mf8(a, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vdupq_lane_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// +mfloat8x16_t test_vdupq_lane_mf8(mfloat8x8_t a) { + return vdupq_lane_mf8(a, 7); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vdup_laneq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[LANE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// +mfloat8x8_t test_vdup_laneq_mf8(mfloat8x16_t a) { + return vdup_laneq_mf8(a, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vdupq_laneq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// +mfloat8x16_t test_vdupq_laneq_mf8(mfloat8x16_t a) { + return vdupq_laneq_mf8(a, 7); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vtrn1_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vtrn1_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vtrn1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vtrn1q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vzip1_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vzip1_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vzip1_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vzip1q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], 
<16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vzip1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vzip1q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vuzp1_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vuzp1_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vuzp1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vuzp1q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vtrn2_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vtrn2_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vtrn2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vtrn2q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vzip2_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// 
CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vzip2_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vzip2_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vzip2q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vzip2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vzip2q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vuzp2_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vuzp2_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vuzp2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vuzp2q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbl1_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]] +// +mfloat8x8_t test_vqtbl1_mf8(mfloat8x16_t a, uint8x8_t b) { + return vqtbl1_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbl1q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> 
[[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VTBL1_I]] +// +mfloat8x16_t test_vqtbl1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vqtbl1q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbl2_mf8( +// CHECK-SAME: [2 x <16 x i8>] alignstack(16) [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 1 +// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[DOTFCA_1_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_INSERT_FCA_1_EXTRACT]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]] +// +mfloat8x8_t test_vqtbl2_mf8(mfloat8x16x2_t a, uint8x8_t b) { + return vqtbl2_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbl2q_mf8( +// CHECK-SAME: [2 x <16 x i8>] alignstack(16) [[A_COERCE:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: 
[[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 1 +// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[DOTFCA_1_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_INSERT_FCA_1_EXTRACT]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VTBL2_I]] +// +mfloat8x16_t test_vqtbl2q_mf8(mfloat8x16x2_t a, mfloat8x16_t b) { + return vqtbl2q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbl3q_mf8( +// CHECK-SAME: [3 x <16 x i8>] alignstack(16) [[A_COERCE:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[A_COERCE]], 2 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[A_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 2 +// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[DOTFCA_2_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_2_EXTRACT]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VTBL3_I]] +// 
+mfloat8x16_t test_vqtbl3q_mf8(mfloat8x16x3_t a, mfloat8x16_t b) { + return vqtbl3q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbl3_mf8( +// CHECK-SAME: [3 x <16 x i8>] alignstack(16) [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[A_COERCE]], 2 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[A_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 2 +// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[DOTFCA_2_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_2_EXTRACT]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]] +// +mfloat8x8_t test_vqtbl3_mf8(mfloat8x16x3_t a, uint8x8_t b) { + return vqtbl3_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbl4_mf8( +// CHECK-SAME: [4 x <16 x i8>] alignstack(16) [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = 
extractvalue [4 x <16 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 2 +// CHECK-NEXT: [[A_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 3 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[A_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_2_INSERT]], <16 x i8> [[A_COERCE_FCA_3_EXTRACT]], 3 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 3 +// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[DOTFCA_3_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_3_EXTRACT]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]] +// +mfloat8x8_t test_vqtbl4_mf8(mfloat8x16x4_t a, uint8x8_t b) { + return vqtbl4_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbl4q_mf8( +// CHECK-SAME: [4 x <16 x i8>] alignstack(16) [[A_COERCE:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: 
[[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 2 +// CHECK-NEXT: [[A_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 3 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[A_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_2_INSERT]], <16 x i8> [[A_COERCE_FCA_3_EXTRACT]], 3 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 3 +// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[DOTFCA_3_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_3_EXTRACT]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VTBL4_I]] +// +mfloat8x16_t test_vqtbl4q_mf8(mfloat8x16x4_t a, mfloat8x16_t b) { + return vqtbl4q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbx1_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]] +// +mfloat8x8_t test_vqtbx1_mf8(mfloat8x8_t a, mfloat8x16_t b, uint8x8_t c) { + return vqtbx1_mf8(a, b, 
c); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbx1q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) +// CHECK-NEXT: ret <16 x i8> [[VTBX1_I]] +// +mfloat8x16_t test_vqtbx1q_mf8(mfloat8x16_t a, mfloat8x16_t b, uint8x16_t c) { + return vqtbx1q_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbx2_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 1 +// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[DOTFCA_1_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_INSERT_FCA_1_EXTRACT]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]] +// +mfloat8x8_t test_vqtbx2_mf8(mfloat8x8_t a, mfloat8x16x2_t b, uint8x8_t c) { + return vqtbx2_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbx2q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x 
<16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 1 +// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[DOTFCA_1_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_INSERT_FCA_1_EXTRACT]], <16 x i8> [[C]]) +// CHECK-NEXT: ret <16 x i8> [[VTBX2_I]] +// +mfloat8x16_t test_vqtbx2q_mf8(mfloat8x16_t a, mfloat8x16x2_t b, mfloat8x16_t c) { + return vqtbx2q_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbx3_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = 
extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 2 +// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[DOTFCA_2_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_2_EXTRACT]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]] +// +mfloat8x8_t test_vqtbx3_mf8(mfloat8x8_t a, mfloat8x16x3_t b, uint8x8_t c) { + return vqtbx3_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbx3q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 2 +// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[DOTFCA_2_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_2_EXTRACT]], <16 x 
i8> [[C]]) +// CHECK-NEXT: ret <16 x i8> [[VTBX3_I]] +// +mfloat8x16_t test_vqtbx3q_mf8(mfloat8x16_t a, mfloat8x16x3_t b, mfloat8x16_t c) { + return vqtbx3q_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbx4_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_2_INSERT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], 3 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 3 +// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[DOTFCA_3_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_3_EXTRACT]], <8 x i8> 
[[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]] +// +mfloat8x8_t test_vqtbx4_mf8(mfloat8x8_t a, mfloat8x16x4_t b, uint8x8_t c) { + return vqtbx4_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbx4q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_2_INSERT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], 3 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 3 +// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[DOTFCA_3_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_3_EXTRACT]], <16 x i8> [[C]]) +// 
CHECK-NEXT: ret <16 x i8> [[VTBX4_I]] +// +mfloat8x16_t test_vqtbx4q_mf8(mfloat8x16_t a, mfloat8x16x4_t b, mfloat8x16_t c) { + return vqtbx4q_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_mf8( +// CHECK-SAME: <8 x i8> [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +mfloat8x16_t test_vluti2_lane_mf8(mfloat8x8_t vn, uint8x8_t vm) { + return vluti2_lane_mf8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_lane_mf8( +// CHECK-SAME: <16 x i8> [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +mfloat8x16_t test_vluti2q_lane_mf8(mfloat8x16_t vn, uint8x8_t vm) { + return vluti2q_lane_mf8(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_mf8( +// CHECK-SAME: <8 x i8> [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +mfloat8x16_t test_vluti2_laneq_mf8(mfloat8x8_t vn, uint8x16_t vm) { + return vluti2_laneq_mf8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_mf8( +// CHECK-SAME: <16 x i8> [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +mfloat8x16_t 
test_vluti2q_laneq_mf8(mfloat8x16_t vn, uint8x16_t vm) { + return vluti2q_laneq_mf8(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_lane_mf8( +// CHECK-SAME: <16 x i8> [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLUTI4Q_LANE:%.*]] = call <16 x i8> @llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANE]] +// +mfloat8x16_t test_vluti4q_lane_mf8(mfloat8x16_t vn, uint8x8_t vm) { + return vluti4q_lane_mf8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_mf8( +// CHECK-SAME: <16 x i8> [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]] +// +mfloat8x16_t test_vluti4q_laneq_mf8(mfloat8x16_t vn, uint8x16_t vm) { + return vluti4q_laneq_mf8(vn, vm, 1); +} diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c index 0b355db4b2073..2f3994df03784 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c @@ -49,8 +49,8 @@ svfloat32_t test_svdot_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, // CHECK-LABEL: define dso_local @test_svdot_n_f32_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector 
[[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fdot.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -59,8 +59,8 @@ svfloat32_t test_svdot_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, // CHECK-CXX-LABEL: define dso_local @_Z20test_svdot_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fdot.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -91,8 +91,8 @@ svfloat16_t test_svdot_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, // CHECK-LABEL: define dso_local @test_svdot_n_f16_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fdot.nxv8f16( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -101,8 +101,8 @@ svfloat16_t test_svdot_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, // CHECK-CXX-LABEL: define dso_local @_Z20test_svdot_n_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: 
[[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fdot.nxv8f16( [[ZDA]], [[ZN]], [[DOTSPLAT]]) diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c index 0daeeec9e7dd7..425e6a57ffe3c 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c @@ -49,8 +49,8 @@ svfloat16_t test_svmlalb_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm // CHECK-LABEL: define dso_local @test_svmlalb_n_f16_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalb.nxv8f16( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -59,8 +59,8 @@ svfloat16_t test_svmlalb_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm // CHECK-CXX-LABEL: define dso_local @_Z22test_svmlalb_n_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef 
[[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalb.nxv8f16( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -91,8 +91,8 @@ svfloat16_t test_svmlalt_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm // CHECK-LABEL: define dso_local @test_svmlalt_n_f16_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalt.nxv8f16( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -101,8 +101,8 @@ svfloat16_t test_svmlalt_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm // CHECK-CXX-LABEL: define dso_local @_Z22test_svmlalt_n_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // 
CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalt.nxv8f16( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -169,8 +169,8 @@ svfloat32_t test_svmlallbb_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-LABEL: define dso_local @test_svmlallbb_n_f32_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -179,8 +179,8 @@ svfloat32_t test_svmlallbb_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-CXX-LABEL: define dso_local @_Z24test_svmlallbb_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -211,8 +211,8 @@ svfloat32_t test_svmlallbt_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-LABEL: define dso_local @test_svmlallbt_n_f32_mf8( 
// CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -221,8 +221,8 @@ svfloat32_t test_svmlallbt_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-CXX-LABEL: define dso_local @_Z24test_svmlallbt_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -253,8 +253,8 @@ svfloat32_t test_svmlalltb_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-LABEL: define dso_local @test_svmlalltb_n_f32_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: 
[[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -263,8 +263,8 @@ svfloat32_t test_svmlalltb_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-CXX-LABEL: define dso_local @_Z24test_svmlalltb_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -295,8 +295,8 @@ svfloat32_t test_svmlalltt_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-LABEL: define dso_local @test_svmlalltt_n_f32_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -305,8 +305,8 @@ svfloat32_t test_svmlalltt_f32_mf8(svfloat32_t zda, svmfloat8_t zn, 
svmfloat8_t // CHECK-CXX-LABEL: define dso_local @_Z24test_svmlalltt_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) diff --git a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp new file mode 100644 index 0000000000000..f7a44a5999887 --- /dev/null +++ b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp @@ -0,0 +1,622 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple aarch64-none-elf -fcxx-exceptions -fexceptions -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-A64 +// RUN: %clang_cc1 -triple arm64_32-apple-ios7.0 -fcxx-exceptions -fexceptions -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-A64_32 + +struct Sll { + long long x, y; +}; +// CHECK-A64-LABEL: define dso_local void @_Z3Tll3Sll( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SLL:%.*]], align 8 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[S]], align 8 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SLL]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: store i64 1, ptr [[X]], align 8 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z3Tll3Sll( +// 
CHECK-A64_32-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SLL:%.*]], align 8 +// CHECK-A64_32-NEXT: store [2 x i64] [[S_COERCE]], ptr [[S]], align 8 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SLL]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: store i64 1, ptr [[X]], align 8 +// CHECK-A64_32-NEXT: ret void +// +void Tll(Sll s) { s.x = 1; } + +struct Sp { + int *x; +}; +// CHECK-A64-LABEL: define dso_local void @_Z2Tp2Sp( +// CHECK-A64-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SP:%.*]], align 8 +// CHECK-A64-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[S_COERCE]] to ptr +// CHECK-A64-NEXT: store ptr [[COERCE_VAL_IP]], ptr [[COERCE_DIVE]], align 8 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z2Tp2Sp( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SP:%.*]], align 4 +// CHECK-A64_32-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i32 +// CHECK-A64_32-NEXT: store i32 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 4 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Tp(Sp s) { *s.x = 1; } + 
+struct Spp { + int *x, *y; +}; +// CHECK-A64-LABEL: define dso_local void @_Z3Tpp3Spp( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SPP:%.*]], align 8 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[S]], align 8 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z3Tpp3Spp( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SPP:%.*]], align 4 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[S]], align 4 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Tpp(Spp s) { *s.x = 1; } + +struct Sppp { + int *x, *y, *z; +}; +// CHECK-A64-LABEL: define dso_local void @_Z4Tppp4Sppp( +// CHECK-A64-SAME: ptr noundef [[S:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-A64-NEXT: store ptr [[S]], ptr [[S_INDIRECT_ADDR]], align 8 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPPP:%.*]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z4Tppp4Sppp( +// CHECK-A64_32-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SPPP:%.*]], align 4 +// CHECK-A64_32-NEXT: [[TMP_COERCE:%.*]] = alloca [2 
x i64], align 8 +// CHECK-A64_32-NEXT: store [2 x i64] [[S_COERCE]], ptr [[TMP_COERCE]], align 8 +// CHECK-A64_32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[S]], ptr align 8 [[TMP_COERCE]], i32 12, i1 false) +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPPP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Tppp(Sppp s) { *s.x = 1; } + +struct Spi { + int *x, y; +}; +// CHECK-A64-LABEL: define dso_local void @_Z3Tpi3Spi( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SPI:%.*]], align 8 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[S]], align 8 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPI]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z3Tpi3Spi( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SPI:%.*]], align 4 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[S]], align 4 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPI]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Tpi(Spi s) { *s.x = 1; } + +struct Srp { + int &x, *y; +}; +// CHECK-A64-LABEL: define dso_local void @_Z3Trp3Srp( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SRP:%.*]], align 8 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[S]], align 8 +// 
CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SRP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z3Trp3Srp( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SRP:%.*]], align 4 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[S]], align 4 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SRP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Trp(Srp s) { s.x = 1; } + +struct __attribute__((__packed__)) Spp_packed { + int *x, *y; +}; +// CHECK-A64-LABEL: define dso_local void @_Z10Tpp_packed10Spp_packed( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SPP_PACKED:%.*]], align 1 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[S]], align 1 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP_PACKED]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 1 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z10Tpp_packed10Spp_packed( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SPP_PACKED:%.*]], align 1 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[S]], align 1 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP_PACKED]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 1 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// 
CHECK-A64_32-NEXT: ret void +// +void Tpp_packed(Spp_packed s) { *s.x = 1; } + +struct __attribute__((__packed__)) Spp_superpacked { + Spp_packed x; +}; +// CHECK-A64-LABEL: define dso_local void @_Z15Tpp_superpacked15Spp_superpacked( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SPP_SUPERPACKED:%.*]], align 1 +// CHECK-A64-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP_SUPERPACKED]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[COERCE_DIVE]], align 1 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP_SUPERPACKED]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[X1:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP_PACKED:%.*]], ptr [[X]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X1]], align 1 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z15Tpp_superpacked15Spp_superpacked( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SPP_SUPERPACKED:%.*]], align 1 +// CHECK-A64_32-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP_SUPERPACKED]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[COERCE_DIVE]], align 1 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP_SUPERPACKED]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[X1:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP_PACKED:%.*]], ptr [[X]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X1]], align 1 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Tpp_superpacked(Spp_superpacked s) { *s.x.x = 1; } + +union Upp { + int *x; + long long *y; +}; +// CHECK-A64-LABEL: define dso_local void @_Z11Tupp_packed3Upp( +// 
CHECK-A64-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[UNION_UPP:%.*]], align 8 +// CHECK-A64-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[UNION_UPP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[S_COERCE]] to ptr +// CHECK-A64-NEXT: store ptr [[COERCE_VAL_IP]], ptr [[COERCE_DIVE]], align 8 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z11Tupp_packed3Upp( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[UNION_UPP:%.*]], align 4 +// CHECK-A64_32-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[UNION_UPP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i32 +// CHECK-A64_32-NEXT: store i32 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 4 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Tupp_packed(Upp s) { *s.x = 1; } + +union USpp { + Spp s; + long long y; +}; +// CHECK-A64-LABEL: define dso_local void @_Z12TUSpp_packed4USpp( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[UNION_USPP:%.*]], align 8 +// CHECK-A64-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[UNION_USPP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[COERCE_DIVE]], align 8 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP:%.*]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: 
define void @_Z12TUSpp_packed4USpp( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[UNION_USPP:%.*]], align 8 +// CHECK-A64_32-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[UNION_USPP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[COERCE_DIVE]], align 8 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP:%.*]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void TUSpp_packed(USpp s) { *s.s.x = 1; } + +struct Spf { + int *x; + int z[]; +}; +// CHECK-A64-LABEL: define dso_local void @_Z3Tpf3Spf( +// CHECK-A64-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SPF:%.*]], align 8 +// CHECK-A64-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPF]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[S_COERCE]] to ptr +// CHECK-A64-NEXT: store ptr [[COERCE_VAL_IP]], ptr [[COERCE_DIVE]], align 8 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPF]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z3Tpf3Spf( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SPF:%.*]], align 4 +// CHECK-A64_32-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPF]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i32 +// CHECK-A64_32-NEXT: store i32 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 4 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr 
inbounds nuw [[STRUCT_SPF]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Tpf(Spf s) { *s.x = 1; } + +struct Sppf { + int *x, *y; + int z[]; +}; +// CHECK-A64-LABEL: define dso_local void @_Z4Tppf4Sppf( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SPPF:%.*]], align 8 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[S]], align 8 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPPF]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z4Tppf4Sppf( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SPPF:%.*]], align 4 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[S]], align 4 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPPF]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Tppf(Sppf s) { *s.x = 1; } + +struct SSpSp { + struct Sp a, b; +}; +// CHECK-A64-LABEL: define dso_local void @_Z5TSpSp5SSpSp( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SSPSP:%.*]], align 8 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[S]], align 8 +// CHECK-A64-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_SSPSP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SP:%.*]], ptr [[A]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr 
[[X]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z5TSpSp5SSpSp( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SSPSP:%.*]], align 4 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[S]], align 4 +// CHECK-A64_32-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_SSPSP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SP:%.*]], ptr [[A]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void TSpSp(SSpSp s) { *s.a.x = 1; } + +struct SSpp { + Spp a; +}; +// CHECK-A64-LABEL: define dso_local void @_Z4TSpp4SSpp( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SSPP:%.*]], align 8 +// CHECK-A64-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SSPP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[COERCE_DIVE]], align 8 +// CHECK-A64-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_SSPP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP:%.*]], ptr [[A]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z4TSpp4SSpp( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SSPP:%.*]], align 4 +// CHECK-A64_32-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SSPP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[COERCE_DIVE]], align 4 
+// CHECK-A64_32-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_SSPP]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP:%.*]], ptr [[A]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void TSpp(SSpp s) { *s.a.x = 1; } + +struct SSp : public Sp { + int* b; +}; +// CHECK-A64-LABEL: define dso_local void @_Z3TSp3SSp( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SSP:%.*]], align 8 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[S]], align 8 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SP:%.*]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z3TSp3SSp( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SSP:%.*]], align 4 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[S]], align 4 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SP:%.*]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void TSp(SSp s) { *s.x = 1; } + +struct Si { + int x; +}; +struct SSpi : public Si { + int* y; +}; +// CHECK-A64-LABEL: define dso_local void @_Z4TSpi4SSpi( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SSPI:%.*]], align 8 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[S]], align 8 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw 
[[STRUCT_SI:%.*]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: store i32 1, ptr [[X]], align 8 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z4TSpi4SSpi( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SSPI:%.*]], align 4 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[S]], align 4 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SI:%.*]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: store i32 1, ptr [[X]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void TSpi(SSpi s) { s.x = 1; } + +struct Spa { + int* xs[1]; +}; +// CHECK-A64-LABEL: define dso_local void @_Z3Tpa3Spa( +// CHECK-A64-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SPA:%.*]], align 8 +// CHECK-A64-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPA]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: store i64 [[S_COERCE]], ptr [[COERCE_DIVE]], align 8 +// CHECK-A64-NEXT: [[XS:%.*]] = getelementptr inbounds nuw [[STRUCT_SPA]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1 x ptr], ptr [[XS]], i64 0, i64 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z3Tpa3Spa( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SPA:%.*]], align 4 +// CHECK-A64_32-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPA]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i32 +// CHECK-A64_32-NEXT: store i32 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 4 +// CHECK-A64_32-NEXT: [[XS:%.*]] = getelementptr inbounds nuw [[STRUCT_SPA]], ptr [[S]], 
i32 0, i32 0 +// CHECK-A64_32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1 x ptr], ptr [[XS]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Tpa(Spa s) { *s.xs[0] = 1; } + +struct Spa2 { + int* xs[2]; +}; +// CHECK-A64-LABEL: define dso_local void @_Z4Tpa24Spa2( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SPA2:%.*]], align 8 +// CHECK-A64-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPA2]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[COERCE_DIVE]], align 8 +// CHECK-A64-NEXT: [[XS:%.*]] = getelementptr inbounds nuw [[STRUCT_SPA2]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x ptr], ptr [[XS]], i64 0, i64 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z4Tpa24Spa2( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SPA2:%.*]], align 4 +// CHECK-A64_32-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPA2]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[COERCE_DIVE]], align 4 +// CHECK-A64_32-NEXT: [[XS:%.*]] = getelementptr inbounds nuw [[STRUCT_SPA2]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x ptr], ptr [[XS]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Tpa2(Spa2 s) { *s.xs[0] = 1; } + +struct Spa3 { + int* xs[3]; +}; +// CHECK-A64-LABEL: define 
dso_local void @_Z4Tpa34Spa3( +// CHECK-A64-SAME: ptr noundef [[S:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-A64-NEXT: store ptr [[S]], ptr [[S_INDIRECT_ADDR]], align 8 +// CHECK-A64-NEXT: [[XS:%.*]] = getelementptr inbounds nuw [[STRUCT_SPA3:%.*]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x ptr], ptr [[XS]], i64 0, i64 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z4Tpa34Spa3( +// CHECK-A64_32-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SPA3:%.*]], align 4 +// CHECK-A64_32-NEXT: [[TMP_COERCE:%.*]] = alloca [2 x i64], align 8 +// CHECK-A64_32-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPA3]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: store [2 x i64] [[S_COERCE]], ptr [[TMP_COERCE]], align 8 +// CHECK-A64_32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[COERCE_DIVE]], ptr align 8 [[TMP_COERCE]], i32 12, i1 false) +// CHECK-A64_32-NEXT: [[XS:%.*]] = getelementptr inbounds nuw [[STRUCT_SPA3]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x ptr], ptr [[XS]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Tpa3(Spa3 s) { *s.xs[0] = 1; } + + +struct __attribute__((aligned(16))) Spp_align16 { + int *x, *y; +}; +// CHECK-A64-LABEL: define dso_local void @_Z11Tpp_align1611Spp_align16( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SPP_ALIGN16:%.*]], align 16 +// CHECK-A64-NEXT: store [2 x i64] 
[[S_COERCE]], ptr [[S]], align 16 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP_ALIGN16]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 16 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z11Tpp_align1611Spp_align16( +// CHECK-A64_32-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SPP_ALIGN16:%.*]], align 16 +// CHECK-A64_32-NEXT: store [2 x i64] [[S_COERCE]], ptr [[S]], align 16 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP_ALIGN16]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 16 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void Tpp_align16(Spp_align16 s) { *s.x = 1; } + +struct SSpp_align16 { + Spp_align16 a; +}; +// CHECK-A64-LABEL: define dso_local void @_Z12TSpp_align1612SSpp_align16( +// CHECK-A64-SAME: i128 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SSPP_ALIGN16:%.*]], align 16 +// CHECK-A64-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SSPP_ALIGN16]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: store i128 [[S_COERCE]], ptr [[COERCE_DIVE]], align 16 +// CHECK-A64-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_SSPP_ALIGN16]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP_ALIGN16:%.*]], ptr [[A]], i32 0, i32 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 16 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z12TSpp_align1612SSpp_align16( +// CHECK-A64_32-SAME: i128 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca 
[[STRUCT_SSPP_ALIGN16:%.*]], align 16 +// CHECK-A64_32-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SSPP_ALIGN16]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: store i128 [[S_COERCE]], ptr [[COERCE_DIVE]], align 16 +// CHECK-A64_32-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_SSPP_ALIGN16]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPP_ALIGN16:%.*]], ptr [[A]], i32 0, i32 0 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 16 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void TSpp_align16(SSpp_align16 s) { *s.a.x = 1; } + + +struct Sempty { +}; +// CHECK-A64-LABEL: define dso_local void @_Z6Tempty6Sempty( +// CHECK-A64-SAME: i8 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SEMPTY:%.*]], align 1 +// CHECK-A64-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SEMPTY]], ptr [[S]], i32 0, i32 0 +// CHECK-A64-NEXT: store i8 [[S_COERCE]], ptr [[COERCE_DIVE]], align 1 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z6Tempty6Sempty( +// CHECK-A64_32-SAME: i8 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SEMPTY:%.*]], align 1 +// CHECK-A64_32-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SEMPTY]], ptr [[S]], i32 0, i32 0 +// CHECK-A64_32-NEXT: store i8 [[S_COERCE]], ptr [[COERCE_DIVE]], align 1 +// CHECK-A64_32-NEXT: ret void +// +void Tempty(Sempty s) { } + + +struct SpSempty { + Sempty y; + int *x; +}; +// CHECK-A64-LABEL: define dso_local void @_Z8TpSempty8SpSempty( +// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[S:%.*]] = alloca [[STRUCT_SPSEMPTY:%.*]], align 8 +// CHECK-A64-NEXT: store [2 x i64] [[S_COERCE]], ptr [[S]], align 8 +// CHECK-A64-NEXT: [[X:%.*]] = 
getelementptr inbounds nuw [[STRUCT_SPSEMPTY]], ptr [[S]], i32 0, i32 1 +// CHECK-A64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 +// CHECK-A64-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64-NEXT: ret void +// +// CHECK-A64_32-LABEL: define void @_Z8TpSempty8SpSempty( +// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64_32-NEXT: [[ENTRY:.*:]] +// CHECK-A64_32-NEXT: [[S:%.*]] = alloca [[STRUCT_SPSEMPTY:%.*]], align 4 +// CHECK-A64_32-NEXT: store i64 [[S_COERCE]], ptr [[S]], align 4 +// CHECK-A64_32-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPSEMPTY]], ptr [[S]], i32 0, i32 1 +// CHECK-A64_32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 4 +// CHECK-A64_32-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-A64_32-NEXT: ret void +// +void TpSempty(SpSempty s) { *s.x = 1; } diff --git a/clang/test/CodeGen/AArch64/targetattr.c b/clang/test/CodeGen/AArch64/targetattr.c index d76e8e2248195..f3f074b61cc84 100644 --- a/clang/test/CodeGen/AArch64/targetattr.c +++ b/clang/test/CodeGen/AArch64/targetattr.c @@ -199,26 +199,44 @@ __attribute__((target("cpu=apple-m4"))) // void applem4() {} +__attribute__((target("+sme"))) +// CHECK-LABEL: define {{[^@]+}}@plussmestreaming +// CHECK-SAME: () #[[ATTR19:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +void plussmestreaming(void) __arm_streaming {} + +__attribute__((target("+sme"))) +// CHECK-LABEL: define {{[^@]+}}@plussmelocallystreaming +// CHECK-SAME: () #[[ATTR20:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +__arm_locally_streaming void plussmelocallystreaming(void) {} + //. 
// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" } -// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" } -// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" } -// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } -// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a710" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fpac,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve-bitperm,+sve2,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" } +// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone vscale_range(1,16) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" } +// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone vscale_range(1,16) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" } +// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone vscale_range(1,16) "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } +// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone vscale_range(1,16) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a710" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fpac,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve-bitperm,+sve2,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" } // CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="cortex-a710" } // CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+ete,+fp-armv8,+neon,+trbe,+v8a" } // CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="generic" } // CHECK: attributes #[[ATTR8]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+perfmon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+v8.1a,+v8.2a,+v8a" "tune-cpu"="cortex-a710" } -// CHECK: attributes #[[ATTR9]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" "tune-cpu"="cortex-a710" } -// CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v1" 
"target-features"="+aes,+bf16,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" } +// CHECK: attributes #[[ATTR9]] = { noinline nounwind optnone vscale_range(1,16) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" "tune-cpu"="cortex-a710" } +// CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone vscale_range(1,16) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" } // CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,-sve" } -// CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" } +// CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone vscale_range(1,16) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" } // CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16" } -// CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" 
"target-features"="+aes,+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } -// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "branch-target-enforcement" "guarded-control-stack" "no-trapping-math"="true" "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } +// CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone vscale_range(1,16) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } +// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone vscale_range(1,16) "branch-target-enforcement" "guarded-control-stack" "no-trapping-math"="true" "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } // CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // CHECK: attributes #[[ATTR17]] = { noinline nounwind 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-v9.3a" } // CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m4" "target-features"="+aes,+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fpac,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+sme,+sme-f64f64,+sme-i16i64,+sme2,+spe-eef,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8.7a,+v8a,+wfxt" } +// CHECK: attributes #[[ATTR19]] = { noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme" } +// CHECK: attributes #[[ATTR20]] = { noinline nounwind optnone vscale_range(1,16) "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme" } //. 
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} diff --git a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c index e2f02dc64f766..3ab065d34bcfb 100644 --- a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c +++ b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c @@ -15,24 +15,12 @@ typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_ // CHECK-64-LABEL: @call_bool32_ff( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[SAVED_VALUE4:%.*]] = alloca , align 1 -// CHECK-64-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-64-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[OP1_COERCE:%.*]], [[OP2_COERCE:%.*]], i64 2) -// CHECK-64-NEXT: store [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA6:![0-9]+]] -// CHECK-64-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10:![0-9]+]] -// CHECK-64-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 +// CHECK-64-NEXT: [[TMP2:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[TMP0:%.*]], [[TMP1:%.*]], i64 2) // CHECK-64-NEXT: ret [[TMP2]] // // CHECK-128-LABEL: @call_bool32_ff( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE4:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[OP1_COERCE:%.*]], [[OP2_COERCE:%.*]], i64 4) -// CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA6:![0-9]+]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10:![0-9]+]] -// CHECK-128-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 +// 
CHECK-128-NEXT: [[TMP2:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[TMP0:%.*]], [[TMP1:%.*]], i64 4) // CHECK-128-NEXT: ret [[TMP2]] // fixed_bool32_t call_bool32_ff(fixed_bool32_t op1, fixed_bool32_t op2) { @@ -41,24 +29,12 @@ fixed_bool32_t call_bool32_ff(fixed_bool32_t op1, fixed_bool32_t op2) { // CHECK-64-LABEL: @call_bool64_ff( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[SAVED_VALUE4:%.*]] = alloca , align 1 -// CHECK-64-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-64-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[OP1_COERCE:%.*]], [[OP2_COERCE:%.*]], i64 1) -// CHECK-64-NEXT: store [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA11:![0-9]+]] -// CHECK-64-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10]] -// CHECK-64-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 +// CHECK-64-NEXT: [[TMP2:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[TMP0:%.*]], [[TMP1:%.*]], i64 1) // CHECK-64-NEXT: ret [[TMP2]] // // CHECK-128-LABEL: @call_bool64_ff( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE4:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[OP1_COERCE:%.*]], [[OP2_COERCE:%.*]], i64 2) -// CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA11:![0-9]+]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10]] -// CHECK-128-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 +// CHECK-128-NEXT: [[TMP2:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[TMP0:%.*]], [[TMP1:%.*]], i64 2) // CHECK-128-NEXT: ret [[TMP2]] // fixed_bool64_t call_bool64_ff(fixed_bool64_t op1, fixed_bool64_t op2) { @@ -71,25 +47,13 @@ fixed_bool64_t 
call_bool64_ff(fixed_bool64_t op1, fixed_bool64_t op2) { // CHECK-64-LABEL: @call_bool32_fs( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[SAVED_VALUE2:%.*]] = alloca , align 1 -// CHECK-64-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-64-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[OP1_COERCE:%.*]], [[OP2:%.*]], i64 2) -// CHECK-64-NEXT: store [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA6]] -// CHECK-64-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]] -// CHECK-64-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: ret [[TMP2]] +// CHECK-64-NEXT: [[TMP1:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[TMP0:%.*]], [[OP2:%.*]], i64 2) +// CHECK-64-NEXT: ret [[TMP1]] // // CHECK-128-LABEL: @call_bool32_fs( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE2:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[OP1_COERCE:%.*]], [[OP2:%.*]], i64 4) -// CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA6]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]] -// CHECK-128-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: ret [[TMP2]] +// CHECK-128-NEXT: [[TMP1:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[TMP0:%.*]], [[OP2:%.*]], i64 4) +// CHECK-128-NEXT: ret [[TMP1]] // fixed_bool32_t call_bool32_fs(fixed_bool32_t op1, vbool32_t op2) { return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 32); @@ -97,25 +61,13 @@ fixed_bool32_t call_bool32_fs(fixed_bool32_t op1, vbool32_t op2) { // CHECK-64-LABEL: @call_bool64_fs( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[SAVED_VALUE2:%.*]] = alloca , align 1 -// 
CHECK-64-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-64-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[OP1_COERCE:%.*]], [[OP2:%.*]], i64 1) -// CHECK-64-NEXT: store [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA11]] -// CHECK-64-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]] -// CHECK-64-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: ret [[TMP2]] +// CHECK-64-NEXT: [[TMP1:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[TMP0:%.*]], [[OP2:%.*]], i64 1) +// CHECK-64-NEXT: ret [[TMP1]] // // CHECK-128-LABEL: @call_bool64_fs( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE2:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[OP1_COERCE:%.*]], [[OP2:%.*]], i64 2) -// CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA11]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]] -// CHECK-128-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: ret [[TMP2]] +// CHECK-128-NEXT: [[TMP1:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[TMP0:%.*]], [[OP2:%.*]], i64 2) +// CHECK-128-NEXT: ret [[TMP1]] // fixed_bool64_t call_bool64_fs(fixed_bool64_t op1, vbool64_t op2) { return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 64); @@ -127,25 +79,13 @@ fixed_bool64_t call_bool64_fs(fixed_bool64_t op1, vbool64_t op2) { // CHECK-64-LABEL: @call_bool32_ss( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-64-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 // CHECK-64-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[OP1:%.*]], [[OP2:%.*]], i64 2) -// CHECK-64-NEXT: 
store [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] -// CHECK-64-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] -// CHECK-64-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: ret [[TMP2]] +// CHECK-64-NEXT: ret [[TMP0]] // // CHECK-128-LABEL: @call_bool32_ss( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 // CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv2i1.i64( [[OP1:%.*]], [[OP2:%.*]], i64 4) -// CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] -// CHECK-128-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: ret [[TMP2]] +// CHECK-128-NEXT: ret [[TMP0]] // fixed_bool32_t call_bool32_ss(vbool32_t op1, vbool32_t op2) { return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 32); @@ -153,25 +93,13 @@ fixed_bool32_t call_bool32_ss(vbool32_t op1, vbool32_t op2) { // CHECK-64-LABEL: @call_bool64_ss( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-64-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 // CHECK-64-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[OP1:%.*]], [[OP2:%.*]], i64 1) -// CHECK-64-NEXT: store [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11]] -// CHECK-64-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] -// CHECK-64-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: ret [[TMP2]] +// CHECK-64-NEXT: ret [[TMP0]] // // CHECK-128-LABEL: @call_bool64_ss( // CHECK-128-NEXT: 
entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 // CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.riscv.vmand.nxv1i1.i64( [[OP1:%.*]], [[OP2:%.*]], i64 2) -// CHECK-128-NEXT: store [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] -// CHECK-128-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: ret [[TMP2]] +// CHECK-128-NEXT: ret [[TMP0]] // fixed_bool64_t call_bool64_ss(vbool64_t op1, vbool64_t op2) { return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 64); diff --git a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c index f0fa7e8d07b4d..8407c065adb21 100644 --- a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c +++ b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c @@ -29,46 +29,22 @@ fixed_bool8_t from_vbool8_t(vbool8_t type) { // CHECK-64-LABEL: @from_vbool16_t( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-64-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-64-NEXT: store [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6:![0-9]+]] -// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10:![0-9]+]] -// CHECK-64-NEXT: store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: ret [[TMP1]] +// CHECK-64-NEXT: ret [[TYPE:%.*]] // // CHECK-128-LABEL: @from_vbool16_t( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: store [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, 
!tbaa [[TBAA6:![0-9]+]] -// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10:![0-9]+]] -// CHECK-128-NEXT: store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: ret [[TMP1]] +// CHECK-128-NEXT: ret [[TYPE:%.*]] // fixed_bool16_t from_vbool16_t(vbool16_t type) { return type; } // CHECK-64-LABEL: @from_vbool32_t( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-64-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-64-NEXT: store [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11:![0-9]+]] -// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] -// CHECK-64-NEXT: store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: ret [[TMP1]] +// CHECK-64-NEXT: ret [[TYPE:%.*]] // // CHECK-128-LABEL: @from_vbool32_t( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: store [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11:![0-9]+]] -// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] -// CHECK-128-NEXT: store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: ret [[TMP1]] +// CHECK-128-NEXT: ret [[TYPE:%.*]] // fixed_bool32_t from_vbool32_t(vbool32_t type) { return type; @@ -76,11 +52,11 @@ fixed_bool32_t from_vbool32_t(vbool32_t type) { // CHECK-64-LABEL: @to_vbool32_t( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: ret [[TYPE_COERCE:%.*]] +// CHECK-64-NEXT: ret [[TMP0:%.*]] // // CHECK-128-LABEL: @to_vbool32_t( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: ret [[TYPE_COERCE:%.*]] +// CHECK-128-NEXT: ret 
[[TMP0:%.*]] // vbool32_t to_vbool32_t(fixed_bool32_t type) { return type; @@ -88,23 +64,11 @@ vbool32_t to_vbool32_t(fixed_bool32_t type) { // CHECK-64-LABEL: @from_vbool64_t( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-64-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-64-NEXT: store [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA13:![0-9]+]] -// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] -// CHECK-64-NEXT: store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-64-NEXT: ret [[TMP1]] +// CHECK-64-NEXT: ret [[TYPE:%.*]] // // CHECK-128-LABEL: @from_vbool64_t( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: store [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA13:![0-9]+]] -// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]] -// CHECK-128-NEXT: store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-128-NEXT: ret [[TMP1]] +// CHECK-128-NEXT: ret [[TYPE:%.*]] // fixed_bool64_t from_vbool64_t(vbool64_t type) { return type; @@ -112,11 +76,11 @@ fixed_bool64_t from_vbool64_t(vbool64_t type) { // CHECK-64-LABEL: @to_vbool64_t( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: ret [[TYPE_COERCE:%.*]] +// CHECK-64-NEXT: ret [[TMP0:%.*]] // // CHECK-128-LABEL: @to_vbool64_t( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: ret [[TYPE_COERCE:%.*]] +// CHECK-128-NEXT: ret [[TMP0:%.*]] // vbool64_t to_vbool64_t(fixed_bool64_t type) { return type; diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c index 058ec49b77881..45a099dc9c678 100644 --- 
a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c @@ -55,12 +55,12 @@ DEFINE_STRUCT(bool64) // CHECK-128-LABEL: @read_bool32( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1 // CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[TBAA6:![0-9]+]] -// CHECK-128-NEXT: store <1 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] -// CHECK-128-NEXT: ret [[TMP1]] +// CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i8.v1i8( poison, <1 x i8> [[TMP0]], i64 0) +// CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-128-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv8i1( [[TMP1]], i64 0) +// CHECK-128-NEXT: ret [[TMP2]] // vbool32_t read_bool32(struct struct_bool32 *s) { return s->y[0]; @@ -68,11 +68,11 @@ vbool32_t read_bool32(struct struct_bool32 *s) { // CHECK-128-LABEL: @write_bool32( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: store [[X:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]] -// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i1.nxv2i1( zeroinitializer, [[X:%.*]], i64 0) +// CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[TMP0]] to +// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i8> @llvm.vector.extract.v1i8.nxv1i8( [[TMP1]], i64 0) // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1 -// CHECK-128-NEXT: store <1 x i8> [[TMP0]], ptr [[Y]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: store <1 x i8> [[CAST_FIXED]], ptr [[Y]], align 1, !tbaa [[TBAA6]] // 
CHECK-128-NEXT: ret void // void write_bool32(struct struct_bool32 *s, vbool32_t x) { @@ -81,12 +81,12 @@ void write_bool32(struct struct_bool32 *s, vbool32_t x) { // CHECK-128-LABEL: @read_bool64( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1 // CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[TBAA6]] -// CHECK-128-NEXT: store <1 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] -// CHECK-128-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] -// CHECK-128-NEXT: ret [[TMP1]] +// CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i8.v1i8( poison, <1 x i8> [[TMP0]], i64 0) +// CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-128-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv1i1.nxv8i1( [[TMP1]], i64 0) +// CHECK-128-NEXT: ret [[TMP2]] // vbool64_t read_bool64(struct struct_bool64 *s) { return s->y[0]; @@ -94,11 +94,11 @@ vbool64_t read_bool64(struct struct_bool64 *s) { // CHECK-128-LABEL: @write_bool64( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-128-NEXT: store [[X:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11:![0-9]+]] -// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i1.nxv1i1( zeroinitializer, [[X:%.*]], i64 0) +// CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[TMP0]] to +// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i8> @llvm.vector.extract.v1i8.nxv1i8( [[TMP1]], i64 0) // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1 -// CHECK-128-NEXT: store <1 x i8> [[TMP0]], ptr [[Y]], align 1, !tbaa [[TBAA6]] +// CHECK-128-NEXT: store <1 x i8> [[CAST_FIXED]], ptr [[Y]], align 1, !tbaa [[TBAA6]] // CHECK-128-NEXT: ret void // void 
write_bool64(struct struct_bool64 *s, vbool64_t x) { diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c index 7992951346d54..0a50e41dda7e1 100644 --- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c @@ -97,13 +97,7 @@ vbool4_t to_vbool4_t(fixed_bool4_t type) { // CHECK-LABEL: @from_vbool32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 -// CHECK-NEXT: store [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1 -// CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-NEXT: ret [[TMP1]] +// CHECK-NEXT: ret [[TYPE:%.*]] // fixed_bool32_t from_vbool32_t(vbool32_t type) { return type; @@ -111,7 +105,7 @@ fixed_bool32_t from_vbool32_t(vbool32_t type) { // CHECK-LABEL: @to_vbool32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: ret [[TYPE_COERCE:%.*]] +// CHECK-NEXT: ret [[TMP0:%.*]] // vbool32_t to_vbool32_t(fixed_bool32_t type) { return type; @@ -119,7 +113,7 @@ vbool32_t to_vbool32_t(fixed_bool32_t type) { // CHECK-LABEL: @to_vint32m1_t__from_gnu_int32m1_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]] +// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA6:![0-9]+]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i32.v8i32( poison, <8 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -130,7 +124,7 @@ vint32m1_t to_vint32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) { // CHECK-LABEL: @from_vint32m1_t__to_gnu_int32m1_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i32> 
@llvm.vector.extract.v8i32.nxv2i32( [[TYPE:%.*]], i64 0) -// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]] +// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA6]] // CHECK-NEXT: ret void // gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) { @@ -139,7 +133,7 @@ gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) { // CHECK-LABEL: @to_fixed_int32m1_t__from_gnu_int32m1_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]] +// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA6]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i32.v8i32( poison, <8 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -150,7 +144,7 @@ fixed_int32m1_t to_fixed_int32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) { // CHECK-LABEL: @from_fixed_int32m1_t__to_gnu_int32m1_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[TYPE:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TYPE_COERCE:%.*]], i64 0) -// CHECK-NEXT: store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]] +// CHECK-NEXT: store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA6]] // CHECK-NEXT: ret void // gnu_int32m1_t from_fixed_int32m1_t__to_gnu_int32m1_t(fixed_int32m1_t type) { diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c index d81855aea2e5e..f01e6caeefd43 100644 --- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c @@ -113,25 +113,25 @@ fixed_int16m4_t test_bool4(vbool4_t m, vint16m4_t vec) { // CHECK-NEXT: [[M_ADDR:%.*]] = alloca , align 1 // CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca , align 4 // CHECK-NEXT: [[MASK:%.*]] = alloca , align 1 -// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 
x i8>, align 1 // CHECK-NEXT: store [[M:%.*]], ptr [[M_ADDR]], align 1 // CHECK-NEXT: store [[VEC:%.*]], ptr [[VEC_ADDR]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[M_ADDR]], align 1 // CHECK-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr @global_bool32, align 1 -// CHECK-NEXT: store <1 x i8> [[TMP1]], ptr [[SAVED_VALUE]], align 1 -// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[SAVED_VALUE]], align 1 -// CHECK-NEXT: [[TMP3:%.*]] = call @llvm.riscv.vmand.nxv2i1.i64( [[TMP0]], [[TMP2]], i64 8) -// CHECK-NEXT: store [[TMP3]], ptr [[MASK]], align 1 -// CHECK-NEXT: [[TMP4:%.*]] = load , ptr [[MASK]], align 1 -// CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[VEC_ADDR]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @global_vec, align 8 -// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call @llvm.vector.insert.nxv2i32.v8i32( poison, <8 x i32> [[TMP6]], i64 0) -// CHECK-NEXT: [[TMP7:%.*]] = call @llvm.riscv.vadd.mask.nxv2i32.nxv2i32.i64( poison, [[TMP5]], [[CAST_SCALABLE]], [[TMP4]], i64 8, i64 3) -// CHECK-NEXT: [[CAST_FIXED:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TMP7]], i64 0) +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call @llvm.vector.insert.nxv1i8.v1i8( poison, <1 x i8> [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP4:%.*]] = call @llvm.riscv.vmand.nxv2i1.i64( [[TMP0]], [[TMP3]], i64 8) +// CHECK-NEXT: store [[TMP4]], ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[MASK]], align 1 +// CHECK-NEXT: [[TMP6:%.*]] = load , ptr [[VEC_ADDR]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @global_vec, align 8 +// CHECK-NEXT: [[CAST_SCALABLE1:%.*]] = call @llvm.vector.insert.nxv2i32.v8i32( poison, <8 x i32> [[TMP7]], i64 0) +// CHECK-NEXT: [[TMP8:%.*]] = call @llvm.riscv.vadd.mask.nxv2i32.nxv2i32.i64( poison, [[TMP6]], [[CAST_SCALABLE1]], [[TMP5]], i64 8, i64 3) +// CHECK-NEXT: 
[[CAST_FIXED:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TMP8]], i64 0) // CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[RETVAL]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr [[RETVAL]], align 8 -// CHECK-NEXT: [[CAST_SCALABLE1:%.*]] = call @llvm.vector.insert.nxv2i32.v8i32( poison, <8 x i32> [[TMP8]], i64 0) -// CHECK-NEXT: ret [[CAST_SCALABLE1]] +// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[CAST_SCALABLE2:%.*]] = call @llvm.vector.insert.nxv2i32.v8i32( poison, <8 x i32> [[TMP9]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE2]] // fixed_int32m1_t test_bool32(vbool32_t m, vint32m1_t vec) { vbool32_t mask = __riscv_vmand(m, global_bool32, __riscv_v_fixed_vlen/32); @@ -224,15 +224,16 @@ fixed_bool4_t address_of_array_idx_bool4() { // CHECK-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 // CHECK-NEXT: [[ARR:%.*]] = alloca [3 x <1 x i8>], align 1 // CHECK-NEXT: [[PARR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 1 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i8>], ptr [[ARR]], i64 0, i64 0 // CHECK-NEXT: store ptr [[ARRAYIDX]], ptr [[PARR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PARR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[TMP0]], align 1 // CHECK-NEXT: store <1 x i8> [[TMP1]], ptr [[RETVAL]], align 1 -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RETVAL_COERCE]], ptr align 1 [[RETVAL]], i64 1, i1 false) -// CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[RETVAL_COERCE]], align 1 -// CHECK-NEXT: ret [[TMP2]] +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1 +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = call @llvm.vector.insert.nxv1i8.v1i8( poison, <1 x i8> [[TMP2]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[TMP3]], i64 0) +// CHECK-NEXT: ret [[TMP4]] // fixed_bool32_t 
address_of_array_idx_bool32() { fixed_bool32_t arr[3]; diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c index 4bd6311e05b03..92ba27fb65425 100644 --- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c @@ -89,10 +89,10 @@ void write_global_bool4(vbool4_t v) { global_bool4 = v; } #if __riscv_v_fixed_vlen >= 256 // CHECK-256-LABEL: @write_global_bool32( // CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca , align 1 -// CHECK-256-NEXT: store [[V:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]] -// CHECK-256-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] -// CHECK-256-NEXT: store <1 x i8> [[TMP0]], ptr @global_bool32, align 1, !tbaa [[TBAA6]] +// CHECK-256-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i1.nxv2i1( zeroinitializer, [[V:%.*]], i64 0) +// CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[TMP0]] to +// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i8> @llvm.vector.extract.v1i8.nxv1i8( [[TMP1]], i64 0) +// CHECK-256-NEXT: store <1 x i8> [[CAST_FIXED]], ptr @global_bool32, align 1, !tbaa [[TBAA6]] // CHECK-256-NEXT: ret void // void write_global_bool32(vbool32_t v) { global_bool32 = v; } @@ -151,11 +151,11 @@ vbool4_t read_global_bool4() { return global_bool4; } #if __riscv_v_fixed_vlen >= 256 // CHECK-256-LABEL: @read_global_bool32( // CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1 // CHECK-256-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr @global_bool32, align 1, !tbaa [[TBAA6]] -// CHECK-256-NEXT: store <1 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] -// CHECK-256-NEXT: [[TMP1:%.*]] = load , ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]] -// CHECK-256-NEXT: ret [[TMP1]] +// CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i8.v1i8( poison, <1 x i8> [[TMP0]], i64 0) 
+// CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to +// CHECK-256-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv8i1( [[TMP1]], i64 0) +// CHECK-256-NEXT: ret [[TMP2]] // vbool32_t read_global_bool32() { return global_bool32; } #endif diff --git a/clang/test/CodeGen/RISCV/riscv-zihintpause.c b/clang/test/CodeGen/RISCV/riscv-zihintpause.c new file mode 100644 index 0000000000000..2e1369f3f6e0c --- /dev/null +++ b/clang/test/CodeGen/RISCV/riscv-zihintpause.c @@ -0,0 +1,14 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple riscv32 -target-feature +zihintpause -emit-llvm %s -o - \ +// RUN: | FileCheck %s +// RUN: %clang_cc1 -triple riscv64 -target-feature +zihintpause -emit-llvm %s -o - \ +// RUN: | FileCheck %s + +// CHECK-LABEL: @test_builtin_pause( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.riscv.pause() +// CHECK-NEXT: ret void +// +void test_builtin_pause() { + __builtin_riscv_pause(); +} diff --git a/clang/test/CodeGen/X86/avx512-error.c b/clang/test/CodeGen/X86/avx512-error.c index 422cc7a8679dc..645126916572c 100644 --- a/clang/test/CodeGen/X86/avx512-error.c +++ b/clang/test/CodeGen/X86/avx512-error.c @@ -29,4 +29,5 @@ __m512d zmm_error(__m512d a) { // noevex-warning@*:* {{invalid feature combination: +avx512bw +avx10.1-256; will be promoted to avx10.1-512}} // noevex-warning@*:* {{invalid feature combination: +avx512bw +avx10.1-256; will be promoted to avx10.1-512}} // noevex-warning@*:* {{invalid feature combination: +avx512bw +avx10.1-256; will be promoted to avx10.1-512}} +// noevex-warning@*:* {{invalid feature combination: +avx512bw +avx10.1-256; will be promoted to avx10.1-512}} #endif diff --git a/clang/test/CodeGen/arm-mfp8.c b/clang/test/CodeGen/arm-mfp8.c index d9e7b5d4707d8..9385b537f18b3 100644 --- a/clang/test/CodeGen/arm-mfp8.c +++ b/clang/test/CodeGen/arm-mfp8.c @@ -38,34 +38,22 @@ mfloat8x8_t test_ret_mfloat8x8_t(mfloat8x8_t v) { // CHECK-C-LABEL: 
define dso_local <1 x i8> @func1n( // CHECK-C-SAME: <1 x i8> [[MFP8:%.*]]) #[[ATTR0]] { // CHECK-C-NEXT: [[ENTRY:.*:]] -// CHECK-C-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 -// CHECK-C-NEXT: [[MFP8_ADDR:%.*]] = alloca i8, align 1 -// CHECK-C-NEXT: [[F1N:%.*]] = alloca [10 x i8], align 1 -// CHECK-C-NEXT: store <1 x i8> [[MFP8]], ptr [[MFP8_ADDR]], align 1 -// CHECK-C-NEXT: [[TMP0:%.*]] = load i8, ptr [[MFP8_ADDR]], align 1 -// CHECK-C-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i8], ptr [[F1N]], i64 0, i64 2 -// CHECK-C-NEXT: store i8 [[TMP0]], ptr [[ARRAYIDX]], align 1 -// CHECK-C-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x i8], ptr [[F1N]], i64 0, i64 2 -// CHECK-C-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -// CHECK-C-NEXT: store i8 [[TMP1]], ptr [[RETVAL]], align 1 -// CHECK-C-NEXT: [[TMP2:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1 -// CHECK-C-NEXT: ret <1 x i8> [[TMP2]] +// CHECK-C-NEXT: [[F1N:%.*]] = alloca [10 x <1 x i8>], align 1 +// CHECK-C-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 +// CHECK-C-NEXT: store <1 x i8> [[MFP8]], ptr [[ARRAYIDX]], align 1 +// CHECK-C-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1 +// CHECK-C-NEXT: ret <1 x i8> [[TMP0]] // // CHECK-CXX-LABEL: define dso_local <1 x i8> @_Z6func1nu6__mfp8( // CHECK-CXX-SAME: <1 x i8> [[MFP8:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 -// CHECK-CXX-NEXT: [[MFP8_ADDR:%.*]] = alloca i8, align 1 -// CHECK-CXX-NEXT: [[F1N:%.*]] = alloca [10 x i8], align 1 -// CHECK-CXX-NEXT: store <1 x i8> [[MFP8]], ptr [[MFP8_ADDR]], align 1 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = load i8, ptr [[MFP8_ADDR]], align 1 -// CHECK-CXX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i8], ptr [[F1N]], i64 0, i64 2 -// CHECK-CXX-NEXT: store i8 
[[TMP0]], ptr [[ARRAYIDX]], align 1 -// CHECK-CXX-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x i8], ptr [[F1N]], i64 0, i64 2 -// CHECK-CXX-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -// CHECK-CXX-NEXT: store i8 [[TMP1]], ptr [[RETVAL]], align 1 -// CHECK-CXX-NEXT: [[TMP2:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1 -// CHECK-CXX-NEXT: ret <1 x i8> [[TMP2]] +// CHECK-CXX-NEXT: [[F1N:%.*]] = alloca [10 x <1 x i8>], align 1 +// CHECK-CXX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 +// CHECK-CXX-NEXT: store <1 x i8> [[MFP8]], ptr [[ARRAYIDX]], align 1 +// CHECK-CXX-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 +// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1 +// CHECK-CXX-NEXT: ret <1 x i8> [[TMP0]] // __mfp8 func1n(__mfp8 mfp8) { __mfp8 f1n[10]; @@ -98,18 +86,14 @@ mfloat8_t test_extract_element(mfloat8x16_t x, int i) { // CHECK-C-LABEL: define dso_local <16 x i8> @test_insert_element( // CHECK-C-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]], <1 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-C-NEXT: [[ENTRY:.*:]] -// CHECK-C-NEXT: [[V_ADDR:%.*]] = alloca i8, align 1 -// CHECK-C-NEXT: store <1 x i8> [[V]], ptr [[V_ADDR]], align 1 -// CHECK-C-NEXT: [[TMP0:%.*]] = load i8, ptr [[V_ADDR]], align 1 +// CHECK-C-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[V]] to i8 // CHECK-C-NEXT: [[VECINS:%.*]] = insertelement <16 x i8> [[X]], i8 [[TMP0]], i32 [[I]] // CHECK-C-NEXT: ret <16 x i8> [[VECINS]] // // CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z19test_insert_element14__Mfloat8x16_tiu6__mfp8( // CHECK-CXX-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]], <1 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[V_ADDR:%.*]] = alloca i8, align 1 -// CHECK-CXX-NEXT: store <1 x i8> [[V]], ptr [[V_ADDR]], align 1 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = load i8, ptr [[V_ADDR]], align 1 +// CHECK-CXX-NEXT: [[TMP0:%.*]] = 
bitcast <1 x i8> [[V]] to i8 // CHECK-CXX-NEXT: [[VECINS:%.*]] = insertelement <16 x i8> [[X]], i8 [[TMP0]], i32 [[I]] // CHECK-CXX-NEXT: ret <16 x i8> [[VECINS]] // diff --git a/clang/test/CodeGen/attr-counted-by-for-pointers.c b/clang/test/CodeGen/attr-counted-by-for-pointers.c new file mode 100644 index 0000000000000..24076541168d4 --- /dev/null +++ b/clang/test/CodeGen/attr-counted-by-for-pointers.c @@ -0,0 +1,473 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -DWITH_ATTRS -Wall -fsanitize=array-bounds,object-size,local-bounds -fstrict-flex-arrays=3 -fexperimental-late-parse-attributes -emit-llvm -o - %s | FileCheck --check-prefix=SANITIZE-WITH-ATTR %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -DWITH_ATTRS -Wall -fstrict-flex-arrays=3 -fexperimental-late-parse-attributes -emit-llvm -o - %s | FileCheck --check-prefix=NO-SANITIZE-WITH-ATTR %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wall -fsanitize=array-bounds,object-size,local-bounds -fstrict-flex-arrays=3 -fexperimental-late-parse-attributes -emit-llvm -o - %s | FileCheck --check-prefix=SANITIZE-WITHOUT-ATTR %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wall -fstrict-flex-arrays=3 -fexperimental-late-parse-attributes -emit-llvm -o - %s | FileCheck --check-prefix=NO-SANITIZE-WITHOUT-ATTR %s + +#if !__has_attribute(counted_by) +#error "has attribute broken" +#endif + +#ifdef WITH_ATTRS +#define __counted_by(member) __attribute__((__counted_by__(member))) +#define __sized_by(member) __attribute__((__sized_by__(member))) +#else +#define __counted_by(member) +#define __sized_by(member) +#endif + +#define __bdos(P) __builtin_dynamic_object_size(P, 0) + +typedef long unsigned int size_t; + +struct foo { size_t field; }; +struct annotated_ptr { + unsigned long flags; + struct foo **buf __counted_by(ptr_count); + int ptr_count; +}; + +// 
SANITIZE-WITH-ATTR-LABEL: define dso_local void @test1( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT10:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3:[0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: cont10: +// SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA4:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA13:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: ret void +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test1( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr 
[[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: ret void +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test1( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] +// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: ret void +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test1( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa 
[[TBAA11:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void +// +void test1(struct annotated_ptr *p, int index, struct foo *value) { + p->buf[index] = value; +} + +// SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT10:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: cont10: +// SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA13]] +// SANITIZE-WITH-ATTR-NEXT: ret void +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// NO-SANITIZE-WITH-ATTR-NEXT: ret void +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test2( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] +// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// SANITIZE-WITHOUT-ATTR-NEXT: ret void +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test2( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] +// 
NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void +// +void test2(struct annotated_ptr *p, int index, struct foo *value) { + ((struct foo **)((char *)p->buf))[index] = value; +} + +// SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], label [[CONT10:%.*]], !prof [[PROF15:![0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: cont10: +// SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA13]] +// SANITIZE-WITH-ATTR-NEXT: ret void +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly 
captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// NO-SANITIZE-WITH-ATTR-NEXT: ret void +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] +// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// SANITIZE-WITHOUT-ATTR-NEXT: ret void +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// 
NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void +// +void test3(struct annotated_ptr *p, int index, struct foo *value) { + *&*&*&p->buf[index] = value; +} + +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869184, 17179869177) i64 @test4( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAY_SIZE:%.*]] = shl nsw i64 [[COUNT]], 3 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[ARRAY_SIZE]], i64 0 +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]] +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869184, 17179869177) i64 @test4( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY_SIZE:%.*]] = shl nsw i64 [[COUNT]], 3 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[ARRAY_SIZE]], i64 0 +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 
[[TMP1]] +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +size_t test4(struct annotated_ptr *p) { + return __bdos(p->buf); +} + +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869184, 17179869177) i64 @test5( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAY_SIZE:%.*]] = shl nsw i64 [[COUNT]], 3 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[ARRAY_SIZE]], i64 0 +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]] +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869184, 17179869177) i64 @test5( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// 
NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY_SIZE:%.*]] = shl nsw i64 [[COUNT]], 3 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[ARRAY_SIZE]], i64 0 +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]] +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test5( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test5( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +size_t test5(struct annotated_ptr *p, int index) { + return __bdos((struct foo **)((char *)p->buf)); +} + +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 17179869177) i64 @test6( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], label [[CONT8:%.*]], !prof [[PROF15]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], 
i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: cont8: +// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0) +// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3 +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -34359738360, 34359738361) i64 @test6( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = shl nsw i64 [[TMP0]], 3 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[TMP0]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[INDEX]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = and i1 [[TMP2]], [[TMP1]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[RESULT]], i64 0 +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP4]] +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test6( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local 
i64 @test6( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +size_t test6(struct annotated_ptr *p, int index) { + return __bdos(&p->buf[index]); +} + +// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test7( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT10:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: cont10: +// SANITIZE-WITH-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test7( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test7( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// 
SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test7( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +size_t test7(struct annotated_ptr *p, int index) { + return __bdos(((struct foo **)(char *)p->buf)[index]); +} + +struct annotated_sized_ptr { + unsigned long flags; + void *buf __sized_by(ptr_count); + int ptr_count; +}; + +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test8( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext nneg i32 [[NARROW]] to i64 +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]] +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test8( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext nneg i32 [[NARROW]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]] +// +// 
SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test8( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test8( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +size_t test8(struct annotated_sized_ptr *p, int index) { + return __bdos(p->buf); +} + +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test9( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], label [[CONT8:%.*]], !prof [[PROF15]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: cont8: +// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]] +// 
SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.smax.i64(i64 [[RESULT]], i64 0) +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]] +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -4294967295, 4294967296) i64 @test9( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[RESULT]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[INDEX]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[TMP0]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[RESULT]], i64 0 +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test9( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test9( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +size_t test9(struct annotated_sized_ptr *p, int index) { + return __bdos(&p->buf[index]); +} + +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 2147483648) i64 @test10( +// SANITIZE-WITH-ATTR-SAME: 
ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], label [[CONT8:%.*]], !prof [[PROF15]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: handler.out_of_bounds: +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR: cont8: +// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[INDEX_SIZE:%.*]] = shl nuw nsw i64 [[IDXPROM]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = sub nsw i64 [[COUNT]], [[INDEX_SIZE]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.smax.i64(i64 [[RESULT]], i64 0) +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]] +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -10737418236, 10737418240) i64 @test10( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr 
[[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[INDEX_SIZE:%.*]] = shl nsw i64 [[IDXPROM]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = sub nsw i64 [[COUNT]], [[INDEX_SIZE]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[RESULT]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[INDEX]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[TMP0]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[RESULT]], i64 0 +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test10( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test10( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +size_t test10(struct annotated_sized_ptr *p, int index) { + return __bdos(&((unsigned int *) p->buf)[index]); +} diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index dfdf06587f0e2..101949af208e1 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -194,6 +194,42 @@ size_t test2_bdos(struct annotated *p) { return __bdos(p->array); } +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos_cast( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: 
[[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], i64 0 +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]] +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos_cast( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i32 [[COUNTED_BY_LOAD]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], i64 0 +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP1]] +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test2_bdos_cast( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test2_bdos_cast( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +size_t test2_bdos_cast(struct annotated *p) { + return __bdos((char *)p->array); +} + // SANITIZE-WITH-ATTR-LABEL: define 
dso_local void @test3( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: @@ -265,6 +301,30 @@ size_t test3_bdos(struct annotated *p) { return __bdos(p); } +// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos_cast( +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos_cast( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test3_bdos_cast( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test3_bdos_cast( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +size_t test3_bdos_cast(struct annotated *p) { + return __bdos((char *)p); +} + // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: @@ -469,6 +529,96 @@ size_t test4_bdos(struct annotated *p, int index) { return __bdos(&p->array[index]); } +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -12884901886, 12884901885) i64 @test4_bdos_cast1( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 
[[INDEX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[INDEX_SIZE:%.*]] = shl nsw i64 [[IDXPROM]], 1 +// SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = sub nsw i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], [[INDEX_SIZE]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[RESULT]], -1 +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[INDEX]], -1 +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[TMP0]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[RESULT]], i64 0 +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -12884901886, 12884901885) i64 @test4_bdos_cast1( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[INDEX_SIZE:%.*]] = shl nsw i64 [[IDXPROM]], 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = sub nsw i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], [[INDEX_SIZE]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[RESULT]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[INDEX]], 
-1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[TMP0]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[RESULT]], i64 0 +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4_bdos_cast1( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4_bdos_cast1( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +size_t test4_bdos_cast1(struct annotated *p, int index) { + return __bdos(&((unsigned short *) ((char *)p->array))[index]); +} + +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -10737418239, 10737418237) i64 @test4_bdos_cast2( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = sub nsw i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[RESULT]], -1 +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[INDEX]], -1 +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[TMP0]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 
[[RESULT]], i64 0 +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -10737418239, 10737418237) i64 @test4_bdos_cast2( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 2 +// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = sub nsw i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[RESULT]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[INDEX]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[TMP0]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[RESULT]], i64 0 +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4_bdos_cast2( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test4_bdos_cast2( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +size_t test4_bdos_cast2(struct annotated *p, int index) { + return __bdos(&((char *) ((unsigned short *)p->array))[index]); +} + // 
SANITIZE-WITH-ATTR-LABEL: define dso_local void @test5( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: @@ -2059,6 +2209,54 @@ size_t test32_bdos(struct annotated_with_array *ptr, int index) { return __bdos(&ptr->flags[index]); } +// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -21474836134, 21474836817) i64 @test32_bdos_cast( +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITH-ATTR-NEXT: entry: +// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336 +// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 3 +// SANITIZE-WITH-ATTR-NEXT: [[FIELD_OFFSET:%.*]] = shl nsw i64 [[IDXPROM]], 1 +// SANITIZE-WITH-ATTR-NEXT: [[REASS_SUB:%.*]] = sub nsw i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], [[FIELD_OFFSET]] +// SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add nsw i64 [[REASS_SUB]], 344 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[REASS_SUB]], -345 +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[INDEX]], -1 +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[TMP0]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[RESULT]], i64 0 +// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] +// +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -21474836134, 21474836817) i64 @test32_bdos_cast( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 
[[INDEX]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[COUNTED_BY_LOAD]] to i64 +// NO-SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nsw i64 [[COUNT]], 3 +// NO-SANITIZE-WITH-ATTR-NEXT: [[FIELD_OFFSET:%.*]] = shl nsw i64 [[IDXPROM]], 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[REASS_SUB:%.*]] = sub nsw i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], [[FIELD_OFFSET]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add nsw i64 [[REASS_SUB]], 344 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[REASS_SUB]], -345 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[INDEX]], -1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[TMP0]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[RESULT]], i64 0 +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]] +// +// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test32_bdos_cast( +// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test32_bdos_cast( +// NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 +// +size_t test32_bdos_cast(struct annotated_with_array *ptr, int index) { + return __bdos(&((unsigned short *) ((char *) ptr->flags))[index]); +} + // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test33( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: diff --git a/clang/test/CodeGen/builtins-arm64.c 
b/clang/test/CodeGen/builtins-arm64.c index 0913295b0c5f5..86c2812434643 100644 --- a/clang/test/CodeGen/builtins-arm64.c +++ b/clang/test/CodeGen/builtins-arm64.c @@ -10,7 +10,7 @@ void f0(void *a, void *b) { void *tp (void) { return __builtin_thread_pointer (); -// CHECK-LINUX: call {{.*}} @llvm.thread.pointer() +// CHECK-LINUX: call {{.*}} @llvm.thread.pointer.p0() } // CHECK: call {{.*}} @llvm.bitreverse.i32(i32 %a) diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 639c18190f436..7904762709df6 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -1127,6 +1127,26 @@ __device__ void nvvm_cvt_sm100a_sm101a_sm120a() { // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532) __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C); + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e2m1x2_rn_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e2m1x2_rn_relu_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76) + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76) + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76) + __nvvm_e2m1x2_to_f16x2_rn(0x004C); + + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76) + // 
CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76) + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76) + __nvvm_e2m1x2_to_f16x2_rn_relu(0x004C); + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00) // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00) // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00) diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index 263cfd3ab4c69..d8aff82b0c140 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -741,7 +741,13 @@ __externref_t externref_null() { // WEBASSEMBLY-NEXT: ret } +int externref_is_null(__externref_t arg) { + return __builtin_wasm_ref_is_null_extern(arg); + // WEBASSEMBLY: tail call i32 @llvm.wasm.ref.is_null.extern(ptr addrspace(10) %arg) + // WEBASSEMBLY-NEXT: ret +} + void *tp (void) { return __builtin_thread_pointer (); - // WEBASSEMBLY: call {{.*}} @llvm.thread.pointer() + // WEBASSEMBLY: call {{.*}} @llvm.thread.pointer.p0() } diff --git a/clang/test/CodeGen/dllexport.c b/clang/test/CodeGen/dllexport.c index 4c1143cf5ca48..f64bcb5393005 100644 --- a/clang/test/CodeGen/dllexport.c +++ b/clang/test/CodeGen/dllexport.c @@ -2,6 +2,8 @@ // RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-extensions -emit-llvm -std=c11 -O0 -o - %s | FileCheck %s // RUN: %clang_cc1 -triple i686-windows-gnu -fms-extensions -emit-llvm -std=c11 -O0 -o - %s | FileCheck %s // RUN: %clang_cc1 -triple x86_64-windows-gnu -fms-extensions -emit-llvm -std=c11 -O0 -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple i686-pc-cygwin -fms-extensions -emit-llvm -std=c11 -O0 -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -fms-extensions -emit-llvm -std=c11 -O0 -o - %s | FileCheck %s diff --git a/clang/test/CodeGen/dllimport.c b/clang/test/CodeGen/dllimport.c 
index 6170c8c4a66a3..1631c6dc56805 100644 --- a/clang/test/CodeGen/dllimport.c +++ b/clang/test/CodeGen/dllimport.c @@ -2,8 +2,11 @@ // RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-extensions -emit-llvm -std=c11 -O0 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=MS %s // RUN: %clang_cc1 -triple i686-windows-gnu -fms-extensions -emit-llvm -std=c11 -O0 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=GNU %s // RUN: %clang_cc1 -triple x86_64-windows-gnu -fms-extensions -emit-llvm -std=c11 -O0 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=GNU %s +// RUN: %clang_cc1 -triple i686-pc-cygwin -fms-extensions -emit-llvm -std=c11 -O0 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=GNU %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -fms-extensions -emit-llvm -std=c11 -O0 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=GNU %s // RUN: %clang_cc1 -triple i686-windows-msvc -fms-extensions -emit-llvm -std=c11 -O1 -fno-inline -o - %s | FileCheck --check-prefix=O1 --check-prefix=MO1 %s // RUN: %clang_cc1 -triple i686-windows-gnu -fms-extensions -emit-llvm -std=c11 -O1 -fno-inline -o - %s | FileCheck --check-prefix=O1 --check-prefix=GO1 %s +// RUN: %clang_cc1 -triple i686-pc-cygwin -fms-extensions -emit-llvm -std=c11 -O1 -fno-inline -o - %s | FileCheck --check-prefix=O1 --check-prefix=GO1 %s #define JOIN2(x, y) x##y #define JOIN(x, y) JOIN2(x, y) diff --git a/clang/test/CodeGen/dso-local-executable.c b/clang/test/CodeGen/dso-local-executable.c index 15575d3927f23..880273df137d7 100644 --- a/clang/test/CodeGen/dso-local-executable.c +++ b/clang/test/CodeGen/dso-local-executable.c @@ -12,6 +12,9 @@ // RUN: %clang_cc1 -triple x86_64-w64-mingw32 -emit-llvm %s -o - | FileCheck --check-prefixes=MINGW,MINGW-NATIVE_TLS,MINGW-AUTO-IMPORT %s // RUN: %clang_cc1 -triple x86_64-w64-mingw32 -emit-llvm %s -o - -fno-auto-import | FileCheck --check-prefixes=MINGW,MINGW-NATIVE_TLS,MINGW-NO-AUTO-IMPORT %s // RUN: %clang_cc1 -triple x86_64-w64-mingw32 
-emit-llvm %s -o - -femulated-tls | FileCheck --check-prefixes=MINGW,MINGW-EMUTLS,MINGW-AUTO-IMPORT %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -emit-llvm %s -o - | FileCheck --check-prefixes=MINGW,MINGW-NATIVE_TLS,MINGW-AUTO-IMPORT %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -emit-llvm %s -o - -fno-auto-import | FileCheck --check-prefixes=MINGW,MINGW-NATIVE_TLS,MINGW-NO-AUTO-IMPORT %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -emit-llvm %s -o - -femulated-tls | FileCheck --check-prefixes=MINGW,MINGW-EMUTLS,MINGW-AUTO-IMPORT %s // MINGW: @baz = dso_local global i32 42 // MINGW-NEXT: @import_var = external dllimport global i32 // MINGW-NEXT: @weak_bar = extern_weak global i32 diff --git a/clang/test/CodeGen/target-avx-abi-diag.c b/clang/test/CodeGen/target-avx-abi-diag.c index dfbbc3213ca6b..116959d60788f 100644 --- a/clang/test/CodeGen/target-avx-abi-diag.c +++ b/clang/test/CodeGen/target-avx-abi-diag.c @@ -98,4 +98,6 @@ __attribute__((target("avx512f"))) void call_avx512_ok2(void) { // avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}} // avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}} // avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}} +// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}} +// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}} #endif diff --git a/clang/test/CodeGenCXX/dllexport-members.cpp b/clang/test/CodeGenCXX/dllexport-members.cpp index e4effa4c72c72..3753050cbf7d5 100644 --- a/clang/test/CodeGenCXX/dllexport-members.cpp +++ b/clang/test/CodeGenCXX/dllexport-members.cpp @@ -4,6 +4,8 @@ // RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-msvc -fms-compatibility -fms-compatibility-version=19 -emit-llvm -std=c++1y -O0 
-o - %s | FileCheck --check-prefix=M64VS2015 %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G32 %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-pc-cygwin -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=C32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-pc-cygwin -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G64 %s // Helper structs to make templates more expressive. struct ImplicitInst_Exported {}; @@ -35,12 +37,16 @@ struct ExportMembers { // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?normalInlineDecl@ExportMembers@@QAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define weak_odr dso_local dllexport void @"?normalInlineDecl@ExportMembers@@QEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers9normalDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local dllexport void @_ZN13ExportMembers9normalDefEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN13ExportMembers9normalDefEv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers13normalInclassEv(ptr {{[^,]*}} %this) + // C32-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers13normalInclassEv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers13normalInclassEv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers15normalInlineDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers15normalInlineDefEv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr 
dso_local dllexport void @_ZN13ExportMembers15normalInlineDefEv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers16normalInlineDeclEv(ptr {{[^,]*}} %this) + // C32-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers16normalInlineDeclEv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers16normalInlineDeclEv(ptr {{[^,]*}} %this) // M32-DAG: define linkonce_odr dso_local x86_thiscallcc void @"?referencedNonExportedInClass@ExportMembers@@QAEXXZ" __declspec(dllexport) void normalDef(); @@ -58,12 +64,16 @@ struct ExportMembers { // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?virtualInlineDecl@ExportMembers@@UAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define weak_odr dso_local dllexport void @"?virtualInlineDecl@ExportMembers@@UEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers10virtualDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local dllexport void @_ZN13ExportMembers10virtualDefEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN13ExportMembers10virtualDefEv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers14virtualInclassEv(ptr {{[^,]*}} %this) + // C32-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers14virtualInclassEv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers14virtualInclassEv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers16virtualInlineDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers16virtualInlineDefEv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers16virtualInlineDefEv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void 
@_ZN13ExportMembers17virtualInlineDeclEv(ptr {{[^,]*}} %this) + // C32-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers17virtualInlineDeclEv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers17virtualInlineDeclEv(ptr {{[^,]*}} %this) __declspec(dllexport) virtual void virtualDef(); __declspec(dllexport) virtual void virtualInclass() {} @@ -86,6 +96,7 @@ struct ExportMembers { // M32-DAG: define dso_local dllexport x86_thiscallcc void @"?protectedDef@ExportMembers@@IAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define dso_local dllexport void @"?protectedDef@ExportMembers@@IEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers12protectedDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local dllexport void @_ZN13ExportMembers12protectedDefEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN13ExportMembers12protectedDefEv(ptr {{[^,]*}} %this) // MSC-DAG: define dso_local dllexport void @"?protectedStaticDef@ExportMembers@@KAXXZ"() // GNU-DAG: define dso_local dllexport void @_ZN13ExportMembers18protectedStaticDefEv() @@ -96,6 +107,7 @@ struct ExportMembers { // M32-DAG: define dso_local dllexport x86_thiscallcc void @"?privateDef@ExportMembers@@AAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define dso_local dllexport void @"?privateDef@ExportMembers@@AEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers10privateDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local dllexport void @_ZN13ExportMembers10privateDefEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN13ExportMembers10privateDefEv(ptr {{[^,]*}} %this) // MSC-DAG: define dso_local dllexport void @"?privateStaticDef@ExportMembers@@CAXXZ"() // GNU-DAG: define dso_local dllexport void @_ZN13ExportMembers16privateStaticDefEv() @@ -106,6 +118,7 @@ struct ExportMembers { // M32-DAG: define dso_local x86_thiscallcc 
void @"?ignored@ExportMembers@@QAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define dso_local void @"?ignored@ExportMembers@@QEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local x86_thiscallcc void @_ZN13ExportMembers7ignoredEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local void @_ZN13ExportMembers7ignoredEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local void @_ZN13ExportMembers7ignoredEv(ptr {{[^,]*}} %this) public: void ignored(); @@ -163,12 +176,16 @@ struct ExportMembers::Nested { // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?normalInlineDecl@Nested@ExportMembers@@QAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define weak_odr dso_local dllexport void @"?normalInlineDecl@Nested@ExportMembers@@QEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers6Nested9normalDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local dllexport void @_ZN13ExportMembers6Nested9normalDefEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN13ExportMembers6Nested9normalDefEv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers6Nested13normalInclassEv(ptr {{[^,]*}} %this) + // C32-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers6Nested13normalInclassEv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers6Nested13normalInclassEv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers6Nested15normalInlineDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers6Nested15normalInlineDefEv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers6Nested15normalInlineDefEv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers6Nested16normalInlineDeclEv(ptr {{[^,]*}} %this) + // C32-DAG: define 
weak_odr dso_local dllexport void @_ZN13ExportMembers6Nested16normalInlineDeclEv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers6Nested16normalInlineDeclEv(ptr {{[^,]*}} %this) __declspec(dllexport) void normalDef(); __declspec(dllexport) void normalInclass() {} @@ -184,12 +201,16 @@ struct ExportMembers::Nested { // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?virtualInlineDecl@Nested@ExportMembers@@UAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define weak_odr dso_local dllexport void @"?virtualInlineDecl@Nested@ExportMembers@@UEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers6Nested10virtualDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local dllexport void @_ZN13ExportMembers6Nested10virtualDefEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN13ExportMembers6Nested10virtualDefEv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers6Nested14virtualInclassEv(ptr {{[^,]*}} %this) + // C32-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers6Nested14virtualInclassEv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers6Nested14virtualInclassEv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers6Nested16virtualInlineDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers6Nested16virtualInlineDefEv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers6Nested16virtualInlineDefEv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers6Nested17virtualInlineDeclEv(ptr {{[^,]*}} %this) + // C32-DAG: define weak_odr dso_local dllexport void @_ZN13ExportMembers6Nested17virtualInlineDeclEv(ptr {{[^,]*}} %this) // G64-DAG: define 
weak_odr dso_local dllexport void @_ZN13ExportMembers6Nested17virtualInlineDeclEv(ptr {{[^,]*}} %this) __declspec(dllexport) virtual void virtualDef(); __declspec(dllexport) virtual void virtualInclass() {} @@ -212,6 +233,7 @@ struct ExportMembers::Nested { // M32-DAG: define dso_local dllexport x86_thiscallcc void @"?protectedDef@Nested@ExportMembers@@IAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define dso_local dllexport void @"?protectedDef@Nested@ExportMembers@@IEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers6Nested12protectedDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local dllexport void @_ZN13ExportMembers6Nested12protectedDefEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN13ExportMembers6Nested12protectedDefEv(ptr {{[^,]*}} %this) // MSC-DAG: define dso_local dllexport void @"?protectedStaticDef@Nested@ExportMembers@@KAXXZ"() // GNU-DAG: define dso_local dllexport void @_ZN13ExportMembers6Nested18protectedStaticDefEv() @@ -222,6 +244,7 @@ struct ExportMembers::Nested { // M32-DAG: define dso_local dllexport x86_thiscallcc void @"?privateDef@Nested@ExportMembers@@AAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define dso_local dllexport void @"?privateDef@Nested@ExportMembers@@AEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN13ExportMembers6Nested10privateDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local dllexport void @_ZN13ExportMembers6Nested10privateDefEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN13ExportMembers6Nested10privateDefEv(ptr {{[^,]*}} %this) // MSC-DAG: define dso_local dllexport void @"?privateStaticDef@Nested@ExportMembers@@CAXXZ"() // GNU-DAG: define dso_local dllexport void @_ZN13ExportMembers6Nested16privateStaticDefEv() @@ -232,6 +255,7 @@ struct ExportMembers::Nested { // M32-DAG: define dso_local x86_thiscallcc void @"?ignored@Nested@ExportMembers@@QAEXXZ"(ptr {{[^,]*}} %this) 
// M64-DAG: define dso_local void @"?ignored@Nested@ExportMembers@@QEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local x86_thiscallcc void @_ZN13ExportMembers6Nested7ignoredEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local void @_ZN13ExportMembers6Nested7ignoredEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local void @_ZN13ExportMembers6Nested7ignoredEv(ptr {{[^,]*}} %this) public: void ignored(); @@ -283,44 +307,54 @@ struct ExportSpecials { // M32-DAG: define dso_local dllexport x86_thiscallcc ptr @"??0ExportSpecials@@QAE@XZ"(ptr {{[^,]*}} returned {{[^,]*}} %this) // M64-DAG: define dso_local dllexport ptr @"??0ExportSpecials@@QEAA@XZ"(ptr {{[^,]*}} returned {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN14ExportSpecialsC1Ev(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local dllexport void @_ZN14ExportSpecialsC1Ev(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN14ExportSpecialsC1Ev(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN14ExportSpecialsC2Ev(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local dllexport void @_ZN14ExportSpecialsC2Ev(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN14ExportSpecialsC2Ev(ptr {{[^,]*}} %this) __declspec(dllexport) ExportSpecials(); // M32-DAG: define dso_local dllexport x86_thiscallcc void @"??1ExportSpecials@@QAE@XZ"(ptr {{[^,]*}} %this) // M64-DAG: define dso_local dllexport void @"??1ExportSpecials@@QEAA@XZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN14ExportSpecialsD1Ev(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local dllexport void @_ZN14ExportSpecialsD1Ev(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN14ExportSpecialsD1Ev(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN14ExportSpecialsD2Ev(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local dllexport void @_ZN14ExportSpecialsD2Ev(ptr {{[^,]*}} 
%this) // G64-DAG: define dso_local dllexport void @_ZN14ExportSpecialsD2Ev(ptr {{[^,]*}} %this) __declspec(dllexport) ~ExportSpecials(); // M32-DAG: define dso_local dllexport x86_thiscallcc ptr @"??0ExportSpecials@@QAE@ABU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // M64-DAG: define dso_local dllexport ptr @"??0ExportSpecials@@QEAA@AEBU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN14ExportSpecialsC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define dso_local dllexport void @_ZN14ExportSpecialsC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define dso_local dllexport void @_ZN14ExportSpecialsC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN14ExportSpecialsC2ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define dso_local dllexport void @_ZN14ExportSpecialsC2ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define dso_local dllexport void @_ZN14ExportSpecialsC2ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) __declspec(dllexport) ExportSpecials(const ExportSpecials&); // M32-DAG: define dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ExportSpecials@@QAEAAU0@ABU0@@Z"(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // M64-DAG: define dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ExportSpecials@@QEAAAEAU0@AEBU0@@Z"(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define 
dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14ExportSpecialsaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14ExportSpecialsaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14ExportSpecialsaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) __declspec(dllexport) ExportSpecials& operator=(const ExportSpecials&); // M32-DAG: define dso_local dllexport x86_thiscallcc ptr @"??0ExportSpecials@@QAE@$$QAU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // M64-DAG: define dso_local dllexport ptr @"??0ExportSpecials@@QEAA@$$QEAU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN14ExportSpecialsC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define dso_local dllexport void @_ZN14ExportSpecialsC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define dso_local dllexport void @_ZN14ExportSpecialsC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN14ExportSpecialsC2EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define dso_local dllexport void @_ZN14ExportSpecialsC2EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define dso_local dllexport void @_ZN14ExportSpecialsC2EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} 
dereferenceable({{[0-9]+}}) %0) __declspec(dllexport) ExportSpecials(ExportSpecials&&); // M32-DAG: define dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ExportSpecials@@QAEAAU0@$$QAU0@@Z"(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // M64-DAG: define dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ExportSpecials@@QEAAAEAU0@$$QEAU0@@Z"(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14ExportSpecialsaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14ExportSpecialsaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14ExportSpecialsaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) __declspec(dllexport) ExportSpecials& operator=(ExportSpecials&&); }; @@ -337,36 +371,42 @@ struct ExportInlineSpecials { // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc ptr @"??0ExportInlineSpecials@@QAE@XZ"(ptr {{[^,]*}} returned {{[^,]*}} %this) // M64-DAG: define weak_odr dso_local dllexport ptr @"??0ExportInlineSpecials@@QEAA@XZ"( // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN20ExportInlineSpecialsC1Ev( + // C32-DAG: define weak_odr dso_local dllexport void @_ZN20ExportInlineSpecialsC1Ev( // G64-DAG: define weak_odr dso_local dllexport void @_ZN20ExportInlineSpecialsC1Ev( __declspec(dllexport) ExportInlineSpecials() {} // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"??1ExportInlineSpecials@@QAE@XZ"( // M64-DAG: define weak_odr dso_local dllexport 
void @"??1ExportInlineSpecials@@QEAA@XZ"( // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN20ExportInlineSpecialsD1Ev( + // C32-DAG: define weak_odr dso_local dllexport void @_ZN20ExportInlineSpecialsD1Ev( // G64-DAG: define weak_odr dso_local dllexport void @_ZN20ExportInlineSpecialsD1Ev( __declspec(dllexport) ~ExportInlineSpecials() {} // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc ptr @"??0ExportInlineSpecials@@QAE@ABU0@@Z"( // M64-DAG: define weak_odr dso_local dllexport ptr @"??0ExportInlineSpecials@@QEAA@AEBU0@@Z"( // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN20ExportInlineSpecialsC1ERKS_( + // C32-DAG: define weak_odr dso_local dllexport void @_ZN20ExportInlineSpecialsC1ERKS_( // G64-DAG: define weak_odr dso_local dllexport void @_ZN20ExportInlineSpecialsC1ERKS_( __declspec(dllexport) inline ExportInlineSpecials(const ExportInlineSpecials&); // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ExportInlineSpecials@@QAEAAU0@ABU0@@Z"( // M64-DAG: define weak_odr dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ExportInlineSpecials@@QEAAAEAU0@AEBU0@@Z"( // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ExportInlineSpecialsaSERKS_( + // C32-DAG: define weak_odr dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ExportInlineSpecialsaSERKS_( // G64-DAG: define weak_odr dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ExportInlineSpecialsaSERKS_( __declspec(dllexport) ExportInlineSpecials& operator=(const ExportInlineSpecials&); // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc ptr @"??0ExportInlineSpecials@@QAE@$$QAU0@@Z"( // M64-DAG: define weak_odr dso_local dllexport ptr @"??0ExportInlineSpecials@@QEAA@$$QEAU0@@Z"( // G32-DAG: define 
weak_odr dso_local dllexport x86_thiscallcc void @_ZN20ExportInlineSpecialsC1EOS_( + // C32-DAG: define weak_odr dso_local dllexport void @_ZN20ExportInlineSpecialsC1EOS_( // G64-DAG: define weak_odr dso_local dllexport void @_ZN20ExportInlineSpecialsC1EOS_( __declspec(dllexport) ExportInlineSpecials(ExportInlineSpecials&&) {} // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ExportInlineSpecials@@QAEAAU0@$$QAU0@@Z"( // M64-DAG: define weak_odr dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ExportInlineSpecials@@QEAAAEAU0@$$QEAU0@@Z"( // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ExportInlineSpecialsaSEOS_( + // C32-DAG: define weak_odr dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ExportInlineSpecialsaSEOS_( // G64-DAG: define weak_odr dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ExportInlineSpecialsaSEOS_( __declspec(dllexport) ExportInlineSpecials& operator=(ExportInlineSpecials&&) { return *this; } }; @@ -387,44 +427,54 @@ struct ExportDefaultedDefs { // M32-DAG: define dso_local dllexport x86_thiscallcc ptr @"??0ExportDefaultedDefs@@QAE@XZ"(ptr {{[^,]*}} returned {{[^,]*}} %this) // M64-DAG: define dso_local dllexport ptr @"??0ExportDefaultedDefs@@QEAA@XZ"(ptr {{[^,]*}} returned {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN19ExportDefaultedDefsC1Ev(ptr {{[^,]*}} %this) +// C32-DAG: define dso_local dllexport void @_ZN19ExportDefaultedDefsC1Ev(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN19ExportDefaultedDefsC1Ev(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN19ExportDefaultedDefsC2Ev(ptr {{[^,]*}} %this) +// C32-DAG: define dso_local dllexport void @_ZN19ExportDefaultedDefsC2Ev(ptr {{[^,]*}} %this) // 
G64-DAG: define dso_local dllexport void @_ZN19ExportDefaultedDefsC2Ev(ptr {{[^,]*}} %this) __declspec(dllexport) ExportDefaultedDefs::ExportDefaultedDefs() = default; // M32-DAG: define dso_local dllexport x86_thiscallcc void @"??1ExportDefaultedDefs@@QAE@XZ"(ptr {{[^,]*}} %this) // M64-DAG: define dso_local dllexport void @"??1ExportDefaultedDefs@@QEAA@XZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN19ExportDefaultedDefsD1Ev(ptr {{[^,]*}} %this) +// C32-DAG: define dso_local dllexport void @_ZN19ExportDefaultedDefsD1Ev(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN19ExportDefaultedDefsD1Ev(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN19ExportDefaultedDefsD2Ev(ptr {{[^,]*}} %this) +// C32-DAG: define dso_local dllexport void @_ZN19ExportDefaultedDefsD2Ev(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN19ExportDefaultedDefsD2Ev(ptr {{[^,]*}} %this) ExportDefaultedDefs::~ExportDefaultedDefs() = default; // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc ptr @"??0ExportDefaultedDefs@@QAE@ABU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // M64-DAG: define weak_odr dso_local dllexport ptr @"??0ExportDefaultedDefs@@QEAA@AEBU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN19ExportDefaultedDefsC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) +// C32-DAG: define weak_odr dso_local dllexport void @_ZN19ExportDefaultedDefsC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define weak_odr dso_local dllexport void @_ZN19ExportDefaultedDefsC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define weak_odr dso_local 
dllexport x86_thiscallcc void @_ZN19ExportDefaultedDefsC2ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) +// C32-DAG: define weak_odr dso_local dllexport void @_ZN19ExportDefaultedDefsC2ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define weak_odr dso_local dllexport void @_ZN19ExportDefaultedDefsC2ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) __declspec(dllexport) ExportDefaultedDefs::ExportDefaultedDefs(const ExportDefaultedDefs&) = default; // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ExportDefaultedDefs@@QAEAAU0@ABU0@@Z"(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // M64-DAG: define weak_odr dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ExportDefaultedDefs@@QEAAAEAU0@AEBU0@@Z"(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN19ExportDefaultedDefsaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) +// C32-DAG: define weak_odr dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN19ExportDefaultedDefsaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define weak_odr dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN19ExportDefaultedDefsaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) inline ExportDefaultedDefs& ExportDefaultedDefs::operator=(const ExportDefaultedDefs&) = default; // M32-DAG: define dso_local dllexport x86_thiscallcc ptr @"??0ExportDefaultedDefs@@QAE@$$QAU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}} %this, ptr nonnull align 
{{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // M64-DAG: define dso_local dllexport ptr @"??0ExportDefaultedDefs@@QEAA@$$QEAU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN19ExportDefaultedDefsC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) +// C32-DAG: define dso_local dllexport void @_ZN19ExportDefaultedDefsC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define dso_local dllexport void @_ZN19ExportDefaultedDefsC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN19ExportDefaultedDefsC2EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) +// C32-DAG: define dso_local dllexport void @_ZN19ExportDefaultedDefsC2EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define dso_local dllexport void @_ZN19ExportDefaultedDefsC2EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) __declspec(dllexport) ExportDefaultedDefs::ExportDefaultedDefs(ExportDefaultedDefs&&) = default; // M32-DAG: define dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ExportDefaultedDefs@@QAEAAU0@$$QAU0@@Z"(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // M64-DAG: define dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ExportDefaultedDefs@@QEAAAEAU0@$$QEAU0@@Z"(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN19ExportDefaultedDefsaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) +// 
C32-DAG: define dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN19ExportDefaultedDefsaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN19ExportDefaultedDefsaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) ExportDefaultedDefs& ExportDefaultedDefs::operator=(ExportDefaultedDefs&&) = default; @@ -466,24 +516,28 @@ struct ExportAlloc { // M32-DAG: define dso_local dllexport ptr @"??2ExportAlloc@@SAPAXI@Z"(i32 %n) // M64-DAG: define dso_local dllexport ptr @"??2ExportAlloc@@SAPEAX_K@Z"(i64 %n) // G32-DAG: define dso_local dllexport ptr @_ZN11ExportAllocnwEj(i32 %n) -// G64-DAG: define dso_local dllexport ptr @_ZN11ExportAllocnwEy(i64 %n) +// C32-DAG: define dso_local dllexport ptr @_ZN11ExportAllocnwEj(i32 %n) +// G64-DAG: define dso_local dllexport ptr @_ZN11ExportAllocnwE{{[ym]}}(i64 %n) void* ExportAlloc::operator new(__SIZE_TYPE__ n) { return malloc(n); } // M32-DAG: define dso_local dllexport ptr @"??_UExportAlloc@@SAPAXI@Z"(i32 %n) // M64-DAG: define dso_local dllexport ptr @"??_UExportAlloc@@SAPEAX_K@Z"(i64 %n) // G32-DAG: define dso_local dllexport ptr @_ZN11ExportAllocnaEj(i32 %n) -// G64-DAG: define dso_local dllexport ptr @_ZN11ExportAllocnaEy(i64 %n) +// C32-DAG: define dso_local dllexport ptr @_ZN11ExportAllocnaEj(i32 %n) +// G64-DAG: define dso_local dllexport ptr @_ZN11ExportAllocnaE{{[ym]}}(i64 %n) void* ExportAlloc::operator new[](__SIZE_TYPE__ n) { return malloc(n); } // M32-DAG: define dso_local dllexport void @"??3ExportAlloc@@SAXPAX@Z"(ptr %p) // M64-DAG: define dso_local dllexport void @"??3ExportAlloc@@SAXPEAX@Z"(ptr %p) // G32-DAG: define dso_local dllexport void @_ZN11ExportAllocdlEPv(ptr %p) +// C32-DAG: define dso_local dllexport void @_ZN11ExportAllocdlEPv(ptr %p) // G64-DAG: define dso_local dllexport void @_ZN11ExportAllocdlEPv(ptr 
%p) void ExportAlloc::operator delete(void* p) { free(p); } // M32-DAG: define dso_local dllexport void @"??_VExportAlloc@@SAXPAX@Z"(ptr %p) // M64-DAG: define dso_local dllexport void @"??_VExportAlloc@@SAXPEAX@Z"(ptr %p) // G32-DAG: define dso_local dllexport void @_ZN11ExportAllocdaEPv(ptr %p) +// C32-DAG: define dso_local dllexport void @_ZN11ExportAllocdaEPv(ptr %p) // G64-DAG: define dso_local dllexport void @_ZN11ExportAllocdaEPv(ptr %p) void ExportAlloc::operator delete[](void* p) { free(p); } @@ -504,6 +558,7 @@ void useMemFunTmpl() { // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"??$exportedNormal@UImplicitInst_Exported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define weak_odr dso_local dllexport void @"??$exportedNormal@UImplicitInst_Exported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN10MemFunTmpl14exportedNormalI21ImplicitInst_ExportedEEvv(ptr {{[^,]*}} %this) + // C32-DAG: define weak_odr dso_local dllexport void @_ZN10MemFunTmpl14exportedNormalI21ImplicitInst_ExportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN10MemFunTmpl14exportedNormalI21ImplicitInst_ExportedEEvv(ptr {{[^,]*}} %this) MemFunTmpl().exportedNormal(); @@ -518,6 +573,7 @@ void useMemFunTmpl() { // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"??$exportedNormal@UExplicitDecl_Exported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define weak_odr dso_local dllexport void @"??$exportedNormal@UExplicitDecl_Exported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN10MemFunTmpl14exportedNormalI21ExplicitDecl_ExportedEEvv(ptr {{[^,]*}} %this) +// C32-DAG: define weak_odr dso_local dllexport void @_ZN10MemFunTmpl14exportedNormalI21ExplicitDecl_ExportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void 
@_ZN10MemFunTmpl14exportedNormalI21ExplicitDecl_ExportedEEvv(ptr {{[^,]*}} %this) extern template void MemFunTmpl::exportedNormal(); template void MemFunTmpl::exportedNormal(); @@ -533,6 +589,7 @@ extern template void MemFunTmpl::exportedStatic(); // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"??$exportedNormal@UExplicitInst_Exported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define weak_odr dso_local dllexport void @"??$exportedNormal@UExplicitInst_Exported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN10MemFunTmpl14exportedNormalI21ExplicitInst_ExportedEEvv(ptr {{[^,]*}} %this) +// C32-DAG: define weak_odr dso_local dllexport void @_ZN10MemFunTmpl14exportedNormalI21ExplicitInst_ExportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN10MemFunTmpl14exportedNormalI21ExplicitInst_ExportedEEvv(ptr {{[^,]*}} %this) template void MemFunTmpl::exportedNormal(); @@ -545,12 +602,14 @@ template void MemFunTmpl::exportedStatic(); // M32-DAG: define dso_local dllexport x86_thiscallcc void @"??$exportedNormal@UExplicitSpec_Def_Exported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define dso_local dllexport void @"??$exportedNormal@UExplicitSpec_Def_Exported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN10MemFunTmpl14exportedNormalI25ExplicitSpec_Def_ExportedEEvv(ptr {{[^,]*}} %this) +// C32-DAG: define dso_local dllexport void @_ZN10MemFunTmpl14exportedNormalI25ExplicitSpec_Def_ExportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN10MemFunTmpl14exportedNormalI25ExplicitSpec_Def_ExportedEEvv(ptr {{[^,]*}} %this) template<> __declspec(dllexport) void MemFunTmpl::exportedNormal() {} // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"??$exportedNormal@UExplicitSpec_InlineDef_Exported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}} %this) // 
M64-DAG: define weak_odr dso_local dllexport void @"??$exportedNormal@UExplicitSpec_InlineDef_Exported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN10MemFunTmpl14exportedNormalI31ExplicitSpec_InlineDef_ExportedEEvv(ptr {{[^,]*}} %this) +// C32-DAG: define weak_odr dso_local dllexport void @_ZN10MemFunTmpl14exportedNormalI31ExplicitSpec_InlineDef_ExportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN10MemFunTmpl14exportedNormalI31ExplicitSpec_InlineDef_ExportedEEvv(ptr {{[^,]*}} %this) template<> __declspec(dllexport) inline void MemFunTmpl::exportedNormal() {} @@ -568,6 +627,7 @@ template<> __declspec(dllexport) inline void MemFunTmpl::exportedStatic void MemFunTmpl::exportedNormal() {} @@ -581,6 +641,7 @@ template<> void MemFunTmpl::exportedStatic() {} // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"??$normalDef@UExplicitDecl_Exported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define weak_odr dso_local dllexport void @"??$normalDef@UExplicitDecl_Exported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN10MemFunTmpl9normalDefI21ExplicitDecl_ExportedEEvv(ptr {{[^,]*}} %this) +// C32-DAG: define weak_odr dso_local dllexport void @_ZN10MemFunTmpl9normalDefI21ExplicitDecl_ExportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN10MemFunTmpl9normalDefI21ExplicitDecl_ExportedEEvv(ptr {{[^,]*}} %this) extern template __declspec(dllexport) void MemFunTmpl::normalDef(); template __declspec(dllexport) void MemFunTmpl::normalDef(); @@ -596,6 +657,7 @@ extern template __declspec(dllexport) void MemFunTmpl::staticDef(); @@ -610,8 +672,10 @@ template __declspec(dllexport) void MemFunTmpl::staticDef // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"??$normalDef@UExplicitSpec_InlineDef_Exported@@@MemFunTmpl@@QAEXXZ"(ptr 
{{[^,]*}} %this) // M64-DAG: define weak_odr dso_local dllexport void @"??$normalDef@UExplicitSpec_InlineDef_Exported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN10MemFunTmpl9normalDefI25ExplicitSpec_Def_ExportedEEvv(ptr {{[^,]*}} %this) +// C32-DAG: define dso_local dllexport void @_ZN10MemFunTmpl9normalDefI25ExplicitSpec_Def_ExportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local dllexport void @_ZN10MemFunTmpl9normalDefI25ExplicitSpec_Def_ExportedEEvv(ptr {{[^,]*}} %this) // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN10MemFunTmpl9normalDefI31ExplicitSpec_InlineDef_ExportedEEvv(ptr {{[^,]*}} %this) +// C32-DAG: define weak_odr dso_local dllexport void @_ZN10MemFunTmpl9normalDefI31ExplicitSpec_InlineDef_ExportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local dllexport void @_ZN10MemFunTmpl9normalDefI31ExplicitSpec_InlineDef_ExportedEEvv(ptr {{[^,]*}} %this) template<> __declspec(dllexport) void MemFunTmpl::normalDef() {} template<> __declspec(dllexport) inline void MemFunTmpl::normalDef() {} @@ -692,8 +756,10 @@ template struct ClassTmplMem { // MSVC exports explicit specialization of exported class template member function; MinGW does not. 
// M32-DAG: define dso_local dllexport x86_thiscallcc void @"?exportedNormal@?$ClassTmplMem@H@@QAEXXZ" // G32-DAG: define dso_local x86_thiscallcc void @_ZN12ClassTmplMemIiE14exportedNormalEv +// C32-DAG: define dso_local void @_ZN12ClassTmplMemIiE14exportedNormalEv template<> void ClassTmplMem::exportedNormal() {} // M32-DAG: define dso_local dllexport void @"?exportedStatic@?$ClassTmplMem@H@@SAXXZ" // G32-DAG: define dso_local void @_ZN12ClassTmplMemIiE14exportedStaticEv +// C32-DAG: define dso_local void @_ZN12ClassTmplMemIiE14exportedStaticEv template<> void ClassTmplMem::exportedStatic() {} diff --git a/clang/test/CodeGenCXX/dllexport-missing-key.cpp b/clang/test/CodeGenCXX/dllexport-missing-key.cpp index 90e736f6fad3a..505679f315331 100644 --- a/clang/test/CodeGenCXX/dllexport-missing-key.cpp +++ b/clang/test/CodeGenCXX/dllexport-missing-key.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -std=c++11 -o - %s | FileCheck --check-prefix=GNU %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -emit-llvm -std=c++11 -o - %s | FileCheck --check-prefix=GNU %s class __declspec(dllexport) QAbstractLayoutStyleInfo { public: diff --git a/clang/test/CodeGenCXX/dllexport.cpp b/clang/test/CodeGenCXX/dllexport.cpp index c8ac526f4cbe3..dfbb2762ac85c 100644 --- a/clang/test/CodeGenCXX/dllexport.cpp +++ b/clang/test/CodeGenCXX/dllexport.cpp @@ -6,8 +6,10 @@ // RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-gnu -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck -allow-deprecated-dag-overlap --check-prefix=GNU --check-prefix=G32 %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-gnu -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck -allow-deprecated-dag-overlap --check-prefix=GNU %s -// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-scei-ps4 -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck 
-allow-deprecated-dag-overlap --check-prefix=PS %s -// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-sie-ps5 -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck -allow-deprecated-dag-overlap --check-prefix=PS %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-pc-cygwin -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck -allow-deprecated-dag-overlap --check-prefix=GNU --check-prefix=C32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-pc-cygwin -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck -allow-deprecated-dag-overlap --check-prefix=GNU %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-scei-ps4 -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck -allow-deprecated-dag-overlap --check-prefix=PS %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-sie-ps5 -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck -allow-deprecated-dag-overlap --check-prefix=PS %s // Helper structs to make templates more expressive. struct ImplicitInst_Exported {}; @@ -308,7 +310,7 @@ void Befriended::func() {} // Implicit declarations can be redeclared with dllexport. 
// MSC-DAG: define dso_local dllexport nonnull ptr @"??2@{{YAPAXI|YAPEAX_K}}@Z"( -// GNU-DAG: define dso_local dllexport nonnull ptr @_Znw{{[yj]}}( +// GNU-DAG: define dso_local dllexport nonnull ptr @_Znw{{[yjm]}}( void* alloc(__SIZE_TYPE__ n); __declspec(dllexport) void* operator new(__SIZE_TYPE__ n) { return alloc(n); } @@ -616,6 +618,7 @@ void W::foo() {} // M32-DAG: [[W_VTABLE:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4W@@6B@", ptr @"?foo@W@@UAEXXZ"] }, comdat($"??_7W@@6B@") // M32-DAG: @"??_7W@@6B@" = dllexport unnamed_addr alias ptr, getelementptr inbounds ({ [2 x ptr] }, ptr [[W_VTABLE]], i32 0, i32 0, i32 1) // G32-DAG: @_ZTV1W = dso_local dllexport unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1W, ptr @_ZN1W3fooEv] } +// C32-DAG: @_ZTV1W = dso_local dllexport unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1W, ptr @_ZN1W3fooEv] } struct __declspec(dllexport) X : public virtual W {}; // vbtable: @@ -699,6 +702,7 @@ template void PartiallySpecializedClassTemplate::f() {} USEMEMFUNC(PartiallySpecializedClassTemplate, f); // M32-DAG: define linkonce_odr dso_local x86_thiscallcc void @"?f@?$PartiallySpecializedClassTemplate@PAX@@QAEXXZ" // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN33PartiallySpecializedClassTemplateIPvE1fEv +// C32-DAG: define weak_odr dso_local dllexport void @_ZN33PartiallySpecializedClassTemplateIPvE1fEv // Attributes on explicit specializations are honored. 
template struct ExplicitlySpecializedClassTemplate {}; @@ -707,6 +711,7 @@ void ExplicitlySpecializedClassTemplate::f() {} USEMEMFUNC(ExplicitlySpecializedClassTemplate, f); // M32-DAG: define dso_local dllexport x86_thiscallcc void @"?f@?$ExplicitlySpecializedClassTemplate@PAX@@QAEXXZ" // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN34ExplicitlySpecializedClassTemplateIPvE1fEv +// C32-DAG: define dso_local dllexport void @_ZN34ExplicitlySpecializedClassTemplateIPvE1fEv // MS inherits DLL attributes to partial specializations. template struct __declspec(dllexport) PartiallySpecializedExportedClassTemplate {}; @@ -714,6 +719,7 @@ template struct PartiallySpecializedExportedClassTemplate { voi USEMEMFUNC(PartiallySpecializedExportedClassTemplate, f); // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?f@?$PartiallySpecializedExportedClassTemplate@PAX@@QAEXXZ" // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN41PartiallySpecializedExportedClassTemplateIPvE1fEv +// C32-DAG: define linkonce_odr dso_local void @_ZN41PartiallySpecializedExportedClassTemplateIPvE1fEv // MS ignores DLL attributes on partial specializations; inheritance still works though. template struct __declspec(dllexport) PartiallySpecializedExportedClassTemplate2 {}; @@ -722,6 +728,7 @@ template void PartiallySpecializedExportedClassTemplate2::f() { USEMEMFUNC(PartiallySpecializedExportedClassTemplate2, f); // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?f@?$PartiallySpecializedExportedClassTemplate2@PAX@@QAEXXZ" // G32-DAG: declare dllimport x86_thiscallcc void @_ZN42PartiallySpecializedExportedClassTemplate2IPvE1fEv +// C32-DAG: declare dllimport void @_ZN42PartiallySpecializedExportedClassTemplate2IPvE1fEv // Attributes on the instantiation take precedence over attributes on the template. 
template struct __declspec(dllimport) ExplicitlyInstantiatedWithDifferentAttr { void f() {} }; @@ -771,6 +778,7 @@ USEMEMFUNC(ExplicitInstantiationDeclExportedDefTemplate, f); // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?f@?$ExplicitInstantiationDeclExportedDefTemplate@H@@QAEXXZ" // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc ptr @"??0?$ExplicitInstantiationDeclExportedDefTemplate@H@@QAE@XZ" // G32-DAG: define weak_odr dso_local x86_thiscallcc void @_ZN44ExplicitInstantiationDeclExportedDefTemplateIiE1fEv +// C32-DAG: define weak_odr dso_local void @_ZN44ExplicitInstantiationDeclExportedDefTemplateIiE1fEv template struct ImplicitInstantiationExportedExplicitInstantiationDefTemplate { virtual void f() {} }; ImplicitInstantiationExportedExplicitInstantiationDefTemplate ImplicitInstantiationExportedExplicitInstantiationDefTemplateInstance; @@ -778,6 +786,7 @@ template struct __declspec(dllexport) ImplicitInstantiationExportedExplicitInsta USEMEMFUNC(ImplicitInstantiationExportedExplicitInstantiationDefTemplate, f); // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?f@?$ImplicitInstantiationExportedExplicitInstantiationDefTemplate@H@@UAEXXZ" // G32-DAG: define weak_odr dso_local x86_thiscallcc void @_ZN61ImplicitInstantiationExportedExplicitInstantiationDefTemplateIiE1fEv +// C32-DAG: define weak_odr dso_local void @_ZN61ImplicitInstantiationExportedExplicitInstantiationDefTemplateIiE1fEv template struct __declspec(dllexport) ImplicitInstantiationExplicitInstantiationDefExportedTemplate { virtual void f() {} }; ImplicitInstantiationExplicitInstantiationDefExportedTemplate ImplicitInstantiationExplicitInstantiationDefExportedTemplateInstance; @@ -785,6 +794,7 @@ template struct ImplicitInstantiationExplicitInstantiationDefExportedTemplate, f); // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?f@?$ImplicitInstantiationExplicitInstantiationDefExportedTemplate@H@@UAEXXZ" // G32-DAG: define 
weak_odr dso_local x86_thiscallcc void @_ZN61ImplicitInstantiationExplicitInstantiationDefExportedTemplateIiE1fEv +// C32-DAG: define weak_odr dso_local void @_ZN61ImplicitInstantiationExplicitInstantiationDefExportedTemplateIiE1fEv template struct __declspec(dllexport) ImplicitInstantiationExportedExplicitInstantiationDefExportedTemplate { virtual void f() {} }; ImplicitInstantiationExportedExplicitInstantiationDefExportedTemplate ImplicitInstantiationExportedExplicitInstantiationDefExportedTemplateInstance; @@ -792,6 +802,7 @@ template struct __declspec(dllexport) ImplicitInstantiationExportedExplicitInsta USEMEMFUNC(ImplicitInstantiationExportedExplicitInstantiationDefExportedTemplate, f); // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?f@?$ImplicitInstantiationExportedExplicitInstantiationDefExportedTemplate@H@@UAEXXZ" // G32-DAG: define weak_odr dso_local x86_thiscallcc void @_ZN69ImplicitInstantiationExportedExplicitInstantiationDefExportedTemplateIiE1fEv +// C32-DAG: define weak_odr dso_local void @_ZN69ImplicitInstantiationExportedExplicitInstantiationDefExportedTemplateIiE1fEv namespace { struct InternalLinkageType {}; } struct __declspec(dllexport) PR23308 { @@ -982,6 +993,7 @@ struct __declspec(dllexport) DerivedFromTemplate : public ClassTemplate {}; USEMEMFUNC(DerivedFromTemplate, func) // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?func@?$ClassTemplate@H@@QAEXXZ" // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ClassTemplateIiE4funcEv +// C32-DAG: define linkonce_odr dso_local void @_ZN13ClassTemplateIiE4funcEv // PS-DAG: define weak_odr dllexport void @_ZN13ClassTemplateIiE4funcEv // ExportedTemplate is explicitly exported. 
@@ -989,6 +1001,7 @@ struct __declspec(dllexport) DerivedFromExportedTemplate : public ExportedClassT USEMEMFUNC(DerivedFromExportedTemplate, func) // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?func@?$ExportedClassTemplate@H@@QAEXXZ" // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN21ExportedClassTemplateIiE4funcEv +// C32-DAG: define weak_odr dso_local dllexport void @_ZN21ExportedClassTemplateIiE4funcEv // PS-DAG: define weak_odr dllexport void @_ZN21ExportedClassTemplateIiE4funcEv // ImportedClassTemplate is explicitly imported. @@ -996,6 +1009,7 @@ struct __declspec(dllexport) DerivedFromImportedTemplate : public ImportedClassT USEMEMFUNC(DerivedFromImportedTemplate, func) // M32-DAG: {{declare|define available_externally}} dllimport x86_thiscallcc void @"?func@?$ImportedClassTemplate@H@@QAEXXZ" // G32-DAG: declare dllimport x86_thiscallcc void @_ZN21ImportedClassTemplateIiE4funcEv +// C32-DAG: declare dllimport void @_ZN21ImportedClassTemplateIiE4funcEv // PS-DAG: declare dllimport void @_ZN21ImportedClassTemplateIiE4funcEv // Base class already implicitly instantiated without dll attribute. @@ -1004,6 +1018,7 @@ struct __declspec(dllexport) DerivedFromTemplateD2 : public ClassTemplate USEMEMFUNC(DerivedFromTemplateB2, func) // M32-DAG: {{declare|define available_externally}} dllimport x86_thiscallcc void @"?func@?$ClassTemplate@_N@@QAEXXZ" // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ClassTemplateIbE4funcEv +// C32-DAG: define linkonce_odr dso_local void @_ZN13ClassTemplateIbE4funcEv // PS-DAG: declare dllimport void @_ZN13ClassTemplateIbE4funcEv // Base class already specialized without dll attribute. 
@@ -1019,6 +1035,7 @@ struct __declspec(dllexport) DerivedFromExplicitlySpecializedTemplate : public E USEMEMFUNC(DerivedFromExplicitlySpecializedTemplate, func) // M32-DAG: define dso_local x86_thiscallcc void @"?func@?$ExplicitlySpecializedTemplate@H@@QAEXXZ" // G32-DAG: define dso_local x86_thiscallcc void @_ZN29ExplicitlySpecializedTemplateIiE4funcEv +// C32-DAG: define dso_local void @_ZN29ExplicitlySpecializedTemplateIiE4funcEv // PS-DAG: define dso_local void @_ZN29ExplicitlySpecializedTemplateIiE4funcEv // Base class alredy specialized with export attribute. @@ -1026,6 +1043,7 @@ struct __declspec(dllexport) DerivedFromExplicitlyExportSpecializedTemplate : pu USEMEMFUNC(DerivedFromExplicitlyExportSpecializedTemplate, func) // M32-DAG: define dso_local dllexport x86_thiscallcc void @"?func@?$ExplicitlyExportSpecializedTemplate@H@@QAEXXZ" // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN35ExplicitlyExportSpecializedTemplateIiE4funcEv +// C32-DAG: define dso_local dllexport void @_ZN35ExplicitlyExportSpecializedTemplateIiE4funcEv // PS-DAG: define dso_local dllexport void @_ZN35ExplicitlyExportSpecializedTemplateIiE4funcEv // Base class already specialized with import attribute. @@ -1033,6 +1051,7 @@ struct __declspec(dllexport) DerivedFromExplicitlyImportSpecializedTemplate : pu USEMEMFUNC(DerivedFromExplicitlyImportSpecializedTemplate, func) // M32-DAG: declare dllimport x86_thiscallcc void @"?func@?$ExplicitlyImportSpecializedTemplate@H@@QAEXXZ" // G32-DAG: declare dllimport x86_thiscallcc void @_ZN35ExplicitlyImportSpecializedTemplateIiE4funcEv +// C32-DAG: declare dllimport void @_ZN35ExplicitlyImportSpecializedTemplateIiE4funcEv // PS-DAG: declare dllimport void @_ZN35ExplicitlyImportSpecializedTemplateIiE4funcEv // Base class already instantiated without dll attribute. 
@@ -1040,6 +1059,7 @@ struct __declspec(dllexport) DerivedFromExplicitlyInstantiatedTemplate : public USEMEMFUNC(DerivedFromExplicitlyInstantiatedTemplate, func) // M32-DAG: define weak_odr dso_local x86_thiscallcc void @"?func@?$ExplicitlyInstantiatedTemplate@H@@QAEXXZ" // G32-DAG: define weak_odr dso_local x86_thiscallcc void @_ZN30ExplicitlyInstantiatedTemplateIiE4funcEv +// C32-DAG: define weak_odr dso_local void @_ZN30ExplicitlyInstantiatedTemplateIiE4funcEv // PS-DAG: define weak_odr void @_ZN30ExplicitlyInstantiatedTemplateIiE4funcEv // Base class already instantiated with export attribute. @@ -1047,6 +1067,7 @@ struct __declspec(dllexport) DerivedFromExplicitlyExportInstantiatedTemplate : p USEMEMFUNC(DerivedFromExplicitlyExportInstantiatedTemplate, func) // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?func@?$ExplicitlyExportInstantiatedTemplate@H@@QAEXXZ" // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN36ExplicitlyExportInstantiatedTemplateIiE4funcEv +// C32-DAG: define weak_odr dso_local dllexport void @_ZN36ExplicitlyExportInstantiatedTemplateIiE4funcEv // PS-DAG: define weak_odr dllexport void @_ZN36ExplicitlyExportInstantiatedTemplateIiE4funcEv // Base class already instantiated with import attribute. @@ -1054,6 +1075,7 @@ struct __declspec(dllexport) DerivedFromExplicitlyImportInstantiatedTemplate : p USEMEMFUNC(DerivedFromExplicitlyImportInstantiatedTemplate, func) // M32-DAG: declare dllimport x86_thiscallcc void @"?func@?$ExplicitlyImportInstantiatedTemplate@H@@QAEXXZ" // G32-DAG: declare dllimport x86_thiscallcc void @_ZN36ExplicitlyImportInstantiatedTemplateIiE4funcEv +// C32-DAG: declare dllimport void @_ZN36ExplicitlyImportInstantiatedTemplateIiE4funcEv // PS-DAG: declare dllimport void @_ZN36ExplicitlyImportInstantiatedTemplateIiE4funcEv // MS: A dll attribute propagates through multiple levels of instantiation. 
@@ -1063,6 +1085,7 @@ struct __declspec(dllexport) BottomClass : public MiddleClass { }; USEMEMFUNC(BottomClass, func) // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?func@?$TopClass@H@@QAEXXZ" // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN8TopClassIiE4funcEv +// C32-DAG: define linkonce_odr dso_local void @_ZN8TopClassIiE4funcEv // PS-DAG: define weak_odr dllexport void @_ZN8TopClassIiE4funcEv template struct ExplicitInstantiationDeclTemplateBase { void func() {} }; @@ -1071,6 +1094,7 @@ struct __declspec(dllexport) DerivedFromExplicitInstantiationDeclTemplateBase : template struct ExplicitInstantiationDeclTemplateBase; // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?func@?$ExplicitInstantiationDeclTemplateBase@H@@QAEXXZ" // G32-DAG: define weak_odr dso_local x86_thiscallcc void @_ZN37ExplicitInstantiationDeclTemplateBaseIiE4funcEv +// C32-DAG: define weak_odr dso_local void @_ZN37ExplicitInstantiationDeclTemplateBaseIiE4funcEv // PS-DAG: define weak_odr dllexport void @_ZN37ExplicitInstantiationDeclTemplateBaseIiE4funcEv // PR26076 diff --git a/clang/test/CodeGenCXX/dllimport-members.cpp b/clang/test/CodeGenCXX/dllimport-members.cpp index 19bd7fec3f337..896f1547b658f 100644 --- a/clang/test/CodeGenCXX/dllimport-members.cpp +++ b/clang/test/CodeGenCXX/dllimport-members.cpp @@ -2,8 +2,11 @@ // RUN: %clang_cc1 -no-enable-noundef-analysis -disable-llvm-passes -triple x86_64-windows-msvc -fms-compatibility -emit-llvm -std=c++1y -O0 -o - %s -DMSABI | FileCheck --check-prefix=MSC --check-prefix=M64 %s // RUN: %clang_cc1 -no-enable-noundef-analysis -disable-llvm-passes -triple i686-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G32 %s // RUN: %clang_cc1 -no-enable-noundef-analysis -disable-llvm-passes -triple x86_64-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis 
-disable-llvm-passes -triple i686-pc-cygwin -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=C32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -disable-llvm-passes -triple x86_64-pc-cygwin -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G64 %s // RUN: %clang_cc1 -no-enable-noundef-analysis -disable-llvm-passes -triple i686-windows-msvc -fms-compatibility -emit-llvm -std=c++1y -O1 -o - %s -DMSABI | FileCheck --check-prefix=MO1 %s // RUN: %clang_cc1 -no-enable-noundef-analysis -disable-llvm-passes -triple i686-windows-gnu -emit-llvm -std=c++1y -O1 -o - %s | FileCheck --check-prefix=GO1 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -disable-llvm-passes -triple i686-pc-cygwin -emit-llvm -std=c++1y -O1 -o - %s | FileCheck --check-prefix=CO1 %s // Helper structs to make templates more expressive. struct ImplicitInst_Imported {}; @@ -74,21 +77,29 @@ struct ImportMembers { // M32-DAG: declare dllimport x86_thiscallcc void @"?normalInlineDecl@ImportMembers@@QAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"?normalInlineDecl@ImportMembers@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: define dso_local x86_thiscallcc void @_ZN13ImportMembers9normalDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local void @_ZN13ImportMembers9normalDefEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local void @_ZN13ImportMembers9normalDefEv(ptr {{[^,]*}} %this) // G32-DAG: declare dllimport x86_thiscallcc void @_ZN13ImportMembers10normalDeclEv(ptr {{[^,]*}}) + // C32-DAG: declare dllimport void @_ZN13ImportMembers10normalDeclEv(ptr {{[^,]*}}) // G64-DAG: declare dllimport void @_ZN13ImportMembers10normalDeclEv(ptr {{[^,]*}}) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers13normalInclassEv(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers13normalInclassEv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void 
@_ZN13ImportMembers13normalInclassEv(ptr {{[^,]*}} %this) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers15normalInlineDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers15normalInlineDefEv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers15normalInlineDefEv(ptr {{[^,]*}} %this) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers16normalInlineDeclEv(ptr {{[^,]*}} %this) - // G64-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers16normalInlineDeclEv(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers16normalInlineDeclEv(ptr {{[^,]*}} %this) + // G64-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers16normalInlineDeclEv(ptr {{[^,]*}} %this) // MO1-DAG: define available_externally dllimport x86_thiscallcc void @"?normalInclass@ImportMembers@@QAEXXZ"( // MO1-DAG: define available_externally dllimport x86_thiscallcc void @"?normalInlineDef@ImportMembers@@QAEXXZ"( // MO1-DAG: define available_externally dllimport x86_thiscallcc void @"?normalInlineDecl@ImportMembers@@QAEXXZ"( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers13normalInclassEv( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers15normalInlineDefEv( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers16normalInlineDeclEv( + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers13normalInclassEv( + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers15normalInlineDefEv( + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers16normalInlineDeclEv( __declspec(dllimport) void normalDef(); // dllimport ignored __declspec(dllimport) void normalDecl(); __declspec(dllimport) void normalInclass() {} @@ -106,14 +117,19 @@ struct ImportMembers { // M32-DAG: declare dllimport x86_thiscallcc 
void @"?virtualInlineDecl@ImportMembers@@UAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"?virtualInlineDecl@ImportMembers@@UEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: define dso_local x86_thiscallcc void @_ZN13ImportMembers10virtualDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local void @_ZN13ImportMembers10virtualDefEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local void @_ZN13ImportMembers10virtualDefEv(ptr {{[^,]*}} %this) // G32-DAG: declare dllimport x86_thiscallcc void @_ZN13ImportMembers11virtualDeclEv(ptr {{[^,]*}}) + // C32-DAG: declare dllimport void @_ZN13ImportMembers11virtualDeclEv(ptr {{[^,]*}}) // G64-DAG: declare dllimport void @_ZN13ImportMembers11virtualDeclEv(ptr {{[^,]*}}) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers14virtualInclassEv(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers14virtualInclassEv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers14virtualInclassEv(ptr {{[^,]*}} %this) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers16virtualInlineDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers16virtualInlineDefEv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers16virtualInlineDefEv(ptr {{[^,]*}} %this) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers17virtualInlineDeclEv(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers17virtualInlineDeclEv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers17virtualInlineDeclEv(ptr {{[^,]*}} %this) // MO1-DAG: define available_externally dllimport x86_thiscallcc void @"?virtualInclass@ImportMembers@@UAEXXZ"( // MO1-DAG: define available_externally dllimport x86_thiscallcc void @"?virtualInlineDef@ImportMembers@@UAEXXZ"( @@ -121,6 +137,9 @@ struct 
ImportMembers { // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers14virtualInclassEv( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers16virtualInlineDefEv( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers17virtualInlineDeclEv( + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers14virtualInclassEv( + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers16virtualInlineDefEv( + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers17virtualInlineDeclEv( __declspec(dllimport) virtual void virtualDef(); // dllimport ignored __declspec(dllimport) virtual void virtualDecl(); __declspec(dllimport) virtual void virtualInclass() {} @@ -143,6 +162,9 @@ struct ImportMembers { // GO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers13staticInclassEv() // GO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers15staticInlineDefEv() // GO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers16staticInlineDeclEv() + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers13staticInclassEv() + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers15staticInlineDefEv() + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers16staticInlineDeclEv() __declspec(dllimport) static void staticDef(); // dllimport ignored __declspec(dllimport) static void staticDecl(); __declspec(dllimport) static void staticInclass() {} @@ -152,6 +174,7 @@ struct ImportMembers { // M32-DAG: declare dllimport x86_thiscallcc void @"?protectedNormalDecl@ImportMembers@@IAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"?protectedNormalDecl@ImportMembers@@IEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: declare dllimport x86_thiscallcc void @_ZN13ImportMembers19protectedNormalDeclEv(ptr {{[^,]*}}) + // C32-DAG: declare dllimport void @_ZN13ImportMembers19protectedNormalDeclEv(ptr {{[^,]*}}) // G64-DAG: declare dllimport 
void @_ZN13ImportMembers19protectedNormalDeclEv(ptr {{[^,]*}}) // MSC-DAG: declare dllimport void @"?protectedStaticDecl@ImportMembers@@KAXXZ"() // GNU-DAG: declare dllimport void @_ZN13ImportMembers19protectedStaticDeclEv() @@ -162,6 +185,7 @@ struct ImportMembers { // M32-DAG: declare dllimport x86_thiscallcc void @"?privateNormalDecl@ImportMembers@@AAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"?privateNormalDecl@ImportMembers@@AEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: declare dllimport x86_thiscallcc void @_ZN13ImportMembers17privateNormalDeclEv(ptr {{[^,]*}}) + // C32-DAG: declare dllimport void @_ZN13ImportMembers17privateNormalDeclEv(ptr {{[^,]*}}) // G64-DAG: declare dllimport void @_ZN13ImportMembers17privateNormalDeclEv(ptr {{[^,]*}}) // MSC-DAG: declare dllimport void @"?privateStaticDecl@ImportMembers@@CAXXZ"() // GNU-DAG: declare dllimport void @_ZN13ImportMembers17privateStaticDeclEv() @@ -172,6 +196,7 @@ struct ImportMembers { // M32-DAG: declare dso_local x86_thiscallcc void @"?ignored@ImportMembers@@QAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dso_local void @"?ignored@ImportMembers@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: declare dso_local x86_thiscallcc void @_ZN13ImportMembers7ignoredEv(ptr {{[^,]*}}) + // C32-DAG: declare dso_local void @_ZN13ImportMembers7ignoredEv(ptr {{[^,]*}}) // G64-DAG: declare dso_local void @_ZN13ImportMembers7ignoredEv(ptr {{[^,]*}}) public: void ignored(); @@ -246,14 +271,19 @@ struct ImportMembers::Nested { // M32-DAG: declare dllimport x86_thiscallcc void @"?normalInlineDecl@Nested@ImportMembers@@QAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"?normalInlineDecl@Nested@ImportMembers@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: define dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested9normalDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local void @_ZN13ImportMembers6Nested9normalDefEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local void @_ZN13ImportMembers6Nested9normalDefEv(ptr {{[^,]*}} %this) // 
G32-DAG: declare dllimport x86_thiscallcc void @_ZN13ImportMembers6Nested10normalDeclEv(ptr {{[^,]*}}) + // C32-DAG: declare dllimport void @_ZN13ImportMembers6Nested10normalDeclEv(ptr {{[^,]*}}) // G64-DAG: declare dllimport void @_ZN13ImportMembers6Nested10normalDeclEv(ptr {{[^,]*}}) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested13normalInclassEv(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested13normalInclassEv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested13normalInclassEv(ptr {{[^,]*}} %this) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested15normalInlineDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested15normalInlineDefEv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested15normalInlineDefEv(ptr {{[^,]*}} %this) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested16normalInlineDeclEv(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested16normalInlineDeclEv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested16normalInlineDeclEv(ptr {{[^,]*}} %this) // MO1-DAG: define available_externally dllimport x86_thiscallcc void @"?normalInclass@Nested@ImportMembers@@QAEXXZ"( // MO1-DAG: define available_externally dllimport x86_thiscallcc void @"?normalInlineDef@Nested@ImportMembers@@QAEXXZ"( @@ -261,6 +291,9 @@ struct ImportMembers::Nested { // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested13normalInclassEv( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested15normalInlineDefEv( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested16normalInlineDeclEv( + // CO1-DAG: define 
linkonce_odr dso_local void @_ZN13ImportMembers6Nested13normalInclassEv( + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested15normalInlineDefEv( + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested16normalInlineDeclEv( __declspec(dllimport) void normalDef(); // dllimport ignored __declspec(dllimport) void normalDecl(); __declspec(dllimport) void normalInclass() {} @@ -278,14 +311,19 @@ struct ImportMembers::Nested { // M32-DAG: declare dllimport x86_thiscallcc void @"?virtualInlineDecl@Nested@ImportMembers@@UAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"?virtualInlineDecl@Nested@ImportMembers@@UEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: define dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested10virtualDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define dso_local void @_ZN13ImportMembers6Nested10virtualDefEv(ptr {{[^,]*}} %this) // G64-DAG: define dso_local void @_ZN13ImportMembers6Nested10virtualDefEv(ptr {{[^,]*}} %this) // G32-DAG: declare dllimport x86_thiscallcc void @_ZN13ImportMembers6Nested11virtualDeclEv(ptr {{[^,]*}}) + // C32-DAG: declare dllimport void @_ZN13ImportMembers6Nested11virtualDeclEv(ptr {{[^,]*}}) // G64-DAG: declare dllimport void @_ZN13ImportMembers6Nested11virtualDeclEv(ptr {{[^,]*}}) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested14virtualInclassEv(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested14virtualInclassEv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested14virtualInclassEv(ptr {{[^,]*}} %this) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested16virtualInlineDefEv(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested16virtualInlineDefEv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested16virtualInlineDefEv(ptr {{[^,]*}} 
%this) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested17virtualInlineDeclEv(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested17virtualInlineDeclEv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested17virtualInlineDeclEv(ptr {{[^,]*}} %this) // MO1-DAG: define available_externally dllimport x86_thiscallcc void @"?virtualInclass@Nested@ImportMembers@@UAEXXZ"( @@ -294,6 +332,9 @@ struct ImportMembers::Nested { // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested14virtualInclassEv( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested16virtualInlineDefEv( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested17virtualInlineDeclEv( + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested14virtualInclassEv( + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested16virtualInlineDefEv( + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested17virtualInlineDeclEv( __declspec(dllimport) virtual void virtualDef(); // dllimport ignored __declspec(dllimport) virtual void virtualDecl(); __declspec(dllimport) virtual void virtualInclass() {} @@ -316,6 +357,9 @@ struct ImportMembers::Nested { // GO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested13staticInclassEv() // GO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested15staticInlineDefEv() // GO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested16staticInlineDeclEv() + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested13staticInclassEv() + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested15staticInlineDefEv() + // CO1-DAG: define linkonce_odr dso_local void @_ZN13ImportMembers6Nested16staticInlineDeclEv() __declspec(dllimport) static void 
staticDef(); // dllimport ignored __declspec(dllimport) static void staticDecl(); __declspec(dllimport) static void staticInclass() {} @@ -325,6 +369,7 @@ struct ImportMembers::Nested { // M32-DAG: declare dllimport x86_thiscallcc void @"?protectedNormalDecl@Nested@ImportMembers@@IAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"?protectedNormalDecl@Nested@ImportMembers@@IEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: declare dllimport x86_thiscallcc void @_ZN13ImportMembers6Nested19protectedNormalDeclEv(ptr {{[^,]*}} + // C32-DAG: declare dllimport void @_ZN13ImportMembers6Nested19protectedNormalDeclEv(ptr {{[^,]*}} // G64-DAG: declare dllimport void @_ZN13ImportMembers6Nested19protectedNormalDeclEv(ptr {{[^,]*}}) // MSC-DAG: declare dllimport void @"?protectedStaticDecl@Nested@ImportMembers@@KAXXZ"() // GNU-DAG: declare dllimport void @_ZN13ImportMembers6Nested19protectedStaticDeclEv() @@ -335,6 +380,7 @@ struct ImportMembers::Nested { // M32-DAG: declare dllimport x86_thiscallcc void @"?privateNormalDecl@Nested@ImportMembers@@AAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"?privateNormalDecl@Nested@ImportMembers@@AEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: declare dllimport x86_thiscallcc void @_ZN13ImportMembers6Nested17privateNormalDeclEv(ptr {{[^,]*}}) + // C32-DAG: declare dllimport void @_ZN13ImportMembers6Nested17privateNormalDeclEv(ptr {{[^,]*}}) // G64-DAG: declare dllimport void @_ZN13ImportMembers6Nested17privateNormalDeclEv(ptr {{[^,]*}}) // MSC-DAG: declare dllimport void @"?privateStaticDecl@Nested@ImportMembers@@CAXXZ"() // GNU-DAG: declare dllimport void @_ZN13ImportMembers6Nested17privateStaticDeclEv() @@ -345,6 +391,7 @@ struct ImportMembers::Nested { // M32-DAG: declare dso_local x86_thiscallcc void @"?ignored@Nested@ImportMembers@@QAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dso_local void @"?ignored@Nested@ImportMembers@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: declare dso_local x86_thiscallcc void @_ZN13ImportMembers6Nested7ignoredEv(ptr 
{{[^,]*}}) + // C32-DAG: declare dso_local void @_ZN13ImportMembers6Nested7ignoredEv(ptr {{[^,]*}}) // G64-DAG: declare dso_local void @_ZN13ImportMembers6Nested7ignoredEv(ptr {{[^,]*}}) public: void ignored(); @@ -411,36 +458,42 @@ struct ImportSpecials { // M32-DAG: declare dllimport x86_thiscallcc ptr @"??0ImportSpecials@@QAE@XZ"(ptr {{[^,]*}} returned {{[^,]*}}) // M64-DAG: declare dllimport ptr @"??0ImportSpecials@@QEAA@XZ"(ptr {{[^,]*}} returned {{[^,]*}}) // G32-DAG: declare dllimport x86_thiscallcc void @_ZN14ImportSpecialsC1Ev(ptr {{[^,]*}}) + // C32-DAG: declare dllimport void @_ZN14ImportSpecialsC1Ev(ptr {{[^,]*}}) // G64-DAG: declare dllimport void @_ZN14ImportSpecialsC1Ev(ptr {{[^,]*}}) __declspec(dllimport) ImportSpecials(); // M32-DAG: declare dllimport x86_thiscallcc void @"??1ImportSpecials@@QAE@XZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"??1ImportSpecials@@QEAA@XZ"(ptr {{[^,]*}}) // G32-DAG: declare dllimport x86_thiscallcc void @_ZN14ImportSpecialsD1Ev(ptr {{[^,]*}}) + // C32-DAG: declare dllimport void @_ZN14ImportSpecialsD1Ev(ptr {{[^,]*}}) // G64-DAG: declare dllimport void @_ZN14ImportSpecialsD1Ev(ptr {{[^,]*}}) __declspec(dllimport) ~ImportSpecials(); // M32-DAG: declare dllimport x86_thiscallcc ptr @"??0ImportSpecials@@QAE@ABU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport ptr @"??0ImportSpecials@@QEAA@AEBU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: declare dllimport x86_thiscallcc void @_ZN14ImportSpecialsC1ERKS_(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) + // C32-DAG: declare dllimport void @_ZN14ImportSpecialsC1ERKS_(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G64-DAG: declare dllimport void @_ZN14ImportSpecialsC1ERKS_(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) __declspec(dllimport) 
ImportSpecials(const ImportSpecials&); // M32-DAG: declare dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportSpecials@@QAEAAU0@ABU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportSpecials@@QEAAAEAU0@AEBU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: declare dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14ImportSpecialsaSERKS_(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) + // C32-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14ImportSpecialsaSERKS_(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G64-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14ImportSpecialsaSERKS_(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) __declspec(dllimport) ImportSpecials& operator=(const ImportSpecials&); // M32-DAG: declare dllimport x86_thiscallcc ptr @"??0ImportSpecials@@QAE@$$QAU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport ptr @"??0ImportSpecials@@QEAA@$$QEAU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: declare dllimport x86_thiscallcc void @_ZN14ImportSpecialsC1EOS_(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) + // C32-DAG: declare dllimport void @_ZN14ImportSpecialsC1EOS_(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G64-DAG: declare dllimport void @_ZN14ImportSpecialsC1EOS_(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) __declspec(dllimport) ImportSpecials(ImportSpecials&&); // M32-DAG: declare dllimport x86_thiscallcc nonnull align {{[0-9]+}} 
dereferenceable({{[0-9]+}}) ptr @"??4ImportSpecials@@QAEAAU0@$$QAU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportSpecials@@QEAAAEAU0@$$QEAU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: declare dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14ImportSpecialsaSEOS_(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) + // C32-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14ImportSpecialsaSEOS_(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G64-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14ImportSpecialsaSEOS_(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) __declspec(dllimport) ImportSpecials& operator=(ImportSpecials&&); }; @@ -452,49 +505,61 @@ struct ImportInlineSpecials { // M32-DAG: declare dllimport x86_thiscallcc ptr @"??0ImportInlineSpecials@@QAE@XZ"(ptr {{[^,]*}} returned {{[^,]*}}) // M64-DAG: declare dllimport ptr @"??0ImportInlineSpecials@@QEAA@XZ"(ptr {{[^,]*}} returned {{[^,]*}}) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN20ImportInlineSpecialsC1Ev(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN20ImportInlineSpecialsC1Ev(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN20ImportInlineSpecialsC1Ev(ptr {{[^,]*}} %this) // MO1-DAG: define available_externally dllimport x86_thiscallcc ptr @"??0ImportInlineSpecials@@QAE@XZ"( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN20ImportInlineSpecialsC1Ev( + // CO1-DAG: define linkonce_odr dso_local void @_ZN20ImportInlineSpecialsC1Ev( __declspec(dllimport) ImportInlineSpecials() {} // M32-DAG: declare dllimport x86_thiscallcc void 
@"??1ImportInlineSpecials@@QAE@XZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"??1ImportInlineSpecials@@QEAA@XZ"(ptr {{[^,]*}}) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN20ImportInlineSpecialsD1Ev(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN20ImportInlineSpecialsD1Ev(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN20ImportInlineSpecialsD1Ev(ptr {{[^,]*}} %this) // MO1-DAG: define available_externally dllimport x86_thiscallcc void @"??1ImportInlineSpecials@@QAE@XZ"( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN20ImportInlineSpecialsD1Ev( + // CO1-DAG: define linkonce_odr dso_local void @_ZN20ImportInlineSpecialsD1Ev( __declspec(dllimport) ~ImportInlineSpecials() {} // M32-DAG: declare dllimport x86_thiscallcc ptr @"??0ImportInlineSpecials@@QAE@ABU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport ptr @"??0ImportInlineSpecials@@QEAA@AEBU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN20ImportInlineSpecialsC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define linkonce_odr dso_local void @_ZN20ImportInlineSpecialsC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define linkonce_odr dso_local void @_ZN20ImportInlineSpecialsC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // MO1-DAG: define available_externally dllimport x86_thiscallcc ptr @"??0ImportInlineSpecials@@QAE@ABU0@@Z"( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN20ImportInlineSpecialsC1ERKS_( + // CO1-DAG: define linkonce_odr dso_local void @_ZN20ImportInlineSpecialsC1ERKS_( __declspec(dllimport) inline ImportInlineSpecials(const 
ImportInlineSpecials&); // M32-DAG: declare dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportInlineSpecials@@QAEAAU0@ABU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportInlineSpecials@@QEAAAEAU0@AEBU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ImportInlineSpecialsaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ImportInlineSpecialsaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ImportInlineSpecialsaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // MO1-DAG: define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportInlineSpecials@@QAEAAU0@ABU0@@Z"( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ImportInlineSpecialsaSERKS_( + // CO1-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ImportInlineSpecialsaSERKS_( __declspec(dllimport) ImportInlineSpecials& operator=(const ImportInlineSpecials&); // M32-DAG: declare dllimport x86_thiscallcc ptr @"??0ImportInlineSpecials@@QAE@$$QAU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport ptr @"??0ImportInlineSpecials@@QEAA@$$QEAU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} 
dereferenceable({{[0-9]+}})) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN20ImportInlineSpecialsC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define linkonce_odr dso_local void @_ZN20ImportInlineSpecialsC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define linkonce_odr dso_local void @_ZN20ImportInlineSpecialsC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // MO1-DAG: define available_externally dllimport x86_thiscallcc ptr @"??0ImportInlineSpecials@@QAE@$$QAU0@@Z"( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN20ImportInlineSpecialsC1EOS_( + // CO1-DAG: define linkonce_odr dso_local void @_ZN20ImportInlineSpecialsC1EOS_( __declspec(dllimport) ImportInlineSpecials(ImportInlineSpecials&&) {} // M32-DAG: declare dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportInlineSpecials@@QAEAAU0@$$QAU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportInlineSpecials@@QEAAAEAU0@$$QEAU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ImportInlineSpecialsaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ImportInlineSpecialsaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ImportInlineSpecialsaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // MO1-DAG: 
define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportInlineSpecials@@QAEAAU0@$$QAU0@@Z"( // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ImportInlineSpecialsaSEOS_( + // CO1-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN20ImportInlineSpecialsaSEOS_( __declspec(dllimport) ImportInlineSpecials& operator=(ImportInlineSpecials&&) { return *this; } }; ImportInlineSpecials::ImportInlineSpecials(const ImportInlineSpecials&) {} @@ -507,49 +572,61 @@ struct ImportDefaulted { // M32-DAG: declare dllimport x86_thiscallcc ptr @"??0ImportDefaulted@@QAE@XZ"(ptr {{[^,]*}} returned {{[^,]*}}) // M64-DAG: declare dllimport ptr @"??0ImportDefaulted@@QEAA@XZ"(ptr {{[^,]*}} returned {{[^,]*}}) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN15ImportDefaultedC1Ev(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN15ImportDefaultedC1Ev(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN15ImportDefaultedC1Ev(ptr {{[^,]*}} %this) // MO1-DAG: define available_externally dllimport x86_thiscallcc ptr @"??0ImportDefaulted@@QAE@XZ"(ptr {{[^,]*}} returned {{[^,]*}} %this) // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN15ImportDefaultedC1Ev(ptr {{[^,]*}} %this) + // CO1-DAG: define linkonce_odr dso_local void @_ZN15ImportDefaultedC1Ev(ptr {{[^,]*}} %this) __declspec(dllimport) ImportDefaulted() = default; // M32-DAG: declare dllimport x86_thiscallcc void @"??1ImportDefaulted@@QAE@XZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"??1ImportDefaulted@@QEAA@XZ"(ptr {{[^,]*}}) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN15ImportDefaultedD1Ev(ptr {{[^,]*}} %this) + // C32-DAG: define linkonce_odr dso_local void @_ZN15ImportDefaultedD1Ev(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void 
@_ZN15ImportDefaultedD1Ev(ptr {{[^,]*}} %this) // MO1-DAG: define available_externally dllimport x86_thiscallcc void @"??1ImportDefaulted@@QAE@XZ"(ptr {{[^,]*}} %this) // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN15ImportDefaultedD1Ev(ptr {{[^,]*}} %this) + // CO1-DAG: define linkonce_odr dso_local void @_ZN15ImportDefaultedD1Ev(ptr {{[^,]*}} %this) __declspec(dllimport) ~ImportDefaulted() = default; // M32-DAG: declare dllimport x86_thiscallcc ptr @"??0ImportDefaulted@@QAE@ABU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport ptr @"??0ImportDefaulted@@QEAA@AEBU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN15ImportDefaultedC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define linkonce_odr dso_local void @_ZN15ImportDefaultedC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define linkonce_odr dso_local void @_ZN15ImportDefaultedC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // MO1-DAG: define available_externally dllimport x86_thiscallcc ptr @"??0ImportDefaulted@@QAE@ABU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN15ImportDefaultedC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // CO1-DAG: define linkonce_odr dso_local void @_ZN15ImportDefaultedC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) __declspec(dllimport) ImportDefaulted(const ImportDefaulted&) = default; // M32-DAG: declare dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr 
@"??4ImportDefaulted@@QAEAAU0@ABU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportDefaulted@@QEAAAEAU0@AEBU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN15ImportDefaultedaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN15ImportDefaultedaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN15ImportDefaultedaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // MO1-DAG: define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportDefaulted@@QAEAAU0@ABU0@@Z"(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN15ImportDefaultedaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // CO1-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN15ImportDefaultedaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) __declspec(dllimport) ImportDefaulted& operator=(const ImportDefaulted&) = default; // M32-DAG: declare dllimport x86_thiscallcc ptr @"??0ImportDefaulted@@QAE@$$QAU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport ptr @"??0ImportDefaulted@@QEAA@$$QEAU0@@Z"(ptr {{[^,]*}} returned 
{{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN15ImportDefaultedC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define linkonce_odr dso_local void @_ZN15ImportDefaultedC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define linkonce_odr dso_local void @_ZN15ImportDefaultedC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // MO1-DAG: define available_externally dllimport x86_thiscallcc ptr @"??0ImportDefaulted@@QAE@$$QAU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN15ImportDefaultedC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // CO1-DAG: define linkonce_odr dso_local void @_ZN15ImportDefaultedC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) __declspec(dllimport) ImportDefaulted(ImportDefaulted&&) = default; // M32-DAG: declare dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportDefaulted@@QAEAAU0@$$QAU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportDefaulted@@QEAAAEAU0@$$QEAU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN15ImportDefaultedaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // C32-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN15ImportDefaultedaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} 
dereferenceable({{[0-9]+}}) %0) // G64-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN15ImportDefaultedaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // MO1-DAG: define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportDefaulted@@QAEAAU0@$$QAU0@@Z"(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // GO1-DAG: define linkonce_odr dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN15ImportDefaultedaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) + // CO1-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN15ImportDefaultedaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) __declspec(dllimport) ImportDefaulted& operator=(ImportDefaulted&&) = default; ForceNonTrivial v; // ensure special members are non-trivial @@ -586,26 +663,31 @@ __declspec(dllimport) ImportDefaultedDefs::~ImportDefaultedDefs() = default; // M32-DAG: declare dllimport x86_thiscallcc ptr @"??0ImportDefaultedDefs@@QAE@ABU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport ptr @"??0ImportDefaultedDefs@@QEAA@AEBU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN19ImportDefaultedDefsC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) +// C32-DAG: define linkonce_odr dso_local void @_ZN19ImportDefaultedDefsC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define linkonce_odr dso_local void @_ZN19ImportDefaultedDefsC1ERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} 
dereferenceable({{[0-9]+}}) %0) inline ImportDefaultedDefs::ImportDefaultedDefs(const ImportDefaultedDefs&) = default; // M32-DAG: declare dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportDefaultedDefs@@QAEAAU0@ABU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // M64-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportDefaultedDefs@@QEAAAEAU0@AEBU0@@Z"(ptr {{[^,]*}}, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}})) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN19ImportDefaultedDefsaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) +// C32-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN19ImportDefaultedDefsaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define linkonce_odr dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN19ImportDefaultedDefsaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) inline ImportDefaultedDefs& ImportDefaultedDefs::operator=(const ImportDefaultedDefs&) = default; // M32-DAG: define dso_local dllexport x86_thiscallcc ptr @"??0ImportDefaultedDefs@@QAE@$$QAU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // M64-DAG: define dso_local dllexport ptr @"??0ImportDefaultedDefs@@QEAA@$$QEAU0@@Z"(ptr {{[^,]*}} returned {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define dso_local x86_thiscallcc void @_ZN19ImportDefaultedDefsC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) +// C32-DAG: define dso_local void @_ZN19ImportDefaultedDefsC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) 
// G64-DAG: define dso_local void @_ZN19ImportDefaultedDefsC1EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define dso_local x86_thiscallcc void @_ZN19ImportDefaultedDefsC2EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) +// C32-DAG: define dso_local void @_ZN19ImportDefaultedDefsC2EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define dso_local void @_ZN19ImportDefaultedDefsC2EOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) ImportDefaultedDefs::ImportDefaultedDefs(ImportDefaultedDefs&&) = default; // dllimport ignored // M32-DAG: define dso_local dllexport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportDefaultedDefs@@QAEAAU0@$$QAU0@@Z"(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // M64-DAG: define dso_local dllexport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4ImportDefaultedDefs@@QEAAAEAU0@$$QEAU0@@Z"(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G32-DAG: define dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN19ImportDefaultedDefsaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) +// C32-DAG: define dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN19ImportDefaultedDefsaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) // G64-DAG: define dso_local nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN19ImportDefaultedDefsaSEOS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) ImportDefaultedDefs& ImportDefaultedDefs::operator=(ImportDefaultedDefs&&) = default; // dllimport ignored @@ -623,24 +705,28 @@ struct ImportAlloc { // M32-DAG: declare dllimport ptr 
@"??2ImportAlloc@@SAPAXI@Z"(i32) // M64-DAG: declare dllimport ptr @"??2ImportAlloc@@SAPEAX_K@Z"(i64) // G32-DAG: declare dllimport ptr @_ZN11ImportAllocnwEj(i32) -// G64-DAG: declare dllimport ptr @_ZN11ImportAllocnwEy(i64) +// C32-DAG: declare dllimport ptr @_ZN11ImportAllocnwEj(i32) +// G64-DAG: declare dllimport ptr @_ZN11ImportAllocnwE{{[ym]}}(i64) void UNIQ(use)() { new ImportAlloc(); } // M32-DAG: declare dllimport ptr @"??_UImportAlloc@@SAPAXI@Z"(i32) // M64-DAG: declare dllimport ptr @"??_UImportAlloc@@SAPEAX_K@Z"(i64) // G32-DAG: declare dllimport ptr @_ZN11ImportAllocnaEj(i32) -// G64-DAG: declare dllimport ptr @_ZN11ImportAllocnaEy(i64) +// C32-DAG: declare dllimport ptr @_ZN11ImportAllocnaEj(i32) +// G64-DAG: declare dllimport ptr @_ZN11ImportAllocnaE{{[ym]}}(i64) void UNIQ(use)() { new ImportAlloc[1]; } // M32-DAG: declare dllimport void @"??3ImportAlloc@@SAXPAX@Z"(ptr) // M64-DAG: declare dllimport void @"??3ImportAlloc@@SAXPEAX@Z"(ptr) // G32-DAG: declare dllimport void @_ZN11ImportAllocdlEPv(ptr) +// C32-DAG: declare dllimport void @_ZN11ImportAllocdlEPv(ptr) // G64-DAG: declare dllimport void @_ZN11ImportAllocdlEPv(ptr) void UNIQ(use)(ImportAlloc* ptr) { delete ptr; } // M32-DAG: declare dllimport void @"??_VImportAlloc@@SAXPAX@Z"(ptr) // M64-DAG: declare dllimport void @"??_VImportAlloc@@SAXPEAX@Z"(ptr) // G32-DAG: declare dllimport void @_ZN11ImportAllocdaEPv(ptr) +// C32-DAG: declare dllimport void @_ZN11ImportAllocdaEPv(ptr) // G64-DAG: declare dllimport void @_ZN11ImportAllocdaEPv(ptr) void UNIQ(use)(ImportAlloc* ptr) { delete[] ptr; } @@ -660,6 +746,7 @@ struct MemFunTmpl { // M32-DAG: declare dllimport x86_thiscallcc void @"??$importedNormal@UImplicitInst_Imported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"??$importedNormal@UImplicitInst_Imported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN10MemFunTmpl14importedNormalI21ImplicitInst_ImportedEEvv(ptr 
{{[^,]*}} %this) +// C32-DAG: define linkonce_odr dso_local void @_ZN10MemFunTmpl14importedNormalI21ImplicitInst_ImportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN10MemFunTmpl14importedNormalI21ImplicitInst_ImportedEEvv(ptr {{[^,]*}} %this) USEMF(MemFunTmpl, importedNormal) @@ -673,6 +760,7 @@ USE(MemFunTmpl::importedStatic) // M32-DAG: declare dllimport x86_thiscallcc void @"??$importedNormal@UExplicitDecl_Imported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"??$importedNormal@UExplicitDecl_Imported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: declare dso_local x86_thiscallcc void @_ZN10MemFunTmpl14importedNormalI21ExplicitDecl_ImportedEEvv(ptr {{[^,]*}}) +// C32-DAG: declare dso_local void @_ZN10MemFunTmpl14importedNormalI21ExplicitDecl_ImportedEEvv(ptr {{[^,]*}}) // G64-DAG: declare dso_local void @_ZN10MemFunTmpl14importedNormalI21ExplicitDecl_ImportedEEvv(ptr {{[^,]*}}) extern template void MemFunTmpl::importedNormal(); USEMF(MemFunTmpl, importedNormal) @@ -688,6 +776,7 @@ USE(MemFunTmpl::importedStatic) // M32-DAG: declare dllimport x86_thiscallcc void @"??$importedNormal@UExplicitInst_Imported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"??$importedNormal@UExplicitInst_Imported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: define weak_odr dso_local x86_thiscallcc void @_ZN10MemFunTmpl14importedNormalI21ExplicitInst_ImportedEEvv(ptr {{[^,]*}} %this) +// C32-DAG: define weak_odr dso_local void @_ZN10MemFunTmpl14importedNormalI21ExplicitInst_ImportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local void @_ZN10MemFunTmpl14importedNormalI21ExplicitInst_ImportedEEvv(ptr {{[^,]*}} %this) template void MemFunTmpl::importedNormal(); USEMF(MemFunTmpl, importedNormal) @@ -702,6 +791,7 @@ USE(MemFunTmpl::importedStatic) // M32-DAG: declare dllimport x86_thiscallcc void @"??$importedNormal@UExplicitSpec_Imported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}}) // 
M64-DAG: declare dllimport void @"??$importedNormal@UExplicitSpec_Imported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: declare dllimport x86_thiscallcc void @_ZN10MemFunTmpl14importedNormalI21ExplicitSpec_ImportedEEvv(ptr {{[^,]*}}) +// C32-DAG: declare dllimport void @_ZN10MemFunTmpl14importedNormalI21ExplicitSpec_ImportedEEvv(ptr {{[^,]*}}) // G64-DAG: declare dllimport void @_ZN10MemFunTmpl14importedNormalI21ExplicitSpec_ImportedEEvv(ptr {{[^,]*}}) template<> __declspec(dllimport) void MemFunTmpl::importedNormal(); USEMF(MemFunTmpl, importedNormal) @@ -716,6 +806,7 @@ USEMF(MemFunTmpl, importedNormal) // M32-DAG: declare dllimport x86_thiscallcc void @"??$importedNormal@UExplicitSpec_InlineDef_Imported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"??$importedNormal@UExplicitSpec_InlineDef_Imported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN10MemFunTmpl14importedNormalI31ExplicitSpec_InlineDef_ImportedEEvv(ptr {{[^,]*}} %this) +// C32-DAG: define linkonce_odr dso_local void @_ZN10MemFunTmpl14importedNormalI31ExplicitSpec_InlineDef_ImportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN10MemFunTmpl14importedNormalI31ExplicitSpec_InlineDef_ImportedEEvv(ptr {{[^,]*}} %this) template<> __declspec(dllimport) inline void MemFunTmpl::importedNormal() {} USEMF(MemFunTmpl, importedNormal) @@ -743,6 +834,7 @@ USE(MemFunTmpl::importedStatic) // M32-DAG: define dso_local x86_thiscallcc void @"??$importedNormal@UExplicitSpec_NotImported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}} %this) // M64-DAG: define dso_local void @"??$importedNormal@UExplicitSpec_NotImported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}} %this) // G32-DAG: define dso_local x86_thiscallcc void @_ZN10MemFunTmpl14importedNormalI24ExplicitSpec_NotImportedEEvv(ptr {{[^,]*}} %this) +// C32-DAG: define dso_local void @_ZN10MemFunTmpl14importedNormalI24ExplicitSpec_NotImportedEEvv(ptr {{[^,]*}} %this) // 
G64-DAG: define dso_local void @_ZN10MemFunTmpl14importedNormalI24ExplicitSpec_NotImportedEEvv(ptr {{[^,]*}} %this) template<> void MemFunTmpl::importedNormal() {} USEMF(MemFunTmpl, importedNormal) @@ -758,6 +850,7 @@ USE(MemFunTmpl::importedStatic) // M32-DAG: declare dllimport x86_thiscallcc void @"??$normalDef@UExplicitDecl_Imported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"??$normalDef@UExplicitDecl_Imported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: declare dso_local x86_thiscallcc void @_ZN10MemFunTmpl9normalDefI21ExplicitDecl_ImportedEEvv(ptr {{[^,]*}}) +// C32-DAG: declare dso_local void @_ZN10MemFunTmpl9normalDefI21ExplicitDecl_ImportedEEvv(ptr {{[^,]*}}) // G64-DAG: declare dso_local void @_ZN10MemFunTmpl9normalDefI21ExplicitDecl_ImportedEEvv(ptr {{[^,]*}}) extern template __declspec(dllimport) void MemFunTmpl::normalDef(); USEMF(MemFunTmpl, normalDef) @@ -773,6 +866,7 @@ USE(MemFunTmpl::staticDef) // M32-DAG: declare dllimport x86_thiscallcc void @"??$normalDef@UExplicitInst_Imported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"??$normalDef@UExplicitInst_Imported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: define weak_odr dso_local x86_thiscallcc void @_ZN10MemFunTmpl9normalDefI21ExplicitInst_ImportedEEvv(ptr {{[^,]*}} %this) +// C32-DAG: define weak_odr dso_local void @_ZN10MemFunTmpl9normalDefI21ExplicitInst_ImportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define weak_odr dso_local void @_ZN10MemFunTmpl9normalDefI21ExplicitInst_ImportedEEvv(ptr {{[^,]*}} %this) template __declspec(dllimport) void MemFunTmpl::normalDef(); USEMF(MemFunTmpl, normalDef) @@ -787,6 +881,7 @@ USE(MemFunTmpl::staticDef) // M32-DAG: declare dllimport x86_thiscallcc void @"??$normalDef@UExplicitSpec_Imported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"??$normalDef@UExplicitSpec_Imported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: declare dllimport x86_thiscallcc void 
@_ZN10MemFunTmpl9normalDefI21ExplicitSpec_ImportedEEvv(ptr {{[^,]*}}) +// C32-DAG: declare dllimport void @_ZN10MemFunTmpl9normalDefI21ExplicitSpec_ImportedEEvv(ptr {{[^,]*}}) // G64-DAG: declare dllimport void @_ZN10MemFunTmpl9normalDefI21ExplicitSpec_ImportedEEvv(ptr {{[^,]*}}) template<> __declspec(dllimport) void MemFunTmpl::normalDef(); USEMF(MemFunTmpl, normalDef) @@ -801,6 +896,7 @@ USEMF(MemFunTmpl, normalDef) // M32-DAG: declare dllimport x86_thiscallcc void @"??$normalDef@UExplicitSpec_InlineDef_Imported@@@MemFunTmpl@@QAEXXZ"(ptr {{[^,]*}}) // M64-DAG: declare dllimport void @"??$normalDef@UExplicitSpec_InlineDef_Imported@@@MemFunTmpl@@QEAAXXZ"(ptr {{[^,]*}}) // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN10MemFunTmpl9normalDefI31ExplicitSpec_InlineDef_ImportedEEvv(ptr {{[^,]*}} %this) +// C32-DAG: define linkonce_odr dso_local void @_ZN10MemFunTmpl9normalDefI31ExplicitSpec_InlineDef_ImportedEEvv(ptr {{[^,]*}} %this) // G64-DAG: define linkonce_odr dso_local void @_ZN10MemFunTmpl9normalDefI31ExplicitSpec_InlineDef_ImportedEEvv(ptr {{[^,]*}} %this) template<> __declspec(dllimport) inline void MemFunTmpl::normalDef() {} USEMF(MemFunTmpl, normalDef) @@ -888,10 +984,12 @@ template struct ClassTmplMem { // MSVC imports explicit specialization of imported class template member function; MinGW does not. 
// M32-DAG: declare dllimport x86_thiscallcc void @"?importedNormal@?$ClassTmplMem@H@@QAEXXZ" // G32-DAG: declare dso_local x86_thiscallcc void @_ZN12ClassTmplMemIiE14importedNormalEv +// C32-DAG: declare dso_local void @_ZN12ClassTmplMemIiE14importedNormalEv template<> void ClassTmplMem::importedNormal(); USEMF(ClassTmplMem, importedNormal); // M32-DAG: declare dllimport void @"?importedStatic@?$ClassTmplMem@H@@SAXXZ" // G32-DAG: declare dso_local void @_ZN12ClassTmplMemIiE14importedStaticEv +// C32-DAG: declare dso_local void @_ZN12ClassTmplMemIiE14importedStaticEv template<> void ClassTmplMem::importedStatic(); USEMF(ClassTmplMem, importedStatic); diff --git a/clang/test/CodeGenCXX/dllimport-missing-key.cpp b/clang/test/CodeGenCXX/dllimport-missing-key.cpp index d8ef7aa7ea680..9eb9e7b5bbb0d 100644 --- a/clang/test/CodeGenCXX/dllimport-missing-key.cpp +++ b/clang/test/CodeGenCXX/dllimport-missing-key.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple i686-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU %s +// RUN: %clang_cc1 -triple i686-pc-cygwin -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU %s class __declspec(dllimport) QObjectData { public: diff --git a/clang/test/CodeGenCXX/dllimport-rtti.cpp b/clang/test/CodeGenCXX/dllimport-rtti.cpp index f23a242c4bedc..046bf5020398d 100644 --- a/clang/test/CodeGenCXX/dllimport-rtti.cpp +++ b/clang/test/CodeGenCXX/dllimport-rtti.cpp @@ -1,5 +1,8 @@ -// RUN: %clang_cc1 -triple i686-windows-msvc -emit-llvm -std=c++1y -fms-extensions -O1 -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=MSVC -// RUN: %clang_cc1 -triple i686-windows-gnu -emit-llvm -std=c++1y -fms-extensions -O1 -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=GNU +// RUN: %clang_cc1 -triple i686-windows-msvc -emit-llvm -std=c++1y -fms-extensions -O1 -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=MSVC +// RUN: %clang_cc1 -triple i686-windows-gnu -emit-llvm -std=c++1y 
-fms-extensions -O1 -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=GNU +// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -std=c++1y -fms-extensions -O1 -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=GNU +// RUN: %clang_cc1 -triple i686-pc-cygwin -emit-llvm -std=c++1y -fms-extensions -O1 -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=GNU +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -emit-llvm -std=c++1y -fms-extensions -O1 -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=GNU struct __declspec(dllimport) S { virtual void f() {} diff --git a/clang/test/CodeGenCXX/dllimport.cpp b/clang/test/CodeGenCXX/dllimport.cpp index 484866b45389f..363f97a8d58ee 100644 --- a/clang/test/CodeGenCXX/dllimport.cpp +++ b/clang/test/CodeGenCXX/dllimport.cpp @@ -2,15 +2,18 @@ // RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -DMSABI -w | FileCheck --check-prefix=MSC --check-prefix=M64 %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-gnu -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU --check-prefix=G32 %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-gnu -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-pc-cygwin -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU --check-prefix=C32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-pc-cygwin -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions 
-fms-compatibility-version=18.00 -emit-llvm -std=c++1y -O1 -disable-llvm-passes -o - %s -DMSABI -w | FileCheck --check-prefix=MO1 --check-prefix=M18 %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions -fms-compatibility-version=19.00 -emit-llvm -std=c++1y -O1 -disable-llvm-passes -o - %s -DMSABI -w | FileCheck --check-prefix=MO1 --check-prefix=M19 %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-gnu -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O1 -disable-llvm-passes -o - %s -w | FileCheck --check-prefix=GO1 %s -// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-scei-ps4 -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=PS %s -// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-sie-ps5 -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=PS %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-scei-ps4 -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=PS %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-sie-ps5 -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=PS %s // CHECK-NOT doesn't play nice with CHECK-DAG, so use separate run lines. 
// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -DMSABI -w | FileCheck --check-prefix=MSC2 %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-gnu -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU2 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-pc-cygwin -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU2 %s // Helper structs to make templates more expressive. struct ImplicitInst_Imported {}; @@ -205,7 +208,7 @@ USEVAR(VarTmpl) // Functions //===----------------------------------------------------------------------===// -// GNU-DAG: declare dso_local void @_ZdlPv{{j|y}}(ptr, i{{32|64}}) +// GNU-DAG: declare dso_local void @_ZdlPv{{[jym]}}(ptr, i{{32|64}}) // Import function declaration. // MSC-DAG: declare dllimport void @"?decl@@YAXXZ"() @@ -311,7 +314,7 @@ USE(friend5) // Implicit declarations can be redeclared with dllimport. 
// MSC-DAG: declare dllimport nonnull ptr @"??2@{{YAPAXI|YAPEAX_K}}@Z"( -// GNU-DAG: declare dllimport nonnull ptr @_Znw{{[yj]}}( +// GNU-DAG: declare dllimport nonnull ptr @_Znw{{[yjm]}}( __declspec(dllimport) void* operator new(__SIZE_TYPE__ n); void UNIQ(use)() { ::operator new(42); } @@ -677,6 +680,7 @@ struct __declspec(dllimport) KeyFuncClass { }; extern constexpr KeyFuncClass keyFuncClassVar = {}; // G32-DAG: @_ZTV12KeyFuncClass = external dllimport unnamed_addr constant { [3 x ptr] } +// C32-DAG: @_ZTV12KeyFuncClass = external dllimport unnamed_addr constant { [3 x ptr] } struct __declspec(dllimport) X : public virtual W {}; USECLASS(X) @@ -813,6 +817,7 @@ template struct __declspec(dllimport) PartiallySpecializedClassTemp USEMEMFUNC(PartiallySpecializedClassTemplate, f); // M32-DAG: declare dso_local x86_thiscallcc void @"?f@?$PartiallySpecializedClassTemplate@PAX@@QAEXXZ" // G32-DAG: declare dllimport x86_thiscallcc void @_ZN33PartiallySpecializedClassTemplateIPvE1fEv +// C32-DAG: declare dllimport void @_ZN33PartiallySpecializedClassTemplateIPvE1fEv // Attributes on explicit specializations are honored. template struct ExplicitlySpecializedClassTemplate {}; @@ -820,6 +825,7 @@ template <> struct __declspec(dllimport) ExplicitlySpecializedClassTemplate, f); // M32-DAG: declare dllimport x86_thiscallcc void @"?f@?$ExplicitlySpecializedClassTemplate@PAX@@QAEXXZ" // G32-DAG: declare dllimport x86_thiscallcc void @_ZN34ExplicitlySpecializedClassTemplateIPvE1fEv +// C32-DAG: declare dllimport void @_ZN34ExplicitlySpecializedClassTemplateIPvE1fEv // MS inherits DLL attributes to partial specializations. 
template struct __declspec(dllimport) PartiallySpecializedImportedClassTemplate {}; @@ -827,6 +833,7 @@ template struct PartiallySpecializedImportedClassTemplate { voi USEMEMFUNC(PartiallySpecializedImportedClassTemplate, f); // M32-DAG: {{declare|define available_externally}} dllimport x86_thiscallcc void @"?f@?$PartiallySpecializedImportedClassTemplate@PAX@@QAEXXZ" // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN41PartiallySpecializedImportedClassTemplateIPvE1fEv +// C32-DAG: define linkonce_odr dso_local void @_ZN41PartiallySpecializedImportedClassTemplateIPvE1fEv // Attributes on the instantiation take precedence over attributes on the template. template struct __declspec(dllexport) ExplicitlyInstantiatedWithDifferentAttr { void f() {} }; @@ -842,6 +849,7 @@ USEMEMFUNC(ExplicitInstantiationDeclImportedDefTemplate, f); // M32-DAG: {{declare|define available_externally}} dllimport x86_thiscallcc void @"?f@?$ExplicitInstantiationDeclImportedDefTemplate@H@@QAEXXZ" // M32-DAG: {{declare|define available_externally}} dllimport x86_thiscallcc ptr @"??0?$ExplicitInstantiationDeclImportedDefTemplate@H@@QAE@XZ" // G32-DAG: define weak_odr dso_local x86_thiscallcc void @_ZN44ExplicitInstantiationDeclImportedDefTemplateIiE1fEv +// C32-DAG: define weak_odr dso_local void @_ZN44ExplicitInstantiationDeclImportedDefTemplateIiE1fEv template struct __declspec(dllimport) ExplicitInstantiationDeclExportedDefImportedTemplate { void f() {} ExplicitInstantiationDeclExportedDefImportedTemplate() {} }; extern template struct __declspec(dllimport) ExplicitInstantiationDeclExportedDefImportedTemplate ; @@ -919,6 +927,7 @@ struct __declspec(dllimport) DerivedFromTemplate : public ClassTemplate {}; USEMEMFUNC(ClassTemplate, func) // M32-DAG: {{declare|define available_externally}} dllimport x86_thiscallcc void @"?func@?$ClassTemplate@H@@QAEXXZ" // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ClassTemplateIiE4funcEv +// C32-DAG: define linkonce_odr 
dso_local void @_ZN13ClassTemplateIiE4funcEv // PS-DAG: declare dllimport void @_ZN13ClassTemplateIiE4funcEv // ImportedTemplate is explicitly imported. @@ -926,6 +935,7 @@ struct __declspec(dllimport) DerivedFromImportedTemplate : public ImportedClassT USEMEMFUNC(ImportedClassTemplate, func) // M32-DAG: declare dllimport x86_thiscallcc void @"?func@?$ImportedClassTemplate@H@@QAEXXZ" // G32-DAG: declare dllimport x86_thiscallcc void @_ZN21ImportedClassTemplateIiE4funcEv +// C32-DAG: declare dllimport void @_ZN21ImportedClassTemplateIiE4funcEv // PS-DAG: declare dllimport void @_ZN21ImportedClassTemplateIiE4funcEv // ExportedTemplate is explicitly exported. @@ -933,6 +943,7 @@ struct __declspec(dllimport) DerivedFromExportedTemplate : public ExportedClassT USEMEMFUNC(ExportedClassTemplate, func) // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?func@?$ExportedClassTemplate@H@@QAEXXZ" // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN21ExportedClassTemplateIiE4funcEv +// C32-DAG: define weak_odr dso_local dllexport void @_ZN21ExportedClassTemplateIiE4funcEv // PS-DAG: define weak_odr dllexport void @_ZN21ExportedClassTemplateIiE4funcEv // Base class already implicitly instantiated without attribute. @@ -941,6 +952,7 @@ struct __declspec(dllimport) DerivedFromTemplateD2 : public ClassTemplate, func) // M32-DAG: declare dllimport x86_thiscallcc void @"?func@?$ClassTemplate@N@@QAEXXZ" // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ClassTemplateIdE4funcEv +// C32-DAG: define linkonce_odr dso_local void @_ZN13ClassTemplateIdE4funcEv // PS-DAG: declare dllimport void @_ZN13ClassTemplateIdE4funcEv // MS: Base class already instantiated with dfferent attribute. 
@@ -949,6 +961,7 @@ struct __declspec(dllimport) DerivedFromTemplateB2 : public ClassTemplate USEMEMFUNC(ClassTemplate, func) // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?func@?$ClassTemplate@_N@@QAEXXZ" // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN13ClassTemplateIbE4funcEv +// C32-DAG: define linkonce_odr dso_local void @_ZN13ClassTemplateIbE4funcEv // PS-DAG: define weak_odr dllexport void @_ZN13ClassTemplateIbE4funcEv // Base class already specialized without dll attribute. @@ -956,6 +969,7 @@ struct __declspec(dllimport) DerivedFromExplicitlySpecializedTemplate : public E USEMEMFUNC(ExplicitlySpecializedTemplate, func) // M32-DAG: define linkonce_odr dso_local x86_thiscallcc void @"?func@?$ExplicitlySpecializedTemplate@H@@QAEXXZ" // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN29ExplicitlySpecializedTemplateIiE4funcEv +// C32-DAG: define linkonce_odr dso_local void @_ZN29ExplicitlySpecializedTemplateIiE4funcEv // PS-DAG: define linkonce_odr void @_ZN29ExplicitlySpecializedTemplateIiE4funcEv // Base class alredy specialized with export attribute. @@ -963,6 +977,7 @@ struct __declspec(dllimport) DerivedFromExplicitlyExportSpecializedTemplate : pu USEMEMFUNC(ExplicitlyExportSpecializedTemplate, func) // M32-DAG: define dso_local dllexport x86_thiscallcc void @"?func@?$ExplicitlyExportSpecializedTemplate@H@@QAEXXZ" // G32-DAG: define dso_local dllexport x86_thiscallcc void @_ZN35ExplicitlyExportSpecializedTemplateIiE4funcEv +// C32-DAG: define dso_local dllexport void @_ZN35ExplicitlyExportSpecializedTemplateIiE4funcEv // PS-DAG: define dso_local dllexport void @_ZN35ExplicitlyExportSpecializedTemplateIiE4funcEv // Base class already specialized with import attribute. 
@@ -970,6 +985,7 @@ struct __declspec(dllimport) DerivedFromExplicitlyImportSpecializedTemplate : pu USEMEMFUNC(ExplicitlyImportSpecializedTemplate, func) // M32-DAG: declare dllimport x86_thiscallcc void @"?func@?$ExplicitlyImportSpecializedTemplate@H@@QAEXXZ" // G32-DAG: declare dllimport x86_thiscallcc void @_ZN35ExplicitlyImportSpecializedTemplateIiE4funcEv +// C32-DAG: declare dllimport void @_ZN35ExplicitlyImportSpecializedTemplateIiE4funcEv // PS-DAG: declare dllimport void @_ZN35ExplicitlyImportSpecializedTemplateIiE4funcEv // Base class already instantiated without dll attribute. @@ -977,6 +993,7 @@ struct __declspec(dllimport) DerivedFromExplicitlyInstantiatedTemplate : public USEMEMFUNC(ExplicitlyInstantiatedTemplate, func) // M32-DAG: define weak_odr dso_local x86_thiscallcc void @"?func@?$ExplicitlyInstantiatedTemplate@H@@QAEXXZ" // G32-DAG: define weak_odr dso_local x86_thiscallcc void @_ZN30ExplicitlyInstantiatedTemplateIiE4funcEv +// C32-DAG: define weak_odr dso_local void @_ZN30ExplicitlyInstantiatedTemplateIiE4funcEv // PS-DAG: define weak_odr void @_ZN30ExplicitlyInstantiatedTemplateIiE4funcEv // Base class already instantiated with export attribute. @@ -984,6 +1001,7 @@ struct __declspec(dllimport) DerivedFromExplicitlyExportInstantiatedTemplate : p USEMEMFUNC(ExplicitlyExportInstantiatedTemplate, func) // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"?func@?$ExplicitlyExportInstantiatedTemplate@H@@QAEXXZ" // G32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @_ZN36ExplicitlyExportInstantiatedTemplateIiE4funcEv +// C32-DAG: define weak_odr dso_local dllexport void @_ZN36ExplicitlyExportInstantiatedTemplateIiE4funcEv // PS-DAG: define weak_odr dllexport void @_ZN36ExplicitlyExportInstantiatedTemplateIiE4funcEv // Base class already instantiated with import attribute. 
@@ -991,6 +1009,7 @@ struct __declspec(dllimport) DerivedFromExplicitlyImportInstantiatedTemplate : p USEMEMFUNC(ExplicitlyImportInstantiatedTemplate, func) // M32-DAG: declare dllimport x86_thiscallcc void @"?func@?$ExplicitlyImportInstantiatedTemplate@H@@QAEXXZ" // G32-DAG: declare dllimport x86_thiscallcc void @_ZN36ExplicitlyImportInstantiatedTemplateIiE4funcEv +// C32-DAG: declare dllimport void @_ZN36ExplicitlyImportInstantiatedTemplateIiE4funcEv // PS-DAG: declare dllimport void @_ZN36ExplicitlyImportInstantiatedTemplateIiE4funcEv // MS: A dll attribute propagates through multiple levels of instantiation. @@ -1000,6 +1019,7 @@ struct __declspec(dllimport) BottomClass : public MiddleClass { }; USEMEMFUNC(TopClass, func) // M32-DAG: {{declare|define available_externally}} dllimport x86_thiscallcc void @"?func@?$TopClass@H@@QAEXXZ" // G32-DAG: define linkonce_odr dso_local x86_thiscallcc void @_ZN8TopClassIiE4funcEv +// C32-DAG: define linkonce_odr dso_local void @_ZN8TopClassIiE4funcEv // PS-DAG: declare dllimport void @_ZN8TopClassIiE4funcEv template struct ExplicitInstantiationDeclTemplateBase { void func() {} }; @@ -1009,6 +1029,7 @@ template struct ExplicitInstantiationDeclTemplateBase; USEMEMFUNC(ExplicitInstantiationDeclTemplateBase, func) // M32-DAG: declare dllimport x86_thiscallcc void @"?func@?$ExplicitInstantiationDeclTemplateBase@H@@QAEXXZ" // G32-DAG: define weak_odr dso_local x86_thiscallcc void @_ZN37ExplicitInstantiationDeclTemplateBaseIiE4funcEv +// C32-DAG: define weak_odr dso_local void @_ZN37ExplicitInstantiationDeclTemplateBaseIiE4funcEv // PS-DAG: declare dllimport void @_ZN37ExplicitInstantiationDeclTemplateBaseIiE4funcEv template struct ExplicitInstantiationDeclTemplateBase2 { void func() {} }; @@ -1018,6 +1039,7 @@ template struct __declspec(dllexport) ExplicitInstantiationDeclTemplateBase2, func) // M32-DAG: declare dllimport x86_thiscallcc void @"?func@?$ExplicitInstantiationDeclTemplateBase2@H@@QAEXXZ" // G32-DAG: define weak_odr 
dso_local x86_thiscallcc void @_ZN38ExplicitInstantiationDeclTemplateBase2IiE4funcEv +// C32-DAG: define weak_odr dso_local void @_ZN38ExplicitInstantiationDeclTemplateBase2IiE4funcEv // PS-DAG: declare dllimport void @_ZN38ExplicitInstantiationDeclTemplateBase2IiE4funcEv namespace pr39496 { diff --git a/clang/test/CodeGenCXX/dso-local-executable.cpp b/clang/test/CodeGenCXX/dso-local-executable.cpp index d021a6a06f0f0..2be6812efc3a5 100644 --- a/clang/test/CodeGenCXX/dso-local-executable.cpp +++ b/clang/test/CodeGenCXX/dso-local-executable.cpp @@ -1,6 +1,7 @@ // RUN: %clang_cc1 -triple x86_64-pc-linux -mrelocation-model static -O1 -disable-llvm-passes -emit-llvm %s -o - | FileCheck --check-prefix=STATIC %s // RUN: %clang_cc1 -triple x86_64-pc-linux -mrelocation-model static -fno-plt -O1 -disable-llvm-passes -emit-llvm %s -o - | FileCheck --check-prefix=NOPLT %s // RUN: %clang_cc1 -triple x86_64-w64-mingw32 -O1 -disable-llvm-passes -emit-llvm %s -o - | FileCheck --check-prefix=MINGW %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -O1 -disable-llvm-passes -emit-llvm %s -o - | FileCheck --check-prefix=MINGW %s // STATIC-DAG: @_ZTV1C = linkonce_odr dso_local unnamed_addr constant // STATIC-DAG: @_ZTS1C = linkonce_odr dso_local constant diff --git a/clang/test/CodeGenCXX/mingw-template-dllexport.cpp b/clang/test/CodeGenCXX/mingw-template-dllexport.cpp index 408a3fd0a77dd..de112d6da53db 100644 --- a/clang/test/CodeGenCXX/mingw-template-dllexport.cpp +++ b/clang/test/CodeGenCXX/mingw-template-dllexport.cpp @@ -1,4 +1,7 @@ // RUN: %clang_cc1 -emit-llvm -triple i686-mingw32 %s -o - | FileCheck %s +// RUN: %clang_cc1 -emit-llvm -triple x86_64-w64-mingw32 %s -o - | FileCheck %s +// RUN: %clang_cc1 -emit-llvm -triple i686-pc-cygwin %s -o - | FileCheck %s +// RUN: %clang_cc1 -emit-llvm -triple x86_64-pc-cygwin %s -o - | FileCheck %s #define JOIN2(x, y) x##y #define JOIN(x, y) JOIN2(x, y) diff --git a/clang/test/CodeGenCXX/rtti-mingw64.cpp 
b/clang/test/CodeGenCXX/rtti-mingw64.cpp index e0a4607cf28c3..9de280602945d 100644 --- a/clang/test/CodeGenCXX/rtti-mingw64.cpp +++ b/clang/test/CodeGenCXX/rtti-mingw64.cpp @@ -1,4 +1,6 @@ // RUN: %clang_cc1 -triple x86_64-windows-gnu %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin %s -emit-llvm -o - | FileCheck %s + struct A { int a; }; struct B : virtual A { int b; }; B b; diff --git a/clang/test/CodeGenCXX/virt-dtor-key.cpp b/clang/test/CodeGenCXX/virt-dtor-key.cpp index d3b9ab3351518..cd169ab01dc8b 100644 --- a/clang/test/CodeGenCXX/virt-dtor-key.cpp +++ b/clang/test/CodeGenCXX/virt-dtor-key.cpp @@ -1,5 +1,9 @@ -// RUN: %clang_cc1 -triple i386-linux -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -triple i386-windows-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MINGW +// RUN: %clang_cc1 -triple i686-linux -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple i686-windows-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MINGW +// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MINGW +// RUN: %clang_cc1 -triple i686-pc-cygwin -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MINGW +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MINGW + // CHECK: @_ZTI3foo ={{.*}} constant // CHECK-MINGW: @_ZTI3foo = linkonce_odr class foo { diff --git a/clang/test/CodeGenCXX/vtable-key-function-ios.cpp b/clang/test/CodeGenCXX/vtable-key-function-ios.cpp index 43abfb62c73a6..b11d0a62a04a6 100644 --- a/clang/test/CodeGenCXX/vtable-key-function-ios.cpp +++ b/clang/test/CodeGenCXX/vtable-key-function-ios.cpp @@ -3,6 +3,8 @@ // RUN: %clang_cc1 %s -triple=x86_64-pc-windows-gnu -emit-llvm -o - | FileCheck -check-prefixes=CHECK,CHECK-MINGW %s // RUN: %clang_cc1 %s -triple=x86_64-pc-windows-gnu -emit-llvm -o - | FileCheck -check-prefix=CHECK-LATE %s +// RUN: %clang_cc1 %s -triple=x86_64-pc-cygwin -emit-llvm -o - | 
FileCheck -check-prefixes=CHECK,CHECK-MINGW %s +// RUN: %clang_cc1 %s -triple=x86_64-pc-cygwin -emit-llvm -o - | FileCheck -check-prefix=CHECK-LATE %s // The 'a' variants ask for the vtable first. // The 'b' variants ask for the vtable second. diff --git a/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl index c0eb1b138ed04..b36682e065b3a 100644 --- a/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl +++ b/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl @@ -27,6 +27,7 @@ void main(unsigned GI : SV_GroupIndex) {} // Verify function constructors are emitted // NOINLINE-NEXT: call void @_Z13call_me_firstv() // NOINLINE-NEXT: call void @_Z12then_call_mev() +// NOINLINE-NEXT: call void @_GLOBAL__sub_I_GlobalConstructorFunction.hlsl() // NOINLINE-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() // NOINLINE-NEXT: call void @_Z4mainj(i32 %0) // NOINLINE-NEXT: call void @_Z12call_me_lastv( @@ -36,6 +37,9 @@ void main(unsigned GI : SV_GroupIndex) {} // INLINE-NEXT: alloca // INLINE-NEXT: store i32 12 // INLINE-NEXT: store i32 13 +// INLINE-NEXT: %[[HANDLE:.*]] = call target("dx.CBuffer", target("dx.Layout", %"__cblayout_$Globals", 4, 0)) +// INLINE-NEXT-SAME: @"llvm.dx.resource.handlefromimplicitbinding.tdx.CBuffer_tdx.Layout_s___cblayout_$Globalss_4_0tt"(i32 0, i32 0, i32 1, i32 0, i1 false) +// INLINE-NEXT: store target("dx.CBuffer", target("dx.Layout", %"__cblayout_$Globals", 4, 0)) %[[HANDLE]], ptr @"$Globals.cb", align 4 // INLINE-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() // INLINE-NEXT: store i32 % // INLINE-NEXT: store i32 0 diff --git a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl index b798c2a6d6c4b..1d451acfc6214 100644 --- a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl +++ b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl @@ -33,7 +33,7 @@ void SecondEntry() {} // Verify the constructor is alwaysinline // 
NOINLINE: ; Function Attrs: {{.*}}alwaysinline -// NOINLINE-NEXT: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2Ev({{.*}} [[CtorAttr:\#[0-9]+]] +// NOINLINE-NEXT: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2Ejijj({{.*}} [[CtorAttr:\#[0-9]+]] // NOINLINE: ; Function Attrs: {{.*}}alwaysinline // NOINLINE-NEXT: define internal void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() [[InitAttr:\#[0-9]+]] diff --git a/clang/test/CodeGenHLSL/RootSignature.hlsl b/clang/test/CodeGenHLSL/RootSignature.hlsl new file mode 100644 index 0000000000000..60e0dec175b8f --- /dev/null +++ b/clang/test/CodeGenHLSL/RootSignature.hlsl @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -o - %s | FileCheck %s + +// CHECK: !dx.rootsignatures = !{![[#FIRST_ENTRY:]], ![[#SECOND_ENTRY:]]} + +// CHECK: ![[#FIRST_ENTRY]] = !{ptr @FirstEntry, ![[#EMPTY:]]} +// CHECK: ![[#EMPTY]] = !{} + +[shader("compute"), RootSignature("")] +[numthreads(1,1,1)] +void FirstEntry() {} + +// CHECK: ![[#SECOND_ENTRY]] = !{ptr @SecondEntry, ![[#SECOND_RS:]]} +// CHECK: ![[#SECOND_RS]] = !{![[#TABLE:]]} +// CHECK: ![[#TABLE]] = !{!"DescriptorTable", i32 0, ![[#CBV:]], ![[#SRV:]]} +// CHECK: ![[#CBV]] = !{!"CBV", i32 1, i32 0, i32 0, i32 -1, i32 4} +// CHECK: ![[#SRV]] = !{!"SRV", i32 4, i32 42, i32 3, i32 32, i32 0} + +#define SampleDescriptorTable \ + "DescriptorTable( " \ + " CBV(b0), " \ + " SRV(t42, space = 3, offset = 32, numDescriptors = 4, flags = 0) " \ + ")" +[shader("compute"), RootSignature(SampleDescriptorTable)] +[numthreads(1,1,1)] +void SecondEntry() {} + +// Sanity test to ensure no root is added for this function as there is only +// two entries in !dx.roosignatures +[shader("compute")] +[numthreads(1,1,1)] +void ThirdEntry() {} diff --git a/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl index d7c4b03552cdc..5f844fcfe4121 100644 --- 
a/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl @@ -35,31 +35,19 @@ export void foo() { // Buf1 initialization part 2 - body of ByteAddressBuffer C1 constructor with explicit binding that calls the C2 constructor // CHECK: define linkonce_odr void @_ZN4hlsl17ByteAddressBufferC1Ejjij(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index) -// CHECK-NEXT: entry: -// CHECK-NEXT: %this.addr = alloca ptr, align 4 -// CHECK-NEXT: %registerNo.addr = alloca i32, align 4 -// CHECK-NEXT: %spaceNo.addr = alloca i32, align 4 -// CHECK-NEXT: %range.addr = alloca i32, align 4 -// CHECK-NEXT: %index.addr = alloca i32, align 4 -// CHECK-NEXT: store ptr %this, ptr %this.addr, align 4 -// CHECK-NEXT: store i32 %registerNo, ptr %registerNo.addr, align 4 -// CHECK-NEXT: store i32 %spaceNo, ptr %spaceNo.addr, align 4 -// CHECK-NEXT: store i32 %range, ptr %range.addr, align 4 -// CHECK-NEXT: store i32 %index, ptr %index.addr, align 4 -// CHECK-NEXT: %this1 = load ptr, ptr %this.addr, align 4 -// CHECK-NEXT: %0 = load i32, ptr %registerNo.addr, align 4 -// CHECK-NEXT: %1 = load i32, ptr %spaceNo.addr, align 4 -// CHECK-NEXT: %2 = load i32, ptr %range.addr, align 4 -// CHECK-NEXT: %3 = load i32, ptr %index.addr, align 4 -// CHECK: call void @_ZN4hlsl17ByteAddressBufferC2Ejjij(ptr noundef nonnull align 4 dereferenceable(4) %this1, -// CHECK-SAME: i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3) -// CHECK-NEXT: ret void +// CHECK: call void @_ZN4hlsl17ByteAddressBufferC2Ejjij(ptr noundef nonnull align 4 dereferenceable(4) +// CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}) -// Buf2 initialization part 1 - FIXME: constructor with implicit binding does not exist yet; -// the global init function currently calls the default 
RWByteAddressBuffer C1 constructor -// CHECK: define internal void @__cxx_global_var_init.1() +// Buf2 initialization part 1 - global init function that calls RWByteAddressBuffer C1 constructor with implicit binding +// CHECK: define internal void @__cxx_global_var_init.1() #0 { // CHECK-NEXT: entry: -// CHECK-NEXT: call void @_ZN4hlsl19RWByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @_ZL4Buf2) +// CHECK-NEXT: call void @_ZN4hlsl19RWByteAddressBufferC1Ejijj(ptr noundef nonnull align 4 dereferenceable(4) @_ZL4Buf2, +// CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0) + +// Buf2 initialization part 2 - body of RWByteAddressBuffer C1 constructor with implicit binding that calls the C2 constructor +// CHECK: define linkonce_odr void @_ZN4hlsl19RWByteAddressBufferC1Ejijj(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId) +// CHECK: call void @_ZN4hlsl19RWByteAddressBufferC2Ejijj(ptr noundef nonnull align 4 dereferenceable(4) %this1, i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3) #4 // Buf3 initialization part 1 - local variable declared in function foo() is initialized by // RasterizerOrderedByteAddressBuffer C1 default constructor @@ -71,43 +59,30 @@ export void foo() { // Buf3 initialization part 2 - body of RasterizerOrderedByteAddressBuffer default C1 constructor that // calls the default C2 constructor // CHECK: define linkonce_odr void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK-NEXT: entry: -// CHECK-NEXT: %this.addr = alloca ptr, align 4 -// CHECK-NEXT: store ptr %this, ptr %this.addr, align 4 -// CHECK-NEXT: %this1 = load ptr, ptr %this.addr, align 4 -// CHECK: call void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this1) +// CHECK: call void 
@_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}}) // CHECK-NEXT: ret void // Buf1 initialization part 3 - ByteAddressBuffer C2 constructor with explicit binding that initializes // handle with @llvm.dx.resource.handlefrombinding // CHECK: define linkonce_odr void @_ZN4hlsl17ByteAddressBufferC2Ejjij(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index) -// CHECK-NEXT: entry: -// CHECK-NEXT: %this.addr = alloca ptr, align 4 -// CHECK-NEXT: %registerNo.addr = alloca i32, align 4 -// CHECK-NEXT: %spaceNo.addr = alloca i32, align 4 -// CHECK-NEXT: %range.addr = alloca i32, align 4 -// CHECK-NEXT: %index.addr = alloca i32, align 4 -// CHECK-NEXT: store ptr %this, ptr %this.addr, align 4 -// CHECK-NEXT: store i32 %registerNo, ptr %registerNo.addr, align 4 -// CHECK-NEXT: store i32 %spaceNo, ptr %spaceNo.addr, align 4 -// CHECK-NEXT: store i32 %range, ptr %range.addr, align 4 -// CHECK-NEXT: store i32 %index, ptr %index.addr, align 4 -// CHECK-NEXT: %this1 = load ptr, ptr %this.addr, align 4 -// CHECK-NEXT: %0 = load i32, ptr %registerNo.addr, align 4 -// CHECK-NEXT: %1 = load i32, ptr %spaceNo.addr, align 4 -// CHECK-NEXT: %2 = load i32, ptr %range.addr, align 4 -// CHECK-NEXT: %3 = load i32, ptr %index.addr, align 4 -// CHECK-DXIL-NEXT: %4 = call target("dx.RawBuffer", i8, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t( -// CHECK-DXIL-SAME: i32 %1, i32 %0, i32 %2, i32 %3, i1 false) -// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::ByteAddressBuffer", ptr %this1, i32 0, i32 0 -// CHECK-DXIL-NEXT: store target("dx.RawBuffer", i8, 0, 0) %4, ptr %__handle, align 4 -// CHECK-NEXT: ret void +// CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.RawBuffer", i8, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t( +// CHECK-DXIL-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 
%{{.*}}, i1 false) +// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::ByteAddressBuffer", ptr %{{.*}}, i32 0, i32 0 +// CHECK-DXIL-NEXT: store target("dx.RawBuffer", i8, 0, 0) %[[HANDLE]], ptr %__handle, align 4 + +// Buf2 initialization part 3 - body of RWByteAddressBuffer C2 constructor with implicit binding that initializes +// handle with @llvm.dx.resource.handlefromimplicitbinding +// CHECK: define linkonce_odr void @_ZN4hlsl19RWByteAddressBufferC2Ejijj(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId) unnamed_addr #1 align 2 { +// CHECK: %[[HANDLE:.*]] = call target("dx.RawBuffer", i8, 1, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_i8_1_0t(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false) +// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::RWByteAddressBuffer", ptr %this1, i32 0, i32 0 +// CHECK-NEXT: store target("dx.RawBuffer", i8, 1, 0) %[[HANDLE]], ptr %__handle, align 4 // Buf3 initialization part 3 - body of RasterizerOrderedByteAddressBuffer default C2 constructor that // initializes handle to poison // CHECK: define linkonce_odr void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedByteAddressBuffer", ptr %this1, i32 0, i32 0 +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedByteAddressBuffer", ptr %{{.*}}, i32 0, i32 0 // CHECK: store target("dx.RawBuffer", i8, 1, 1) poison, ptr %__handle, align 4 // Module initialization diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl index adf231dedf4cb..ad8ebdf7d8c85 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl @@ 
-35,31 +35,19 @@ export void foo() { // Buf1 initialization part 2 - body of RWBuffer C1 constructor with explicit binding that calls the C2 constructor // CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC1Ejjij(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index) +// CHECK: call void @_ZN4hlsl8RWBufferIfEC2Ejjij(ptr noundef nonnull align 4 dereferenceable(4) +// CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}) + +// Buf2 initialization part 1 - global init function that calls RWBuffer C1 constructor with implicit binding +// CHECK: define internal void @__cxx_global_var_init.1() // CHECK-NEXT: entry: -// CHECK-NEXT: %this.addr = alloca ptr, align 4 -// CHECK-NEXT: %registerNo.addr = alloca i32, align 4 -// CHECK-NEXT: %spaceNo.addr = alloca i32, align 4 -// CHECK-NEXT: %range.addr = alloca i32, align 4 -// CHECK-NEXT: %index.addr = alloca i32, align 4 -// CHECK-NEXT: store ptr %this, ptr %this.addr, align 4 -// CHECK-NEXT: store i32 %registerNo, ptr %registerNo.addr, align 4 -// CHECK-NEXT: store i32 %spaceNo, ptr %spaceNo.addr, align 4 -// CHECK-NEXT: store i32 %range, ptr %range.addr, align 4 -// CHECK-NEXT: store i32 %index, ptr %index.addr, align 4 -// CHECK-NEXT: %this1 = load ptr, ptr %this.addr, align 4 -// CHECK-NEXT: %0 = load i32, ptr %registerNo.addr, align 4 -// CHECK-NEXT: %1 = load i32, ptr %spaceNo.addr, align 4 -// CHECK-NEXT: %2 = load i32, ptr %range.addr, align 4 -// CHECK-NEXT: %3 = load i32, ptr %index.addr, align 4 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIfEC2Ejjij(ptr noundef nonnull align 4 dereferenceable(4) %this1, -// CHECK-SAME: i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3) -// CHECK-NEXT: ret void - -// Buf2 initialization part 1 - FIXME: constructor with implicit binding does not exist yet; -// the global init function currently calls the default 
RWBufer C1 constructor -// CHECK: define internal void @__cxx_global_var_init.1() #0 { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIdEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @_ZL4Buf2) +// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIdEC1Ejijj(ptr noundef nonnull align 4 dereferenceable(4) @_ZL4Buf2, +// CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0) + +// Buf2 initialization part 2 - body of RWBuffer C1 constructor with implicit binding that calls the C2 constructor +// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIdEC1Ejijj(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId) +// CHECK: call void @_ZN4hlsl8RWBufferIdEC2Ejijj(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}) #4 // Buf3 initialization part 1 - local variable declared in function foo() is initialized by RWBuffer C1 default constructor // CHECK: define void @_Z3foov() @@ -69,48 +57,29 @@ export void foo() { // Buf3 initialization part 2 - body of RWBuffer default C1 constructor that calls the default C2 constructor // CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK-NEXT: entry: -// CHECK-NEXT: %this.addr = alloca ptr, align 4 -// CHECK-NEXT: store ptr %this, ptr %this.addr, align 4 -// CHECK-NEXT: %this1 = load ptr, ptr %this.addr, align 4 -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this1) -// CHECK-NEXT: ret void +// CHECK: call void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}}) // Buf1 initialization part 3 - body of RWBuffer C2 constructor with explicit binding that initializes // handle with @llvm.dx.resource.handlefrombinding // CHECK: define linkonce_odr void 
@_ZN4hlsl8RWBufferIfEC2Ejjij(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index) -// CHECK-NEXT: entry: -// CHECK-NEXT: %this.addr = alloca ptr, align 4 -// CHECK-NEXT: %registerNo.addr = alloca i32, align 4 -// CHECK-NEXT: %spaceNo.addr = alloca i32, align 4 -// CHECK-NEXT: %range.addr = alloca i32, align 4 -// CHECK-NEXT: %index.addr = alloca i32, align 4 -// CHECK-NEXT: store ptr %this, ptr %this.addr, align 4 -// CHECK-NEXT: store i32 %registerNo, ptr %registerNo.addr, align 4 -// CHECK-NEXT: store i32 %spaceNo, ptr %spaceNo.addr, align 4 -// CHECK-NEXT: store i32 %range, ptr %range.addr, align 4 -// CHECK-NEXT: store i32 %index, ptr %index.addr, align 4 -// CHECK-NEXT: %this1 = load ptr, ptr %this.addr, align 4 -// CHECK-NEXT: %0 = load i32, ptr %registerNo.addr, align 4 -// CHECK-NEXT: %1 = load i32, ptr %spaceNo.addr, align 4 -// CHECK-NEXT: %2 = load i32, ptr %range.addr, align 4 -// CHECK-NEXT: %3 = load i32, ptr %index.addr, align 4 -// CHECK-DXIL-NEXT: %4 = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t( -// CHECK-DXIL-SAME: i32 %1, i32 %0, i32 %2, i32 %3, i1 false) -// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %this1, i32 0, i32 0 -// CHECK-DXIL-NEXT: store target("dx.TypedBuffer", float, 1, 0, 0) %4, ptr %__handle, align 4 -// CHECK-NEXT: ret void +// CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t( +// CHECK-DXIL-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false) +// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %{{.*}}, i32 0, i32 0 +// CHECK-DXIL-NEXT: store target("dx.TypedBuffer", float, 1, 0, 0) %[[HANDLE]], ptr %__handle, align 4 + +// Buf2 initialization part 3 - body of RWBuffer C2 constructor with 
implicit binding that initializes +// handle with @llvm.dx.resource.handlefromimplicitbinding +// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIdEC2Ejijj(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId) unnamed_addr #1 align 2 { +// CHECK: %[[HANDLE:.*]] = call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.TypedBuffer_f64_1_0_0t(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false) +// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer.0", ptr %{{.*}}, i32 0, i32 0 +// CHECK-NEXT: store target("dx.TypedBuffer", double, 1, 0, 0) %[[HANDLE]], ptr %__handle, align 4 // Buf3 initialization part 3 - body of RWBuffer default C2 constructor that initializes handle to poison // CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK-NEXT: entry: -// CHECK-NEXT: %this.addr = alloca ptr, align 4 -// CHECK-NEXT: store ptr %this, ptr %this.addr, align 4 -// CHECK-NEXT: %this1 = load ptr, ptr %this.addr, align 4 -// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer.1", ptr %this1, i32 0, i32 0 +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer.1", ptr %{{.*}}, i32 0, i32 0 // CHECK-NEXT: store target("dx.TypedBuffer", i32, 1, 0, 1) poison, ptr %__handle, align 4 -// CHECK-NEXT: ret void // Module initialization // CHECK: define internal void @_GLOBAL__sub_I_RWBuffer_constructor.hlsl() diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl index ea818a737cf74..34ce676a02f83 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl @@ -36,31 +36,21 @@ export void foo() { // that calls the C2 
constructor // CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC1Ejjij(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index) -// CHECK-NEXT: entry: -// CHECK-NEXT: %this.addr = alloca ptr, align 4 -// CHECK-NEXT: %registerNo.addr = alloca i32, align 4 -// CHECK-NEXT: %spaceNo.addr = alloca i32, align 4 -// CHECK-NEXT: %range.addr = alloca i32, align 4 -// CHECK-NEXT: %index.addr = alloca i32, align 4 -// CHECK-NEXT: store ptr %this, ptr %this.addr, align 4 -// CHECK-NEXT: store i32 %registerNo, ptr %registerNo.addr, align 4 -// CHECK-NEXT: store i32 %spaceNo, ptr %spaceNo.addr, align 4 -// CHECK-NEXT: store i32 %range, ptr %range.addr, align 4 -// CHECK-NEXT: store i32 %index, ptr %index.addr, align 4 -// CHECK-NEXT: %this1 = load ptr, ptr %this.addr, align 4 -// CHECK-NEXT: %0 = load i32, ptr %registerNo.addr, align 4 -// CHECK-NEXT: %1 = load i32, ptr %spaceNo.addr, align 4 -// CHECK-NEXT: %2 = load i32, ptr %range.addr, align 4 -// CHECK-NEXT: %3 = load i32, ptr %index.addr, align 4 -// CHECK: call void @_ZN4hlsl16StructuredBufferIfEC2Ejjij(ptr noundef nonnull align 4 dereferenceable(4) %this1, -// CHECK-SAME: i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3) -// CHECK-NEXT: ret void - -// Buf2 initialization part 1 - FIXME: constructor with implicit binding does not exist yet; -// the global init function currently calls the default RWStructuredBufer C1 constructor +// CHECK: call void @_ZN4hlsl16StructuredBufferIfEC2Ejjij(ptr noundef nonnull align 4 dereferenceable(4) +// CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}) + +// Buf2 initialization part 1 - global init function that calls RWStructuredBuffer C1 constructor with +// implicit binding // CHECK: define internal void @__cxx_global_var_init.1() // CHECK-NEXT: entry: -// CHECK-NEXT: call void 
@_ZN4hlsl18RWStructuredBufferIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @_ZL4Buf2) +// CHECK-NEXT: call void @_ZN4hlsl18RWStructuredBufferIfEC1Ejijj(ptr noundef nonnull align 4 dereferenceable(4) @_ZL4Buf2, +// CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0) + +// Buf2 initialization part 2 - body of RWStructuredBuffer C1 constructor with implicit binding that calls the C2 constructor +// CHECK: define linkonce_odr void @_ZN4hlsl18RWStructuredBufferIfEC1Ejijj(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId) +// CHECK: call void @_ZN4hlsl18RWStructuredBufferIfEC2Ejijj(ptr noundef nonnull align 4 dereferenceable(4) +// CHECK-SAME; %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}) #4 // Buf3 initialization part 1 - local variable declared in function foo() is initialized by // AppendStructuredBuffer C1 default constructor @@ -72,47 +62,29 @@ export void foo() { // Buf3 initialization part 2 - body of AppendStructuredBuffer default C1 constructor that calls // the default C2 constructor // CHECK: define linkonce_odr void @_ZN4hlsl22AppendStructuredBufferIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK-NEXT: entry: -// CHECK-NEXT: %this.addr = alloca ptr, align 4 -// CHECK-NEXT: store ptr %this, ptr %this.addr, align 4 -// CHECK-NEXT: %this1 = load ptr, ptr %this.addr, align 4 -// CHECK: call void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this1) -// CHECK-NEXT: ret void +// CHECK: call void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}}) // Buf1 initialization part 3 - body of AppendStructuredBuffer C2 constructor with explicit binding // that initializes handle with @llvm.dx.resource.handlefrombinding // CHECK: define linkonce_odr void 
@_ZN4hlsl16StructuredBufferIfEC2Ejjij(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index) -// CHECK-NEXT: entry: -// CHECK-NEXT: %this.addr = alloca ptr, align 4 -// CHECK-NEXT: %registerNo.addr = alloca i32, align 4 -// CHECK-NEXT: %spaceNo.addr = alloca i32, align 4 -// CHECK-NEXT: %range.addr = alloca i32, align 4 -// CHECK-NEXT: %index.addr = alloca i32, align 4 -// CHECK-NEXT: store ptr %this, ptr %this.addr, align 4 -// CHECK-NEXT: store i32 %registerNo, ptr %registerNo.addr, align 4 -// CHECK-NEXT: store i32 %spaceNo, ptr %spaceNo.addr, align 4 -// CHECK-NEXT: store i32 %range, ptr %range.addr, align 4 -// CHECK-NEXT: store i32 %index, ptr %index.addr, align 4 -// CHECK-NEXT: %this1 = load ptr, ptr %this.addr, align 4 -// CHECK-NEXT: %0 = load i32, ptr %registerNo.addr, align 4 -// CHECK-NEXT: %1 = load i32, ptr %spaceNo.addr, align 4 -// CHECK-NEXT: %2 = load i32, ptr %range.addr, align 4 -// CHECK-NEXT: %3 = load i32, ptr %index.addr, align 4 -// CHECK-DXIL-NEXT: %4 = call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0t( -// CHECK-DXIL-SAME: i32 %1, i32 %0, i32 %2, i32 %3, i1 false) -// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::StructuredBuffer", ptr %this1, i32 0, i32 0 -// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 0, 0) %4, ptr %__handle, align 4 -// CHECK-NEXT: ret void +// CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0t( +// CHECK-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false) +// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::StructuredBuffer", ptr %{{.*}}, i32 0, i32 0 +// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 0, 0) %[[HANDLE]], ptr %__handle, align 4 + +// Buf2 initialization part 3 - body of RWStructuredBuffer C2 constructor with 
implicit binding that initializes +// handle with @llvm.dx.resource.handlefromimplicitbinding +// CHECK: define linkonce_odr void @_ZN4hlsl18RWStructuredBufferIfEC2Ejijj(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId) unnamed_addr #1 align 2 { +// CHECK: %[[HANDLE:.*]] = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_f32_1_0t(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false) +// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %{{.*}}, i32 0, i32 0 +// CHECK-NEXT: store target("dx.RawBuffer", float, 1, 0) %[[HANDLE]], ptr %__handle, align 4 // Buf3 initialization part 3 - body of AppendStructuredBuffer default C2 constructor that // initializes handle to poison // CHECK: define linkonce_odr void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK-NEXT: entry: -// CHECK-NEXT: %this.addr = alloca ptr, align 4 -// CHECK-NEXT: store ptr %this, ptr %this.addr, align 4 -// CHECK-NEXT: %this1 = load ptr, ptr %this.addr, align 4 -// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::AppendStructuredBuffer", ptr %this1, i32 0, i32 0 +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::AppendStructuredBuffer", ptr %{{.*}}, i32 0, i32 0 // CHECK: store target("dx.RawBuffer", float, 1, 0) poison, ptr %__handle, align 4 // Module initialization diff --git a/clang/test/CodeGenHLSL/cbuffer.hlsl b/clang/test/CodeGenHLSL/cbuffer.hlsl index 405f5ef218973..3fb76d57341cf 100644 --- a/clang/test/CodeGenHLSL/cbuffer.hlsl +++ b/clang/test/CodeGenHLSL/cbuffer.hlsl @@ -102,7 +102,7 @@ typedef uint32_t4 uint32_t8[2]; typedef uint4 T1; typedef T1 T2[2]; // check a double typedef -cbuffer CBTypedefArray { +cbuffer CBTypedefArray : register(space2) { uint32_t8 t1[2]; T2 t2[2]; } @@ -268,16 +268,64 @@ cbuffer CB_C { 
// CHECK: define internal void @_init_buffer_CBScalars.cb() // CHECK-NEXT: entry: -// CHECK-NEXT: %[[HANDLE1:.*]] = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CBScalars, 56, 0, 8, 16, 24, 32, 36, 40, 48)) +// CHECK-NEXT: %CBScalars.cb_h = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CBScalars, 56, 0, 8, 16, 24, 32, 36, 40, 48)) // CHECK-SAME: @llvm.dx.resource.handlefrombinding.tdx.CBuffer_tdx.Layout_s___cblayout_CBScalarss_56_0_8_16_24_32_36_40_48tt(i32 5, i32 1, i32 1, i32 0, i1 false) // CHECK-NEXT: store target("dx.CBuffer", target("dx.Layout", %__cblayout_CBScalars, 56, 0, 8, 16, 24, 32, 36, 40, 48)) %CBScalars.cb_h, ptr @CBScalars.cb, align 4 +// CHECK: define internal void @_init_buffer_CBVectors.cb() +// CHECK-NEXT: entry: +// CHECK-NEXT: %CBVectors.cb_h = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CBVectors, 136, 0, 16, 40, 48, 80, 96, 112)) +// CHECK-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.CBuffer_tdx.Layout_s___cblayout_CBVectorss_136_0_16_40_48_80_96_112tt(i32 0, i32 0, i32 1, i32 0, i1 false) +// CHECK-NEXT: store target("dx.CBuffer", target("dx.Layout", %__cblayout_CBVectors, 136, 0, 16, 40, 48, 80, 96, 112)) %CBVectors.cb_h, ptr @CBVectors.cb, align 4 + // CHECK: define internal void @_init_buffer_CBArrays.cb() // CHECK-NEXT: entry: -// CHECK-NEXT: %[[HANDLE2:.*]] = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CBArrays, 708, 0, 48, 112, 176, 224, 608, 624, 656)) +// CHECK-NEXT: %CBArrays.cb_h = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CBArrays, 708, 0, 48, 112, 176, 224, 608, 624, 656)) // CHECK-SAME: @llvm.dx.resource.handlefrombinding.tdx.CBuffer_tdx.Layout_s___cblayout_CBArrayss_708_0_48_112_176_224_608_624_656tt(i32 0, i32 2, i32 1, i32 0, i1 false) // CHECK-NEXT: store target("dx.CBuffer", target("dx.Layout", %__cblayout_CBArrays, 708, 0, 48, 112, 176, 224, 608, 624, 656)) %CBArrays.cb_h, ptr @CBArrays.cb, align 4 +// CHECK: define internal void 
@_init_buffer_CBTypedefArray.cb() +// CHECK-NEXT: entry: +// CHECK-NEXT: %CBTypedefArray.cb_h = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CBTypedefArray, 128, 0, 64)) +// CHECK-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.CBuffer_tdx.Layout_s___cblayout_CBTypedefArrays_128_0_64tt(i32 1, i32 2, i32 1, i32 0, i1 false) +// CHECK-NEXT: store target("dx.CBuffer", target("dx.Layout", %__cblayout_CBTypedefArray, 128, 0, 64)) %CBTypedefArray.cb_h, ptr @CBTypedefArray.cb, align 4 + +// CHECK: define internal void @_init_buffer_CBStructs.cb() +// CHECK-NEXT: entry: +// CHECK-NEXT: %CBStructs.cb_h = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CBStructs, 246, 0, 16, 32, 64, 144, 238, 240)) +// CHECK-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.CBuffer_tdx.Layout_s___cblayout_CBStructss_246_0_16_32_64_144_238_240tt(i32 2, i32 0, i32 1, i32 0, i1 false) +// CHECK-NEXT: store target("dx.CBuffer", target("dx.Layout", %__cblayout_CBStructs, 246, 0, 16, 32, 64, 144, 238, 240)) %CBStructs.cb_h, ptr @CBStructs.cb, align 4 + +// CHECK: define internal void @_init_buffer_CBClasses.cb() +// CHECK-NEXT: entry: +// CHECK-NEXT: %CBClasses.cb_h = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CBClasses, 260, 0, 16, 32, 112)) +// CHECK-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.CBuffer_tdx.Layout_s___cblayout_CBClassess_260_0_16_32_112tt(i32 3, i32 0, i32 1, i32 0, i1 false) +// CHECK-NEXT: store target("dx.CBuffer", target("dx.Layout", %__cblayout_CBClasses, 260, 0, 16, 32, 112)) %CBClasses.cb_h, ptr @CBClasses.cb, align 4 + +// CHECK: define internal void @_init_buffer_CBMix.cb() +// CHECK-NEXT: entry: +// CHECK-NEXT: %CBMix.cb_h = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CBMix, 170, 0, 24, 32, 120, 128, 136, 144, 152, 160, 168)) +// CHECK-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.CBuffer_tdx.Layout_s___cblayout_CBMixs_170_0_24_32_120_128_136_144_152_160_168tt(i32 4, i32 0, i32 1, i32 0, i1 
false) +// CHECK-NEXT: store target("dx.CBuffer", target("dx.Layout", %__cblayout_CBMix, 170, 0, 24, 32, 120, 128, 136, 144, 152, 160, 168)) %CBMix.cb_h, ptr @CBMix.cb, align 4 + +// CHECK: define internal void @_init_buffer_CB_A.cb() +// CHECK-NEXT: entry: +// CHECK-NEXT: %CB_A.cb_h = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB_A, 188, 0, 32, 76, 80, 120, 128, 144, 160, 182)) +// CHECK-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.CBuffer_tdx.Layout_s___cblayout_CB_As_188_0_32_76_80_120_128_144_160_182tt(i32 5, i32 0, i32 1, i32 0, i1 false) +// CHECK-NEXT: store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB_A, 188, 0, 32, 76, 80, 120, 128, 144, 160, 182)) %CB_A.cb_h, ptr @CB_A.cb, align 4 + +// CHECK: define internal void @_init_buffer_CB_B.cb() +// CHECK-NEXT: entry: +// CHECK-NEXT: %CB_B.cb_h = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB_B, 94, 0, 88)) +// CHECK-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.CBuffer_tdx.Layout_s___cblayout_CB_Bs_94_0_88tt(i32 6, i32 0, i32 1, i32 0, i1 false) +// CHECK-NEXT: store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB_B, 94, 0, 88)) %CB_B.cb_h, ptr @CB_B.cb, align 4 + +// CHECK: define internal void @_init_buffer_CB_C.cb() +// CHECK-NEXT: entry: +// CHECK-NEXT: %CB_C.cb_h = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB_C, 400, 0, 16, 112, 128, 392)) +// CHECK-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.CBuffer_tdx.Layout_s___cblayout_CB_Cs_400_0_16_112_128_392tt(i32 7, i32 0, i32 1, i32 0, i1 false) +// CHECK-NEXT: store target("dx.CBuffer", target("dx.Layout", %__cblayout_CB_C, 400, 0, 16, 112, 128, 392)) %CB_C.cb_h, ptr @CB_C.cb, align 4 + RWBuffer Buf; [numthreads(4,1,1)] @@ -288,7 +336,13 @@ void main() { // CHECK: define internal void @_GLOBAL__sub_I_cbuffer.hlsl() // CHECK-NEXT: entry: // CHECK-NEXT: call void @_init_buffer_CBScalars.cb() +// CHECK-NEXT: call void @_init_buffer_CBVectors.cb() // CHECK-NEXT: call void 
@_init_buffer_CBArrays.cb() +// CHECK-NEXT: call void @_init_buffer_CBTypedefArray.cb() +// CHECK-NEXT: call void @_init_buffer_CBStructs.cb() +// CHECK-NEXT: call void @_init_buffer_CBClasses.cb() +// CHECK-NEXT: call void @_init_buffer_CBMix.cb() +// CHECK-NEXT: call void @_init_buffer_CB_A.cb() // CHECK: !hlsl.cbs = !{![[CBSCALARS:[0-9]+]], ![[CBVECTORS:[0-9]+]], ![[CBARRAYS:[0-9]+]], ![[CBTYPEDEFARRAY:[0-9]+]], ![[CBSTRUCTS:[0-9]+]], ![[CBCLASSES:[0-9]+]], // CHECK-SAME: ![[CBMIX:[0-9]+]], ![[CB_A:[0-9]+]], ![[CB_B:[0-9]+]], ![[CB_C:[0-9]+]]} diff --git a/clang/test/CodeGenHLSL/convergence/global_array.hlsl b/clang/test/CodeGenHLSL/convergence/global_array.hlsl new file mode 100644 index 0000000000000..e11de82c3ded0 --- /dev/null +++ b/clang/test/CodeGenHLSL/convergence/global_array.hlsl @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -Wno-hlsl-implicit-binding -finclude-default-header -triple spirv-unknown-vulkan-compute -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s + +// CHECK: define internal spir_func void @__cxx_global_var_init() +// CHECK: [[entry_token:%.*]] = call token @llvm.experimental.convergence.entry() +// CHECK: br label %[[loop_entry:.*]] + +// CHECK: [[loop_entry]]: +// CHECK: [[loop_token:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[entry_token]]) ] +// CHECK: call void {{.*}} [ "convergencectrl"(token [[loop_token]]) ] +// CHECK: br i1 {{%.*}} label {{%.*}} label %[[loop_entry]] +RWBuffer e[2]; + +[numthreads(4,1,1)] +void main() { +} + diff --git a/clang/test/CodeGenHLSL/static-local-ctor.hlsl b/clang/test/CodeGenHLSL/static-local-ctor.hlsl index 7aeb5e987d6b2..474bcf1aff6ac 100644 --- a/clang/test/CodeGenHLSL/static-local-ctor.hlsl +++ b/clang/test/CodeGenHLSL/static-local-ctor.hlsl @@ -21,7 +21,7 @@ void InitBuf(RWBuffer buf) { // CHECK-NEXT: br i1 [[Tmp3]] // CHECK-NOT: _Init_thread_header // CHECK: init.check: -// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC1Ev +// CHECK-NEXT: call void 
@_ZN4hlsl8RWBufferIiEC1Ejijj // CHECK-NEXT: store i8 1, ptr @_ZGVZ4mainvE5mybuf // CHECK-NOT: _Init_thread_footer diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index 24d64c94c0956..1f696aba8d088 100644 --- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -317,6 +317,9 @@ // RUN: not %clang --target=aarch64-linux -fsanitize=memtag -I +mte %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANMT-NOMT-1 // CHECK-SANMT-NOMT-1: '-fsanitize=memtag-stack' requires hardware support (+memtag) +// RUN: not %clang --target=aarch64-linux-android31 -fsanitize-trap=memtag -march=armv8-a+memtag -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANMT-TRAP +// CHECK-SANMT-TRAP: error: unsupported argument 'memtag' to option '-fsanitize-trap=' + // RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-address-use-after-scope %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-USE-AFTER-SCOPE // RUN: %clang_cl --target=x86_64-windows -fsanitize=address -fsanitize-address-use-after-scope -### -- %s 2>&1 | FileCheck %s --check-prefix=CHECK-USE-AFTER-SCOPE // CHECK-USE-AFTER-SCOPE: -cc1{{.*}}-fsanitize-address-use-after-scope diff --git a/clang/test/Driver/haiku.c b/clang/test/Driver/haiku.c index 3f421ab6e81e6..aeb9519e9479e 100644 --- a/clang/test/Driver/haiku.c +++ b/clang/test/Driver/haiku.c @@ -35,6 +35,7 @@ // CHECK-C-HEADER-PATH: "-internal-isystem" "/boot/system/develop/headers/glibc" // CHECK-C-HEADER-PATH: "-internal-isystem" "/boot/system/develop/headers/gnu" // CHECK-C-HEADER-PATH: "-internal-isystem" "/boot/system/develop/headers/posix" +// CHECK-C-HEADER-PATH: "-internal-isystem" "/boot/system/develop/headers/gcc/include" // CHECK-C-HEADER-PATH: "-internal-isystem" "/boot/system/develop/headers" // Check x86_64-unknown-haiku, X86_64 diff --git a/clang/test/Driver/ppc-mrop-protection-support-check.c b/clang/test/Driver/ppc-mrop-protection-support-check.c index 50eaef3ed770b..f500e9e3e510c 100644 --- 
a/clang/test/Driver/ppc-mrop-protection-support-check.c +++ b/clang/test/Driver/ppc-mrop-protection-support-check.c @@ -1,20 +1,15 @@ // RUN: not %clang -target powerpc64le-unknown-linux-gnu -fsyntax-only \ // RUN: -mcpu=pwr10 -mrop-protect %s 2>&1 | FileCheck %s --check-prefix=HASROP // RUN: not %clang -target powerpc64le-unknown-linux-gnu -fsyntax-only \ -// RUN: -mcpu=power10 -mrop-protect %s 2>&1 | FileCheck %s --check-prefix=HASROP -// RUN: not %clang -target powerpc64le-unknown-linux-gnu -fsyntax-only \ -// RUN: -mcpu=pwr9 -mrop-protect %s 2>&1 | FileCheck %s --check-prefix=HASROP -// RUN: not %clang -target powerpc64le-unknown-linux-gnu -fsyntax-only \ -// RUN: -mcpu=power9 -mrop-protect %s 2>&1 | FileCheck %s --check-prefix=HASROP -// RUN: not %clang -target powerpc64le-unknown-linux-gnu -fsyntax-only \ -// RUN: -mcpu=pwr8 -mrop-protect %s 2>&1 | FileCheck %s --check-prefix=HASROP -// RUN: not %clang -target powerpc64le-unknown-linux-gnu -fsyntax-only \ // RUN: -mcpu=power8 -mrop-protect %s 2>&1 | FileCheck %s --check-prefix=HASROP // RUN: not %clang -target powerpc64le-unknown-linux-gnu -fsyntax-only \ // RUN: -mcpu=pwr7 -mrop-protect %s 2>&1 | FileCheck %s --check-prefix=NOROP -// RUN: not %clang -target powerpc64le-unknown-linux-gnu -fsyntax-only \ -// RUN: -mcpu=power7 -mrop-protect %s 2>&1 | FileCheck %s --check-prefix=NOROP + +// RUN: not %clang -target powerpc-unknown-linux -fsyntax-only \ +// RUN: -mcpu=pwr8 -mrop-protect %s 2>&1 | FileCheck %s --check-prefix=32BIT +// RUN: not %clang -target powerpc-unknown-aix -fsyntax-only \ +// RUN: -mcpu=pwr8 -mrop-protect %s 2>&1 | FileCheck %s --check-prefix=32BIT #ifdef __ROP_PROTECT__ static_assert(false, "ROP Protect enabled"); @@ -24,3 +19,4 @@ static_assert(false, "ROP Protect enabled"); // HASROP-NOT: option '-mrop-protect' cannot be specified with // NOROP: option '-mrop-protect' cannot be specified with +// 32BIT: option '-mrop-protect' cannot be specified on this target diff --git 
a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index f7d4ecb057d6e..bd3c1b7de743a 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -10,6 +10,7 @@ // CHECK-NEXT: a 2.1 'A' (Atomic Instructions) // CHECK-NEXT: f 2.2 'F' (Single-Precision Floating-Point) // CHECK-NEXT: d 2.2 'D' (Double-Precision Floating-Point) +// CHECK-NEXT: q 2.2 'Q' (Quad-Precision Floating-Point) // CHECK-NEXT: c 2.0 'C' (Compressed Instructions) // CHECK-NEXT: b 1.0 'B' (the collection of the Zba, Zbb, Zbs extensions) // CHECK-NEXT: v 1.0 'V' (Vector Extension for Application Processors) @@ -157,6 +158,7 @@ // CHECK-NEXT: svpbmt 1.0 'Svpbmt' (Page-Based Memory Types) // CHECK-NEXT: svvptc 1.0 'Svvptc' (Obviating Memory-Management Instructions after Marking PTEs Valid) // CHECK-NEXT: xandesperf 5.0 'XAndesPerf' (Andes Performance Extension) +// CHECK-NEXT: xandesvdot 5.0 'XAndesVDot' (Andes Vector Dot Product Extension) // CHECK-NEXT: xandesvpackfph 5.0 'XAndesVPackFPH' (Andes Vector Packed FP16 Extension) // CHECK-NEXT: xcvalu 1.0 'XCValu' (CORE-V ALU Operations) // CHECK-NEXT: xcvbi 1.0 'XCVbi' (CORE-V Immediate Branching) diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c index 018fa25218ea6..1da8311b5de98 100644 --- a/clang/test/Driver/riscv-arch.c +++ b/clang/test/Driver/riscv-arch.c @@ -10,6 +10,8 @@ // RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv32-unknown-elf -march=rv32imafd -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s +// RUN: %clang --target=riscv32-unknown-elf -march=rv32imafdq -### %s \ +// RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv32-unknown-elf -march=rv32ic -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s @@ -21,6 +23,8 @@ // RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv32-unknown-elf -march=rv32imafdc -### %s \ // RUN: -fsyntax-only 
2>&1 | FileCheck %s +// RUN: %clang --target=riscv32-unknown-elf -march=rv32imafdqc -### %s \ +// RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv32-unknown-elf -march=rv32ia -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s @@ -28,6 +32,8 @@ // RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv32-unknown-elf -march=rv32iafd -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s +// RUN: %clang --target=riscv32-unknown-elf -march=rv32iafdq -### %s \ +// RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv32-unknown-elf -march=rv32iac -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s @@ -35,6 +41,8 @@ // RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv32-unknown-elf -march=rv32iafdc -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s +// RUN: %clang --target=riscv32-unknown-elf -march=rv32iafdqc -### %s \ +// RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv32-unknown-elf -march=rv32g -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s @@ -80,6 +88,8 @@ // RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv64-unknown-elf -march=rv64imafd -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s +// RUN: %clang --target=riscv64-unknown-elf -march=rv64imafdq -### %s \ +// RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv64-unknown-elf -march=rv64ic -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s @@ -91,6 +101,8 @@ // RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv64-unknown-elf -march=rv64imafdc -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s +// RUN: %clang --target=riscv64-unknown-elf -march=rv64imafdqc -### %s \ +// RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv64-unknown-elf -march=rv64ia -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s @@ -98,6 +110,8 @@ // RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv64-unknown-elf -march=rv64iafd -### %s \ // RUN: -fsyntax-only 
2>&1 | FileCheck %s +// RUN: %clang --target=riscv64-unknown-elf -march=rv64iafdq -### %s \ +// RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv64-unknown-elf -march=rv64iac -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s @@ -105,6 +119,8 @@ // RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv64-unknown-elf -march=rv64iafdc -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s +// RUN: %clang --target=riscv64-unknown-elf -march=rv64iafdqc -### %s \ +// RUN: -fsyntax-only 2>&1 | FileCheck %s // RUN: %clang --target=riscv64-unknown-elf -march=rv64g -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck %s @@ -211,11 +227,6 @@ // RV32-LETTER: error: invalid arch name 'rv32q', // RV32-LETTER: first letter after 'rv32' should be 'e', 'i' or 'g' -// RUN: not %clang --target=riscv32-unknown-elf -march=rv32imcq -### %s \ -// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-ORDER %s -// RV32-ORDER: error: invalid arch name 'rv32imcq', -// RV32-ORDER: unsupported standard user-level extension 'q' - // RUN: not %clang --target=riscv32-unknown-elf -march=rv32izvl64b -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-ZVL64B-ER %s // RV32-ZVL64B-ER: error: invalid arch name 'rv32izvl64b', @@ -226,11 +237,6 @@ // RV32-STD-INVAL: error: invalid arch name 'rv32imw', // RV32-STD-INVAL: invalid standard user-level extension 'w' -// RUN: not %clang --target=riscv32-unknown-elf -march=rv32imqc -### %s \ -// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-STD %s -// RV32-STD: error: invalid arch name 'rv32imqc', -// RV32-STD: unsupported standard user-level extension 'q' - // RUN: not %clang --target=riscv32-unknown-elf -march=rv32xabc -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32X %s // RV32X: error: invalid arch name 'rv32xabc', diff --git a/clang/test/Headers/__clang_hip_cmath-return_types.hip b/clang/test/Headers/__clang_hip_cmath-return_types.hip new file mode 100644 index 
0000000000000..146235244c45f --- /dev/null +++ b/clang/test/Headers/__clang_hip_cmath-return_types.hip @@ -0,0 +1,1023 @@ +// RUN: %clang_cc1 -include __clang_hip_runtime_wrapper.h \ +// RUN: -internal-isystem %S/../../lib/Headers/cuda_wrappers \ +// RUN: -internal-isystem %S/Inputs/include \ +// RUN: -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-unknown \ +// RUN: -target-cpu gfx906 %s -fcuda-is-device -fsyntax-only -o - + +template +struct is_same { + static constexpr bool value = false; +}; + +template +struct is_same { + static constexpr bool value = true; +}; + +__device__ void test_abs() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_acos() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_asin() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_atan() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + 
static_assert(is_same::value, ""); +} + +__device__ void test_atan2() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_ceil() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_cos() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_cosh() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_exp() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + 
static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_fabs() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_floor() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_fmod() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_frexp() +{ + int ip; + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + 
static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_ldexp() +{ + int ip = 1; + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_log() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_log10() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_modf() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + double i; +} + +__device__ void test_pow() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + 
static_assert(is_same::value, ""); +} + +__device__ void test_sin() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_sinh() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_sqrt() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_tan() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_tanh() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + 
static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_signbit() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_fpclassify() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_isfinite() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_isnormal() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_isgreater() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_isgreaterequal() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_isinf() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_isless() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_islessequal() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_islessgreater() +{ + static_assert(is_same::value, ""); + 
static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_isnan() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_isunordered() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_acosh() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_asinh() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_atanh() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_cbrt() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + 
static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_copysign() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_erf() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_erfc() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_exp2() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); 
+ static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_expm1() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_fdim() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_fma() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + + static_assert(is_same::value, ""); +} + +__device__ void test_fmax() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, 
""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_fmin() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_hypot() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_ilogb() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_lgamma() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_llrint() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + 
static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_llround() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_log1p() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_log2() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_logb() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); 
+ static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_lrint() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_lround() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_nan() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_nearbyint() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_nextafter() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + 
static_assert(is_same::value, ""); +} + +__device__ void test_remainder() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_remquo() +{ + int ip; + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_rint() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_round() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_scalbln() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + 
static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_scalbn() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_tgamma() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__device__ void test_trunc() +{ + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); +} + +__global__ void tests() +{ + test_abs(); + test_acos(); + test_asin(); + test_atan(); + test_atan2(); + test_ceil(); + test_cos(); + test_cosh(); + test_exp(); + test_fabs(); + test_floor(); + test_fmod(); + test_frexp(); + test_ldexp(); + test_log(); + test_log10(); + test_modf(); + test_pow(); + test_sin(); + test_sinh(); + test_sqrt(); + test_tan(); + 
test_tanh(); + test_signbit(); + test_fpclassify(); + test_isfinite(); + test_isnormal(); + test_isgreater(); + test_isgreaterequal(); + test_isinf(); + test_isless(); + test_islessequal(); + test_islessgreater(); + test_isnan(); + test_isunordered(); + test_acosh(); + test_asinh(); + test_atanh(); + test_cbrt(); + test_copysign(); + test_erf(); + test_erfc(); + test_exp2(); + test_expm1(); + test_fdim(); + test_fma(); + test_fmax(); + test_fmin(); + test_hypot(); + test_ilogb(); + test_lgamma(); + test_llrint(); + test_llround(); + test_log1p(); + test_log2(); + test_logb(); + test_lrint(); + test_lround(); + test_nan(); + test_nearbyint(); + test_nextafter(); + test_remainder(); + test_remquo(); + test_rint(); + test_round(); + test_scalbln(); + test_scalbn(); + test_tgamma(); + test_trunc(); +} diff --git a/clang/test/Headers/opencl-c-header.cl b/clang/test/Headers/opencl-c-header.cl index 7317ff0adaafb..17cbb67f26038 100644 --- a/clang/test/Headers/opencl-c-header.cl +++ b/clang/test/Headers/opencl-c-header.cl @@ -193,6 +193,9 @@ global atomic_int z = ATOMIC_VAR_INIT(99); #if __opencl_c_ext_image_unorm_int_2_101010 != 1 #error "Incorrectly defined __opencl_c_ext_image_unorm_int_2_101010" #endif +#if __opencl_c_ext_image_unsigned_10x6_12x4_14x2 != 1 +#error "Incorrectly defined __opencl_c_ext_image_unsigned_10x6_12x4_14x2" +#endif #else @@ -283,6 +286,9 @@ global atomic_int z = ATOMIC_VAR_INIT(99); #ifdef __opencl_c_ext_image_unorm_int_2_101010 #error "Incorrect __opencl_c_ext_image_unorm_int_2_101010 define" #endif +#ifdef __opencl_c_ext_image_unsigned_10x6_12x4_14x2 +#error "Incorrect __opencl_c_ext_image_unsigned_10x6_12x4_14x2 define" +#endif #endif //(defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200) diff --git a/clang/test/Modules/pr130712.cppm b/clang/test/Modules/pr130712.cppm new file mode 100644 index 0000000000000..4c7a21ea1f289 --- /dev/null +++ b/clang/test/Modules/pr130712.cppm @@ -0,0 +1,33 @@ +// RUN: split-file %s %t + +// There 
are two requirements here to result in the owner of a macro being null. +// 1) There must be a configuration mismatch between a header and a file it depends on +// 2) -fmodules-local-submodule-visibility must be enabled. + +// In the following example, when compiling module C, A_H has no owning module. + +// RUN: %clang_cc1 -I%t -emit-module -o %t/a.pcm -fmodules %t/module.modulemap -fmodule-name=a -fmodules-local-submodule-visibility +// RUN: %clang_cc1 -fexceptions -Wno-module-file-config-mismatch -I%t -emit-module -o %t/b.pcm -fmodules %t/module.modulemap -fmodule-name=b -fmodules-local-submodule-visibility -fmodule-file=%t/a.pcm +// RUN: %clang_cc1 -fexceptions -Wno-module-file-config-mismatch -I%t -emit-module -o %t/c.pcm -fmodules %t/module.modulemap -fmodule-name=c -fmodules-local-submodule-visibility -fmodule-file=%t/a.pcm -fmodule-file=%t/b.pcm + +//--- module.modulemap +module a { header "a.h" } +module b { header "b.h" } +module c { header "c.h" } + +//--- a.h +#ifndef A_H +#define A_H +#endif + +//--- b.h +#ifndef B_H +#define B_H + +#include + +#endif + +//--- c.h +#include +#include diff --git a/clang/test/Modules/pr140130.cpp b/clang/test/Modules/pr140130.cpp new file mode 100644 index 0000000000000..da26a005b04f8 --- /dev/null +++ b/clang/test/Modules/pr140130.cpp @@ -0,0 +1,33 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// RUN: cd %t +// RUN: %clang_cc1 -iquote . -fmodules -fno-cxx-modules -emit-module \ +// RUN: -std=c++20 -fmodule-name=c -xc++ c.cppmap -o c.pcm +// RUN: %clang_cc1 -iquote . 
-fmodules -fno-cxx-modules -emit-module \ +// RUN: -std=c++20 -fmodule-name=a -fmodule-map-file=a.cppmap \ +// RUN: -fmodule-file=c.pcm -xc++ a.cppmap -o a.pcm + +//--- a.cppmap +module "a" { + header "a.h" +} +//--- a.h +#include "b.h" +//--- b.h +#ifndef _B_H_ +#define _B_H_ +struct B { + consteval B() {} + union { + int a; + }; +}; +constexpr B b; +#endif +//--- c.cppmap +module "c" { +header "c.h" +} +//--- c.h +#include "b.h" diff --git a/clang/test/Modules/sdk-settings-json-dep.m b/clang/test/Modules/sdk-settings-json-dep.m new file mode 100644 index 0000000000000..196f4219bd989 --- /dev/null +++ b/clang/test/Modules/sdk-settings-json-dep.m @@ -0,0 +1,53 @@ +// This test checks that the module cache gets invalidated when the SDKSettings.json file changes. + +// RUN: rm -rf %t +// RUN: split-file %s %t + +//--- AppleTVOS15.0.sdk/SDKSettings-old.json +{ + "DisplayName": "tvOS 15.0", + "Version": "15.0", + "CanonicalName": "appletvos15.0", + "MaximumDeploymentTarget": "15.0.99", + "PropertyConditionFallbackNames": [] +} +//--- AppleTVOS15.0.sdk/SDKSettings-new.json +{ + "DisplayName": "tvOS 15.0", + "Version": "15.0", + "CanonicalName": "appletvos15.0", + "MaximumDeploymentTarget": "15.0.99", + "PropertyConditionFallbackNames": [], + "VersionMap": { + "iOS_tvOS": { + "13.2": "13.1" + }, + "tvOS_iOS": { + "13.1": "13.2" + } + } +} +//--- module.modulemap +module M { header "M.h" } +//--- M.h +void foo(void) __attribute__((availability(iOS, obsoleted = 13.2))); +void test() { foo(); } + +//--- tu.m +#include "M.h" + +// Compiling for tvOS 13.1 without "VersionMap" should succeed, since by default iOS 13.2 gets mapped to tvOS 13.2, +// and \c foo is therefore **not** deprecated. 
+// RUN: cp %t/AppleTVOS15.0.sdk/SDKSettings-old.json %t/AppleTVOS15.0.sdk/SDKSettings.json +// RUN: %clang -target x86_64-apple-tvos13.1 -isysroot %t/AppleTVOS15.0.sdk \ +// RUN: -fsyntax-only %t/tu.m -o %t/tu.o -fmodules -Xclang -fdisable-module-hash -fmodules-cache-path=%t/cache + +// Compiling for tvOS 13.1 with "VersionMap" saying it maps to iOS 13.2 should fail, since \c foo is now deprecated. +// RUN: sleep 1 +// RUN: cp %t/AppleTVOS15.0.sdk/SDKSettings-new.json %t/AppleTVOS15.0.sdk/SDKSettings.json +// RUN: not %clang -target x86_64-apple-tvos13.1 -isysroot %t/AppleTVOS15.0.sdk \ +// RUN: -fsyntax-only %t/tu.m -o %t/tu.o -fmodules -Xclang -fdisable-module-hash -fmodules-cache-path=%t/cache 2>&1 \ +// RUN: | FileCheck %s +// CHECK: M.h:2:15: error: 'foo' is unavailable: obsoleted in tvOS 13.1 +// CHECK: M.h:1:6: note: 'foo' has been explicitly marked unavailable here +// CHECK: tu.m:1:10: fatal error: could not build module 'M' diff --git a/clang/test/OpenMP/metadirective_messages.cpp b/clang/test/OpenMP/metadirective_messages.cpp index a248e9a4e82a9..9d2934f8b1e10 100644 --- a/clang/test/OpenMP/metadirective_messages.cpp +++ b/clang/test/OpenMP/metadirective_messages.cpp @@ -49,3 +49,13 @@ void foo() { ; #endif } + +namespace GH139665 { +void f(){ +#pragma omp metadirective( // expected-error {{expected at least one clause on '#pragma omp metadirective' directive}} +} + +void g() { +#pragma omp metadirective align // expected-error {{expected '(' after 'align'}} +} +} // namespace GH139665 diff --git a/clang/test/OpenMP/openmp_non_c_directives.c b/clang/test/OpenMP/openmp_non_c_directives.c new file mode 100644 index 0000000000000..844d7dad551bc --- /dev/null +++ b/clang/test/OpenMP/openmp_non_c_directives.c @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s + +// Test the reaction to some Fortran-only directives. 
+ +void foo() { +#pragma omp allocators // expected-error {{expected an OpenMP directive}} +#pragma omp do // expected-error {{expected an OpenMP directive}} +#pragma omp end workshare // expected-error {{expected an OpenMP directive}} +#pragma omp parallel workshare // expected-warning {{extra tokens at the end of '#pragma omp parallel' are ignored}} +#pragma omp workshare // expected-error {{expected an OpenMP directive}} +} + diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 25f15cc5283f9..e3b456e0245f7 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -20,6 +20,7 @@ // CHECK-NOT: __riscv_m {{.*$}} // CHECK-NOT: __riscv_mul {{.*$}} // CHECK-NOT: __riscv_muldiv {{.*$}} +// CHECK-NOT: __riscv_q {{.*$}} // CHECK-NOT: __riscv_sha {{.*$}} // CHECK-NOT: __riscv_shcounterenw {{.*$}} // CHECK-NOT: __riscv_shgatpa {{.*$}} @@ -334,6 +335,17 @@ // CHECK-M-EXT: __riscv_mul 1 // CHECK-M-EXT: __riscv_muldiv 1 +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32ifdq -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-Q-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64ifdq -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-Q-EXT %s +// CHECK-Q-EXT: __riscv_fdiv 1 +// CHECK-Q-EXT: __riscv_flen 128 +// CHECK-Q-EXT: __riscv_fsqrt 1 +// CHECK-Q-EXT: __riscv_q 2002000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32isha -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SHCOUNTERENW-EXT %s diff --git a/clang/test/Refactor/source-loc-zero.cpp b/clang/test/Refactor/source-loc-zero.cpp new file mode 100644 index 0000000000000..61b782743aece --- /dev/null +++ b/clang/test/Refactor/source-loc-zero.cpp @@ -0,0 +1,17 @@ +// Regression test for #139375 +// Clang uses 1-based indexing for source locations given from the command-line. 
+// Verify that `clang-refactor` rejects 0 as an invalid value for line or column number. + +// For range start: +// RUN: not clang-refactor local-rename -selection=%s:0:1-1:1 -new-name=test %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-DIAG %s +// RUN: not clang-refactor local-rename -selection=%s:1:0-1:1 -new-name=test %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-DIAG %s + +// For range end: +// RUN: not clang-refactor local-rename -selection=%s:1:1-0:1 -new-name=test %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-DIAG %s +// RUN: not clang-refactor local-rename -selection=%s:1:1-1:0 -new-name=test %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-DIAG %s + +// CHECK-DIAG: error: '-selection' option must be specified using :: or ::-: format, where and are integers greater than zero. diff --git a/clang/test/Sema/atomic-expr.c b/clang/test/Sema/atomic-expr.c index 7e5219dd3f14a..96571e3e68c87 100644 --- a/clang/test/Sema/atomic-expr.c +++ b/clang/test/Sema/atomic-expr.c @@ -114,6 +114,23 @@ void func_16(void) { (void)sizeof(xp->val); (void)sizeof(y.ival); (void)sizeof(yp->ival); + + // Also, do not diagnose in unreachable code paths. + { + if (0) { + x.val = 12; + xp->val = 12; + (void)y.ival; + (void)yp->ival; + } + + return; + + x.val = 12; + xp->val = 12; + (void)y.ival; + (void)yp->ival; + } } // Ensure that we correctly implement assignment constraints from C2x 6.5.16.1. 
diff --git a/clang/test/Sema/bitfield-layout.c b/clang/test/Sema/bitfield-layout.c index 079720cc9b40b..595b24d3e857e 100644 --- a/clang/test/Sema/bitfield-layout.c +++ b/clang/test/Sema/bitfield-layout.c @@ -3,6 +3,8 @@ // RUN: %clang_cc1 %s -fsyntax-only -verify -triple=aarch64-linux-gnu // RUN: %clang_cc1 %s -fsyntax-only -verify -triple=x86_64-pc-linux-gnu // RUN: %clang_cc1 %s -fsyntax-only -verify -triple=x86_64-scei-ps4 +// RUN: %clang_cc1 %s -fsyntax-only -verify=checkms -triple=i686-apple-darwin9 -Wms-bitfield-padding + // expected-no-diagnostics #include @@ -24,12 +26,27 @@ CHECK_ALIGN(struct, a, 1) #endif // Zero-width bit-fields with packed -struct __attribute__((packed)) a2 { short x : 9; char : 0; int y : 17; }; +struct __attribute__((packed)) a2 { + short x : 9; // #a2x + char : 0; // #a2anon + // checkms-warning@-1 {{bit-field '' of type 'char' has a different storage size than the preceding bit-field (1 vs 2 bytes) and will not be packed under the Microsoft ABI}} + // checkms-note@#a2x {{preceding bit-field 'x' declared here with type 'short'}} + int y : 17; + // checkms-warning@-1 {{bit-field 'y' of type 'int' has a different storage size than the preceding bit-field (4 vs 1 bytes) and will not be packed under the Microsoft ABI}} + // checkms-note@#a2anon {{preceding bit-field '' declared here with type 'char'}} +}; + CHECK_SIZE(struct, a2, 5) CHECK_ALIGN(struct, a2, 1) // Zero-width bit-fields at the end of packed struct -struct __attribute__((packed)) a3 { short x : 9; int : 0; }; +struct __attribute__((packed)) a3 { + short x : 9; // #a3x + int : 0; + // checkms-warning@-1 {{bit-field '' of type 'int' has a different storage size than the preceding bit-field (4 vs 2 bytes) and will not be packed under the Microsoft ABI}} + // checkms-note@#a3x {{preceding bit-field 'x' declared here with type 'short'}} +}; + #if defined(__arm__) || defined(__aarch64__) CHECK_SIZE(struct, a3, 4) CHECK_ALIGN(struct, a3, 4) @@ -39,7 +56,12 @@ CHECK_ALIGN(struct, 
a3, 1) #endif // For comparison, non-zero-width bit-fields at the end of packed struct -struct __attribute__((packed)) a4 { short x : 9; int : 1; }; +struct __attribute__((packed)) a4 { + short x : 9; // #a4x + int : 1; + // checkms-warning@-1 {{bit-field '' of type 'int' has a different storage size than the preceding bit-field (4 vs 2 bytes) and will not be packed under the Microsoft ABI}} + // checkms-note@#a4x {{preceding bit-field 'x' declared here with type 'short'}} +}; CHECK_SIZE(struct, a4, 2) CHECK_ALIGN(struct, a4, 1) @@ -165,22 +187,28 @@ CHECK_OFFSET(struct, g4, c, 3); #endif struct g5 { - char : 1; + char : 1; // #g5 __attribute__((aligned(1))) int n : 24; + // checkms-warning@-1 {{bit-field 'n' of type 'int' has a different storage size than the preceding bit-field (4 vs 1 bytes) and will not be packed under the Microsoft ABI}} + // checkms-note@#g5 {{preceding bit-field '' declared here with type 'char'}} }; CHECK_SIZE(struct, g5, 4); CHECK_ALIGN(struct, g5, 4); struct __attribute__((packed)) g6 { - char : 1; + char : 1; // #g6 __attribute__((aligned(1))) int n : 24; + // checkms-warning@-1 {{bit-field 'n' of type 'int' has a different storage size than the preceding bit-field (4 vs 1 bytes) and will not be packed under the Microsoft ABI}} + // checkms-note@#g6 {{preceding bit-field '' declared here with type 'char'}} }; CHECK_SIZE(struct, g6, 4); CHECK_ALIGN(struct, g6, 1); struct g7 { - char : 1; + char : 1; // #g7 __attribute__((aligned(1))) int n : 25; + // checkms-warning@-1 {{bit-field 'n' of type 'int' has a different storage size than the preceding bit-field (4 vs 1 bytes) and will not be packed under the Microsoft ABI}} + // checkms-note@#g7 {{preceding bit-field '' declared here with type 'char'}} }; #if defined(__ORBIS__) CHECK_SIZE(struct, g7, 4); @@ -190,8 +218,10 @@ CHECK_SIZE(struct, g7, 8); CHECK_ALIGN(struct, g7, 4); struct __attribute__((packed)) g8 { - char : 1; + char : 1; // #g8 __attribute__((aligned(1))) int n : 25; + // 
checkms-warning@-1 {{bit-field 'n' of type 'int' has a different storage size than the preceding bit-field (4 vs 1 bytes) and will not be packed under the Microsoft ABI}} + // checkms-note@#g8 {{preceding bit-field '' declared here with type 'char'}} }; #if defined(__ORBIS__) CHECK_SIZE(struct, g8, 4); diff --git a/clang/test/Sema/bitfield-layout_1.c b/clang/test/Sema/bitfield-layout_1.c index 24277c3911495..3db83c7463503 100644 --- a/clang/test/Sema/bitfield-layout_1.c +++ b/clang/test/Sema/bitfield-layout_1.c @@ -2,6 +2,7 @@ // RUN: %clang_cc1 %s -fsyntax-only -verify -triple=arm-linux-gnueabihf // RUN: %clang_cc1 %s -fsyntax-only -verify -triple=aarch64-linux-gnu // RUN: %clang_cc1 %s -fsyntax-only -verify -triple=x86_64-pc-linux-gnu +// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=i686-apple-darwin9 -Wms-bitfield-padding // expected-no-diagnostics #define CHECK_SIZE(name, size) \ diff --git a/clang/test/Sema/builtins-wasm.c b/clang/test/Sema/builtins-wasm.c index 1aae365c95aff..31e5291d3ae5e 100644 --- a/clang/test/Sema/builtins-wasm.c +++ b/clang/test/Sema/builtins-wasm.c @@ -8,6 +8,9 @@ typedef void (*__funcref funcref_t)(); void test_ref_null() { funcref_t func = __builtin_wasm_ref_null_func(0); // expected-error {{too many arguments to function call, expected 0, have 1}} __externref_t ref = __builtin_wasm_ref_null_extern(0); // expected-error {{too many arguments to function call, expected 0, have 1}} + __builtin_wasm_ref_is_null_extern(ref, 1); // expected-error {{too many arguments to function call, expected 1, have 2}} + __builtin_wasm_ref_is_null_extern(); // expected-error {{too few arguments to function call, expected 1, have 0}} + __builtin_wasm_ref_is_null_extern(1); // expected-error {{1st argument must be an externref}} } void test_table_size(__externref_t ref, void *ptr, int arr[]) { diff --git a/clang/test/Sema/mms-bitfields.c b/clang/test/Sema/mms-bitfields.c index cee5b0669d252..a976578845229 100644 --- a/clang/test/Sema/mms-bitfields.c 
+++ b/clang/test/Sema/mms-bitfields.c @@ -1,12 +1,16 @@ // RUN: %clang_cc1 -mms-bitfields -fsyntax-only -verify -triple x86_64-apple-darwin9 %s +// RUN: %clang_cc1 -mms-bitfields -fsyntax-only -Wms-bitfield-padding -verify=checkms -triple x86_64-apple-darwin9 %s + // expected-no-diagnostics // The -mms-bitfields commandline parameter should behave the same // as the ms_struct attribute. struct { - int a : 1; + int a : 1; // #a short b : 1; + // checkms-warning@-1 {{bit-field 'b' of type 'short' has a different storage size than the preceding bit-field (2 vs 4 bytes) and will not be packed under the Microsoft ABI}} + // checkms-note@#a {{preceding bit-field 'a' declared here with type 'int'}} } t; // MS pads out bitfields between different types. diff --git a/clang/test/SemaCXX/bitfield.cpp b/clang/test/SemaCXX/bitfield.cpp index 083c28ffbb3d4..bb3094561bea4 100644 --- a/clang/test/SemaCXX/bitfield.cpp +++ b/clang/test/SemaCXX/bitfield.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -verify +// RUN: %clang_cc1 %s -verify -Wms-bitfield-padding // expected-no-diagnostics diff --git a/clang/test/SemaCXX/constant-expression-p2280r4.cpp b/clang/test/SemaCXX/constant-expression-p2280r4.cpp index 0cdc16ed4e822..88e0a8f153b10 100644 --- a/clang/test/SemaCXX/constant-expression-p2280r4.cpp +++ b/clang/test/SemaCXX/constant-expression-p2280r4.cpp @@ -178,3 +178,24 @@ namespace extern_reference_used_as_unknown { int y; constinit int& g = (x,y); // expected-warning {{left operand of comma operator has no effect}} } + +namespace GH139452 { +struct Dummy { + explicit operator bool() const noexcept { return true; } +}; + +struct Base { int error; }; +struct Derived : virtual Base { }; + +template +constexpr R get_value() { + const auto& derived_val = Derived{}; + if (derived_val.error != 0) + /* nothing */; + return R{}; +} + +int f() { + return !get_value(); // contextually convert the function call result to bool +} +} diff --git a/clang/test/SemaCXX/consteval-assert.cpp 
b/clang/test/SemaCXX/consteval-assert.cpp new file mode 100644 index 0000000000000..b54a38ff26105 --- /dev/null +++ b/clang/test/SemaCXX/consteval-assert.cpp @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -std=c++23 -verify=expected,cxx20_plus -DTEST_LINUX %s +// RUN: %clang_cc1 -std=c++23 -verify=expected,cxx20_plus -DTEST_WINDOWS %s +// RUN: %clang_cc1 -std=c++23 -verify=expected,cxx20_plus -DTEST_DARWIN %s + +#ifdef __ASSERT_FUNCTION +#undef __ASSERT_FUNCTION +#endif + +#if defined(TEST_LINUX) + extern "C" void __assert_fail(const char*, const char*, unsigned, const char*); + #define assert(cond) \ + ((cond) ? (void)0 : __assert_fail(#cond, __FILE__, __LINE__, __func__)) +#elif defined(TEST_DARWIN) + void __assert_rtn(const char *, const char *, int, const char *); + #define assert(cond) \ + (__builtin_expect(!(cond), 0) ? __assert_rtn(__func__, __FILE__, __LINE__, #cond) : (void)0) +#elif defined(TEST_WINDOWS) + void /*__cdecl*/ _wassert(const wchar_t*, const wchar_t*, unsigned); + #define _CRT_WIDE_(s) L ## s + #define _CRT_WIDE(s) _CRT_WIDE_(s) + #define assert(cond) \ + (void)((!!(cond)) || (_wassert(_CRT_WIDE(#cond), _CRT_WIDE(__FILE__), (unsigned)(__LINE__)), 0)) +#endif + +consteval int square(int x) { + int result = x * x; + assert(result == 42); // expected-note {{assertion failed during evaluation of constant expression}} + return result; +} + +void test() { + auto val = square(2); // expected-note {{in call to 'square(2)'}} \ + // expected-error {{call to consteval function 'square' is not a constant expression}} +} diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp index d9d144cafdbcc..d9932e4dd8241 100644 --- a/clang/test/SemaCXX/cxx2a-consteval.cpp +++ b/clang/test/SemaCXX/cxx2a-consteval.cpp @@ -1300,3 +1300,25 @@ void foo() { } } + +// https://github.com/llvm/llvm-project/issues/139160 +namespace GH139160{ + // original test case taken from Github + struct A {int x[1]; }; + A f(); // expected-note {{declared here}} 
+ typedef int *t[]; + consteval int* f(int* x) { return x; } + + int ** x = (t){f(f().x)}; // expected-error {{call to consteval function 'GH139160::f' is not a constant expression}} + // expected-note@-1 {{non-constexpr function 'f' cannot be used in a constant expression}} + // expected-error@-2 {{initializer element is not a compile-time constant}} + + struct B {int value, value_two;}; + B make_struct() {return {10, 20};} // expected-note {{declared here}} + consteval int get_value(B container) {return container.value;} + B result = (B){10, get_value(make_struct())}; // expected-error {{initializer element is not a compile-time constant}} + // expected-error@-1 {{call to consteval function 'GH139160::get_value' is not a constant expression}} + // expected-note@-2 {{non-constexpr function 'make_struct' cannot be used in a constant expression}} +}; + + diff --git a/clang/test/SemaCXX/cxx2a-three-way-comparison.cpp b/clang/test/SemaCXX/cxx2a-three-way-comparison.cpp index b94225274fffb..76007ff3913dd 100644 --- a/clang/test/SemaCXX/cxx2a-three-way-comparison.cpp +++ b/clang/test/SemaCXX/cxx2a-three-way-comparison.cpp @@ -58,3 +58,12 @@ namespace PR44325 { // implicit rewrite rules, not for explicit use by programs. 
bool c = cmp_cat() < 0; // expected-warning {{zero as null pointer constant}} } + +namespace GH137452 { +struct comparable_t { + __attribute__((vector_size(32))) double numbers; // expected-note {{declared here}} + auto operator<=>(const comparable_t& rhs) const = default; // expected-warning {{explicitly defaulted three-way comparison operator is implicitly deleted}} \ + expected-note {{replace 'default' with 'delete'}} \ + expected-note {{defaulted 'operator<=>' is implicitly deleted because defaulted comparison of vector types is not supported}} +}; +} // namespace GH137452 diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp index 7e392213710a4..2286da8d1c0e5 100644 --- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp +++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp @@ -926,6 +926,33 @@ struct C { (&fref)(); } }; + +struct CTpl { + template + constexpr int c(this const CTpl&, T) { // #P2797-ctpl-1 + return 42; + } + + template + void c(T)&; // #P2797-ctpl-2 + + template + static void c(T = 0, T = 0); // #P2797-ctpl-3 + + void d() { + c(0); // expected-error {{call to member function 'c' is ambiguous}} + // expected-note@#P2797-ctpl-1{{candidate}} + // expected-note@#P2797-ctpl-2{{candidate}} + // expected-note@#P2797-ctpl-3{{candidate}} + (CTpl::c)(0); // expected-error {{call to member function 'c' is ambiguous}} + // expected-note@#P2797-ctpl-1{{candidate}} + // expected-note@#P2797-ctpl-2{{candidate}} + // expected-note@#P2797-ctpl-3{{candidate}} + + static_assert((&CTpl::c)(CTpl{}, 0) == 42); // selects #1 + } +}; + } namespace GH85992 { diff --git a/clang/test/SemaCXX/dllexport.cpp b/clang/test/SemaCXX/dllexport.cpp index 22d92c30954e8..f503e2fc311d1 100644 --- a/clang/test/SemaCXX/dllexport.cpp +++ b/clang/test/SemaCXX/dllexport.cpp @@ -2,6 +2,8 @@ // RUN: %clang_cc1 -triple x86_64-win32 -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DMS %s // RUN: %clang_cc1 -triple 
i686-mingw32 -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DGNU %s // RUN: %clang_cc1 -triple x86_64-mingw32 -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DGNU %s +// RUN: %clang_cc1 -triple i686-pc-cygwin -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DGNU %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DGNU %s // RUN: %clang_cc1 -triple i686-windows-itanium -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DWI %s // RUN: %clang_cc1 -triple x86_64-windows-itanium -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DWI %s // RUN: %clang_cc1 -triple x86_64-scei-ps4 -fsyntax-only -fdeclspec -verify -std=c++11 -Wunsupported-dll-base-class-template -DPS %s diff --git a/clang/test/SemaCXX/dllimport.cpp b/clang/test/SemaCXX/dllimport.cpp index 996e92f611d3f..b7a1a62b8725b 100644 --- a/clang/test/SemaCXX/dllimport.cpp +++ b/clang/test/SemaCXX/dllimport.cpp @@ -3,6 +3,9 @@ // RUN: %clang_cc1 -triple i686-mingw32 -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DGNU %s // RUN: %clang_cc1 -triple x86_64-mingw32 -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DGNU %s // RUN: %clang_cc1 -triple x86_64-mingw32 -fsyntax-only -fms-extensions -verify -std=c++17 -Wunsupported-dll-base-class-template -DGNU %s +// RUN: %clang_cc1 -triple i686-pc-cygwin -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DGNU %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DGNU %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -fsyntax-only -fms-extensions -verify -std=c++17 -Wunsupported-dll-base-class-template -DGNU %s 
// RUN: %clang_cc1 -triple i686-windows-itanium -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DWI %s // RUN: %clang_cc1 -triple x86_64-windows-itanium -fsyntax-only -fms-extensions -verify -std=c++17 -Wunsupported-dll-base-class-template -DWI %s // RUN: %clang_cc1 -triple x86_64-scei-ps4 -fsyntax-only -fdeclspec -verify -std=c++11 -Wunsupported-dll-base-class-template -DPS %s diff --git a/clang/test/SemaCXX/libstdcxx_format_kind_hack.cpp b/clang/test/SemaCXX/libstdcxx_format_kind_hack.cpp new file mode 100644 index 0000000000000..35611c870b8d1 --- /dev/null +++ b/clang/test/SemaCXX/libstdcxx_format_kind_hack.cpp @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -fsyntax-only -std=c++23 -verify %s + +// expected-no-diagnostics + +// Primary variable template std::format_kind is defined as followed since +// libstdc++ 15.1, which triggers compilation error introduced by GH134522. +// This file tests the workaround. + +#define __GLIBCXX__ 20250513 + +namespace std { + template + constexpr auto format_kind = + __primary_template_not_defined( + format_kind<_Rg> + ); +} diff --git a/clang/test/SemaCXX/libstdcxx_gets_hack.cpp b/clang/test/SemaCXX/libstdcxx_gets_hack.cpp deleted file mode 100644 index 0d915d01474c3..0000000000000 --- a/clang/test/SemaCXX/libstdcxx_gets_hack.cpp +++ /dev/null @@ -1,28 +0,0 @@ -// RUN: %clang_cc1 -fsyntax-only %s -std=c++14 -verify - -// This is a test for an egregious hack in Clang that works around -// an issue with libstdc++'s detection of whether glibc provides a -// ::gets function. If there is no ::gets, ignore -// using ::gets; -// in namespace std. -// -// See PR18402 and gcc.gnu.org/PR77795 for more details. 
- -#ifdef BE_THE_HEADER - -#pragma GCC system_header -namespace std { - using ::gets; - using ::getx; // expected-error {{no member named 'getx'}} -} - -#else - -#define BE_THE_HEADER -#include "libstdcxx_pointer_return_false_hack.cpp" - -namespace foo { - using ::gets; // expected-error {{no member named 'gets'}} -} - -#endif diff --git a/clang/test/SemaCXX/libstdcxx_pointer_return_false_hack.cpp b/clang/test/SemaCXX/libstdcxx_pointer_return_false_hack.cpp deleted file mode 100644 index 17e1548ac50d6..0000000000000 --- a/clang/test/SemaCXX/libstdcxx_pointer_return_false_hack.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// RUN: %clang_cc1 -fsyntax-only %s -std=c++11 -verify - -// This is a test for an egregious hack in Clang that works around -// an issue with libstdc++-4.2's implementation. -// The code in question returns 'false' from a function with a pointer -// return type, which is ill-formed in C++11. - -#ifdef BE_THE_HEADER - -#pragma GCC system_header -namespace std { - namespace tr1 { - template struct hashnode; - template struct hashtable { - typedef hashnode node; - node *find_node() { - // This is ill-formed in C++11, per core issue 903, but we accept - // it anyway in a system header. 
- return false; - } - }; - } -} - -#else - -#define BE_THE_HEADER -#include "libstdcxx_pointer_return_false_hack.cpp" - -auto *test1 = std::tr1::hashtable().find_node(); - -void *test2() { return false; } // expected-error {{cannot initialize}} - -#endif diff --git a/clang/test/SemaCXX/ms_struct-bitfield-padding.cpp b/clang/test/SemaCXX/ms_struct-bitfield-padding.cpp new file mode 100644 index 0000000000000..c0f90f798118a --- /dev/null +++ b/clang/test/SemaCXX/ms_struct-bitfield-padding.cpp @@ -0,0 +1,196 @@ + +// RUN: %clang_cc1 -fsyntax-only -Wms-bitfield-padding -verify -triple armv8 -std=c++23 %s +// RUN: %clang_cc1 -fsyntax-only -DMS_BITFIELDS -mms-bitfields -verify=msbitfields -triple armv8-apple-macos10.15 -std=c++23 %s + +// msbitfields-no-diagnostics + +enum Enum1 { Enum1_A, Enum1_B }; +enum Enum2 { Enum2_A, Enum2_B }; + +enum class EnumU32_1 : unsigned { A, B }; +enum class EnumU32_2 : unsigned { A, B }; +enum class EnumU64 : unsigned long long { A, B }; +enum class EnumI32 : int { A, B }; +enum class EnumU8 : unsigned char { A, B }; +enum class EnumI8 : char { A, B }; +enum class EnumU16 : unsigned short { A, B }; +enum class EnumI16 : short { A, B }; + +struct A { + unsigned int a : 15; + unsigned int b : 15; +}; +static_assert(sizeof(A) == 4); + +struct B { + unsigned int a : 15; + int b : 15; +}; +static_assert(sizeof(B) == 4); + +struct C { + unsigned int a : 15; + int b : 15; +}; +static_assert(sizeof(C) == 4); + +struct D { + Enum1 a : 15; + Enum1 b : 15; +}; +static_assert(sizeof(D) == 4); + +struct E { + Enum1 a : 15; + Enum2 b : 15; +}; +static_assert(sizeof(E) == 4); + +struct F { + EnumU32_1 a : 15; + EnumU32_2 b : 15; +}; +static_assert(sizeof(F) == 4); + +struct G { + EnumU32_1 a : 15; + EnumU64 b : 15; + // expected-warning@-1 {{bit-field 'b' of type 'EnumU64' has a different storage size than the preceding bit-field (8 vs 4 bytes) and will not be packed under the Microsoft ABI}} + // expected-note@-3 {{preceding bit-field 'a' declared here 
with type 'EnumU32_1'}} +}; + +#ifdef MS_BITFIELDS + static_assert(sizeof(G) == 16); +#else + static_assert(sizeof(G) == 8); +#endif + +struct H { + EnumU32_1 a : 10; + EnumI32 b : 10; + EnumU32_1 c : 10; +}; +static_assert(sizeof(H) == 4); + +struct I { + EnumU8 a : 3; + EnumI8 b : 5; + EnumU32_1 c : 10; + // expected-warning@-1 {{bit-field 'c' of type 'EnumU32_1' has a different storage size than the preceding bit-field (4 vs 1 bytes) and will not be packed under the Microsoft ABI}} + // expected-note@-3 {{preceding bit-field 'b' declared here with type 'EnumI8'}} +}; +#ifdef MS_BITFIELDS +static_assert(sizeof(I) == 8); +#else +static_assert(sizeof(I) == 4); +#endif + +struct J { + EnumU8 : 0; + EnumU8 b : 4; +}; +static_assert(sizeof(J) == 1); + +struct K { + EnumU8 a : 4; + EnumU8 : 0; +}; +static_assert(sizeof(K) == 1); + +struct L { + EnumU32_1 a : 10; + EnumU32_2 b : 10; + EnumU32_1 c : 10; +}; + +static_assert(sizeof(L) == 4); + +struct M { + EnumU32_1 a : 10; + EnumI32 b : 10; + EnumU32_1 c : 10; +}; + +static_assert(sizeof(M) == 4); + +struct N { + EnumU32_1 a : 10; + EnumU64 b : 10; + // expected-warning@-1 {{bit-field 'b' of type 'EnumU64' has a different storage size than the preceding bit-field (8 vs 4 bytes) and will not be packed under the Microsoft ABI}} + // expected-note@-3 {{preceding bit-field 'a' declared here with type 'EnumU32_1'}} + EnumU32_1 c : 10; + // expected-warning@-1 {{bit-field 'c' of type 'EnumU32_1' has a different storage size than the preceding bit-field (4 vs 8 bytes) and will not be packed under the Microsoft ABI}} + // expected-note@-5 {{preceding bit-field 'b' declared here with type 'EnumU64'}} +}; + +#ifdef MS_BITFIELDS +static_assert(sizeof(N) == 24); +#else +static_assert(sizeof(N) == 8); +#endif + +struct O { + EnumU16 a : 10; + EnumU32_1 b : 10; + // expected-warning@-1 {{bit-field 'b' of type 'EnumU32_1' has a different storage size than the preceding bit-field (4 vs 2 bytes) and will not be packed under the 
Microsoft ABI}} + // expected-note@-3 {{preceding bit-field 'a' declared here with type 'EnumU16'}} +}; +#ifdef MS_BITFIELDS +static_assert(sizeof(O) == 8); +#else +static_assert(sizeof(O) == 4); +#endif + +struct P { + EnumU32_1 a : 10; + EnumU16 b : 10; + // expected-warning@-1 {{bit-field 'b' of type 'EnumU16' has a different storage size than the preceding bit-field (2 vs 4 bytes) and will not be packed under the Microsoft ABI}} + // expected-note@-3 {{preceding bit-field 'a' declared here with type 'EnumU32_1'}} +}; +#ifdef MS_BITFIELDS +static_assert(sizeof(P) == 8); +#else +static_assert(sizeof(P) == 4); +#endif + +struct Q { + EnumU8 a : 6; + EnumU16 b : 6; + // expected-warning@-1 {{bit-field 'b' of type 'EnumU16' has a different storage size than the preceding bit-field (2 vs 1 bytes) and will not be packed under the Microsoft ABI}} + // expected-note@-3 {{preceding bit-field 'a' declared here with type 'EnumU8'}} +}; +#ifdef MS_BITFIELDS +static_assert(sizeof(Q) == 4); +#else +static_assert(sizeof(Q) == 2); +#endif + +struct R { + EnumU16 a : 9; + EnumU16 b : 9; + EnumU8 c : 6; + // expected-warning@-1 {{bit-field 'c' of type 'EnumU8' has a different storage size than the preceding bit-field (1 vs 2 bytes) and will not be packed under the Microsoft ABI}} + // expected-note@-3 {{preceding bit-field 'b' declared here with type 'EnumU16'}} +}; + +#ifdef MS_BITFIELDS +static_assert(sizeof(R) == 6); +#else +static_assert(sizeof(R) == 4); +#endif + +struct S { + char a : 4; + char b : 4; + char c : 4; + char d : 4; + short x : 7; + // expected-warning@-1 {{bit-field 'x' of type 'short' has a different storage size than the preceding bit-field (2 vs 1 bytes) and will not be packed under the Microsoft ABI}} + // expected-note@-3 {{preceding bit-field 'd' declared here with type 'char'}} + // This is a false positive. 
Reporting this correctly requires duplicating the record layout process + // in target and MS layout modes, and it's also unclear if that's the correct choice for users of + // this diagnostic. + short y : 9; +}; + +static_assert(sizeof(S) == 4); diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp index 277c5df3bb62b..4c5ac5dcbbd04 100644 --- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp +++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp @@ -232,3 +232,45 @@ struct InitListAreNotPerfectCpy { }; InitListAreNotPerfectCpy InitListAreNotPerfectCpy_test({InitListAreNotPerfectCpy{}}); + +namespace PointerToMemFunc { +template +class A; +struct N { + template + void f(T); +}; +template +struct E { + template > + void g() = delete; + void g(void (T::*)(char)); +}; +void f() { + E e; + e.g(&N::f); +} +} + +#if __cplusplus >= 201402 +namespace PointerToMemData { +struct N { + int field; +}; +template +struct B { + B(It, T); + template + B(B); +}; +template +struct C { + auto g() { return B(0, T{}); } +}; +void f() { + using T = decltype(C{}.g()); +} + +} + +#endif diff --git a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp new file mode 100644 index 0000000000000..fcff006d0e028 --- /dev/null +++ b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp @@ -0,0 +1,151 @@ +// RUN: %clang_cc1 -verify -fsyntax-only -std=c++20 -Wconversion %s + +void c8(char8_t); +void c16(char16_t); +void c32(char32_t); + +void test(char8_t u8, char16_t u16, char32_t u32) { + c8(u8); + c8(u16); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' may lose precision and change the meaning of the represented code unit}} + c8(u32); // expected-warning {{implicit conversion from 'char32_t' to 'char8_t' may lose precision and change the meaning of the represented code unit}} + + c16(u8); // 
expected-warning {{implicit conversion from 'char8_t' to 'char16_t' may change the meaning of the represented code unit}} + c16(u16); + c16(u32); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' may lose precision and change the meaning of the represented code unit}} + + c32(u8); // expected-warning {{implicit conversion from 'char8_t' to 'char32_t' may change the meaning of the represented code unit}} + c32(u16); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' may change the meaning of the represented code unit}} + c32(u32); + + + c8(char32_t(0x7f)); + c8(char32_t(0x80)); // expected-warning {{implicit conversion from 'char32_t' to 'char8_t' changes the meaning of the code point ''}} + + c8(char16_t(0x7f)); + c8(char16_t(0x80)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the code point ''}} + c8(char16_t(0xD800)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the code unit '<0xD800>'}} + c8(char16_t(0xE000)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the code point ''}} + + + c16(char32_t(0x7f)); + c16(char32_t(0x80)); + c16(char32_t(0xD7FF)); + c16(char32_t(0xD800)); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code unit '<0xD800>'}} + c16(char32_t(0xE000)); + c16(char32_t(U'🐉')); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code point '🐉'}} + + + c32(char8_t(0x7f)); + c32(char8_t(0x80)); // expected-warning {{implicit conversion from 'char8_t' to 'char32_t' changes the meaning of the code unit '<0x80>'}} + c32(char8_t(0xFF)); // expected-warning {{implicit conversion from 'char8_t' to 'char32_t' changes the meaning of the code unit '<0xFF>'}} + + + c32(char16_t(0x7f)); + c32(char16_t(0x80)); + + c32(char16_t(0xD7FF)); + c32(char16_t(0xD800)); // expected-warning 
{{implicit conversion from 'char16_t' to 'char32_t' changes the meaning of the code unit '<0xD800>'}} + c32(char16_t(0xDFFF)); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' changes the meaning of the code unit '<0xDFFF>'}} + c32(char16_t(0xE000)); + c32(char16_t(u'☕')); + + (void)static_cast(char8_t(0x80)); //no warnings for explicit conversions. + + using Char8 = char8_t; + Char8 c81 = u16; // expected-warning {{implicit conversion from 'char16_t' to 'Char8' (aka 'char8_t') may lose precision and change the meaning of the represented code unit}} + + [[maybe_unused]] char c = u16; // expected-warning {{implicit conversion loses integer precision: 'char16_t' to 'char'}} + + // FIXME: We should apply the same logic to wchar + [[maybe_unused]] wchar_t wc = u16; + [[maybe_unused]] wchar_t wc2 = u8; +} + +void test_comp(char8_t u8, char16_t u16, char32_t u32) { + (void)(u8 == u8' '); + (void)(u8 == u' '); + (void)(u8 == U' '); + + (void)(u16 == u8' '); + (void)(u16 == U' '); + + (void)(u32 == u8' '); + (void)(u32 == u' '); + (void)(u32 == U' '); + + (void)(u8 == u'\u00FF'); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' may compare different code points}} + (void)(u8 == U'\u00FF'); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' may compare different code points}} + + (void)(u16 == u8'\xFF'); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' may compare different code points}} + (void)(u16 == u'\u00FF'); + (void)(u16 == U'\u00FF'); + (void)(u16 == U'\xD800'); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' may compare different code points}} + + (void)(u32 == u8'\xFF'); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' may compare different code points}} + (void)(u32 == u'\u00FF'); + 
(void)(u32 == u'\xD800'); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' may compare different code points}} + + (void)(char8_t(0x7f) == char8_t(0x7f)); + (void)(char8_t(0x7f) == char16_t(0x7f)); + (void)(char8_t(0x7f) == char32_t(0x7f)); + + (void)(char8_t(0x80) == char8_t(0x80)); + (void)(char8_t(0x80) == char16_t(0x80)); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' compares unrelated code units '<0x80>' and '}} + (void)(char8_t(0x80) == char32_t(0x80)); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' compares unrelated code units '<0x80>' and '}} + + (void)(char8_t(0x80) == char8_t(0x7f)); + (void)(char8_t(0x80) == char16_t(0x7f)); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' compares unrelated code units '<0x80>' and ''}} + (void)(char8_t(0x80) == char32_t(0x7f)); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' compares unrelated code units '<0x80>' and ''}} + + + (void)(char16_t(0x7f) < char8_t(0x7f)); + (void)(char16_t(0x7f) < char16_t(0x7f)); + (void)(char16_t(0x7f) < char32_t(0x7f)); + + (void)(char16_t(0x80) < char8_t(0x80)); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' compares unrelated code units '' and '<0x80>'}} + (void)(char16_t(0x80) < char16_t(0x80)); + (void)(char16_t(0x80) < char32_t(0x80)); + + (void)(char16_t(0x80) == char8_t(0x7f)); + (void)(char16_t(0x80) < char16_t(0x7f)); + (void)(char16_t(0x80) < char32_t(0x7f)); + + + (void)(char32_t(0x7f) < char8_t(0x7f)); + (void)(char32_t(0x7f) < char16_t(0x7f)); + (void)(char32_t(0x7f) < char32_t(0x7f)); + + (void)(char32_t(0x80) < char8_t(0x80)); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' compares unrelated code units '' and 
'<0x80>'}} + (void)(char32_t(0x80) < char16_t(0x80)); + (void)(char32_t(0x80) < char32_t(0x80)); + + (void)(char32_t(0x80) == char8_t(0x7f)); + (void)(char32_t(0x80) < char16_t(0x7f)); + (void)(char32_t(0x80) < char32_t(0x7f)); + + + (void)(char32_t(U'🐉') <= char16_t(0xD800)); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' compares unrelated code units '🐉' and '<0xD800>'}} + (void)(char32_t(U'🐉') <= char16_t(0xD7FF)); + + (void)(char16_t(0xD800) >= char32_t(U'🐉')); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' compares unrelated code units '<0xD800>' and '🐉'}} + (void)(char16_t(0xD7FF) >= char32_t(U'🐉')); +} + +void check_arithmetic(char8_t u8, char16_t u16, char32_t u32) { + + (void)(u8 + u8); + (void)(u16 += u16); + (void)(u32 & u32); + (void)(1 ? u16 : u16); + + (void)(u8 + u16); // expected-warning {{arithmetic between different Unicode character types 'char8_t' and 'char16_t'}} + (void)(u8 += u16); // expected-warning {{compound assignment of different Unicode character types 'char8_t' and 'char16_t'}} + (void)(u8 & u16); // expected-warning {{bitwise operation between different Unicode character types 'char8_t' and 'char16_t'}} + (void)(1 ? u8 : u16); // expected-warning {{conditional expression between different Unicode character types 'char8_t' and 'char16_t'}} + + + (void)(u16 * u32); // expected-warning {{arithmetic between different Unicode character types 'char16_t' and 'char32_t'}} + (void)(u16 -= u32); // expected-warning {{compound assignment of different Unicode character types 'char16_t' and 'char32_t'}} + (void)(u16 | u32); // expected-warning {{bitwise operation between different Unicode character types 'char16_t' and 'char32_t'}} + (void)(1 ? 
u32 : u16); // expected-warning {{conditional expression between different Unicode character types 'char32_t' and 'char16_t'}} +} diff --git a/clang/test/SemaCXX/warn-nrvo.cpp b/clang/test/SemaCXX/warn-nrvo.cpp new file mode 100644 index 0000000000000..55bbdbd3e6e40 --- /dev/null +++ b/clang/test/SemaCXX/warn-nrvo.cpp @@ -0,0 +1,73 @@ +// RUN: %clang_cc1 -fsyntax-only -Wnrvo -Wno-return-mismatch -verify %s +struct MyClass { + int value; + int c; + MyClass(int v) : value(v), c(0) {} + MyClass(const MyClass& other) : value(other.value) { c++; } +}; + +MyClass create_object(bool condition) { + MyClass obj1(1); + MyClass obj2(2); + if (condition) { + return obj1; // expected-warning{{not eliding copy on return}} + } + return obj2; // expected-warning{{not eliding copy on return}} +} + +MyClass create_object2(){ + MyClass obj(1); + return obj; // no warning +} + +template +T create_object3(){ + T obj(1); + return obj; // no warning +} + +// Known issue: if a function template uses a +// deduced return type (i.e. auto or decltype(auto)), +// then NRVO is not applied for any instantiation of +// that function template +// (see https://github.com/llvm/llvm-project/issues/95280). 
+template +auto create_object4(){ + T obj(1); + return obj; // expected-warning{{not eliding copy on return}} +} + +template +MyClass create_object5(){ + MyClass obj1(1); + if constexpr (F){ + MyClass obj2(2); + return obj2; // no warning + } + // Missed NRVO optimization by clang + return obj1; // expected-warning{{not eliding copy on return}} +} + +constexpr bool Flag = false; + +MyClass create_object6(){ + MyClass obj1(1); + if constexpr (Flag){ + MyClass obj2(2); + return obj2; // expected-warning{{not eliding copy on return}} + } + return obj1; // no warning +} + +void create_object7(){ + if constexpr (Flag){ + MyClass obj1(1); + return obj1; // no warning + } +} + +void init_templates(){ + create_object3(); // no warning + create_object4(); // expected-note {{in instantiation of function template specialization 'create_object4' requested here}} + create_object5(); // expected-note {{in instantiation of function template specialization 'create_object5' requested here}} +} diff --git a/clang/test/SemaObjCXX/cxxoperator-selector.mm b/clang/test/SemaObjCXX/cxxoperator-selector.mm index 8134b82ebacbb..2348bd783f519 100644 --- a/clang/test/SemaObjCXX/cxxoperator-selector.mm +++ b/clang/test/SemaObjCXX/cxxoperator-selector.mm @@ -19,5 +19,10 @@ - (id) and{return 0; }; - (id) xor{return 0; }; - (id) or{return 0; }; +- (void)decltype {} +- (void)constexpr {} +- (void)noexcept {} +- (void)nullptr {} + - (void)dataSetForValuesBetween:(NSDate *)startDate and:(NSDate *)endDate { return; } @end diff --git a/clang/test/SemaOpenACC/gh139894.cpp b/clang/test/SemaOpenACC/gh139894.cpp new file mode 100644 index 0000000000000..f1f6298a97665 --- /dev/null +++ b/clang/test/SemaOpenACC/gh139894.cpp @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +// Ensure that these don't assert, they previously assumed that their directive +// kind would be valid, but we should make sure that we handle that gracefully +// in cases where they don't. 
+ +// expected-error@+1{{invalid OpenACC directive 'foo'}} +#pragma acc foo gang(1) + +// expected-error@+1{{invalid OpenACC directive 'foo'}} +#pragma acc foo vector(1) + +// expected-error@+1{{invalid OpenACC directive 'foo'}} +#pragma acc foo worker(1) diff --git a/clang/test/SemaTemplate/GH55509.cpp b/clang/test/SemaTemplate/GH55509.cpp index 773a84305a0cd..b1ba8e513356d 100644 --- a/clang/test/SemaTemplate/GH55509.cpp +++ b/clang/test/SemaTemplate/GH55509.cpp @@ -110,3 +110,38 @@ namespace regression2 { } template void A::f(); } // namespace regression2 + +namespace GH139226 { + +struct FakeStream {}; + +template +class BinaryTree; + +template +FakeStream& operator<<(FakeStream& os, BinaryTree& b); + +template +FakeStream& operator>>(FakeStream& os, BinaryTree& b) { + return os; +} + +template +struct BinaryTree { + T* root{}; + friend FakeStream& operator<< (FakeStream& os, BinaryTree&) { + // expected-error@-1 {{friend function specialization cannot be defined}} + return os; + } + + friend FakeStream& operator>> (FakeStream& os, BinaryTree&); +}; + +void foo() { + FakeStream fakeout; + BinaryTree a{}; + fakeout << a; + fakeout >> a; +} + +} diff --git a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp index e5d00491d3fb8..bf505dec0ca14 100644 --- a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp +++ b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp @@ -853,3 +853,18 @@ template requires C auto TplClass::buggy() -> void {} } + +namespace GH139476 { + +namespace moo { + template + constexpr bool baa = true; + + template requires baa + void caw(); +} + +template requires moo::baa +void moo::caw() {} + +} diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py index f963b656b663c..2b35bb5dcbdaf 100644 --- a/clang/test/lit.cfg.py +++ b/clang/test/lit.cfg.py @@ -70,6 +70,8 @@ llvm_config.use_clang() +config.substitutions.append(("%src_dir", config.clang_src_dir)) + 
config.substitutions.append(("%src_include_dir", config.clang_src_dir + "/include")) config.substitutions.append(("%target_triple", config.target_triple)) diff --git a/clang/tools/clang-refactor/ClangRefactor.cpp b/clang/tools/clang-refactor/ClangRefactor.cpp index 968f0594085d4..a92b3f91beaed 100644 --- a/clang/tools/clang-refactor/ClangRefactor.cpp +++ b/clang/tools/clang-refactor/ClangRefactor.cpp @@ -160,7 +160,8 @@ SourceSelectionArgument::fromString(StringRef Value) { return std::make_unique(std::move(*Range)); llvm::errs() << "error: '-selection' option must be specified using " ":: or " - "::-: format\n"; + "::-: format, " + "where and are integers greater than zero.\n"; return nullptr; } diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index dae2b9a9fe683..3b42267f4d5f4 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -346,7 +346,10 @@ template static auto toJSONStrings(llvm::json::OStream &JOS, Container &&Strings) { return [&JOS, Strings = std::forward(Strings)] { for (StringRef Str : Strings) - JOS.value(Str); + // Not reporting SDKSettings.json so that test checks can remain (mostly) + // platform-agnostic. + if (!Str.ends_with("SDKSettings.json")) + JOS.value(Str); }; } @@ -498,7 +501,12 @@ class FullDeps { toJSONStrings(JOS, MD.getBuildArguments())); JOS.attribute("context-hash", StringRef(MD.ID.ContextHash)); JOS.attributeArray("file-deps", [&] { - MD.forEachFileDep([&](StringRef FileDep) { JOS.value(FileDep); }); + MD.forEachFileDep([&](StringRef FileDep) { + // Not reporting SDKSettings.json so that test checks can remain + // (mostly) platform-agnostic. 
+ if (!FileDep.ends_with("SDKSettings.json")) + JOS.value(FileDep); + }); }); JOS.attributeArray("link-libraries", toJSONSorted(JOS, MD.LinkLibraries)); diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index cddd301e22e50..190bf64d35eda 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -308,8 +308,10 @@ TEST_P(ImportExpr, ImportShuffleVectorExpr) { const auto Pattern = functionDecl(hasDescendant(shuffleVectorExpr( allOf(has(declRefExpr(to(parmVarDecl(hasName("a"))))), has(declRefExpr(to(parmVarDecl(hasName("b"))))), - has(integerLiteral(equals(0))), has(integerLiteral(equals(1))), - has(integerLiteral(equals(2))), has(integerLiteral(equals(3))))))); + has(constantExpr(has(integerLiteral(equals(0))))), + has(constantExpr(has(integerLiteral(equals(1))))), + has(constantExpr(has(integerLiteral(equals(2))))), + has(constantExpr(has(integerLiteral(equals(3))))))))); testImport(Code, Lang_C99, "", Lang_C99, Verifier, Pattern); } diff --git a/clang/unittests/CIR/CMakeLists.txt b/clang/unittests/CIR/CMakeLists.txt new file mode 100644 index 0000000000000..650fde38c48a9 --- /dev/null +++ b/clang/unittests/CIR/CMakeLists.txt @@ -0,0 +1,16 @@ +set(MLIR_INCLUDE_DIR ${LLVM_MAIN_SRC_DIR}/../mlir/include ) +set(MLIR_TABLEGEN_OUTPUT_DIR ${CMAKE_BINARY_DIR}/tools/mlir/include) +include_directories(SYSTEM ${MLIR_INCLUDE_DIR}) +include_directories(${MLIR_TABLEGEN_OUTPUT_DIR}) + +add_distinct_clang_unittest(CIRUnitTests + PointerLikeTest.cpp + LLVM_COMPONENTS + Core + + LINK_LIBS + MLIRCIR + CIROpenACCSupport + MLIRIR + MLIROpenACCDialect + ) diff --git a/clang/unittests/CIR/PointerLikeTest.cpp b/clang/unittests/CIR/PointerLikeTest.cpp new file mode 100644 index 0000000000000..c0da271d56d4c --- /dev/null +++ b/clang/unittests/CIR/PointerLikeTest.cpp @@ -0,0 +1,364 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under 
the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Unit tests for CIR implementation of OpenACC's PointerLikeType interface +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Value.h" +#include "clang/CIR/Dialect/Builder/CIRBaseBuilder.h" +#include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" +#include "clang/CIR/Dialect/OpenACC/CIROpenACCTypeInterfaces.h" +#include "clang/CIR/Dialect/OpenACC/RegisterOpenACCExtensions.h" +#include "gtest/gtest.h" + +using namespace mlir; +using namespace cir; + +//===----------------------------------------------------------------------===// +// Test Fixture +//===----------------------------------------------------------------------===// + +class CIROpenACCPointerLikeTest : public ::testing::Test { +protected: + CIROpenACCPointerLikeTest() : b(&context), loc(UnknownLoc::get(&context)) { + context.loadDialect(); + context.loadDialect(); + + // Register extension to integrate CIR types with OpenACC. + mlir::DialectRegistry registry; + cir::acc::registerOpenACCExtensions(registry); + context.appendDialectRegistry(registry); + } + + MLIRContext context; + OpBuilder b; + Location loc; + llvm::StringMap recordNames; + + mlir::IntegerAttr getAlignOne(mlir::MLIRContext *ctx) { + // Note that mlir::IntegerType is used instead of cir::IntType here + // because we don't need sign information for this to be useful, so keep + // it simple.
+ clang::CharUnits align = clang::CharUnits::One(); + return mlir::IntegerAttr::get(mlir::IntegerType::get(ctx, 64), + align.getQuantity()); + } + + mlir::StringAttr getUniqueRecordName(const std::string &baseName) { + auto it = recordNames.find(baseName); + if (it == recordNames.end()) { + recordNames[baseName] = 0; + return b.getStringAttr(baseName); + } + + return b.getStringAttr(baseName + "." + + std::to_string(recordNames[baseName]++)); + } + + // General handler for types without a specific test + void testSingleType(mlir::Type ty, + mlir::acc::VariableTypeCategory expectedTypeCategory) { + mlir::Type ptrTy = cir::PointerType::get(ty); + + // cir::PointerType should be castable to acc::PointerLikeType + auto pltTy = dyn_cast_if_present(ptrTy); + ASSERT_NE(pltTy, nullptr); + + EXPECT_EQ(pltTy.getElementType(), ty); + + OwningOpRef varPtrOp = + b.create(loc, ptrTy, ty, "", getAlignOne(&context)); + + mlir::Value val = varPtrOp.get(); + mlir::acc::VariableTypeCategory typeCategory = pltTy.getPointeeTypeCategory( + cast>(val), + mlir::acc::getVarType(varPtrOp.get())); + + EXPECT_EQ(typeCategory, expectedTypeCategory); + } + + void testScalarType(mlir::Type ty) { + testSingleType(ty, mlir::acc::VariableTypeCategory::scalar); + } + + void testNonScalarType(mlir::Type ty) { + testSingleType(ty, mlir::acc::VariableTypeCategory::nonscalar); + } + + void testUncategorizedType(mlir::Type ty) { + testSingleType(ty, mlir::acc::VariableTypeCategory::uncategorized); + } + + void testArrayType(mlir::Type ty) { + // Build the array pointer type. + mlir::Type arrTy = cir::ArrayType::get(ty, 10); + mlir::Type ptrTy = cir::PointerType::get(arrTy); + + // Verify that the pointer points to the array type.
+ auto pltTy = dyn_cast_if_present(ptrTy); + ASSERT_NE(pltTy, nullptr); + EXPECT_EQ(pltTy.getElementType(), arrTy); + + // Create an alloca for the array + OwningOpRef varPtrOp = + b.create(loc, ptrTy, arrTy, "", getAlignOne(&context)); + + // Verify that the type category is array. + mlir::Value val = varPtrOp.get(); + mlir::acc::VariableTypeCategory typeCategory = pltTy.getPointeeTypeCategory( + cast>(val), + mlir::acc::getVarType(varPtrOp.get())); + EXPECT_EQ(typeCategory, mlir::acc::VariableTypeCategory::array); + + // Create an array-to-pointer decay cast. + mlir::Type ptrToElemTy = cir::PointerType::get(ty); + OwningOpRef decayPtr = b.create( + loc, ptrToElemTy, cir::CastKind::array_to_ptrdecay, val); + mlir::Value decayVal = decayPtr.get(); + + // Verify that we still get the expected element type. + auto decayPltTy = + dyn_cast_if_present(decayVal.getType()); + ASSERT_NE(decayPltTy, nullptr); + EXPECT_EQ(decayPltTy.getElementType(), ty); + + // Verify that we still identify the type category as an array. + mlir::acc::VariableTypeCategory decayTypeCategory = + decayPltTy.getPointeeTypeCategory( + cast>(decayVal), + mlir::acc::getVarType(decayPtr.get())); + EXPECT_EQ(decayTypeCategory, mlir::acc::VariableTypeCategory::array); + + // Create an element access. + mlir::Type i32Ty = cir::IntType::get(&context, 32, true); + mlir::Value index = + b.create(loc, cir::IntAttr::get(i32Ty, 2)); + OwningOpRef accessPtr = + b.create(loc, ptrToElemTy, decayVal, index); + mlir::Value accessVal = accessPtr.get(); + + // Verify that we still get the expected element type. + auto accessPltTy = + dyn_cast_if_present(accessVal.getType()); + ASSERT_NE(accessPltTy, nullptr); + EXPECT_EQ(accessPltTy.getElementType(), ty); + + // Verify that we still identify the type category as an array. 
+ mlir::acc::VariableTypeCategory accessTypeCategory = + accessPltTy.getPointeeTypeCategory( + cast>(accessVal), + mlir::acc::getVarType(accessPtr.get())); + EXPECT_EQ(accessTypeCategory, mlir::acc::VariableTypeCategory::array); + } + + // Structures and unions are accessed in the same way, so use a common test. + void testRecordType(mlir::Type ty1, mlir::Type ty2, + cir::RecordType::RecordKind kind) { + // Build the structure pointer type. + cir::RecordType structTy = + cir::RecordType::get(&context, getUniqueRecordName("S"), kind); + structTy.complete({ty1, ty2}, false, false); + mlir::Type ptrTy = cir::PointerType::get(structTy); + + // Verify that the pointer points to the structure type. + auto pltTy = dyn_cast_if_present(ptrTy); + ASSERT_NE(pltTy, nullptr); + EXPECT_EQ(pltTy.getElementType(), structTy); + + // Create an alloca for the array + OwningOpRef varPtrOp = b.create( + loc, ptrTy, structTy, "", getAlignOne(&context)); + + // Verify that the type category is composite. + mlir::Value val = varPtrOp.get(); + mlir::acc::VariableTypeCategory typeCategory = pltTy.getPointeeTypeCategory( + cast>(val), + mlir::acc::getVarType(varPtrOp.get())); + EXPECT_EQ(typeCategory, mlir::acc::VariableTypeCategory::composite); + + // Access the first element of the structure. + OwningOpRef access1 = b.create( + loc, cir::PointerType::get(ty1), val, b.getStringAttr("f1"), 0); + mlir::Value accessVal1 = access1.get(); + + // Verify that we get the expected element type. + auto access1PltTy = + dyn_cast_if_present(accessVal1.getType()); + ASSERT_NE(access1PltTy, nullptr); + EXPECT_EQ(access1PltTy.getElementType(), ty1); + + // Verify that the type category is still composite. + mlir::acc::VariableTypeCategory access1TypeCategory = + access1PltTy.getPointeeTypeCategory( + cast>(accessVal1), + mlir::acc::getVarType(access1.get())); + EXPECT_EQ(access1TypeCategory, mlir::acc::VariableTypeCategory::composite); + + // Access the second element of the structure. 
+ OwningOpRef access2 = b.create( + loc, cir::PointerType::get(ty2), val, b.getStringAttr("f2"), 1); + mlir::Value accessVal2 = access2.get(); + + // Verify that we get the expected element type. + auto access2PltTy = + dyn_cast_if_present(accessVal2.getType()); + ASSERT_NE(access2PltTy, nullptr); + EXPECT_EQ(access2PltTy.getElementType(), ty2); + + // Verify that the type category is still composite. + mlir::acc::VariableTypeCategory access2TypeCategory = + access2PltTy.getPointeeTypeCategory( + cast>(accessVal2), + mlir::acc::getVarType(access2.get())); + EXPECT_EQ(access2TypeCategory, mlir::acc::VariableTypeCategory::composite); + } + + void testStructType(mlir::Type ty1, mlir::Type ty2) { + testRecordType(ty1, ty2, cir::RecordType::RecordKind::Struct); + } + + void testUnionType(mlir::Type ty1, mlir::Type ty2) { + testRecordType(ty1, ty2, cir::RecordType::RecordKind::Union); + } + + // This is testing a case like this: + // + // struct S { + // int *f1; + // int *f2; + // } *p; + // int *pMember = p->f2; + // + // That is, we are not testing a pointer to a member, we're testing a pointer + // that is loaded as a member value. + void testPointerToMemberType( + mlir::Type ty, mlir::acc::VariableTypeCategory expectedTypeCategory) { + // Construct a struct type with two members that are pointers to the input + // type. + mlir::Type ptrTy = cir::PointerType::get(ty); + cir::RecordType structTy = + cir::RecordType::get(&context, getUniqueRecordName("S"), + cir::RecordType::RecordKind::Struct); + structTy.complete({ptrTy, ptrTy}, false, false); + mlir::Type structPptrTy = cir::PointerType::get(structTy); + + // Create an alloca for the struct. + OwningOpRef varPtrOp = b.create( + loc, structPptrTy, structTy, "S", getAlignOne(&context)); + mlir::Value val = varPtrOp.get(); + + // Get a pointer to the second member. 
+ OwningOpRef access = b.create( + loc, cir::PointerType::get(ptrTy), val, b.getStringAttr("f2"), 1); + mlir::Value accessVal = access.get(); + + // Load the value of the second member. This is the pointer we want to test. + OwningOpRef loadOp = b.create(loc, accessVal); + mlir::Value loadVal = loadOp.get(); + + // Verify that the type category is the expected type category. + auto pltTy = dyn_cast_if_present(ptrTy); + mlir::acc::VariableTypeCategory typeCategory = pltTy.getPointeeTypeCategory( + cast>(loadVal), + mlir::acc::getVarType(loadOp.get())); + + EXPECT_EQ(typeCategory, expectedTypeCategory); + } +}; + +TEST_F(CIROpenACCPointerLikeTest, testPointerToInt) { + // Test various scalar types. + testScalarType(cir::IntType::get(&context, 8, true)); + testScalarType(cir::IntType::get(&context, 8, false)); + testScalarType(cir::IntType::get(&context, 16, true)); + testScalarType(cir::IntType::get(&context, 16, false)); + testScalarType(cir::IntType::get(&context, 32, true)); + testScalarType(cir::IntType::get(&context, 32, false)); + testScalarType(cir::IntType::get(&context, 64, true)); + testScalarType(cir::IntType::get(&context, 64, false)); + testScalarType(cir::IntType::get(&context, 128, true)); + testScalarType(cir::IntType::get(&context, 128, false)); +} + +TEST_F(CIROpenACCPointerLikeTest, testPointerToBool) { + testScalarType(cir::BoolType::get(&context)); +} + +TEST_F(CIROpenACCPointerLikeTest, testPointerToFloat) { + testScalarType(cir::SingleType::get(&context)); + testScalarType(cir::DoubleType::get(&context)); +} + +TEST_F(CIROpenACCPointerLikeTest, testPointerToPointer) { + mlir::Type i32Ty = cir::IntType::get(&context, 32, true); + mlir::Type ptrTy = cir::PointerType::get(i32Ty); + testScalarType(ptrTy); +} + +TEST_F(CIROpenACCPointerLikeTest, testPointerToArray) { + // Test an array type. 
+ mlir::Type i32Ty = cir::IntType::get(&context, 32, true); + testArrayType(i32Ty); +} + +TEST_F(CIROpenACCPointerLikeTest, testPointerToStruct) { + // Test a struct type. + mlir::Type i16Ty = cir::IntType::get(&context, 16, true); + mlir::Type i32Ty = cir::IntType::get(&context, 32, true); + testStructType(i16Ty, i32Ty); +} + +TEST_F(CIROpenACCPointerLikeTest, testPointerToUnion) { + // Test a union type. + mlir::Type i16Ty = cir::IntType::get(&context, 16, true); + mlir::Type i32Ty = cir::IntType::get(&context, 32, true); + testUnionType(i16Ty, i32Ty); +} + +TEST_F(CIROpenACCPointerLikeTest, testPointerToFunction) { + mlir::Type i32Ty = cir::IntType::get(&context, 32, true); + mlir::Type funcTy = + cir::FuncType::get(SmallVector{i32Ty, i32Ty}, i32Ty); + testNonScalarType(funcTy); +} + +TEST_F(CIROpenACCPointerLikeTest, testPointerToVector) { + mlir::Type i32Ty = cir::IntType::get(&context, 32, true); + mlir::Type vecTy = cir::VectorType::get(i32Ty, 4); + testNonScalarType(vecTy); +} + +TEST_F(CIROpenACCPointerLikeTest, testPointerToVoid) { + mlir::Type voidTy = cir::VoidType::get(&context); + testUncategorizedType(voidTy); +} + +TEST_F(CIROpenACCPointerLikeTest, testPointerToIntMember) { + mlir::Type i32Ty = cir::IntType::get(&context, 32, true); + testPointerToMemberType(i32Ty, mlir::acc::VariableTypeCategory::scalar); +} + +TEST_F(CIROpenACCPointerLikeTest, testPointerToArrayMember) { + mlir::Type i32Ty = cir::IntType::get(&context, 32, true); + mlir::Type arrTy = cir::ArrayType::get(i32Ty, 10); + testPointerToMemberType(arrTy, mlir::acc::VariableTypeCategory::array); +} + +TEST_F(CIROpenACCPointerLikeTest, testPointerToStructMember) { + mlir::Type i32Ty = cir::IntType::get(&context, 32, true); + cir::RecordType structTy = cir::RecordType::get( + &context, getUniqueRecordName("S"), cir::RecordType::RecordKind::Struct); + structTy.complete({i32Ty, i32Ty}, false, false); + testPointerToMemberType(structTy, mlir::acc::VariableTypeCategory::composite); +} diff 
--git a/clang/unittests/CMakeLists.txt b/clang/unittests/CMakeLists.txt index b4114d419b75c..aef28f914b640 100644 --- a/clang/unittests/CMakeLists.txt +++ b/clang/unittests/CMakeLists.txt @@ -105,7 +105,9 @@ add_subdirectory(Index) add_subdirectory(InstallAPI) add_subdirectory(Serialization) add_subdirectory(Support) - +if (CLANG_ENABLE_CIR) + add_subdirectory(CIR) +endif() # If we're doing a single merged clang unit test binary, add that target after # all the previous subdirectories have been processed. diff --git a/clang/unittests/Driver/ToolChainTest.cpp b/clang/unittests/Driver/ToolChainTest.cpp index 9fe8cd18beb9b..c1ffe4a82ce4b 100644 --- a/clang/unittests/Driver/ToolChainTest.cpp +++ b/clang/unittests/Driver/ToolChainTest.cpp @@ -96,7 +96,7 @@ TEST(ToolChainTest, VFSGCCInstallation) { C->getDefaultToolChain().printVerboseInfo(OS); } if (is_style_windows(llvm::sys::path::Style::native)) - std::replace(S.begin(), S.end(), '\\', '/'); + llvm::replace(S, '\\', '/'); EXPECT_EQ( "Found candidate GCC installation: " "/usr/lib/gcc/arm-linux-gnueabihf/4.6.3\n" @@ -120,7 +120,7 @@ TEST(ToolChainTest, VFSGCCInstallation) { C->getDefaultToolChain().printVerboseInfo(OS); } if (is_style_windows(llvm::sys::path::Style::native)) - std::replace(S.begin(), S.end(), '\\', '/'); + llvm::replace(S, '\\', '/'); // Test that 4.5.3 from --sysroot is not overridden by 4.6.3 (larger // version) from /usr. 
EXPECT_EQ("Found candidate GCC installation: " @@ -162,7 +162,7 @@ TEST(ToolChainTest, VFSGCCInstallationRelativeDir) { C->getDefaultToolChain().printVerboseInfo(OS); } if (is_style_windows(llvm::sys::path::Style::native)) - std::replace(S.begin(), S.end(), '\\', '/'); + llvm::replace(S, '\\', '/'); EXPECT_EQ("Found candidate GCC installation: " "/home/test/bin/../lib/gcc/arm-linux-gnueabi/4.6.1\n" "Selected GCC installation: " @@ -213,7 +213,7 @@ TEST(ToolChainTest, VFSSolarisMultiGCCInstallation) { C->getDefaultToolChain().printVerboseInfo(OS); } if (is_style_windows(llvm::sys::path::Style::native)) - std::replace(S.begin(), S.end(), '\\', '/'); + llvm::replace(S, '\\', '/'); EXPECT_EQ("Found candidate GCC installation: " "/usr/gcc/11/lib/gcc/x86_64-pc-solaris2.11/11.4.0\n" "Selected GCC installation: " @@ -237,7 +237,7 @@ TEST(ToolChainTest, VFSSolarisMultiGCCInstallation) { C->getDefaultToolChain().printVerboseInfo(OS); } if (is_style_windows(llvm::sys::path::Style::native)) - std::replace(S.begin(), S.end(), '\\', '/'); + llvm::replace(S, '\\', '/'); EXPECT_EQ("Found candidate GCC installation: " "/usr/gcc/11/lib/gcc/x86_64-pc-solaris2.11/11.4.0\n" "Selected GCC installation: " @@ -261,7 +261,7 @@ TEST(ToolChainTest, VFSSolarisMultiGCCInstallation) { C->getDefaultToolChain().printVerboseInfo(OS); } if (is_style_windows(llvm::sys::path::Style::native)) - std::replace(S.begin(), S.end(), '\\', '/'); + llvm::replace(S, '\\', '/'); EXPECT_EQ("Found candidate GCC installation: " "/usr/gcc/11/lib/gcc/x86_64-pc-solaris2.11/11.4.0\n" "Selected GCC installation: " @@ -285,7 +285,7 @@ TEST(ToolChainTest, VFSSolarisMultiGCCInstallation) { C->getDefaultToolChain().printVerboseInfo(OS); } if (is_style_windows(llvm::sys::path::Style::native)) - std::replace(S.begin(), S.end(), '\\', '/'); + llvm::replace(S, '\\', '/'); EXPECT_EQ("Found candidate GCC installation: " "/usr/gcc/11/lib/gcc/sparcv9-sun-solaris2.11/11.4.0\n" "Selected GCC installation: " @@ -308,7 +308,7 @@ 
TEST(ToolChainTest, VFSSolarisMultiGCCInstallation) { C->getDefaultToolChain().printVerboseInfo(OS); } if (is_style_windows(llvm::sys::path::Style::native)) - std::replace(S.begin(), S.end(), '\\', '/'); + llvm::replace(S, '\\', '/'); EXPECT_EQ("Found candidate GCC installation: " "/usr/gcc/11/lib/gcc/sparcv9-sun-solaris2.11/11.4.0\n" "Selected GCC installation: " @@ -329,7 +329,7 @@ MATCHER_P(jobHasArgs, Substr, "") { Args += Arg; } if (is_style_windows(llvm::sys::path::Style::native)) - std::replace(Args.begin(), Args.end(), '\\', '/'); + llvm::replace(Args, '\\', '/'); if (llvm::StringRef(Args).contains(Substr)) return true; *result_listener << "whose args are '" << Args << "'"; diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index b7b2580d72a0e..39493d718f0af 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -2053,8 +2053,20 @@ void NeonEmitter::createIntrinsic(const Record *R, auto &Entry = IntrinsicMap[Name]; for (auto &I : NewTypeSpecs) { + + // MFloat8 type is only available on AArch64. If encountered set ArchGuard + // correctly. 
+ std::string NewArchGuard = ArchGuard; + if (Type(I.first, ".").isMFloat8()) { + if (NewArchGuard.empty()) { + NewArchGuard = "defined(__aarch64__)"; + } else if (NewArchGuard.find("defined(__aarch64__)") == + std::string::npos) { + NewArchGuard = "defined(__aarch64__) && (" + NewArchGuard + ")"; + } + } Entry.emplace_back(R, Name, Proto, I.first, I.second, CK, Body, *this, - ArchGuard, TargetGuard, IsUnavailable, BigEndianSafe); + NewArchGuard, TargetGuard, IsUnavailable, BigEndianSafe); Out.push_back(&Entry.back()); } diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake index d346b0ec01b03..86e19e08270d7 100644 --- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake +++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake @@ -123,7 +123,7 @@ macro(set_output_name output name arch) else() if(ANDROID AND ${arch} STREQUAL "i386") set(${output} "${name}-i686${COMPILER_RT_OS_SUFFIX}") - elseif("${arch}" MATCHES "^arm") + elseif(NOT "${arch}" MATCHES "^arm64" AND "${arch}" MATCHES "^arm") if(COMPILER_RT_DEFAULT_TARGET_ONLY) set(triple "${COMPILER_RT_DEFAULT_TARGET_TRIPLE}") else() diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake index cbb43a5958d2f..8c9c84ad64bc0 100644 --- a/compiler-rt/cmake/builtin-config-ix.cmake +++ b/compiler-rt/cmake/builtin-config-ix.cmake @@ -59,7 +59,7 @@ else() endif() set(AMDGPU amdgcn) -set(ARM64 aarch64) +set(ARM64 aarch64 arm64ec) set(ARM32 arm armhf armv4t armv5te armv6 armv6m armv7m armv7em armv7 armv7s armv7k armv8m.base armv8m.main armv8.1m.main) set(AVR avr) set(HEXAGON hexagon) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 5efc4ab0e85bc..d9b7800a95565 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -668,6 +668,7 @@ set(armv7k_SOURCES ${arm_SOURCES}) set(arm64_SOURCES ${aarch64_SOURCES}) set(arm64e_SOURCES ${aarch64_SOURCES}) 
set(arm64_32_SOURCES ${aarch64_SOURCES}) +set(arm64ec_SOURCES ${aarch64_SOURCES}) # macho_embedded archs set(armv6m_SOURCES ${thumb1_SOURCES}) diff --git a/compiler-rt/lib/builtins/aarch64/chkstk.S b/compiler-rt/lib/builtins/aarch64/chkstk.S index 01f90366f0302..563c09ecbc390 100644 --- a/compiler-rt/lib/builtins/aarch64/chkstk.S +++ b/compiler-rt/lib/builtins/aarch64/chkstk.S @@ -15,12 +15,18 @@ // bl __chkstk // sub sp, sp, x15, lsl #4 -#ifdef __aarch64__ +#if defined(__aarch64__) || defined(__arm64ec__) + +#ifdef __arm64ec__ +#define CHKSTK_FUNC __chkstk_arm64ec +#else +#define CHKSTK_FUNC __chkstk +#endif #define PAGE_SIZE 4096 .p2align 2 -DEFINE_COMPILERRT_FUNCTION(__chkstk) +DEFINE_COMPILERRT_FUNCTION(CHKSTK_FUNC) lsl x16, x15, #4 mov x17, sp 1: @@ -30,6 +36,6 @@ DEFINE_COMPILERRT_FUNCTION(__chkstk) b.gt 1b ret -END_COMPILERRT_FUNCTION(__chkstk) +END_COMPILERRT_FUNCTION(CHKSTK_FUNC) -#endif // __aarch64__ +#endif // defined(__aarch64__) || defined(__arm64ec__) diff --git a/compiler-rt/lib/builtins/aarch64/lse.S b/compiler-rt/lib/builtins/aarch64/lse.S index 1fe18f4a46819..d7c1db7243ef8 100644 --- a/compiler-rt/lib/builtins/aarch64/lse.S +++ b/compiler-rt/lib/builtins/aarch64/lse.S @@ -20,7 +20,7 @@ // Routines may modify temporary registers tmp0, tmp1, tmp2, // return value x0 and the flags only. 
-#ifdef __aarch64__ +#if defined(__aarch64__) || defined(__arm64ec__) #ifdef HAS_ASM_LSE .arch armv8-a+lse @@ -267,4 +267,4 @@ NO_EXEC_STACK_DIRECTIVE // GNU property note for BTI and PAC GNU_PROPERTY_BTI_PAC -#endif // __aarch64__ +#endif // defined(__aarch64__) || defined(__arm64ec__) diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S index e736829967c0c..73b1ab2c76aa3 100644 --- a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S +++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S @@ -235,7 +235,7 @@ END_COMPILERRT_FUNCTION(__arm_sc_memcpy) DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy) // This version uses FP registers. Use this only on targets with them -#if defined(__aarch64__) && __ARM_FP != 0 +#if (defined(__aarch64__) && __ARM_FP != 0) || defined(__arm64ec__) // // __arm_sc_memset // diff --git a/compiler-rt/lib/builtins/clear_cache.c b/compiler-rt/lib/builtins/clear_cache.c index 441eabd1fe922..eb58452d624ee 100644 --- a/compiler-rt/lib/builtins/clear_cache.c +++ b/compiler-rt/lib/builtins/clear_cache.c @@ -59,13 +59,14 @@ uintptr_t GetCurrentProcess(void); // specified range. 
void __clear_cache(void *start, void *end) { -#if __i386__ || __x86_64__ || defined(_M_IX86) || defined(_M_X64) +#if defined(_WIN32) && \ + (defined(__arm__) || defined(__aarch64__) || defined(__arm64ec__)) + FlushInstructionCache(GetCurrentProcess(), start, end - start); +#elif __i386__ || __x86_64__ || defined(_M_IX86) || defined(_M_X64) // Intel processors have a unified instruction and data cache // so there is nothing to do #elif defined(__s390__) // no-op -#elif defined(_WIN32) && (defined(__arm__) || defined(__aarch64__)) - FlushInstructionCache(GetCurrentProcess(), start, end - start); #elif defined(__arm__) && !defined(__APPLE__) #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) struct arm_sync_icache_args arg; diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c index 4082fd62ea11a..be002dd71992a 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64.c +++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c @@ -14,7 +14,8 @@ #include "aarch64.h" -#if !defined(__aarch64__) && !defined(__arm64__) && !defined(_M_ARM64) +#if !defined(__aarch64__) && !defined(__arm64__) && !defined(_M_ARM64) && \ + !defined(__arm64ec__) && !defined(_M_ARM64EC) #error This file is intended only for aarch64-based targets #endif diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.h b/compiler-rt/lib/builtins/cpu_model/aarch64.h index 2a734b02b7c90..3d9b3aa0e594e 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64.h +++ b/compiler-rt/lib/builtins/cpu_model/aarch64.h @@ -8,7 +8,8 @@ #include "cpu_model.h" -#if !defined(__aarch64__) && !defined(__arm64__) && !defined(_M_ARM64) +#if !defined(__aarch64__) && !defined(__arm64__) && !defined(_M_ARM64) && \ + !defined(__arm64ec__) && !defined(_M_ARM64EC) #error This file is intended only for aarch64-based targets #endif diff --git a/compiler-rt/lib/builtins/fp_compare_impl.inc b/compiler-rt/lib/builtins/fp_compare_impl.inc index a9a4f6fbf5dfe..f883338c471d3 
100644 --- a/compiler-rt/lib/builtins/fp_compare_impl.inc +++ b/compiler-rt/lib/builtins/fp_compare_impl.inc @@ -12,7 +12,7 @@ // functions. We need to ensure that the return value is sign-extended in the // same way as GCC expects (since otherwise GCC-generated __builtin_isinf // returns true for finite 128-bit floating-point numbers). -#ifdef __aarch64__ +#if defined(__aarch64__) || defined(__arm64ec__) // AArch64 GCC overrides libgcc_cmp_return to use int instead of long. typedef int CMP_RESULT; #elif __SIZEOF_POINTER__ == 8 && __SIZEOF_LONG__ == 4 diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h index fae58497a8f80..95b24aac1ff1d 100644 --- a/compiler-rt/lib/builtins/fp_lib.h +++ b/compiler-rt/lib/builtins/fp_lib.h @@ -359,7 +359,7 @@ static __inline fp_t __compiler_rt_scalbn(fp_t x, int y) { return __compiler_rt_scalbnX(x, y); } static __inline fp_t __compiler_rt_fmax(fp_t x, fp_t y) { -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(__arm64ec__) // Use __builtin_fmax which turns into an fmaxnm instruction on AArch64. 
return __builtin_fmax(x, y); #else diff --git a/compiler-rt/lib/builtins/udivmodti4.c b/compiler-rt/lib/builtins/udivmodti4.c index 55def37c9e1fe..6ce213fd5f2a4 100644 --- a/compiler-rt/lib/builtins/udivmodti4.c +++ b/compiler-rt/lib/builtins/udivmodti4.c @@ -83,7 +83,7 @@ static inline du_int udiv128by64to64default(du_int u1, du_int u0, du_int v, static inline du_int udiv128by64to64(du_int u1, du_int u0, du_int v, du_int *r) { -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(__arm64ec__) du_int result; __asm__("divq %[v]" : "=a"(result), "=d"(*r) diff --git a/compiler-rt/test/builtins/Unit/enable_execute_stack_test.c b/compiler-rt/test/builtins/Unit/enable_execute_stack_test.c index eb1fa97797ac8..b3cb4df005ca5 100644 --- a/compiler-rt/test/builtins/Unit/enable_execute_stack_test.c +++ b/compiler-rt/test/builtins/Unit/enable_execute_stack_test.c @@ -10,9 +10,22 @@ extern void __enable_execute_stack(void* addr); typedef int (*pfunc)(void); +#ifdef __arm64ec__ +// On ARM64EC, we need the x86_64 version of this function, but the compiler +// would normally generate the AArch64 variant, so we hardcode it here. +static char func1[] = { + 0xb8, 0x01, 0x00, 0x00, 0x00, // movl $0x1, %eax + 0xc3 // retq +}; +static char func2[] = { + 0xb8, 0x02, 0x00, 0x00, 0x00, // movl $0x2, %eax + 0xc3 // retq +}; +#else // Make these static to avoid ILT jumps for incremental linking on Windows. 
static int func1() { return 1; } static int func2() { return 2; } +#endif void *__attribute__((noinline)) memcpy_f(void *dst, const void *src, size_t n) { diff --git a/compiler-rt/test/builtins/Unit/fixunstfdi_test.c b/compiler-rt/test/builtins/Unit/fixunstfdi_test.c index d9f02bf472b5a..982f3a4629dbd 100644 --- a/compiler-rt/test/builtins/Unit/fixunstfdi_test.c +++ b/compiler-rt/test/builtins/Unit/fixunstfdi_test.c @@ -4,7 +4,7 @@ #include -#if _ARCH_PPC || __aarch64__ +#if _ARCH_PPC || __aarch64__ || __arm64ec__ #include "int_lib.h" @@ -35,7 +35,7 @@ char assumption_3[sizeof(long double)*CHAR_BIT == 128] = {0}; int main() { -#if _ARCH_PPC || __aarch64__ +#if _ARCH_PPC || __aarch64__ || __arm64ec__ if (test__fixunstfdi(0.0, 0)) return 1; diff --git a/compiler-rt/test/builtins/Unit/multc3_test.c b/compiler-rt/test/builtins/Unit/multc3_test.c index 06f55a68d991a..e9c99a72be35e 100644 --- a/compiler-rt/test/builtins/Unit/multc3_test.c +++ b/compiler-rt/test/builtins/Unit/multc3_test.c @@ -4,7 +4,7 @@ #include -#if _ARCH_PPC || __aarch64__ +#if _ARCH_PPC || __aarch64__ || __arm64ec__ #include "int_lib.h" #include @@ -348,7 +348,7 @@ long double x[][2] = int main() { -#if _ARCH_PPC || __aarch64__ +#if _ARCH_PPC || __aarch64__ || __arm64ec__ const unsigned N = sizeof(x) / sizeof(x[0]); unsigned i, j; for (i = 0; i < N; ++i) diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_path_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_path_test.cpp index 21ffe1381bd46..ab1017a2efc07 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_path_test.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/sanitizer_set_report_path_test.cpp @@ -18,8 +18,10 @@ int main(int argc, char **argv) { // Try setting again with an invalid/inaccessible directory. 
char buff_bad[1000]; sprintf(buff_bad, "%s/report", argv[0]); + fprintf(stderr, "Expected bad report path: %s\n", buff_bad); + // CHECK: Expected bad report path: [[BADPATH:.*]]/report __sanitizer_set_report_path(buff_bad); assert(strncmp(buff, __sanitizer_get_report_path(), strlen(buff)) == 0); } -// CHECK: ERROR: Can't create directory: {{.*}}Posix/Output/sanitizer_set_report_path_test.cpp.tmp +// CHECK: ERROR: Can't create directory: [[BADPATH]] diff --git a/flang-rt/include/flang-rt/runtime/emit-encoded.h b/flang-rt/include/flang-rt/runtime/emit-encoded.h index d99f56b29558e..ea83901fcc8be 100644 --- a/flang-rt/include/flang-rt/runtime/emit-encoded.h +++ b/flang-rt/include/flang-rt/runtime/emit-encoded.h @@ -64,7 +64,7 @@ RT_API_ATTRS bool EmitEncoded( } else { // CHARACTER kind conversion for internal output while (chars-- > 0) { - char32_t buffer = *data++; + char32_t buffer = static_cast(*data++); char *p{reinterpret_cast(&buffer)}; if constexpr (!isHostLittleEndian) { p += sizeof(buffer) - internalKind; diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp index 4a813cd489022..9be75da9520e3 100644 --- a/flang-rt/lib/runtime/assign.cpp +++ b/flang-rt/lib/runtime/assign.cpp @@ -79,15 +79,18 @@ static RT_API_ATTRS int AllocateAssignmentLHS( to.raw().elem_len = from.ElementBytes(); } const typeInfo::DerivedType *derived{nullptr}; + DescriptorAddendum *toAddendum{to.Addendum()}; if (const DescriptorAddendum * fromAddendum{from.Addendum()}) { derived = fromAddendum->derivedType(); - if (DescriptorAddendum * toAddendum{to.Addendum()}) { + if (toAddendum) { toAddendum->set_derivedType(derived); std::size_t lenParms{derived ? 
derived->LenParameters() : 0}; for (std::size_t j{0}; j < lenParms; ++j) { toAddendum->SetLenParameterValue(j, fromAddendum->LenParameterValue(j)); } } + } else if (toAddendum) { + toAddendum->set_derivedType(nullptr); } // subtle: leave bounds in place when "from" is scalar (10.2.1.3(3)) int rank{from.rank()}; diff --git a/flang-rt/lib/runtime/edit-input.cpp b/flang-rt/lib/runtime/edit-input.cpp index 68ea34bf63cca..0cc287aa3b47e 100644 --- a/flang-rt/lib/runtime/edit-input.cpp +++ b/flang-rt/lib/runtime/edit-input.cpp @@ -983,7 +983,7 @@ static RT_API_ATTRS bool EditDelimitedCharacterInput( } } if (length > 0) { - *x++ = *ch; + *x++ = static_cast(*ch); --length; } } @@ -1030,7 +1030,7 @@ static RT_API_ATTRS bool EditListDirectedCharacterInput( break; } if (length > 0) { - *x++ = *ch; + *x++ = static_cast(*ch); --length; } else if (edit.IsNamelist()) { // GNU compatibility @@ -1111,7 +1111,7 @@ RT_API_ATTRS bool EditCharacterInput(IoStatementState &io, const DataEdit &edit, (sizeof *x == 2 && *ucs > 0xffff)) { *x++ = '?'; } else { - *x++ = *ucs; + *x++ = static_cast(*ucs); } --lengthChars; } else if (chunkBytes == 0) { @@ -1130,7 +1130,7 @@ RT_API_ATTRS bool EditCharacterInput(IoStatementState &io, const DataEdit &edit, (sizeof *x == 2 && buffer > 0xffff)) { *x++ = '?'; } else { - *x++ = buffer; + *x++ = static_cast(buffer); } --lengthChars; } diff --git a/flang/docs/ModFiles.md b/flang/docs/ModFiles.md index dd0ade5cebbfc..fc05c2677fc26 100644 --- a/flang/docs/ModFiles.md +++ b/flang/docs/ModFiles.md @@ -171,3 +171,14 @@ modules of dependent libraries need not also be packaged with the library. When the compiler reads a hermetic module file, the copies of the dependent modules are read into their own scope, and will not conflict with other modules of the same name that client code might `USE`. 
+ +One can use the `-fhermetic-module-files` option when building the top-level +module files of a library for which not all of the implementation modules +will (or can) be shipped. + +It is also possible to convert a default module file to a hermetic one after +the fact. +Since module files are Fortran source, simply copy the module file to a new +temporary free form Fortran source file and recompile it (`-fsyntax-only`) +with the `-fhermetic-module-files` flag, and that will regenerate the module +file in place with all of its dependent modules included. diff --git a/flang/include/flang/Parser/preprocessor.h b/flang/include/flang/Parser/preprocessor.h index 86528a7e68def..15810a34ee6a5 100644 --- a/flang/include/flang/Parser/preprocessor.h +++ b/flang/include/flang/Parser/preprocessor.h @@ -116,6 +116,7 @@ class Preprocessor { bool IsIfPredicateTrue(const TokenSequence &expr, std::size_t first, std::size_t exprTokens, Prescanner &); void LineDirective(const TokenSequence &, std::size_t, Prescanner &); + TokenSequence TokenizeMacroBody(const std::string &); AllSources &allSources_; std::list names_; diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h index 97c1e30631840..4cded64d170cd 100644 --- a/flang/include/flang/Semantics/symbol.h +++ b/flang/include/flang/Semantics/symbol.h @@ -600,6 +600,7 @@ class TypeParamDetails { public: TypeParamDetails() = default; TypeParamDetails(const TypeParamDetails &) = default; + TypeParamDetails &operator=(const TypeParamDetails &) = default; std::optional attr() const { return attr_; } TypeParamDetails &set_attr(common::TypeParamAttr); MaybeIntExpr &init() { return init_; } diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index f4876256a378f..02454543d0a60 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -1407,8 +1407,7 @@ bool ClauseProcessor::processUseDeviceAddr( const 
parser::CharBlock &source) { mlir::Location location = converter.genLocation(source); llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; processMapObjects(stmtCtx, location, clause.v, mapTypeBits, parentMemberIndices, result.useDeviceAddrVars, useDeviceSyms); @@ -1429,8 +1428,7 @@ bool ClauseProcessor::processUseDevicePtr( const parser::CharBlock &source) { mlir::Location location = converter.genLocation(source); llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; processMapObjects(stmtCtx, location, clause.v, mapTypeBits, parentMemberIndices, result.useDevicePtrVars, useDeviceSyms); diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index 3f4cfb8c11a9d..173dceb07b193 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -398,14 +398,16 @@ mlir::Value createParentSymAndGenIntermediateMaps( interimBounds, treatIndexAsSection); } - // Remove all map TO, FROM and TOFROM bits, from the intermediate - // allocatable maps, we simply wish to alloc or release them. It may be - // safer to just pass OMP_MAP_NONE as the map type, but we may still + // Remove all map-type bits (e.g. TO, FROM, etc.) from the intermediate + // allocatable maps, as we simply wish to alloc or release them. It may + // be safer to just pass OMP_MAP_NONE as the map type, but we may still // need some of the other map types the mapped member utilises, so for // now it's good to keep an eye on this. 
llvm::omp::OpenMPOffloadMappingFlags interimMapType = mapTypeBits; interimMapType &= ~llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; interimMapType &= ~llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + interimMapType &= + ~llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; // Create a map for the intermediate member and insert it and it's // indices into the parentMemberIndices list to track it. diff --git a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp index b09bbf6106dbb..8a9e9b80134b8 100644 --- a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp +++ b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp @@ -123,23 +123,24 @@ class CfgLoopConv : public mlir::OpRewritePattern { : terminator->operand_begin(); loopCarried.append(begin, terminator->operand_end()); loopCarried.push_back(itersMinusOne); - rewriter.create(loc, conditionalBlock, loopCarried); + auto backEdge = + rewriter.create(loc, conditionalBlock, loopCarried); rewriter.eraseOp(terminator); + // Copy loop annotations from the do loop to the loop back edge. + if (auto ann = loop.getLoopAnnotation()) + backEdge->setAttr("loop_annotation", *ann); + // Conditional block rewriter.setInsertionPointToEnd(conditionalBlock); auto zero = rewriter.create(loc, 0); auto comparison = rewriter.create( loc, arith::CmpIPredicate::sgt, itersLeft, zero); - auto cond = rewriter.create( + rewriter.create( loc, comparison, firstBlock, llvm::ArrayRef(), endBlock, llvm::ArrayRef()); - // Copy loop annotations from the do loop to the loop entry condition. - if (auto ann = loop.getLoopAnnotation()) - cond->setAttr("loop_annotation", *ann); - // The result of the loop operation is the values of the condition block // arguments except the induction variable on the last iteration. 
auto args = loop.getFinalValue() diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 0254ac4309ee5..52d3a5844c969 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -98,10 +98,12 @@ struct OmpDirectiveNameParser { using Token = TokenStringMatch; std::optional Parse(ParseState &state) const { + auto begin{state.GetLocation()}; for (const NameWithId &nid : directives()) { if (attempt(Token(nid.first.data())).Parse(state)) { OmpDirectiveName n; n.v = nid.second; + n.source = parser::CharBlock(begin, state.GetLocation()); return n; } } @@ -1104,18 +1106,8 @@ TYPE_PARSER( // "WHEN" >> construct(construct( parenthesized(Parser{}))) || // Cancellable constructs - "DO"_id >= - construct(construct( - Parser{})) || - "PARALLEL"_id >= - construct(construct( - Parser{})) || - "SECTIONS"_id >= - construct(construct( - Parser{})) || - "TASKGROUP"_id >= - construct(construct( - Parser{}))) + construct(construct( + Parser{}))) // [Clause, [Clause], ...] TYPE_PARSER(sourced(construct( diff --git a/flang/lib/Parser/preprocessor.cpp b/flang/lib/Parser/preprocessor.cpp index 6e8e3aee19b09..a5de14d864762 100644 --- a/flang/lib/Parser/preprocessor.cpp +++ b/flang/lib/Parser/preprocessor.cpp @@ -301,8 +301,82 @@ void Preprocessor::DefineStandardMacros() { Define("__TIMESTAMP__"s, "__TIMESTAMP__"s); } +static const std::string idChars{ + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789"s}; + +static std::optional> TokenizeMacroNameAndArgs( + const std::string &str) { + // TODO: variadic macros on the command line (?) 
+ std::vector names; + for (std::string::size_type at{0};;) { + auto nameStart{str.find_first_not_of(" "s, at)}; + if (nameStart == str.npos) { + return std::nullopt; + } + auto nameEnd{str.find_first_not_of(idChars, nameStart)}; + if (nameEnd == str.npos) { + return std::nullopt; + } + auto punc{str.find_first_not_of(" "s, nameEnd)}; + if (punc == str.npos) { + return std::nullopt; + } + if ((at == 0 && str[punc] != '(') || + (at > 0 && str[punc] != ',' && str[punc] != ')')) { + return std::nullopt; + } + names.push_back(str.substr(nameStart, nameEnd - nameStart)); + at = punc + 1; + if (str[punc] == ')') { + if (str.find_first_not_of(" "s, at) != str.npos) { + return std::nullopt; + } else { + return names; + } + } + } +} + +TokenSequence Preprocessor::TokenizeMacroBody(const std::string &str) { + TokenSequence tokens; + Provenance provenance{allSources_.AddCompilerInsertion(str).start()}; + auto end{str.size()}; + for (std::string::size_type at{0}; at < end;) { + // Alternate between tokens that are identifiers (and therefore subject + // to argument replacement) and those that are not. 
+ auto start{str.find_first_of(idChars, at)}; + if (start == str.npos) { + tokens.Put(str.substr(at), provenance + at); + break; + } else if (start > at) { + tokens.Put(str.substr(at, start - at), provenance + at); + } + at = str.find_first_not_of(idChars, start + 1); + if (at == str.npos) { + tokens.Put(str.substr(start), provenance + start); + break; + } else { + tokens.Put(str.substr(start, at - start), provenance + start); + } + } + return tokens; +} + void Preprocessor::Define(const std::string ¯o, const std::string &value) { - definitions_.emplace(SaveTokenAsName(macro), Definition{value, allSources_}); + if (auto lhs{TokenizeMacroNameAndArgs(macro)}) { + // function-like macro + CharBlock macroName{SaveTokenAsName(lhs->front())}; + auto iter{lhs->begin()}; + ++iter; + std::vector argNames{iter, lhs->end()}; + auto rhs{TokenizeMacroBody(value)}; + definitions_.emplace(std::make_pair(macroName, + Definition{ + argNames, rhs, 0, rhs.SizeInTokens(), /*isVariadic=*/false})); + } else { // keyword macro + definitions_.emplace( + SaveTokenAsName(macro), Definition{value, allSources_}); + } } void Preprocessor::Undefine(std::string macro) { definitions_.erase(macro); } diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 78736ee1929d1..5ae4bc29b72f7 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -2422,20 +2422,30 @@ void OmpStructureChecker::Leave(const parser::OpenMPCriticalConstruct &) { void OmpStructureChecker::Enter( const parser::OmpClause::CancellationConstructType &x) { - // Do not call CheckAllowed/CheckAllowedClause, because in case of an error - // it will print "CANCELLATION_CONSTRUCT_TYPE" as the clause name instead of - // the contained construct name. 
+ llvm::omp::Directive dir{GetContext().directive}; auto &dirName{std::get(x.v.t)}; - switch (dirName.v) { - case llvm::omp::Directive::OMPD_do: - case llvm::omp::Directive::OMPD_parallel: - case llvm::omp::Directive::OMPD_sections: - case llvm::omp::Directive::OMPD_taskgroup: - break; - default: - context_.Say(dirName.source, "%s is not a cancellable construct"_err_en_US, - parser::ToUpperCaseLetters(getDirectiveName(dirName.v).str())); - break; + + if (dir != llvm::omp::Directive::OMPD_cancel && + dir != llvm::omp::Directive::OMPD_cancellation_point) { + // Do not call CheckAllowed/CheckAllowedClause, because in case of an error + // it will print "CANCELLATION_CONSTRUCT_TYPE" as the clause name instead + // of the contained construct name. + context_.Say(dirName.source, "%s cannot follow %s"_err_en_US, + parser::ToUpperCaseLetters(getDirectiveName(dirName.v)), + parser::ToUpperCaseLetters(getDirectiveName(dir))); + } else { + switch (dirName.v) { + case llvm::omp::Directive::OMPD_do: + case llvm::omp::Directive::OMPD_parallel: + case llvm::omp::Directive::OMPD_sections: + case llvm::omp::Directive::OMPD_taskgroup: + break; + default: + context_.Say(dirName.source, + "%s is not a cancellable construct"_err_en_US, + parser::ToUpperCaseLetters(getDirectiveName(dirName.v))); + break; + } } } diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index c35492097cfbc..b3ad608ee6744 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -4904,6 +4904,19 @@ std::optional ArgumentAnalyzer::AnalyzeExpr( "TYPE(*) dummy argument may only be used as an actual argument"_err_en_US); } else if (MaybeExpr argExpr{AnalyzeExprOrWholeAssumedSizeArray(expr)}) { if (isProcedureCall_ || !IsProcedureDesignator(*argExpr)) { + // Pad Hollerith actual argument with spaces up to a multiple of 8 + // bytes, in case the data are interpreted as double precision + // (or a smaller numeric type) by legacy code. 
+ if (auto hollerith{UnwrapExpr>(*argExpr)}; + hollerith && hollerith->wasHollerith()) { + std::string bytes{hollerith->values()}; + while ((bytes.size() % 8) != 0) { + bytes += ' '; + } + Constant c{std::move(bytes)}; + c.set_wasHollerith(true); + argExpr = AsGenericExpr(std::move(c)); + } ActualArgument arg{std::move(*argExpr)}; SetArgSourceLocation(arg, expr.source); return std::move(arg); diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir index 8019ecf7f6a05..b13921f822b4d 100644 --- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir +++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir @@ -423,14 +423,15 @@ func.func @_QPopenmp_target_data_region() { func.func @_QPomp_target_data_empty() { %0 = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_data_emptyEa"} - omp.target_data use_device_addr(%0 -> %arg0 : !fir.ref>) { + %1 = omp.map.info var_ptr(%0 : !fir.ref>, !fir.ref>) map_clauses(return_param) capture(ByRef) -> !fir.ref> {name = ""} + omp.target_data use_device_addr(%1 -> %arg0 : !fir.ref>) { omp.terminator } return } // CHECK-LABEL: llvm.func @_QPomp_target_data_empty -// CHECK: omp.target_data use_device_addr(%1 -> %{{.*}} : !llvm.ptr) { +// CHECK: omp.target_data use_device_addr(%{{.*}} -> %{{.*}} : !llvm.ptr) { // CHECK: } // ----- diff --git a/flang/test/Fir/vector-always.fir b/flang/test/Fir/vector-always.fir index 00eb0e7a756ee..ec06b94a3d0f8 100644 --- a/flang/test/Fir/vector-always.fir +++ b/flang/test/Fir/vector-always.fir @@ -13,7 +13,9 @@ func.func @_QPvector_always() -> i32 { %c10_i32 = arith.constant 10 : i32 %c1_i32 = arith.constant 1 : i32 %c10 = arith.constant 10 : index -// CHECK: cf.cond_br %{{.*}}, ^{{.*}}, ^{{.*}} {loop_annotation = #[[ANNOTATION]]} +// CHECK: cf.cond_br +// CHECK-NOT: loop_annotation +// CHECK: cf.br ^{{.*}} {loop_annotation = #[[ANNOTATION]]} %8:2 = fir.do_loop %arg0 = %c1 to %c10 step %c1 iter_args(%arg1 = %c1_i32) -> 
(index, i32) attributes {loopAnnotation = #loop_annotation} { fir.result %c1, %c1_i32 : index, i32 } diff --git a/flang/test/Integration/unroll.f90 b/flang/test/Integration/unroll.f90 index aa47e465b63fc..f2c2ecb5cffac 100644 --- a/flang/test/Integration/unroll.f90 +++ b/flang/test/Integration/unroll.f90 @@ -3,8 +3,10 @@ ! CHECK-LABEL: unroll_dir subroutine unroll_dir integer :: a(10) - !dir$ unroll - ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[UNROLL_ENABLE_FULL_ANNO:.*]] + !dir$ unroll + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} + ! CHECK-NOT: !llvm.loop + ! CHECK: br label {{.*}}, !llvm.loop ![[UNROLL_ENABLE_FULL_ANNO:.*]] do i=1,10 a(i)=i end do @@ -14,7 +16,9 @@ end subroutine unroll_dir subroutine unroll_dir_0 integer :: a(10) !dir$ unroll 0 - ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[UNROLL_DISABLE_ANNO:.*]] + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} + ! CHECK-NOT: !llvm.loop + ! CHECK: br label {{.*}}, !llvm.loop ![[UNROLL_DISABLE_ANNO:.*]] do i=1,10 a(i)=i end do @@ -24,7 +28,9 @@ end subroutine unroll_dir_0 subroutine unroll_dir_1 integer :: a(10) !dir$ unroll 1 - ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[UNROLL_DISABLE_ANNO]] + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} + ! CHECK-NOT: !llvm.loop + ! CHECK: br label {{.*}}, !llvm.loop ![[UNROLL_DISABLE_ANNO]] do i=1,10 a(i)=i end do @@ -34,7 +40,9 @@ end subroutine unroll_dir_1 subroutine unroll_dir_2 integer :: a(10) !dir$ unroll 2 - ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[UNROLL_ENABLE_COUNT_2:.*]] + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} + ! CHECK-NOT: !llvm.loop + ! 
CHECK: br label {{.*}}, !llvm.loop ![[UNROLL_ENABLE_COUNT_2:.*]] do i=1,10 a(i)=i end do diff --git a/flang/test/Integration/unroll_and_jam.f90 b/flang/test/Integration/unroll_and_jam.f90 index b9c16d34ac90a..05b3aaa04a1e0 100644 --- a/flang/test/Integration/unroll_and_jam.f90 +++ b/flang/test/Integration/unroll_and_jam.f90 @@ -4,7 +4,9 @@ subroutine unroll_and_jam_dir integer :: a(10) !dir$ unroll_and_jam 4 - ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION:.*]] + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} + ! CHECK-NOT: !llvm.loop + ! CHECK: br label {{.*}}, !llvm.loop ![[ANNOTATION:.*]] do i=1,10 a(i)=i end do @@ -14,7 +16,9 @@ end subroutine unroll_and_jam_dir subroutine unroll_and_jam_dir_0 integer :: a(10) !dir$ unroll_and_jam 0 - ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE:.*]] + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} + ! CHECK-NOT: !llvm.loop + ! CHECK: br label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE:.*]] do i=1,10 a(i)=i end do @@ -24,7 +28,9 @@ end subroutine unroll_and_jam_dir_0 subroutine unroll_and_jam_dir_1 integer :: a(10) !dir$ unroll_and_jam 1 - ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE]] + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} + ! CHECK-NOT: !llvm.loop + ! CHECK: br label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE]] do i=1,10 a(i)=i end do @@ -34,7 +40,9 @@ end subroutine unroll_and_jam_dir_1 subroutine nounroll_and_jam_dir integer :: a(10) !dir$ nounroll_and_jam - ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE]] + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} + ! CHECK-NOT: !llvm.loop + ! CHECK: br label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE]] do i=1,10 a(i)=i end do @@ -44,7 +52,9 @@ end subroutine nounroll_and_jam_dir subroutine unroll_and_jam_dir_no_factor integer :: a(10) !dir$ unroll_and_jam - ! 
CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_NO_FACTOR:.*]] + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} + ! CHECK-NOT: !llvm.loop + ! CHECK: br label {{.*}}, !llvm.loop ![[ANNOTATION_NO_FACTOR:.*]] do i=1,10 a(i)=i end do diff --git a/flang/test/Integration/vector-always.f90 b/flang/test/Integration/vector-always.f90 index ee2aa8ab485e0..1d8aad97bde70 100644 --- a/flang/test/Integration/vector-always.f90 +++ b/flang/test/Integration/vector-always.f90 @@ -4,7 +4,9 @@ subroutine vector_always integer :: a(10) !dir$ vector always - ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION:.*]] + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} + ! CHECK-NOT: !llvm.loop + ! CHECK: br label {{.*}}, !llvm.loop ![[ANNOTATION:.*]] do i=1,10 a(i)=i end do @@ -14,7 +16,9 @@ end subroutine vector_always subroutine no_vector integer :: a(10) !dir$ novector - ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION2:.*]] + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}} + ! CHECK-NOT: !llvm.loop + ! CHECK: br label {{.*}}, !llvm.loop ![[ANNOTATION2:.*]] do i=1,10 a(i)=i end do diff --git a/flang/test/Lower/OpenACC/acc-data-unwrap-defaultbounds.f90 b/flang/test/Lower/OpenACC/acc-data-unwrap-defaultbounds.f90 index d010d39cef4eb..789db34adefee 100644 --- a/flang/test/Lower/OpenACC/acc-data-unwrap-defaultbounds.f90 +++ b/flang/test/Lower/OpenACC/acc-data-unwrap-defaultbounds.f90 @@ -155,8 +155,8 @@ subroutine acc_data !$acc data present(a) async !$acc end data -! CHECK: acc.data dataOperands(%{{.*}}) { -! CHECK: } attributes {asyncOnly = [#acc.device_type]} +! CHECK: acc.data async dataOperands(%{{.*}}) { +! 
CHECK: } !$acc data copy(a) async(1) !$acc end data diff --git a/flang/test/Lower/OpenACC/acc-data.f90 b/flang/test/Lower/OpenACC/acc-data.f90 index 7965fdc0ac707..3032ce7109c1e 100644 --- a/flang/test/Lower/OpenACC/acc-data.f90 +++ b/flang/test/Lower/OpenACC/acc-data.f90 @@ -155,8 +155,8 @@ subroutine acc_data !$acc data present(a) async !$acc end data -! CHECK: acc.data dataOperands(%{{.*}}) { -! CHECK: } attributes {asyncOnly = [#acc.device_type]} +! CHECK: acc.data async dataOperands(%{{.*}}) { +! CHECK: } !$acc data copy(a) async(1) !$acc end data diff --git a/flang/test/Lower/OpenACC/acc-enter-data-unwrap-defaultbounds.f90 b/flang/test/Lower/OpenACC/acc-enter-data-unwrap-defaultbounds.f90 index c42350a07c498..3e08068bdec44 100644 --- a/flang/test/Lower/OpenACC/acc-enter-data-unwrap-defaultbounds.f90 +++ b/flang/test/Lower/OpenACC/acc-enter-data-unwrap-defaultbounds.f90 @@ -94,20 +94,20 @@ subroutine acc_enter_data !$acc enter data create(a) async !CHECK: %[[BOUND0:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index) !CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[EXTENT_C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index) -!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND0]], %[[BOUND1]]) -> !fir.ref> {asyncOnly = [#acc.device_type], name = "a", structured = false} -!CHECK: acc.enter_data dataOperands(%[[CREATE_A]] : !fir.ref>) attributes {async} +!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND0]], %[[BOUND1]]) async -> !fir.ref> {name = "a", structured = false} +!CHECK: acc.enter_data async dataOperands(%[[CREATE_A]] : !fir.ref>) !$acc enter data create(a) wait !CHECK: %[[BOUND0:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : 
index) !CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[EXTENT_C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index) !CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND0]], %[[BOUND1]]) -> !fir.ref> {name = "a", structured = false} -!CHECK: acc.enter_data dataOperands(%[[CREATE_A]] : !fir.ref>) attributes {wait} +!CHECK: acc.enter_data wait dataOperands(%[[CREATE_A]] : !fir.ref>) !$acc enter data create(a) async wait !CHECK: %[[BOUND0:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index) !CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[EXTENT_C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index) -!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND0]], %[[BOUND1]]) -> !fir.ref> {asyncOnly = [#acc.device_type], name = "a", structured = false} -!CHECK: acc.enter_data dataOperands(%[[CREATE_A]] : !fir.ref>) attributes {async, wait} +!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND0]], %[[BOUND1]]) async -> !fir.ref> {name = "a", structured = false} +!CHECK: acc.enter_data async wait dataOperands(%[[CREATE_A]] : !fir.ref>) !$acc enter data create(a) async(1) !CHECK: %[[ASYNC1:.*]] = arith.constant 1 : i32 diff --git a/flang/test/Lower/OpenACC/acc-enter-data.f90 b/flang/test/Lower/OpenACC/acc-enter-data.f90 index 3e49259c360eb..f7396660a6d3c 100644 --- a/flang/test/Lower/OpenACC/acc-enter-data.f90 +++ b/flang/test/Lower/OpenACC/acc-enter-data.f90 @@ -53,16 +53,16 @@ subroutine acc_enter_data !CHECK: acc.enter_data dataOperands(%[[COPYIN_A]], %[[CREATE_B]], %[[ATTACH_D]] : !fir.ref>, !fir.ref>, !fir.ref>>){{$}} !$acc enter data create(a) async -!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {asyncOnly = 
[#acc.device_type], name = "a", structured = false} -!CHECK: acc.enter_data dataOperands(%[[CREATE_A]] : !fir.ref>) attributes {async} +!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref>) async -> !fir.ref> {name = "a", structured = false} +!CHECK: acc.enter_data async dataOperands(%[[CREATE_A]] : !fir.ref>) !$acc enter data create(a) wait !CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a", structured = false} -!CHECK: acc.enter_data dataOperands(%[[CREATE_A]] : !fir.ref>) attributes {wait} +!CHECK: acc.enter_data wait dataOperands(%[[CREATE_A]] : !fir.ref>) !$acc enter data create(a) async wait -!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {asyncOnly = [#acc.device_type], name = "a", structured = false} -!CHECK: acc.enter_data dataOperands(%[[CREATE_A]] : !fir.ref>) attributes {async, wait} +!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref>) async -> !fir.ref> {name = "a", structured = false} +!CHECK: acc.enter_data async wait dataOperands(%[[CREATE_A]] : !fir.ref>) !$acc enter data create(a) async(1) !CHECK: %[[ASYNC1:.*]] = arith.constant 1 : i32 diff --git a/flang/test/Lower/OpenACC/acc-exit-data-unwrap-defaultbounds.f90 b/flang/test/Lower/OpenACC/acc-exit-data-unwrap-defaultbounds.f90 index 7999a7647f49b..fd942173b637a 100644 --- a/flang/test/Lower/OpenACC/acc-exit-data-unwrap-defaultbounds.f90 +++ b/flang/test/Lower/OpenACC/acc-exit-data-unwrap-defaultbounds.f90 @@ -56,19 +56,19 @@ subroutine acc_exit_data !CHECK: acc.detach accPtr(%[[DEVPTR_D]] : !fir.ptr) {name = "d", structured = false} !$acc exit data delete(a) async -!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref> {asyncOnly = [#acc.device_type], dataClause = #acc, name = "a", structured = false} -!CHECK: acc.exit_data dataOperands(%[[DEVPTR]] : !fir.ref>) attributes {async} -!CHECK: acc.delete accPtr(%[[DEVPTR]] : 
!fir.ref>) bounds(%{{.*}}, %{{.*}}) {asyncOnly = [#acc.device_type], name = "a", structured = false} +!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%{{.*}}, %{{.*}}) async -> !fir.ref> {dataClause = #acc, name = "a", structured = false} +!CHECK: acc.exit_data async dataOperands(%[[DEVPTR]] : !fir.ref>) +!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) async {name = "a", structured = false} !$acc exit data delete(a) wait !CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a", structured = false} -!CHECK: acc.exit_data dataOperands(%[[DEVPTR]] : !fir.ref>) attributes {wait} +!CHECK: acc.exit_data wait dataOperands(%[[DEVPTR]] : !fir.ref>) !CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {name = "a", structured = false} !$acc exit data delete(a) async wait -!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref> {asyncOnly = [#acc.device_type], dataClause = #acc, name = "a", structured = false} -!CHECK: acc.exit_data dataOperands(%[[DEVPTR]] : !fir.ref>) attributes {async, wait} -!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) {asyncOnly = [#acc.device_type], name = "a", structured = false} +!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%{{.*}}, %{{.*}}) async -> !fir.ref> {dataClause = #acc, name = "a", structured = false} +!CHECK: acc.exit_data async wait dataOperands(%[[DEVPTR]] : !fir.ref>) +!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref>) bounds(%{{.*}}, %{{.*}}) async {name = "a", structured = false} !$acc exit data delete(a) async(1) !CHECK: %[[ASYNC1:.*]] = arith.constant 1 : i32 diff --git a/flang/test/Lower/OpenACC/acc-exit-data.f90 b/flang/test/Lower/OpenACC/acc-exit-data.f90 index bf5f7094913a1..cbc63ac81945c 100644 --- 
a/flang/test/Lower/OpenACC/acc-exit-data.f90 +++ b/flang/test/Lower/OpenACC/acc-exit-data.f90 @@ -54,19 +54,19 @@ subroutine acc_exit_data !CHECK: acc.detach accPtr(%[[DEVPTR_D]] : !fir.ref>>) {name = "d", structured = false} !$acc exit data delete(a) async -!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {asyncOnly = [#acc.device_type], dataClause = #acc, name = "a", structured = false} -!CHECK: acc.exit_data dataOperands(%[[DEVPTR]] : !fir.ref>) attributes {async} -!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref>) {asyncOnly = [#acc.device_type], name = "a", structured = false} +!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) async -> !fir.ref> {dataClause = #acc, name = "a", structured = false} +!CHECK: acc.exit_data async dataOperands(%[[DEVPTR]] : !fir.ref>) +!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref>) async {name = "a", structured = false} !$acc exit data delete(a) wait !CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {dataClause = #acc, name = "a", structured = false} -!CHECK: acc.exit_data dataOperands(%[[DEVPTR]] : !fir.ref>) attributes {wait} +!CHECK: acc.exit_data wait dataOperands(%[[DEVPTR]] : !fir.ref>) !CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref>) {name = "a", structured = false} !$acc exit data delete(a) async wait -!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {asyncOnly = [#acc.device_type], dataClause = #acc, name = "a", structured = false} -!CHECK: acc.exit_data dataOperands(%[[DEVPTR]] : !fir.ref>) attributes {async, wait} -!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref>) {asyncOnly = [#acc.device_type], name = "a", structured = false} +!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) async -> !fir.ref> {dataClause = #acc, name = "a", structured = false} +!CHECK: acc.exit_data async wait dataOperands(%[[DEVPTR]] : !fir.ref>) +!CHECK: acc.delete 
accPtr(%[[DEVPTR]] : !fir.ref>) async {name = "a", structured = false} !$acc exit data delete(a) async(1) !CHECK: %[[ASYNC1:.*]] = arith.constant 1 : i32 diff --git a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 index 0ded708cb1a3b..8608b0ad98ce6 100644 --- a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 @@ -69,12 +69,12 @@ subroutine acc_kernels_loop END DO !$acc end kernels loop -! CHECK: acc.kernels {{.*}} { +! CHECK: acc.kernels {{.*}} async { ! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator -! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type]} +! CHECK-NEXT: } !$acc kernels loop async(1) DO i = 1, n @@ -102,6 +102,12 @@ subroutine acc_kernels_loop ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} + !$acc kernels loop async(async) device_type(nvidia) async(1) + DO i = 1, n + a(i) = b(i) + END DO +! CHECK: acc.kernels combined(loop) async(%{{.*}} : i32, %c1{{.*}} : i32 [#acc.device_type]) + !$acc kernels loop wait DO i = 1, n a(i) = b(i) diff --git a/flang/test/Lower/OpenACC/acc-kernels.f90 b/flang/test/Lower/OpenACC/acc-kernels.f90 index 6b7a625b34f71..b90870db25095 100644 --- a/flang/test/Lower/OpenACC/acc-kernels.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels.f90 @@ -38,9 +38,9 @@ subroutine acc_kernels !$acc kernels async !$acc end kernels -! CHECK: acc.kernels { +! CHECK: acc.kernels async { ! CHECK: acc.terminator -! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type]} +! CHECK-NEXT: } !$acc kernels async(1) !$acc end kernels diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 index ccd37d87262e3..4cf268d2517f5 100644 --- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 @@ -71,12 +71,12 @@ subroutine acc_parallel_loop END DO !$acc end parallel loop -! CHECK: acc.parallel {{.*}} { +! 
CHECK: acc.parallel {{.*}} async { ! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield -! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type]} +! CHECK-NEXT: } !$acc parallel loop async(1) DO i = 1, n @@ -104,6 +104,12 @@ subroutine acc_parallel_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} + !$acc parallel loop async(async) device_type(nvidia) async(1) + DO i = 1, n + a(i) = b(i) + END DO +! CHECK: acc.parallel combined(loop) async(%{{.*}} : i32, %c1{{.*}} : i32 [#acc.device_type]) + !$acc parallel loop wait DO i = 1, n a(i) = b(i) diff --git a/flang/test/Lower/OpenACC/acc-parallel.f90 b/flang/test/Lower/OpenACC/acc-parallel.f90 index e00ea41210966..1eae106ba61b2 100644 --- a/flang/test/Lower/OpenACC/acc-parallel.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel.f90 @@ -60,9 +60,9 @@ subroutine acc_parallel !$acc parallel async !$acc end parallel -! CHECK: acc.parallel { +! CHECK: acc.parallel async { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type]} +! CHECK-NEXT: } !$acc parallel async(1) !$acc end parallel diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90 index 478dfa0d96c3b..34391f78ae707 100644 --- a/flang/test/Lower/OpenACC/acc-serial-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90 @@ -90,12 +90,12 @@ subroutine acc_serial_loop END DO !$acc end serial loop -! CHECK: acc.serial {{.*}} { +! CHECK: acc.serial {{.*}} async { ! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield -! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type]} +! CHECK-NEXT: } !$acc serial loop async(1) DO i = 1, n @@ -123,6 +123,12 @@ subroutine acc_serial_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} + !$acc serial loop async(async) device_type(nvidia) async(1) + DO i = 1, n + a(i) = b(i) + END DO +! 
CHECK: acc.serial combined(loop) async(%{{.*}} : i32, %c1{{.*}} : i32 [#acc.device_type]) + !$acc serial loop wait DO i = 1, n a(i) = b(i) diff --git a/flang/test/Lower/OpenACC/acc-serial.f90 b/flang/test/Lower/OpenACC/acc-serial.f90 index 9ba44ce6b9197..1e4f32fd209ef 100644 --- a/flang/test/Lower/OpenACC/acc-serial.f90 +++ b/flang/test/Lower/OpenACC/acc-serial.f90 @@ -60,9 +60,9 @@ subroutine acc_serial !$acc serial async !$acc end serial -! CHECK: acc.serial { +! CHECK: acc.serial async { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type]} +! CHECK-NEXT: } !$acc serial async(1) !$acc end serial diff --git a/flang/test/Lower/OpenACC/acc-update.f90 b/flang/test/Lower/OpenACC/acc-update.f90 index f96b105ed93bd..f98af425de985 100644 --- a/flang/test/Lower/OpenACC/acc-update.f90 +++ b/flang/test/Lower/OpenACC/acc-update.f90 @@ -63,9 +63,9 @@ subroutine acc_update ! CHECK: acc.update_host accPtr(%[[DEVPTR_B]] : !fir.ref>) to varPtr(%[[DECLB]]#0 : !fir.ref>) {name = "b", structured = false} !$acc update host(a) async -! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {asyncOnly = [#acc.device_type], dataClause = #acc, name = "a", structured = false} +! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) async -> !fir.ref> {dataClause = #acc, name = "a", structured = false} ! CHECK: acc.update async dataOperands(%[[DEVPTR_A]] : !fir.ref>) -! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref>) to varPtr(%[[DECLA]]#0 : !fir.ref>) {asyncOnly = [#acc.device_type], name = "a", structured = false} +! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref>) async to varPtr(%[[DECLA]]#0 : !fir.ref>) {name = "a", structured = false} !$acc update host(a) wait ! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {dataClause = #acc, name = "a", structured = false} @@ -73,9 +73,9 @@ subroutine acc_update ! 
CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref>) to varPtr(%[[DECLA]]#0 : !fir.ref>) {name = "a", structured = false} !$acc update host(a) async wait -! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {asyncOnly = [#acc.device_type], dataClause = #acc, name = "a", structured = false} +! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) async -> !fir.ref> {dataClause = #acc, name = "a", structured = false} ! CHECK: acc.update async wait dataOperands(%[[DEVPTR_A]] : !fir.ref>) -! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref>) to varPtr(%[[DECLA]]#0 : !fir.ref>) {asyncOnly = [#acc.device_type], name = "a", structured = false} +! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref>) async to varPtr(%[[DECLA]]#0 : !fir.ref>) {name = "a", structured = false} !$acc update host(a) async(1) ! CHECK: [[ASYNC1:%.*]] = arith.constant 1 : i32 @@ -108,8 +108,8 @@ subroutine acc_update ! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref>) to varPtr(%[[DECLA]]#0 : !fir.ref>) {name = "a", structured = false} !$acc update host(a) device_type(host, nvidia) async -! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {asyncOnly = [#acc.device_type, #acc.device_type], dataClause = #acc, name = "a", structured = false} +! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref>) async([#acc.device_type, #acc.device_type]) -> !fir.ref> {dataClause = #acc, name = "a", structured = false} ! CHECK: acc.update async([#acc.device_type, #acc.device_type]) dataOperands(%[[DEVPTR_A]] : !fir.ref>) -! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref>) to varPtr(%[[DECLA]]#0 : !fir.ref>) {asyncOnly = [#acc.device_type, #acc.device_type], name = "a", structured = false} +! 
CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref>) async([#acc.device_type, #acc.device_type]) to varPtr(%[[DECLA]]#0 : !fir.ref>) {name = "a", structured = false} end subroutine acc_update diff --git a/flang/test/Lower/OpenACC/acc-wait.f90 b/flang/test/Lower/OpenACC/acc-wait.f90 index 8a42c97a12811..35db640a054c2 100644 --- a/flang/test/Lower/OpenACC/acc-wait.f90 +++ b/flang/test/Lower/OpenACC/acc-wait.f90 @@ -25,7 +25,7 @@ subroutine acc_update !$acc wait(1) async !CHECK: [[WAIT3:%.*]] = arith.constant 1 : i32 -!CHECK: acc.wait([[WAIT3]] : i32) attributes {async} +!CHECK: acc.wait([[WAIT3]] : i32) async !$acc wait(1) async(async) !CHECK: [[WAIT3:%.*]] = arith.constant 1 : i32 diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 new file mode 100644 index 0000000000000..8acc399a92abe --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 @@ -0,0 +1,13 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TASKLOOP construct +subroutine omp_taskloop_inreduction() + integer x + x = 0 + !$omp taskloop in_reduction(+:x) + do i = 1, 100 + x = x + 1 + end do + !$omp end taskloop +end subroutine omp_taskloop_inreduction diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 new file mode 100644 index 0000000000000..0c16bd227257f --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 @@ -0,0 +1,13 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +! 
CHECK: not yet implemented: Unhandled clause REDUCTION in TASKLOOP construct +subroutine omp_taskloop_reduction() + integer x + x = 0 + !$omp taskloop reduction(+:x) + do i = 1, 100 + x = x + 1 + end do + !$omp end taskloop +end subroutine omp_taskloop_reduction diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 4815e6564fc7e..f04aacc63fc2b 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -544,7 +544,7 @@ subroutine omp_target_device_addr !CHECK: %[[VAL_0_DECL:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) !CHECK: %[[MAP_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, i32) map_clauses(tofrom) capture(ByRef) var_ptr_ptr({{.*}} : !fir.llvm_ptr>) -> !fir.llvm_ptr> {name = ""} !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, !fir.box>) map_clauses(to) capture(ByRef) members(%[[MAP_MEMBERS]] : [0] : !fir.llvm_ptr>) -> !fir.ref>> {name = "a"} - !CHECK: %[[DEV_ADDR_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, i32) map_clauses(tofrom) capture(ByRef) var_ptr_ptr({{.*}} : !fir.llvm_ptr>) -> !fir.llvm_ptr> {name = ""} + !CHECK: %[[DEV_ADDR_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, i32) map_clauses(return_param) capture(ByRef) var_ptr_ptr({{.*}} : !fir.llvm_ptr>) -> !fir.llvm_ptr> {name = ""} !CHECK: %[[DEV_ADDR:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref>>, !fir.box>) map_clauses(to) capture(ByRef) members(%[[DEV_ADDR_MEMBERS]] : [0] : !fir.llvm_ptr>) -> !fir.ref>> {name = "a"} !CHECK: omp.target_data map_entries(%[[MAP]], %[[MAP_MEMBERS]] : {{.*}}) use_device_addr(%[[DEV_ADDR]] -> %[[ARG_0:.*]], %[[DEV_ADDR_MEMBERS]] -> %[[ARG_1:.*]] : !fir.ref>>, !fir.llvm_ptr>) { !$omp target data map(tofrom: a) use_device_addr(a) diff --git a/flang/test/Preprocessing/func-on-command-line.F90 b/flang/test/Preprocessing/func-on-command-line.F90 new file 
mode 100644 index 0000000000000..cf844e021b371 --- /dev/null +++ b/flang/test/Preprocessing/func-on-command-line.F90 @@ -0,0 +1,4 @@ +! RUN: %flang_fc1 -fdebug-unparse "-Dfoo(a,b)=bar(a+b)" %s | FileCheck %s +! CHECK: CALL bar(3_4) +call foo(1,2) +end diff --git a/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 b/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 index 8653978fb6249..29985a02eb6ef 100644 --- a/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 +++ b/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 @@ -295,4 +295,13 @@ program openacc_kernels_loop_validity if(i == 10) cycle end do + !$acc kernels loop async(1) device_type(nvidia) async(3) + do i = 1, n + end do + +!ERROR: At most one ASYNC clause can appear on the KERNELS LOOP directive or in group separated by the DEVICE_TYPE clause + !$acc kernels loop async(1) device_type(nvidia) async async + do i = 1, n + end do + end program openacc_kernels_loop_validity diff --git a/flang/test/Semantics/OpenACC/acc-parallel-loop-validity.f90 b/flang/test/Semantics/OpenACC/acc-parallel-loop-validity.f90 index 7f33f9e145110..78e1a7ad7c452 100644 --- a/flang/test/Semantics/OpenACC/acc-parallel-loop-validity.f90 +++ b/flang/test/Semantics/OpenACC/acc-parallel-loop-validity.f90 @@ -141,4 +141,13 @@ program openacc_parallel_loop_validity if(i == 10) cycle end do + !$acc parallel loop async(1) device_type(nvidia) async(3) + do i = 1, n + end do + +!ERROR: At most one ASYNC clause can appear on the PARALLEL LOOP directive or in group separated by the DEVICE_TYPE clause + !$acc parallel loop async(1) device_type(nvidia) async async + do i = 1, n + end do + end program openacc_parallel_loop_validity diff --git a/flang/test/Semantics/OpenACC/acc-serial-loop.f90 b/flang/test/Semantics/OpenACC/acc-serial-loop.f90 index 2832274680eca..5d2be7f7c6474 100644 --- a/flang/test/Semantics/OpenACC/acc-serial-loop.f90 +++ b/flang/test/Semantics/OpenACC/acc-serial-loop.f90 @@ -111,4 +111,13 @@ program openacc_serial_loop_validity 
if(i == 10) cycle end do + !$acc serial loop async(1) device_type(nvidia) async(3) + do i = 1, n + end do + +!ERROR: At most one ASYNC clause can appear on the SERIAL LOOP directive or in group separated by the DEVICE_TYPE clause + !$acc serial loop async(1) device_type(nvidia) async async + do i = 1, n + end do + end program openacc_serial_loop_validity diff --git a/flang/test/Semantics/OpenMP/cancellation-construct-type.f90 b/flang/test/Semantics/OpenMP/cancellation-construct-type.f90 new file mode 100644 index 0000000000000..c9d1408fd83ef --- /dev/null +++ b/flang/test/Semantics/OpenMP/cancellation-construct-type.f90 @@ -0,0 +1,11 @@ +!RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags + +subroutine f(x) + integer :: x +!ERROR: PARALLEL cannot follow SECTIONS +!$omp sections parallel +!$omp section + x = x + 1 +!$omp end sections +end +end diff --git a/flang/test/Semantics/pad-hollerith-arg.f b/flang/test/Semantics/pad-hollerith-arg.f new file mode 100644 index 0000000000000..75678441ea45f --- /dev/null +++ b/flang/test/Semantics/pad-hollerith-arg.f @@ -0,0 +1,5 @@ +! RUN: %flang_fc1 -fdebug-unparse %s | FileCheck %s +! Ensure that Hollerith actual arguments are blank padded. +! CHECK: CALL foo("abc ") + call foo(3habc) + end diff --git a/libc/config/config.json b/libc/config/config.json index d738aade74427..bfe956855cb52 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -98,7 +98,7 @@ }, "LIBC_CONF_FREXP_INF_NAN_EXPONENT": { "value": "", - "doc": "The value written back to the second parameter when calling frexp/frexpf/frexpl` with `+/-Inf`/`NaN` is unspecified. Configue an explicit exp value for Inf/NaN inputs." + "doc": "The value written back to the second parameter when calling frexp/frexpf/frexpl` with `+/-Inf`/`NaN` is unspecified. Configure an explicit exp value for Inf/NaN inputs." 
} }, "qsort": { diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index c2a31b9f5c964..6ee6cd3c7ba11 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -33,8 +33,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.fcntl.openat # poll.h entrypoints - # TODO: https://github.com/llvm/llvm-project/issues/125940 - # libc.src.poll.poll + libc.src.poll.poll # sched.h entrypoints libc.src.sched.sched_get_priority_max @@ -290,7 +289,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.sys.statvfs.statvfs # sys/utimes.h entrypoints - # libc.src.sys.time.utimes + libc.src.sys.time.utimes # sys/utsname.h entrypoints libc.src.sys.utsname.uname diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index dee9a63101eb9..8d53390ae19bf 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -33,7 +33,7 @@ to learn about the defaults for your platform and target. * **"general" options** - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior. * **"math" options** - - ``LIBC_CONF_FREXP_INF_NAN_EXPONENT``: The value written back to the second parameter when calling frexp/frexpf/frexpl` with `+/-Inf`/`NaN` is unspecified. Configue an explicit exp value for Inf/NaN inputs. + - ``LIBC_CONF_FREXP_INF_NAN_EXPONENT``: The value written back to the second parameter when calling frexp/frexpf/frexpl` with `+/-Inf`/`NaN` is unspecified. Configure an explicit exp value for Inf/NaN inputs. - ``LIBC_CONF_MATH_OPTIMIZATIONS``: Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST. * **"printf" options** - ``LIBC_CONF_PRINTF_DISABLE_FIXED_POINT``: Disable printing fixed point values in printf and friends. 
diff --git a/libc/docs/dev/code_style.rst b/libc/docs/dev/code_style.rst index 28d8277417b2b..0bd3a69ae3ffe 100644 --- a/libc/docs/dev/code_style.rst +++ b/libc/docs/dev/code_style.rst @@ -285,7 +285,7 @@ Example usage: } // LIBC_NAMESPACE_DECL Having hidden visibility on the namespace ensures extern declarations in a given TU -have known visibility and never generate GOT indirextions. The attribute guarantees +have known visibility and never generate GOT indirections. The attribute guarantees this independently of global compile options and build systems. .. diff --git a/libc/docs/dev/source_tree_layout.rst b/libc/docs/dev/source_tree_layout.rst index 62c0434a0b2aa..8430a7478cf05 100644 --- a/libc/docs/dev/source_tree_layout.rst +++ b/libc/docs/dev/source_tree_layout.rst @@ -22,7 +22,7 @@ directories:: - test - utils -Each of these directories is explained breifly below. +Each of these directories is explained briefly below. The ``benchmarks`` directory ---------------------------- diff --git a/libc/hdr/types/ACTION.h b/libc/hdr/types/ACTION.h new file mode 100644 index 0000000000000..0b63521dff64d --- /dev/null +++ b/libc/hdr/types/ACTION.h @@ -0,0 +1,22 @@ +//===-- Proxy header for ACTION -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_TYPES_ACTION_H +#define LLVM_LIBC_HDR_TYPES_ACTION_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/ACTION.h" + +#else // Overlay mode + +#include + +#endif // LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_ACTION_H diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index b2890871902f2..5f6197c93d445 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -383,3 +383,12 @@ add_proxy_header_library( libc.include.llvm-libc-types.ENTRY libc.include.search ) + +add_proxy_header_library( + ACTION + HDRS + ACTION.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.ACTION + libc.include.search +) diff --git a/libc/include/sys/syscall.h.def b/libc/include/sys/syscall.h.def index 03c19eb0885ed..6d74cc6f78556 100644 --- a/libc/include/sys/syscall.h.def +++ b/libc/include/sys/syscall.h.def @@ -1517,6 +1517,10 @@ #define SYS_ppoll __NR_ppoll #endif +#ifdef __NR_ppoll_time64 +#define SYS_ppoll_time64 __NR_ppoll_time64 +#endif + #ifdef __NR_prctl #define SYS_prctl __NR_prctl #endif @@ -2301,6 +2305,10 @@ #define SYS_utimes __NR_utimes #endif +#ifdef __NR_utimensat_time64 +#define SYS_utimensat_time64 __NR_utimensat_time64 +#endif + #ifdef __NR_utrap_install #define SYS_utrap_install __NR_utrap_install #endif diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index f18ace7419940..6c3e1520e5aff 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -175,7 +175,7 @@ template struct DyadicFloat { LIBC_INLINE constexpr cpp::enable_if_t< cpp::is_floating_point_v && (FPBits::FRACTION_LEN < Bits), T> generic_as() const { - using FPBits = FPBits; + using FPBits = FPBits; using StorageType = typename FPBits::StorageType; constexpr int EXTRA_FRACTION_LEN 
= Bits - 1 - FPBits::FRACTION_LEN; diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h index 0fd3a6498b865..1b3e6edfc4e0d 100644 --- a/libc/src/__support/GPU/utils.h +++ b/libc/src/__support/GPU/utils.h @@ -92,6 +92,18 @@ LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x, return __gpu_shuffle_idx_u32(lane_mask, idx, x, width); } +LIBC_INLINE uint64_t shuffle(uint64_t lane_mask, uint32_t idx, uint64_t x, + uint32_t width = __gpu_num_lanes()) { + return __gpu_shuffle_idx_u64(lane_mask, idx, x, width); +} + +template +LIBC_INLINE T *shuffle(uint64_t lane_mask, uint32_t idx, T *x, + uint32_t width = __gpu_num_lanes()) { + return reinterpret_cast(__gpu_shuffle_idx_u64( + lane_mask, idx, reinterpret_cast(x), width)); +} + LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) { return __gpu_match_any_u32(lane_mask, x); } diff --git a/libc/src/poll/linux/poll.cpp b/libc/src/poll/linux/poll.cpp index d7c195878ae12..2579ec04c1200 100644 --- a/libc/src/poll/linux/poll.cpp +++ b/libc/src/poll/linux/poll.cpp @@ -18,14 +18,24 @@ #include // SYS_poll, SYS_ppoll +#ifdef SYS_poll +constexpr auto POLL_SYSCALL_ID = SYS_poll; +#elif defined(SYS_ppoll) +constexpr auto POLL_SYSCALL_ID = SYS_ppoll; +#elif defined(SYS_ppoll_time64) +constexpr auto POLL_SYSCALL_ID = SYS_ppoll_time64; +#else +#error "poll, ppoll, ppoll_time64 syscalls not available." 
+#endif + namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, poll, (pollfd * fds, nfds_t nfds, int timeout)) { int ret = 0; #ifdef SYS_poll - ret = LIBC_NAMESPACE::syscall_impl(SYS_poll, fds, nfds, timeout); -#elif defined(SYS_ppoll) + ret = LIBC_NAMESPACE::syscall_impl(POLL_SYSCALL_ID, fds, nfds, timeout); +#elif defined(SYS_ppoll) || defined(SYS_ppoll_time64) timespec ts, *tsp; if (timeout >= 0) { ts.tv_sec = timeout / 1000; @@ -34,11 +44,8 @@ LLVM_LIBC_FUNCTION(int, poll, (pollfd * fds, nfds_t nfds, int timeout)) { } else { tsp = nullptr; } - ret = - LIBC_NAMESPACE::syscall_impl(SYS_ppoll, fds, nfds, tsp, nullptr, 0); -#else -// TODO: https://github.com/llvm/llvm-project/issues/125940 -#error "SYS_ppoll_time64?" + ret = LIBC_NAMESPACE::syscall_impl(POLL_SYSCALL_ID, fds, nfds, tsp, + nullptr, 0); #endif if (ret < 0) { diff --git a/libc/src/search/CMakeLists.txt b/libc/src/search/CMakeLists.txt index 6b7fe3ae5e123..0ed513e648ed1 100644 --- a/libc/src/search/CMakeLists.txt +++ b/libc/src/search/CMakeLists.txt @@ -34,6 +34,7 @@ add_entrypoint_object( HDRS hsearch.h DEPENDS + libc.hdr.types.ACTION libc.hdr.types.ENTRY libc.src.search.hsearch.global libc.src.__support.HashTable.table @@ -48,6 +49,7 @@ add_entrypoint_object( HDRS hsearch_r.h DEPENDS + libc.hdr.types.ACTION libc.hdr.types.ENTRY libc.src.__support.HashTable.table libc.src.errno.errno diff --git a/libc/src/search/hsearch.h b/libc/src/search/hsearch.h index 6619451580940..820ebde522231 100644 --- a/libc/src/search/hsearch.h +++ b/libc/src/search/hsearch.h @@ -9,9 +9,9 @@ #ifndef LLVM_LIBC_SRC_SEARCH_HSEARCH_H #define LLVM_LIBC_SRC_SEARCH_HSEARCH_H +#include "hdr/types/ACTION.h" #include "hdr/types/ENTRY.h" #include "src/__support/macros/config.h" -#include // ACTION namespace LIBC_NAMESPACE_DECL { ENTRY *hsearch(ENTRY item, ACTION action); diff --git a/libc/src/search/hsearch_r.h b/libc/src/search/hsearch_r.h index db2f5a8d3cbe1..98f956fc6c6a7 100644 --- a/libc/src/search/hsearch_r.h +++ 
b/libc/src/search/hsearch_r.h @@ -9,9 +9,10 @@ #ifndef LLVM_LIBC_SRC_SEARCH_HSEARCH_R_H #define LLVM_LIBC_SRC_SEARCH_HSEARCH_R_H +#include "hdr/types/ACTION.h" #include "hdr/types/ENTRY.h" #include "src/__support/macros/config.h" -#include // ACTION +#include // hsearch_data namespace LIBC_NAMESPACE_DECL { int hsearch_r(ENTRY item, ACTION action, ENTRY **retval, diff --git a/libc/src/sys/time/linux/utimes.cpp b/libc/src/sys/time/linux/utimes.cpp index e6e3d073a81a4..76b69937a5f48 100644 --- a/libc/src/sys/time/linux/utimes.cpp +++ b/libc/src/sys/time/linux/utimes.cpp @@ -9,6 +9,7 @@ #include "src/sys/time/utimes.h" #include "hdr/fcntl_macros.h" +#include "hdr/types/struct_timespec.h" #include "hdr/types/struct_timeval.h" #include "src/__support/OSUtil/syscall.h" @@ -20,14 +21,24 @@ namespace LIBC_NAMESPACE_DECL { +#ifdef SYS_utimes +constexpr auto UTIMES_SYSCALL_ID = SYS_utimes; +#elif defined(SYS_utimensat) +constexpr auto UTIMES_SYSCALL_ID = SYS_utimensat; +#elif defined(SYS_utimensat_time64) +constexpr auto UTIMES_SYSCALL_ID = SYS_utimensat_time64; +#else +#error "utimes, utimensat, utimensat_time64, syscalls not available." +#endif + LLVM_LIBC_FUNCTION(int, utimes, (const char *path, const struct timeval times[2])) { int ret; #ifdef SYS_utimes // No need to define a timespec struct, use the syscall directly. - ret = LIBC_NAMESPACE::syscall_impl(SYS_utimes, path, times); -#elif defined(SYS_utimensat) + ret = LIBC_NAMESPACE::syscall_impl(UTIMES_SYSCALL_ID, path, times); +#elif defined(SYS_utimensat) || defined(SYS_utimensat_time64) // the utimensat syscall requires a timespec struct, not timeval. struct timespec ts[2]; struct timespec *ts_ptr = nullptr; // default value if times is nullptr @@ -59,11 +70,8 @@ LLVM_LIBC_FUNCTION(int, utimes, // utimensat syscall. // flags=0 means don't follow symlinks (like utimes) - ret = LIBC_NAMESPACE::syscall_impl(SYS_utimensat, AT_FDCWD, path, ts_ptr, - 0); - -#else -#error "utimensat and utimes syscalls not available." 
+ ret = LIBC_NAMESPACE::syscall_impl(UTIMES_SYSCALL_ID, AT_FDCWD, path, + ts_ptr, 0); #endif // SYS_utimensat if (ret < 0) { diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h index d8772ce38792a..c9f70d2998d37 100644 --- a/libclc/clc/include/clc/clcmacro.h +++ b/libclc/clc/include/clc/clcmacro.h @@ -14,100 +14,140 @@ #define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ - return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \ + return (RET_TYPE##2)(FUNCTION(x.s0), FUNCTION(x.s1)); \ } \ \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \ - return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z)); \ + return (RET_TYPE##3)(FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2)); \ } \ \ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x) { \ - return (RET_TYPE##4)(FUNCTION(x.lo), FUNCTION(x.hi)); \ + return (RET_TYPE##4)(FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), \ + FUNCTION(x.s3)); \ } \ \ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x) { \ - return (RET_TYPE##8)(FUNCTION(x.lo), FUNCTION(x.hi)); \ + return (RET_TYPE##8)(FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), \ + FUNCTION(x.s3), FUNCTION(x.s4), FUNCTION(x.s5), \ + FUNCTION(x.s6), FUNCTION(x.s7)); \ } \ \ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x) { \ - return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi)); \ + return (RET_TYPE##16)( \ + FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \ + FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7), \ + FUNCTION(x.s8), FUNCTION(x.s9), FUNCTION(x.sa), FUNCTION(x.sb), \ + FUNCTION(x.sc), FUNCTION(x.sd), FUNCTION(x.se), FUNCTION(x.sf)); \ } #define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \ - return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \ + return (RET_TYPE##2)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1)); \ } \ \ DECLSPEC 
RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) { \ - return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y), \ - FUNCTION(x.z, y.z)); \ + return (RET_TYPE##3)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ + FUNCTION(x.s2, y.s2)); \ } \ \ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y) { \ - return (RET_TYPE##4)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ + return (RET_TYPE##4)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ + FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3)); \ } \ \ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y) { \ - return (RET_TYPE##8)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ + return (RET_TYPE##8)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ + FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \ + FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), \ + FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7)); \ } \ \ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y) { \ - return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ + return (RET_TYPE##16)( \ + FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), \ + FUNCTION(x.s3, y.s3), FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), \ + FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7), FUNCTION(x.s8, y.s8), \ + FUNCTION(x.s9, y.s9), FUNCTION(x.sa, y.sa), FUNCTION(x.sb, y.sb), \ + FUNCTION(x.sc, y.sc), FUNCTION(x.sd, y.sd), FUNCTION(x.se, y.se), \ + FUNCTION(x.sf, y.sf)); \ } #define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \ - return (RET_TYPE##2)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ + return (RET_TYPE##2)(FUNCTION(x, y.s0), FUNCTION(x, y.s1)); \ } \ \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE x, ARG2_TYPE##3 y) { \ - return (RET_TYPE##3)(FUNCTION(x, y.x), FUNCTION(x, y.y), \ - FUNCTION(x, y.z)); \ + return (RET_TYPE##3)(FUNCTION(x, y.s0), FUNCTION(x, y.s1), \ + FUNCTION(x, y.s2)); \ } \ \ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE x, ARG2_TYPE##4 y) { \ - 
return (RET_TYPE##4)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ + return (RET_TYPE##4)(FUNCTION(x, y.s0), FUNCTION(x, y.s1), \ + FUNCTION(x, y.s2), FUNCTION(x, y.s3)); \ } \ \ DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE x, ARG2_TYPE##8 y) { \ - return (RET_TYPE##8)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ + return (RET_TYPE##8)(FUNCTION(x, y.s0), FUNCTION(x, y.s1), \ + FUNCTION(x, y.s2), FUNCTION(x, y.s3), \ + FUNCTION(x, y.s4), FUNCTION(x, y.s5), \ + FUNCTION(x, y.s6), FUNCTION(x, y.s7)); \ } \ \ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE x, ARG2_TYPE##16 y) { \ - return (RET_TYPE##16)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \ + return (RET_TYPE##16)( \ + FUNCTION(x, y.s0), FUNCTION(x, y.s1), FUNCTION(x, y.s2), \ + FUNCTION(x, y.s3), FUNCTION(x, y.s4), FUNCTION(x, y.s5), \ + FUNCTION(x, y.s6), FUNCTION(x, y.s7), FUNCTION(x, y.s8), \ + FUNCTION(x, y.s9), FUNCTION(x, y.sa), FUNCTION(x, y.sb), \ + FUNCTION(x, y.sc), FUNCTION(x, y.sd), FUNCTION(x, y.se), \ + FUNCTION(x, y.sf)); \ } #define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE, ARG3_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \ ARG3_TYPE##2 z) { \ - return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \ + return (RET_TYPE##2)(FUNCTION(x.s0, y.s0, z.s0), \ + FUNCTION(x.s1, y.s1, z.s1)); \ } \ \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, \ ARG3_TYPE##3 z) { \ - return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y), \ - FUNCTION(x.z, y.z, z.z)); \ + return (RET_TYPE##3)(FUNCTION(x.s0, y.s0, z.s0), \ + FUNCTION(x.s1, y.s1, z.s1), \ + FUNCTION(x.s2, y.s2, z.s2)); \ } \ \ DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, \ ARG3_TYPE##4 z) { \ - return (RET_TYPE##4)(FUNCTION(x.lo, y.lo, z.lo), \ - FUNCTION(x.hi, y.hi, z.hi)); \ + return (RET_TYPE##4)( \ + FUNCTION(x.s0, y.s0, z.s0), FUNCTION(x.s1, y.s1, z.s1), \ + FUNCTION(x.s2, y.s2, z.s2), FUNCTION(x.s3, y.s3, z.s3)); \ } \ \ DECLSPEC RET_TYPE##8 
FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, \ ARG3_TYPE##8 z) { \ - return (RET_TYPE##8)(FUNCTION(x.lo, y.lo, z.lo), \ - FUNCTION(x.hi, y.hi, z.hi)); \ + return (RET_TYPE##8)( \ + FUNCTION(x.s0, y.s0, z.s0), FUNCTION(x.s1, y.s1, z.s1), \ + FUNCTION(x.s2, y.s2, z.s2), FUNCTION(x.s3, y.s3, z.s3), \ + FUNCTION(x.s4, y.s4, z.s4), FUNCTION(x.s5, y.s5, z.s5), \ + FUNCTION(x.s6, y.s6, z.s6), FUNCTION(x.s7, y.s7, z.s7)); \ } \ \ DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y, \ ARG3_TYPE##16 z) { \ - return (RET_TYPE##16)(FUNCTION(x.lo, y.lo, z.lo), \ - FUNCTION(x.hi, y.hi, z.hi)); \ + return (RET_TYPE##16)( \ + FUNCTION(x.s0, y.s0, z.s0), FUNCTION(x.s1, y.s1, z.s1), \ + FUNCTION(x.s2, y.s2, z.s2), FUNCTION(x.s3, y.s3, z.s3), \ + FUNCTION(x.s4, y.s4, z.s4), FUNCTION(x.s5, y.s5, z.s5), \ + FUNCTION(x.s6, y.s6, z.s6), FUNCTION(x.s7, y.s7, z.s7), \ + FUNCTION(x.s8, y.s8, z.s8), FUNCTION(x.s9, y.s9, z.s9), \ + FUNCTION(x.sa, y.sa, z.sa), FUNCTION(x.sb, y.sb, z.sb), \ + FUNCTION(x.sc, y.sc, z.sc), FUNCTION(x.sd, y.sd, z.sd), \ + FUNCTION(x.se, y.se, z.se), FUNCTION(x.sf, y.sf, z.sf)); \ } #define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ @@ -115,48 +155,53 @@ DECLSPEC __CLC_XCONCAT(RET_TYPE, 2) \ FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 2) x, \ ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 2) * y) { \ - return (__CLC_XCONCAT(RET_TYPE, 2))( \ - FUNCTION(x.x, (ADDR_SPACE ARG2_TYPE *)y), \ - FUNCTION(x.y, \ - (ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 1))); \ + ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ + return (__CLC_XCONCAT(RET_TYPE, 2))(FUNCTION(x.s0, ptr), \ + FUNCTION(x.s1, ptr + 1)); \ } \ \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 3) \ FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 3) x, \ ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 3) * y) { \ - return (__CLC_XCONCAT(RET_TYPE, 3))( \ - FUNCTION(x.x, (ADDR_SPACE ARG2_TYPE *)y), \ - FUNCTION(x.y, \ - (ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 1)), \ - FUNCTION(x.z, \ - (ADDR_SPACE ARG2_TYPE 
*)((ADDR_SPACE ARG2_TYPE *)y + 2))); \ + ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ + return (__CLC_XCONCAT(RET_TYPE, 3))(FUNCTION(x.s0, ptr), \ + FUNCTION(x.s1, ptr + 1), \ + FUNCTION(x.s2, ptr + 2)); \ } \ \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 4) \ FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 4) x, \ ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 4) * y) { \ + ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ return (__CLC_XCONCAT(RET_TYPE, 4))( \ - FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 2) *)y), \ - FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT( \ - ARG2_TYPE, 2) *)((ADDR_SPACE ARG2_TYPE *)y + 2))); \ + FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \ + FUNCTION(x.s3, ptr + 3)); \ } \ \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 8) \ FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 8) x, \ ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 8) * y) { \ + ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ return (__CLC_XCONCAT(RET_TYPE, 8))( \ - FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 4) *)y), \ - FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT( \ - ARG2_TYPE, 4) *)((ADDR_SPACE ARG2_TYPE *)y + 4))); \ + FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \ + FUNCTION(x.s3, ptr + 3), FUNCTION(x.s4, ptr + 4), \ + FUNCTION(x.s5, ptr + 5), FUNCTION(x.s6, ptr + 6), \ + FUNCTION(x.s7, ptr + 7)); \ } \ \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 16) \ FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 16) x, \ ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 16) * y) { \ + ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ return (__CLC_XCONCAT(RET_TYPE, 16))( \ - FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 8) *)y), \ - FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT( \ - ARG2_TYPE, 8) *)((ADDR_SPACE ARG2_TYPE *)y + 8))); \ + FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \ + FUNCTION(x.s3, ptr + 3), FUNCTION(x.s4, ptr + 4), \ + FUNCTION(x.s5, ptr + 5), FUNCTION(x.s6, ptr + 6), \ + FUNCTION(x.s7, ptr + 7), FUNCTION(x.s8, ptr + 8), \ + FUNCTION(x.s9, ptr + 9), 
FUNCTION(x.sa, ptr + 10), \ + FUNCTION(x.sb, ptr + 11), FUNCTION(x.sc, ptr + 12), \ + FUNCTION(x.sd, ptr + 13), FUNCTION(x.se, ptr + 14), \ + FUNCTION(x.sf, ptr + 15)); \ } #define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, \ diff --git a/libclc/clc/lib/generic/math/clc_lgamma_r.cl b/libclc/clc/lib/generic/math/clc_lgamma_r.cl index ad3d63b734eca..96a42bbb6e158 100644 --- a/libclc/clc/lib/generic/math/clc_lgamma_r.cl +++ b/libclc/clc/lib/generic/math/clc_lgamma_r.cl @@ -406,13 +406,13 @@ _CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_lgamma_r, float, #define v4 1.04222645593369134254e-01 /* 0x3FBAAE55, 0xD6537C88 */ #define v5 3.21709242282423911810e-03 /* 0x3F6A5ABB, 0x57D0CF61 */ -#define s0 -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */ -#define s1 2.14982415960608852501e-01 /* 0x3FCB848B, 0x36E20878 */ -#define s2 3.25778796408930981787e-01 /* 0x3FD4D98F, 0x4F139F59 */ -#define s3 1.46350472652464452805e-01 /* 0x3FC2BB9C, 0xBEE5F2F7 */ -#define s4 2.66422703033638609560e-02 /* 0x3F9B481C, 0x7E939961 */ -#define s5 1.84028451407337715652e-03 /* 0x3F5E26B6, 0x7368F239 */ -#define s6 3.19475326584100867617e-05 /* 0x3F00BFEC, 0xDD17E945 */ +#define s0_d -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */ +#define s1_d 2.14982415960608852501e-01 /* 0x3FCB848B, 0x36E20878 */ +#define s2_d 3.25778796408930981787e-01 /* 0x3FD4D98F, 0x4F139F59 */ +#define s3_d 1.46350472652464452805e-01 /* 0x3FC2BB9C, 0xBEE5F2F7 */ +#define s4_d 2.66422703033638609560e-02 /* 0x3F9B481C, 0x7E939961 */ +#define s5_d 1.84028451407337715652e-03 /* 0x3F5E26B6, 0x7368F239 */ +#define s6_d 3.19475326584100867617e-05 /* 0x3F00BFEC, 0xDD17E945 */ #define r1 1.39200533467621045958e+00 /* 0x3FF645A7, 0x62C4AB74 */ #define r2 7.21935547567138069525e-01 /* 0x3FE71A18, 0x93D3DCDC */ @@ -530,10 +530,12 @@ _CLC_OVERLOAD _CLC_DEF double __clc_lgamma_r(double x, private int *ip) { __clc_fma( y, __clc_fma( - y, __clc_fma(y, __clc_fma(y, __clc_fma(y, s6, 
s5), s4), s3), - s2), - s1), - s0); + y, + __clc_fma(y, __clc_fma(y, __clc_fma(y, s6_d, s5_d), s4_d), + s3_d), + s2_d), + s1_d), + s0_d); double q = __clc_fma( y, __clc_fma( diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index 3be2bf231eb30..d00b16a899664 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -284,8 +284,9 @@ function(add_libclc_builtin_set) TRIPLE ${ARG_TRIPLE} INPUT ${input_file} OUTPUT ${output_file} - EXTRA_OPTS -fno-builtin -nostdlib "${file_specific_compile_options}" - "${ARG_COMPILE_FLAGS}" -I${CMAKE_CURRENT_SOURCE_DIR}/${file_dir} + EXTRA_OPTS -fno-builtin -nostdlib "${ARG_COMPILE_FLAGS}" + "${file_specific_compile_options}" + -I${CMAKE_CURRENT_SOURCE_DIR}/${file_dir} DEPENDENCIES ${input_file_dep} ) list( APPEND compile_tgts ${tgt} ) diff --git a/libcxx/docs/CodingGuidelines.rst b/libcxx/docs/CodingGuidelines.rst index 4a601dffa87ca..ff312d16cf7bb 100644 --- a/libcxx/docs/CodingGuidelines.rst +++ b/libcxx/docs/CodingGuidelines.rst @@ -124,8 +124,8 @@ Write SFINAE with ``requires`` clauses in C++20-only code subsume other concepts. This means that overloads based on traits can be written without negating more general cases. They also show intent better. -Write ``enable_if`` as ``enable_if_t = 0`` -========================================================= +Write ``enable_if`` as ``enable_if_t = 0`` +========================================================== The form ``enable_if_t = 0`` is the only one that works in every language mode and for overload sets using the same template arguments otherwise. 
If the code must work in C++11 or C++03, the libc++-internal alias diff --git a/libcxx/docs/DesignDocs/FileTimeType.rst b/libcxx/docs/DesignDocs/FileTimeType.rst index f775fd840e236..946c9e515fb9b 100644 --- a/libcxx/docs/DesignDocs/FileTimeType.rst +++ b/libcxx/docs/DesignDocs/FileTimeType.rst @@ -33,7 +33,7 @@ which is defined as follows: }; To represent the range and resolution of ``timespec``, we need to (A) have -nanosecond resolution, and (B) use more than 64 bits (assuming a 64 bit ``time_t``). +nanosecond resolution, and (B) use more than 64 bits (assuming a 64-bit ``time_t``). As the standard requires us to use the ``chrono`` interface, we have to define our own filesystem clock which specifies the period and representation of @@ -207,7 +207,7 @@ code in some way: // Overflow during creation bug. file_time_type timespec_to_file_time_type(struct timespec ts) { - // woops! chrono::seconds and chrono::nanoseconds use a 64 bit representation + // woops! chrono::seconds and chrono::nanoseconds use a 64-bit representation // this may overflow before it's converted to a file_time_type. auto dur = seconds(ts.tv_sec) + nanoseconds(ts.tv_nsec); return file_time_type(dur); @@ -272,7 +272,7 @@ look like. The first thing to notice is that we can't construct ``fs_timespec_rep`` like a ``timespec`` by passing ``{secs, nsecs}``. Instead we're limited to -constructing it from a single 64 bit integer. +constructing it from a single 64-bit integer. We also can't allow the user to inspect the ``tv_sec`` or ``tv_nsec`` values directly. A ``chrono::duration`` represents its value as a tick period and a @@ -350,12 +350,12 @@ Though the above example may appear silly, I think it follows from the incorrect notion that using a ``timespec`` rep in chrono actually makes it act as if it were an actual ``timespec``. 
-Interactions with 32 bit ``time_t`` +Interactions with 32-bit ``time_t`` ----------------------------------- Up until now we've only be considering cases where ``time_t`` is 64 bits, but what -about 32 bit systems/builds where ``time_t`` is 32 bits? (this is the common case -for 32 bit builds). +about 32-bit systems/builds where ``time_t`` is 32 bits? (this is the common case +for 32-bit builds). When ``time_t`` is 32 bits, we can implement ``file_time_type`` simply using 64-bit ``long long``. There is no need to get either ``__int128_t`` or ``timespec`` emulation @@ -431,11 +431,11 @@ Pros: Cons: -* It isn't always available (but on 64 bit machines, it normally is). +* It isn't always available (but on 64-bit machines, it normally is). * It causes ``file_time_type`` to have a larger range than ``timespec``. * It doesn't always act the same as other builtin integer types. For example with ``cout`` or ``to_string``. -* Allows implicit truncation to 64 bit integers. +* Allows implicit truncation to 64-bit integers. * It can be implicitly converted to a builtin integer type by the user, truncating its value. diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst index c571dd6f08fe9..4c4227dfef6a2 100644 --- a/libcxx/docs/ReleaseNotes/21.rst +++ b/libcxx/docs/ReleaseNotes/21.rst @@ -80,6 +80,9 @@ Deprecations and Removals - The ``_LIBCPP_VERBOSE_ABORT_NOT_NOEXCEPT`` has been removed, making ``std::__libcpp_verbose_abort`` unconditionally ``noexcept``. +- libc++ no longer adds ``constexpr`` to ``std::hash>::operator()``, as the ``constexpr`` addition + since C++20 was an unintended extension. + - TODO: The non-conforming extension ``packaged_task::result_type`` has been removed in LLVM 21. 
Potentially breaking changes diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index 9c2ac9edb6777..3320f7d2e7691 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -291,7 +291,7 @@ tests using exceptions. The code to write a test manually would be: .. code-block:: cpp - void test_excption([[maybe_unused]] int arg) { + void test_exception([[maybe_unused]] int arg) { #ifndef TEST_HAS_NO_EXCEPTIONS // do nothing when tests are disabled try { foo(arg); @@ -308,7 +308,7 @@ The same test using a macro: .. code-block:: cpp - void test_excption([[maybe_unused]] int arg) { + void test_exception([[maybe_unused]] int arg) { TEST_VALIDATE_EXCEPTION(bar, [](const bar& e) { LIBCPP_ASSERT(e.what() == what); diff --git a/libcxx/include/__cxx03/__algorithm/adjacent_find.h b/libcxx/include/__cxx03/__algorithm/adjacent_find.h index 6add0f3fe2b53..ac233233bbc74 100644 --- a/libcxx/include/__cxx03/__algorithm/adjacent_find.h +++ b/libcxx/include/__cxx03/__algorithm/adjacent_find.h @@ -26,8 +26,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter -__adjacent_find(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) { +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _Iter __adjacent_find(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) { if (__first == __last) return __first; _Iter __i = __first; @@ -40,13 +39,13 @@ __adjacent_find(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) { } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator adjacent_find(_ForwardIterator __first, _ForwardIterator __last, _BinaryPredicate __pred) { return std::__adjacent_find(std::move(__first), std::move(__last), __pred); } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD 
inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator adjacent_find(_ForwardIterator __first, _ForwardIterator __last) { return std::adjacent_find(std::move(__first), std::move(__last), __equal_to()); } diff --git a/libcxx/include/__cxx03/__algorithm/all_of.h b/libcxx/include/__cxx03/__algorithm/all_of.h index fe46ee5fca43c..8bc39b027e40b 100644 --- a/libcxx/include/__cxx03/__algorithm/all_of.h +++ b/libcxx/include/__cxx03/__algorithm/all_of.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool all_of(_InputIterator __first, _InputIterator __last, _Predicate __pred) { for (; __first != __last; ++__first) if (!__pred(*__first)) diff --git a/libcxx/include/__cxx03/__algorithm/any_of.h b/libcxx/include/__cxx03/__algorithm/any_of.h index 26bf3996e8a6f..6b3462a91a9f4 100644 --- a/libcxx/include/__cxx03/__algorithm/any_of.h +++ b/libcxx/include/__cxx03/__algorithm/any_of.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool any_of(_InputIterator __first, _InputIterator __last, _Predicate __pred) { for (; __first != __last; ++__first) if (__pred(*__first)) diff --git a/libcxx/include/__cxx03/__algorithm/binary_search.h b/libcxx/include/__cxx03/__algorithm/binary_search.h index a72da8e396639..37e273944554c 100644 --- a/libcxx/include/__cxx03/__algorithm/binary_search.h +++ b/libcxx/include/__cxx03/__algorithm/binary_search.h @@ -22,14 +22,14 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool binary_search(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) { __first = std::lower_bound<_ForwardIterator, _Tp, __comp_ref_type<_Compare> >(__first, 
__last, __value, __comp); return __first != __last && !__comp(__value, *__first); } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool binary_search(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { return std::binary_search(__first, __last, __value, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/comp.h b/libcxx/include/__cxx03/__algorithm/comp.h index 0c638b4e4a651..420c4344e0af7 100644 --- a/libcxx/include/__cxx03/__algorithm/comp.h +++ b/libcxx/include/__cxx03/__algorithm/comp.h @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD struct __equal_to { template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator()(const _T1& __x, const _T2& __y) const { + _LIBCPP_HIDE_FROM_ABI bool operator()(const _T1& __x, const _T2& __y) const { return __x == __y; } }; @@ -36,7 +36,7 @@ struct __less {}; template <> struct __less { template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator()(const _Tp& __lhs, const _Up& __rhs) const { + _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __lhs, const _Up& __rhs) const { return __lhs < __rhs; } }; diff --git a/libcxx/include/__cxx03/__algorithm/comp_ref_type.h b/libcxx/include/__cxx03/__algorithm/comp_ref_type.h index ab793da0ad293..bf4d07c89d123 100644 --- a/libcxx/include/__cxx03/__algorithm/comp_ref_type.h +++ b/libcxx/include/__cxx03/__algorithm/comp_ref_type.h @@ -22,10 +22,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct __debug_less { _Compare& __comp_; - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI __debug_less(_Compare& __c) : __comp_(__c) {} + _LIBCPP_HIDE_FROM_ABI __debug_less(_Compare& __c) : __comp_(__c) {} template - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Up& __y) { + _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Up& __y) { bool __r = __comp_(__x, __y); if (__r) __do_compare_assert(0, 
__y, __x); @@ -33,7 +33,7 @@ struct __debug_less { } template - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool operator()(_Tp& __x, _Up& __y) { + _LIBCPP_HIDE_FROM_ABI bool operator()(_Tp& __x, _Up& __y) { bool __r = __comp_(__x, __y); if (__r) __do_compare_assert(0, __y, __x); @@ -41,16 +41,15 @@ struct __debug_less { } template - _LIBCPP_CONSTEXPR_SINCE_CXX14 inline - _LIBCPP_HIDE_FROM_ABI decltype((void)std::declval<_Compare&>()(std::declval<_LHS&>(), std::declval<_RHS&>())) - __do_compare_assert(int, _LHS& __l, _RHS& __r) { + inline _LIBCPP_HIDE_FROM_ABI decltype((void)std::declval<_Compare&>()(std::declval<_LHS&>(), std::declval<_RHS&>())) + __do_compare_assert(int, _LHS& __l, _RHS& __r) { _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(!__comp_(__l, __r), "Comparator does not induce a strict weak ordering"); (void)__l; (void)__r; } template - _LIBCPP_CONSTEXPR_SINCE_CXX14 inline _LIBCPP_HIDE_FROM_ABI void __do_compare_assert(long, _LHS&, _RHS&) {} + inline _LIBCPP_HIDE_FROM_ABI void __do_compare_assert(long, _LHS&, _RHS&) {} }; // Pass the comparator by lvalue reference. Or in the debug mode, using a debugging wrapper that stores a reference. 
diff --git a/libcxx/include/__cxx03/__algorithm/copy.h b/libcxx/include/__cxx03/__algorithm/copy.h index 2aa0ab78b7858..ab164a8f9af51 100644 --- a/libcxx/include/__cxx03/__algorithm/copy.h +++ b/libcxx/include/__cxx03/__algorithm/copy.h @@ -29,13 +29,12 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> __copy(_InIter, _Sent, _OutIter); +inline _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> __copy(_InIter, _Sent, _OutIter); template struct __copy_impl { template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> - operator()(_InIter __first, _Sent __last, _OutIter __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> operator()(_InIter __first, _Sent __last, _OutIter __result) const { while (__first != __last) { *__result = *__first; ++__first; @@ -51,18 +50,16 @@ struct __copy_impl { _OutIter& __result_; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit _CopySegment(_OutIter& __result) - : __result_(__result) {} + _LIBCPP_HIDE_FROM_ABI explicit _CopySegment(_OutIter& __result) : __result_(__result) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void + _LIBCPP_HIDE_FROM_ABI void operator()(typename _Traits::__local_iterator __lfirst, typename _Traits::__local_iterator __llast) { __result_ = std::__copy<_AlgPolicy>(__lfirst, __llast, std::move(__result_)).second; } }; template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> - operator()(_InIter __first, _InIter __last, _OutIter __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> operator()(_InIter __first, _InIter __last, _OutIter __result) const { std::__for_each_segment(__first, __last, _CopySegment<_InIter, _OutIter>(__result)); return std::make_pair(__last, std::move(__result)); } @@ -72,8 +69,7 @@ struct __copy_impl { __enable_if_t<__has_random_access_iterator_category<_InIter>::value && 
!__is_segmented_iterator<_InIter>::value && __is_segmented_iterator<_OutIter>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> - operator()(_InIter __first, _InIter __last, _OutIter __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> operator()(_InIter __first, _InIter __last, _OutIter __result) const { using _Traits = __segmented_iterator_traits<_OutIter>; using _DiffT = typename common_type<__iter_diff_t<_InIter>, __iter_diff_t<_OutIter> >::type; @@ -97,21 +93,19 @@ struct __copy_impl { // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> - operator()(_In* __first, _In* __last, _Out* __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_In*, _Out*> operator()(_In* __first, _In* __last, _Out* __result) const { return std::__copy_trivial_impl(__first, __last, __result); } }; template -pair<_InIter, _OutIter> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 -__copy(_InIter __first, _Sent __last, _OutIter __result) { +pair<_InIter, _OutIter> inline _LIBCPP_HIDE_FROM_ABI __copy(_InIter __first, _Sent __last, _OutIter __result) { return std::__copy_move_unwrap_iters<__copy_impl<_AlgPolicy> >( std::move(__first), std::move(__last), std::move(__result)); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result) { return std::__copy<_ClassicAlgPolicy>(__first, __last, __result).second; } diff --git a/libcxx/include/__cxx03/__algorithm/copy_backward.h b/libcxx/include/__cxx03/__algorithm/copy_backward.h index 9262d13d6c175..0a84b6ed27a98 100644 --- a/libcxx/include/__cxx03/__algorithm/copy_backward.h +++ b/libcxx/include/__cxx03/__algorithm/copy_backward.h @@ -29,14 +29,12 @@ _LIBCPP_PUSH_MACROS 
_LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, _OutIter> -__copy_backward(_InIter __first, _Sent __last, _OutIter __result); +_LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> __copy_backward(_InIter __first, _Sent __last, _OutIter __result); template struct __copy_backward_impl { template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> - operator()(_InIter __first, _Sent __last, _OutIter __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> operator()(_InIter __first, _Sent __last, _OutIter __result) const { auto __last_iter = _IterOps<_AlgPolicy>::next(__first, __last); auto __original_last_iter = __last_iter; @@ -48,8 +46,7 @@ struct __copy_backward_impl { } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> - operator()(_InIter __first, _InIter __last, _OutIter __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> operator()(_InIter __first, _InIter __last, _OutIter __result) const { using _Traits = __segmented_iterator_traits<_InIter>; auto __sfirst = _Traits::__segment(__first); auto __slast = _Traits::__segment(__last); @@ -79,8 +76,7 @@ struct __copy_backward_impl { __enable_if_t<__has_random_access_iterator_category<_InIter>::value && !__is_segmented_iterator<_InIter>::value && __is_segmented_iterator<_OutIter>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> - operator()(_InIter __first, _InIter __last, _OutIter __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> operator()(_InIter __first, _InIter __last, _OutIter __result) const { using _Traits = __segmented_iterator_traits<_OutIter>; auto __orig_last = __last; auto __segment_iterator = _Traits::__segment(__result); @@ -107,21 +103,20 @@ struct __copy_backward_impl { // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. 
template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> - operator()(_In* __first, _In* __last, _Out* __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_In*, _Out*> operator()(_In* __first, _In* __last, _Out* __result) const { return std::__copy_backward_trivial_impl(__first, __last, __result); } }; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_BidirectionalIterator1, _BidirectionalIterator2> +_LIBCPP_HIDE_FROM_ABI pair<_BidirectionalIterator1, _BidirectionalIterator2> __copy_backward(_BidirectionalIterator1 __first, _Sentinel __last, _BidirectionalIterator2 __result) { return std::__copy_move_unwrap_iters<__copy_backward_impl<_AlgPolicy> >( std::move(__first), std::move(__last), std::move(__result)); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _BidirectionalIterator2 +inline _LIBCPP_HIDE_FROM_ABI _BidirectionalIterator2 copy_backward(_BidirectionalIterator1 __first, _BidirectionalIterator1 __last, _BidirectionalIterator2 __result) { static_assert(std::is_copy_constructible<_BidirectionalIterator1>::value && std::is_copy_constructible<_BidirectionalIterator1>::value, diff --git a/libcxx/include/__cxx03/__algorithm/copy_if.h b/libcxx/include/__cxx03/__algorithm/copy_if.h index 2db0c26fb86be..53a85bc51d8ec 100644 --- a/libcxx/include/__cxx03/__algorithm/copy_if.h +++ b/libcxx/include/__cxx03/__algorithm/copy_if.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator copy_if(_InputIterator __first, _InputIterator __last, _OutputIterator __result, _Predicate __pred) { for (; __first != __last; ++__first) { if (__pred(*__first)) { diff --git a/libcxx/include/__cxx03/__algorithm/copy_move_common.h b/libcxx/include/__cxx03/__algorithm/copy_move_common.h index 637b5a01daa75..8d1ba8e39b8bc 100644 --- a/libcxx/include/__cxx03/__algorithm/copy_move_common.h +++ 
b/libcxx/include/__cxx03/__algorithm/copy_move_common.h @@ -58,8 +58,7 @@ struct __can_lower_move_assignment_to_memmove { // `memmove` algorithms implementation. template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> -__copy_trivial_impl(_In* __first, _In* __last, _Out* __result) { +_LIBCPP_HIDE_FROM_ABI pair<_In*, _Out*> __copy_trivial_impl(_In* __first, _In* __last, _Out* __result) { const size_t __n = static_cast(__last - __first); std::__constexpr_memmove(__result, __first, __element_count(__n)); @@ -68,8 +67,7 @@ __copy_trivial_impl(_In* __first, _In* __last, _Out* __result) { } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> -__copy_backward_trivial_impl(_In* __first, _In* __last, _Out* __result) { +_LIBCPP_HIDE_FROM_ABI pair<_In*, _Out*> __copy_backward_trivial_impl(_In* __first, _In* __last, _Out* __result) { const size_t __n = static_cast(__last - __first); __result -= __n; @@ -89,7 +87,7 @@ template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pair<_InIter, _OutIter> +_LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> __copy_move_unwrap_iters(_InIter __first, _Sent __last, _OutIter __out_first) { auto __range = std::__unwrap_range(__first, std::move(__last)); auto __result = _Algorithm()(std::move(__range.first), std::move(__range.second), std::__unwrap_iter(__out_first)); @@ -102,7 +100,7 @@ template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pair<_InIter, _OutIter> +_LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> __copy_move_unwrap_iters(_InIter __first, _Sent __last, _OutIter __out_first) { return _Algorithm()(std::move(__first), std::move(__last), std::move(__out_first)); } diff --git a/libcxx/include/__cxx03/__algorithm/copy_n.h b/libcxx/include/__cxx03/__algorithm/copy_n.h index aedb232b1bd5e..b32b908d89585 100644 --- a/libcxx/include/__cxx03/__algorithm/copy_n.h +++ b/libcxx/include/__cxx03/__algorithm/copy_n.h @@ -27,8 +27,7 @@ template 
::value && !__has_random_access_iterator_category<_InputIterator>::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator -copy_n(_InputIterator __first, _Size __orig_n, _OutputIterator __result) { +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator copy_n(_InputIterator __first, _Size __orig_n, _OutputIterator __result) { typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize; _IntegralSize __n = __orig_n; if (__n > 0) { @@ -47,8 +46,7 @@ template ::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator -copy_n(_InputIterator __first, _Size __orig_n, _OutputIterator __result) { +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator copy_n(_InputIterator __first, _Size __orig_n, _OutputIterator __result) { typedef typename iterator_traits<_InputIterator>::difference_type difference_type; typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize; _IntegralSize __n = __orig_n; diff --git a/libcxx/include/__cxx03/__algorithm/count.h b/libcxx/include/__cxx03/__algorithm/count.h index 28cc28f76dd8f..5440fd031a1d3 100644 --- a/libcxx/include/__cxx03/__algorithm/count.h +++ b/libcxx/include/__cxx03/__algorithm/count.h @@ -31,7 +31,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // generic implementation template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename _IterOps<_AlgPolicy>::template __difference_type<_Iter> +_LIBCPP_HIDE_FROM_ABI typename _IterOps<_AlgPolicy>::template __difference_type<_Iter> __count(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) { typename _IterOps<_AlgPolicy>::template __difference_type<_Iter> __r(0); for (; __first != __last; ++__first) @@ -42,7 +42,7 @@ __count(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) { // __bit_iterator implementation template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename __bit_iterator<_Cp, _IsConst>::difference_type +_LIBCPP_HIDE_FROM_ABI typename __bit_iterator<_Cp, 
_IsConst>::difference_type __count_bool(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) { using _It = __bit_iterator<_Cp, _IsConst>; using __storage_type = typename _It::__storage_type; @@ -71,7 +71,7 @@ __count_bool(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) } template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __iter_diff_t<__bit_iterator<_Cp, _IsConst> > +_LIBCPP_HIDE_FROM_ABI __iter_diff_t<__bit_iterator<_Cp, _IsConst> > __count(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value, _Proj&) { if (__value) return std::__count_bool(__first, static_cast(__last - __first)); @@ -79,7 +79,7 @@ __count(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __l } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __iter_diff_t<_InputIterator> +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI __iter_diff_t<_InputIterator> count(_InputIterator __first, _InputIterator __last, const _Tp& __value) { __identity __proj; return std::__count<_ClassicAlgPolicy>(__first, __last, __value, __proj); diff --git a/libcxx/include/__cxx03/__algorithm/count_if.h b/libcxx/include/__cxx03/__algorithm/count_if.h index d333e86189176..8a31989cf8a33 100644 --- a/libcxx/include/__cxx03/__algorithm/count_if.h +++ b/libcxx/include/__cxx03/__algorithm/count_if.h @@ -20,8 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 -typename iterator_traits<_InputIterator>::difference_type +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI typename iterator_traits<_InputIterator>::difference_type count_if(_InputIterator __first, _InputIterator __last, _Predicate __pred) { typename iterator_traits<_InputIterator>::difference_type __r(0); for (; __first != __last; ++__first) diff --git a/libcxx/include/__cxx03/__algorithm/equal.h b/libcxx/include/__cxx03/__algorithm/equal.h index 
e1d458590e614..5dbc75720e2a0 100644 --- a/libcxx/include/__cxx03/__algorithm/equal.h +++ b/libcxx/include/__cxx03/__algorithm/equal.h @@ -34,7 +34,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __equal_iter_impl( +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool __equal_iter_impl( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _BinaryPredicate& __pred) { for (; __first1 != __last1; ++__first1, (void)++__first2) if (!__pred(*__first1, *__first2)) @@ -48,20 +48,20 @@ template && !is_volatile<_Tp>::value && !is_volatile<_Up>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value, int> = 0> -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool __equal_iter_impl(_Tp* __first1, _Tp* __last1, _Up* __first2, _BinaryPredicate&) { return std::__constexpr_memcmp_equal(__first1, __first2, __element_count(__last1 - __first1)); } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool equal(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _BinaryPredicate __pred) { return std::__equal_iter_impl( std::__unwrap_iter(__first1), std::__unwrap_iter(__last1), std::__unwrap_iter(__first2), __pred); } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool equal(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2) { return std::equal(__first1, __last1, __first2, __equal_to()); } diff --git a/libcxx/include/__cxx03/__algorithm/equal_range.h b/libcxx/include/__cxx03/__algorithm/equal_range.h index c2d23cdf0df4a..e84b536415c20 100644 --- a/libcxx/include/__cxx03/__algorithm/equal_range.h +++ b/libcxx/include/__cxx03/__algorithm/equal_range.h 
@@ -37,7 +37,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter, _Iter> +_LIBCPP_HIDE_FROM_ABI pair<_Iter, _Iter> __equal_range(_Iter __first, _Sent __last, const _Tp& __value, _Compare&& __comp, _Proj&& __proj) { auto __len = _IterOps<_AlgPolicy>::distance(__first, __last); _Iter __end = _IterOps<_AlgPolicy>::next(__first, __last); @@ -60,7 +60,7 @@ __equal_range(_Iter __first, _Sent __last, const _Tp& __value, _Compare&& __comp } template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator, _ForwardIterator> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI pair<_ForwardIterator, _ForwardIterator> equal_range(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) { static_assert(__is_callable<_Compare, decltype(*__first), const _Tp&>::value, "The comparator has to be callable"); static_assert(is_copy_constructible<_ForwardIterator>::value, "Iterator has to be copy constructible"); @@ -73,7 +73,7 @@ equal_range(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __valu } template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator, _ForwardIterator> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI pair<_ForwardIterator, _ForwardIterator> equal_range(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { return std::equal_range(std::move(__first), std::move(__last), __value, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/fill.h b/libcxx/include/__cxx03/__algorithm/fill.h index 4aaf2744e8a58..c1b92ddf293cf 100644 --- a/libcxx/include/__cxx03/__algorithm/fill.h +++ b/libcxx/include/__cxx03/__algorithm/fill.h @@ -22,21 +22,20 @@ _LIBCPP_BEGIN_NAMESPACE_STD // fill isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset. 
template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void __fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, forward_iterator_tag) { for (; __first != __last; ++__first) *__first = __value; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void __fill(_RandomAccessIterator __first, _RandomAccessIterator __last, const _Tp& __value, random_access_iterator_tag) { std::fill_n(__first, __last - __first, __value); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { +inline _LIBCPP_HIDE_FROM_ABI void fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { std::__fill(__first, __last, __value, typename iterator_traits<_ForwardIterator>::iterator_category()); } diff --git a/libcxx/include/__cxx03/__algorithm/fill_n.h b/libcxx/include/__cxx03/__algorithm/fill_n.h index 99b712c7b0360..b52f650241176 100644 --- a/libcxx/include/__cxx03/__algorithm/fill_n.h +++ b/libcxx/include/__cxx03/__algorithm/fill_n.h @@ -28,12 +28,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD // fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset. 
template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator -__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value); +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator __fill_n(_OutputIterator __first, _Size __n, const _Tp& __value); template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void -__fill_n_bool(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { +_LIBCPP_HIDE_FROM_ABI void __fill_n_bool(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { using _It = __bit_iterator<_Cp, false>; using __storage_type = typename _It::__storage_type; @@ -66,7 +64,7 @@ __fill_n_bool(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> +inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __fill_n(__bit_iterator<_Cp, false> __first, _Size __n, const bool& __value) { if (__n > 0) { if (__value) @@ -78,16 +76,14 @@ __fill_n(__bit_iterator<_Cp, false> __first, _Size __n, const bool& __value) { } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator -__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) { +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator __fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) { for (; __n > 0; ++__first, (void)--__n) *__first = __value; return __first; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator -fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) { +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) { return std::__fill_n(__first, std::__convert_to_integral(__n), __value); } diff --git a/libcxx/include/__cxx03/__algorithm/find.h b/libcxx/include/__cxx03/__algorithm/find.h index 8afa8cb389d16..ddf8201c6e951 100644 --- a/libcxx/include/__cxx03/__algorithm/find.h +++ b/libcxx/include/__cxx03/__algorithm/find.h @@ 
-41,8 +41,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // generic implementation template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter -__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) { +_LIBCPP_HIDE_FROM_ABI _Iter __find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) { for (; __first != __last; ++__first) if (std::__invoke(__proj, *__first) == __value) break; @@ -56,7 +55,7 @@ template ::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value && sizeof(_Tp) == 1, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) { +_LIBCPP_HIDE_FROM_ABI _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) { if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first)) return __ret; return __last; @@ -69,7 +68,7 @@ template ::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value && sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t), int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) { +_LIBCPP_HIDE_FROM_ABI _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) { if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first)) return __ret; return __last; @@ -85,8 +84,7 @@ template ::value && is_integral<_Up>::value && is_signed<_Tp>::value == is_signed<_Up>::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* -__find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj& __proj) { +_LIBCPP_HIDE_FROM_ABI _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj& __proj) { if (__value < numeric_limits<_Tp>::min() || __value > numeric_limits<_Tp>::max()) return __last; return std::__find(__first, __last, _Tp(__value), __proj); @@ -94,7 +92,7 @@ __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj& __proj) { // __bit_iterator implementation 
template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, _IsConst> +_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, _IsConst> __find_bool(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) { using _It = __bit_iterator<_Cp, _IsConst>; using __storage_type = typename _It::__storage_type; @@ -130,7 +128,7 @@ __find_bool(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) } template ::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, _IsConst> +inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, _IsConst> __find(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value, _Proj&) { if (static_cast(__value)) return std::__find_bool(__first, static_cast(__last - __first)); @@ -146,7 +144,7 @@ template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator +_LIBCPP_HIDE_FROM_ABI _SegmentedIterator __find(_SegmentedIterator __first, _SegmentedIterator __last, const _Tp& __value, _Proj& __proj) { return std::__find_segment_if(std::move(__first), std::move(__last), __find_segment<_Tp>(__value), __proj); } @@ -155,18 +153,17 @@ template struct __find_segment { const _Tp& __value_; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __find_segment(const _Tp& __value) : __value_(__value) {} + _LIBCPP_HIDE_FROM_ABI __find_segment(const _Tp& __value) : __value_(__value) {} template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _InputIterator - operator()(_InputIterator __first, _InputIterator __last, _Proj& __proj) const { + _LIBCPP_HIDE_FROM_ABI _InputIterator operator()(_InputIterator __first, _InputIterator __last, _Proj& __proj) const { return std::__find(__first, __last, __value_, __proj); } }; // public API template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _InputIterator find(_InputIterator __first, _InputIterator __last, 
const _Tp& __value) { __identity __proj; return std::__rewrap_iter( diff --git a/libcxx/include/__cxx03/__algorithm/find_end.h b/libcxx/include/__cxx03/__algorithm/find_end.h index 5feececb0adfb..8045021d5a526 100644 --- a/libcxx/include/__cxx03/__algorithm/find_end.h +++ b/libcxx/include/__cxx03/__algorithm/find_end.h @@ -36,7 +36,7 @@ template < class _AlgPolicy, class _Pred, class _Proj1, class _Proj2> -_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter1, _Iter1> __find_end_impl( +_LIBCPP_HIDE_FROM_ABI inline pair<_Iter1, _Iter1> __find_end_impl( _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, @@ -88,7 +88,7 @@ template < class _IterOps, class _Sent2, class _Proj1, class _Proj2> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter1 __find_end( +_LIBCPP_HIDE_FROM_ABI _Iter1 __find_end( _Iter1 __first1, _Sent1 __sent1, _Iter2 __first2, @@ -139,7 +139,7 @@ template < class _AlgPolicy, class _Sent2, class _Proj1, class _Proj2> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter1 __find_end( +_LIBCPP_HIDE_FROM_ABI _Iter1 __find_end( _Iter1 __first1, _Sent1 __sent1, _Iter2 __first2, @@ -184,7 +184,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter1 __find_end( } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator1 __find_end_classic( +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator1 __find_end_classic( _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, @@ -205,7 +205,7 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Fo } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1 find_end( +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator1 find_end( _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, @@ -215,7 +215,7 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Fo } 
template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1 +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator1 find_end(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2) { return std::find_end(__first1, __last1, __first2, __last2, __equal_to()); } diff --git a/libcxx/include/__cxx03/__algorithm/find_first_of.h b/libcxx/include/__cxx03/__algorithm/find_first_of.h index b1b3e5f3be01e..dd61fb4868f6c 100644 --- a/libcxx/include/__cxx03/__algorithm/find_first_of.h +++ b/libcxx/include/__cxx03/__algorithm/find_first_of.h @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator1 __find_first_of_ce( +_LIBCPP_HIDE_FROM_ABI _ForwardIterator1 __find_first_of_ce( _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, @@ -35,7 +35,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator1 __find_fir } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1 find_first_of( +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator1 find_first_of( _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, @@ -45,7 +45,7 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Fo } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1 find_first_of( +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator1 find_first_of( _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2) { return std::__find_first_of_ce(__first1, __last1, __first2, __last2, __equal_to()); } diff --git a/libcxx/include/__cxx03/__algorithm/find_if.h b/libcxx/include/__cxx03/__algorithm/find_if.h index ca4139c86787c..e7cb971f1518f 100644 --- 
a/libcxx/include/__cxx03/__algorithm/find_if.h +++ b/libcxx/include/__cxx03/__algorithm/find_if.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _InputIterator find_if(_InputIterator __first, _InputIterator __last, _Predicate __pred) { for (; __first != __last; ++__first) if (__pred(*__first)) diff --git a/libcxx/include/__cxx03/__algorithm/find_if_not.h b/libcxx/include/__cxx03/__algorithm/find_if_not.h index a662dfbddfbb9..d3a6d7b44f967 100644 --- a/libcxx/include/__cxx03/__algorithm/find_if_not.h +++ b/libcxx/include/__cxx03/__algorithm/find_if_not.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _InputIterator find_if_not(_InputIterator __first, _InputIterator __last, _Predicate __pred) { for (; __first != __last; ++__first) if (!__pred(*__first)) diff --git a/libcxx/include/__cxx03/__algorithm/find_segment_if.h b/libcxx/include/__cxx03/__algorithm/find_segment_if.h index 3475e9e8bdacd..9fdf8ae53d517 100644 --- a/libcxx/include/__cxx03/__algorithm/find_segment_if.h +++ b/libcxx/include/__cxx03/__algorithm/find_segment_if.h @@ -25,7 +25,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // ranges algorithms, or __identity for classic algorithms. 
template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator +_LIBCPP_HIDE_FROM_ABI _SegmentedIterator __find_segment_if(_SegmentedIterator __first, _SegmentedIterator __last, _Pred __pred, _Proj& __proj) { using _Traits = __segmented_iterator_traits<_SegmentedIterator>; diff --git a/libcxx/include/__cxx03/__algorithm/for_each.h b/libcxx/include/__cxx03/__algorithm/for_each.h index f79eb434465cf..d160a9eddc50b 100644 --- a/libcxx/include/__cxx03/__algorithm/for_each.h +++ b/libcxx/include/__cxx03/__algorithm/for_each.h @@ -26,8 +26,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function -for_each(_InputIterator __first, _InputIterator __last, _Function __f) { +_LIBCPP_HIDE_FROM_ABI _Function for_each(_InputIterator __first, _InputIterator __last, _Function __f) { for (; __first != __last; ++__first) __f(*__first); return __f; diff --git a/libcxx/include/__cxx03/__algorithm/for_each_segment.h b/libcxx/include/__cxx03/__algorithm/for_each_segment.h index 02b4a1799d6a8..b1d54ad427659 100644 --- a/libcxx/include/__cxx03/__algorithm/for_each_segment.h +++ b/libcxx/include/__cxx03/__algorithm/for_each_segment.h @@ -23,8 +23,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // Anything that is returned from __func is ignored. 
template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void -__for_each_segment(_SegmentedIterator __first, _SegmentedIterator __last, _Functor __func) { +_LIBCPP_HIDE_FROM_ABI void __for_each_segment(_SegmentedIterator __first, _SegmentedIterator __last, _Functor __func) { using _Traits = __segmented_iterator_traits<_SegmentedIterator>; auto __sfirst = _Traits::__segment(__first); diff --git a/libcxx/include/__cxx03/__algorithm/generate.h b/libcxx/include/__cxx03/__algorithm/generate.h index fa1929b639ad1..2d98820a8c738 100644 --- a/libcxx/include/__cxx03/__algorithm/generate.h +++ b/libcxx/include/__cxx03/__algorithm/generate.h @@ -18,8 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -generate(_ForwardIterator __first, _ForwardIterator __last, _Generator __gen) { +inline _LIBCPP_HIDE_FROM_ABI void generate(_ForwardIterator __first, _ForwardIterator __last, _Generator __gen) { for (; __first != __last; ++__first) *__first = __gen(); } diff --git a/libcxx/include/__cxx03/__algorithm/generate_n.h b/libcxx/include/__cxx03/__algorithm/generate_n.h index 5a421131070e9..f1ea183ba7d68 100644 --- a/libcxx/include/__cxx03/__algorithm/generate_n.h +++ b/libcxx/include/__cxx03/__algorithm/generate_n.h @@ -19,8 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator -generate_n(_OutputIterator __first, _Size __orig_n, _Generator __gen) { +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator generate_n(_OutputIterator __first, _Size __orig_n, _Generator __gen) { typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize; _IntegralSize __n = __orig_n; for (; __n > 0; ++__first, (void)--__n) diff --git a/libcxx/include/__cxx03/__algorithm/half_positive.h b/libcxx/include/__cxx03/__algorithm/half_positive.h index a436a6086b5e9..6e01fce6fa240 100644 --- a/libcxx/include/__cxx03/__algorithm/half_positive.h +++ 
b/libcxx/include/__cxx03/__algorithm/half_positive.h @@ -23,12 +23,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD // Perform division by two quickly for positive integers (llvm.org/PR39129) template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Integral __half_positive(_Integral __value) { +_LIBCPP_HIDE_FROM_ABI _Integral __half_positive(_Integral __value) { return static_cast<_Integral>(static_cast<__make_unsigned_t<_Integral> >(__value) / 2); } template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Tp __half_positive(_Tp __value) { +_LIBCPP_HIDE_FROM_ABI _Tp __half_positive(_Tp __value) { return __value / 2; } diff --git a/libcxx/include/__cxx03/__algorithm/includes.h b/libcxx/include/__cxx03/__algorithm/includes.h index 725940b5acb74..194f508932272 100644 --- a/libcxx/include/__cxx03/__algorithm/includes.h +++ b/libcxx/include/__cxx03/__algorithm/includes.h @@ -28,7 +28,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __includes( +_LIBCPP_HIDE_FROM_ABI bool __includes( _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, @@ -47,7 +47,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __includes( } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool includes(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, @@ -67,7 +67,7 @@ includes(_InputIterator1 __first1, } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool includes(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _InputIterator2 __last2) { return std::includes(std::move(__first1), std::move(__last1), std::move(__first2), std::move(__last2), __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/is_heap.h b/libcxx/include/__cxx03/__algorithm/is_heap.h index 
c19adb84ba570..a29cefe376409 100644 --- a/libcxx/include/__cxx03/__algorithm/is_heap.h +++ b/libcxx/include/__cxx03/__algorithm/is_heap.h @@ -22,13 +22,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool is_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) { return std::__is_heap_until(__first, __last, static_cast<__comp_ref_type<_Compare> >(__comp)) == __last; } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool is_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) { return std::is_heap(__first, __last, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/is_heap_until.h b/libcxx/include/__cxx03/__algorithm/is_heap_until.h index e3a6d9769fcc5..6625af701c5c3 100644 --- a/libcxx/include/__cxx03/__algorithm/is_heap_until.h +++ b/libcxx/include/__cxx03/__algorithm/is_heap_until.h @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator +_LIBCPP_HIDE_FROM_ABI _RandomAccessIterator __is_heap_until(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare&& __comp) { typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; difference_type __len = __last - __first; @@ -46,13 +46,13 @@ __is_heap_until(_RandomAccessIterator __first, _RandomAccessIterator __last, _Co } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _RandomAccessIterator is_heap_until(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) { return std::__is_heap_until(__first, __last, static_cast<__comp_ref_type<_Compare> >(__comp)); } template -_LIBCPP_NODISCARD inline 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _RandomAccessIterator is_heap_until(_RandomAccessIterator __first, _RandomAccessIterator __last) { return std::__is_heap_until(__first, __last, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/is_partitioned.h b/libcxx/include/__cxx03/__algorithm/is_partitioned.h index a7dff7bf42544..c4547d3aa2ffa 100644 --- a/libcxx/include/__cxx03/__algorithm/is_partitioned.h +++ b/libcxx/include/__cxx03/__algorithm/is_partitioned.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool is_partitioned(_InputIterator __first, _InputIterator __last, _Predicate __pred) { for (; __first != __last; ++__first) if (!__pred(*__first)) diff --git a/libcxx/include/__cxx03/__algorithm/is_permutation.h b/libcxx/include/__cxx03/__algorithm/is_permutation.h index 3089acf119845..9402fdf5ce1c3 100644 --- a/libcxx/include/__cxx03/__algorithm/is_permutation.h +++ b/libcxx/include/__cxx03/__algorithm/is_permutation.h @@ -54,7 +54,7 @@ template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __is_permutation_impl( +_LIBCPP_HIDE_FROM_ABI bool __is_permutation_impl( _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, @@ -98,7 +98,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __is_permutation_impl( // 2+1 iterators, predicate. Not used by range algorithms. template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __is_permutation( +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool __is_permutation( _ForwardIterator1 __first1, _Sentinel1 __last1, _ForwardIterator2 __first2, _BinaryPredicate&& __pred) { // Shorten sequences as much as possible by lopping of any equal prefix. 
for (; __first1 != __last1; ++__first1, (void)++__first2) { @@ -135,7 +135,7 @@ template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __is_permutation( +_LIBCPP_HIDE_FROM_ABI bool __is_permutation( _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, @@ -178,7 +178,7 @@ template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __is_permutation( +_LIBCPP_HIDE_FROM_ABI bool __is_permutation( _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, @@ -209,7 +209,7 @@ template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __is_permutation( +_LIBCPP_HIDE_FROM_ABI bool __is_permutation( _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, @@ -232,7 +232,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __is_permutation( // 2+1 iterators, predicate template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool is_permutation( +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool is_permutation( _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _BinaryPredicate __pred) { static_assert(__is_callable<_BinaryPredicate, decltype(*__first1), decltype(*__first2)>::value, "The predicate has to be callable"); @@ -242,7 +242,7 @@ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool is_pe // 2+1 iterators template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2) { return std::is_permutation(__first1, __last1, __first2, __equal_to()); } diff --git a/libcxx/include/__cxx03/__algorithm/is_sorted.h b/libcxx/include/__cxx03/__algorithm/is_sorted.h index 1318f5baf8394..262b963f58f6b 100644 --- a/libcxx/include/__cxx03/__algorithm/is_sorted.h +++ b/libcxx/include/__cxx03/__algorithm/is_sorted.h @@ -22,14 +22,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool is_sorted(_ForwardIterator __first, _ForwardIterator __last, _Compare __comp) { return std::__is_sorted_until<__comp_ref_type<_Compare> >(__first, __last, __comp) == __last; } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool -is_sorted(_ForwardIterator __first, _ForwardIterator __last) { +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool is_sorted(_ForwardIterator __first, _ForwardIterator __last) { return std::is_sorted(__first, __last, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/is_sorted_until.h b/libcxx/include/__cxx03/__algorithm/is_sorted_until.h index f97fb7c2e53ef..9bd3998e1ed78 100644 --- a/libcxx/include/__cxx03/__algorithm/is_sorted_until.h +++ b/libcxx/include/__cxx03/__algorithm/is_sorted_until.h @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_HIDE_FROM_ABI _ForwardIterator __is_sorted_until(_ForwardIterator __first, _ForwardIterator __last, _Compare __comp) { if (__first != __last) { _ForwardIterator __i = __first; @@ -35,13 +35,13 @@ __is_sorted_until(_ForwardIterator __first, _ForwardIterator __last, _Compare __ } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator is_sorted_until(_ForwardIterator __first, _ForwardIterator __last, _Compare __comp) { return std::__is_sorted_until<__comp_ref_type<_Compare> >(__first, __last, __comp); } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator is_sorted_until(_ForwardIterator __first, _ForwardIterator __last) { return std::is_sorted_until(__first, __last, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/iter_swap.h 
b/libcxx/include/__cxx03/__algorithm/iter_swap.h index 4fcbcdcf1e050..896a277c6af27 100644 --- a/libcxx/include/__cxx03/__algorithm/iter_swap.h +++ b/libcxx/include/__cxx03/__algorithm/iter_swap.h @@ -20,9 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void iter_swap(_ForwardIterator1 __a, _ForwardIterator2 __b) - // _NOEXCEPT_(_NOEXCEPT_(swap(*__a, *__b))) - _NOEXCEPT_(_NOEXCEPT_(swap(*std::declval<_ForwardIterator1>(), *std::declval<_ForwardIterator2>()))) { +inline _LIBCPP_HIDE_FROM_ABI void iter_swap(_ForwardIterator1 __a, _ForwardIterator2 __b) { swap(*__a, *__b); } diff --git a/libcxx/include/__cxx03/__algorithm/iterator_operations.h b/libcxx/include/__cxx03/__algorithm/iterator_operations.h index c594723e7d906..b824928ee8fd8 100644 --- a/libcxx/include/__cxx03/__algorithm/iterator_operations.h +++ b/libcxx/include/__cxx03/__algorithm/iterator_operations.h @@ -52,14 +52,13 @@ struct _IterOps<_ClassicAlgPolicy> { // advance template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static void advance(_Iter& __iter, _Distance __count) { + _LIBCPP_HIDE_FROM_ABI static void advance(_Iter& __iter, _Distance __count) { std::advance(__iter, __count); } // distance template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static typename iterator_traits<_Iter>::difference_type - distance(_Iter __first, _Iter __last) { + _LIBCPP_HIDE_FROM_ABI static typename iterator_traits<_Iter>::difference_type distance(_Iter __first, _Iter __last) { return std::distance(__first, __last); } @@ -70,7 +69,7 @@ struct _IterOps<_ClassicAlgPolicy> { using __move_t = decltype(std::move(*std::declval<_Iter&>())); template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static void __validate_iter_reference() { + _LIBCPP_HIDE_FROM_ABI static void __validate_iter_reference() { static_assert( is_same<__deref_t<_Iter>, typename iterator_traits<__remove_cvref_t<_Iter> >::reference>::value, "It looks like your iterator's 
`iterator_traits::reference` does not match the return type of " @@ -80,7 +79,7 @@ struct _IterOps<_ClassicAlgPolicy> { // iter_move template >::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static + _LIBCPP_HIDE_FROM_ABI static // If the result of dereferencing `_Iter` is a reference type, deduce the result of calling `std::move` on it. // Note that the C++03 mode doesn't support `decltype(auto)` as the return type. __move_t<_Iter> @@ -91,7 +90,7 @@ struct _IterOps<_ClassicAlgPolicy> { } template >::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static + _LIBCPP_HIDE_FROM_ABI static // If the result of dereferencing `_Iter` is a value type, deduce the return value of this function to also be a // value -- otherwise, after `operator*` returns a temporary, this function would return a dangling reference to // that temporary. Note that the C++03 mode doesn't support `auto` as the return type. @@ -104,37 +103,37 @@ struct _IterOps<_ClassicAlgPolicy> { // iter_swap template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static void iter_swap(_Iter1&& __a, _Iter2&& __b) { + _LIBCPP_HIDE_FROM_ABI static void iter_swap(_Iter1&& __a, _Iter2&& __b) { std::iter_swap(std::forward<_Iter1>(__a), std::forward<_Iter2>(__b)); } // next template - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iterator next(_Iterator, _Iterator __last) { + _LIBCPP_HIDE_FROM_ABI static _Iterator next(_Iterator, _Iterator __last) { return __last; } template - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX14 __remove_cvref_t<_Iter> + _LIBCPP_HIDE_FROM_ABI static __remove_cvref_t<_Iter> next(_Iter&& __it, typename iterator_traits<__remove_cvref_t<_Iter> >::difference_type __n = 1) { return std::next(std::forward<_Iter>(__it), __n); } // prev template - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX14 __remove_cvref_t<_Iter> + _LIBCPP_HIDE_FROM_ABI static __remove_cvref_t<_Iter> prev(_Iter&& __iter, typename 
iterator_traits<__remove_cvref_t<_Iter> >::difference_type __n = 1) { return std::prev(std::forward<_Iter>(__iter), __n); } template - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX14 void __advance_to(_Iter& __first, _Iter __last) { + _LIBCPP_HIDE_FROM_ABI static void __advance_to(_Iter& __first, _Iter __last) { __first = __last; } // advance with sentinel, a la std::ranges::advance template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_Iter> + _LIBCPP_HIDE_FROM_ABI static __difference_type<_Iter> __advance_to(_Iter& __iter, __difference_type<_Iter> __count, const _Iter& __sentinel) { return _IterOps::__advance_to(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category()); } @@ -142,7 +141,7 @@ struct _IterOps<_ClassicAlgPolicy> { private: // advance with sentinel, a la std::ranges::advance -- InputIterator specialization template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_InputIter> __advance_to( + _LIBCPP_HIDE_FROM_ABI static __difference_type<_InputIter> __advance_to( _InputIter& __iter, __difference_type<_InputIter> __count, const _InputIter& __sentinel, input_iterator_tag) { __difference_type<_InputIter> __dist = 0; for (; __dist < __count && __iter != __sentinel; ++__dist) @@ -152,7 +151,7 @@ struct _IterOps<_ClassicAlgPolicy> { // advance with sentinel, a la std::ranges::advance -- BidirectionalIterator specialization template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_BiDirIter> + _LIBCPP_HIDE_FROM_ABI static __difference_type<_BiDirIter> __advance_to(_BiDirIter& __iter, __difference_type<_BiDirIter> __count, const _BiDirIter& __sentinel, @@ -169,7 +168,7 @@ struct _IterOps<_ClassicAlgPolicy> { // advance with sentinel, a la std::ranges::advance -- RandomIterator specialization template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_RandIter> + _LIBCPP_HIDE_FROM_ABI static 
__difference_type<_RandIter> __advance_to(_RandIter& __iter, __difference_type<_RandIter> __count, const _RandIter& __sentinel, diff --git a/libcxx/include/__cxx03/__algorithm/lexicographical_compare.h b/libcxx/include/__cxx03/__algorithm/lexicographical_compare.h index b019e4b5021b4..0d991c99a3317 100644 --- a/libcxx/include/__cxx03/__algorithm/lexicographical_compare.h +++ b/libcxx/include/__cxx03/__algorithm/lexicographical_compare.h @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __lexicographical_compare( +_LIBCPP_HIDE_FROM_ABI bool __lexicographical_compare( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, @@ -37,7 +37,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __lexicographical_compa } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool lexicographical_compare( +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool lexicographical_compare( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, @@ -47,7 +47,7 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 boo } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool lexicographical_compare( +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool lexicographical_compare( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _InputIterator2 __last2) { return std::lexicographical_compare(__first1, __last1, __first2, __last2, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/lower_bound.h b/libcxx/include/__cxx03/__algorithm/lower_bound.h index a0d728009b7ff..47ae2550a6bbd 100644 --- a/libcxx/include/__cxx03/__algorithm/lower_bound.h +++ b/libcxx/include/__cxx03/__algorithm/lower_bound.h @@ -28,7 +28,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter __lower_bound_bisecting( 
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _Iter __lower_bound_bisecting( _Iter __first, const _Type& __value, typename iterator_traits<_Iter>::difference_type __len, @@ -58,7 +58,7 @@ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter __lo // whereas the one-sided version will yield O(n) operations on both counts, with a \Omega(log(n)) bound on the number of // comparisons. template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _ForwardIterator __lower_bound_onesided(_ForwardIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) { // step = 0, ensuring we can always short-circuit when distance is 1 later on if (__first == __last || !std::__invoke(__comp, std::__invoke(__proj, *__first), __value)) @@ -84,14 +84,14 @@ __lower_bound_onesided(_ForwardIterator __first, _Sent __last, const _Type& __va } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator __lower_bound(_ForwardIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) { const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last); return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj); } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator lower_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) { static_assert(__is_callable<_Compare, decltype(*__first), const _Tp&>::value, "The comparator has to be callable"); auto __proj = std::__identity(); @@ -99,7 +99,7 @@ lower_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __valu } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 
_ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator lower_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { return std::lower_bound(__first, __last, __value, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/make_heap.h b/libcxx/include/__cxx03/__algorithm/make_heap.h index 35a7f7bf9779f..faa7e9e185a5e 100644 --- a/libcxx/include/__cxx03/__algorithm/make_heap.h +++ b/libcxx/include/__cxx03/__algorithm/make_heap.h @@ -27,7 +27,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void +inline _LIBCPP_HIDE_FROM_ABI void __make_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare&& __comp) { __comp_ref_type<_Compare> __comp_ref = __comp; @@ -42,14 +42,13 @@ __make_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compar } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void make_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) { std::__make_heap<_ClassicAlgPolicy>(std::move(__first), std::move(__last), __comp); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -make_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) { +inline _LIBCPP_HIDE_FROM_ABI void make_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) { std::make_heap(std::move(__first), std::move(__last), __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/make_projected.h b/libcxx/include/__cxx03/__algorithm/make_projected.h index 68cda60d4e473..592f6d6a5d7e1 100644 --- a/libcxx/include/__cxx03/__algorithm/make_projected.h +++ b/libcxx/include/__cxx03/__algorithm/make_projected.h @@ -31,12 +31,11 @@ struct _ProjectedPred { _Pred& __pred; // Can be a unary or a binary predicate. 
_Proj& __proj; - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI _ProjectedPred(_Pred& __pred_arg, _Proj& __proj_arg) - : __pred(__pred_arg), __proj(__proj_arg) {} + _LIBCPP_HIDE_FROM_ABI _ProjectedPred(_Pred& __pred_arg, _Proj& __proj_arg) : __pred(__pred_arg), __proj(__proj_arg) {} template typename __invoke_of<_Pred&, decltype(std::__invoke(std::declval<_Proj&>(), std::declval<_Tp>()))>::type - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI operator()(_Tp&& __v) const { return std::__invoke(__pred, std::__invoke(__proj, std::forward<_Tp>(__v))); } @@ -44,8 +43,7 @@ struct _ProjectedPred { template typename __invoke_of<_Pred&, decltype(std::__invoke(std::declval<_Proj&>(), std::declval<_T1>())), - decltype(std::__invoke(std::declval<_Proj&>(), std::declval<_T2>()))>::type _LIBCPP_CONSTEXPR - _LIBCPP_HIDE_FROM_ABI + decltype(std::__invoke(std::declval<_Proj&>(), std::declval<_T2>()))>::type _LIBCPP_HIDE_FROM_ABI operator()(_T1&& __lhs, _T2&& __rhs) const { return std::__invoke( __pred, std::__invoke(__proj, std::forward<_T1>(__lhs)), std::__invoke(__proj, std::forward<_T2>(__rhs))); @@ -56,7 +54,7 @@ template < class _Pred, class _Proj, __enable_if_t >::value && __is_identity<__decay_t<_Proj> >::value), int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ProjectedPred<_Pred, _Proj> __make_projected(_Pred& __pred, _Proj& __proj) { +_LIBCPP_HIDE_FROM_ABI _ProjectedPred<_Pred, _Proj> __make_projected(_Pred& __pred, _Proj& __proj) { return _ProjectedPred<_Pred, _Proj>(__pred, __proj); } @@ -67,7 +65,7 @@ template < class _Pred, class _Proj, __enable_if_t >::value && __is_identity<__decay_t<_Proj> >::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Pred& __make_projected(_Pred& __pred, _Proj&) { +_LIBCPP_HIDE_FROM_ABI _Pred& __make_projected(_Pred& __pred, _Proj&) { return __pred; } diff --git a/libcxx/include/__cxx03/__algorithm/max.h b/libcxx/include/__cxx03/__algorithm/max.h index 50dfd03843bdd..0a2e435b6cdc1 100644 --- 
a/libcxx/include/__cxx03/__algorithm/max.h +++ b/libcxx/include/__cxx03/__algorithm/max.h @@ -24,13 +24,13 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp& +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI const _Tp& max(_LIBCPP_LIFETIMEBOUND const _Tp& __a, _LIBCPP_LIFETIMEBOUND const _Tp& __b, _Compare __comp) { return __comp(__a, __b) ? __b : __a; } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp& +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI const _Tp& max(_LIBCPP_LIFETIMEBOUND const _Tp& __a, _LIBCPP_LIFETIMEBOUND const _Tp& __b) { return std::max(__a, __b, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/max_element.h b/libcxx/include/__cxx03/__algorithm/max_element.h index 20a22e74c8be7..710df45d3a14d 100644 --- a/libcxx/include/__cxx03/__algorithm/max_element.h +++ b/libcxx/include/__cxx03/__algorithm/max_element.h @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator +inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator __max_element(_ForwardIterator __first, _ForwardIterator __last, _Compare __comp) { static_assert( __has_forward_iterator_category<_ForwardIterator>::value, "std::max_element requires a ForwardIterator"); @@ -35,13 +35,13 @@ __max_element(_ForwardIterator __first, _ForwardIterator __last, _Compare __comp } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator max_element(_ForwardIterator __first, _ForwardIterator __last, _Compare __comp) { return std::__max_element<__comp_ref_type<_Compare> >(__first, __last, __comp); } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator 
max_element(_ForwardIterator __first, _ForwardIterator __last) { return std::max_element(__first, __last, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/merge.h b/libcxx/include/__cxx03/__algorithm/merge.h index 90b986f747a3c..b5a19a836554a 100644 --- a/libcxx/include/__cxx03/__algorithm/merge.h +++ b/libcxx/include/__cxx03/__algorithm/merge.h @@ -22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator __merge( +_LIBCPP_HIDE_FROM_ABI _OutputIterator __merge( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, @@ -44,7 +44,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator __merge( } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator merge(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, @@ -55,7 +55,7 @@ merge(_InputIterator1 __first1, } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator merge(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, diff --git a/libcxx/include/__cxx03/__algorithm/min.h b/libcxx/include/__cxx03/__algorithm/min.h index b617f857102d4..f42e47b644c9a 100644 --- a/libcxx/include/__cxx03/__algorithm/min.h +++ b/libcxx/include/__cxx03/__algorithm/min.h @@ -24,13 +24,13 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp& +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI const _Tp& min(_LIBCPP_LIFETIMEBOUND const _Tp& __a, _LIBCPP_LIFETIMEBOUND const _Tp& __b, _Compare __comp) { return __comp(__b, __a) ? 
__b : __a; } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp& +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI const _Tp& min(_LIBCPP_LIFETIMEBOUND const _Tp& __a, _LIBCPP_LIFETIMEBOUND const _Tp& __b) { return std::min(__a, __b, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/min_element.h b/libcxx/include/__cxx03/__algorithm/min_element.h index 11c059c3acdc2..41c04162a61f3 100644 --- a/libcxx/include/__cxx03/__algorithm/min_element.h +++ b/libcxx/include/__cxx03/__algorithm/min_element.h @@ -28,8 +28,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter -__min_element(_Iter __first, _Sent __last, _Comp __comp, _Proj& __proj) { +inline _LIBCPP_HIDE_FROM_ABI _Iter __min_element(_Iter __first, _Sent __last, _Comp __comp, _Proj& __proj) { if (__first == __last) return __first; @@ -42,13 +41,13 @@ __min_element(_Iter __first, _Sent __last, _Comp __comp, _Proj& __proj) { } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter __min_element(_Iter __first, _Sent __last, _Comp __comp) { +_LIBCPP_HIDE_FROM_ABI _Iter __min_element(_Iter __first, _Sent __last, _Comp __comp) { auto __proj = __identity(); return std::__min_element<_Comp>(std::move(__first), std::move(__last), __comp, __proj); } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator min_element(_ForwardIterator __first, _ForwardIterator __last, _Compare __comp) { static_assert( __has_forward_iterator_category<_ForwardIterator>::value, "std::min_element requires a ForwardIterator"); @@ -59,7 +58,7 @@ min_element(_ForwardIterator __first, _ForwardIterator __last, _Compare __comp) } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator 
min_element(_ForwardIterator __first, _ForwardIterator __last) { return std::min_element(__first, __last, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/minmax.h b/libcxx/include/__cxx03/__algorithm/minmax.h index 609bc623b913c..ce19486ff58a7 100644 --- a/libcxx/include/__cxx03/__algorithm/minmax.h +++ b/libcxx/include/__cxx03/__algorithm/minmax.h @@ -23,13 +23,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI pair minmax(_LIBCPP_LIFETIMEBOUND const _Tp& __a, _LIBCPP_LIFETIMEBOUND const _Tp& __b, _Compare __comp) { return __comp(__b, __a) ? pair(__b, __a) : pair(__a, __b); } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI pair minmax(_LIBCPP_LIFETIMEBOUND const _Tp& __a, _LIBCPP_LIFETIMEBOUND const _Tp& __b) { return std::minmax(__a, __b, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/minmax_element.h b/libcxx/include/__cxx03/__algorithm/minmax_element.h index c400ec2e8c7d5..8451725769e49 100644 --- a/libcxx/include/__cxx03/__algorithm/minmax_element.h +++ b/libcxx/include/__cxx03/__algorithm/minmax_element.h @@ -29,17 +29,16 @@ class _MinmaxElementLessFunc { _Proj& __proj_; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _MinmaxElementLessFunc(_Comp& __comp, _Proj& __proj) - : __comp_(__comp), __proj_(__proj) {} + _LIBCPP_HIDE_FROM_ABI _MinmaxElementLessFunc(_Comp& __comp, _Proj& __proj) : __comp_(__comp), __proj_(__proj) {} template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator()(_Iter& __it1, _Iter& __it2) { + _LIBCPP_HIDE_FROM_ABI bool operator()(_Iter& __it1, _Iter& __it2) { return std::__invoke(__comp_, std::__invoke(__proj_, *__it1), std::__invoke(__proj_, *__it2)); } }; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter, _Iter> +_LIBCPP_HIDE_FROM_ABI pair<_Iter, _Iter> 
__minmax_element_impl(_Iter __first, _Sent __last, _Comp& __comp, _Proj& __proj) { auto __less = _MinmaxElementLessFunc<_Comp, _Proj>(__comp, __proj); @@ -79,7 +78,7 @@ __minmax_element_impl(_Iter __first, _Sent __last, _Comp& __comp, _Proj& __proj) } template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_ForwardIterator, _ForwardIterator> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI pair<_ForwardIterator, _ForwardIterator> minmax_element(_ForwardIterator __first, _ForwardIterator __last, _Compare __comp) { static_assert( __has_forward_iterator_category<_ForwardIterator>::value, "std::minmax_element requires a ForwardIterator"); @@ -90,7 +89,7 @@ minmax_element(_ForwardIterator __first, _ForwardIterator __last, _Compare __com } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_ForwardIterator, _ForwardIterator> +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI pair<_ForwardIterator, _ForwardIterator> minmax_element(_ForwardIterator __first, _ForwardIterator __last) { return std::minmax_element(__first, __last, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/mismatch.h b/libcxx/include/__cxx03/__algorithm/mismatch.h index baf464c25e86f..3c57c1bab1d61 100644 --- a/libcxx/include/__cxx03/__algorithm/mismatch.h +++ b/libcxx/include/__cxx03/__algorithm/mismatch.h @@ -37,7 +37,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter1, _Iter2> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI pair<_Iter1, _Iter2> __mismatch_loop(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { while (__first1 != __last1) { if (!std::__invoke(__pred, std::__invoke(__proj1, *__first1), std::__invoke(__proj2, *__first2))) @@ -49,7 +49,7 @@ __mismatch_loop(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, } template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter1, _Iter2> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI pair<_Iter1, _Iter2> __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2); } @@ -57,7 +57,7 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro #if _LIBCPP_VECTORIZE_ALGORITHMS template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter, _Iter> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI pair<_Iter, _Iter> __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) { using __value_type = __iter_value_type<_Iter>; constexpr size_t __unroll_count = 4; @@ -124,7 +124,7 @@ template ::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp> && __is_identity<_Proj1>::value && __is_identity<_Proj2>::value, int> = 0> -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI pair<_Tp*, _Tp*> __mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred&, _Proj1&, _Proj2&) { return std::__mismatch_vectorized(__first1, __last1, __first2); } @@ -137,7 +137,7 @@ template ::value && __is_identity<_Proj2>::value && __can_map_to_integer_v<_Tp> && __libcpp_is_trivially_equality_comparable<_Tp, _Tp>::value, int> = 0> -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI pair<_Tp*, _Tp*> __mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { if (__libcpp_is_constant_evaluated()) { return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2); @@ -150,7 +150,7 @@ __mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __ #endif // _LIBCPP_VECTORIZE_ALGORITHMS template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 
pair<_InputIterator1, _InputIterator2> +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI pair<_InputIterator1, _InputIterator2> mismatch(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _BinaryPredicate __pred) { __identity __proj; auto __res = std::__mismatch( @@ -159,7 +159,7 @@ mismatch(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __fi } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator1, _InputIterator2> +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI pair<_InputIterator1, _InputIterator2> mismatch(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2) { return std::mismatch(__first1, __last1, __first2, __equal_to()); } diff --git a/libcxx/include/__cxx03/__algorithm/move.h b/libcxx/include/__cxx03/__algorithm/move.h index cb158e15f19f5..0c744bc0e91a4 100644 --- a/libcxx/include/__cxx03/__algorithm/move.h +++ b/libcxx/include/__cxx03/__algorithm/move.h @@ -30,14 +30,12 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> -__move(_InIter __first, _Sent __last, _OutIter __result); +inline _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> __move(_InIter __first, _Sent __last, _OutIter __result); template struct __move_impl { template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> - operator()(_InIter __first, _Sent __last, _OutIter __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> operator()(_InIter __first, _Sent __last, _OutIter __result) const { while (__first != __last) { *__result = _IterOps<_AlgPolicy>::__iter_move(__first); ++__first; @@ -52,18 +50,16 @@ struct __move_impl { _OutIter& __result_; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit _MoveSegment(_OutIter& __result) - : __result_(__result) {} + _LIBCPP_HIDE_FROM_ABI explicit _MoveSegment(_OutIter& __result) : 
__result_(__result) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void + _LIBCPP_HIDE_FROM_ABI void operator()(typename _Traits::__local_iterator __lfirst, typename _Traits::__local_iterator __llast) { __result_ = std::__move<_AlgPolicy>(__lfirst, __llast, std::move(__result_)).second; } }; template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> - operator()(_InIter __first, _InIter __last, _OutIter __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> operator()(_InIter __first, _InIter __last, _OutIter __result) const { std::__for_each_segment(__first, __last, _MoveSegment<_InIter, _OutIter>(__result)); return std::make_pair(__last, std::move(__result)); } @@ -73,8 +69,7 @@ struct __move_impl { __enable_if_t<__has_random_access_iterator_category<_InIter>::value && !__is_segmented_iterator<_InIter>::value && __is_segmented_iterator<_OutIter>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> - operator()(_InIter __first, _InIter __last, _OutIter __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> operator()(_InIter __first, _InIter __last, _OutIter __result) const { using _Traits = __segmented_iterator_traits<_OutIter>; using _DiffT = typename common_type<__iter_diff_t<_InIter>, __iter_diff_t<_OutIter> >::type; @@ -98,21 +93,19 @@ struct __move_impl { // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. 
template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> - operator()(_In* __first, _In* __last, _Out* __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_In*, _Out*> operator()(_In* __first, _In* __last, _Out* __result) const { return std::__copy_trivial_impl(__first, __last, __result); } }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> -__move(_InIter __first, _Sent __last, _OutIter __result) { +inline _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> __move(_InIter __first, _Sent __last, _OutIter __result) { return std::__copy_move_unwrap_iters<__move_impl<_AlgPolicy> >( std::move(__first), std::move(__last), std::move(__result)); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator move(_InputIterator __first, _InputIterator __last, _OutputIterator __result) { static_assert(is_copy_constructible<_InputIterator>::value, "Iterators has to be copy constructible."); static_assert(is_copy_constructible<_OutputIterator>::value, "The output iterator has to be copy constructible."); diff --git a/libcxx/include/__cxx03/__algorithm/move_backward.h b/libcxx/include/__cxx03/__algorithm/move_backward.h index d4da82382a4c7..61e29c5c396f1 100644 --- a/libcxx/include/__cxx03/__algorithm/move_backward.h +++ b/libcxx/include/__cxx03/__algorithm/move_backward.h @@ -29,14 +29,13 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_BidirectionalIterator1, _BidirectionalIterator2> +_LIBCPP_HIDE_FROM_ABI pair<_BidirectionalIterator1, _BidirectionalIterator2> __move_backward(_BidirectionalIterator1 __first, _Sentinel __last, _BidirectionalIterator2 __result); template struct __move_backward_impl { template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> - operator()(_InIter __first, _Sent __last, _OutIter __result) const { + 
_LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> operator()(_InIter __first, _Sent __last, _OutIter __result) const { auto __last_iter = _IterOps<_AlgPolicy>::next(__first, __last); auto __original_last_iter = __last_iter; @@ -48,8 +47,7 @@ struct __move_backward_impl { } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> - operator()(_InIter __first, _InIter __last, _OutIter __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> operator()(_InIter __first, _InIter __last, _OutIter __result) const { using _Traits = __segmented_iterator_traits<_InIter>; auto __sfirst = _Traits::__segment(__first); auto __slast = _Traits::__segment(__last); @@ -79,8 +77,7 @@ struct __move_backward_impl { __enable_if_t<__has_random_access_iterator_category<_InIter>::value && !__is_segmented_iterator<_InIter>::value && __is_segmented_iterator<_OutIter>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> - operator()(_InIter __first, _InIter __last, _OutIter __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_InIter, _OutIter> operator()(_InIter __first, _InIter __last, _OutIter __result) const { using _Traits = __segmented_iterator_traits<_OutIter>; using _DiffT = typename common_type<__iter_diff_t<_InIter>, __iter_diff_t<_OutIter> >::type; @@ -107,14 +104,13 @@ struct __move_backward_impl { // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. 
template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> - operator()(_In* __first, _In* __last, _Out* __result) const { + _LIBCPP_HIDE_FROM_ABI pair<_In*, _Out*> operator()(_In* __first, _In* __last, _Out* __result) const { return std::__copy_backward_trivial_impl(__first, __last, __result); } }; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_BidirectionalIterator1, _BidirectionalIterator2> +_LIBCPP_HIDE_FROM_ABI pair<_BidirectionalIterator1, _BidirectionalIterator2> __move_backward(_BidirectionalIterator1 __first, _Sentinel __last, _BidirectionalIterator2 __result) { static_assert(std::is_copy_constructible<_BidirectionalIterator1>::value && std::is_copy_constructible<_BidirectionalIterator1>::value, @@ -125,7 +121,7 @@ __move_backward(_BidirectionalIterator1 __first, _Sentinel __last, _Bidirectiona } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _BidirectionalIterator2 +inline _LIBCPP_HIDE_FROM_ABI _BidirectionalIterator2 move_backward(_BidirectionalIterator1 __first, _BidirectionalIterator1 __last, _BidirectionalIterator2 __result) { return std::__move_backward<_ClassicAlgPolicy>(std::move(__first), std::move(__last), std::move(__result)).second; } diff --git a/libcxx/include/__cxx03/__algorithm/next_permutation.h b/libcxx/include/__cxx03/__algorithm/next_permutation.h index 7d6b2ddad5056..12c6c51cb4dcb 100644 --- a/libcxx/include/__cxx03/__algorithm/next_permutation.h +++ b/libcxx/include/__cxx03/__algorithm/next_permutation.h @@ -28,7 +28,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_BidirectionalIterator, bool> +_LIBCPP_HIDE_FROM_ABI pair<_BidirectionalIterator, bool> __next_permutation(_BidirectionalIterator __first, _Sentinel __last, _Compare&& __comp) { using _Result = pair<_BidirectionalIterator, bool>; @@ -55,7 +55,7 @@ __next_permutation(_BidirectionalIterator __first, _Sentinel __last, 
_Compare&& } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +inline _LIBCPP_HIDE_FROM_ABI bool next_permutation(_BidirectionalIterator __first, _BidirectionalIterator __last, _Compare __comp) { return std::__next_permutation<_ClassicAlgPolicy>( std::move(__first), std::move(__last), static_cast<__comp_ref_type<_Compare> >(__comp)) @@ -63,8 +63,7 @@ next_permutation(_BidirectionalIterator __first, _BidirectionalIterator __last, } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool -next_permutation(_BidirectionalIterator __first, _BidirectionalIterator __last) { +inline _LIBCPP_HIDE_FROM_ABI bool next_permutation(_BidirectionalIterator __first, _BidirectionalIterator __last) { return std::next_permutation(__first, __last, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/none_of.h b/libcxx/include/__cxx03/__algorithm/none_of.h index 91162ec24ab1d..6672d5c0b2b11 100644 --- a/libcxx/include/__cxx03/__algorithm/none_of.h +++ b/libcxx/include/__cxx03/__algorithm/none_of.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool none_of(_InputIterator __first, _InputIterator __last, _Predicate __pred) { for (; __first != __last; ++__first) if (__pred(*__first)) diff --git a/libcxx/include/__cxx03/__algorithm/nth_element.h b/libcxx/include/__cxx03/__algorithm/nth_element.h index 232966e0d2670..e39540fc985f7 100644 --- a/libcxx/include/__cxx03/__algorithm/nth_element.h +++ b/libcxx/include/__cxx03/__algorithm/nth_element.h @@ -29,7 +29,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool __nth_element_find_guard( +_LIBCPP_HIDE_FROM_ABI bool __nth_element_find_guard( _RandomAccessIterator& __i, _RandomAccessIterator& __j, _RandomAccessIterator __m, _Compare __comp) { // manually guard downward moving __j against __i 
while (true) { @@ -43,7 +43,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool __nth_element_find_guar } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void +_LIBCPP_HIDE_FROM_ABI void // NOLINTNEXTLINE(readability-function-cognitive-complexity) __nth_element( _RandomAccessIterator __first, _RandomAccessIterator __nth, _RandomAccessIterator __last, _Compare __comp) { @@ -227,7 +227,7 @@ __nth_element( } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __nth_element_impl( +inline _LIBCPP_HIDE_FROM_ABI void __nth_element_impl( _RandomAccessIterator __first, _RandomAccessIterator __nth, _RandomAccessIterator __last, _Compare& __comp) { if (__nth == __last) return; @@ -243,13 +243,13 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __nth_element_im } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth, _RandomAccessIterator __last, _Compare __comp) { std::__nth_element_impl<_ClassicAlgPolicy>(std::move(__first), std::move(__nth), std::move(__last), __comp); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth, _RandomAccessIterator __last) { std::nth_element(std::move(__first), std::move(__nth), std::move(__last), __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/partial_sort.h b/libcxx/include/__cxx03/__algorithm/partial_sort.h index 04597fc32b9a2..56cb9e0132be0 100644 --- a/libcxx/include/__cxx03/__algorithm/partial_sort.h +++ b/libcxx/include/__cxx03/__algorithm/partial_sort.h @@ -32,7 +32,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator __partial_sort_impl( +_LIBCPP_HIDE_FROM_ABI _RandomAccessIterator __partial_sort_impl( _RandomAccessIterator 
__first, _RandomAccessIterator __middle, _Sentinel __last, _Compare&& __comp) { if (__first == __middle) { return _IterOps<_AlgPolicy>::next(__middle, __last); @@ -54,7 +54,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator __part } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator +_LIBCPP_HIDE_FROM_ABI _RandomAccessIterator __partial_sort(_RandomAccessIterator __first, _RandomAccessIterator __middle, _Sentinel __last, _Compare& __comp) { if (__first == __middle) return _IterOps<_AlgPolicy>::next(__middle, __last); @@ -70,7 +70,7 @@ __partial_sort(_RandomAccessIterator __first, _RandomAccessIterator __middle, _S } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void partial_sort( +inline _LIBCPP_HIDE_FROM_ABI void partial_sort( _RandomAccessIterator __first, _RandomAccessIterator __middle, _RandomAccessIterator __last, _Compare __comp) { static_assert(std::is_copy_constructible<_RandomAccessIterator>::value, "Iterators must be copy constructible."); static_assert(std::is_copy_assignable<_RandomAccessIterator>::value, "Iterators must be copy assignable."); @@ -79,7 +79,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void partial_sort( } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void partial_sort(_RandomAccessIterator __first, _RandomAccessIterator __middle, _RandomAccessIterator __last) { std::partial_sort(__first, __middle, __last, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/partial_sort_copy.h b/libcxx/include/__cxx03/__algorithm/partial_sort_copy.h index 41189cfe029df..e0846dcaac10f 100644 --- a/libcxx/include/__cxx03/__algorithm/partial_sort_copy.h +++ b/libcxx/include/__cxx03/__algorithm/partial_sort_copy.h @@ -41,7 +41,7 @@ template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator, _RandomAccessIterator> __partial_sort_copy( +_LIBCPP_HIDE_FROM_ABI 
pair<_InputIterator, _RandomAccessIterator> __partial_sort_copy( _InputIterator __first, _Sentinel1 __last, _RandomAccessIterator __result_first, @@ -70,7 +70,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator, _Random } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator partial_sort_copy( +inline _LIBCPP_HIDE_FROM_ABI _RandomAccessIterator partial_sort_copy( _InputIterator __first, _InputIterator __last, _RandomAccessIterator __result_first, @@ -91,7 +91,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator partial_sort_copy( +inline _LIBCPP_HIDE_FROM_ABI _RandomAccessIterator partial_sort_copy( _InputIterator __first, _InputIterator __last, _RandomAccessIterator __result_first, diff --git a/libcxx/include/__cxx03/__algorithm/partition.h b/libcxx/include/__cxx03/__algorithm/partition.h index d39dbbbd0b185..47fe62f9732a4 100644 --- a/libcxx/include/__cxx03/__algorithm/partition.h +++ b/libcxx/include/__cxx03/__algorithm/partition.h @@ -25,7 +25,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator, _ForwardIterator> +_LIBCPP_HIDE_FROM_ABI pair<_ForwardIterator, _ForwardIterator> __partition_impl(_ForwardIterator __first, _Sentinel __last, _Predicate __pred, forward_iterator_tag) { while (true) { if (__first == __last) @@ -46,7 +46,7 @@ __partition_impl(_ForwardIterator __first, _Sentinel __last, _Predicate __pred, } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_BidirectionalIterator, _BidirectionalIterator> +_LIBCPP_HIDE_FROM_ABI pair<_BidirectionalIterator, _BidirectionalIterator> __partition_impl(_BidirectionalIterator __first, _Sentinel __sentinel, _Predicate __pred, bidirectional_iterator_tag) { _BidirectionalIterator __original_last = _IterOps<_AlgPolicy>::next(__first, 
__sentinel); _BidirectionalIterator __last = __original_last; @@ -69,14 +69,14 @@ __partition_impl(_BidirectionalIterator __first, _Sentinel __sentinel, _Predicat } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator, _ForwardIterator> +inline _LIBCPP_HIDE_FROM_ABI pair<_ForwardIterator, _ForwardIterator> __partition(_ForwardIterator __first, _Sentinel __last, _Predicate&& __pred, _IterCategory __iter_category) { return std::__partition_impl<__remove_cvref_t<_Predicate>&, _AlgPolicy>( std::move(__first), std::move(__last), __pred, __iter_category); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator partition(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { using _IterCategory = typename iterator_traits<_ForwardIterator>::iterator_category; auto __result = std::__partition<_ClassicAlgPolicy>(std::move(__first), std::move(__last), __pred, _IterCategory()); diff --git a/libcxx/include/__cxx03/__algorithm/partition_copy.h b/libcxx/include/__cxx03/__algorithm/partition_copy.h index 18d82cfa20326..3781233493346 100644 --- a/libcxx/include/__cxx03/__algorithm/partition_copy.h +++ b/libcxx/include/__cxx03/__algorithm/partition_copy.h @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_OutputIterator1, _OutputIterator2> partition_copy( +_LIBCPP_HIDE_FROM_ABI pair<_OutputIterator1, _OutputIterator2> partition_copy( _InputIterator __first, _InputIterator __last, _OutputIterator1 __out_true, diff --git a/libcxx/include/__cxx03/__algorithm/partition_point.h b/libcxx/include/__cxx03/__algorithm/partition_point.h index ccf203bbf245e..8dc6aea65b3c6 100644 --- a/libcxx/include/__cxx03/__algorithm/partition_point.h +++ b/libcxx/include/__cxx03/__algorithm/partition_point.h @@ -22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 
_ForwardIterator +_LIBCPP_HIDE_FROM_ABI _ForwardIterator partition_point(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { typedef typename iterator_traits<_ForwardIterator>::difference_type difference_type; difference_type __len = std::distance(__first, __last); diff --git a/libcxx/include/__cxx03/__algorithm/pop_heap.h b/libcxx/include/__cxx03/__algorithm/pop_heap.h index 5d19e902ff13b..ed0d1f1cc56b4 100644 --- a/libcxx/include/__cxx03/__algorithm/pop_heap.h +++ b/libcxx/include/__cxx03/__algorithm/pop_heap.h @@ -31,7 +31,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void +inline _LIBCPP_HIDE_FROM_ABI void __pop_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare& __comp, @@ -59,7 +59,7 @@ __pop_heap(_RandomAccessIterator __first, } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void pop_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) { static_assert(std::is_copy_constructible<_RandomAccessIterator>::value, "Iterators must be copy constructible."); static_assert(std::is_copy_assignable<_RandomAccessIterator>::value, "Iterators must be copy assignable."); @@ -69,8 +69,7 @@ pop_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare _ } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -pop_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) { +inline _LIBCPP_HIDE_FROM_ABI void pop_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) { std::pop_heap(std::move(__first), std::move(__last), __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/prev_permutation.h b/libcxx/include/__cxx03/__algorithm/prev_permutation.h index b050d9cf337a7..a617377a7630e 100644 --- a/libcxx/include/__cxx03/__algorithm/prev_permutation.h +++ b/libcxx/include/__cxx03/__algorithm/prev_permutation.h @@ 
-28,7 +28,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_BidirectionalIterator, bool> +_LIBCPP_HIDE_FROM_ABI pair<_BidirectionalIterator, bool> __prev_permutation(_BidirectionalIterator __first, _Sentinel __last, _Compare&& __comp) { using _Result = pair<_BidirectionalIterator, bool>; @@ -55,7 +55,7 @@ __prev_permutation(_BidirectionalIterator __first, _Sentinel __last, _Compare&& } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +inline _LIBCPP_HIDE_FROM_ABI bool prev_permutation(_BidirectionalIterator __first, _BidirectionalIterator __last, _Compare __comp) { return std::__prev_permutation<_ClassicAlgPolicy>( std::move(__first), std::move(__last), static_cast<__comp_ref_type<_Compare> >(__comp)) @@ -63,8 +63,7 @@ prev_permutation(_BidirectionalIterator __first, _BidirectionalIterator __last, } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool -prev_permutation(_BidirectionalIterator __first, _BidirectionalIterator __last) { +inline _LIBCPP_HIDE_FROM_ABI bool prev_permutation(_BidirectionalIterator __first, _BidirectionalIterator __last) { return std::prev_permutation(__first, __last, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/push_heap.h b/libcxx/include/__cxx03/__algorithm/push_heap.h index 9ef44cdb3feea..eb4dc36ba7691 100644 --- a/libcxx/include/__cxx03/__algorithm/push_heap.h +++ b/libcxx/include/__cxx03/__algorithm/push_heap.h @@ -28,7 +28,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void +_LIBCPP_HIDE_FROM_ABI void __sift_up(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare&& __comp, @@ -56,14 +56,14 @@ __sift_up(_RandomAccessIterator __first, } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void +inline _LIBCPP_HIDE_FROM_ABI void __push_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, 
_Compare& __comp) { typename iterator_traits<_RandomAccessIterator>::difference_type __len = __last - __first; std::__sift_up<_AlgPolicy, __comp_ref_type<_Compare> >(std::move(__first), std::move(__last), __comp, __len); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void push_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) { static_assert(std::is_copy_constructible<_RandomAccessIterator>::value, "Iterators must be copy constructible."); static_assert(std::is_copy_assignable<_RandomAccessIterator>::value, "Iterators must be copy assignable."); @@ -72,8 +72,7 @@ push_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -push_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) { +inline _LIBCPP_HIDE_FROM_ABI void push_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) { std::push_heap(std::move(__first), std::move(__last), __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/remove.h b/libcxx/include/__cxx03/__algorithm/remove.h index 208351e22ac90..c5dc7aaf8e433 100644 --- a/libcxx/include/__cxx03/__algorithm/remove.h +++ b/libcxx/include/__cxx03/__algorithm/remove.h @@ -24,7 +24,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _ForwardIterator remove(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { __first = std::find(__first, __last, __value); if (__first != __last) { diff --git a/libcxx/include/__cxx03/__algorithm/remove_copy.h b/libcxx/include/__cxx03/__algorithm/remove_copy.h index 1bed25224281b..7360f53ff1c61 100644 --- a/libcxx/include/__cxx03/__algorithm/remove_copy.h +++ b/libcxx/include/__cxx03/__algorithm/remove_copy.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD 
template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator remove_copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result, const _Tp& __value) { for (; __first != __last; ++__first) { if (!(*__first == __value)) { diff --git a/libcxx/include/__cxx03/__algorithm/remove_copy_if.h b/libcxx/include/__cxx03/__algorithm/remove_copy_if.h index 3ec019dfd5912..5993738414373 100644 --- a/libcxx/include/__cxx03/__algorithm/remove_copy_if.h +++ b/libcxx/include/__cxx03/__algorithm/remove_copy_if.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator remove_copy_if(_InputIterator __first, _InputIterator __last, _OutputIterator __result, _Predicate __pred) { for (; __first != __last; ++__first) { if (!__pred(*__first)) { diff --git a/libcxx/include/__cxx03/__algorithm/remove_if.h b/libcxx/include/__cxx03/__algorithm/remove_if.h index c64e0aa4477e5..c740f4527ae65 100644 --- a/libcxx/include/__cxx03/__algorithm/remove_if.h +++ b/libcxx/include/__cxx03/__algorithm/remove_if.h @@ -23,7 +23,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _ForwardIterator remove_if(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { __first = std::find_if<_ForwardIterator, _Predicate&>(__first, __last, __pred); if (__first != __last) { diff --git a/libcxx/include/__cxx03/__algorithm/replace.h b/libcxx/include/__cxx03/__algorithm/replace.h index 692cece1708f9..975bc5f51c3c3 100644 --- a/libcxx/include/__cxx03/__algorithm/replace.h +++ b/libcxx/include/__cxx03/__algorithm/replace.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI 
void replace(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __old_value, const _Tp& __new_value) { for (; __first != __last; ++__first) if (*__first == __old_value) diff --git a/libcxx/include/__cxx03/__algorithm/replace_copy.h b/libcxx/include/__cxx03/__algorithm/replace_copy.h index 4f8b375df2fb7..6e786c3126b8e 100644 --- a/libcxx/include/__cxx03/__algorithm/replace_copy.h +++ b/libcxx/include/__cxx03/__algorithm/replace_copy.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator replace_copy( +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator replace_copy( _InputIterator __first, _InputIterator __last, _OutputIterator __result, diff --git a/libcxx/include/__cxx03/__algorithm/replace_copy_if.h b/libcxx/include/__cxx03/__algorithm/replace_copy_if.h index cfc7b0aa2d34c..43bbcc01fa49b 100644 --- a/libcxx/include/__cxx03/__algorithm/replace_copy_if.h +++ b/libcxx/include/__cxx03/__algorithm/replace_copy_if.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator replace_copy_if( +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator replace_copy_if( _InputIterator __first, _InputIterator __last, _OutputIterator __result, diff --git a/libcxx/include/__cxx03/__algorithm/replace_if.h b/libcxx/include/__cxx03/__algorithm/replace_if.h index f46da35714ef3..5816b247f187e 100644 --- a/libcxx/include/__cxx03/__algorithm/replace_if.h +++ b/libcxx/include/__cxx03/__algorithm/replace_if.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void replace_if(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred, const _Tp& __new_value) { for (; __first != __last; ++__first) if (__pred(*__first)) diff --git a/libcxx/include/__cxx03/__algorithm/reverse.h b/libcxx/include/__cxx03/__algorithm/reverse.h index 
868377c7b26bd..d5298fcc9b1dd 100644 --- a/libcxx/include/__cxx03/__algorithm/reverse.h +++ b/libcxx/include/__cxx03/__algorithm/reverse.h @@ -25,7 +25,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void __reverse_impl(_BidirectionalIterator __first, _BidirectionalIterator __last, bidirectional_iterator_tag) { while (__first != __last) { if (__first == --__last) @@ -36,7 +36,7 @@ __reverse_impl(_BidirectionalIterator __first, _BidirectionalIterator __last, bi } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void __reverse_impl(_RandomAccessIterator __first, _RandomAccessIterator __last, random_access_iterator_tag) { if (__first != __last) for (; __first < --__last; ++__first) @@ -44,14 +44,13 @@ __reverse_impl(_RandomAccessIterator __first, _RandomAccessIterator __last, rand } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reverse(_BidirectionalIterator __first, _Sentinel __last) { +_LIBCPP_HIDE_FROM_ABI void __reverse(_BidirectionalIterator __first, _Sentinel __last) { using _IterCategory = typename _IterOps<_AlgPolicy>::template __iterator_category<_BidirectionalIterator>; std::__reverse_impl<_AlgPolicy>(std::move(__first), std::move(__last), _IterCategory()); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -reverse(_BidirectionalIterator __first, _BidirectionalIterator __last) { +inline _LIBCPP_HIDE_FROM_ABI void reverse(_BidirectionalIterator __first, _BidirectionalIterator __last) { std::__reverse<_ClassicAlgPolicy>(std::move(__first), std::move(__last)); } diff --git a/libcxx/include/__cxx03/__algorithm/reverse_copy.h b/libcxx/include/__cxx03/__algorithm/reverse_copy.h index 3553102a2d03c..a667e8a8068e3 100644 --- a/libcxx/include/__cxx03/__algorithm/reverse_copy.h +++ b/libcxx/include/__cxx03/__algorithm/reverse_copy.h @@ -18,7 +18,7 @@ 
_LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator reverse_copy(_BidirectionalIterator __first, _BidirectionalIterator __last, _OutputIterator __result) { for (; __first != __last; ++__result) *__result = *--__last; diff --git a/libcxx/include/__cxx03/__algorithm/rotate.h b/libcxx/include/__cxx03/__algorithm/rotate.h index e41edf00e7993..8e5c10acf42e5 100644 --- a/libcxx/include/__cxx03/__algorithm/rotate.h +++ b/libcxx/include/__cxx03/__algorithm/rotate.h @@ -29,8 +29,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator -__rotate_left(_ForwardIterator __first, _ForwardIterator __last) { +_LIBCPP_HIDE_FROM_ABI _ForwardIterator __rotate_left(_ForwardIterator __first, _ForwardIterator __last) { typedef typename iterator_traits<_ForwardIterator>::value_type value_type; using _Ops = _IterOps<_AlgPolicy>; @@ -41,7 +40,7 @@ __rotate_left(_ForwardIterator __first, _ForwardIterator __last) { } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _BidirectionalIterator +_LIBCPP_HIDE_FROM_ABI _BidirectionalIterator __rotate_right(_BidirectionalIterator __first, _BidirectionalIterator __last) { typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type; using _Ops = _IterOps<_AlgPolicy>; @@ -54,7 +53,7 @@ __rotate_right(_BidirectionalIterator __first, _BidirectionalIterator __last) { } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 _ForwardIterator +_LIBCPP_HIDE_FROM_ABI _ForwardIterator __rotate_forward(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __last) { _ForwardIterator __i = __middle; while (true) { @@ -83,7 +82,7 @@ __rotate_forward(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIt } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 _Integral __algo_gcd(_Integral __x, _Integral __y) { 
+inline _LIBCPP_HIDE_FROM_ABI _Integral __algo_gcd(_Integral __x, _Integral __y) { do { _Integral __t = __x % __y; __x = __y; @@ -93,7 +92,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 _Integral __algo_gcd( } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 _RandomAccessIterator +_LIBCPP_HIDE_FROM_ABI _RandomAccessIterator __rotate_gcd(_RandomAccessIterator __first, _RandomAccessIterator __middle, _RandomAccessIterator __last) { typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type; @@ -125,7 +124,7 @@ __rotate_gcd(_RandomAccessIterator __first, _RandomAccessIterator __middle, _Ran } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator +inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator __rotate_impl(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __last, std::forward_iterator_tag) { typedef typename iterator_traits<_ForwardIterator>::value_type value_type; if (is_trivially_move_assignable::value) { @@ -136,7 +135,7 @@ __rotate_impl(_ForwardIterator __first, _ForwardIterator __middle, _ForwardItera } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _BidirectionalIterator __rotate_impl( +inline _LIBCPP_HIDE_FROM_ABI _BidirectionalIterator __rotate_impl( _BidirectionalIterator __first, _BidirectionalIterator __middle, _BidirectionalIterator __last, @@ -152,7 +151,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _BidirectionalIterato } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _RandomAccessIterator __rotate_impl( +inline _LIBCPP_HIDE_FROM_ABI _RandomAccessIterator __rotate_impl( _RandomAccessIterator __first, _RandomAccessIterator __middle, _RandomAccessIterator __last, @@ -169,8 +168,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _RandomAccessIterator } template -_LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iterator, _Iterator> -__rotate(_Iterator __first, _Iterator __middle, _Sentinel __last) { +_LIBCPP_HIDE_FROM_ABI pair<_Iterator, _Iterator> __rotate(_Iterator __first, _Iterator __middle, _Sentinel __last) { using _Ret = pair<_Iterator, _Iterator>; _Iterator __last_iter = _IterOps<_AlgPolicy>::next(__middle, __last); @@ -186,7 +184,7 @@ __rotate(_Iterator __first, _Iterator __middle, _Sentinel __last) { } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator rotate(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __last) { return std::__rotate<_ClassicAlgPolicy>(std::move(__first), std::move(__middle), std::move(__last)).first; } diff --git a/libcxx/include/__cxx03/__algorithm/rotate_copy.h b/libcxx/include/__cxx03/__algorithm/rotate_copy.h index 6970cdc5a2c56..06e1fb44bc6d1 100644 --- a/libcxx/include/__cxx03/__algorithm/rotate_copy.h +++ b/libcxx/include/__cxx03/__algorithm/rotate_copy.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator rotate_copy(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __last, _OutputIterator __result) { return std::copy(__first, __middle, std::copy(__middle, __last, __result)); } diff --git a/libcxx/include/__cxx03/__algorithm/search.h b/libcxx/include/__cxx03/__algorithm/search.h index f235510c33905..f3691de5de68b 100644 --- a/libcxx/include/__cxx03/__algorithm/search.h +++ b/libcxx/include/__cxx03/__algorithm/search.h @@ -35,7 +35,7 @@ template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter1, _Iter1> __search_forward_impl( +_LIBCPP_HIDE_FROM_ABI pair<_Iter1, _Iter1> __search_forward_impl( _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Sent2 __last2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { if (__first2 == __last2) return 
std::make_pair(__first1, __first1); // Everything matches an empty sequence @@ -79,7 +79,7 @@ template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter1, _Iter1> __search_random_access_impl( +_LIBCPP_HIDE_FROM_ABI pair<_Iter1, _Iter1> __search_random_access_impl( _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, @@ -126,7 +126,7 @@ template ::value && __has_random_access_iterator_category<_Iter2>::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter1, _Iter1> __search_impl( +_LIBCPP_HIDE_FROM_ABI pair<_Iter1, _Iter1> __search_impl( _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Sent2 __last2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { auto __size2 = __last2 - __first2; if (__size2 == 0) @@ -153,13 +153,13 @@ template < !(__has_random_access_iterator_category<_Iter1>::value && __has_random_access_iterator_category<_Iter2>::value), int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter1, _Iter1> __search_impl( +_LIBCPP_HIDE_FROM_ABI pair<_Iter1, _Iter1> __search_impl( _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Sent2 __last2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { return std::__search_forward_impl<_ClassicAlgPolicy>(__first1, __last1, __first2, __last2, __pred, __proj1, __proj2); } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1 +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator1 search(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, @@ -172,7 +172,7 @@ search(_ForwardIterator1 __first1, } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator1 +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator1 search(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2) { return std::search(__first1, __last1, __first2, __last2, __equal_to()); } diff --git 
a/libcxx/include/__cxx03/__algorithm/search_n.h b/libcxx/include/__cxx03/__algorithm/search_n.h index 6fb5d52d4ff30..98328c69d3a70 100644 --- a/libcxx/include/__cxx03/__algorithm/search_n.h +++ b/libcxx/include/__cxx03/__algorithm/search_n.h @@ -29,7 +29,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter, _Iter> __search_n_forward_impl( +_LIBCPP_HIDE_FROM_ABI pair<_Iter, _Iter> __search_n_forward_impl( _Iter __first, _Sent __last, _SizeT __count, const _Type& __value, _Pred& __pred, _Proj& __proj) { if (__count <= 0) return std::make_pair(__first, __first); @@ -66,7 +66,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter, _Iter> __search_ } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 std::pair<_Iter, _Iter> __search_n_random_access_impl( +_LIBCPP_HIDE_FROM_ABI std::pair<_Iter, _Iter> __search_n_random_access_impl( _Iter __first, _Sent __last, _SizeT __count, const _Type& __value, _Pred& __pred, _Proj& __proj, _DiffT __size1) { using difference_type = typename iterator_traits<_Iter>::difference_type; if (__count == 0) @@ -113,7 +113,7 @@ template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter, _Iter> +_LIBCPP_HIDE_FROM_ABI pair<_Iter, _Iter> __search_n_impl(_Iter __first, _Sent __last, _DiffT __count, const _Type& __value, _Pred& __pred, _Proj& __proj) { return std::__search_n_random_access_impl<_ClassicAlgPolicy>( __first, __last, __count, __value, __pred, __proj, __last - __first); @@ -128,13 +128,13 @@ template ::value && !__has_random_access_iterator_category<_Iter1>::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter1, _Iter1> +_LIBCPP_HIDE_FROM_ABI pair<_Iter1, _Iter1> __search_n_impl(_Iter1 __first, _Sent1 __last, _DiffT __count, const _Type& __value, _Pred& __pred, _Proj& __proj) { return std::__search_n_forward_impl<_ClassicAlgPolicy>(__first, __last, __count, __value, __pred, __proj); } template 
-_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator search_n( +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator search_n( _ForwardIterator __first, _ForwardIterator __last, _Size __count, const _Tp& __value, _BinaryPredicate __pred) { static_assert( __is_callable<_BinaryPredicate, decltype(*__first), const _Tp&>::value, "BinaryPredicate has to be callable"); @@ -143,7 +143,7 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Fo } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator search_n(_ForwardIterator __first, _ForwardIterator __last, _Size __count, const _Tp& __value) { return std::search_n(__first, __last, std::__convert_to_integral(__count), __value, __equal_to()); } diff --git a/libcxx/include/__cxx03/__algorithm/set_difference.h b/libcxx/include/__cxx03/__algorithm/set_difference.h index 4092e6753e5f6..943b458de68c3 100644 --- a/libcxx/include/__cxx03/__algorithm/set_difference.h +++ b/libcxx/include/__cxx03/__algorithm/set_difference.h @@ -30,8 +30,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__remove_cvref_t<_InIter1>, __remove_cvref_t<_OutIter> > -__set_difference( +_LIBCPP_HIDE_FROM_ABI pair<__remove_cvref_t<_InIter1>, __remove_cvref_t<_OutIter> > __set_difference( _InIter1&& __first1, _Sent1&& __last1, _InIter2&& __first2, _Sent2&& __last2, _OutIter&& __result, _Comp&& __comp) { while (__first1 != __last1 && __first2 != __last2) { if (__comp(*__first1, *__first2)) { @@ -49,7 +48,7 @@ __set_difference( } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_difference( +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator set_difference( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, @@ -62,7 +61,7 @@ inline 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_d } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_difference( +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator set_difference( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, diff --git a/libcxx/include/__cxx03/__algorithm/set_intersection.h b/libcxx/include/__cxx03/__algorithm/set_intersection.h index 4e02d3e9c51c7..d892dadd2567a 100644 --- a/libcxx/include/__cxx03/__algorithm/set_intersection.h +++ b/libcxx/include/__cxx03/__algorithm/set_intersection.h @@ -37,8 +37,7 @@ struct __set_intersection_result { _OutIter __out_; // need a constructor as C++03 aggregate init is hard - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - __set_intersection_result(_InIter1&& __in_iter1, _InIter2&& __in_iter2, _OutIter&& __out_iter) + _LIBCPP_HIDE_FROM_ABI __set_intersection_result(_InIter1&& __in_iter1, _InIter2&& __in_iter2, _OutIter&& __out_iter) : __in1_(std::move(__in_iter1)), __in2_(std::move(__in_iter2)), __out_(std::move(__out_iter)) {} }; @@ -47,7 +46,7 @@ struct __set_intersection_result { // the way it is used and doesn't attempt to abstract that, it's not appropriate for general usage outside of its // context. 
template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __set_intersection_add_output_if_equal( +_LIBCPP_HIDE_FROM_ABI void __set_intersection_add_output_if_equal( bool __may_be_equal, _InForwardIter1& __first1, _InForwardIter2& __first2, @@ -83,8 +82,7 @@ template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter> __set_intersection( _InForwardIter1 __first1, _Sent1 __last1, @@ -94,7 +92,7 @@ __set_intersection( _Compare&& __comp, std::forward_iterator_tag, std::forward_iterator_tag) { - _LIBCPP_CONSTEXPR std::__identity __proj; + std::__identity __proj; bool __prev_may_be_equal = false; while (__first2 != __last2) { @@ -128,8 +126,7 @@ template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter> __set_intersection( _InInputIter1 __first1, _Sent1 __last1, @@ -159,9 +156,7 @@ __set_intersection( } template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter> -__set_intersection( +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __set_intersection_result<_InIter1, _InIter2, _OutIter> __set_intersection( _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) { return std::__set_intersection<_AlgPolicy>( std::move(__first1), @@ -175,7 +170,7 @@ __set_intersection( } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_intersection( +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator set_intersection( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, @@ -193,7 +188,7 @@ inline _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_i } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_intersection( +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator set_intersection( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, diff --git a/libcxx/include/__cxx03/__algorithm/set_symmetric_difference.h b/libcxx/include/__cxx03/__algorithm/set_symmetric_difference.h index 64fdf4543be9c..82a1c46d70f0d 100644 --- a/libcxx/include/__cxx03/__algorithm/set_symmetric_difference.h +++ b/libcxx/include/__cxx03/__algorithm/set_symmetric_difference.h @@ -34,14 +34,13 @@ struct __set_symmetric_difference_result { _OutIter __out_; // need a constructor as C++03 aggregate init is hard - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 + _LIBCPP_HIDE_FROM_ABI __set_symmetric_difference_result(_InIter1&& __in_iter1, _InIter2&& __in_iter2, _OutIter&& __out_iter) : __in1_(std::move(__in_iter1)), __in2_(std::move(__in_iter2)), __out_(std::move(__out_iter)) {} }; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_symmetric_difference_result<_InIter1, _InIter2, _OutIter> -__set_symmetric_difference( +_LIBCPP_HIDE_FROM_ABI __set_symmetric_difference_result<_InIter1, _InIter2, _OutIter> __set_symmetric_difference( _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) { while (__first1 != __last1) { if (__first2 == __last2) { @@ -69,7 +68,7 @@ __set_symmetric_difference( } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_symmetric_difference( +_LIBCPP_HIDE_FROM_ABI _OutputIterator set_symmetric_difference( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, @@ -87,7 +86,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_symmetri } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_symmetric_difference( +_LIBCPP_HIDE_FROM_ABI 
_OutputIterator set_symmetric_difference( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, diff --git a/libcxx/include/__cxx03/__algorithm/set_union.h b/libcxx/include/__cxx03/__algorithm/set_union.h index a5c6d5eabd394..3effd78aafd01 100644 --- a/libcxx/include/__cxx03/__algorithm/set_union.h +++ b/libcxx/include/__cxx03/__algorithm/set_union.h @@ -34,13 +34,12 @@ struct __set_union_result { _OutIter __out_; // need a constructor as C++03 aggregate init is hard - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - __set_union_result(_InIter1&& __in_iter1, _InIter2&& __in_iter2, _OutIter&& __out_iter) + _LIBCPP_HIDE_FROM_ABI __set_union_result(_InIter1&& __in_iter1, _InIter2&& __in_iter2, _OutIter&& __out_iter) : __in1_(std::move(__in_iter1)), __in2_(std::move(__in_iter2)), __out_(std::move(__out_iter)) {} }; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_union_result<_InIter1, _InIter2, _OutIter> __set_union( +_LIBCPP_HIDE_FROM_ABI __set_union_result<_InIter1, _InIter2, _OutIter> __set_union( _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) { for (; __first1 != __last1; ++__result) { if (__first2 == __last2) { @@ -65,7 +64,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_union_result<_InIter1, } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_union( +_LIBCPP_HIDE_FROM_ABI _OutputIterator set_union( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, @@ -83,7 +82,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_union( } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_union( +_LIBCPP_HIDE_FROM_ABI _OutputIterator set_union( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, diff --git a/libcxx/include/__cxx03/__algorithm/shuffle.h b/libcxx/include/__cxx03/__algorithm/shuffle.h index 
173af1bf25290..fee7028ae22ac 100644 --- a/libcxx/include/__cxx03/__algorithm/shuffle.h +++ b/libcxx/include/__cxx03/__algorithm/shuffle.h @@ -46,8 +46,8 @@ class _LIBCPP_EXPORTED_FROM_ABI __libcpp_debug_randomizer { return __oldstate >> 32; } - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR result_type min() { return _Min; } - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR result_type max() { return _Max; } + static _LIBCPP_HIDE_FROM_ABI result_type min() { return _Min; } + static _LIBCPP_HIDE_FROM_ABI result_type max() { return _Max; } private: uint_fast64_t __state_; @@ -82,8 +82,8 @@ class _LIBCPP_EXPORTED_FROM_ABI __rs_default { result_type operator()(); - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR result_type min() { return _Min; } - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR result_type max() { return _Max; } + static _LIBCPP_HIDE_FROM_ABI result_type min() { return _Min; } + static _LIBCPP_HIDE_FROM_ABI result_type max() { return _Max; } friend _LIBCPP_EXPORTED_FROM_ABI __rs_default __rs_get(); }; @@ -91,8 +91,7 @@ class _LIBCPP_EXPORTED_FROM_ABI __rs_default { _LIBCPP_EXPORTED_FROM_ABI __rs_default __rs_get(); template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_DEPRECATED_IN_CXX14 void -random_shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last) { +_LIBCPP_HIDE_FROM_ABI void random_shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last) { typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; typedef uniform_int_distribution _Dp; typedef typename _Dp::param_type _Pp; @@ -109,7 +108,7 @@ random_shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last) { } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_DEPRECATED_IN_CXX14 void +_LIBCPP_HIDE_FROM_ABI void random_shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last, _RandomNumberGenerator&& __rand) { typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; difference_type __d = __last - __first; diff 
--git a/libcxx/include/__cxx03/__algorithm/sift_down.h b/libcxx/include/__cxx03/__algorithm/sift_down.h index 774a6d2450d57..d299b718944d3 100644 --- a/libcxx/include/__cxx03/__algorithm/sift_down.h +++ b/libcxx/include/__cxx03/__algorithm/sift_down.h @@ -25,7 +25,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void +_LIBCPP_HIDE_FROM_ABI void __sift_down(_RandomAccessIterator __first, _Compare&& __comp, typename iterator_traits<_RandomAccessIterator>::difference_type __len, @@ -80,7 +80,7 @@ __sift_down(_RandomAccessIterator __first, } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _RandomAccessIterator __floyd_sift_down( +_LIBCPP_HIDE_FROM_ABI _RandomAccessIterator __floyd_sift_down( _RandomAccessIterator __first, _Compare&& __comp, typename iterator_traits<_RandomAccessIterator>::difference_type __len) { diff --git a/libcxx/include/__cxx03/__algorithm/sort.h b/libcxx/include/__cxx03/__algorithm/sort.h index b89843f514673..41df4d9ff3830 100644 --- a/libcxx/include/__cxx03/__algorithm/sort.h +++ b/libcxx/include/__cxx03/__algorithm/sort.h @@ -46,8 +46,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // stable, 2-3 compares, 0-2 swaps template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 unsigned -__sort3(_ForwardIterator __x, _ForwardIterator __y, _ForwardIterator __z, _Compare __c) { +_LIBCPP_HIDE_FROM_ABI unsigned __sort3(_ForwardIterator __x, _ForwardIterator __y, _ForwardIterator __z, _Compare __c) { using _Ops = _IterOps<_AlgPolicy>; unsigned __r = 0; @@ -260,7 +259,7 @@ inline _LIBCPP_HIDE_FROM_ABI void __sort5_maybe_branchless( // Assumes size > 0 template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void +_LIBCPP_HIDE_FROM_ABI void __selection_sort(_BidirectionalIterator __first, _BidirectionalIterator __last, _Compare __comp) { _BidirectionalIterator __lm1 = __last; for (--__lm1; __first != __lm1; ++__first) { @@ -756,9 +755,9 @@ void __introsort(_RandomAccessIterator 
__first, typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; using _Comp_ref = __comp_ref_type<_Compare>; // Upper bound for using insertion sort for sorting. - _LIBCPP_CONSTEXPR difference_type __limit = 24; + difference_type __limit = 24; // Lower bound for using Tuckey's ninther technique for median computation. - _LIBCPP_CONSTEXPR difference_type __ninther_threshold = 128; + difference_type __ninther_threshold = 128; while (true) { difference_type __len = __last - __first; switch (__len) { @@ -910,8 +909,7 @@ extern template _LIBCPP_EXPORTED_FROM_ABI void __sort<__less&, long double*>(long double*, long double*, __less&); template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -__sort_dispatch(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp& __comp) { +_LIBCPP_HIDE_FROM_ABI void __sort_dispatch(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp& __comp) { typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; difference_type __depth_limit = 2 * std::__log2i(__last - __first); @@ -961,7 +959,7 @@ _LIBCPP_HIDE_FROM_ABI void __sort_dispatch(_Type* __first, _Type* __last, less<_ } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void __sort_impl(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp& __comp) { std::__debug_randomize_range<_AlgPolicy>(__first, __last); @@ -975,14 +973,12 @@ __sort_impl(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp& } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) { +inline _LIBCPP_HIDE_FROM_ABI void sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) { std::__sort_impl<_ClassicAlgPolicy>(std::move(__first), std::move(__last), __comp); } template -inline _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 void -sort(_RandomAccessIterator __first, _RandomAccessIterator __last) { +inline _LIBCPP_HIDE_FROM_ABI void sort(_RandomAccessIterator __first, _RandomAccessIterator __last) { std::sort(__first, __last, __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/sort_heap.h b/libcxx/include/__cxx03/__algorithm/sort_heap.h index b5a341103980e..1a7cad158cd0d 100644 --- a/libcxx/include/__cxx03/__algorithm/sort_heap.h +++ b/libcxx/include/__cxx03/__algorithm/sort_heap.h @@ -30,7 +30,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void +inline _LIBCPP_HIDE_FROM_ABI void __sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare&& __comp) { _RandomAccessIterator __saved_last = __last; __comp_ref_type<_Compare> __comp_ref = __comp; @@ -42,7 +42,7 @@ __sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compar } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +inline _LIBCPP_HIDE_FROM_ABI void sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) { static_assert(std::is_copy_constructible<_RandomAccessIterator>::value, "Iterators must be copy constructible."); static_assert(std::is_copy_assignable<_RandomAccessIterator>::value, "Iterators must be copy assignable."); @@ -51,8 +51,7 @@ sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) { +inline _LIBCPP_HIDE_FROM_ABI void sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) { std::sort_heap(std::move(__first), std::move(__last), __less<>()); } diff --git a/libcxx/include/__cxx03/__algorithm/swap_ranges.h b/libcxx/include/__cxx03/__algorithm/swap_ranges.h index 9a19ffe602b33..d1f50c135231a 100644 --- 
a/libcxx/include/__cxx03/__algorithm/swap_ranges.h +++ b/libcxx/include/__cxx03/__algorithm/swap_ranges.h @@ -25,7 +25,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // 2+2 iterators: the shorter size will be used. template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator1, _ForwardIterator2> +_LIBCPP_HIDE_FROM_ABI pair<_ForwardIterator1, _ForwardIterator2> __swap_ranges(_ForwardIterator1 __first1, _Sentinel1 __last1, _ForwardIterator2 __first2, _Sentinel2 __last2) { while (__first1 != __last1 && __first2 != __last2) { _IterOps<_AlgPolicy>::iter_swap(__first1, __first2); @@ -38,7 +38,7 @@ __swap_ranges(_ForwardIterator1 __first1, _Sentinel1 __last1, _ForwardIterator2 // 2+1 iterators: size2 >= size1. template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator1, _ForwardIterator2> +_LIBCPP_HIDE_FROM_ABI pair<_ForwardIterator1, _ForwardIterator2> __swap_ranges(_ForwardIterator1 __first1, _Sentinel1 __last1, _ForwardIterator2 __first2) { while (__first1 != __last1) { _IterOps<_AlgPolicy>::iter_swap(__first1, __first2); @@ -50,7 +50,7 @@ __swap_ranges(_ForwardIterator1 __first1, _Sentinel1 __last1, _ForwardIterator2 } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator2 +inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator2 swap_ranges(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2) { return std::__swap_ranges<_ClassicAlgPolicy>(std::move(__first1), std::move(__last1), std::move(__first2)).second; } diff --git a/libcxx/include/__cxx03/__algorithm/transform.h b/libcxx/include/__cxx03/__algorithm/transform.h index 4bed1ed4f8d59..abdf2cc72fecf 100644 --- a/libcxx/include/__cxx03/__algorithm/transform.h +++ b/libcxx/include/__cxx03/__algorithm/transform.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator transform(_InputIterator __first, _InputIterator 
__last, _OutputIterator __result, _UnaryOperation __op) { for (; __first != __last; ++__first, (void)++__result) *__result = __op(*__first); @@ -26,7 +26,7 @@ transform(_InputIterator __first, _InputIterator __last, _OutputIterator __resul } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator transform( +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator transform( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, diff --git a/libcxx/include/__cxx03/__algorithm/unique.h b/libcxx/include/__cxx03/__algorithm/unique.h index b7eb2849e4e37..15980b93b4be3 100644 --- a/libcxx/include/__cxx03/__algorithm/unique.h +++ b/libcxx/include/__cxx03/__algorithm/unique.h @@ -29,7 +29,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // unique template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 std::pair<_Iter, _Iter> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI std::pair<_Iter, _Iter> __unique(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) { __first = std::__adjacent_find(__first, __last, __pred); if (__first != __last) { @@ -46,13 +46,13 @@ __unique(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) { } template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _ForwardIterator unique(_ForwardIterator __first, _ForwardIterator __last, _BinaryPredicate __pred) { return std::__unique<_ClassicAlgPolicy>(std::move(__first), std::move(__last), __pred).first; } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator unique(_ForwardIterator __first, _ForwardIterator __last) { return std::unique(__first, __last, __equal_to()); } diff --git a/libcxx/include/__cxx03/__algorithm/unique_copy.h b/libcxx/include/__cxx03/__algorithm/unique_copy.h index 6d3daefaa1ca5..5bed5b5d09f54 100644 --- 
a/libcxx/include/__cxx03/__algorithm/unique_copy.h +++ b/libcxx/include/__cxx03/__algorithm/unique_copy.h @@ -37,7 +37,7 @@ struct __read_from_tmp_value_tag {}; } // namespace __unique_copy_tags template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pair<_InputIterator, _OutputIterator> +_LIBCPP_HIDE_FROM_ABI pair<_InputIterator, _OutputIterator> __unique_copy(_InputIterator __first, _Sent __last, _OutputIterator __result, @@ -59,7 +59,7 @@ __unique_copy(_InputIterator __first, } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pair<_ForwardIterator, _OutputIterator> +_LIBCPP_HIDE_FROM_ABI pair<_ForwardIterator, _OutputIterator> __unique_copy(_ForwardIterator __first, _Sent __last, _OutputIterator __result, @@ -81,7 +81,7 @@ __unique_copy(_ForwardIterator __first, } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pair<_InputIterator, _InputAndOutputIterator> +_LIBCPP_HIDE_FROM_ABI pair<_InputIterator, _InputAndOutputIterator> __unique_copy(_InputIterator __first, _Sent __last, _InputAndOutputIterator __result, @@ -98,7 +98,7 @@ __unique_copy(_InputIterator __first, } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator unique_copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result, _BinaryPredicate __pred) { using __algo_tag = __conditional_t< is_base_of::iterator_category>::value, @@ -115,7 +115,7 @@ unique_copy(_InputIterator __first, _InputIterator __last, _OutputIterator __res } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +inline _LIBCPP_HIDE_FROM_ABI _OutputIterator unique_copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result) { return std::unique_copy(std::move(__first), std::move(__last), std::move(__result), __equal_to()); } diff --git a/libcxx/include/__cxx03/__algorithm/unwrap_iter.h b/libcxx/include/__cxx03/__algorithm/unwrap_iter.h index 
b79dcd46b1fa2..d8daa54710840 100644 --- a/libcxx/include/__cxx03/__algorithm/unwrap_iter.h +++ b/libcxx/include/__cxx03/__algorithm/unwrap_iter.h @@ -36,8 +36,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD // Default case - we can't unwrap anything template ::value> struct __unwrap_iter_impl { - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Iter __rewrap(_Iter, _Iter __iter) { return __iter; } - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Iter __unwrap(_Iter __i) _NOEXCEPT { return __i; } + static _LIBCPP_HIDE_FROM_ABI _Iter __rewrap(_Iter, _Iter __iter) { return __iter; } + static _LIBCPP_HIDE_FROM_ABI _Iter __unwrap(_Iter __i) _NOEXCEPT { return __i; } }; // TODO(hardening): make sure that the following unwrapping doesn't unexpectedly turn hardened iterators into raw @@ -48,25 +48,22 @@ template struct __unwrap_iter_impl<_Iter, true> { using _ToAddressT = decltype(std::__to_address(std::declval<_Iter>())); - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Iter __rewrap(_Iter __orig_iter, _ToAddressT __unwrapped_iter) { + static _LIBCPP_HIDE_FROM_ABI _Iter __rewrap(_Iter __orig_iter, _ToAddressT __unwrapped_iter) { return __orig_iter + (__unwrapped_iter - std::__to_address(__orig_iter)); } - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ToAddressT __unwrap(_Iter __i) _NOEXCEPT { - return std::__to_address(__i); - } + static _LIBCPP_HIDE_FROM_ABI _ToAddressT __unwrap(_Iter __i) _NOEXCEPT { return std::__to_address(__i); } }; template , __enable_if_t::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 decltype(_Impl::__unwrap(std::declval<_Iter>())) -__unwrap_iter(_Iter __i) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI decltype(_Impl::__unwrap(std::declval<_Iter>())) __unwrap_iter(_Iter __i) _NOEXCEPT { return _Impl::__unwrap(__i); } template > -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _OrigIter __rewrap_iter(_OrigIter __orig_iter, _Iter __iter) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI _OrigIter __rewrap_iter(_OrigIter __orig_iter, _Iter __iter) _NOEXCEPT 
{ return _Impl::__rewrap(std::move(__orig_iter), std::move(__iter)); } diff --git a/libcxx/include/__cxx03/__algorithm/unwrap_range.h b/libcxx/include/__cxx03/__algorithm/unwrap_range.h index ed1a6b167c608..1926676dd1708 100644 --- a/libcxx/include/__cxx03/__algorithm/unwrap_range.h +++ b/libcxx/include/__cxx03/__algorithm/unwrap_range.h @@ -30,12 +30,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD // the same type. __unwrap_range tries to get two iterators and then forward to __unwrap_iter. template ()))> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR pair<_Unwrapped, _Unwrapped> __unwrap_range(_Iter __first, _Iter __last) { +_LIBCPP_HIDE_FROM_ABI pair<_Unwrapped, _Unwrapped> __unwrap_range(_Iter __first, _Iter __last) { return std::make_pair(std::__unwrap_iter(std::move(__first)), std::__unwrap_iter(std::move(__last))); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Iter __rewrap_range(_Iter __orig_iter, _Unwrapped __iter) { +_LIBCPP_HIDE_FROM_ABI _Iter __rewrap_range(_Iter __orig_iter, _Unwrapped __iter) { return std::__rewrap_iter(std::move(__orig_iter), std::move(__iter)); } diff --git a/libcxx/include/__cxx03/__algorithm/upper_bound.h b/libcxx/include/__cxx03/__algorithm/upper_bound.h index d01780291c6e5..6c7cc37934d91 100644 --- a/libcxx/include/__cxx03/__algorithm/upper_bound.h +++ b/libcxx/include/__cxx03/__algorithm/upper_bound.h @@ -31,7 +31,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter +_LIBCPP_HIDE_FROM_ABI _Iter __upper_bound(_Iter __first, _Sent __last, const _Tp& __value, _Compare&& __comp, _Proj&& __proj) { auto __len = _IterOps<_AlgPolicy>::distance(__first, __last); while (__len != 0) { @@ -48,7 +48,7 @@ __upper_bound(_Iter __first, _Sent __last, const _Tp& __value, _Compare&& __comp } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator upper_bound(_ForwardIterator 
__first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) { static_assert(is_copy_constructible<_ForwardIterator>::value, "Iterator has to be copy constructible"); return std::__upper_bound<_ClassicAlgPolicy>( @@ -56,7 +56,7 @@ upper_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __valu } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _ForwardIterator upper_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { return std::upper_bound(std::move(__first), std::move(__last), __value, __less<>()); } diff --git a/libcxx/include/__cxx03/__atomic/atomic.h b/libcxx/include/__cxx03/__atomic/atomic.h index f275ee32723f9..bc4a3937ce8be 100644 --- a/libcxx/include/__cxx03/__atomic/atomic.h +++ b/libcxx/include/__cxx03/__atomic/atomic.h @@ -40,7 +40,7 @@ struct atomic : public __atomic_base<_Tp> { _LIBCPP_HIDE_FROM_ABI atomic() _NOEXCEPT = default; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR atomic(_Tp __d) _NOEXCEPT : __base(__d) {} + _LIBCPP_HIDE_FROM_ABI atomic(_Tp __d) _NOEXCEPT : __base(__d) {} _LIBCPP_HIDE_FROM_ABI _Tp operator=(_Tp __d) volatile _NOEXCEPT { __base::store(__d); @@ -65,7 +65,7 @@ struct atomic<_Tp*> : public __atomic_base<_Tp*> { _LIBCPP_HIDE_FROM_ABI atomic() _NOEXCEPT = default; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR atomic(_Tp* __d) _NOEXCEPT : __base(__d) {} + _LIBCPP_HIDE_FROM_ABI atomic(_Tp* __d) _NOEXCEPT : __base(__d) {} _LIBCPP_HIDE_FROM_ABI _Tp* operator=(_Tp* __d) volatile _NOEXCEPT { __base::store(__d); @@ -132,14 +132,12 @@ _LIBCPP_HIDE_FROM_ABI bool atomic_is_lock_free(const atomic<_Tp>* __o) _NOEXCEPT // atomic_init template -_LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI void -atomic_init(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI void atomic_init(volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d) 
_NOEXCEPT { std::__cxx_atomic_init(std::addressof(__o->__a_), __d); } template -_LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI void -atomic_init(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI void atomic_init(atomic<_Tp>* __o, typename atomic<_Tp>::value_type __d) _NOEXCEPT { std::__cxx_atomic_init(std::addressof(__o->__a_), __d); } diff --git a/libcxx/include/__cxx03/__atomic/atomic_base.h b/libcxx/include/__cxx03/__atomic/atomic_base.h index 81424bc40938d..a2b40c6a7e6f2 100644 --- a/libcxx/include/__cxx03/__atomic/atomic_base.h +++ b/libcxx/include/__cxx03/__atomic/atomic_base.h @@ -116,7 +116,7 @@ struct __atomic_base // false _LIBCPP_HIDE_FROM_ABI __atomic_base() _NOEXCEPT = default; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __atomic_base(_Tp __d) _NOEXCEPT : __a_(__d) {} + _LIBCPP_HIDE_FROM_ABI __atomic_base(_Tp __d) _NOEXCEPT : __a_(__d) {} __atomic_base(const __atomic_base&) = delete; }; @@ -127,9 +127,9 @@ template struct __atomic_base<_Tp, true> : public __atomic_base<_Tp, false> { using __base = __atomic_base<_Tp, false>; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __atomic_base() _NOEXCEPT = default; + _LIBCPP_HIDE_FROM_ABI __atomic_base() _NOEXCEPT = default; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __atomic_base(_Tp __d) _NOEXCEPT : __base(__d) {} + _LIBCPP_HIDE_FROM_ABI __atomic_base(_Tp __d) _NOEXCEPT : __base(__d) {} _LIBCPP_HIDE_FROM_ABI _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) volatile _NOEXCEPT { return std::__cxx_atomic_fetch_add(std::addressof(this->__a_), __op, __m); diff --git a/libcxx/include/__cxx03/__atomic/atomic_flag.h b/libcxx/include/__cxx03/__atomic/atomic_flag.h index fb2aac4ca9f78..316e014163da4 100644 --- a/libcxx/include/__cxx03/__atomic/atomic_flag.h +++ b/libcxx/include/__cxx03/__atomic/atomic_flag.h @@ -70,7 +70,7 @@ struct atomic_flag { } atomic_flag() _NOEXCEPT = default; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR atomic_flag(bool __b) _NOEXCEPT 
: __a_(__b) {} // EXTENSION + _LIBCPP_HIDE_FROM_ABI atomic_flag(bool __b) _NOEXCEPT : __a_(__b) {} // EXTENSION atomic_flag(const atomic_flag&) = delete; atomic_flag& operator=(const atomic_flag&) = delete; diff --git a/libcxx/include/__cxx03/__atomic/cxx_atomic_impl.h b/libcxx/include/__cxx03/__atomic/cxx_atomic_impl.h index 2a0bb1a661892..4ec516b4e8b7e 100644 --- a/libcxx/include/__cxx03/__atomic/cxx_atomic_impl.h +++ b/libcxx/include/__cxx03/__atomic/cxx_atomic_impl.h @@ -45,7 +45,7 @@ _LIBCPP_HIDE_FROM_ABI void __cxx_atomic_assign_volatile(_Tp volatile& __a_value, template struct __cxx_atomic_base_impl { _LIBCPP_HIDE_FROM_ABI __cxx_atomic_base_impl() _NOEXCEPT : __a_value() {} - _LIBCPP_CONSTEXPR explicit __cxx_atomic_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} + explicit __cxx_atomic_base_impl(_Tp value) _NOEXCEPT : __a_value(value) {} _Tp __a_value; }; @@ -258,7 +258,7 @@ __cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_o template struct __cxx_atomic_base_impl { _LIBCPP_HIDE_FROM_ABI __cxx_atomic_base_impl() _NOEXCEPT : __a_value() {} - _LIBCPP_CONSTEXPR explicit __cxx_atomic_base_impl(_Tp __value) _NOEXCEPT : __a_value(__value) {} + explicit __cxx_atomic_base_impl(_Tp __value) _NOEXCEPT : __a_value(__value) {} _LIBCPP_DISABLE_EXTENSION_WARNING _Atomic(_Tp) __a_value; }; @@ -334,7 +334,7 @@ __cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp>* __a, _Tp __value, memory_orde std::addressof(__a->__a_value), __value, static_cast<__memory_order_underlying_t>(__order)); } -_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR memory_order __to_failure_order(memory_order __order) { +_LIBCPP_HIDE_FROM_ABI inline memory_order __to_failure_order(memory_order __order) { // Avoid switch statement to make this a constexpr. return __order == memory_order_release ? 
memory_order_relaxed @@ -490,7 +490,7 @@ struct __cxx_atomic_impl : public _Base { static_assert(is_trivially_copyable<_Tp>::value, "std::atomic requires that 'T' be a trivially copyable type"); _LIBCPP_HIDE_FROM_ABI __cxx_atomic_impl() _NOEXCEPT = default; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __cxx_atomic_impl(_Tp __value) _NOEXCEPT : _Base(__value) {} + _LIBCPP_HIDE_FROM_ABI explicit __cxx_atomic_impl(_Tp __value) _NOEXCEPT : _Base(__value) {} }; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__atomic/to_gcc_order.h b/libcxx/include/__cxx03/__atomic/to_gcc_order.h index aab3c59602f11..aa510c16691cb 100644 --- a/libcxx/include/__cxx03/__atomic/to_gcc_order.h +++ b/libcxx/include/__cxx03/__atomic/to_gcc_order.h @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if defined(__ATOMIC_RELAXED) && defined(__ATOMIC_CONSUME) && defined(__ATOMIC_ACQUIRE) && \ defined(__ATOMIC_RELEASE) && defined(__ATOMIC_ACQ_REL) && defined(__ATOMIC_SEQ_CST) -_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR int __to_gcc_order(memory_order __order) { +_LIBCPP_HIDE_FROM_ABI inline int __to_gcc_order(memory_order __order) { // Avoid switch statement to make this a constexpr. return __order == memory_order_relaxed ? __ATOMIC_RELAXED @@ -34,7 +34,7 @@ _LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR int __to_gcc_order(memory_order _ : (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL : __ATOMIC_CONSUME)))); } -_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR int __to_gcc_failure_order(memory_order __order) { +_LIBCPP_HIDE_FROM_ABI inline int __to_gcc_failure_order(memory_order __order) { // Avoid switch statement to make this a constexpr. return __order == memory_order_relaxed ? 
__ATOMIC_RELAXED diff --git a/libcxx/include/__cxx03/__bit/blsr.h b/libcxx/include/__cxx03/__bit/blsr.h index ae1d8b588925d..b8027d9137190 100644 --- a/libcxx/include/__cxx03/__bit/blsr.h +++ b/libcxx/include/__cxx03/__bit/blsr.h @@ -17,15 +17,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR unsigned __libcpp_blsr(unsigned __x) _NOEXCEPT { - return __x ^ (__x & -__x); -} +inline _LIBCPP_HIDE_FROM_ABI unsigned __libcpp_blsr(unsigned __x) _NOEXCEPT { return __x ^ (__x & -__x); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR unsigned long __libcpp_blsr(unsigned long __x) _NOEXCEPT { - return __x ^ (__x & -__x); -} +inline _LIBCPP_HIDE_FROM_ABI unsigned long __libcpp_blsr(unsigned long __x) _NOEXCEPT { return __x ^ (__x & -__x); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR unsigned long long __libcpp_blsr(unsigned long long __x) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI unsigned long long __libcpp_blsr(unsigned long long __x) _NOEXCEPT { return __x ^ (__x & -__x); } diff --git a/libcxx/include/__cxx03/__bit/countl.h b/libcxx/include/__cxx03/__bit/countl.h index d73f9cac0fa41..3f0161aef6a32 100644 --- a/libcxx/include/__cxx03/__bit/countl.h +++ b/libcxx/include/__cxx03/__bit/countl.h @@ -26,20 +26,18 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_clz(unsigned __x) _NOEXCEPT { - return __builtin_clz(__x); -} +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI int __libcpp_clz(unsigned __x) _NOEXCEPT { return __builtin_clz(__x); } -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_clz(unsigned long __x) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI int __libcpp_clz(unsigned long __x) _NOEXCEPT { return __builtin_clzl(__x); } -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_clz(unsigned long long __x) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI int __libcpp_clz(unsigned 
long long __x) _NOEXCEPT { return __builtin_clzll(__x); } #ifndef _LIBCPP_HAS_NO_INT128 -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_clz(__uint128_t __x) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI int __libcpp_clz(__uint128_t __x) _NOEXCEPT { # if __has_builtin(__builtin_clzg) return __builtin_clzg(__x); # else @@ -59,7 +57,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_clz(__uint128_t __x) #endif // _LIBCPP_HAS_NO_INT128 template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 int __countl_zero(_Tp __t) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI int __countl_zero(_Tp __t) _NOEXCEPT { static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_zero requires an unsigned integer type"); #if __has_builtin(__builtin_clzg) return __builtin_clzg(__t, numeric_limits<_Tp>::digits); diff --git a/libcxx/include/__cxx03/__bit/countr.h b/libcxx/include/__cxx03/__bit/countr.h index 84124669ed633..2f5b5591dc3f5 100644 --- a/libcxx/include/__cxx03/__bit/countr.h +++ b/libcxx/include/__cxx03/__bit/countr.h @@ -25,20 +25,18 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_ctz(unsigned __x) _NOEXCEPT { - return __builtin_ctz(__x); -} +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI int __libcpp_ctz(unsigned __x) _NOEXCEPT { return __builtin_ctz(__x); } -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_ctz(unsigned long __x) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI int __libcpp_ctz(unsigned long __x) _NOEXCEPT { return __builtin_ctzl(__x); } -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_ctz(unsigned long long __x) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI int __libcpp_ctz(unsigned long long __x) _NOEXCEPT { return __builtin_ctzll(__x); } template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 int __countr_zero(_Tp __t) _NOEXCEPT { +_LIBCPP_NODISCARD 
_LIBCPP_HIDE_FROM_ABI int __countr_zero(_Tp __t) _NOEXCEPT { #if __has_builtin(__builtin_ctzg) return __builtin_ctzg(__t, numeric_limits<_Tp>::digits); #else // __has_builtin(__builtin_ctzg) diff --git a/libcxx/include/__cxx03/__bit/invert_if.h b/libcxx/include/__cxx03/__bit/invert_if.h index 270bd55a59e96..cc2815e5bbee7 100644 --- a/libcxx/include/__cxx03/__bit/invert_if.h +++ b/libcxx/include/__cxx03/__bit/invert_if.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __invert_if(_Tp __v) { +_LIBCPP_HIDE_FROM_ABI _Tp __invert_if(_Tp __v) { if (_Invert) return ~__v; return __v; diff --git a/libcxx/include/__cxx03/__bit/popcount.h b/libcxx/include/__cxx03/__bit/popcount.h index b91e80e1a6e5b..64404d2cf4948 100644 --- a/libcxx/include/__cxx03/__bit/popcount.h +++ b/libcxx/include/__cxx03/__bit/popcount.h @@ -25,15 +25,11 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_popcount(unsigned __x) _NOEXCEPT { - return __builtin_popcount(__x); -} +inline _LIBCPP_HIDE_FROM_ABI int __libcpp_popcount(unsigned __x) _NOEXCEPT { return __builtin_popcount(__x); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_popcount(unsigned long __x) _NOEXCEPT { - return __builtin_popcountl(__x); -} +inline _LIBCPP_HIDE_FROM_ABI int __libcpp_popcount(unsigned long __x) _NOEXCEPT { return __builtin_popcountl(__x); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_popcount(unsigned long long __x) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI int __libcpp_popcount(unsigned long long __x) _NOEXCEPT { return __builtin_popcountll(__x); } diff --git a/libcxx/include/__cxx03/__bit/rotate.h b/libcxx/include/__cxx03/__bit/rotate.h index f828d73f73cb5..fbe121fe54a67 100644 --- a/libcxx/include/__cxx03/__bit/rotate.h +++ b/libcxx/include/__cxx03/__bit/rotate.h @@ -23,7 +23,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // to optimize the code. 
On x86 this function becomes the ROL instruction and // the rotr function becomes the ROR instruction. template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __rotl(_Tp __x, int __s) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI _Tp __rotl(_Tp __x, int __s) _NOEXCEPT { static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotl requires an unsigned integer type"); const int __N = numeric_limits<_Tp>::digits; int __r = __s % __N; @@ -38,7 +38,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __rotl(_Tp __x, int __s) } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __rotr(_Tp __x, int __s) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI _Tp __rotr(_Tp __x, int __s) _NOEXCEPT { static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotr requires an unsigned integer type"); const int __N = numeric_limits<_Tp>::digits; int __r = __s % __N; diff --git a/libcxx/include/__cxx03/__bit_reference b/libcxx/include/__cxx03/__bit_reference index ec87c4faf6474..76027e2d1523f 100644 --- a/libcxx/include/__cxx03/__bit_reference +++ b/libcxx/include/__cxx03/__bit_reference @@ -58,16 +58,12 @@ class __bit_reference { public: using __container = typename _Cp::__self; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_reference(const __bit_reference&) = default; + _LIBCPP_HIDE_FROM_ABI __bit_reference(const __bit_reference&) = default; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 operator bool() const _NOEXCEPT { - return static_cast(*__seg_ & __mask_); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool operator~() const _NOEXCEPT { - return !static_cast(*this); - } + _LIBCPP_HIDE_FROM_ABI operator bool() const _NOEXCEPT { return static_cast(*__seg_ & __mask_); } + _LIBCPP_HIDE_FROM_ABI bool operator~() const _NOEXCEPT { return !static_cast(*this); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_reference& operator=(bool __x) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __bit_reference& operator=(bool __x) _NOEXCEPT { if (__x) 
*__seg_ |= __mask_; else @@ -75,18 +71,17 @@ public: return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_reference& operator=(const __bit_reference& __x) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __bit_reference& operator=(const __bit_reference& __x) _NOEXCEPT { return operator=(static_cast(__x)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void flip() _NOEXCEPT { *__seg_ ^= __mask_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> operator&() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void flip() _NOEXCEPT { *__seg_ ^= __mask_; } + _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> operator&() const _NOEXCEPT { return __bit_iterator<_Cp, false>(__seg_, static_cast(std::__libcpp_ctz(__mask_))); } private: - _LIBCPP_HIDE_FROM_ABI - _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit __bit_reference(__storage_pointer __s, __storage_type __m) _NOEXCEPT + _LIBCPP_HIDE_FROM_ABI explicit __bit_reference(__storage_pointer __s, __storage_type __m) _NOEXCEPT : __seg_(__s), __mask_(__m) {} }; @@ -95,30 +90,28 @@ template class __bit_reference<_Cp, false> {}; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -swap(__bit_reference<_Cp> __x, __bit_reference<_Cp> __y) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void swap(__bit_reference<_Cp> __x, __bit_reference<_Cp> __y) _NOEXCEPT { bool __t = __x; __x = __y; __y = __t; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -swap(__bit_reference<_Cp> __x, __bit_reference<_Dp> __y) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void swap(__bit_reference<_Cp> __x, __bit_reference<_Dp> __y) _NOEXCEPT { bool __t = __x; __x = __y; __y = __t; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(__bit_reference<_Cp> __x, bool& __y) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void swap(__bit_reference<_Cp> __x, bool& __y) _NOEXCEPT { bool __t = __x; __x = __y; __y = __t; } template -inline _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(bool& __x, __bit_reference<_Cp> __y) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void swap(bool& __x, __bit_reference<_Cp> __y) _NOEXCEPT { bool __t = __x; __x = __y; __y = __t; @@ -141,21 +134,18 @@ public: _LIBCPP_HIDE_FROM_ABI __bit_const_reference(const __bit_const_reference&) = default; __bit_const_reference& operator=(const __bit_const_reference&) = delete; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_const_reference(const __bit_reference<_Cp>& __x) _NOEXCEPT + _LIBCPP_HIDE_FROM_ABI __bit_const_reference(const __bit_reference<_Cp>& __x) _NOEXCEPT : __seg_(__x.__seg_), __mask_(__x.__mask_) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR operator bool() const _NOEXCEPT { - return static_cast(*__seg_ & __mask_); - } + _LIBCPP_HIDE_FROM_ABI operator bool() const _NOEXCEPT { return static_cast(*__seg_ & __mask_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, true> operator&() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, true> operator&() const _NOEXCEPT { return __bit_iterator<_Cp, true>(__seg_, static_cast(std::__libcpp_ctz(__mask_))); } private: - _LIBCPP_HIDE_FROM_ABI - _LIBCPP_CONSTEXPR explicit __bit_const_reference(__storage_pointer __s, __storage_type __m) _NOEXCEPT + _LIBCPP_HIDE_FROM_ABI explicit __bit_const_reference(__storage_pointer __s, __storage_type __m) _NOEXCEPT : __seg_(__s), __mask_(__m) {} }; @@ -163,7 +153,7 @@ private: // copy template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned( +_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { using _In = __bit_iterator<_Cp, _IsConst>; using difference_type = typename _In::difference_type; @@ -206,7 +196,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> _ } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 
_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned( +_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { using _In = __bit_iterator<_Cp, _IsConst>; using difference_type = typename _In::difference_type; @@ -277,7 +267,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> _ } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> +inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { if (__first.__ctz_ == __result.__ctz_) return std::__copy_aligned(__first, __last, __result); @@ -287,7 +277,7 @@ copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last // copy_backward template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned( +_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { using _In = __bit_iterator<_Cp, _IsConst>; using difference_type = typename _In::difference_type; @@ -329,7 +319,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> _ } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned( +_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { using _In = __bit_iterator<_Cp, _IsConst>; using difference_type = typename _In::difference_type; @@ -405,7 +395,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> _ } template -inline _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> copy_backward( +inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> copy_backward( __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { if (__last.__ctz_ == __result.__ctz_) return std::__copy_backward_aligned(__first, __last, __result); @@ -589,26 +579,26 @@ struct __bit_array { difference_type __size_; __storage_type __word_[_Np]; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static difference_type capacity() { + _LIBCPP_HIDE_FROM_ABI static difference_type capacity() { return static_cast(_Np * __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit __bit_array(difference_type __s) : __size_(__s) { + _LIBCPP_HIDE_FROM_ABI explicit __bit_array(difference_type __s) : __size_(__s) { if (__libcpp_is_constant_evaluated()) { for (size_t __i = 0; __i != __bit_array<_Cp>::_Np; ++__i) std::__construct_at(__word_ + __i, 0); } } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator begin() { + _LIBCPP_HIDE_FROM_ABI iterator begin() { return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]), 0); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator end() { + _LIBCPP_HIDE_FROM_ABI iterator end() { return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]) + __size_ / __bits_per_word, static_cast(__size_ % __bits_per_word)); } }; template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> +_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> rotate(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __middle, __bit_iterator<_Cp, false> __last) { using _I1 = __bit_iterator<_Cp, false>; using difference_type = typename _I1::difference_type; @@ -649,7 +639,7 @@ rotate(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __middle, // equal template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool __equal_unaligned( 
+_LIBCPP_HIDE_FROM_ABI bool __equal_unaligned( __bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { using _It = __bit_iterator<_Cp, _IC1>; using difference_type = typename _It::difference_type; @@ -721,7 +711,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool __equal_unaligned( } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool __equal_aligned( +_LIBCPP_HIDE_FROM_ABI bool __equal_aligned( __bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { using _It = __bit_iterator<_Cp, _IC1>; using difference_type = typename _It::difference_type; @@ -760,7 +750,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool __equal_aligned( } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool +inline _LIBCPP_HIDE_FROM_ABI bool equal(__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { if (__first1.__ctz_ == __first2.__ctz_) return std::__equal_aligned(__first1, __last1, __first2); @@ -791,14 +781,14 @@ private: unsigned __ctz_; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator() _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI __bit_iterator() _NOEXCEPT {} // When _IsConst=false, this is the copy constructor. // It is non-trivial. Making it trivial would break ABI. // When _IsConst=true, this is a converting constructor; // the copy and move constructors are implicitly generated // and trivial. - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator(const __bit_iterator<_Cp, false>& __it) _NOEXCEPT + _LIBCPP_HIDE_FROM_ABI __bit_iterator(const __bit_iterator<_Cp, false>& __it) _NOEXCEPT : __seg_(__it.__seg_), __ctz_(__it.__ctz_) {} @@ -807,19 +797,18 @@ public: // the implicit generation of a defaulted one is deprecated. // When _IsConst=true, the assignment operators are // implicitly generated and trivial. 
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator& - operator=(const _If<_IsConst, struct __private_nat, __bit_iterator>& __it) { + _LIBCPP_HIDE_FROM_ABI __bit_iterator& operator=(const _If<_IsConst, struct __private_nat, __bit_iterator>& __it) { __seg_ = __it.__seg_; __ctz_ = __it.__ctz_; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference operator*() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference operator*() const _NOEXCEPT { return __conditional_t<_IsConst, __bit_const_reference<_Cp>, __bit_reference<_Cp> >( __seg_, __storage_type(1) << __ctz_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator& operator++() { + _LIBCPP_HIDE_FROM_ABI __bit_iterator& operator++() { if (__ctz_ != __bits_per_word - 1) ++__ctz_; else { @@ -829,13 +818,13 @@ public: return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator operator++(int) { + _LIBCPP_HIDE_FROM_ABI __bit_iterator operator++(int) { __bit_iterator __tmp = *this; ++(*this); return __tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator& operator--() { + _LIBCPP_HIDE_FROM_ABI __bit_iterator& operator--() { if (__ctz_ != 0) --__ctz_; else { @@ -845,13 +834,13 @@ public: return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator operator--(int) { + _LIBCPP_HIDE_FROM_ABI __bit_iterator operator--(int) { __bit_iterator __tmp = *this; --(*this); return __tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator& operator+=(difference_type __n) { + _LIBCPP_HIDE_FROM_ABI __bit_iterator& operator+=(difference_type __n) { if (__n >= 0) __seg_ += (__n + __ctz_) / __bits_per_word; else @@ -862,69 +851,56 @@ public: return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator& operator-=(difference_type __n) { - return *this += -__n; - } + _LIBCPP_HIDE_FROM_ABI __bit_iterator& operator-=(difference_type __n) { return *this += -__n; } - 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator operator+(difference_type __n) const { + _LIBCPP_HIDE_FROM_ABI __bit_iterator operator+(difference_type __n) const { __bit_iterator __t(*this); __t += __n; return __t; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator operator-(difference_type __n) const { + _LIBCPP_HIDE_FROM_ABI __bit_iterator operator-(difference_type __n) const { __bit_iterator __t(*this); __t -= __n; return __t; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator - operator+(difference_type __n, const __bit_iterator& __it) { + _LIBCPP_HIDE_FROM_ABI friend __bit_iterator operator+(difference_type __n, const __bit_iterator& __it) { return __it + __n; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 friend difference_type - operator-(const __bit_iterator& __x, const __bit_iterator& __y) { + _LIBCPP_HIDE_FROM_ABI friend difference_type operator-(const __bit_iterator& __x, const __bit_iterator& __y) { return (__x.__seg_ - __y.__seg_) * __bits_per_word + __x.__ctz_ - __y.__ctz_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference operator[](difference_type __n) const { - return *(*this + __n); - } + _LIBCPP_HIDE_FROM_ABI reference operator[](difference_type __n) const { return *(*this + __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool - operator==(const __bit_iterator& __x, const __bit_iterator& __y) { + _LIBCPP_HIDE_FROM_ABI friend bool operator==(const __bit_iterator& __x, const __bit_iterator& __y) { return __x.__seg_ == __y.__seg_ && __x.__ctz_ == __y.__ctz_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool - operator!=(const __bit_iterator& __x, const __bit_iterator& __y) { + _LIBCPP_HIDE_FROM_ABI friend bool operator!=(const __bit_iterator& __x, const __bit_iterator& __y) { return !(__x == __y); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool - operator<(const __bit_iterator& __x, const 
__bit_iterator& __y) { + _LIBCPP_HIDE_FROM_ABI friend bool operator<(const __bit_iterator& __x, const __bit_iterator& __y) { return __x.__seg_ < __y.__seg_ || (__x.__seg_ == __y.__seg_ && __x.__ctz_ < __y.__ctz_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool - operator>(const __bit_iterator& __x, const __bit_iterator& __y) { + _LIBCPP_HIDE_FROM_ABI friend bool operator>(const __bit_iterator& __x, const __bit_iterator& __y) { return __y < __x; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool - operator<=(const __bit_iterator& __x, const __bit_iterator& __y) { + _LIBCPP_HIDE_FROM_ABI friend bool operator<=(const __bit_iterator& __x, const __bit_iterator& __y) { return !(__y < __x); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool - operator>=(const __bit_iterator& __x, const __bit_iterator& __y) { + _LIBCPP_HIDE_FROM_ABI friend bool operator>=(const __bit_iterator& __x, const __bit_iterator& __y) { return !(__x < __y); } private: - _LIBCPP_HIDE_FROM_ABI - _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit __bit_iterator(__storage_pointer __s, unsigned __ctz) _NOEXCEPT + _LIBCPP_HIDE_FROM_ABI explicit __bit_iterator(__storage_pointer __s, unsigned __ctz) _NOEXCEPT : __seg_(__s), __ctz_(__ctz) {} @@ -937,26 +913,25 @@ private: friend struct __bit_array; template - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void - __fill_n_bool(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); + friend void __fill_n_bool(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_aligned( + friend __bit_iterator<_Dp, false> __copy_aligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_unaligned( + friend __bit_iterator<_Dp, false> __copy_unaligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, 
_IC> __last, __bit_iterator<_Dp, false> __result); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> + friend __bit_iterator<_Dp, false> copy(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_aligned( + friend __bit_iterator<_Dp, false> __copy_backward_aligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_unaligned( + friend __bit_iterator<_Dp, false> __copy_backward_unaligned( __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> + friend __bit_iterator<_Dp, false> copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); template friend __bit_iterator<_Cr, false> @@ -968,23 +943,19 @@ private: friend __bit_iterator<_Cr, false> swap_ranges(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> + friend __bit_iterator<_Dp, false> rotate(__bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool - __equal_aligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + friend bool __equal_aligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool - __equal_unaligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + friend bool __equal_unaligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); template - 
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend bool - equal(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + friend bool equal(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, _IC> - __find_bool(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + friend __bit_iterator<_Dp, _IC> __find_bool(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); template - friend typename __bit_iterator<_Dp, _IC>::difference_type _LIBCPP_HIDE_FROM_ABI - _LIBCPP_CONSTEXPR_SINCE_CXX20 __count_bool(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + friend typename __bit_iterator<_Dp, _IC>::difference_type + _LIBCPP_HIDE_FROM_ABI __count_bool(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); }; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__chrono/duration.h b/libcxx/include/__cxx03/__chrono/duration.h index 01c75d7c1abaf..3a96df7ee99f0 100644 --- a/libcxx/include/__cxx03/__chrono/duration.h +++ b/libcxx/include/__cxx03/__chrono/duration.h @@ -68,14 +68,14 @@ struct __duration_cast; template struct __duration_cast<_FromDuration, _ToDuration, _Period, true, true> { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ToDuration operator()(const _FromDuration& __fd) const { + _LIBCPP_HIDE_FROM_ABI _ToDuration operator()(const _FromDuration& __fd) const { return _ToDuration(static_cast(__fd.count())); } }; template struct __duration_cast<_FromDuration, _ToDuration, _Period, true, false> { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ToDuration operator()(const _FromDuration& __fd) const { + _LIBCPP_HIDE_FROM_ABI _ToDuration operator()(const _FromDuration& __fd) const { typedef typename common_type::type _Ct; return _ToDuration( static_cast(static_cast<_Ct>(__fd.count()) / static_cast<_Ct>(_Period::den))); @@ -84,7 +84,7 @@ struct __duration_cast<_FromDuration, _ToDuration, _Period, true, false> { template struct __duration_cast<_FromDuration, _ToDuration, 
_Period, false, true> { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ToDuration operator()(const _FromDuration& __fd) const { + _LIBCPP_HIDE_FROM_ABI _ToDuration operator()(const _FromDuration& __fd) const { typedef typename common_type::type _Ct; return _ToDuration( static_cast(static_cast<_Ct>(__fd.count()) * static_cast<_Ct>(_Period::num))); @@ -93,7 +93,7 @@ struct __duration_cast<_FromDuration, _ToDuration, _Period, false, true> { template struct __duration_cast<_FromDuration, _ToDuration, _Period, false, false> { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ToDuration operator()(const _FromDuration& __fd) const { + _LIBCPP_HIDE_FROM_ABI _ToDuration operator()(const _FromDuration& __fd) const { typedef typename common_type::type _Ct; return _ToDuration(static_cast( static_cast<_Ct>(__fd.count()) * static_cast<_Ct>(_Period::num) / static_cast<_Ct>(_Period::den))); @@ -101,20 +101,22 @@ struct __duration_cast<_FromDuration, _ToDuration, _Period, false, false> { }; template ::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ToDuration duration_cast(const duration<_Rep, _Period>& __fd) { +inline _LIBCPP_HIDE_FROM_ABI _ToDuration duration_cast(const duration<_Rep, _Period>& __fd) { return __duration_cast, _ToDuration>()(__fd); } template struct _LIBCPP_TEMPLATE_VIS treat_as_floating_point : is_floating_point<_Rep> {}; +// clang-format off template struct _LIBCPP_TEMPLATE_VIS duration_values { public: - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR _Rep zero() _NOEXCEPT { return _Rep(0); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR _Rep max() _NOEXCEPT { return numeric_limits<_Rep>::max(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR _Rep min() _NOEXCEPT { return numeric_limits<_Rep>::lowest(); } + _LIBCPP_HIDE_FROM_ABI static _Rep zero() _NOEXCEPT { return _Rep(0); } + _LIBCPP_HIDE_FROM_ABI static _Rep max() _NOEXCEPT { return numeric_limits<_Rep>::max(); } + _LIBCPP_HIDE_FROM_ABI static _Rep min() _NOEXCEPT { return 
numeric_limits<_Rep>::lowest(); } }; +// clang-format on // duration @@ -165,7 +167,7 @@ class _LIBCPP_TEMPLATE_VIS duration { __enable_if_t::value && (treat_as_floating_point::value || !treat_as_floating_point<_Rep2>::value), int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit duration(const _Rep2& __r) : __rep_(__r) {} + _LIBCPP_HIDE_FROM_ABI explicit duration(const _Rep2& __r) : __rep_(__r) {} // conversions template ::type::den == 1 && !treat_as_floating_point<_Rep2>::value)), int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration(const duration<_Rep2, _Period2>& __d) + _LIBCPP_HIDE_FROM_ABI duration(const duration<_Rep2, _Period2>& __d) : __rep_(chrono::duration_cast(__d).count()) {} // observer - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR rep count() const { return __rep_; } + _LIBCPP_HIDE_FROM_ABI rep count() const { return __rep_; } // arithmetic - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR typename common_type::type operator+() const { + _LIBCPP_HIDE_FROM_ABI typename common_type::type operator+() const { return typename common_type::type(*this); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR typename common_type::type operator-() const { + _LIBCPP_HIDE_FROM_ABI typename common_type::type operator-() const { return typename common_type::type(-__rep_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 duration& operator++() { + _LIBCPP_HIDE_FROM_ABI duration& operator++() { ++__rep_; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 duration operator++(int) { return duration(__rep_++); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 duration& operator--() { + _LIBCPP_HIDE_FROM_ABI duration operator++(int) { return duration(__rep_++); } + _LIBCPP_HIDE_FROM_ABI duration& operator--() { --__rep_; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 duration operator--(int) { return duration(__rep_--); } + _LIBCPP_HIDE_FROM_ABI duration operator--(int) { return duration(__rep_--); } - 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 duration& operator+=(const duration& __d) { + _LIBCPP_HIDE_FROM_ABI duration& operator+=(const duration& __d) { __rep_ += __d.count(); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 duration& operator-=(const duration& __d) { + _LIBCPP_HIDE_FROM_ABI duration& operator-=(const duration& __d) { __rep_ -= __d.count(); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 duration& operator*=(const rep& __rhs) { + _LIBCPP_HIDE_FROM_ABI duration& operator*=(const rep& __rhs) { __rep_ *= __rhs; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 duration& operator/=(const rep& __rhs) { + _LIBCPP_HIDE_FROM_ABI duration& operator/=(const rep& __rhs) { __rep_ /= __rhs; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 duration& operator%=(const rep& __rhs) { + _LIBCPP_HIDE_FROM_ABI duration& operator%=(const rep& __rhs) { __rep_ %= __rhs; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 duration& operator%=(const duration& __rhs) { + _LIBCPP_HIDE_FROM_ABI duration& operator%=(const duration& __rhs) { __rep_ %= __rhs.count(); return *this; } // special values - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR duration zero() _NOEXCEPT { - return duration(duration_values::zero()); - } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR duration min() _NOEXCEPT { - return duration(duration_values::min()); - } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR duration max() _NOEXCEPT { - return duration(duration_values::max()); - } + _LIBCPP_HIDE_FROM_ABI static duration zero() _NOEXCEPT { return duration(duration_values::zero()); } + _LIBCPP_HIDE_FROM_ABI static duration min() _NOEXCEPT { return duration(duration_values::min()); } + _LIBCPP_HIDE_FROM_ABI static duration max() _NOEXCEPT { return duration(duration_values::max()); } }; typedef duration nanoseconds; @@ -250,7 +246,7 @@ typedef duration< long, ratio<3600> > hours; template 
struct __duration_eq { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator()(const _LhsDuration& __lhs, const _RhsDuration& __rhs) const { + _LIBCPP_HIDE_FROM_ABI bool operator()(const _LhsDuration& __lhs, const _RhsDuration& __rhs) const { typedef typename common_type<_LhsDuration, _RhsDuration>::type _Ct; return _Ct(__lhs).count() == _Ct(__rhs).count(); } @@ -258,13 +254,13 @@ struct __duration_eq { template struct __duration_eq<_LhsDuration, _LhsDuration> { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator()(const _LhsDuration& __lhs, const _LhsDuration& __rhs) const { + _LIBCPP_HIDE_FROM_ABI bool operator()(const _LhsDuration& __lhs, const _LhsDuration& __rhs) const { return __lhs.count() == __rhs.count(); } }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2>& __rhs) { return __duration_eq, duration<_Rep2, _Period2> >()(__lhs, __rhs); } @@ -272,7 +268,7 @@ operator==(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period // Duration != template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool +inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2>& __rhs) { return !(__lhs == __rhs); } @@ -281,7 +277,7 @@ operator!=(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period template struct __duration_lt { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator()(const _LhsDuration& __lhs, const _RhsDuration& __rhs) const { + _LIBCPP_HIDE_FROM_ABI bool operator()(const _LhsDuration& __lhs, const _RhsDuration& __rhs) const { typedef typename common_type<_LhsDuration, _RhsDuration>::type _Ct; return _Ct(__lhs).count() < _Ct(__rhs).count(); } @@ -289,13 +285,13 @@ struct __duration_lt { template struct __duration_lt<_LhsDuration, _LhsDuration> { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator()(const _LhsDuration& __lhs, const 
_LhsDuration& __rhs) const { + _LIBCPP_HIDE_FROM_ABI bool operator()(const _LhsDuration& __lhs, const _LhsDuration& __rhs) const { return __lhs.count() < __rhs.count(); } }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool +inline _LIBCPP_HIDE_FROM_ABI bool operator<(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2>& __rhs) { return __duration_lt, duration<_Rep2, _Period2> >()(__lhs, __rhs); } @@ -303,7 +299,7 @@ operator<(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2 // Duration > template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool +inline _LIBCPP_HIDE_FROM_ABI bool operator>(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2>& __rhs) { return __rhs < __lhs; } @@ -311,7 +307,7 @@ operator>(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2 // Duration <= template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool +inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2>& __rhs) { return !(__rhs < __lhs); } @@ -319,7 +315,7 @@ operator<=(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period // Duration >= template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool +inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2>& __rhs) { return !(__lhs < __rhs); } @@ -327,8 +323,7 @@ operator>=(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period // Duration + template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR -typename common_type, duration<_Rep2, _Period2> >::type +inline _LIBCPP_HIDE_FROM_ABI typename common_type, duration<_Rep2, _Period2> >::type operator+(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2>& __rhs) { typedef typename common_type, duration<_Rep2, _Period2> >::type _Cd; return _Cd(_Cd(__lhs).count() + _Cd(__rhs).count()); @@ -337,8 +332,7 @@ operator+(const 
duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2 // Duration - template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR -typename common_type, duration<_Rep2, _Period2> >::type +inline _LIBCPP_HIDE_FROM_ABI typename common_type, duration<_Rep2, _Period2> >::type operator-(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2>& __rhs) { typedef typename common_type, duration<_Rep2, _Period2> >::type _Cd; return _Cd(_Cd(__lhs).count() - _Cd(__rhs).count()); @@ -350,7 +344,7 @@ template ::type>::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration::type, _Period> +inline _LIBCPP_HIDE_FROM_ABI duration::type, _Period> operator*(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { typedef typename common_type<_Rep1, _Rep2>::type _Cr; typedef duration<_Cr, _Period> _Cd; @@ -361,7 +355,7 @@ template ::type>::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration::type, _Period> +inline _LIBCPP_HIDE_FROM_ABI duration::type, _Period> operator*(const _Rep1& __s, const duration<_Rep2, _Period>& __d) { return __d * __s; } @@ -374,7 +368,7 @@ template ::value && is_convertible::type>::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration::type, _Period> +inline _LIBCPP_HIDE_FROM_ABI duration::type, _Period> operator/(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { typedef typename common_type<_Rep1, _Rep2>::type _Cr; typedef duration<_Cr, _Period> _Cd; @@ -382,7 +376,7 @@ operator/(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR typename common_type<_Rep1, _Rep2>::type +inline _LIBCPP_HIDE_FROM_ABI typename common_type<_Rep1, _Rep2>::type operator/(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2>& __rhs) { typedef typename common_type, duration<_Rep2, _Period2> >::type _Ct; return _Ct(__lhs).count() / _Ct(__rhs).count(); @@ -396,7 +390,7 @@ template ::value && 
is_convertible::type>::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR duration::type, _Period> +inline _LIBCPP_HIDE_FROM_ABI duration::type, _Period> operator%(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { typedef typename common_type<_Rep1, _Rep2>::type _Cr; typedef duration<_Cr, _Period> _Cd; @@ -404,8 +398,7 @@ operator%(const duration<_Rep1, _Period>& __d, const _Rep2& __s) { } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR -typename common_type, duration<_Rep2, _Period2> >::type +inline _LIBCPP_HIDE_FROM_ABI typename common_type, duration<_Rep2, _Period2> >::type operator%(const duration<_Rep1, _Period1>& __lhs, const duration<_Rep2, _Period2>& __rhs) { typedef typename common_type<_Rep1, _Rep2>::type _Cr; typedef typename common_type, duration<_Rep2, _Period2> >::type _Cd; diff --git a/libcxx/include/__cxx03/__chrono/steady_clock.h b/libcxx/include/__cxx03/__chrono/steady_clock.h index f8a56954f31ad..5a8fa4f84ad48 100644 --- a/libcxx/include/__cxx03/__chrono/steady_clock.h +++ b/libcxx/include/__cxx03/__chrono/steady_clock.h @@ -29,7 +29,7 @@ class _LIBCPP_EXPORTED_FROM_ABI steady_clock { typedef duration::rep rep; typedef duration::period period; typedef chrono::time_point time_point; - static _LIBCPP_CONSTEXPR_SINCE_CXX14 const bool is_steady = true; + static const bool is_steady = true; static time_point now() _NOEXCEPT; }; diff --git a/libcxx/include/__cxx03/__chrono/system_clock.h b/libcxx/include/__cxx03/__chrono/system_clock.h index 28cf3562036b1..09d2d698ad9f2 100644 --- a/libcxx/include/__cxx03/__chrono/system_clock.h +++ b/libcxx/include/__cxx03/__chrono/system_clock.h @@ -29,7 +29,7 @@ class _LIBCPP_EXPORTED_FROM_ABI system_clock { typedef duration::rep rep; typedef duration::period period; typedef chrono::time_point time_point; - static _LIBCPP_CONSTEXPR_SINCE_CXX14 const bool is_steady = false; + static const bool is_steady = false; static time_point now() _NOEXCEPT; static time_t to_time_t(const time_point& 
__t) _NOEXCEPT; diff --git a/libcxx/include/__cxx03/__chrono/time_point.h b/libcxx/include/__cxx03/__chrono/time_point.h index cf0fbc28a8856..8ec687d837717 100644 --- a/libcxx/include/__cxx03/__chrono/time_point.h +++ b/libcxx/include/__cxx03/__chrono/time_point.h @@ -43,33 +43,32 @@ class _LIBCPP_TEMPLATE_VIS time_point { duration __d_; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 time_point() : __d_(duration::zero()) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit time_point(const duration& __d) : __d_(__d) {} + _LIBCPP_HIDE_FROM_ABI time_point() : __d_(duration::zero()) {} + _LIBCPP_HIDE_FROM_ABI explicit time_point(const duration& __d) : __d_(__d) {} // conversions template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 time_point(const time_point& __t) - : __d_(__t.time_since_epoch()) {} + _LIBCPP_HIDE_FROM_ABI time_point(const time_point& __t) : __d_(__t.time_since_epoch()) {} // observer - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 duration time_since_epoch() const { return __d_; } + _LIBCPP_HIDE_FROM_ABI duration time_since_epoch() const { return __d_; } // arithmetic - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 time_point& operator+=(const duration& __d) { + _LIBCPP_HIDE_FROM_ABI time_point& operator+=(const duration& __d) { __d_ += __d; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 time_point& operator-=(const duration& __d) { + _LIBCPP_HIDE_FROM_ABI time_point& operator-=(const duration& __d) { __d_ -= __d; return *this; } // special values - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR time_point min() _NOEXCEPT { return time_point(duration::min()); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR time_point max() _NOEXCEPT { return time_point(duration::max()); } + _LIBCPP_HIDE_FROM_ABI static time_point min() _NOEXCEPT { return time_point(duration::min()); } + _LIBCPP_HIDE_FROM_ABI static time_point max() _NOEXCEPT { return 
time_point(duration::max()); } }; } // namespace chrono @@ -83,15 +82,14 @@ common_type, chrono::time_point<_Clock, _ namespace chrono { template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 time_point<_Clock, _ToDuration> -time_point_cast(const time_point<_Clock, _Duration>& __t) { +inline _LIBCPP_HIDE_FROM_ABI time_point<_Clock, _ToDuration> time_point_cast(const time_point<_Clock, _Duration>& __t) { return time_point<_Clock, _ToDuration>(chrono::duration_cast<_ToDuration>(__t.time_since_epoch())); } // time_point == template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const time_point<_Clock, _Duration1>& __lhs, const time_point<_Clock, _Duration2>& __rhs) { return __lhs.time_since_epoch() == __rhs.time_since_epoch(); } @@ -99,7 +97,7 @@ operator==(const time_point<_Clock, _Duration1>& __lhs, const time_point<_Clock, // time_point != template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool +inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const time_point<_Clock, _Duration1>& __lhs, const time_point<_Clock, _Duration2>& __rhs) { return !(__lhs == __rhs); } @@ -107,7 +105,7 @@ operator!=(const time_point<_Clock, _Duration1>& __lhs, const time_point<_Clock, // time_point < template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool +inline _LIBCPP_HIDE_FROM_ABI bool operator<(const time_point<_Clock, _Duration1>& __lhs, const time_point<_Clock, _Duration2>& __rhs) { return __lhs.time_since_epoch() < __rhs.time_since_epoch(); } @@ -115,7 +113,7 @@ operator<(const time_point<_Clock, _Duration1>& __lhs, const time_point<_Clock, // time_point > template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool +inline _LIBCPP_HIDE_FROM_ABI bool operator>(const time_point<_Clock, _Duration1>& __lhs, const time_point<_Clock, _Duration2>& __rhs) { return __rhs < __lhs; } @@ -123,7 +121,7 @@ operator>(const time_point<_Clock, _Duration1>& __lhs, const 
time_point<_Clock, // time_point <= template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool +inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const time_point<_Clock, _Duration1>& __lhs, const time_point<_Clock, _Duration2>& __rhs) { return !(__rhs < __lhs); } @@ -131,7 +129,7 @@ operator<=(const time_point<_Clock, _Duration1>& __lhs, const time_point<_Clock, // time_point >= template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool +inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const time_point<_Clock, _Duration1>& __lhs, const time_point<_Clock, _Duration2>& __rhs) { return !(__lhs < __rhs); } @@ -139,8 +137,7 @@ operator>=(const time_point<_Clock, _Duration1>& __lhs, const time_point<_Clock, // time_point operator+(time_point x, duration y); template -inline _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX14 time_point<_Clock, typename common_type<_Duration1, duration<_Rep2, _Period2> >::type> +inline _LIBCPP_HIDE_FROM_ABI time_point<_Clock, typename common_type<_Duration1, duration<_Rep2, _Period2> >::type> operator+(const time_point<_Clock, _Duration1>& __lhs, const duration<_Rep2, _Period2>& __rhs) { typedef time_point<_Clock, typename common_type<_Duration1, duration<_Rep2, _Period2> >::type> _Tr; return _Tr(__lhs.time_since_epoch() + __rhs); @@ -149,8 +146,7 @@ operator+(const time_point<_Clock, _Duration1>& __lhs, const duration<_Rep2, _Pe // time_point operator+(duration x, time_point y); template -inline _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX14 time_point<_Clock, typename common_type, _Duration2>::type> +inline _LIBCPP_HIDE_FROM_ABI time_point<_Clock, typename common_type, _Duration2>::type> operator+(const duration<_Rep1, _Period1>& __lhs, const time_point<_Clock, _Duration2>& __rhs) { return __rhs + __lhs; } @@ -158,8 +154,7 @@ operator+(const duration<_Rep1, _Period1>& __lhs, const time_point<_Clock, _Dura // time_point operator-(time_point x, duration y); template -inline _LIBCPP_HIDE_FROM_ABI 
-_LIBCPP_CONSTEXPR_SINCE_CXX14 time_point<_Clock, typename common_type<_Duration1, duration<_Rep2, _Period2> >::type> +inline _LIBCPP_HIDE_FROM_ABI time_point<_Clock, typename common_type<_Duration1, duration<_Rep2, _Period2> >::type> operator-(const time_point<_Clock, _Duration1>& __lhs, const duration<_Rep2, _Period2>& __rhs) { typedef time_point<_Clock, typename common_type<_Duration1, duration<_Rep2, _Period2> >::type> _Ret; return _Ret(__lhs.time_since_epoch() - __rhs); @@ -168,7 +163,7 @@ operator-(const time_point<_Clock, _Duration1>& __lhs, const duration<_Rep2, _Pe // duration operator-(time_point x, time_point y); template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename common_type<_Duration1, _Duration2>::type +inline _LIBCPP_HIDE_FROM_ABI typename common_type<_Duration1, _Duration2>::type operator-(const time_point<_Clock, _Duration1>& __lhs, const time_point<_Clock, _Duration2>& __rhs) { return __lhs.time_since_epoch() - __rhs.time_since_epoch(); } diff --git a/libcxx/include/__cxx03/__condition_variable/condition_variable.h b/libcxx/include/__cxx03/__condition_variable/condition_variable.h index 8e41ad89914f9..af0325095fc9f 100644 --- a/libcxx/include/__cxx03/__condition_variable/condition_variable.h +++ b/libcxx/include/__cxx03/__condition_variable/condition_variable.h @@ -43,7 +43,7 @@ class _LIBCPP_EXPORTED_FROM_ABI condition_variable { __libcpp_condvar_t __cv_ = _LIBCPP_CONDVAR_INITIALIZER; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR condition_variable() _NOEXCEPT = default; + _LIBCPP_HIDE_FROM_ABI condition_variable() _NOEXCEPT = default; # ifdef _LIBCPP_HAS_TRIVIAL_CONDVAR_DESTRUCTION ~condition_variable() = default; diff --git a/libcxx/include/__cxx03/__config b/libcxx/include/__cxx03/__config index 0172f76cef912..ef47327d96355 100644 --- a/libcxx/include/__cxx03/__config +++ b/libcxx/include/__cxx03/__config @@ -215,15 +215,6 @@ _LIBCPP_HARDENING_MODE_DEBUG # endif # endif -// Incomplete features get their own 
specific disabling flags. This makes it -// easier to grep for target specific flags once the feature is complete. -# if !defined(_LIBCPP_ENABLE_EXPERIMENTAL) && !defined(_LIBCPP_BUILDING_LIBRARY) -# define _LIBCPP_HAS_NO_INCOMPLETE_PSTL -# define _LIBCPP_HAS_NO_EXPERIMENTAL_STOP_TOKEN -# define _LIBCPP_HAS_NO_EXPERIMENTAL_TZDB -# define _LIBCPP_HAS_NO_EXPERIMENTAL_SYNCSTREAM -# endif - # if defined(__MVS__) # include // for __NATIVE_ASCII_F # endif @@ -302,17 +293,14 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_USING_DEV_RANDOM # endif -# define _LIBCPP_ALIGNOF(_Tp) _Alignof(_Tp) -# define _ALIGNAS_TYPE(x) __attribute__((__aligned__(_LIBCPP_ALIGNOF(x)))) -# define _ALIGNAS(x) __attribute__((__aligned__(x))) -# define _LIBCPP_NORETURN __attribute__((__noreturn__)) -# define _LIBCPP_HAS_NO_NOEXCEPT -# define nullptr __nullptr -# define _NOEXCEPT throw() -# define _NOEXCEPT_(...) -# define static_assert(...) _Static_assert(__VA_ARGS__) -# define decltype(...) __decltype(__VA_ARGS__) -# define _LIBCPP_CONSTEXPR +# define _LIBCPP_ALIGNOF(_Tp) _Alignof(_Tp) +# define _ALIGNAS_TYPE(x) __attribute__((__aligned__(_LIBCPP_ALIGNOF(x)))) +# define _ALIGNAS(x) __attribute__((__aligned__(x))) +# define _LIBCPP_NORETURN __attribute__((__noreturn__)) +# define nullptr __nullptr +# define _NOEXCEPT throw() +# define static_assert(...) _Static_assert(__VA_ARGS__) +# define decltype(...) 
__decltype(__VA_ARGS__) typedef __char16_t char16_t; typedef __char32_t char32_t; @@ -666,14 +654,6 @@ typedef __char32_t char32_t; # define _LIBCPP_DEPRECATED_ATOMIC_SYNC /* nothing */ -# define _LIBCPP_DEPRECATED_IN_CXX11 - -# define _LIBCPP_DEPRECATED_IN_CXX14 -# define _LIBCPP_DEPRECATED_IN_CXX17 -# define _LIBCPP_DEPRECATED_IN_CXX20 -# define _LIBCPP_DEPRECATED_IN_CXX23 -# define _LIBCPP_DEPRECATED_IN_CXX26 - # if !defined(_LIBCPP_HAS_NO_CHAR8_T) # define _LIBCPP_DEPRECATED_WITH_CHAR8_T _LIBCPP_DEPRECATED # else @@ -691,13 +671,6 @@ typedef __char32_t char32_t; # define _LIBCPP_SUPPRESS_DEPRECATED_POP # endif -# define _LIBCPP_EXPLICIT_SINCE_CXX14 -# define _LIBCPP_EXPLICIT_SINCE_CXX23 -# define _LIBCPP_CONSTEXPR_SINCE_CXX14 -# define _LIBCPP_CONSTEXPR_SINCE_CXX17 -# define _LIBCPP_CONSTEXPR_SINCE_CXX20 -# define _LIBCPP_CONSTEXPR_SINCE_CXX23 - # ifndef _LIBCPP_WEAK # define _LIBCPP_WEAK __attribute__((__weak__)) # endif diff --git a/libcxx/include/__cxx03/__debug_utils/randomize_range.h b/libcxx/include/__cxx03/__debug_utils/randomize_range.h index dec21a01ce3fc..577a6be495e88 100644 --- a/libcxx/include/__cxx03/__debug_utils/randomize_range.h +++ b/libcxx/include/__cxx03/__debug_utils/randomize_range.h @@ -23,7 +23,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void __debug_randomize_range(_Iterator __first, _Sentinel __last) { +_LIBCPP_HIDE_FROM_ABI void __debug_randomize_range(_Iterator __first, _Sentinel __last) { #ifdef _LIBCPP_DEBUG_RANDOMIZE_UNSPECIFIED_STABILITY # error Support for unspecified stability is only for C++11 and higher diff --git a/libcxx/include/__cxx03/__debug_utils/sanitizers.h b/libcxx/include/__cxx03/__debug_utils/sanitizers.h index e3cda20468b53..32c4399eb9b67 100644 --- a/libcxx/include/__cxx03/__debug_utils/sanitizers.h +++ b/libcxx/include/__cxx03/__debug_utils/sanitizers.h @@ -81,7 +81,7 @@ _LIBCPP_HIDE_FROM_ABI void __annotate_double_ended_contiguous_container( // 
__old_last_contained is the previously last allowed (unpoisoned) element, and // __new_last_contained is the new last allowed (unpoisoned) element. template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void __annotate_contiguous_container( +_LIBCPP_HIDE_FROM_ABI void __annotate_contiguous_container( const void* __first_storage, const void* __last_storage, const void* __old_last_contained, diff --git a/libcxx/include/__cxx03/__debug_utils/strict_weak_ordering_check.h b/libcxx/include/__cxx03/__debug_utils/strict_weak_ordering_check.h index 8d3a918d9b557..98976524a8c01 100644 --- a/libcxx/include/__cxx03/__debug_utils/strict_weak_ordering_check.h +++ b/libcxx/include/__cxx03/__debug_utils/strict_weak_ordering_check.h @@ -24,7 +24,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void +_LIBCPP_HIDE_FROM_ABI void __check_strict_weak_ordering_sorted(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp& __comp) { #if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG using __diff_t = __iter_diff_t<_RandomAccessIterator>; diff --git a/libcxx/include/__cxx03/__functional/binary_function.h b/libcxx/include/__cxx03/__functional/binary_function.h index 61329bb7316c2..06613bdc1e907 100644 --- a/libcxx/include/__cxx03/__functional/binary_function.h +++ b/libcxx/include/__cxx03/__functional/binary_function.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 binary_function { +struct _LIBCPP_TEMPLATE_VIS binary_function { typedef _Arg1 first_argument_type; typedef _Arg2 second_argument_type; typedef _Result result_type; @@ -27,9 +27,9 @@ struct _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 binary_function { template struct __binary_function_keep_layout_base { - using first_argument_type _LIBCPP_DEPRECATED_IN_CXX17 = _Arg1; - using second_argument_type _LIBCPP_DEPRECATED_IN_CXX17 = _Arg2; - using result_type _LIBCPP_DEPRECATED_IN_CXX17 = _Result; 
+ using first_argument_type = _Arg1; + using second_argument_type = _Arg2; + using result_type = _Result; }; _LIBCPP_DIAGNOSTIC_PUSH diff --git a/libcxx/include/__cxx03/__functional/binary_negate.h b/libcxx/include/__cxx03/__functional/binary_negate.h index aa9ef71ffd262..f59506cf1fc32 100644 --- a/libcxx/include/__cxx03/__functional/binary_negate.h +++ b/libcxx/include/__cxx03/__functional/binary_negate.h @@ -20,25 +20,23 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX17 binary_negate +class _LIBCPP_TEMPLATE_VIS binary_negate : public __binary_function { _Predicate __pred_; public: - _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR_SINCE_CXX14 binary_negate(const _Predicate& __pred) - : __pred_(__pred) {} + _LIBCPP_HIDE_FROM_ABI explicit binary_negate(const _Predicate& __pred) : __pred_(__pred) {} - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool operator()( - const typename _Predicate::first_argument_type& __x, const typename _Predicate::second_argument_type& __y) const { + _LIBCPP_HIDE_FROM_ABI bool operator()(const typename _Predicate::first_argument_type& __x, + const typename _Predicate::second_argument_type& __y) const { return !__pred_(__x, __y); } }; template -_LIBCPP_DEPRECATED_IN_CXX17 inline _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI binary_negate<_Predicate> -not2(const _Predicate& __pred) { +inline _LIBCPP_HIDE_FROM_ABI binary_negate<_Predicate> not2(const _Predicate& __pred) { return binary_negate<_Predicate>(__pred); } diff --git a/libcxx/include/__cxx03/__functional/binder1st.h b/libcxx/include/__cxx03/__functional/binder1st.h index 1a4a2ed81c04d..aacb02471c301 100644 --- a/libcxx/include/__cxx03/__functional/binder1st.h +++ b/libcxx/include/__cxx03/__functional/binder1st.h @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 binder1st +class _LIBCPP_TEMPLATE_VIS binder1st : public __unary_function { protected: _Operation op; @@ 
-40,8 +40,7 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 binder1st }; template -_LIBCPP_DEPRECATED_IN_CXX11 inline _LIBCPP_HIDE_FROM_ABI binder1st<_Operation> -bind1st(const _Operation& __op, const _Tp& __x) { +inline _LIBCPP_HIDE_FROM_ABI binder1st<_Operation> bind1st(const _Operation& __op, const _Tp& __x) { return binder1st<_Operation>(__op, __x); } diff --git a/libcxx/include/__cxx03/__functional/binder2nd.h b/libcxx/include/__cxx03/__functional/binder2nd.h index 06600946e3e96..b45a1fe3cb288 100644 --- a/libcxx/include/__cxx03/__functional/binder2nd.h +++ b/libcxx/include/__cxx03/__functional/binder2nd.h @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 binder2nd +class _LIBCPP_TEMPLATE_VIS binder2nd : public __unary_function { protected: _Operation op; @@ -40,8 +40,7 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 binder2nd }; template -_LIBCPP_DEPRECATED_IN_CXX11 inline _LIBCPP_HIDE_FROM_ABI binder2nd<_Operation> -bind2nd(const _Operation& __op, const _Tp& __x) { +inline _LIBCPP_HIDE_FROM_ABI binder2nd<_Operation> bind2nd(const _Operation& __op, const _Tp& __x) { return binder2nd<_Operation>(__op, __x); } diff --git a/libcxx/include/__cxx03/__functional/identity.h b/libcxx/include/__cxx03/__functional/identity.h index 6af22948a1f19..b5b86830f54ac 100644 --- a/libcxx/include/__cxx03/__functional/identity.h +++ b/libcxx/include/__cxx03/__functional/identity.h @@ -26,7 +26,7 @@ struct __is_identity : false_type {}; struct __identity { template - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Tp&& operator()(_Tp&& __t) const _NOEXCEPT { + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _Tp&& operator()(_Tp&& __t) const _NOEXCEPT { return std::forward<_Tp>(__t); } diff --git a/libcxx/include/__cxx03/__functional/mem_fn.h b/libcxx/include/__cxx03/__functional/mem_fn.h index 4577c41bf8499..fb9ffcf55c58d 100644 --- a/libcxx/include/__cxx03/__functional/mem_fn.h +++ 
b/libcxx/include/__cxx03/__functional/mem_fn.h @@ -32,20 +32,17 @@ class __mem_fn : public __weak_result_type<_Tp> { type __f_; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __mem_fn(type __f) _NOEXCEPT : __f_(__f) {} + _LIBCPP_HIDE_FROM_ABI __mem_fn(type __f) _NOEXCEPT : __f_(__f) {} // invoke template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - - typename __invoke_return::type - operator()(_ArgTypes&&... __args) const { + _LIBCPP_HIDE_FROM_ABI typename __invoke_return::type operator()(_ArgTypes&&... __args) const { return std::__invoke(__f_, std::forward<_ArgTypes>(__args)...); } }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __mem_fn<_Rp _Tp::*> mem_fn(_Rp _Tp::*__pm) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI __mem_fn<_Rp _Tp::*> mem_fn(_Rp _Tp::* __pm) _NOEXCEPT { return __mem_fn<_Rp _Tp::*>(__pm); } diff --git a/libcxx/include/__cxx03/__functional/mem_fun_ref.h b/libcxx/include/__cxx03/__functional/mem_fun_ref.h index 7a15d19b32f7f..81497e96afde4 100644 --- a/libcxx/include/__cxx03/__functional/mem_fun_ref.h +++ b/libcxx/include/__cxx03/__functional/mem_fun_ref.h @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 mem_fun_t : public __unary_function<_Tp*, _Sp> { +class _LIBCPP_TEMPLATE_VIS mem_fun_t : public __unary_function<_Tp*, _Sp> { _Sp (_Tp::*__p_)(); public: @@ -30,7 +30,7 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 mem_fun_t : public __unar }; template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 mem_fun1_t : public __binary_function<_Tp*, _Ap, _Sp> { +class _LIBCPP_TEMPLATE_VIS mem_fun1_t : public __binary_function<_Tp*, _Ap, _Sp> { _Sp (_Tp::*__p_)(_Ap); public: @@ -39,17 +39,17 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 mem_fun1_t : public __bin }; template -_LIBCPP_DEPRECATED_IN_CXX11 inline _LIBCPP_HIDE_FROM_ABI mem_fun_t<_Sp, _Tp> mem_fun(_Sp (_Tp::*__f)()) { +inline _LIBCPP_HIDE_FROM_ABI 
mem_fun_t<_Sp, _Tp> mem_fun(_Sp (_Tp::*__f)()) { return mem_fun_t<_Sp, _Tp>(__f); } template -_LIBCPP_DEPRECATED_IN_CXX11 inline _LIBCPP_HIDE_FROM_ABI mem_fun1_t<_Sp, _Tp, _Ap> mem_fun(_Sp (_Tp::*__f)(_Ap)) { +inline _LIBCPP_HIDE_FROM_ABI mem_fun1_t<_Sp, _Tp, _Ap> mem_fun(_Sp (_Tp::*__f)(_Ap)) { return mem_fun1_t<_Sp, _Tp, _Ap>(__f); } template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 mem_fun_ref_t : public __unary_function<_Tp, _Sp> { +class _LIBCPP_TEMPLATE_VIS mem_fun_ref_t : public __unary_function<_Tp, _Sp> { _Sp (_Tp::*__p_)(); public: @@ -58,7 +58,7 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 mem_fun_ref_t : public __ }; template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 mem_fun1_ref_t : public __binary_function<_Tp, _Ap, _Sp> { +class _LIBCPP_TEMPLATE_VIS mem_fun1_ref_t : public __binary_function<_Tp, _Ap, _Sp> { _Sp (_Tp::*__p_)(_Ap); public: @@ -67,18 +67,17 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 mem_fun1_ref_t : public _ }; template -_LIBCPP_DEPRECATED_IN_CXX11 inline _LIBCPP_HIDE_FROM_ABI mem_fun_ref_t<_Sp, _Tp> mem_fun_ref(_Sp (_Tp::*__f)()) { +inline _LIBCPP_HIDE_FROM_ABI mem_fun_ref_t<_Sp, _Tp> mem_fun_ref(_Sp (_Tp::*__f)()) { return mem_fun_ref_t<_Sp, _Tp>(__f); } template -_LIBCPP_DEPRECATED_IN_CXX11 inline _LIBCPP_HIDE_FROM_ABI mem_fun1_ref_t<_Sp, _Tp, _Ap> -mem_fun_ref(_Sp (_Tp::*__f)(_Ap)) { +inline _LIBCPP_HIDE_FROM_ABI mem_fun1_ref_t<_Sp, _Tp, _Ap> mem_fun_ref(_Sp (_Tp::*__f)(_Ap)) { return mem_fun1_ref_t<_Sp, _Tp, _Ap>(__f); } template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 const_mem_fun_t : public __unary_function { +class _LIBCPP_TEMPLATE_VIS const_mem_fun_t : public __unary_function { _Sp (_Tp::*__p_)() const; public: @@ -87,8 +86,7 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 const_mem_fun_t : public }; template -class _LIBCPP_TEMPLATE_VIS -_LIBCPP_DEPRECATED_IN_CXX11 const_mem_fun1_t : public __binary_function { +class _LIBCPP_TEMPLATE_VIS 
const_mem_fun1_t : public __binary_function { _Sp (_Tp::*__p_)(_Ap) const; public: @@ -97,18 +95,17 @@ _LIBCPP_DEPRECATED_IN_CXX11 const_mem_fun1_t : public __binary_function -_LIBCPP_DEPRECATED_IN_CXX11 inline _LIBCPP_HIDE_FROM_ABI const_mem_fun_t<_Sp, _Tp> mem_fun(_Sp (_Tp::*__f)() const) { +inline _LIBCPP_HIDE_FROM_ABI const_mem_fun_t<_Sp, _Tp> mem_fun(_Sp (_Tp::*__f)() const) { return const_mem_fun_t<_Sp, _Tp>(__f); } template -_LIBCPP_DEPRECATED_IN_CXX11 inline _LIBCPP_HIDE_FROM_ABI const_mem_fun1_t<_Sp, _Tp, _Ap> -mem_fun(_Sp (_Tp::*__f)(_Ap) const) { +inline _LIBCPP_HIDE_FROM_ABI const_mem_fun1_t<_Sp, _Tp, _Ap> mem_fun(_Sp (_Tp::*__f)(_Ap) const) { return const_mem_fun1_t<_Sp, _Tp, _Ap>(__f); } template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 const_mem_fun_ref_t : public __unary_function<_Tp, _Sp> { +class _LIBCPP_TEMPLATE_VIS const_mem_fun_ref_t : public __unary_function<_Tp, _Sp> { _Sp (_Tp::*__p_)() const; public: @@ -117,7 +114,7 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 const_mem_fun_ref_t : pub }; template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 const_mem_fun1_ref_t : public __binary_function<_Tp, _Ap, _Sp> { +class _LIBCPP_TEMPLATE_VIS const_mem_fun1_ref_t : public __binary_function<_Tp, _Ap, _Sp> { _Sp (_Tp::*__p_)(_Ap) const; public: @@ -126,14 +123,12 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 const_mem_fun1_ref_t : pu }; template -_LIBCPP_DEPRECATED_IN_CXX11 inline _LIBCPP_HIDE_FROM_ABI const_mem_fun_ref_t<_Sp, _Tp> -mem_fun_ref(_Sp (_Tp::*__f)() const) { +inline _LIBCPP_HIDE_FROM_ABI const_mem_fun_ref_t<_Sp, _Tp> mem_fun_ref(_Sp (_Tp::*__f)() const) { return const_mem_fun_ref_t<_Sp, _Tp>(__f); } template -_LIBCPP_DEPRECATED_IN_CXX11 inline _LIBCPP_HIDE_FROM_ABI const_mem_fun1_ref_t<_Sp, _Tp, _Ap> -mem_fun_ref(_Sp (_Tp::*__f)(_Ap) const) { +inline _LIBCPP_HIDE_FROM_ABI const_mem_fun1_ref_t<_Sp, _Tp, _Ap> mem_fun_ref(_Sp (_Tp::*__f)(_Ap) const) { return const_mem_fun1_ref_t<_Sp, _Tp, 
_Ap>(__f); } diff --git a/libcxx/include/__cxx03/__functional/operations.h b/libcxx/include/__cxx03/__functional/operations.h index 9c40a198fee15..43bfbddc41cce 100644 --- a/libcxx/include/__cxx03/__functional/operations.h +++ b/libcxx/include/__cxx03/__functional/operations.h @@ -27,9 +27,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct _LIBCPP_TEMPLATE_VIS plus : __binary_function<_Tp, _Tp, _Tp> { typedef _Tp __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { - return __x + __y; - } + _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { return __x + __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(plus); @@ -44,43 +42,35 @@ inline const bool __desugars_to_v<__plus_tag, plus, _Tp, _Up> = true; template struct _LIBCPP_TEMPLATE_VIS minus : __binary_function<_Tp, _Tp, _Tp> { typedef _Tp __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { - return __x - __y; - } + _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { return __x - __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(minus); template struct _LIBCPP_TEMPLATE_VIS multiplies : __binary_function<_Tp, _Tp, _Tp> { typedef _Tp __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { - return __x * __y; - } + _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { return __x * __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(multiplies); template struct _LIBCPP_TEMPLATE_VIS divides : __binary_function<_Tp, _Tp, _Tp> { typedef _Tp __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { - return __x / __y; - } + _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { return __x / __y; } }; 
_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(divides); template struct _LIBCPP_TEMPLATE_VIS modulus : __binary_function<_Tp, _Tp, _Tp> { typedef _Tp __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { - return __x % __y; - } + _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { return __x % __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(modulus); template struct _LIBCPP_TEMPLATE_VIS negate : __unary_function<_Tp, _Tp> { typedef _Tp __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x) const { return -__x; } + _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x) const { return -__x; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(negate); @@ -89,27 +79,21 @@ _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(negate); template struct _LIBCPP_TEMPLATE_VIS bit_and : __binary_function<_Tp, _Tp, _Tp> { typedef _Tp __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { - return __x & __y; - } + _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { return __x & __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(bit_and); template struct _LIBCPP_TEMPLATE_VIS bit_or : __binary_function<_Tp, _Tp, _Tp> { typedef _Tp __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { - return __x | __y; - } + _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { return __x | __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(bit_or); template struct _LIBCPP_TEMPLATE_VIS bit_xor : __binary_function<_Tp, _Tp, _Tp> { typedef _Tp __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& __y) const { - return __x ^ __y; - } + _LIBCPP_HIDE_FROM_ABI _Tp operator()(const _Tp& __x, const _Tp& 
__y) const { return __x ^ __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(bit_xor); @@ -118,9 +102,7 @@ _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(bit_xor); template struct _LIBCPP_TEMPLATE_VIS equal_to : __binary_function<_Tp, _Tp, bool> { typedef bool __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { - return __x == __y; - } + _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { return __x == __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(equal_to); @@ -136,18 +118,14 @@ inline const bool __desugars_to_v<__equal_tag, equal_to, _Tp, _Up> = true; template struct _LIBCPP_TEMPLATE_VIS not_equal_to : __binary_function<_Tp, _Tp, bool> { typedef bool __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { - return __x != __y; - } + _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { return __x != __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(not_equal_to); template struct _LIBCPP_TEMPLATE_VIS less : __binary_function<_Tp, _Tp, bool> { typedef bool __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { - return __x < __y; - } + _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { return __x < __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(less); @@ -157,27 +135,21 @@ inline const bool __desugars_to_v<__less_tag, less<_Tp>, _Tp, _Tp> = true; template struct _LIBCPP_TEMPLATE_VIS less_equal : __binary_function<_Tp, _Tp, bool> { typedef bool __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { - return __x <= __y; - } + _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { return __x <= __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(less_equal); 
template struct _LIBCPP_TEMPLATE_VIS greater_equal : __binary_function<_Tp, _Tp, bool> { typedef bool __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { - return __x >= __y; - } + _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { return __x >= __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(greater_equal); template struct _LIBCPP_TEMPLATE_VIS greater : __binary_function<_Tp, _Tp, bool> { typedef bool __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { - return __x > __y; - } + _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { return __x > __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(greater); @@ -186,25 +158,21 @@ _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(greater); template struct _LIBCPP_TEMPLATE_VIS logical_and : __binary_function<_Tp, _Tp, bool> { typedef bool __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { - return __x && __y; - } + _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { return __x && __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(logical_and); template struct _LIBCPP_TEMPLATE_VIS logical_not : __unary_function<_Tp, bool> { typedef bool __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x) const { return !__x; } + _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x) const { return !__x; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(logical_not); template struct _LIBCPP_TEMPLATE_VIS logical_or : __binary_function<_Tp, _Tp, bool> { typedef bool __result_type; // used by valarray - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool operator()(const _Tp& __x, const _Tp& __y) const { - return __x || __y; - } + _LIBCPP_HIDE_FROM_ABI bool operator()(const 
_Tp& __x, const _Tp& __y) const { return __x || __y; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(logical_or); diff --git a/libcxx/include/__cxx03/__functional/pointer_to_binary_function.h b/libcxx/include/__cxx03/__functional/pointer_to_binary_function.h index 69ec2c8f8b801..3fe2f4cbc62da 100644 --- a/libcxx/include/__cxx03/__functional/pointer_to_binary_function.h +++ b/libcxx/include/__cxx03/__functional/pointer_to_binary_function.h @@ -20,8 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -class _LIBCPP_TEMPLATE_VIS -_LIBCPP_DEPRECATED_IN_CXX11 pointer_to_binary_function : public __binary_function<_Arg1, _Arg2, _Result> { +class _LIBCPP_TEMPLATE_VIS pointer_to_binary_function : public __binary_function<_Arg1, _Arg2, _Result> { _Result (*__f_)(_Arg1, _Arg2); public: @@ -30,8 +29,7 @@ _LIBCPP_DEPRECATED_IN_CXX11 pointer_to_binary_function : public __binary_functio }; template -_LIBCPP_DEPRECATED_IN_CXX11 inline _LIBCPP_HIDE_FROM_ABI pointer_to_binary_function<_Arg1, _Arg2, _Result> -ptr_fun(_Result (*__f)(_Arg1, _Arg2)) { +inline _LIBCPP_HIDE_FROM_ABI pointer_to_binary_function<_Arg1, _Arg2, _Result> ptr_fun(_Result (*__f)(_Arg1, _Arg2)) { return pointer_to_binary_function<_Arg1, _Arg2, _Result>(__f); } diff --git a/libcxx/include/__cxx03/__functional/pointer_to_unary_function.h b/libcxx/include/__cxx03/__functional/pointer_to_unary_function.h index 217ce3ff6c957..cf429b922cc94 100644 --- a/libcxx/include/__cxx03/__functional/pointer_to_unary_function.h +++ b/libcxx/include/__cxx03/__functional/pointer_to_unary_function.h @@ -20,8 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -class _LIBCPP_TEMPLATE_VIS -_LIBCPP_DEPRECATED_IN_CXX11 pointer_to_unary_function : public __unary_function<_Arg, _Result> { +class _LIBCPP_TEMPLATE_VIS pointer_to_unary_function : public __unary_function<_Arg, _Result> { _Result (*__f_)(_Arg); public: @@ -30,8 +29,7 @@ _LIBCPP_DEPRECATED_IN_CXX11 pointer_to_unary_function : public __unary_function< }; template -_LIBCPP_DEPRECATED_IN_CXX11 inline 
_LIBCPP_HIDE_FROM_ABI pointer_to_unary_function<_Arg, _Result> -ptr_fun(_Result (*__f)(_Arg)) { +inline _LIBCPP_HIDE_FROM_ABI pointer_to_unary_function<_Arg, _Result> ptr_fun(_Result (*__f)(_Arg)) { return pointer_to_unary_function<_Arg, _Result>(__f); } diff --git a/libcxx/include/__cxx03/__functional/reference_wrapper.h b/libcxx/include/__cxx03/__functional/reference_wrapper.h index a9da980749f32..bbe8544531a8d 100644 --- a/libcxx/include/__cxx03/__functional/reference_wrapper.h +++ b/libcxx/include/__cxx03/__functional/reference_wrapper.h @@ -43,43 +43,39 @@ class _LIBCPP_TEMPLATE_VIS reference_wrapper : public __weak_result_type<_Tp> { template ()))>, __enable_if_t::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference_wrapper(_Up&& __u) - _NOEXCEPT_(noexcept(__fun(std::declval<_Up>()))) { + _LIBCPP_HIDE_FROM_ABI reference_wrapper(_Up&& __u) { type& __f = static_cast<_Up&&>(__u); __f_ = std::addressof(__f); } // access - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 operator type&() const _NOEXCEPT { return *__f_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 type& get() const _NOEXCEPT { return *__f_; } + _LIBCPP_HIDE_FROM_ABI operator type&() const _NOEXCEPT { return *__f_; } + _LIBCPP_HIDE_FROM_ABI type& get() const _NOEXCEPT { return *__f_; } // invoke template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename __invoke_of::type - operator()(_ArgTypes&&... __args) const { + _LIBCPP_HIDE_FROM_ABI typename __invoke_of::type operator()(_ArgTypes&&... 
__args) const { return std::__invoke(get(), std::forward<_ArgTypes>(__args)...); } }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference_wrapper<_Tp> ref(_Tp& __t) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI reference_wrapper<_Tp> ref(_Tp& __t) _NOEXCEPT { return reference_wrapper<_Tp>(__t); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference_wrapper<_Tp> -ref(reference_wrapper<_Tp> __t) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI reference_wrapper<_Tp> ref(reference_wrapper<_Tp> __t) _NOEXCEPT { return __t; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference_wrapper cref(const _Tp& __t) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI reference_wrapper cref(const _Tp& __t) _NOEXCEPT { return reference_wrapper(__t); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference_wrapper -cref(reference_wrapper<_Tp> __t) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI reference_wrapper cref(reference_wrapper<_Tp> __t) _NOEXCEPT { return __t; } diff --git a/libcxx/include/__cxx03/__functional/unary_function.h b/libcxx/include/__cxx03/__functional/unary_function.h index 8bf305a4dfe9f..72193c865b589 100644 --- a/libcxx/include/__cxx03/__functional/unary_function.h +++ b/libcxx/include/__cxx03/__functional/unary_function.h @@ -18,15 +18,15 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 unary_function { +struct _LIBCPP_TEMPLATE_VIS unary_function { typedef _Arg argument_type; typedef _Result result_type; }; template struct __unary_function_keep_layout_base { - using argument_type _LIBCPP_DEPRECATED_IN_CXX17 = _Arg; - using result_type _LIBCPP_DEPRECATED_IN_CXX17 = _Result; + using argument_type = _Arg; + using result_type = _Result; }; _LIBCPP_DIAGNOSTIC_PUSH diff --git a/libcxx/include/__cxx03/__functional/unary_negate.h b/libcxx/include/__cxx03/__functional/unary_negate.h index 87abbe88e0e75..3591b8ee0d5a3 100644 --- 
a/libcxx/include/__cxx03/__functional/unary_negate.h +++ b/libcxx/include/__cxx03/__functional/unary_negate.h @@ -20,22 +20,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -class _LIBCPP_TEMPLATE_VIS -_LIBCPP_DEPRECATED_IN_CXX17 unary_negate : public __unary_function { +class _LIBCPP_TEMPLATE_VIS unary_negate : public __unary_function { _Predicate __pred_; public: - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI explicit unary_negate(const _Predicate& __pred) - : __pred_(__pred) {} - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool - operator()(const typename _Predicate::argument_type& __x) const { - return !__pred_(__x); - } + _LIBCPP_HIDE_FROM_ABI explicit unary_negate(const _Predicate& __pred) : __pred_(__pred) {} + _LIBCPP_HIDE_FROM_ABI bool operator()(const typename _Predicate::argument_type& __x) const { return !__pred_(__x); } }; template -_LIBCPP_DEPRECATED_IN_CXX17 inline _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI unary_negate<_Predicate> -not1(const _Predicate& __pred) { +inline _LIBCPP_HIDE_FROM_ABI unary_negate<_Predicate> not1(const _Predicate& __pred) { return unary_negate<_Predicate>(__pred); } diff --git a/libcxx/include/__cxx03/__functional/weak_result_type.h b/libcxx/include/__cxx03/__functional/weak_result_type.h index e0f42e5aeb9b7..e8c2c4de9900a 100644 --- a/libcxx/include/__cxx03/__functional/weak_result_type.h +++ b/libcxx/include/__cxx03/__functional/weak_result_type.h @@ -88,7 +88,7 @@ template ::value> struct __weak_result_type_imp // bool is true : public __maybe_derive_from_unary_function<_Tp>, public __maybe_derive_from_binary_function<_Tp> { - using result_type _LIBCPP_NODEBUG _LIBCPP_DEPRECATED_IN_CXX17 = typename _Tp::result_type; + using result_type _LIBCPP_NODEBUG = typename _Tp::result_type; }; template @@ -102,17 +102,17 @@ struct __weak_result_type : public __weak_result_type_imp<_Tp> {}; template struct __weak_result_type<_Rp()> { - using result_type _LIBCPP_NODEBUG _LIBCPP_DEPRECATED_IN_CXX17 = _Rp; + using 
result_type _LIBCPP_NODEBUG = _Rp; }; template struct __weak_result_type<_Rp (&)()> { - using result_type _LIBCPP_NODEBUG _LIBCPP_DEPRECATED_IN_CXX17 = _Rp; + using result_type _LIBCPP_NODEBUG = _Rp; }; template struct __weak_result_type<_Rp (*)()> { - using result_type _LIBCPP_NODEBUG _LIBCPP_DEPRECATED_IN_CXX17 = _Rp; + using result_type _LIBCPP_NODEBUG = _Rp; }; // 1 argument case @@ -166,37 +166,37 @@ struct __weak_result_type<_Rp (_Cp::*)(_A1) const volatile> : public __binary_fu template struct __weak_result_type<_Rp(_A1, _A2, _A3, _A4...)> { - using result_type _LIBCPP_NODEBUG _LIBCPP_DEPRECATED_IN_CXX17 = _Rp; + using result_type _LIBCPP_NODEBUG = _Rp; }; template struct __weak_result_type<_Rp (&)(_A1, _A2, _A3, _A4...)> { - using result_type _LIBCPP_NODEBUG _LIBCPP_DEPRECATED_IN_CXX17 = _Rp; + using result_type _LIBCPP_NODEBUG = _Rp; }; template struct __weak_result_type<_Rp (*)(_A1, _A2, _A3, _A4...)> { - using result_type _LIBCPP_NODEBUG _LIBCPP_DEPRECATED_IN_CXX17 = _Rp; + using result_type _LIBCPP_NODEBUG = _Rp; }; template struct __weak_result_type<_Rp (_Cp::*)(_A1, _A2, _A3...)> { - using result_type _LIBCPP_NODEBUG _LIBCPP_DEPRECATED_IN_CXX17 = _Rp; + using result_type _LIBCPP_NODEBUG = _Rp; }; template struct __weak_result_type<_Rp (_Cp::*)(_A1, _A2, _A3...) const> { - using result_type _LIBCPP_NODEBUG _LIBCPP_DEPRECATED_IN_CXX17 = _Rp; + using result_type _LIBCPP_NODEBUG = _Rp; }; template struct __weak_result_type<_Rp (_Cp::*)(_A1, _A2, _A3...) volatile> { - using result_type _LIBCPP_NODEBUG _LIBCPP_DEPRECATED_IN_CXX17 = _Rp; + using result_type _LIBCPP_NODEBUG = _Rp; }; template struct __weak_result_type<_Rp (_Cp::*)(_A1, _A2, _A3...) 
const volatile> { - using result_type _LIBCPP_NODEBUG _LIBCPP_DEPRECATED_IN_CXX17 = _Rp; + using result_type _LIBCPP_NODEBUG = _Rp; }; template diff --git a/libcxx/include/__cxx03/__fwd/array.h b/libcxx/include/__cxx03/__fwd/array.h index aea35f29c6423..101faf876333e 100644 --- a/libcxx/include/__cxx03/__fwd/array.h +++ b/libcxx/include/__cxx03/__fwd/array.h @@ -22,10 +22,10 @@ template struct _LIBCPP_TEMPLATE_VIS array; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& get(array<_Tp, _Size>&) _NOEXCEPT; +_LIBCPP_HIDE_FROM_ABI _Tp& get(array<_Tp, _Size>&) _NOEXCEPT; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp& get(const array<_Tp, _Size>&) _NOEXCEPT; +_LIBCPP_HIDE_FROM_ABI const _Tp& get(const array<_Tp, _Size>&) _NOEXCEPT; template struct __is_std_array : false_type {}; diff --git a/libcxx/include/__cxx03/__fwd/pair.h b/libcxx/include/__cxx03/__fwd/pair.h index 56d5bae5a90f3..be2c8623f89e5 100644 --- a/libcxx/include/__cxx03/__fwd/pair.h +++ b/libcxx/include/__cxx03/__fwd/pair.h @@ -23,12 +23,10 @@ template struct _LIBCPP_TEMPLATE_VIS pair; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, pair<_T1, _T2> >::type& -get(pair<_T1, _T2>&) _NOEXCEPT; +_LIBCPP_HIDE_FROM_ABI typename tuple_element<_Ip, pair<_T1, _T2> >::type& get(pair<_T1, _T2>&) _NOEXCEPT; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, pair<_T1, _T2> >::type& -get(const pair<_T1, _T2>&) _NOEXCEPT; +_LIBCPP_HIDE_FROM_ABI const typename tuple_element<_Ip, pair<_T1, _T2> >::type& get(const pair<_T1, _T2>&) _NOEXCEPT; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__hash_table b/libcxx/include/__cxx03/__hash_table index 6148a828dd2f8..30c0fbf4049da 100644 --- a/libcxx/include/__cxx03/__hash_table +++ b/libcxx/include/__cxx03/__hash_table @@ -546,16 +546,11 @@ class __bucket_list_deallocator { public: typedef typename __alloc_traits::pointer pointer; - 
_LIBCPP_HIDE_FROM_ABI __bucket_list_deallocator() _NOEXCEPT_(is_nothrow_default_constructible::value) - : __data_(0, __default_init_tag()) {} + _LIBCPP_HIDE_FROM_ABI __bucket_list_deallocator() : __data_(0, __default_init_tag()) {} - _LIBCPP_HIDE_FROM_ABI __bucket_list_deallocator(const allocator_type& __a, size_type __size) - _NOEXCEPT_(is_nothrow_copy_constructible::value) - : __data_(__size, __a) {} + _LIBCPP_HIDE_FROM_ABI __bucket_list_deallocator(const allocator_type& __a, size_type __size) : __data_(__size, __a) {} - _LIBCPP_HIDE_FROM_ABI __bucket_list_deallocator(__bucket_list_deallocator&& __x) - _NOEXCEPT_(is_nothrow_move_constructible::value) - : __data_(std::move(__x.__data_)) { + _LIBCPP_HIDE_FROM_ABI __bucket_list_deallocator(__bucket_list_deallocator&& __x) : __data_(std::move(__x.__data_)) { __x.size() = 0; } @@ -710,27 +705,18 @@ public: typedef __hash_local_iterator<__node_pointer> local_iterator; typedef __hash_const_local_iterator<__node_pointer> const_local_iterator; - _LIBCPP_HIDE_FROM_ABI __hash_table() _NOEXCEPT_( - is_nothrow_default_constructible<__bucket_list>::value&& is_nothrow_default_constructible<__first_node>::value&& - is_nothrow_default_constructible<__node_allocator>::value&& is_nothrow_default_constructible::value&& - is_nothrow_default_constructible::value); + _LIBCPP_HIDE_FROM_ABI __hash_table(); _LIBCPP_HIDE_FROM_ABI __hash_table(const hasher& __hf, const key_equal& __eql); _LIBCPP_HIDE_FROM_ABI __hash_table(const hasher& __hf, const key_equal& __eql, const allocator_type& __a); _LIBCPP_HIDE_FROM_ABI explicit __hash_table(const allocator_type& __a); _LIBCPP_HIDE_FROM_ABI __hash_table(const __hash_table& __u); _LIBCPP_HIDE_FROM_ABI __hash_table(const __hash_table& __u, const allocator_type& __a); - _LIBCPP_HIDE_FROM_ABI __hash_table(__hash_table&& __u) _NOEXCEPT_( - is_nothrow_move_constructible<__bucket_list>::value&& is_nothrow_move_constructible<__first_node>::value&& - 
is_nothrow_move_constructible<__node_allocator>::value&& is_nothrow_move_constructible::value&& - is_nothrow_move_constructible::value); + _LIBCPP_HIDE_FROM_ABI __hash_table(__hash_table&& __u); _LIBCPP_HIDE_FROM_ABI __hash_table(__hash_table&& __u, const allocator_type& __a); _LIBCPP_HIDE_FROM_ABI ~__hash_table(); _LIBCPP_HIDE_FROM_ABI __hash_table& operator=(const __hash_table& __u); - _LIBCPP_HIDE_FROM_ABI __hash_table& operator=(__hash_table&& __u) - _NOEXCEPT_(__node_traits::propagate_on_container_move_assignment::value&& - is_nothrow_move_assignable<__node_allocator>::value&& is_nothrow_move_assignable::value&& - is_nothrow_move_assignable::value); + _LIBCPP_HIDE_FROM_ABI __hash_table& operator=(__hash_table&& __u); template _LIBCPP_HIDE_FROM_ABI void __assign_unique(_InputIterator __first, _InputIterator __last); template @@ -871,11 +857,7 @@ public: template _LIBCPP_HIDE_FROM_ABI pair __equal_range_multi(const _Key& __k) const; - _LIBCPP_HIDE_FROM_ABI void swap(__hash_table& __u) - _NOEXCEPT_(__is_nothrow_swappable_v&& __is_nothrow_swappable_v && - (!allocator_traits<__pointer_allocator>::propagate_on_container_swap::value || - __is_nothrow_swappable_v<__pointer_allocator>) && - (!__node_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__node_allocator>)); + _LIBCPP_HIDE_FROM_ABI void swap(__hash_table& __u); _LIBCPP_HIDE_FROM_ABI size_type max_bucket_count() const _NOEXCEPT { return max_size(); } _LIBCPP_HIDE_FROM_ABI size_type bucket_size(size_type __n) const; @@ -934,16 +916,11 @@ private: _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __hash_table&, false_type) {} _LIBCPP_HIDE_FROM_ABI void __move_assign(__hash_table& __u, false_type); - _LIBCPP_HIDE_FROM_ABI void __move_assign(__hash_table& __u, true_type) - _NOEXCEPT_(is_nothrow_move_assignable<__node_allocator>::value&& is_nothrow_move_assignable::value&& - is_nothrow_move_assignable::value); - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__hash_table& __u) _NOEXCEPT_( - 
!__node_traits::propagate_on_container_move_assignment::value || - (is_nothrow_move_assignable<__pointer_allocator>::value && is_nothrow_move_assignable<__node_allocator>::value)) { + _LIBCPP_HIDE_FROM_ABI void __move_assign(__hash_table& __u, true_type); + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__hash_table& __u) { __move_assign_alloc(__u, integral_constant()); } - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__hash_table& __u, true_type) _NOEXCEPT_( - is_nothrow_move_assignable<__pointer_allocator>::value&& is_nothrow_move_assignable<__node_allocator>::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__hash_table& __u, true_type) { __bucket_list_.get_deleter().__alloc() = std::move(__u.__bucket_list_.get_deleter().__alloc()); __node_alloc() = std::move(__u.__node_alloc()); } @@ -959,10 +936,7 @@ private: }; template -inline __hash_table<_Tp, _Hash, _Equal, _Alloc>::__hash_table() _NOEXCEPT_( - is_nothrow_default_constructible<__bucket_list>::value&& is_nothrow_default_constructible<__first_node>::value&& - is_nothrow_default_constructible<__node_allocator>::value&& is_nothrow_default_constructible::value&& - is_nothrow_default_constructible::value) +inline __hash_table<_Tp, _Hash, _Equal, _Alloc>::__hash_table() : __p2_(0, __default_init_tag()), __p3_(1.0f, __default_init_tag()) {} template @@ -1003,10 +977,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__hash_table(const __hash_table& __u, __p3_(__u.__p3_) {} template -__hash_table<_Tp, _Hash, _Equal, _Alloc>::__hash_table(__hash_table&& __u) _NOEXCEPT_( - is_nothrow_move_constructible<__bucket_list>::value&& is_nothrow_move_constructible<__first_node>::value&& - is_nothrow_move_constructible<__node_allocator>::value&& is_nothrow_move_constructible::value&& - is_nothrow_move_constructible::value) +__hash_table<_Tp, _Hash, _Equal, _Alloc>::__hash_table(__hash_table&& __u) : __bucket_list_(std::move(__u.__bucket_list_)), __p1_(std::move(__u.__p1_)), __p2_(std::move(__u.__p2_)), @@ -1095,9 
+1066,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__detach() _NOEXCEPT { } template -void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__move_assign(__hash_table& __u, true_type) - _NOEXCEPT_(is_nothrow_move_assignable<__node_allocator>::value&& is_nothrow_move_assignable::value&& - is_nothrow_move_assignable::value) { +void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__move_assign(__hash_table& __u, true_type) { clear(); __bucket_list_.reset(__u.__bucket_list_.release()); __bucket_list_.get_deleter().size() = __u.__bucket_list_.get_deleter().size(); @@ -1154,9 +1123,7 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__move_assign(__hash_table& __u, template inline __hash_table<_Tp, _Hash, _Equal, _Alloc>& -__hash_table<_Tp, _Hash, _Equal, _Alloc>::operator=(__hash_table&& __u) _NOEXCEPT_( - __node_traits::propagate_on_container_move_assignment::value&& is_nothrow_move_assignable<__node_allocator>::value&& - is_nothrow_move_assignable::value&& is_nothrow_move_assignable::value) { +__hash_table<_Tp, _Hash, _Equal, _Alloc>::operator=(__hash_table&& __u) { __move_assign(__u, integral_constant()); return *this; } @@ -1566,7 +1533,7 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__do_rehash(size_type __nbc) { __phash = __chash; } else { __next_pointer __np = __cp; - if _LIBCPP_CONSTEXPR_SINCE_CXX17 (!_UniqueKeys) { + if (!_UniqueKeys) { for (; __np->__next_ != nullptr && key_eq()(__cp->__upcast()->__get_value(), __np->__next_->__upcast()->__get_value()); __np = __np->__next_) @@ -1823,11 +1790,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__equal_range_multi(const _Key& __k) c } template -void __hash_table<_Tp, _Hash, _Equal, _Alloc>::swap(__hash_table& __u) - _NOEXCEPT_(__is_nothrow_swappable_v&& __is_nothrow_swappable_v && - (!allocator_traits<__pointer_allocator>::propagate_on_container_swap::value || - __is_nothrow_swappable_v<__pointer_allocator>) && - (!__node_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__node_allocator>)) { +void 
__hash_table<_Tp, _Hash, _Equal, _Alloc>::swap(__hash_table& __u) { _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR( __node_traits::propagate_on_container_swap::value || this->__node_alloc() == __u.__node_alloc(), "unordered container::swap: Either propagate_on_container_swap " @@ -1868,8 +1831,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::bucket_size(size_type __n) const { template inline _LIBCPP_HIDE_FROM_ABI void -swap(__hash_table<_Tp, _Hash, _Equal, _Alloc>& __x, __hash_table<_Tp, _Hash, _Equal, _Alloc>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +swap(__hash_table<_Tp, _Hash, _Equal, _Alloc>& __x, __hash_table<_Tp, _Hash, _Equal, _Alloc>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/__iterator/access.h b/libcxx/include/__cxx03/__iterator/access.h index 99ba9c4c73d08..52220a1f04121 100644 --- a/libcxx/include/__cxx03/__iterator/access.h +++ b/libcxx/include/__cxx03/__iterator/access.h @@ -20,12 +20,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Tp* begin(_Tp (&__array)[_Np]) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI _Tp* begin(_Tp (&__array)[_Np]) _NOEXCEPT { return __array; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Tp* end(_Tp (&__array)[_Np]) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI _Tp* end(_Tp (&__array)[_Np]) _NOEXCEPT { return __array + _Np; } diff --git a/libcxx/include/__cxx03/__iterator/advance.h b/libcxx/include/__cxx03/__iterator/advance.h index 4ebe868735649..d49556a17cc34 100644 --- a/libcxx/include/__cxx03/__iterator/advance.h +++ b/libcxx/include/__cxx03/__iterator/advance.h @@ -31,14 +31,14 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 void +_LIBCPP_HIDE_FROM_ABI void __advance(_InputIter& __i, typename iterator_traits<_InputIter>::difference_type __n, input_iterator_tag) { for (; __n > 0; --__n) ++__i; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 void +_LIBCPP_HIDE_FROM_ABI void __advance(_BiDirIter& __i, typename 
iterator_traits<_BiDirIter>::difference_type __n, bidirectional_iterator_tag) { if (__n >= 0) for (; __n > 0; --__n) @@ -49,7 +49,7 @@ __advance(_BiDirIter& __i, typename iterator_traits<_BiDirIter>::difference_type } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 void +_LIBCPP_HIDE_FROM_ABI void __advance(_RandIter& __i, typename iterator_traits<_RandIter>::difference_type __n, random_access_iterator_tag) { __i += __n; } @@ -58,7 +58,7 @@ template < class _InputIter, class _Distance, class _IntegralDistance = decltype(std::__convert_to_integral(std::declval<_Distance>())), __enable_if_t::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 void advance(_InputIter& __i, _Distance __orig_n) { +_LIBCPP_HIDE_FROM_ABI void advance(_InputIter& __i, _Distance __orig_n) { typedef typename iterator_traits<_InputIter>::difference_type _Difference; _Difference __n = static_cast<_Difference>(std::__convert_to_integral(__orig_n)); // Calling `advance` with a negative value on a non-bidirectional iterator is a no-op in the current implementation. 
diff --git a/libcxx/include/__cxx03/__iterator/back_insert_iterator.h b/libcxx/include/__cxx03/__iterator/back_insert_iterator.h index a67b6d5b82cc5..e9390a6aa2598 100644 --- a/libcxx/include/__cxx03/__iterator/back_insert_iterator.h +++ b/libcxx/include/__cxx03/__iterator/back_insert_iterator.h @@ -42,24 +42,21 @@ class _LIBCPP_TEMPLATE_VIS back_insert_iterator : public iteratorpush_back(__value); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 back_insert_iterator& operator*() { return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 back_insert_iterator& operator++() { return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 back_insert_iterator operator++(int) { return *this; } + _LIBCPP_HIDE_FROM_ABI back_insert_iterator& operator*() { return *this; } + _LIBCPP_HIDE_FROM_ABI back_insert_iterator& operator++() { return *this; } + _LIBCPP_HIDE_FROM_ABI back_insert_iterator operator++(int) { return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Container* __get_container() const { return container; } + _LIBCPP_HIDE_FROM_ABI _Container* __get_container() const { return container; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(back_insert_iterator); template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 back_insert_iterator<_Container> -back_inserter(_Container& __x) { +inline _LIBCPP_HIDE_FROM_ABI back_insert_iterator<_Container> back_inserter(_Container& __x) { return back_insert_iterator<_Container>(__x); } diff --git a/libcxx/include/__cxx03/__iterator/bounded_iter.h b/libcxx/include/__cxx03/__iterator/bounded_iter.h index 1e034fc5bbd72..83296a4cf20b4 100644 --- a/libcxx/include/__cxx03/__iterator/bounded_iter.h +++ b/libcxx/include/__cxx03/__iterator/bounded_iter.h @@ -63,7 +63,7 @@ struct __bounded_iter { _LIBCPP_HIDE_FROM_ABI __bounded_iter(__bounded_iter&&) = default; template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __bounded_iter(__bounded_iter<_OtherIterator> const& 
__other) _NOEXCEPT + _LIBCPP_HIDE_FROM_ABI __bounded_iter(__bounded_iter<_OtherIterator> const& __other) _NOEXCEPT : __current_(__other.__current_), __begin_(__other.__begin_), __end_(__other.__end_) {} @@ -81,8 +81,7 @@ struct __bounded_iter { // // Since it is non-standard for iterators to have this constructor, __bounded_iter must // be created via `std::__make_bounded_iter`. - _LIBCPP_HIDE_FROM_ABI - _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit __bounded_iter(_Iterator __current, _Iterator __begin, _Iterator __end) + _LIBCPP_HIDE_FROM_ABI explicit __bounded_iter(_Iterator __current, _Iterator __begin, _Iterator __end) : __current_(__current), __begin_(__begin), __end_(__end) { _LIBCPP_ASSERT_INTERNAL( __begin <= __current, "__bounded_iter(current, begin, end): current and begin are inconsistent"); @@ -91,7 +90,7 @@ struct __bounded_iter { } template - friend _LIBCPP_CONSTEXPR __bounded_iter<_It> __make_bounded_iter(_It, _It, _It); + friend __bounded_iter<_It> __make_bounded_iter(_It, _It, _It); public: // Dereference and indexing operations. @@ -101,19 +100,19 @@ struct __bounded_iter { // `end`. This is easier for the optimizer because it aligns with the `iter != container.end()` // checks that typical callers already use (see // https://github.com/llvm/llvm-project/issues/78829). 
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 reference operator*() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference operator*() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __current_ != __end_, "__bounded_iter::operator*: Attempt to dereference an iterator at the end"); return *__current_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pointer operator->() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI pointer operator->() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __current_ != __end_, "__bounded_iter::operator->: Attempt to dereference an iterator at the end"); return std::__to_address(__current_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 reference operator[](difference_type __n) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference operator[](difference_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __n >= __begin_ - __current_, "__bounded_iter::operator[]: Attempt to index an iterator past the start"); _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( @@ -124,31 +123,31 @@ struct __bounded_iter { // Arithmetic operations. // // These operations check that the iterator remains within `[begin, end]`. 
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter& operator++() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __bounded_iter& operator++() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __current_ != __end_, "__bounded_iter::operator++: Attempt to advance an iterator past the end"); ++__current_; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter operator++(int) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __bounded_iter operator++(int) _NOEXCEPT { __bounded_iter __tmp(*this); ++*this; return __tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter& operator--() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __bounded_iter& operator--() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __current_ != __begin_, "__bounded_iter::operator--: Attempt to rewind an iterator past the start"); --__current_; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter operator--(int) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __bounded_iter operator--(int) _NOEXCEPT { __bounded_iter __tmp(*this); --*this; return __tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter& operator+=(difference_type __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __bounded_iter& operator+=(difference_type __n) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __n >= __begin_ - __current_, "__bounded_iter::operator+=: Attempt to rewind an iterator past the start"); _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( @@ -156,20 +155,18 @@ struct __bounded_iter { __current_ += __n; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 friend __bounded_iter - operator+(__bounded_iter const& __self, difference_type __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI friend __bounded_iter operator+(__bounded_iter const& __self, difference_type __n) _NOEXCEPT { __bounded_iter __tmp(__self); __tmp += __n; return __tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 friend __bounded_iter - operator+(difference_type __n, 
__bounded_iter const& __self) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI friend __bounded_iter operator+(difference_type __n, __bounded_iter const& __self) _NOEXCEPT { __bounded_iter __tmp(__self); __tmp += __n; return __tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter& operator-=(difference_type __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __bounded_iter& operator-=(difference_type __n) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __n <= __current_ - __begin_, "__bounded_iter::operator-=: Attempt to rewind an iterator past the start"); _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( @@ -177,13 +174,12 @@ struct __bounded_iter { __current_ -= __n; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 friend __bounded_iter - operator-(__bounded_iter const& __self, difference_type __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI friend __bounded_iter operator-(__bounded_iter const& __self, difference_type __n) _NOEXCEPT { __bounded_iter __tmp(__self); __tmp -= __n; return __tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 friend difference_type + _LIBCPP_HIDE_FROM_ABI friend difference_type operator-(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { return __x.__current_ - __y.__current_; } @@ -194,31 +190,25 @@ struct __bounded_iter { // The valid range for each iterator is also not considered as part of the comparison, // i.e. two iterators pointing to the same location will be considered equal even // if they have different validity ranges. 
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR friend bool - operator==(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI friend bool operator==(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { return __x.__current_ == __y.__current_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR friend bool - operator!=(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI friend bool operator!=(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { return __x.__current_ != __y.__current_; } // TODO(mordante) disable these overloads in the LLVM 20 release. - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR friend bool - operator<(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI friend bool operator<(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { return __x.__current_ < __y.__current_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR friend bool - operator>(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI friend bool operator>(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { return __x.__current_ > __y.__current_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR friend bool - operator<=(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI friend bool operator<=(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { return __x.__current_ <= __y.__current_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR friend bool - operator>=(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI friend bool operator>=(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT { return __x.__current_ >= __y.__current_; } @@ -232,7 +222,7 @@ struct __bounded_iter { }; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __bounded_iter<_It> __make_bounded_iter(_It __it, _It __begin, _It __end) { +_LIBCPP_HIDE_FROM_ABI 
__bounded_iter<_It> __make_bounded_iter(_It __it, _It __begin, _It __end) { return __bounded_iter<_It>(std::move(__it), std::move(__begin), std::move(__end)); } @@ -245,7 +235,7 @@ struct pointer_traits<__bounded_iter<_Iterator> > { using element_type = typename pointer_traits<_Iterator>::element_type; using difference_type = typename pointer_traits<_Iterator>::difference_type; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR static element_type* to_address(pointer __it) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static element_type* to_address(pointer __it) _NOEXCEPT { return std::__to_address(__it.__current_); } }; diff --git a/libcxx/include/__cxx03/__iterator/distance.h b/libcxx/include/__cxx03/__iterator/distance.h index 7b380d72eab0d..656b9606a84dc 100644 --- a/libcxx/include/__cxx03/__iterator/distance.h +++ b/libcxx/include/__cxx03/__iterator/distance.h @@ -22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 typename iterator_traits<_InputIter>::difference_type +inline _LIBCPP_HIDE_FROM_ABI typename iterator_traits<_InputIter>::difference_type __distance(_InputIter __first, _InputIter __last, input_iterator_tag) { typename iterator_traits<_InputIter>::difference_type __r(0); for (; __first != __last; ++__first) @@ -31,13 +31,13 @@ __distance(_InputIter __first, _InputIter __last, input_iterator_tag) { } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 typename iterator_traits<_RandIter>::difference_type +inline _LIBCPP_HIDE_FROM_ABI typename iterator_traits<_RandIter>::difference_type __distance(_RandIter __first, _RandIter __last, random_access_iterator_tag) { return __last - __first; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 typename iterator_traits<_InputIter>::difference_type +inline _LIBCPP_HIDE_FROM_ABI typename iterator_traits<_InputIter>::difference_type distance(_InputIter __first, _InputIter __last) { return std::__distance(__first, __last, typename 
iterator_traits<_InputIter>::iterator_category()); } diff --git a/libcxx/include/__cxx03/__iterator/front_insert_iterator.h b/libcxx/include/__cxx03/__iterator/front_insert_iterator.h index ca1edc31810d8..347858f062060 100644 --- a/libcxx/include/__cxx03/__iterator/front_insert_iterator.h +++ b/libcxx/include/__cxx03/__iterator/front_insert_iterator.h @@ -42,22 +42,19 @@ class _LIBCPP_TEMPLATE_VIS front_insert_iterator : public iteratorpush_front(__value); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 front_insert_iterator& operator*() { return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 front_insert_iterator& operator++() { return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 front_insert_iterator operator++(int) { return *this; } + _LIBCPP_HIDE_FROM_ABI front_insert_iterator& operator*() { return *this; } + _LIBCPP_HIDE_FROM_ABI front_insert_iterator& operator++() { return *this; } + _LIBCPP_HIDE_FROM_ABI front_insert_iterator operator++(int) { return *this; } }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(front_insert_iterator); template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 front_insert_iterator<_Container> -front_inserter(_Container& __x) { +inline _LIBCPP_HIDE_FROM_ABI front_insert_iterator<_Container> front_inserter(_Container& __x) { return front_insert_iterator<_Container>(__x); } diff --git a/libcxx/include/__cxx03/__iterator/insert_iterator.h b/libcxx/include/__cxx03/__iterator/insert_iterator.h index 3abeca0029bd6..afb1f98b7be1a 100644 --- a/libcxx/include/__cxx03/__iterator/insert_iterator.h +++ b/libcxx/include/__cxx03/__iterator/insert_iterator.h @@ -46,22 +46,20 @@ class _LIBCPP_TEMPLATE_VIS insert_iterator : public iterator __i) + _LIBCPP_HIDE_FROM_ABI insert_iterator(_Container& __x, __insert_iterator_iter_t<_Container> __i) : container(std::addressof(__x)), iter(__i) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 insert_iterator& - operator=(const typename 
_Container::value_type& __value) { + _LIBCPP_HIDE_FROM_ABI insert_iterator& operator=(const typename _Container::value_type& __value) { iter = container->insert(iter, __value); ++iter; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 insert_iterator& operator*() { return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 insert_iterator& operator++() { return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 insert_iterator& operator++(int) { return *this; } + _LIBCPP_HIDE_FROM_ABI insert_iterator& operator*() { return *this; } + _LIBCPP_HIDE_FROM_ABI insert_iterator& operator++() { return *this; } + _LIBCPP_HIDE_FROM_ABI insert_iterator& operator++(int) { return *this; } }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 insert_iterator<_Container> +inline _LIBCPP_HIDE_FROM_ABI insert_iterator<_Container> inserter(_Container& __x, __insert_iterator_iter_t<_Container> __i) { return insert_iterator<_Container>(__x, __i); } diff --git a/libcxx/include/__cxx03/__iterator/istream_iterator.h b/libcxx/include/__cxx03/__iterator/istream_iterator.h index 35c5569558d6d..fb77c6718009f 100644 --- a/libcxx/include/__cxx03/__iterator/istream_iterator.h +++ b/libcxx/include/__cxx03/__iterator/istream_iterator.h @@ -45,7 +45,7 @@ class _LIBCPP_TEMPLATE_VIS istream_iterator _Tp __value_; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR istream_iterator() : __in_stream_(nullptr), __value_() {} + _LIBCPP_HIDE_FROM_ABI istream_iterator() : __in_stream_(nullptr), __value_() {} _LIBCPP_HIDE_FROM_ABI istream_iterator(istream_type& __s) : __in_stream_(std::addressof(__s)) { if (!(*__in_stream_ >> __value_)) __in_stream_ = nullptr; diff --git a/libcxx/include/__cxx03/__iterator/istreambuf_iterator.h b/libcxx/include/__cxx03/__iterator/istreambuf_iterator.h index b21e0bc927858..37c81bbc5c091 100644 --- a/libcxx/include/__cxx03/__iterator/istreambuf_iterator.h +++ b/libcxx/include/__cxx03/__iterator/istreambuf_iterator.h @@ 
-60,7 +60,7 @@ class _LIBCPP_TEMPLATE_VIS istreambuf_iterator } public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR istreambuf_iterator() _NOEXCEPT : __sbuf_(nullptr) {} + _LIBCPP_HIDE_FROM_ABI istreambuf_iterator() _NOEXCEPT : __sbuf_(nullptr) {} _LIBCPP_HIDE_FROM_ABI istreambuf_iterator(istream_type& __s) _NOEXCEPT : __sbuf_(__s.rdbuf()) {} _LIBCPP_HIDE_FROM_ABI istreambuf_iterator(streambuf_type* __s) _NOEXCEPT : __sbuf_(__s) {} _LIBCPP_HIDE_FROM_ABI istreambuf_iterator(const __proxy& __p) _NOEXCEPT : __sbuf_(__p.__sbuf_) {} diff --git a/libcxx/include/__cxx03/__iterator/iterator.h b/libcxx/include/__cxx03/__iterator/iterator.h index 60114ffc5b4dc..8fbe9739c61ea 100644 --- a/libcxx/include/__cxx03/__iterator/iterator.h +++ b/libcxx/include/__cxx03/__iterator/iterator.h @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX17 iterator { +struct _LIBCPP_TEMPLATE_VIS iterator { typedef _Tp value_type; typedef _Distance difference_type; typedef _Pointer pointer; diff --git a/libcxx/include/__cxx03/__iterator/move_iterator.h b/libcxx/include/__cxx03/__iterator/move_iterator.h index 200fabd4291ae..c778bb3bb9e17 100644 --- a/libcxx/include/__cxx03/__iterator/move_iterator.h +++ b/libcxx/include/__cxx03/__iterator/move_iterator.h @@ -48,67 +48,58 @@ class _LIBCPP_TEMPLATE_VIS move_iterator { typedef __conditional_t::value, __libcpp_remove_reference_t<__reference>&&, __reference> reference; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 explicit move_iterator(_Iter __i) : __current_(std::move(__i)) {} + _LIBCPP_HIDE_FROM_ABI explicit move_iterator(_Iter __i) : __current_(std::move(__i)) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator& operator++() { + _LIBCPP_HIDE_FROM_ABI move_iterator& operator++() { ++__current_; return *this; } - _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pointer operator->() const { - return __current_; - } + _LIBCPP_HIDE_FROM_ABI 
pointer operator->() const { return __current_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator() : __current_() {} + _LIBCPP_HIDE_FROM_ABI move_iterator() : __current_() {} template ::value && is_convertible::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator(const move_iterator<_Up>& __u) - : __current_(__u.base()) {} + _LIBCPP_HIDE_FROM_ABI move_iterator(const move_iterator<_Up>& __u) : __current_(__u.base()) {} template ::value && is_convertible::value && is_assignable<_Iter&, const _Up&>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator& operator=(const move_iterator<_Up>& __u) { + _LIBCPP_HIDE_FROM_ABI move_iterator& operator=(const move_iterator<_Up>& __u) { __current_ = __u.base(); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 _Iter base() const { return __current_; } + _LIBCPP_HIDE_FROM_ABI _Iter base() const { return __current_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator*() const { - return static_cast(*__current_); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator[](difference_type __n) const { + _LIBCPP_HIDE_FROM_ABI reference operator*() const { return static_cast(*__current_); } + _LIBCPP_HIDE_FROM_ABI reference operator[](difference_type __n) const { return static_cast(__current_[__n]); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator operator++(int) { + _LIBCPP_HIDE_FROM_ABI move_iterator operator++(int) { move_iterator __tmp(*this); ++__current_; return __tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator& operator--() { + _LIBCPP_HIDE_FROM_ABI move_iterator& operator--() { --__current_; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator operator--(int) { + _LIBCPP_HIDE_FROM_ABI move_iterator operator--(int) { move_iterator __tmp(*this); --__current_; return __tmp; } - _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator operator+(difference_type __n) const { - return move_iterator(__current_ + __n); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator& operator+=(difference_type __n) { + _LIBCPP_HIDE_FROM_ABI move_iterator operator+(difference_type __n) const { return move_iterator(__current_ + __n); } + _LIBCPP_HIDE_FROM_ABI move_iterator& operator+=(difference_type __n) { __current_ += __n; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator operator-(difference_type __n) const { - return move_iterator(__current_ - __n); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator& operator-=(difference_type __n) { + _LIBCPP_HIDE_FROM_ABI move_iterator operator-(difference_type __n) const { return move_iterator(__current_ - __n); } + _LIBCPP_HIDE_FROM_ABI move_iterator& operator-=(difference_type __n) { __current_ -= __n; return *this; } @@ -122,38 +113,32 @@ class _LIBCPP_TEMPLATE_VIS move_iterator { _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(move_iterator); template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 bool -operator==(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { return __x.base() == __y.base(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 bool -operator!=(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { return __x.base() != __y.base(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 bool -operator<(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator<(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { return __x.base() < __y.base(); } template -inline 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 bool -operator>(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator>(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { return __x.base() > __y.base(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 bool -operator<=(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { return __x.base() <= __y.base(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 bool -operator>=(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { return __x.base() >= __y.base(); } @@ -164,13 +149,13 @@ operator-(const move_iterator<_Iter1>& __x, const move_iterator<_Iter2>& __y) { } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator<_Iter> +inline _LIBCPP_HIDE_FROM_ABI move_iterator<_Iter> operator+(typename move_iterator<_Iter>::difference_type __n, const move_iterator<_Iter>& __x) { return move_iterator<_Iter>(__x.base() + __n); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 move_iterator<_Iter> make_move_iterator(_Iter __i) { +inline _LIBCPP_HIDE_FROM_ABI move_iterator<_Iter> make_move_iterator(_Iter __i) { return move_iterator<_Iter>(std::move(__i)); } diff --git a/libcxx/include/__cxx03/__iterator/next.h b/libcxx/include/__cxx03/__iterator/next.h index 6ee32a433277f..16ebaed172d11 100644 --- a/libcxx/include/__cxx03/__iterator/next.h +++ b/libcxx/include/__cxx03/__iterator/next.h @@ -23,7 +23,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template ::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 _InputIter +inline _LIBCPP_HIDE_FROM_ABI _InputIter next(_InputIter __x, typename 
iterator_traits<_InputIter>::difference_type __n = 1) { // Calling `advance` with a negative value on a non-bidirectional iterator is a no-op in the current implementation. // Note that this check duplicates the similar check in `std::advance`. diff --git a/libcxx/include/__cxx03/__iterator/prev.h b/libcxx/include/__cxx03/__iterator/prev.h index 82b2ef67a73fa..dc1bdaf584022 100644 --- a/libcxx/include/__cxx03/__iterator/prev.h +++ b/libcxx/include/__cxx03/__iterator/prev.h @@ -23,7 +23,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template ::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 _InputIter +inline _LIBCPP_HIDE_FROM_ABI _InputIter prev(_InputIter __x, typename iterator_traits<_InputIter>::difference_type __n = 1) { // Calling `advance` with a negative value on a non-bidirectional iterator is a no-op in the current implementation. // Note that this check duplicates the similar check in `std::advance`. diff --git a/libcxx/include/__cxx03/__iterator/reverse_iterator.h b/libcxx/include/__cxx03/__iterator/reverse_iterator.h index a7037003dafee..b06dad13ea789 100644 --- a/libcxx/include/__cxx03/__iterator/reverse_iterator.h +++ b/libcxx/include/__cxx03/__iterator/reverse_iterator.h @@ -66,118 +66,108 @@ class _LIBCPP_TEMPLATE_VIS reverse_iterator using reference = typename iterator_traits<_Iter>::reference; #ifndef _LIBCPP_ABI_NO_ITERATOR_BASES - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator() : __t_(), current() {} + _LIBCPP_HIDE_FROM_ABI reverse_iterator() : __t_(), current() {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 explicit reverse_iterator(_Iter __x) : __t_(__x), current(__x) {} + _LIBCPP_HIDE_FROM_ABI explicit reverse_iterator(_Iter __x) : __t_(__x), current(__x) {} template ::value && is_convertible<_Up const&, _Iter>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator(const reverse_iterator<_Up>& __u) - : __t_(__u.base()), current(__u.base()) {} + 
_LIBCPP_HIDE_FROM_ABI reverse_iterator(const reverse_iterator<_Up>& __u) : __t_(__u.base()), current(__u.base()) {} template ::value && is_convertible<_Up const&, _Iter>::value && is_assignable<_Iter&, _Up const&>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator& operator=(const reverse_iterator<_Up>& __u) { + _LIBCPP_HIDE_FROM_ABI reverse_iterator& operator=(const reverse_iterator<_Up>& __u) { __t_ = current = __u.base(); return *this; } #else - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator() : current() {} + _LIBCPP_HIDE_FROM_ABI reverse_iterator() : current() {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 explicit reverse_iterator(_Iter __x) : current(__x) {} + _LIBCPP_HIDE_FROM_ABI explicit reverse_iterator(_Iter __x) : current(__x) {} template ::value && is_convertible<_Up const&, _Iter>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator(const reverse_iterator<_Up>& __u) - : current(__u.base()) {} + _LIBCPP_HIDE_FROM_ABI reverse_iterator(const reverse_iterator<_Up>& __u) : current(__u.base()) {} template ::value && is_convertible<_Up const&, _Iter>::value && is_assignable<_Iter&, _Up const&>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator& operator=(const reverse_iterator<_Up>& __u) { + _LIBCPP_HIDE_FROM_ABI reverse_iterator& operator=(const reverse_iterator<_Up>& __u) { current = __u.base(); return *this; } #endif - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 _Iter base() const { return current; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator*() const { + _LIBCPP_HIDE_FROM_ABI _Iter base() const { return current; } + _LIBCPP_HIDE_FROM_ABI reference operator*() const { _Iter __tmp = current; return *--__tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 pointer operator->() const { return std::addressof(operator*()); } + _LIBCPP_HIDE_FROM_ABI pointer operator->() 
const { return std::addressof(operator*()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator& operator++() { + _LIBCPP_HIDE_FROM_ABI reverse_iterator& operator++() { --current; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator operator++(int) { + _LIBCPP_HIDE_FROM_ABI reverse_iterator operator++(int) { reverse_iterator __tmp(*this); --current; return __tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator& operator--() { + _LIBCPP_HIDE_FROM_ABI reverse_iterator& operator--() { ++current; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator operator--(int) { + _LIBCPP_HIDE_FROM_ABI reverse_iterator operator--(int) { reverse_iterator __tmp(*this); ++current; return __tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator operator+(difference_type __n) const { + _LIBCPP_HIDE_FROM_ABI reverse_iterator operator+(difference_type __n) const { return reverse_iterator(current - __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator& operator+=(difference_type __n) { + _LIBCPP_HIDE_FROM_ABI reverse_iterator& operator+=(difference_type __n) { current -= __n; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator operator-(difference_type __n) const { + _LIBCPP_HIDE_FROM_ABI reverse_iterator operator-(difference_type __n) const { return reverse_iterator(current + __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator& operator-=(difference_type __n) { + _LIBCPP_HIDE_FROM_ABI reverse_iterator& operator-=(difference_type __n) { current += __n; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator[](difference_type __n) const { - return *(*this + __n); - } + _LIBCPP_HIDE_FROM_ABI reference operator[](difference_type __n) const { return *(*this + __n); } }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 
bool -operator==(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) { return __x.base() == __y.base(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 bool -operator<(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator<(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) { return __x.base() > __y.base(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 bool -operator!=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) { return __x.base() != __y.base(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 bool -operator>(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator>(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) { return __x.base() < __y.base(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 bool -operator>=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) { return __x.base() <= __y.base(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 bool -operator<=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& __y) { return __x.base() >= __y.base(); } @@ -188,7 +178,7 @@ operator-(const reverse_iterator<_Iter1>& __x, const reverse_iterator<_Iter2>& _ } template -inline _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator<_Iter> +inline _LIBCPP_HIDE_FROM_ABI reverse_iterator<_Iter> operator+(typename reverse_iterator<_Iter>::difference_type __n, const reverse_iterator<_Iter>& __x) { return reverse_iterator<_Iter>(__x.base() - __n); } @@ -198,13 +188,12 @@ struct __unwrap_iter_impl >, __b> { using _UnwrappedIter = decltype(__unwrap_iter_impl<_Iter>::__unwrap(std::declval<_Iter>())); using _ReverseWrapper = reverse_iterator >; - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ReverseWrapper - __rewrap(_ReverseWrapper __orig_iter, _UnwrappedIter __unwrapped_iter) { + static _LIBCPP_HIDE_FROM_ABI _ReverseWrapper __rewrap(_ReverseWrapper __orig_iter, _UnwrappedIter __unwrapped_iter) { return _ReverseWrapper( reverse_iterator<_Iter>(__unwrap_iter_impl<_Iter>::__rewrap(__orig_iter.base().base(), __unwrapped_iter))); } - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _UnwrappedIter __unwrap(_ReverseWrapper __i) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI _UnwrappedIter __unwrap(_ReverseWrapper __i) _NOEXCEPT { return __unwrap_iter_impl<_Iter>::__unwrap(__i.base().base()); } }; diff --git a/libcxx/include/__cxx03/__iterator/wrap_iter.h b/libcxx/include/__cxx03/__iterator/wrap_iter.h index 13d7e27f37698..22c9e2dfb5f5a 100644 --- a/libcxx/include/__cxx03/__iterator/wrap_iter.h +++ b/libcxx/include/__cxx03/__iterator/wrap_iter.h @@ -38,57 +38,50 @@ class __wrap_iter { iterator_type __i_; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter() _NOEXCEPT : __i_() {} + _LIBCPP_HIDE_FROM_ABI __wrap_iter() _NOEXCEPT : __i_() {} template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter(const __wrap_iter<_Up>& __u) _NOEXCEPT - : __i_(__u.base()) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 reference operator*() const _NOEXCEPT { return *__i_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pointer operator->() const _NOEXCEPT { - return std::__to_address(__i_); - } - 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter& operator++() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __wrap_iter(const __wrap_iter<_Up>& __u) _NOEXCEPT : __i_(__u.base()) {} + _LIBCPP_HIDE_FROM_ABI reference operator*() const _NOEXCEPT { return *__i_; } + _LIBCPP_HIDE_FROM_ABI pointer operator->() const _NOEXCEPT { return std::__to_address(__i_); } + _LIBCPP_HIDE_FROM_ABI __wrap_iter& operator++() _NOEXCEPT { ++__i_; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter operator++(int) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __wrap_iter operator++(int) _NOEXCEPT { __wrap_iter __tmp(*this); ++(*this); return __tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter& operator--() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __wrap_iter& operator--() _NOEXCEPT { --__i_; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter operator--(int) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __wrap_iter operator--(int) _NOEXCEPT { __wrap_iter __tmp(*this); --(*this); return __tmp; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter operator+(difference_type __n) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __wrap_iter operator+(difference_type __n) const _NOEXCEPT { __wrap_iter __w(*this); __w += __n; return __w; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter& operator+=(difference_type __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __wrap_iter& operator+=(difference_type __n) _NOEXCEPT { __i_ += __n; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter operator-(difference_type __n) const _NOEXCEPT { - return *this + (-__n); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter& operator-=(difference_type __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI __wrap_iter operator-(difference_type __n) const _NOEXCEPT { return *this + (-__n); } + _LIBCPP_HIDE_FROM_ABI __wrap_iter& operator-=(difference_type __n) _NOEXCEPT { *this += -__n; return *this; } - 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 reference operator[](difference_type __n) const _NOEXCEPT { - return __i_[__n]; - } + _LIBCPP_HIDE_FROM_ABI reference operator[](difference_type __n) const _NOEXCEPT { return __i_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 iterator_type base() const _NOEXCEPT { return __i_; } + _LIBCPP_HIDE_FROM_ABI iterator_type base() const _NOEXCEPT { return __i_; } private: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit __wrap_iter(iterator_type __x) _NOEXCEPT : __i_(__x) {} + _LIBCPP_HIDE_FROM_ABI explicit __wrap_iter(iterator_type __x) _NOEXCEPT : __i_(__x) {} template friend class __wrap_iter; @@ -105,86 +98,74 @@ class __wrap_iter { }; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator==(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator==(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT { return __x.base() == __y.base(); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator==(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator==(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { return __x.base() == __y.base(); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator<(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator<(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT { return __x.base() < __y.base(); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator<(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator<(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { return __x.base() < __y.base(); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator!=(const 
__wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator!=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT { return !(__x == __y); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator!=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator!=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { return !(__x == __y); } // TODO(mordante) disable these overloads in the LLVM 20 release. template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator>(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator>(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT { return __y < __x; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator>(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator>(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { return __y < __x; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator>=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator>=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT { return !(__x < __y); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator>=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator>=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { return !(__x < __y); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool -operator<=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator<=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT { return !(__y < __x); } template -_LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR bool -operator<=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator<=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { return !(__y < __x); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename __wrap_iter<_Iter1>::difference_type +_LIBCPP_HIDE_FROM_ABI typename __wrap_iter<_Iter1>::difference_type operator-(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT { return __x.base() - __y.base(); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter<_Iter1> +_LIBCPP_HIDE_FROM_ABI __wrap_iter<_Iter1> operator+(typename __wrap_iter<_Iter1>::difference_type __n, __wrap_iter<_Iter1> __x) _NOEXCEPT { __x += __n; return __x; @@ -199,9 +180,7 @@ struct _LIBCPP_TEMPLATE_VIS pointer_traits<__wrap_iter<_It> > { typedef typename pointer_traits<_It>::element_type element_type; typedef typename pointer_traits<_It>::difference_type difference_type; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR static element_type* to_address(pointer __w) _NOEXCEPT { - return std::__to_address(__w.base()); - } + _LIBCPP_HIDE_FROM_ABI static element_type* to_address(pointer __w) _NOEXCEPT { return std::__to_address(__w.base()); } }; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__locale b/libcxx/include/__cxx03/__locale index 1deda8d1149e7..d5faa89b99fc0 100644 --- a/libcxx/include/__cxx03/__locale +++ b/libcxx/include/__cxx03/__locale @@ -132,7 +132,7 @@ class _LIBCPP_EXPORTED_FROM_ABI locale::id { static int32_t __next_id; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR id() : __id_(0) {} + _LIBCPP_HIDE_FROM_ABI id() : __id_(0) {} void operator=(const id&) = delete; id(const id&) = delete; @@ -903,8 +903,7 @@ protected: // template <> class codecvt // deprecated in C++20 template <> -class _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_EXPORTED_FROM_ABI codecvt - : public locale::facet, public codecvt_base { +class 
_LIBCPP_EXPORTED_FROM_ABI codecvt : public locale::facet, public codecvt_base { public: typedef char16_t intern_type; typedef char extern_type; @@ -1066,8 +1065,7 @@ protected: // template <> class codecvt // deprecated in C++20 template <> -class _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_EXPORTED_FROM_ABI codecvt - : public locale::facet, public codecvt_base { +class _LIBCPP_EXPORTED_FROM_ABI codecvt : public locale::facet, public codecvt_base { public: typedef char32_t intern_type; typedef char extern_type; @@ -1249,10 +1247,8 @@ extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; #endif -extern template class _LIBCPP_DEPRECATED_IN_CXX20 -_LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; // deprecated in C++20 -extern template class _LIBCPP_DEPRECATED_IN_CXX20 -_LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; // deprecated in C++20 +extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; // deprecated in C++20 +extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; // deprecated in C++20 #ifndef _LIBCPP_HAS_NO_CHAR8_T extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; // C++20 extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; // C++20 diff --git a/libcxx/include/__cxx03/__math/traits.h b/libcxx/include/__cxx03/__math/traits.h index f62509c32c008..0d27680d579a4 100644 --- a/libcxx/include/__cxx03/__math/traits.h +++ b/libcxx/include/__cxx03/__math/traits.h @@ -46,90 +46,76 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool signbit(_A1) _NOEXCEPT { // isfinite template ::value && numeric_limits<_A1>::has_infinity, int> = 0> -_LIBCPP_NODISCARD _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isfinite(_A1 __x) _NOEXCEPT { +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool isfinite(_A1 __x) _NOEXCEPT { return __builtin_isfinite((typename __promote<_A1>::type)__x); } template ::value && !numeric_limits<_A1>::has_infinity, int> = 0> -_LIBCPP_NODISCARD _LIBCPP_CONSTEXPR_SINCE_CXX23 
_LIBCPP_HIDE_FROM_ABI bool isfinite(_A1) _NOEXCEPT { +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool isfinite(_A1) _NOEXCEPT { return true; } -_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isfinite(float __x) _NOEXCEPT { - return __builtin_isfinite(__x); -} +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool isfinite(float __x) _NOEXCEPT { return __builtin_isfinite(__x); } -_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isfinite(double __x) _NOEXCEPT { - return __builtin_isfinite(__x); -} +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool isfinite(double __x) _NOEXCEPT { return __builtin_isfinite(__x); } -_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isfinite(long double __x) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool isfinite(long double __x) _NOEXCEPT { return __builtin_isfinite(__x); } // isinf template ::value && numeric_limits<_A1>::has_infinity, int> = 0> -_LIBCPP_NODISCARD _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isinf(_A1 __x) _NOEXCEPT { +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool isinf(_A1 __x) _NOEXCEPT { return __builtin_isinf((typename __promote<_A1>::type)__x); } template ::value && !numeric_limits<_A1>::has_infinity, int> = 0> -_LIBCPP_NODISCARD _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isinf(_A1) _NOEXCEPT { +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool isinf(_A1) _NOEXCEPT { return false; } #ifdef _LIBCPP_PREFERRED_OVERLOAD -_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isinf(float __x) _NOEXCEPT { - return __builtin_isinf(__x); -} +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool isinf(float __x) _NOEXCEPT { return __builtin_isinf(__x); } -_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI _LIBCPP_PREFERRED_OVERLOAD bool -isinf(double __x) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_PREFERRED_OVERLOAD bool isinf(double __x) _NOEXCEPT { return __builtin_isinf(__x); } -_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isinf(long double __x) _NOEXCEPT { - return __builtin_isinf(__x); -} +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool isinf(long double __x) _NOEXCEPT { return __builtin_isinf(__x); } #endif // isnan template ::value, int> = 0> -_LIBCPP_NODISCARD _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isnan(_A1 __x) _NOEXCEPT { +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool isnan(_A1 __x) _NOEXCEPT { return __builtin_isnan(__x); } template ::value, int> = 0> -_LIBCPP_NODISCARD _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isnan(_A1) _NOEXCEPT { +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool isnan(_A1) _NOEXCEPT { return false; } #ifdef _LIBCPP_PREFERRED_OVERLOAD -_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isnan(float __x) _NOEXCEPT { - return __builtin_isnan(__x); -} +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool isnan(float __x) _NOEXCEPT { return __builtin_isnan(__x); } -_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI _LIBCPP_PREFERRED_OVERLOAD bool -isnan(double __x) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_PREFERRED_OVERLOAD bool isnan(double __x) _NOEXCEPT { return __builtin_isnan(__x); } -_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isnan(long double __x) _NOEXCEPT { - return __builtin_isnan(__x); -} +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI bool isnan(long double __x) _NOEXCEPT { return __builtin_isnan(__x); } #endif // isnormal template ::value, int> = 0> -_LIBCPP_NODISCARD _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isnormal(_A1 __x) _NOEXCEPT { +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool isnormal(_A1 __x) _NOEXCEPT { return __builtin_isnormal(__x); } template ::value, int> = 0> -_LIBCPP_NODISCARD 
_LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isnormal(_A1 __x) _NOEXCEPT { +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool isnormal(_A1 __x) _NOEXCEPT { return __x != 0; } diff --git a/libcxx/include/__cxx03/__memory/addressof.h b/libcxx/include/__cxx03/__memory/addressof.h index 81c6a696f0631..9bb49baef21f6 100644 --- a/libcxx/include/__cxx03/__memory/addressof.h +++ b/libcxx/include/__cxx03/__memory/addressof.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -inline _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_NO_CFI _LIBCPP_HIDE_FROM_ABI _Tp* addressof(_Tp& __x) _NOEXCEPT { +inline _LIBCPP_NO_CFI _LIBCPP_HIDE_FROM_ABI _Tp* addressof(_Tp& __x) _NOEXCEPT { return __builtin_addressof(__x); } diff --git a/libcxx/include/__cxx03/__memory/allocate_at_least.h b/libcxx/include/__cxx03/__memory/allocate_at_least.h index 3906d88599c74..3a906c7145b75 100644 --- a/libcxx/include/__cxx03/__memory/allocate_at_least.h +++ b/libcxx/include/__cxx03/__memory/allocate_at_least.h @@ -26,8 +26,7 @@ struct __allocation_result { }; template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR __allocation_result::pointer> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __allocation_result::pointer> __allocate_at_least(_Alloc& __alloc, size_t __n) { return {__alloc.allocate(__n), __n}; } diff --git a/libcxx/include/__cxx03/__memory/allocator.h b/libcxx/include/__cxx03/__memory/allocator.h index a20d2354d41a5..cd6f7d9ad4476 100644 --- a/libcxx/include/__cxx03/__memory/allocator.h +++ b/libcxx/include/__cxx03/__memory/allocator.h @@ -32,32 +32,30 @@ _LIBCPP_BEGIN_NAMESPACE_STD template class allocator; -// These specializations shouldn't be marked _LIBCPP_DEPRECATED_IN_CXX17. -// Specializing allocator is deprecated, but not using it. 
template <> class _LIBCPP_TEMPLATE_VIS allocator { public: - _LIBCPP_DEPRECATED_IN_CXX17 typedef void* pointer; - _LIBCPP_DEPRECATED_IN_CXX17 typedef const void* const_pointer; - _LIBCPP_DEPRECATED_IN_CXX17 typedef void value_type; + typedef void* pointer; + typedef const void* const_pointer; + typedef void value_type; template - struct _LIBCPP_DEPRECATED_IN_CXX17 rebind { + struct rebind { typedef allocator<_Up> other; }; }; // TODO(LLVM 20): Remove the escape hatch -# ifdef _LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST +#ifdef _LIBCPP_ENABLE_REMOVED_ALLOCATOR_CONST template <> class _LIBCPP_TEMPLATE_VIS allocator { public: - _LIBCPP_DEPRECATED_IN_CXX17 typedef const void* pointer; - _LIBCPP_DEPRECATED_IN_CXX17 typedef const void* const_pointer; - _LIBCPP_DEPRECATED_IN_CXX17 typedef const void value_type; + typedef const void* pointer; + typedef const void* const_pointer; + typedef const void value_type; template - struct _LIBCPP_DEPRECATED_IN_CXX17 rebind { + struct rebind { typedef allocator<_Up> other; }; }; @@ -80,7 +78,7 @@ struct __non_trivial_if {}; template struct __non_trivial_if { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __non_trivial_if() _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI __non_trivial_if() _NOEXCEPT {} }; // allocator @@ -98,14 +96,14 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::v typedef ptrdiff_t difference_type; typedef _Tp value_type; typedef true_type propagate_on_container_move_assignment; - _LIBCPP_DEPRECATED_IN_CXX23 typedef true_type is_always_equal; + typedef true_type is_always_equal; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 allocator() _NOEXCEPT = default; + _LIBCPP_HIDE_FROM_ABI allocator() _NOEXCEPT = default; template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 allocator(const allocator<_Up>&) _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI allocator(const allocator<_Up>&) _NOEXCEPT {} - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp* allocate(size_t __n) { + _LIBCPP_NODISCARD 
_LIBCPP_HIDE_FROM_ABI _Tp* allocate(size_t __n) { if (__n > allocator_traits::max_size(*this)) __throw_bad_array_new_length(); if (__libcpp_is_constant_evaluated()) { @@ -115,7 +113,7 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::v } } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void deallocate(_Tp* __p, size_t __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void deallocate(_Tp* __p, size_t __n) _NOEXCEPT { if (__libcpp_is_constant_evaluated()) { ::operator delete(__p); } else { @@ -124,42 +122,33 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::v } // C++20 Removed members - _LIBCPP_DEPRECATED_IN_CXX17 typedef _Tp* pointer; - _LIBCPP_DEPRECATED_IN_CXX17 typedef const _Tp* const_pointer; - _LIBCPP_DEPRECATED_IN_CXX17 typedef _Tp& reference; - _LIBCPP_DEPRECATED_IN_CXX17 typedef const _Tp& const_reference; + typedef _Tp* pointer; + typedef const _Tp* const_pointer; + typedef _Tp& reference; + typedef const _Tp& const_reference; template - struct _LIBCPP_DEPRECATED_IN_CXX17 rebind { + struct rebind { typedef allocator<_Up> other; }; - _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_HIDE_FROM_ABI pointer address(reference __x) const _NOEXCEPT { - return std::addressof(__x); - } - _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_HIDE_FROM_ABI const_pointer address(const_reference __x) const _NOEXCEPT { - return std::addressof(__x); - } + _LIBCPP_HIDE_FROM_ABI pointer address(reference __x) const _NOEXCEPT { return std::addressof(__x); } + _LIBCPP_HIDE_FROM_ABI const_pointer address(const_reference __x) const _NOEXCEPT { return std::addressof(__x); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_DEPRECATED_IN_CXX17 _Tp* allocate(size_t __n, const void*) { - return allocate(__n); - } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _Tp* allocate(size_t __n, const void*) { return allocate(__n); } - _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { - return size_type(~0) / sizeof(_Tp); - } + _LIBCPP_HIDE_FROM_ABI 
size_type max_size() const _NOEXCEPT { return size_type(~0) / sizeof(_Tp); } template - _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_HIDE_FROM_ABI void construct(_Up* __p, _Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI void construct(_Up* __p, _Args&&... __args) { ::new ((void*)__p) _Up(std::forward<_Args>(__args)...); } - _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_HIDE_FROM_ABI void destroy(pointer __p) { __p->~_Tp(); } + _LIBCPP_HIDE_FROM_ABI void destroy(pointer __p) { __p->~_Tp(); } }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool -operator==(const allocator<_Tp>&, const allocator<_Up>&) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const allocator<_Tp>&, const allocator<_Up>&) _NOEXCEPT { return true; } diff --git a/libcxx/include/__cxx03/__memory/allocator_traits.h b/libcxx/include/__cxx03/__memory/allocator_traits.h index 8708ed2ccea4c..6e7b8ca51f881 100644 --- a/libcxx/include/__cxx03/__memory/allocator_traits.h +++ b/libcxx/include/__cxx03/__memory/allocator_traits.h @@ -245,13 +245,12 @@ struct _LIBCPP_TEMPLATE_VIS allocator_traits { using other = allocator_traits::other>; }; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static pointer - allocate(allocator_type& __a, size_type __n) { + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static pointer allocate(allocator_type& __a, size_type __n) { return __a.allocate(__n); } template ::value, int> = 0> - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static pointer + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static pointer allocate(allocator_type& __a, size_type __n, const_void_pointer __hint) { _LIBCPP_SUPPRESS_DEPRECATED_PUSH return __a.allocate(__n, __hint); @@ -260,19 +259,17 @@ struct _LIBCPP_TEMPLATE_VIS allocator_traits { template ::value, int> = 0> - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static pointer + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static pointer allocate(allocator_type& __a, size_type __n, 
const_void_pointer) { return __a.allocate(__n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static void - deallocate(allocator_type& __a, pointer __p, size_type __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static void deallocate(allocator_type& __a, pointer __p, size_type __n) _NOEXCEPT { __a.deallocate(__p, __n); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static void - construct(allocator_type& __a, _Tp* __p, _Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI static void construct(allocator_type& __a, _Tp* __p, _Args&&... __args) { _LIBCPP_SUPPRESS_DEPRECATED_PUSH __a.construct(__p, std::forward<_Args>(__args)...); _LIBCPP_SUPPRESS_DEPRECATED_POP @@ -281,43 +278,40 @@ struct _LIBCPP_TEMPLATE_VIS allocator_traits { class... _Args, class = void, __enable_if_t::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static void - construct(allocator_type&, _Tp* __p, _Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI static void construct(allocator_type&, _Tp* __p, _Args&&... 
__args) { std::__construct_at(__p, std::forward<_Args>(__args)...); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static void destroy(allocator_type& __a, _Tp* __p) { + _LIBCPP_HIDE_FROM_ABI static void destroy(allocator_type& __a, _Tp* __p) { _LIBCPP_SUPPRESS_DEPRECATED_PUSH __a.destroy(__p); _LIBCPP_SUPPRESS_DEPRECATED_POP } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static void destroy(allocator_type&, _Tp* __p) { + _LIBCPP_HIDE_FROM_ABI static void destroy(allocator_type&, _Tp* __p) { std::__destroy_at(__p); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static size_type max_size(const allocator_type& __a) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static size_type max_size(const allocator_type& __a) _NOEXCEPT { _LIBCPP_SUPPRESS_DEPRECATED_PUSH return __a.max_size(); _LIBCPP_SUPPRESS_DEPRECATED_POP } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static size_type max_size(const allocator_type&) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static size_type max_size(const allocator_type&) _NOEXCEPT { return numeric_limits::max() / sizeof(value_type); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static allocator_type - select_on_container_copy_construction(const allocator_type& __a) { + _LIBCPP_HIDE_FROM_ABI static allocator_type select_on_container_copy_construction(const allocator_type& __a) { return __a.select_on_container_copy_construction(); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static allocator_type - select_on_container_copy_construction(const allocator_type& __a) { + _LIBCPP_HIDE_FROM_ABI static allocator_type select_on_container_copy_construction(const allocator_type& __a) { return __a; } }; diff --git a/libcxx/include/__cxx03/__memory/assume_aligned.h b/libcxx/include/__cxx03/__memory/assume_aligned.h index f504e95431cd6..e0e2b690b513d 
100644 --- a/libcxx/include/__cxx03/__memory/assume_aligned.h +++ b/libcxx/include/__cxx03/__memory/assume_aligned.h @@ -23,7 +23,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __assume_aligned(_Tp* __ptr) { +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _Tp* __assume_aligned(_Tp* __ptr) { static_assert(_Np != 0 && (_Np & (_Np - 1)) == 0, "std::assume_aligned(p) requires N to be a power of two"); if (__libcpp_is_constant_evaluated()) { diff --git a/libcxx/include/__cxx03/__memory/auto_ptr.h b/libcxx/include/__cxx03/__memory/auto_ptr.h index b27f5cf49cd1e..3c3a8881cb2b8 100644 --- a/libcxx/include/__cxx03/__memory/auto_ptr.h +++ b/libcxx/include/__cxx03/__memory/auto_ptr.h @@ -19,12 +19,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_DEPRECATED_IN_CXX11 auto_ptr_ref { +struct auto_ptr_ref { _Tp* __ptr_; }; template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 auto_ptr { +class _LIBCPP_TEMPLATE_VIS auto_ptr { private: _Tp* __ptr_; @@ -78,7 +78,7 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 auto_ptr { }; template <> -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX11 auto_ptr { +class _LIBCPP_TEMPLATE_VIS auto_ptr { public: typedef void element_type; }; diff --git a/libcxx/include/__cxx03/__memory/builtin_new_allocator.h b/libcxx/include/__cxx03/__memory/builtin_new_allocator.h index e14a57e8a0611..f939d08f61dd7 100644 --- a/libcxx/include/__cxx03/__memory/builtin_new_allocator.h +++ b/libcxx/include/__cxx03/__memory/builtin_new_allocator.h @@ -28,7 +28,7 @@ struct __builtin_new_allocator { struct __builtin_new_deleter { typedef void* pointer_type; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __builtin_new_deleter(size_t __size, size_t __align) + _LIBCPP_HIDE_FROM_ABI explicit __builtin_new_deleter(size_t __size, size_t __align) : __size_(__size), __align_(__align) {} _LIBCPP_HIDE_FROM_ABI void operator()(void* __p) const _NOEXCEPT { diff --git 
a/libcxx/include/__cxx03/__memory/compressed_pair.h b/libcxx/include/__cxx03/__memory/compressed_pair.h index 550aa7e62204d..601736a40cd8a 100644 --- a/libcxx/include/__cxx03/__memory/compressed_pair.h +++ b/libcxx/include/__cxx03/__memory/compressed_pair.h @@ -45,15 +45,14 @@ struct __compressed_pair_elem { using reference = _Tp&; using const_reference = const _Tp&; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __compressed_pair_elem(__default_init_tag) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __compressed_pair_elem(__value_init_tag) : __value_() {} + _LIBCPP_HIDE_FROM_ABI explicit __compressed_pair_elem(__default_init_tag) {} + _LIBCPP_HIDE_FROM_ABI explicit __compressed_pair_elem(__value_init_tag) : __value_() {} template >::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __compressed_pair_elem(_Up&& __u) - : __value_(std::forward<_Up>(__u)) {} + _LIBCPP_HIDE_FROM_ABI explicit __compressed_pair_elem(_Up&& __u) : __value_(std::forward<_Up>(__u)) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 reference __get() _NOEXCEPT { return __value_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference __get() const _NOEXCEPT { return __value_; } + _LIBCPP_HIDE_FROM_ABI reference __get() _NOEXCEPT { return __value_; } + _LIBCPP_HIDE_FROM_ABI const_reference __get() const _NOEXCEPT { return __value_; } private: _Tp __value_; @@ -66,16 +65,15 @@ struct __compressed_pair_elem<_Tp, _Idx, true> : private _Tp { using const_reference = const _Tp&; using __value_type = _Tp; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __compressed_pair_elem() = default; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __compressed_pair_elem(__default_init_tag) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __compressed_pair_elem(__value_init_tag) : __value_type() {} + _LIBCPP_HIDE_FROM_ABI explicit __compressed_pair_elem() = default; + _LIBCPP_HIDE_FROM_ABI explicit __compressed_pair_elem(__default_init_tag) {} + 
_LIBCPP_HIDE_FROM_ABI explicit __compressed_pair_elem(__value_init_tag) : __value_type() {} template >::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __compressed_pair_elem(_Up&& __u) - : __value_type(std::forward<_Up>(__u)) {} + _LIBCPP_HIDE_FROM_ABI explicit __compressed_pair_elem(_Up&& __u) : __value_type(std::forward<_Up>(__u)) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 reference __get() _NOEXCEPT { return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference __get() const _NOEXCEPT { return *this; } + _LIBCPP_HIDE_FROM_ABI reference __get() _NOEXCEPT { return *this; } + _LIBCPP_HIDE_FROM_ABI const_reference __get() const _NOEXCEPT { return *this; } }; template @@ -97,38 +95,32 @@ class __compressed_pair : private __compressed_pair_elem<_T1, 0>, private __comp __enable_if_t< __dependent_type, _Dummy>::value && __dependent_type, _Dummy>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __compressed_pair() - : _Base1(__value_init_tag()), _Base2(__value_init_tag()) {} + _LIBCPP_HIDE_FROM_ABI explicit __compressed_pair() : _Base1(__value_init_tag()), _Base2(__value_init_tag()) {} template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __compressed_pair(_U1&& __t1, _U2&& __t2) + _LIBCPP_HIDE_FROM_ABI explicit __compressed_pair(_U1&& __t1, _U2&& __t2) : _Base1(std::forward<_U1>(__t1)), _Base2(std::forward<_U2>(__t2)) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename _Base1::reference first() _NOEXCEPT { - return static_cast<_Base1&>(*this).__get(); - } + _LIBCPP_HIDE_FROM_ABI typename _Base1::reference first() _NOEXCEPT { return static_cast<_Base1&>(*this).__get(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR typename _Base1::const_reference first() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI typename _Base1::const_reference first() const _NOEXCEPT { return static_cast<_Base1 const&>(*this).__get(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename _Base2::reference 
second() _NOEXCEPT { - return static_cast<_Base2&>(*this).__get(); - } + _LIBCPP_HIDE_FROM_ABI typename _Base2::reference second() _NOEXCEPT { return static_cast<_Base2&>(*this).__get(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR typename _Base2::const_reference second() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI typename _Base2::const_reference second() const _NOEXCEPT { return static_cast<_Base2 const&>(*this).__get(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR static _Base1* __get_first_base(__compressed_pair* __pair) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static _Base1* __get_first_base(__compressed_pair* __pair) _NOEXCEPT { return static_cast<_Base1*>(__pair); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR static _Base2* __get_second_base(__compressed_pair* __pair) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static _Base2* __get_second_base(__compressed_pair* __pair) _NOEXCEPT { return static_cast<_Base2*>(__pair); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void swap(__compressed_pair& __x) - _NOEXCEPT_(__is_nothrow_swappable_v<_T1>&& __is_nothrow_swappable_v<_T2>) { + _LIBCPP_HIDE_FROM_ABI void swap(__compressed_pair& __x) { using std::swap; swap(first(), __x.first()); swap(second(), __x.second()); @@ -136,9 +128,7 @@ class __compressed_pair : private __compressed_pair_elem<_T1, 0>, private __comp }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void -swap(__compressed_pair<_T1, _T2>& __x, __compressed_pair<_T1, _T2>& __y) - _NOEXCEPT_(__is_nothrow_swappable_v<_T1>&& __is_nothrow_swappable_v<_T2>) { +inline _LIBCPP_HIDE_FROM_ABI void swap(__compressed_pair<_T1, _T2>& __x, __compressed_pair<_T1, _T2>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/__memory/construct_at.h b/libcxx/include/__cxx03/__memory/construct_at.h index b2f1a073fdc43..8d1089335de90 100644 --- a/libcxx/include/__cxx03/__memory/construct_at.h +++ b/libcxx/include/__cxx03/__memory/construct_at.h @@ -34,7 +34,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // construct_at 
template ()) _Tp(std::declval<_Args>()...))> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp* __construct_at(_Tp* __location, _Args&&... __args) { +_LIBCPP_HIDE_FROM_ABI _Tp* __construct_at(_Tp* __location, _Args&&... __args) { return _LIBCPP_ASSERT_NON_NULL(__location != nullptr, "null pointer given to construct_at"), ::new (std::__voidify(*__location)) _Tp(std::forward<_Args>(__args)...); } @@ -45,24 +45,23 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp* __construct_at(_Tp* __l // taking an array). template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator __destroy(_ForwardIterator, _ForwardIterator); +_LIBCPP_HIDE_FROM_ABI _ForwardIterator __destroy(_ForwardIterator, _ForwardIterator); template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __destroy_at(_Tp* __loc) { +_LIBCPP_HIDE_FROM_ABI void __destroy_at(_Tp* __loc) { _LIBCPP_ASSERT_NON_NULL(__loc != nullptr, "null pointer given to destroy_at"); __loc->~_Tp(); } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator -__destroy(_ForwardIterator __first, _ForwardIterator __last) { +_LIBCPP_HIDE_FROM_ABI _ForwardIterator __destroy(_ForwardIterator __first, _ForwardIterator __last) { for (; __first != __last; ++__first) std::__destroy_at(std::addressof(*__first)); return __first; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _BidirectionalIterator +_LIBCPP_HIDE_FROM_ABI _BidirectionalIterator __reverse_destroy(_BidirectionalIterator __first, _BidirectionalIterator __last) { while (__last != __first) { --__last; diff --git a/libcxx/include/__cxx03/__memory/pointer_traits.h b/libcxx/include/__cxx03/__memory/pointer_traits.h index b159515f9722a..d38004aca9dc4 100644 --- a/libcxx/include/__cxx03/__memory/pointer_traits.h +++ b/libcxx/include/__cxx03/__memory/pointer_traits.h @@ -124,7 +124,7 @@ struct __pointer_traits_impl<_Ptr, __void_t::value, __nat, element_type>& __r) { return 
pointer::pointer_to(__r); } @@ -148,7 +148,7 @@ struct _LIBCPP_TEMPLATE_VIS pointer_traits<_Tp*> { struct __nat {}; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static pointer + _LIBCPP_HIDE_FROM_ABI static pointer pointer_to(__conditional_t::value, __nat, element_type>& __r) _NOEXCEPT { return std::addressof(__r); } @@ -163,7 +163,7 @@ template struct __to_address_helper; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Tp* __to_address(_Tp* __p) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI _Tp* __to_address(_Tp* __p) _NOEXCEPT { static_assert(!is_function<_Tp>::value, "_Tp is a function type"); return __p; } @@ -188,16 +188,14 @@ struct _IsFancyPointer { // enable_if is needed here to avoid instantiating checks for fancy pointers on raw pointers template , _IsFancyPointer<_Pointer> >::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR __decay_t::__call(std::declval()))> +_LIBCPP_HIDE_FROM_ABI __decay_t::__call(std::declval()))> __to_address(const _Pointer& __p) _NOEXCEPT { return __to_address_helper<_Pointer>::__call(__p); } template struct __to_address_helper { - _LIBCPP_HIDE_FROM_ABI - _LIBCPP_CONSTEXPR static decltype(std::__to_address(std::declval().operator->())) + _LIBCPP_HIDE_FROM_ABI static decltype(std::__to_address(std::declval().operator->())) __call(const _Pointer& __p) _NOEXCEPT { return std::__to_address(__p.operator->()); } @@ -206,8 +204,7 @@ struct __to_address_helper { template struct __to_address_helper<_Pointer, decltype((void)pointer_traits<_Pointer>::to_address(std::declval()))> { - _LIBCPP_HIDE_FROM_ABI - _LIBCPP_CONSTEXPR static decltype(pointer_traits<_Pointer>::to_address(std::declval())) + _LIBCPP_HIDE_FROM_ABI static decltype(pointer_traits<_Pointer>::to_address(std::declval())) __call(const _Pointer& __p) _NOEXCEPT { return pointer_traits<_Pointer>::to_address(__p); } diff --git a/libcxx/include/__cxx03/__memory/raw_storage_iterator.h b/libcxx/include/__cxx03/__memory/raw_storage_iterator.h index 
cc1d8f5ce657f..0157b663c975e 100644 --- a/libcxx/include/__cxx03/__memory/raw_storage_iterator.h +++ b/libcxx/include/__cxx03/__memory/raw_storage_iterator.h @@ -29,8 +29,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD _LIBCPP_SUPPRESS_DEPRECATED_PUSH template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX17 raw_storage_iterator - : public iterator { +class _LIBCPP_TEMPLATE_VIS raw_storage_iterator : public iterator { _LIBCPP_SUPPRESS_DEPRECATED_POP private: diff --git a/libcxx/include/__cxx03/__memory/shared_ptr.h b/libcxx/include/__cxx03/__memory/shared_ptr.h index 91dd2fa5d1364..fb94d9d55596f 100644 --- a/libcxx/include/__cxx03/__memory/shared_ptr.h +++ b/libcxx/include/__cxx03/__memory/shared_ptr.h @@ -401,9 +401,9 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr { __shared_weak_count* __cntrl_; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR shared_ptr() _NOEXCEPT : __ptr_(nullptr), __cntrl_(nullptr) {} + _LIBCPP_HIDE_FROM_ABI shared_ptr() _NOEXCEPT : __ptr_(nullptr), __cntrl_(nullptr) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR shared_ptr(nullptr_t) _NOEXCEPT : __ptr_(nullptr), __cntrl_(nullptr) {} + _LIBCPP_HIDE_FROM_ABI shared_ptr(nullptr_t) _NOEXCEPT : __ptr_(nullptr), __cntrl_(nullptr) {} template >::value, int> = 0> _LIBCPP_HIDE_FROM_ABI explicit shared_ptr(_Yp* __p) : __ptr_(__p) { @@ -661,7 +661,7 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr { _LIBCPP_HIDE_FROM_ABI long use_count() const _NOEXCEPT { return __cntrl_ ? 
__cntrl_->use_count() : 0; } - _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_HIDE_FROM_ABI bool unique() const _NOEXCEPT { return use_count() == 1; } + _LIBCPP_HIDE_FROM_ABI bool unique() const _NOEXCEPT { return use_count() == 1; } _LIBCPP_HIDE_FROM_ABI explicit operator bool() const _NOEXCEPT { return get() != nullptr; } @@ -892,7 +892,7 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS weak_ptr { __shared_weak_count* __cntrl_; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR weak_ptr() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI weak_ptr() _NOEXCEPT; template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI weak_ptr(shared_ptr<_Yp> const& __r) _NOEXCEPT; @@ -942,7 +942,7 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS weak_ptr { }; template -inline _LIBCPP_CONSTEXPR weak_ptr<_Tp>::weak_ptr() _NOEXCEPT : __ptr_(nullptr), __cntrl_(nullptr) {} +inline weak_ptr<_Tp>::weak_ptr() _NOEXCEPT : __ptr_(nullptr), __cntrl_(nullptr) {} template inline weak_ptr<_Tp>::weak_ptr(weak_ptr const& __r) _NOEXCEPT : __ptr_(__r.__ptr_), __cntrl_(__r.__cntrl_) { @@ -1076,7 +1076,7 @@ class _LIBCPP_TEMPLATE_VIS enable_shared_from_this { mutable weak_ptr<_Tp> __weak_this_; protected: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR enable_shared_from_this() _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI enable_shared_from_this() _NOEXCEPT {} _LIBCPP_HIDE_FROM_ABI enable_shared_from_this(enable_shared_from_this const&) _NOEXCEPT {} _LIBCPP_HIDE_FROM_ABI enable_shared_from_this& operator=(enable_shared_from_this const&) _NOEXCEPT { return *this; } _LIBCPP_HIDE_FROM_ABI ~enable_shared_from_this() {} @@ -1094,8 +1094,8 @@ struct _LIBCPP_TEMPLATE_VIS hash; template struct _LIBCPP_TEMPLATE_VIS hash > { - _LIBCPP_DEPRECATED_IN_CXX17 typedef shared_ptr<_Tp> argument_type; - _LIBCPP_DEPRECATED_IN_CXX17 typedef size_t result_type; + typedef shared_ptr<_Tp> argument_type; + typedef size_t result_type; _LIBCPP_HIDE_FROM_ABI size_t operator()(const shared_ptr<_Tp>& __ptr) const _NOEXCEPT { return 
hash::element_type*>()(__ptr.get()); @@ -1116,7 +1116,7 @@ class _LIBCPP_EXPORTED_FROM_ABI __sp_mut { void unlock() _NOEXCEPT; private: - _LIBCPP_CONSTEXPR __sp_mut(void*) _NOEXCEPT; + __sp_mut(void*) _NOEXCEPT; __sp_mut(const __sp_mut&); __sp_mut& operator=(const __sp_mut&); diff --git a/libcxx/include/__cxx03/__memory/swap_allocator.h b/libcxx/include/__cxx03/__memory/swap_allocator.h index 9288c0589aa43..8659145ece024 100644 --- a/libcxx/include/__cxx03/__memory/swap_allocator.h +++ b/libcxx/include/__cxx03/__memory/swap_allocator.h @@ -22,19 +22,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void __swap_allocator(_Alloc& __a1, _Alloc& __a2, true_type) - _NOEXCEPT_(__is_nothrow_swappable_v<_Alloc>) { +_LIBCPP_HIDE_FROM_ABI void __swap_allocator(_Alloc& __a1, _Alloc& __a2, true_type) { using std::swap; swap(__a1, __a2); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void -__swap_allocator(_Alloc&, _Alloc&, false_type) _NOEXCEPT {} +inline _LIBCPP_HIDE_FROM_ABI void __swap_allocator(_Alloc&, _Alloc&, false_type) _NOEXCEPT {} template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void __swap_allocator(_Alloc& __a1, _Alloc& __a2) - _NOEXCEPT_(__is_nothrow_swappable_v<_Alloc>) { +inline _LIBCPP_HIDE_FROM_ABI void __swap_allocator(_Alloc& __a1, _Alloc& __a2) { std::__swap_allocator( __a1, __a2, integral_constant::propagate_on_container_swap::value>()); } diff --git a/libcxx/include/__cxx03/__memory/temp_value.h b/libcxx/include/__cxx03/__memory/temp_value.h index 159f6a1df4066..b79c0a740d643 100644 --- a/libcxx/include/__cxx03/__memory/temp_value.h +++ b/libcxx/include/__cxx03/__memory/temp_value.h @@ -28,19 +28,16 @@ struct __temp_value { typename aligned_storage::type __v; _Alloc& __a; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp* __addr() { - return reinterpret_cast<_Tp*>(std::addressof(__v)); - } + _LIBCPP_HIDE_FROM_ABI _Tp* __addr() { return 
reinterpret_cast<_Tp*>(std::addressof(__v)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& get() { return *__addr(); } + _LIBCPP_HIDE_FROM_ABI _Tp& get() { return *__addr(); } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI _LIBCPP_CONSTEXPR_SINCE_CXX20 __temp_value(_Alloc& __alloc, _Args&&... __args) - : __a(__alloc) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI __temp_value(_Alloc& __alloc, _Args&&... __args) : __a(__alloc) { _Traits::construct(__a, __addr(), std::forward<_Args>(__args)...); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~__temp_value() { _Traits::destroy(__a, __addr()); } + _LIBCPP_HIDE_FROM_ABI ~__temp_value() { _Traits::destroy(__a, __addr()); } }; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/temporary_buffer.h b/libcxx/include/__cxx03/__memory/temporary_buffer.h index 3f584a7337f06..60c12379905c8 100644 --- a/libcxx/include/__cxx03/__memory/temporary_buffer.h +++ b/libcxx/include/__cxx03/__memory/temporary_buffer.h @@ -22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI _LIBCPP_DEPRECATED_IN_CXX17 pair<_Tp*, ptrdiff_t> +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI pair<_Tp*, ptrdiff_t> get_temporary_buffer(ptrdiff_t __n) _NOEXCEPT { pair<_Tp*, ptrdiff_t> __r(0, 0); const ptrdiff_t __m = @@ -57,7 +57,7 @@ get_temporary_buffer(ptrdiff_t __n) _NOEXCEPT { } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_DEPRECATED_IN_CXX17 void return_temporary_buffer(_Tp* __p) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void return_temporary_buffer(_Tp* __p) _NOEXCEPT { std::__libcpp_deallocate_unsized((void*)__p, _LIBCPP_ALIGNOF(_Tp)); } diff --git a/libcxx/include/__cxx03/__memory/uninitialized_algorithms.h b/libcxx/include/__cxx03/__memory/uninitialized_algorithms.h index ff9223d345b29..17c30b4fbdd72 100644 --- a/libcxx/include/__cxx03/__memory/uninitialized_algorithms.h +++ b/libcxx/include/__cxx03/__memory/uninitialized_algorithms.h @@ -48,7 +48,7 @@ 
_LIBCPP_BEGIN_NAMESPACE_STD struct __always_false { template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator()(_Args&&...) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI bool operator()(_Args&&...) const _NOEXCEPT { return false; } }; @@ -171,8 +171,7 @@ uninitialized_fill_n(_ForwardIterator __first, _Size __n, const _Tp& __x) { // Destroy all elements in [__first, __last) from left to right using allocator destruction. template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -__allocator_destroy(_Alloc& __alloc, _Iter __first, _Sent __last) { +_LIBCPP_HIDE_FROM_ABI void __allocator_destroy(_Alloc& __alloc, _Iter __first, _Sent __last) { for (; __first != __last; ++__first) allocator_traits<_Alloc>::destroy(__alloc, std::__to_address(__first)); } @@ -180,11 +179,10 @@ __allocator_destroy(_Alloc& __alloc, _Iter __first, _Sent __last) { template class _AllocatorDestroyRangeReverse { public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 - _AllocatorDestroyRangeReverse(_Alloc& __alloc, _Iter& __first, _Iter& __last) + _LIBCPP_HIDE_FROM_ABI _AllocatorDestroyRangeReverse(_Alloc& __alloc, _Iter& __first, _Iter& __last) : __alloc_(__alloc), __first_(__first), __last_(__last) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void operator()() const { + _LIBCPP_HIDE_FROM_ABI void operator()() const { std::__allocator_destroy(__alloc_, std::reverse_iterator<_Iter>(__last_), std::reverse_iterator<_Iter>(__first_)); } @@ -199,7 +197,7 @@ class _AllocatorDestroyRangeReverse { // The caller has to ensure that __first2 can hold at least N uninitialized elements. If an exception is thrown the // already copied elements are destroyed in reverse order of their construction. 
template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter2 +_LIBCPP_HIDE_FROM_ABI _Iter2 __uninitialized_allocator_copy_impl(_Alloc& __alloc, _Iter1 __first1, _Sent1 __last1, _Iter2 __first2) { auto __destruct_first = __first2; auto __guard = @@ -229,8 +227,7 @@ template , __remove_const_t<_Out> >::value && __allocator_has_trivial_copy_construct<_Alloc, _RawTypeIn>::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Out* -__uninitialized_allocator_copy_impl(_Alloc&, _In* __first1, _In* __last1, _Out* __first2) { +_LIBCPP_HIDE_FROM_ABI _Out* __uninitialized_allocator_copy_impl(_Alloc&, _In* __first1, _In* __last1, _Out* __first2) { // TODO: Remove the const_cast once we drop support for std::allocator if (__libcpp_is_constant_evaluated()) { while (__first1 != __last1) { @@ -245,7 +242,7 @@ __uninitialized_allocator_copy_impl(_Alloc&, _In* __first1, _In* __last1, _Out* } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter2 +_LIBCPP_HIDE_FROM_ABI _Iter2 __uninitialized_allocator_copy(_Alloc& __alloc, _Iter1 __first1, _Sent1 __last1, _Iter2 __first2) { auto __unwrapped_range = std::__unwrap_range(__first1, __last1); auto __result = std::__uninitialized_allocator_copy_impl( @@ -278,7 +275,7 @@ struct __allocator_has_trivial_destroy, _Up> : true_type {}; // - is_copy_constructible<_Tp> // - __libcpp_is_trivially_relocatable<_Tp> template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void +_LIBCPP_HIDE_FROM_ABI void __uninitialized_allocator_relocate(_Alloc& __alloc, _Tp* __first, _Tp* __last, _Tp* __result) { static_assert(__is_cpp17_move_insertable<_Alloc>::value, "The specified type does not meet the requirements of Cpp17MoveInsertable"); diff --git a/libcxx/include/__cxx03/__memory/unique_ptr.h b/libcxx/include/__cxx03/__memory/unique_ptr.h index 0452b70b37144..2a9b330ec0d39 100644 --- a/libcxx/include/__cxx03/__memory/unique_ptr.h +++ b/libcxx/include/__cxx03/__memory/unique_ptr.h @@ -54,9 +54,9 @@ struct 
_LIBCPP_TEMPLATE_VIS default_delete { static_assert(!is_function<_Tp>::value, "default_delete cannot be instantiated for function types"); _LIBCPP_HIDE_FROM_ABI default_delete() {} template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 default_delete(const default_delete<_Up>&) _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI default_delete(const default_delete<_Up>&) _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator()(_Tp* __ptr) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void operator()(_Tp* __ptr) const _NOEXCEPT { static_assert(sizeof(_Tp) >= 0, "cannot delete an incomplete type"); static_assert(!is_void<_Tp>::value, "cannot delete an incomplete type"); delete __ptr; @@ -73,12 +73,11 @@ struct _LIBCPP_TEMPLATE_VIS default_delete<_Tp[]> { _LIBCPP_HIDE_FROM_ABI default_delete() {} template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 + _LIBCPP_HIDE_FROM_ABI default_delete(const default_delete<_Up[]>&, typename _EnableIfConvertible<_Up>::type* = 0) _NOEXCEPT {} template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 typename _EnableIfConvertible<_Up>::type - operator()(_Up* __ptr) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI typename _EnableIfConvertible<_Up>::type operator()(_Up* __ptr) const _NOEXCEPT { static_assert(sizeof(_Up) >= 0, "cannot delete an incomplete type"); delete[] __ptr; } @@ -167,44 +166,40 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS unique_ptr { public: template > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR unique_ptr() _NOEXCEPT : __ptr_(__value_init_tag(), __value_init_tag()) {} + _LIBCPP_HIDE_FROM_ABI unique_ptr() _NOEXCEPT : __ptr_(__value_init_tag(), __value_init_tag()) {} template > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR unique_ptr(nullptr_t) _NOEXCEPT - : __ptr_(__value_init_tag(), __value_init_tag()) {} + _LIBCPP_HIDE_FROM_ABI unique_ptr(nullptr_t) _NOEXCEPT : __ptr_(__value_init_tag(), __value_init_tag()) {} template > - _LIBCPP_HIDE_FROM_ABI - 
_LIBCPP_CONSTEXPR_SINCE_CXX23 explicit unique_ptr(pointer __p) _NOEXCEPT : __ptr_(__p, __value_init_tag()) {} + _LIBCPP_HIDE_FROM_ABI explicit unique_ptr(pointer __p) _NOEXCEPT : __ptr_(__p, __value_init_tag()) {} template > > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr(pointer __p, _LValRefType<_Dummy> __d) _NOEXCEPT - : __ptr_(__p, __d) {} + _LIBCPP_HIDE_FROM_ABI unique_ptr(pointer __p, _LValRefType<_Dummy> __d) _NOEXCEPT : __ptr_(__p, __d) {} template > > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr(pointer __p, _GoodRValRefType<_Dummy> __d) _NOEXCEPT - : __ptr_(__p, std::move(__d)) { + _LIBCPP_HIDE_FROM_ABI unique_ptr(pointer __p, _GoodRValRefType<_Dummy> __d) _NOEXCEPT : __ptr_(__p, std::move(__d)) { static_assert(!is_reference::value, "rvalue deleter bound to reference"); } template > > _LIBCPP_HIDE_FROM_ABI unique_ptr(pointer __p, _BadRValRefType<_Dummy> __d) = delete; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr(unique_ptr&& __u) _NOEXCEPT + _LIBCPP_HIDE_FROM_ABI unique_ptr(unique_ptr&& __u) _NOEXCEPT : __ptr_(__u.release(), std::forward(__u.get_deleter())) {} template , _Up>, class = _EnableIfDeleterConvertible<_Ep> > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr(unique_ptr<_Up, _Ep>&& __u) _NOEXCEPT + _LIBCPP_HIDE_FROM_ABI unique_ptr(unique_ptr<_Up, _Ep>&& __u) _NOEXCEPT : __ptr_(__u.release(), std::forward<_Ep>(__u.get_deleter())) {} template ::value && is_same<_Dp, default_delete<_Tp> >::value, int> = 0> _LIBCPP_HIDE_FROM_ABI unique_ptr(auto_ptr<_Up>&& __p) _NOEXCEPT : __ptr_(__p.release(), __value_init_tag()) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr& operator=(unique_ptr&& __u) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI unique_ptr& operator=(unique_ptr&& __u) _NOEXCEPT { reset(__u.release()); __ptr_.second() = std::forward(__u.get_deleter()); return *this; @@ -214,7 +209,7 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS unique_ptr { class 
_Ep, class = _EnableIfMoveConvertible, _Up>, class = _EnableIfDeleterAssignable<_Ep> > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr& operator=(unique_ptr<_Up, _Ep>&& __u) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI unique_ptr& operator=(unique_ptr<_Up, _Ep>&& __u) _NOEXCEPT { reset(__u.release()); __ptr_.second() = std::forward<_Ep>(__u.get_deleter()); return *this; @@ -230,41 +225,34 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS unique_ptr { unique_ptr(unique_ptr const&) = delete; unique_ptr& operator=(unique_ptr const&) = delete; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 ~unique_ptr() { reset(); } + _LIBCPP_HIDE_FROM_ABI ~unique_ptr() { reset(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr& operator=(nullptr_t) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI unique_ptr& operator=(nullptr_t) _NOEXCEPT { reset(); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __add_lvalue_reference_t<_Tp> operator*() const - _NOEXCEPT_(__is_noexcept_deref_or_void::value) { - return *__ptr_.first(); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 pointer operator->() const _NOEXCEPT { return __ptr_.first(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 pointer get() const _NOEXCEPT { return __ptr_.first(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 deleter_type& get_deleter() _NOEXCEPT { return __ptr_.second(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 const deleter_type& get_deleter() const _NOEXCEPT { - return __ptr_.second(); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 explicit operator bool() const _NOEXCEPT { - return __ptr_.first() != nullptr; - } + _LIBCPP_HIDE_FROM_ABI __add_lvalue_reference_t<_Tp> operator*() const { return *__ptr_.first(); } + _LIBCPP_HIDE_FROM_ABI pointer operator->() const _NOEXCEPT { return __ptr_.first(); } + _LIBCPP_HIDE_FROM_ABI pointer get() const _NOEXCEPT { return __ptr_.first(); } + _LIBCPP_HIDE_FROM_ABI 
deleter_type& get_deleter() _NOEXCEPT { return __ptr_.second(); } + _LIBCPP_HIDE_FROM_ABI const deleter_type& get_deleter() const _NOEXCEPT { return __ptr_.second(); } + _LIBCPP_HIDE_FROM_ABI explicit operator bool() const _NOEXCEPT { return __ptr_.first() != nullptr; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 pointer release() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI pointer release() _NOEXCEPT { pointer __t = __ptr_.first(); __ptr_.first() = pointer(); return __t; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void reset(pointer __p = pointer()) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void reset(pointer __p = pointer()) _NOEXCEPT { pointer __tmp = __ptr_.first(); __ptr_.first() = __p; if (__tmp) __ptr_.second()(__tmp); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void swap(unique_ptr& __u) _NOEXCEPT { __ptr_.swap(__u.__ptr_); } + _LIBCPP_HIDE_FROM_ABI void swap(unique_ptr& __u) _NOEXCEPT { __ptr_.swap(__u.__ptr_); } }; template @@ -335,41 +323,36 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS unique_ptr<_Tp[], _Dp> public: template > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR unique_ptr() _NOEXCEPT : __ptr_(__value_init_tag(), __value_init_tag()) {} + _LIBCPP_HIDE_FROM_ABI unique_ptr() _NOEXCEPT : __ptr_(__value_init_tag(), __value_init_tag()) {} template > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR unique_ptr(nullptr_t) _NOEXCEPT - : __ptr_(__value_init_tag(), __value_init_tag()) {} + _LIBCPP_HIDE_FROM_ABI unique_ptr(nullptr_t) _NOEXCEPT : __ptr_(__value_init_tag(), __value_init_tag()) {} template , class = _EnableIfPointerConvertible<_Pp> > - _LIBCPP_HIDE_FROM_ABI - _LIBCPP_CONSTEXPR_SINCE_CXX23 explicit unique_ptr(_Pp __p) _NOEXCEPT : __ptr_(__p, __value_init_tag()) {} + _LIBCPP_HIDE_FROM_ABI explicit unique_ptr(_Pp __p) _NOEXCEPT : __ptr_(__p, __value_init_tag()) {} template >, class = _EnableIfPointerConvertible<_Pp> > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr(_Pp __p, _LValRefType<_Dummy> __d) 
_NOEXCEPT - : __ptr_(__p, __d) {} + _LIBCPP_HIDE_FROM_ABI unique_ptr(_Pp __p, _LValRefType<_Dummy> __d) _NOEXCEPT : __ptr_(__p, __d) {} template > > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr(nullptr_t, _LValRefType<_Dummy> __d) _NOEXCEPT - : __ptr_(nullptr, __d) {} + _LIBCPP_HIDE_FROM_ABI unique_ptr(nullptr_t, _LValRefType<_Dummy> __d) _NOEXCEPT : __ptr_(nullptr, __d) {} template >, class = _EnableIfPointerConvertible<_Pp> > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr(_Pp __p, _GoodRValRefType<_Dummy> __d) _NOEXCEPT - : __ptr_(__p, std::move(__d)) { + _LIBCPP_HIDE_FROM_ABI unique_ptr(_Pp __p, _GoodRValRefType<_Dummy> __d) _NOEXCEPT : __ptr_(__p, std::move(__d)) { static_assert(!is_reference::value, "rvalue deleter bound to reference"); } template > > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr(nullptr_t, _GoodRValRefType<_Dummy> __d) _NOEXCEPT + _LIBCPP_HIDE_FROM_ABI unique_ptr(nullptr_t, _GoodRValRefType<_Dummy> __d) _NOEXCEPT : __ptr_(nullptr, std::move(__d)) { static_assert(!is_reference::value, "rvalue deleter bound to reference"); } @@ -380,10 +363,10 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS unique_ptr<_Tp[], _Dp> class = _EnableIfPointerConvertible<_Pp> > _LIBCPP_HIDE_FROM_ABI unique_ptr(_Pp __p, _BadRValRefType<_Dummy> __d) = delete; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr(unique_ptr&& __u) _NOEXCEPT + _LIBCPP_HIDE_FROM_ABI unique_ptr(unique_ptr&& __u) _NOEXCEPT : __ptr_(__u.release(), std::forward(__u.get_deleter())) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr& operator=(unique_ptr&& __u) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI unique_ptr& operator=(unique_ptr&& __u) _NOEXCEPT { reset(__u.release()); __ptr_.second() = std::forward(__u.get_deleter()); return *this; @@ -393,14 +376,14 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS unique_ptr<_Tp[], _Dp> class _Ep, class = _EnableIfMoveConvertible, _Up>, class = 
_EnableIfDeleterConvertible<_Ep> > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr(unique_ptr<_Up, _Ep>&& __u) _NOEXCEPT + _LIBCPP_HIDE_FROM_ABI unique_ptr(unique_ptr<_Up, _Ep>&& __u) _NOEXCEPT : __ptr_(__u.release(), std::forward<_Ep>(__u.get_deleter())) {} template , _Up>, class = _EnableIfDeleterAssignable<_Ep> > - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr& operator=(unique_ptr<_Up, _Ep>&& __u) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI unique_ptr& operator=(unique_ptr<_Up, _Ep>&& __u) _NOEXCEPT { reset(__u.release()); __ptr_.second() = std::forward<_Ep>(__u.get_deleter()); return *this; @@ -410,60 +393,52 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS unique_ptr<_Tp[], _Dp> unique_ptr& operator=(unique_ptr const&) = delete; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 ~unique_ptr() { reset(); } + _LIBCPP_HIDE_FROM_ABI ~unique_ptr() { reset(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unique_ptr& operator=(nullptr_t) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI unique_ptr& operator=(nullptr_t) _NOEXCEPT { reset(); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __add_lvalue_reference_t<_Tp> operator[](size_t __i) const { - return __ptr_.first()[__i]; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 pointer get() const _NOEXCEPT { return __ptr_.first(); } + _LIBCPP_HIDE_FROM_ABI __add_lvalue_reference_t<_Tp> operator[](size_t __i) const { return __ptr_.first()[__i]; } + _LIBCPP_HIDE_FROM_ABI pointer get() const _NOEXCEPT { return __ptr_.first(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 deleter_type& get_deleter() _NOEXCEPT { return __ptr_.second(); } + _LIBCPP_HIDE_FROM_ABI deleter_type& get_deleter() _NOEXCEPT { return __ptr_.second(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 const deleter_type& get_deleter() const _NOEXCEPT { - return __ptr_.second(); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 explicit operator bool() 
const _NOEXCEPT { - return __ptr_.first() != nullptr; - } + _LIBCPP_HIDE_FROM_ABI const deleter_type& get_deleter() const _NOEXCEPT { return __ptr_.second(); } + _LIBCPP_HIDE_FROM_ABI explicit operator bool() const _NOEXCEPT { return __ptr_.first() != nullptr; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 pointer release() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI pointer release() _NOEXCEPT { pointer __t = __ptr_.first(); __ptr_.first() = pointer(); return __t; } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void reset(_Pp __p) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void reset(_Pp __p) _NOEXCEPT { pointer __tmp = __ptr_.first(); __ptr_.first() = __p; if (__tmp) __ptr_.second()(__tmp); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void reset(nullptr_t = nullptr) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void reset(nullptr_t = nullptr) _NOEXCEPT { pointer __tmp = __ptr_.first(); __ptr_.first() = nullptr; if (__tmp) __ptr_.second()(__tmp); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void swap(unique_ptr& __u) _NOEXCEPT { __ptr_.swap(__u.__ptr_); } + _LIBCPP_HIDE_FROM_ABI void swap(unique_ptr& __u) _NOEXCEPT { __ptr_.swap(__u.__ptr_); } }; template , int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void -swap(unique_ptr<_Tp, _Dp>& __x, unique_ptr<_Tp, _Dp>& __y) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void swap(unique_ptr<_Tp, _Dp>& __x, unique_ptr<_Tp, _Dp>& __y) _NOEXCEPT { __x.swap(__y); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool -operator==(const unique_ptr<_T1, _D1>& __x, const unique_ptr<_T2, _D2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const unique_ptr<_T1, _D1>& __x, const unique_ptr<_T2, _D2>& __y) { return __x.get() == __y.get(); } @@ -496,8 +471,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const unique_ptr<_T1, _D1>& __x, co } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool -operator==(const 
unique_ptr<_T1, _D1>& __x, nullptr_t) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const unique_ptr<_T1, _D1>& __x, nullptr_t) _NOEXCEPT { return !__x; } @@ -517,44 +491,44 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator!=(nullptr_t, const unique_ptr<_T1, _D } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool operator<(const unique_ptr<_T1, _D1>& __x, nullptr_t) { +inline _LIBCPP_HIDE_FROM_ABI bool operator<(const unique_ptr<_T1, _D1>& __x, nullptr_t) { typedef typename unique_ptr<_T1, _D1>::pointer _P1; return less<_P1>()(__x.get(), nullptr); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool operator<(nullptr_t, const unique_ptr<_T1, _D1>& __x) { +inline _LIBCPP_HIDE_FROM_ABI bool operator<(nullptr_t, const unique_ptr<_T1, _D1>& __x) { typedef typename unique_ptr<_T1, _D1>::pointer _P1; return less<_P1>()(nullptr, __x.get()); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool operator>(const unique_ptr<_T1, _D1>& __x, nullptr_t) { +inline _LIBCPP_HIDE_FROM_ABI bool operator>(const unique_ptr<_T1, _D1>& __x, nullptr_t) { return nullptr < __x; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool operator>(nullptr_t, const unique_ptr<_T1, _D1>& __x) { +inline _LIBCPP_HIDE_FROM_ABI bool operator>(nullptr_t, const unique_ptr<_T1, _D1>& __x) { return __x < nullptr; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool operator<=(const unique_ptr<_T1, _D1>& __x, nullptr_t) { +inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const unique_ptr<_T1, _D1>& __x, nullptr_t) { return !(nullptr < __x); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool operator<=(nullptr_t, const unique_ptr<_T1, _D1>& __x) { +inline _LIBCPP_HIDE_FROM_ABI bool operator<=(nullptr_t, const unique_ptr<_T1, _D1>& __x) { return !(__x < nullptr); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool operator>=(const unique_ptr<_T1, 
_D1>& __x, nullptr_t) { +inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const unique_ptr<_T1, _D1>& __x, nullptr_t) { return !(__x < nullptr); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool operator>=(nullptr_t, const unique_ptr<_T1, _D1>& __x) { +inline _LIBCPP_HIDE_FROM_ABI bool operator>=(nullptr_t, const unique_ptr<_T1, _D1>& __x) { return !(nullptr < __x); } @@ -563,8 +537,8 @@ struct _LIBCPP_TEMPLATE_VIS hash; template struct _LIBCPP_TEMPLATE_VIS hash > { - _LIBCPP_DEPRECATED_IN_CXX17 typedef unique_ptr<_Tp, _Dp> argument_type; - _LIBCPP_DEPRECATED_IN_CXX17 typedef size_t result_type; + typedef unique_ptr<_Tp, _Dp> argument_type; + typedef size_t result_type; _LIBCPP_HIDE_FROM_ABI size_t operator()(const unique_ptr<_Tp, _Dp>& __ptr) const { typedef typename unique_ptr<_Tp, _Dp>::pointer pointer; diff --git a/libcxx/include/__cxx03/__memory/voidify.h b/libcxx/include/__cxx03/__memory/voidify.h index 63e03b7d7f8f8..842fb8d4e7a0f 100644 --- a/libcxx/include/__cxx03/__memory/voidify.h +++ b/libcxx/include/__cxx03/__memory/voidify.h @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void* __voidify(_Tp& __from) { +_LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI void* __voidify(_Tp& __from) { // Cast away cv-qualifiers to allow modifying elements of a range through const iterators. 
return const_cast(static_cast(std::addressof(__from))); } diff --git a/libcxx/include/__cxx03/__mutex/mutex.h b/libcxx/include/__cxx03/__mutex/mutex.h index 46c7546f77e32..644131a047fa1 100644 --- a/libcxx/include/__cxx03/__mutex/mutex.h +++ b/libcxx/include/__cxx03/__mutex/mutex.h @@ -25,7 +25,7 @@ class _LIBCPP_EXPORTED_FROM_ABI _LIBCPP_THREAD_SAFETY_ANNOTATION(capability("mut __libcpp_mutex_t __m_ = _LIBCPP_MUTEX_INITIALIZER; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR mutex() = default; + _LIBCPP_HIDE_FROM_ABI mutex() = default; mutex(const mutex&) = delete; mutex& operator=(const mutex&) = delete; diff --git a/libcxx/include/__cxx03/__mutex/once_flag.h b/libcxx/include/__cxx03/__mutex/once_flag.h index 0345bfab8184b..eaeaa6121bff8 100644 --- a/libcxx/include/__cxx03/__mutex/once_flag.h +++ b/libcxx/include/__cxx03/__mutex/once_flag.h @@ -36,7 +36,7 @@ template _LIBCPP_HIDE_FROM_ABI void call_once(once_flag&, const _Callable&); struct _LIBCPP_TEMPLATE_VIS once_flag { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR once_flag() _NOEXCEPT : __state_(_Unset) {} + _LIBCPP_HIDE_FROM_ABI once_flag() _NOEXCEPT : __state_(_Unset) {} once_flag(const once_flag&) = delete; once_flag& operator=(const once_flag&) = delete; diff --git a/libcxx/include/__cxx03/__numeric/accumulate.h b/libcxx/include/__cxx03/__numeric/accumulate.h index 3ac3419201442..0b1a0a9bf9d15 100644 --- a/libcxx/include/__cxx03/__numeric/accumulate.h +++ b/libcxx/include/__cxx03/__numeric/accumulate.h @@ -23,15 +23,14 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp -accumulate(_InputIterator __first, _InputIterator __last, _Tp __init) { +_LIBCPP_HIDE_FROM_ABI _Tp accumulate(_InputIterator __first, _InputIterator __last, _Tp __init) { for (; __first != __last; ++__first) __init = __init + *__first; return __init; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp +_LIBCPP_HIDE_FROM_ABI _Tp accumulate(_InputIterator 
__first, _InputIterator __last, _Tp __init, _BinaryOperation __binary_op) { for (; __first != __last; ++__first) __init = __binary_op(__init, *__first); diff --git a/libcxx/include/__cxx03/__numeric/adjacent_difference.h b/libcxx/include/__cxx03/__numeric/adjacent_difference.h index 4fd33cabde8aa..90ef7af1543f8 100644 --- a/libcxx/include/__cxx03/__numeric/adjacent_difference.h +++ b/libcxx/include/__cxx03/__numeric/adjacent_difference.h @@ -24,7 +24,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +_LIBCPP_HIDE_FROM_ABI _OutputIterator adjacent_difference(_InputIterator __first, _InputIterator __last, _OutputIterator __result) { if (__first != __last) { typename iterator_traits<_InputIterator>::value_type __acc(*__first); @@ -32,14 +32,14 @@ adjacent_difference(_InputIterator __first, _InputIterator __last, _OutputIterat for (++__first, (void)++__result; __first != __last; ++__first, (void)++__result) { typename iterator_traits<_InputIterator>::value_type __val(*__first); *__result = __val - __acc; - __acc = std::move(__val); + __acc = std::move(__val); } } return __result; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator adjacent_difference( +_LIBCPP_HIDE_FROM_ABI _OutputIterator adjacent_difference( _InputIterator __first, _InputIterator __last, _OutputIterator __result, _BinaryOperation __binary_op) { if (__first != __last) { typename iterator_traits<_InputIterator>::value_type __acc(*__first); @@ -47,7 +47,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator adjacent_dif for (++__first, (void)++__result; __first != __last; ++__first, (void)++__result) { typename iterator_traits<_InputIterator>::value_type __val(*__first); *__result = __binary_op(__val, __acc); - __acc = std::move(__val); + __acc = std::move(__val); } } return __result; diff --git a/libcxx/include/__cxx03/__numeric/inner_product.h 
b/libcxx/include/__cxx03/__numeric/inner_product.h index 0d8483a836fba..39e09ba005e40 100644 --- a/libcxx/include/__cxx03/__numeric/inner_product.h +++ b/libcxx/include/__cxx03/__numeric/inner_product.h @@ -23,7 +23,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp +_LIBCPP_HIDE_FROM_ABI _Tp inner_product(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _Tp __init) { for (; __first1 != __last1; ++__first1, (void)++__first2) __init = __init + *__first1 * *__first2; @@ -31,7 +31,7 @@ inner_product(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp inner_product( +_LIBCPP_HIDE_FROM_ABI _Tp inner_product( _InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, diff --git a/libcxx/include/__cxx03/__numeric/iota.h b/libcxx/include/__cxx03/__numeric/iota.h index f97c124700ae8..3b1f70eef33da 100644 --- a/libcxx/include/__cxx03/__numeric/iota.h +++ b/libcxx/include/__cxx03/__numeric/iota.h @@ -19,8 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -iota(_ForwardIterator __first, _ForwardIterator __last, _Tp __value) { +_LIBCPP_HIDE_FROM_ABI void iota(_ForwardIterator __first, _ForwardIterator __last, _Tp __value) { for (; __first != __last; ++__first, (void)++__value) *__first = __value; } diff --git a/libcxx/include/__cxx03/__numeric/partial_sum.h b/libcxx/include/__cxx03/__numeric/partial_sum.h index f8689298bad46..e60f85069e542 100644 --- a/libcxx/include/__cxx03/__numeric/partial_sum.h +++ b/libcxx/include/__cxx03/__numeric/partial_sum.h @@ -24,7 +24,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +_LIBCPP_HIDE_FROM_ABI _OutputIterator partial_sum(_InputIterator __first, _InputIterator __last, _OutputIterator __result) { if (__first != 
__last) { typename iterator_traits<_InputIterator>::value_type __t(*__first); @@ -38,7 +38,7 @@ partial_sum(_InputIterator __first, _InputIterator __last, _OutputIterator __res } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator +_LIBCPP_HIDE_FROM_ABI _OutputIterator partial_sum(_InputIterator __first, _InputIterator __last, _OutputIterator __result, _BinaryOperation __binary_op) { if (__first != __last) { typename iterator_traits<_InputIterator>::value_type __t(*__first); diff --git a/libcxx/include/__cxx03/__random/clamp_to_integral.h b/libcxx/include/__cxx03/__random/clamp_to_integral.h index be89db8c6a807..baa1ca636a2b3 100644 --- a/libcxx/include/__cxx03/__random/clamp_to_integral.h +++ b/libcxx/include/__cxx03/__random/clamp_to_integral.h @@ -26,7 +26,7 @@ template ::digits > numeric_limits<_IntT>::digits), int _Bits = (numeric_limits<_IntT>::digits - numeric_limits<_FloatT>::digits)> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _IntT __max_representable_int_for_float() _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI _IntT __max_representable_int_for_float() _NOEXCEPT { static_assert(is_floating_point<_FloatT>::value, "must be a floating point type"); static_assert(is_integral<_IntT>::value, "must be an integral type"); static_assert(numeric_limits<_FloatT>::radix == 2, "FloatT has incorrect radix"); diff --git a/libcxx/include/__cxx03/__random/discard_block_engine.h b/libcxx/include/__cxx03/__random/discard_block_engine.h index 15bdbbbd5934f..c7e8d902262d6 100644 --- a/libcxx/include/__cxx03/__random/discard_block_engine.h +++ b/libcxx/include/__cxx03/__random/discard_block_engine.h @@ -40,14 +40,14 @@ class _LIBCPP_TEMPLATE_VIS discard_block_engine { typedef typename _Engine::result_type result_type; // engine characteristics - static _LIBCPP_CONSTEXPR const size_t block_size = __p; - static _LIBCPP_CONSTEXPR const size_t used_block = __r; + static const size_t block_size = __p; + static const size_t used_block = __r; static const result_type 
_Min = _Engine::_Min; static const result_type _Max = _Engine::_Max; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type min() { return _Engine::min(); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type max() { return _Engine::max(); } + _LIBCPP_HIDE_FROM_ABI static result_type min() { return _Engine::min(); } + _LIBCPP_HIDE_FROM_ABI static result_type max() { return _Engine::max(); } // constructors and seeding functions _LIBCPP_HIDE_FROM_ABI discard_block_engine() : __n_(0) {} @@ -100,10 +100,10 @@ class _LIBCPP_TEMPLATE_VIS discard_block_engine { }; template -_LIBCPP_CONSTEXPR const size_t discard_block_engine<_Engine, __p, __r>::block_size; +const size_t discard_block_engine<_Engine, __p, __r>::block_size; template -_LIBCPP_CONSTEXPR const size_t discard_block_engine<_Engine, __p, __r>::used_block; +const size_t discard_block_engine<_Engine, __p, __r>::used_block; template typename discard_block_engine<_Engine, __p, __r>::result_type discard_block_engine<_Engine, __p, __r>::operator()() { diff --git a/libcxx/include/__cxx03/__random/independent_bits_engine.h b/libcxx/include/__cxx03/__random/independent_bits_engine.h index 0fe41d17cb707..ba7c2c7f67c1c 100644 --- a/libcxx/include/__cxx03/__random/independent_bits_engine.h +++ b/libcxx/include/__cxx03/__random/independent_bits_engine.h @@ -34,13 +34,13 @@ template class _LIBCPP_TEMPLATE_VIS independent_bits_engine { template class __get_n { - static _LIBCPP_CONSTEXPR const size_t _Dt = numeric_limits<_UInt>::digits; - static _LIBCPP_CONSTEXPR const size_t _Np = _Wp / _Mp + (_Wp % _Mp != 0); - static _LIBCPP_CONSTEXPR const size_t _W0 = _Wp / _Np; - static _LIBCPP_CONSTEXPR const _UInt _Y0 = _W0 >= _Dt ? 0 : (_R0 >> _W0) << _W0; + static const size_t _Dt = numeric_limits<_UInt>::digits; + static const size_t _Np = _Wp / _Mp + (_Wp % _Mp != 0); + static const size_t _W0 = _Wp / _Np; + static const _UInt _Y0 = _W0 >= _Dt ? 
0 : (_R0 >> _W0) << _W0; public: - static _LIBCPP_CONSTEXPR const size_t value = _R0 - _Y0 > _Y0 / _Np ? _Np + 1 : _Np; + static const size_t value = _R0 - _Y0 > _Y0 / _Np ? _Np + 1 : _Np; }; public: @@ -50,36 +50,35 @@ class _LIBCPP_TEMPLATE_VIS independent_bits_engine { private: _Engine __e_; - static _LIBCPP_CONSTEXPR const result_type _Dt = numeric_limits::digits; + static const result_type _Dt = numeric_limits::digits; static_assert(0 < __w, "independent_bits_engine invalid parameters"); static_assert(__w <= _Dt, "independent_bits_engine invalid parameters"); typedef typename _Engine::result_type _Engine_result_type; typedef __conditional_t _Working_result_type; - static const _Working_result_type _Rp = _Engine::_Max - _Engine::_Min + _Working_result_type(1); - static _LIBCPP_CONSTEXPR const size_t __m = __log2<_Working_result_type, _Rp>::value; - static _LIBCPP_CONSTEXPR const size_t __n = __get_n<_Working_result_type, _Rp, __w, __m>::value; - static _LIBCPP_CONSTEXPR const size_t __w0 = __w / __n; - static _LIBCPP_CONSTEXPR const size_t __n0 = __n - __w % __n; - static _LIBCPP_CONSTEXPR const size_t _WDt = numeric_limits<_Working_result_type>::digits; - static _LIBCPP_CONSTEXPR const size_t _EDt = numeric_limits<_Engine_result_type>::digits; - static _LIBCPP_CONSTEXPR const _Working_result_type __y0 = __w0 >= _WDt ? 0 : (_Rp >> __w0) << __w0; - static _LIBCPP_CONSTEXPR const _Working_result_type __y1 = __w0 >= _WDt - 1 ? 0 : (_Rp >> (__w0 + 1)) << (__w0 + 1); - static _LIBCPP_CONSTEXPR const - _Engine_result_type __mask0 = __w0 > 0 ? _Engine_result_type(~0) >> (_EDt - __w0) : _Engine_result_type(0); - static _LIBCPP_CONSTEXPR const _Engine_result_type __mask1 = - __w0 < _EDt - 1 ? 
_Engine_result_type(~0) >> (_EDt - (__w0 + 1)) : _Engine_result_type(~0); + static const _Working_result_type _Rp = _Engine::_Max - _Engine::_Min + _Working_result_type(1); + static const size_t __m = __log2<_Working_result_type, _Rp>::value; + static const size_t __n = __get_n<_Working_result_type, _Rp, __w, __m>::value; + static const size_t __w0 = __w / __n; + static const size_t __n0 = __n - __w % __n; + static const size_t _WDt = numeric_limits<_Working_result_type>::digits; + static const size_t _EDt = numeric_limits<_Engine_result_type>::digits; + static const _Working_result_type __y0 = __w0 >= _WDt ? 0 : (_Rp >> __w0) << __w0; + static const _Working_result_type __y1 = __w0 >= _WDt - 1 ? 0 : (_Rp >> (__w0 + 1)) << (__w0 + 1); + static const _Engine_result_type + __mask0 = __w0 > 0 ? _Engine_result_type(~0) >> (_EDt - __w0) : _Engine_result_type(0); + static const _Engine_result_type + __mask1 = __w0 < _EDt - 1 ? _Engine_result_type(~0) >> (_EDt - (__w0 + 1)) : _Engine_result_type(~0); public: - static _LIBCPP_CONSTEXPR const result_type _Min = 0; - static _LIBCPP_CONSTEXPR const result_type _Max = - __w == _Dt ? result_type(~0) : (result_type(1) << __w) - result_type(1); + static const result_type _Min = 0; + static const result_type _Max = __w == _Dt ? 
result_type(~0) : (result_type(1) << __w) - result_type(1); static_assert(_Min < _Max, "independent_bits_engine invalid parameters"); // engine characteristics - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type min() { return _Min; } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type max() { return _Max; } + _LIBCPP_HIDE_FROM_ABI static result_type min() { return _Min; } + _LIBCPP_HIDE_FROM_ABI static result_type max() { return _Max; } // constructors and seeding functions _LIBCPP_HIDE_FROM_ABI independent_bits_engine() {} diff --git a/libcxx/include/__cxx03/__random/is_seed_sequence.h b/libcxx/include/__cxx03/__random/is_seed_sequence.h index ddb9c83bb7081..da73fb478af60 100644 --- a/libcxx/include/__cxx03/__random/is_seed_sequence.h +++ b/libcxx/include/__cxx03/__random/is_seed_sequence.h @@ -22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct __is_seed_sequence { - static _LIBCPP_CONSTEXPR const bool value = + static const bool value = !is_convertible<_Sseq, typename _Engine::result_type>::value && !is_same<__remove_cv_t<_Sseq>, _Engine>::value; }; diff --git a/libcxx/include/__cxx03/__random/linear_congruential_engine.h b/libcxx/include/__cxx03/__random/linear_congruential_engine.h index 100521c202e84..7e62a2a27497b 100644 --- a/libcxx/include/__cxx03/__random/linear_congruential_engine.h +++ b/libcxx/include/__cxx03/__random/linear_congruential_engine.h @@ -42,11 +42,7 @@ template // r <= q struct __lce_alg_picker { - static _LIBCPP_CONSTEXPR const __lce_alg_type __mode = - _Full ? _LCE_Full - : _Part ? _LCE_Part - : _Schrage ? _LCE_Schrage - : _LCE_Promote; + static const __lce_alg_type __mode = _Full ? _LCE_Full : _Part ? _LCE_Part : _Schrage ? 
_LCE_Schrage : _LCE_Promote; #ifdef _LIBCPP_HAS_NO_INT128 static_assert(_Mp != (unsigned long long)(-1) || _Full || _Part || _Schrage, @@ -239,24 +235,24 @@ class _LIBCPP_TEMPLATE_VIS linear_congruential_engine { private: result_type __x_; - static _LIBCPP_CONSTEXPR const result_type _Mp = result_type(-1); + static const result_type _Mp = result_type(-1); static_assert(__m == 0 || __a < __m, "linear_congruential_engine invalid parameters"); static_assert(__m == 0 || __c < __m, "linear_congruential_engine invalid parameters"); static_assert(is_unsigned<_UIntType>::value, "_UIntType must be unsigned type"); public: - static _LIBCPP_CONSTEXPR const result_type _Min = __c == 0u ? 1u : 0u; - static _LIBCPP_CONSTEXPR const result_type _Max = __m - _UIntType(1u); + static const result_type _Min = __c == 0u ? 1u : 0u; + static const result_type _Max = __m - _UIntType(1u); static_assert(_Min < _Max, "linear_congruential_engine invalid parameters"); // engine characteristics - static _LIBCPP_CONSTEXPR const result_type multiplier = __a; - static _LIBCPP_CONSTEXPR const result_type increment = __c; - static _LIBCPP_CONSTEXPR const result_type modulus = __m; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type min() { return _Min; } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type max() { return _Max; } - static _LIBCPP_CONSTEXPR const result_type default_seed = 1u; + static const result_type multiplier = __a; + static const result_type increment = __c; + static const result_type modulus = __m; + _LIBCPP_HIDE_FROM_ABI static result_type min() { return _Min; } + _LIBCPP_HIDE_FROM_ABI static result_type max() { return _Max; } + static const result_type default_seed = 1u; // constructors and seeding functions _LIBCPP_HIDE_FROM_ABI explicit linear_congruential_engine(result_type __s = default_seed) { seed(__s); } @@ -314,19 +310,19 @@ class _LIBCPP_TEMPLATE_VIS linear_congruential_engine { }; template -_LIBCPP_CONSTEXPR const typename 
linear_congruential_engine<_UIntType, __a, __c, __m>::result_type +const typename linear_congruential_engine<_UIntType, __a, __c, __m>::result_type linear_congruential_engine<_UIntType, __a, __c, __m>::multiplier; template -_LIBCPP_CONSTEXPR const typename linear_congruential_engine<_UIntType, __a, __c, __m>::result_type +const typename linear_congruential_engine<_UIntType, __a, __c, __m>::result_type linear_congruential_engine<_UIntType, __a, __c, __m>::increment; template -_LIBCPP_CONSTEXPR const typename linear_congruential_engine<_UIntType, __a, __c, __m>::result_type +const typename linear_congruential_engine<_UIntType, __a, __c, __m>::result_type linear_congruential_engine<_UIntType, __a, __c, __m>::modulus; template -_LIBCPP_CONSTEXPR const typename linear_congruential_engine<_UIntType, __a, __c, __m>::result_type +const typename linear_congruential_engine<_UIntType, __a, __c, __m>::result_type linear_congruential_engine<_UIntType, __a, __c, __m>::default_seed; template diff --git a/libcxx/include/__cxx03/__random/mersenne_twister_engine.h b/libcxx/include/__cxx03/__random/mersenne_twister_engine.h index 404b15c3b0766..2392568b89bdf 100644 --- a/libcxx/include/__cxx03/__random/mersenne_twister_engine.h +++ b/libcxx/include/__cxx03/__random/mersenne_twister_engine.h @@ -144,7 +144,7 @@ class _LIBCPP_TEMPLATE_VIS mersenne_twister_engine { static_assert(0 < __m, "mersenne_twister_engine invalid parameters"); static_assert(__m <= __n, "mersenne_twister_engine invalid parameters"); - static _LIBCPP_CONSTEXPR const result_type _Dt = numeric_limits::digits; + static const result_type _Dt = numeric_limits::digits; static_assert(__w <= _Dt, "mersenne_twister_engine invalid parameters"); static_assert(2 <= __w, "mersenne_twister_engine invalid parameters"); static_assert(__r <= __w, "mersenne_twister_engine invalid parameters"); @@ -154,9 +154,8 @@ class _LIBCPP_TEMPLATE_VIS mersenne_twister_engine { static_assert(__l <= __w, "mersenne_twister_engine invalid 
parameters"); public: - static _LIBCPP_CONSTEXPR const result_type _Min = 0; - static _LIBCPP_CONSTEXPR const result_type _Max = - __w == _Dt ? result_type(~0) : (result_type(1) << __w) - result_type(1); + static const result_type _Min = 0; + static const result_type _Max = __w == _Dt ? result_type(~0) : (result_type(1) << __w) - result_type(1); static_assert(_Min < _Max, "mersenne_twister_engine invalid parameters"); static_assert(__a <= _Max, "mersenne_twister_engine invalid parameters"); static_assert(__b <= _Max, "mersenne_twister_engine invalid parameters"); @@ -165,22 +164,22 @@ class _LIBCPP_TEMPLATE_VIS mersenne_twister_engine { static_assert(__f <= _Max, "mersenne_twister_engine invalid parameters"); // engine characteristics - static _LIBCPP_CONSTEXPR const size_t word_size = __w; - static _LIBCPP_CONSTEXPR const size_t state_size = __n; - static _LIBCPP_CONSTEXPR const size_t shift_size = __m; - static _LIBCPP_CONSTEXPR const size_t mask_bits = __r; - static _LIBCPP_CONSTEXPR const result_type xor_mask = __a; - static _LIBCPP_CONSTEXPR const size_t tempering_u = __u; - static _LIBCPP_CONSTEXPR const result_type tempering_d = __d; - static _LIBCPP_CONSTEXPR const size_t tempering_s = __s; - static _LIBCPP_CONSTEXPR const result_type tempering_b = __b; - static _LIBCPP_CONSTEXPR const size_t tempering_t = __t; - static _LIBCPP_CONSTEXPR const result_type tempering_c = __c; - static _LIBCPP_CONSTEXPR const size_t tempering_l = __l; - static _LIBCPP_CONSTEXPR const result_type initialization_multiplier = __f; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type min() { return _Min; } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type max() { return _Max; } - static _LIBCPP_CONSTEXPR const result_type default_seed = 5489u; + static const size_t word_size = __w; + static const size_t state_size = __n; + static const size_t shift_size = __m; + static const size_t mask_bits = __r; + static const result_type xor_mask = __a; + static const size_t 
tempering_u = __u; + static const result_type tempering_d = __d; + static const size_t tempering_s = __s; + static const result_type tempering_b = __b; + static const size_t tempering_t = __t; + static const result_type tempering_c = __c; + static const size_t tempering_l = __l; + static const result_type initialization_multiplier = __f; + _LIBCPP_HIDE_FROM_ABI static result_type min() { return _Min; } + _LIBCPP_HIDE_FROM_ABI static result_type max() { return _Max; } + static const result_type default_seed = 5489u; // constructors and seeding functions _LIBCPP_HIDE_FROM_ABI explicit mersenne_twister_engine(result_type __sd = default_seed) { seed(__sd); } @@ -318,7 +317,7 @@ template -_LIBCPP_CONSTEXPR const size_t +const size_t mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::word_size; template -_LIBCPP_CONSTEXPR const size_t +const size_t mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::state_size; template -_LIBCPP_CONSTEXPR const size_t +const size_t mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::shift_size; template -_LIBCPP_CONSTEXPR const size_t +const size_t mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::mask_bits; template -_LIBCPP_CONSTEXPR const typename mersenne_twister_engine< - _UIntType, - __w, - __n, - __m, - __r, - __a, - __u, - __d, - __s, - __b, - __t, - __c, - __l, - __f>::result_type - mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::xor_mask; +const typename mersenne_twister_engine< _UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>:: + result_type + mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::xor_mask; template -_LIBCPP_CONSTEXPR const size_t +const size_t mersenne_twister_engine<_UIntType, __w, __n, 
__m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::tempering_u; template -_LIBCPP_CONSTEXPR const typename mersenne_twister_engine< - _UIntType, - __w, - __n, - __m, - __r, - __a, - __u, - __d, - __s, - __b, - __t, - __c, - __l, - __f>::result_type - mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::tempering_d; +const typename mersenne_twister_engine< _UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>:: + result_type mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>:: + tempering_d; template -_LIBCPP_CONSTEXPR const size_t +const size_t mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::tempering_s; template -_LIBCPP_CONSTEXPR const typename mersenne_twister_engine< - _UIntType, - __w, - __n, - __m, - __r, - __a, - __u, - __d, - __s, - __b, - __t, - __c, - __l, - __f>::result_type - mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::tempering_b; +const typename mersenne_twister_engine< _UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>:: + result_type mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>:: + tempering_b; template -_LIBCPP_CONSTEXPR const size_t +const size_t mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::tempering_t; template -_LIBCPP_CONSTEXPR const typename mersenne_twister_engine< - _UIntType, - __w, - __n, - __m, - __r, - __a, - __u, - __d, - __s, - __b, - __t, - __c, - __l, - __f>::result_type - mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::tempering_c; +const typename mersenne_twister_engine< _UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>:: + result_type mersenne_twister_engine<_UIntType, __w, __n, __m, __r, 
__a, __u, __d, __s, __b, __t, __c, __l, __f>:: + tempering_c; template -_LIBCPP_CONSTEXPR const size_t +const size_t mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::tempering_l; template -_LIBCPP_CONSTEXPR const typename mersenne_twister_engine< - _UIntType, - __w, - __n, - __m, - __r, - __a, - __u, - __d, - __s, - __b, - __t, - __c, - __l, - __f>::result_type - mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>:: +const typename mersenne_twister_engine< _UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>:: + result_type mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>:: initialization_multiplier; template -_LIBCPP_CONSTEXPR const typename mersenne_twister_engine< - _UIntType, - __w, - __n, - __m, - __r, - __a, - __u, - __d, - __s, - __b, - __t, - __c, - __l, - __f>::result_type - mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>::default_seed; +const typename mersenne_twister_engine< _UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>:: + result_type mersenne_twister_engine<_UIntType, __w, __n, __m, __r, __a, __u, __d, __s, __b, __t, __c, __l, __f>:: + default_seed; template struct __ugcd { - static _LIBCPP_CONSTEXPR const uint64_t value = __ugcd<_Yp, _Xp % _Yp>::value; + static const uint64_t value = __ugcd<_Yp, _Xp % _Yp>::value; }; template struct __ugcd<_Xp, 0> { - static _LIBCPP_CONSTEXPR const uint64_t value = _Xp; + static const uint64_t value = _Xp; }; template class __uratio { static_assert(_Dp != 0, "__uratio divide by 0"); - static _LIBCPP_CONSTEXPR const uint64_t __gcd = __ugcd<_Np, _Dp>::value; + static const uint64_t __gcd = __ugcd<_Np, _Dp>::value; public: - static _LIBCPP_CONSTEXPR const uint64_t num = _Np / __gcd; - static _LIBCPP_CONSTEXPR const uint64_t den = _Dp / __gcd; + static const uint64_t num 
= _Np / __gcd; + static const uint64_t den = _Dp / __gcd; typedef __uratio type; }; @@ -66,15 +66,15 @@ class _LIBCPP_TEMPLATE_VIS shuffle_order_engine { public: // engine characteristics - static _LIBCPP_CONSTEXPR const size_t table_size = __k; + static const size_t table_size = __k; static const result_type _Min = _Engine::_Min; static const result_type _Max = _Engine::_Max; static_assert(_Min < _Max, "shuffle_order_engine invalid parameters"); - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type min() { return _Min; } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type max() { return _Max; } + _LIBCPP_HIDE_FROM_ABI static result_type min() { return _Min; } + _LIBCPP_HIDE_FROM_ABI static result_type max() { return _Max; } - static _LIBCPP_CONSTEXPR const unsigned long long _Rp = _Max - _Min + 1ull; + static const unsigned long long _Rp = _Max - _Min + 1ull; // constructors and seeding functions _LIBCPP_HIDE_FROM_ABI shuffle_order_engine() { __init(); } @@ -166,7 +166,7 @@ class _LIBCPP_TEMPLATE_VIS shuffle_order_engine { }; template -_LIBCPP_CONSTEXPR const size_t shuffle_order_engine<_Engine, __k>::table_size; +const size_t shuffle_order_engine<_Engine, __k>::table_size; template _LIBCPP_HIDE_FROM_ABI bool diff --git a/libcxx/include/__cxx03/__random/subtract_with_carry_engine.h b/libcxx/include/__cxx03/__random/subtract_with_carry_engine.h index 84aea22060cc7..1e38f30643b97 100644 --- a/libcxx/include/__cxx03/__random/subtract_with_carry_engine.h +++ b/libcxx/include/__cxx03/__random/subtract_with_carry_engine.h @@ -58,25 +58,24 @@ class _LIBCPP_TEMPLATE_VIS subtract_with_carry_engine { result_type __c_; size_t __i_; - static _LIBCPP_CONSTEXPR const result_type _Dt = numeric_limits::digits; + static const result_type _Dt = numeric_limits::digits; static_assert(0 < __w, "subtract_with_carry_engine invalid parameters"); static_assert(__w <= _Dt, "subtract_with_carry_engine invalid parameters"); static_assert(0 < __s, "subtract_with_carry_engine 
invalid parameters"); static_assert(__s < __r, "subtract_with_carry_engine invalid parameters"); public: - static _LIBCPP_CONSTEXPR const result_type _Min = 0; - static _LIBCPP_CONSTEXPR const result_type _Max = - __w == _Dt ? result_type(~0) : (result_type(1) << __w) - result_type(1); + static const result_type _Min = 0; + static const result_type _Max = __w == _Dt ? result_type(~0) : (result_type(1) << __w) - result_type(1); static_assert(_Min < _Max, "subtract_with_carry_engine invalid parameters"); // engine characteristics - static _LIBCPP_CONSTEXPR const size_t word_size = __w; - static _LIBCPP_CONSTEXPR const size_t short_lag = __s; - static _LIBCPP_CONSTEXPR const size_t long_lag = __r; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type min() { return _Min; } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR result_type max() { return _Max; } - static _LIBCPP_CONSTEXPR const result_type default_seed = 19780503u; + static const size_t word_size = __w; + static const size_t short_lag = __s; + static const size_t long_lag = __r; + _LIBCPP_HIDE_FROM_ABI static result_type min() { return _Min; } + _LIBCPP_HIDE_FROM_ABI static result_type max() { return _Max; } + static const result_type default_seed = 19780503u; // constructors and seeding functions _LIBCPP_HIDE_FROM_ABI explicit subtract_with_carry_engine(result_type __sd = default_seed) { seed(__sd); } @@ -125,16 +124,16 @@ class _LIBCPP_TEMPLATE_VIS subtract_with_carry_engine { }; template -_LIBCPP_CONSTEXPR const size_t subtract_with_carry_engine<_UIntType, __w, __s, __r>::word_size; +const size_t subtract_with_carry_engine<_UIntType, __w, __s, __r>::word_size; template -_LIBCPP_CONSTEXPR const size_t subtract_with_carry_engine<_UIntType, __w, __s, __r>::short_lag; +const size_t subtract_with_carry_engine<_UIntType, __w, __s, __r>::short_lag; template -_LIBCPP_CONSTEXPR const size_t subtract_with_carry_engine<_UIntType, __w, __s, __r>::long_lag; +const size_t subtract_with_carry_engine<_UIntType, 
__w, __s, __r>::long_lag; template -_LIBCPP_CONSTEXPR const typename subtract_with_carry_engine<_UIntType, __w, __s, __r>::result_type +const typename subtract_with_carry_engine<_UIntType, __w, __s, __r>::result_type subtract_with_carry_engine<_UIntType, __w, __s, __r>::default_seed; template diff --git a/libcxx/include/__cxx03/__random/uniform_int_distribution.h b/libcxx/include/__cxx03/__random/uniform_int_distribution.h index 93b04d397587a..94e3899f6a5a5 100644 --- a/libcxx/include/__cxx03/__random/uniform_int_distribution.h +++ b/libcxx/include/__cxx03/__random/uniform_int_distribution.h @@ -50,10 +50,10 @@ class __independent_bits_engine { _Engine_result_type __mask0_; _Engine_result_type __mask1_; - static const _Working_result_type _Rp = _Engine::_Max - _Engine::_Min + _Working_result_type(1); - static _LIBCPP_CONSTEXPR const size_t __m = __log2<_Working_result_type, _Rp>::value; - static _LIBCPP_CONSTEXPR const size_t _WDt = numeric_limits<_Working_result_type>::digits; - static _LIBCPP_CONSTEXPR const size_t _EDt = numeric_limits<_Engine_result_type>::digits; + static const _Working_result_type _Rp = _Engine::_Max - _Engine::_Min + _Working_result_type(1); + static const size_t __m = __log2<_Working_result_type, _Rp>::value; + static const size_t _WDt = numeric_limits<_Working_result_type>::digits; + static const size_t _EDt = numeric_limits<_Engine_result_type>::digits; public: // constructors and seeding functions diff --git a/libcxx/include/__cxx03/__split_buffer b/libcxx/include/__cxx03/__split_buffer index c614704ce56b0..1e67cc935e35b 100644 --- a/libcxx/include/__cxx03/__split_buffer +++ b/libcxx/include/__cxx03/__split_buffer @@ -86,138 +86,114 @@ public: __split_buffer(const __split_buffer&) = delete; __split_buffer& operator=(const __split_buffer&) = delete; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer() - _NOEXCEPT_(is_nothrow_default_constructible::value) + _LIBCPP_HIDE_FROM_ABI __split_buffer() : __first_(nullptr), 
__begin_(nullptr), __end_(nullptr), __end_cap_(nullptr, __default_init_tag()) {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(__alloc_rr& __a) + _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(__alloc_rr& __a) : __first_(nullptr), __begin_(nullptr), __end_(nullptr), __end_cap_(nullptr, __a) {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(const __alloc_rr& __a) + _LIBCPP_HIDE_FROM_ABI explicit __split_buffer(const __alloc_rr& __a) : __first_(nullptr), __begin_(nullptr), __end_(nullptr), __end_cap_(nullptr, __a) {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI - __split_buffer(size_type __cap, size_type __start, __alloc_rr& __a); + _LIBCPP_HIDE_FROM_ABI __split_buffer(size_type __cap, size_type __start, __alloc_rr& __a); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer(__split_buffer&& __c) - _NOEXCEPT_(is_nothrow_move_constructible::value); + _LIBCPP_HIDE_FROM_ABI __split_buffer(__split_buffer&& __c); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer(__split_buffer&& __c, const __alloc_rr& __a); + _LIBCPP_HIDE_FROM_ABI __split_buffer(__split_buffer&& __c, const __alloc_rr& __a); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __split_buffer& operator=(__split_buffer&& __c) - _NOEXCEPT_((__alloc_traits::propagate_on_container_move_assignment::value && - is_nothrow_move_assignable::value) || - !__alloc_traits::propagate_on_container_move_assignment::value); + _LIBCPP_HIDE_FROM_ABI __split_buffer& operator=(__split_buffer&& __c); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI ~__split_buffer(); + _LIBCPP_HIDE_FROM_ABI ~__split_buffer(); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __alloc_rr& __alloc() _NOEXCEPT { return __end_cap_.second(); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const __alloc_rr& __alloc() const _NOEXCEPT { - return __end_cap_.second(); - } + _LIBCPP_HIDE_FROM_ABI __alloc_rr& __alloc() _NOEXCEPT { return 
__end_cap_.second(); } + _LIBCPP_HIDE_FROM_ABI const __alloc_rr& __alloc() const _NOEXCEPT { return __end_cap_.second(); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer& __end_cap() _NOEXCEPT { return __end_cap_.first(); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const pointer& __end_cap() const _NOEXCEPT { - return __end_cap_.first(); - } + _LIBCPP_HIDE_FROM_ABI pointer& __end_cap() _NOEXCEPT { return __end_cap_.first(); } + _LIBCPP_HIDE_FROM_ABI const pointer& __end_cap() const _NOEXCEPT { return __end_cap_.first(); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __begin_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return __begin_; } + _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __begin_; } + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return __begin_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return __end_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return __end_; } + _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return __end_; } + _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return __end_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __destruct_at_end(__begin_); } + _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __destruct_at_end(__begin_); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type size() const { - return static_cast(__end_ - __begin_); - } + _LIBCPP_HIDE_FROM_ABI size_type size() const { return static_cast(__end_ - __begin_); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool empty() const { return __end_ == __begin_; } + _LIBCPP_HIDE_FROM_ABI bool empty() const { return __end_ == __begin_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type capacity() const { - return static_cast(__end_cap() - __first_); - } + 
_LIBCPP_HIDE_FROM_ABI size_type capacity() const { return static_cast(__end_cap() - __first_); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type __front_spare() const { - return static_cast(__begin_ - __first_); - } + _LIBCPP_HIDE_FROM_ABI size_type __front_spare() const { return static_cast(__begin_ - __first_); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type __back_spare() const { - return static_cast(__end_cap() - __end_); - } + _LIBCPP_HIDE_FROM_ABI size_type __back_spare() const { return static_cast(__end_cap() - __end_); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reference front() { return *__begin_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference front() const { return *__begin_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reference back() { return *(__end_ - 1); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference back() const { return *(__end_ - 1); } + _LIBCPP_HIDE_FROM_ABI reference front() { return *__begin_; } + _LIBCPP_HIDE_FROM_ABI const_reference front() const { return *__begin_; } + _LIBCPP_HIDE_FROM_ABI reference back() { return *(__end_ - 1); } + _LIBCPP_HIDE_FROM_ABI const_reference back() const { return *(__end_ - 1); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void reserve(size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void shrink_to_fit() _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void push_front(const_reference __x); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void push_back(const_reference __x); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __x); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void push_back(value_type&& __x); + _LIBCPP_HIDE_FROM_ABI void reserve(size_type __n); + _LIBCPP_HIDE_FROM_ABI void shrink_to_fit() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void push_front(const_reference __x); + _LIBCPP_HIDE_FROM_ABI void 
push_back(const_reference __x); + _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __x); + _LIBCPP_HIDE_FROM_ABI void push_back(value_type&& __x); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void emplace_back(_Args&&... __args); + _LIBCPP_HIDE_FROM_ABI void emplace_back(_Args&&... __args); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void pop_front() { __destruct_at_begin(__begin_ + 1); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void pop_back() { __destruct_at_end(__end_ - 1); } + _LIBCPP_HIDE_FROM_ABI void pop_front() { __destruct_at_begin(__begin_ + 1); } + _LIBCPP_HIDE_FROM_ABI void pop_back() { __destruct_at_end(__end_ - 1); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __construct_at_end(size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __construct_at_end(size_type __n, const_reference __x); + _LIBCPP_HIDE_FROM_ABI void __construct_at_end(size_type __n); + _LIBCPP_HIDE_FROM_ABI void __construct_at_end(size_type __n, const_reference __x); template ::value, int> = 0> - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __construct_at_end(_InputIter __first, _InputIter __last); + _LIBCPP_HIDE_FROM_ABI void __construct_at_end(_InputIter __first, _InputIter __last); template ::value, int> = 0> - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __construct_at_end(_ForwardIterator __first, _ForwardIterator __last); + _LIBCPP_HIDE_FROM_ABI void __construct_at_end(_ForwardIterator __first, _ForwardIterator __last); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __construct_at_end_with_sentinel(_Iterator __first, _Sentinel __last); + _LIBCPP_HIDE_FROM_ABI void __construct_at_end_with_sentinel(_Iterator __first, _Sentinel __last); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __construct_at_end_with_size(_Iterator __first, size_type __n); + _LIBCPP_HIDE_FROM_ABI void __construct_at_end_with_size(_Iterator __first, size_type 
__n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __destruct_at_begin(pointer __new_begin) { + _LIBCPP_HIDE_FROM_ABI void __destruct_at_begin(pointer __new_begin) { __destruct_at_begin(__new_begin, is_trivially_destructible()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __destruct_at_begin(pointer __new_begin, false_type); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __destruct_at_begin(pointer __new_begin, true_type); + _LIBCPP_HIDE_FROM_ABI void __destruct_at_begin(pointer __new_begin, false_type); + _LIBCPP_HIDE_FROM_ABI void __destruct_at_begin(pointer __new_begin, true_type); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __destruct_at_end(pointer __new_last) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __destruct_at_end(pointer __new_last) _NOEXCEPT { __destruct_at_end(__new_last, false_type()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __destruct_at_end(pointer __new_last, false_type) _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __destruct_at_end(pointer __new_last, true_type) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void __destruct_at_end(pointer __new_last, false_type) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void __destruct_at_end(pointer __new_last, true_type) _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void swap(__split_buffer& __x) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__alloc_rr>); + _LIBCPP_HIDE_FROM_ABI void swap(__split_buffer& __x); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool __invariants() const; + _LIBCPP_HIDE_FROM_ABI bool __invariants() const; private: - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__split_buffer& __c, true_type) - _NOEXCEPT_(is_nothrow_move_assignable::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__split_buffer& __c, true_type) { __alloc() = std::move(__c.__alloc()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 
_LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__split_buffer&, false_type) _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__split_buffer&, false_type) _NOEXCEPT {} struct _ConstructTransaction { - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit _ConstructTransaction(pointer* __p, size_type __n) _NOEXCEPT : __pos_(*__p), __end_(*__p + __n), __dest_(__p) {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI ~_ConstructTransaction() { *__dest_ = __pos_; } + _LIBCPP_HIDE_FROM_ABI ~_ConstructTransaction() { *__dest_ = __pos_; } pointer __pos_; const pointer __end_; @@ -228,7 +204,7 @@ private: }; template -_LIBCPP_CONSTEXPR_SINCE_CXX20 bool __split_buffer<_Tp, _Allocator>::__invariants() const { +bool __split_buffer<_Tp, _Allocator>::__invariants() const { if (__first_ == nullptr) { if (__begin_ != nullptr) return false; @@ -253,7 +229,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __split_buffer<_Tp, _Allocator>::__invariants // Precondition: size() + __n <= capacity() // Postcondition: size() == size() + __n template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::__construct_at_end(size_type __n) { +void __split_buffer<_Tp, _Allocator>::__construct_at_end(size_type __n) { _ConstructTransaction __tx(&this->__end_, __n); for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_) { __alloc_traits::construct(this->__alloc(), std::__to_address(__tx.__pos_)); @@ -267,8 +243,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::__construct_ // Postcondition: size() == old size() + __n // Postcondition: [i] == __x for all i in [size() - __n, __n) template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -__split_buffer<_Tp, _Allocator>::__construct_at_end(size_type __n, const_reference __x) { +void __split_buffer<_Tp, _Allocator>::__construct_at_end(size_type __n, const_reference __x) { _ConstructTransaction __tx(&this->__end_, __n); for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_) { __alloc_traits::construct(this->__alloc(), 
std::__to_address(__tx.__pos_), __x); @@ -277,15 +252,13 @@ __split_buffer<_Tp, _Allocator>::__construct_at_end(size_type __n, const_referen template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -__split_buffer<_Tp, _Allocator>::__construct_at_end(_InputIter __first, _InputIter __last) { +void __split_buffer<_Tp, _Allocator>::__construct_at_end(_InputIter __first, _InputIter __last) { __construct_at_end_with_sentinel(__first, __last); } template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -__split_buffer<_Tp, _Allocator>::__construct_at_end_with_sentinel(_Iterator __first, _Sentinel __last) { +void __split_buffer<_Tp, _Allocator>::__construct_at_end_with_sentinel(_Iterator __first, _Sentinel __last) { __alloc_rr& __a = this->__alloc(); for (; __first != __last; ++__first) { if (__end_ == __end_cap()) { @@ -302,15 +275,13 @@ __split_buffer<_Tp, _Allocator>::__construct_at_end_with_sentinel(_Iterator __fi } template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -__split_buffer<_Tp, _Allocator>::__construct_at_end(_ForwardIterator __first, _ForwardIterator __last) { +void __split_buffer<_Tp, _Allocator>::__construct_at_end(_ForwardIterator __first, _ForwardIterator __last) { __construct_at_end_with_size(__first, std::distance(__first, __last)); } template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -__split_buffer<_Tp, _Allocator>::__construct_at_end_with_size(_ForwardIterator __first, size_type __n) { +void __split_buffer<_Tp, _Allocator>::__construct_at_end_with_size(_ForwardIterator __first, size_type __n) { _ConstructTransaction __tx(&this->__end_, __n); for (; __tx.__pos_ != __tx.__end_; ++__tx.__pos_, (void)++__first) { __alloc_traits::construct(this->__alloc(), std::__to_address(__tx.__pos_), *__first); @@ -318,33 +289,30 @@ __split_buffer<_Tp, _Allocator>::__construct_at_end_with_size(_ForwardIterator _ } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline void -__split_buffer<_Tp, _Allocator>::__destruct_at_begin(pointer 
__new_begin, false_type) { +inline void __split_buffer<_Tp, _Allocator>::__destruct_at_begin(pointer __new_begin, false_type) { while (__begin_ != __new_begin) __alloc_traits::destroy(__alloc(), std::__to_address(__begin_++)); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline void -__split_buffer<_Tp, _Allocator>::__destruct_at_begin(pointer __new_begin, true_type) { +inline void __split_buffer<_Tp, _Allocator>::__destruct_at_begin(pointer __new_begin, true_type) { __begin_ = __new_begin; } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI void +inline _LIBCPP_HIDE_FROM_ABI void __split_buffer<_Tp, _Allocator>::__destruct_at_end(pointer __new_last, false_type) _NOEXCEPT { while (__new_last != __end_) __alloc_traits::destroy(__alloc(), std::__to_address(--__end_)); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI void +inline _LIBCPP_HIDE_FROM_ABI void __split_buffer<_Tp, _Allocator>::__destruct_at_end(pointer __new_last, true_type) _NOEXCEPT { __end_ = __new_last; } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator>::__split_buffer(size_type __cap, size_type __start, __alloc_rr& __a) : __end_cap_(nullptr, __a) { if (__cap == 0) { @@ -359,15 +327,14 @@ __split_buffer<_Tp, _Allocator>::__split_buffer(size_type __cap, size_type __sta } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator>::~__split_buffer() { +__split_buffer<_Tp, _Allocator>::~__split_buffer() { clear(); if (__first_) __alloc_traits::deallocate(__alloc(), __first_, capacity()); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator>::__split_buffer(__split_buffer&& __c) - _NOEXCEPT_(is_nothrow_move_constructible::value) +__split_buffer<_Tp, _Allocator>::__split_buffer(__split_buffer&& __c) : __first_(std::move(__c.__first_)), __begin_(std::move(__c.__begin_)), __end_(std::move(__c.__end_)), @@ -379,7 +346,6 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator>::__split_buffer(__ } template 
-_LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator>::__split_buffer(__split_buffer&& __c, const __alloc_rr& __a) : __end_cap_(nullptr, __a) { if (__a == __c.__alloc()) { @@ -402,11 +368,7 @@ __split_buffer<_Tp, _Allocator>::__split_buffer(__split_buffer&& __c, const __al } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 __split_buffer<_Tp, _Allocator>& -__split_buffer<_Tp, _Allocator>::operator=(__split_buffer&& __c) - _NOEXCEPT_((__alloc_traits::propagate_on_container_move_assignment::value && - is_nothrow_move_assignable::value) || - !__alloc_traits::propagate_on_container_move_assignment::value) { +__split_buffer<_Tp, _Allocator>& __split_buffer<_Tp, _Allocator>::operator=(__split_buffer&& __c) { clear(); shrink_to_fit(); __first_ = __c.__first_; @@ -419,8 +381,7 @@ __split_buffer<_Tp, _Allocator>::operator=(__split_buffer&& __c) } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::swap(__split_buffer& __x) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__alloc_rr>) { +void __split_buffer<_Tp, _Allocator>::swap(__split_buffer& __x) { std::swap(__first_, __x.__first_); std::swap(__begin_, __x.__begin_); std::swap(__end_, __x.__end_); @@ -429,7 +390,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::swap(__split } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::reserve(size_type __n) { +void __split_buffer<_Tp, _Allocator>::reserve(size_type __n) { if (__n < capacity()) { __split_buffer __t(__n, 0, __alloc()); __t.__construct_at_end(move_iterator(__begin_), move_iterator(__end_)); @@ -441,7 +402,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::reserve(size } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::shrink_to_fit() _NOEXCEPT { +void __split_buffer<_Tp, _Allocator>::shrink_to_fit() _NOEXCEPT { if (capacity() > size()) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { @@ -461,7 +422,7 @@ 
_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::shrink_to_fi } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::push_front(const_reference __x) { +void __split_buffer<_Tp, _Allocator>::push_front(const_reference __x) { if (__begin_ == __first_) { if (__end_ < __end_cap()) { difference_type __d = __end_cap() - __end_; @@ -483,7 +444,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::push_front(c } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::push_front(value_type&& __x) { +void __split_buffer<_Tp, _Allocator>::push_front(value_type&& __x) { if (__begin_ == __first_) { if (__end_ < __end_cap()) { difference_type __d = __end_cap() - __end_; @@ -505,8 +466,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::push_front(v } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI void -__split_buffer<_Tp, _Allocator>::push_back(const_reference __x) { +inline _LIBCPP_HIDE_FROM_ABI void __split_buffer<_Tp, _Allocator>::push_back(const_reference __x) { if (__end_ == __end_cap()) { if (__begin_ > __first_) { difference_type __d = __begin_ - __first_; @@ -528,7 +488,7 @@ __split_buffer<_Tp, _Allocator>::push_back(const_reference __x) { } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::push_back(value_type&& __x) { +void __split_buffer<_Tp, _Allocator>::push_back(value_type&& __x) { if (__end_ == __end_cap()) { if (__begin_ > __first_) { difference_type __d = __begin_ - __first_; @@ -551,7 +511,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::push_back(va template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::emplace_back(_Args&&... __args) { +void __split_buffer<_Tp, _Allocator>::emplace_back(_Args&&... 
__args) { if (__end_ == __end_cap()) { if (__begin_ > __first_) { difference_type __d = __begin_ - __first_; @@ -573,8 +533,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void __split_buffer<_Tp, _Allocator>::emplace_back } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI void -swap(__split_buffer<_Tp, _Allocator>& __x, __split_buffer<_Tp, _Allocator>& __y) _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(__split_buffer<_Tp, _Allocator>& __x, __split_buffer<_Tp, _Allocator>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/__string/char_traits.h b/libcxx/include/__cxx03/__string/char_traits.h index e6f4b66c8db03..28366905c716a 100644 --- a/libcxx/include/__cxx03/__string/char_traits.h +++ b/libcxx/include/__cxx03/__string/char_traits.h @@ -83,23 +83,17 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { using pos_type = streampos; using state_type = mbstate_t; - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 void - assign(char_type& __c1, const char_type& __c2) _NOEXCEPT { - __c1 = __c2; - } + static inline _LIBCPP_HIDE_FROM_ABI void assign(char_type& __c1, const char_type& __c2) _NOEXCEPT { __c1 = __c2; } // TODO: Make this _LIBCPP_HIDE_FROM_ABI - static inline _LIBCPP_HIDDEN _LIBCPP_CONSTEXPR bool eq(char_type __c1, char_type __c2) _NOEXCEPT { - return __c1 == __c2; - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool lt(char_type __c1, char_type __c2) _NOEXCEPT { + static inline _LIBCPP_HIDDEN bool eq(char_type __c1, char_type __c2) _NOEXCEPT { return __c1 == __c2; } + static inline _LIBCPP_HIDE_FROM_ABI bool lt(char_type __c1, char_type __c2) _NOEXCEPT { return (unsigned char)__c1 < (unsigned char)__c2; } // __constexpr_memcmp requires a trivially lexicographically comparable type, but char is not when char is a signed // type - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 int - compare(const char_type* __lhs, const char_type* __rhs, size_t __count) _NOEXCEPT { + static 
_LIBCPP_HIDE_FROM_ABI int compare(const char_type* __lhs, const char_type* __rhs, size_t __count) _NOEXCEPT { if (__libcpp_is_constant_evaluated()) { #ifdef _LIBCPP_COMPILER_CLANG_BASED return __builtin_memcmp(__lhs, __rhs, __count); @@ -121,49 +115,41 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { } } - static inline _LIBCPP_HIDE_FROM_ABI size_t _LIBCPP_CONSTEXPR_SINCE_CXX17 length(const char_type* __s) _NOEXCEPT { + static inline _LIBCPP_HIDE_FROM_ABI size_t length(const char_type* __s) _NOEXCEPT { return std::__constexpr_strlen(__s); } - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type* - find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI const char_type* find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { if (__n == 0) return nullptr; return std::__constexpr_memchr(__s, __a, __n); } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - move(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { + static inline _LIBCPP_HIDE_FROM_ABI char_type* move(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { return std::__constexpr_memmove(__s1, __s2, __element_count(__n)); } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - copy(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { + static inline _LIBCPP_HIDE_FROM_ABI char_type* copy(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(!std::__is_pointer_in_range(__s1, __s1 + __n, __s2), "char_traits::copy: source and destination ranges overlap"); std::__constexpr_memmove(__s1, __s2, __element_count(__n)); return __s1; } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - assign(char_type* __s, size_t __n, char_type __a) _NOEXCEPT { + static inline _LIBCPP_HIDE_FROM_ABI char_type* assign(char_type* __s, size_t __n, char_type __a) _NOEXCEPT { std::fill_n(__s, 
__n, __a); return __s; } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type not_eof(int_type __c) _NOEXCEPT { + static inline _LIBCPP_HIDE_FROM_ABI int_type not_eof(int_type __c) _NOEXCEPT { return eq_int_type(__c, eof()) ? ~eof() : __c; } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR char_type to_char_type(int_type __c) _NOEXCEPT { - return char_type(__c); - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type to_int_type(char_type __c) _NOEXCEPT { + static inline _LIBCPP_HIDE_FROM_ABI char_type to_char_type(int_type __c) _NOEXCEPT { return char_type(__c); } + static inline _LIBCPP_HIDE_FROM_ABI int_type to_int_type(char_type __c) _NOEXCEPT { return int_type((unsigned char)__c); } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool eq_int_type(int_type __c1, int_type __c2) _NOEXCEPT { - return __c1 == __c2; - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type eof() _NOEXCEPT { return int_type(EOF); } + static inline _LIBCPP_HIDE_FROM_ABI bool eq_int_type(int_type __c1, int_type __c2) _NOEXCEPT { return __c1 == __c2; } + static inline _LIBCPP_HIDE_FROM_ABI int_type eof() _NOEXCEPT { return int_type(EOF); } }; template @@ -176,50 +162,36 @@ struct __char_traits_base { // There are different aliases for the different char types, but they are all aliases to this type using pos_type = fpos; - _LIBCPP_HIDE_FROM_ABI static inline _LIBCPP_CONSTEXPR_SINCE_CXX17 void - assign(char_type& __lhs, const char_type& __rhs) _NOEXCEPT { - __lhs = __rhs; - } + _LIBCPP_HIDE_FROM_ABI static inline void assign(char_type& __lhs, const char_type& __rhs) _NOEXCEPT { __lhs = __rhs; } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR bool eq(char_type __lhs, char_type __rhs) _NOEXCEPT { - return __lhs == __rhs; - } + _LIBCPP_HIDE_FROM_ABI static bool eq(char_type __lhs, char_type __rhs) _NOEXCEPT { return __lhs == __rhs; } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR bool lt(char_type __lhs, char_type __rhs) _NOEXCEPT { - 
return __lhs < __rhs; - } + _LIBCPP_HIDE_FROM_ABI static bool lt(char_type __lhs, char_type __rhs) _NOEXCEPT { return __lhs < __rhs; } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - move(char_type* __dest, const char_type* __src, size_t __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static char_type* move(char_type* __dest, const char_type* __src, size_t __n) _NOEXCEPT { return std::__constexpr_memmove(__dest, __src, __element_count(__n)); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - copy(char_type* __dest, const char_type* __src, size_t __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static char_type* copy(char_type* __dest, const char_type* __src, size_t __n) _NOEXCEPT { _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(!std::__is_pointer_in_range(__dest, __dest + __n, __src), "char_traits::copy: source and destination ranges overlap"); return std::__constexpr_memmove(__dest, __src, __element_count(__n)); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - assign(char_type* __str, size_t __n, char_type __fill_char) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static char_type* assign(char_type* __str, size_t __n, char_type __fill_char) _NOEXCEPT { std::fill_n(__str, __n, __fill_char); return __str; } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR char_type to_char_type(int_type __c) _NOEXCEPT { - return char_type(__c); - } + _LIBCPP_HIDE_FROM_ABI static char_type to_char_type(int_type __c) _NOEXCEPT { return char_type(__c); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR int_type to_int_type(char_type __c) _NOEXCEPT { return int_type(__c); } + _LIBCPP_HIDE_FROM_ABI static int_type to_int_type(char_type __c) _NOEXCEPT { return int_type(__c); } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR bool eq_int_type(int_type __lhs, int_type __rhs) _NOEXCEPT { - return __lhs == __rhs; - } + _LIBCPP_HIDE_FROM_ABI static bool eq_int_type(int_type __lhs, int_type __rhs) _NOEXCEPT { return __lhs == __rhs; } - 
_LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR int_type eof() _NOEXCEPT { return _EOFVal; } + _LIBCPP_HIDE_FROM_ABI static int_type eof() _NOEXCEPT { return _EOFVal; } - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR int_type not_eof(int_type __c) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static int_type not_eof(int_type __c) _NOEXCEPT { return eq_int_type(__c, eof()) ? static_cast(~eof()) : __c; } }; @@ -229,19 +201,15 @@ struct __char_traits_base { #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template <> struct _LIBCPP_TEMPLATE_VIS char_traits : __char_traits_base(WEOF)> { - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 int - compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI int compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { if (__n == 0) return 0; return std::__constexpr_wmemcmp(__s1, __s2, __n); } - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t length(const char_type* __s) _NOEXCEPT { - return std::__constexpr_wcslen(__s); - } + static _LIBCPP_HIDE_FROM_ABI size_t length(const char_type* __s) _NOEXCEPT { return std::__constexpr_wcslen(__s); } - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type* - find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI const char_type* find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { if (__n == 0) return nullptr; return std::__constexpr_wmemchr(__s, __a, __n); @@ -274,12 +242,10 @@ struct _LIBCPP_TEMPLATE_VIS char_traits template <> struct _LIBCPP_TEMPLATE_VIS char_traits : __char_traits_base(0xFFFF)> { - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 int - compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t length(const char_type* __s) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI static int compare(const char_type* __s1, const 
char_type* __s2, size_t __n) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI static size_t length(const char_type* __s) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type* - find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static const char_type* find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { __identity __proj; const char_type* __match = std::__find(__s, __s + __n, __a, __proj); if (__match == __s + __n) @@ -288,8 +254,7 @@ struct _LIBCPP_TEMPLATE_VIS char_traits } }; -inline _LIBCPP_CONSTEXPR_SINCE_CXX17 int -char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { +inline int char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { for (; __n; --__n, ++__s1, ++__s2) { if (lt(*__s1, *__s2)) return -1; @@ -299,7 +264,7 @@ char_traits::compare(const char_type* __s1, const char_type* __s2, siz return 0; } -inline _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t char_traits::length(const char_type* __s) _NOEXCEPT { +inline size_t char_traits::length(const char_type* __s) _NOEXCEPT { size_t __len = 0; for (; !eq(*__s, char_type(0)); ++__s) ++__len; @@ -309,12 +274,10 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t char_traits::length(const template <> struct _LIBCPP_TEMPLATE_VIS char_traits : __char_traits_base(0xFFFFFFFF)> { - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 int - compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t length(const char_type* __s) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI static int compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI static size_t length(const char_type* __s) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type* - find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { + 
_LIBCPP_HIDE_FROM_ABI static const char_type* find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { __identity __proj; const char_type* __match = std::__find(__s, __s + __n, __a, __proj); if (__match == __s + __n) @@ -323,8 +286,7 @@ struct _LIBCPP_TEMPLATE_VIS char_traits } }; -inline _LIBCPP_CONSTEXPR_SINCE_CXX17 int -char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { +inline int char_traits::compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { for (; __n; --__n, ++__s1, ++__s2) { if (lt(*__s1, *__s2)) return -1; @@ -334,7 +296,7 @@ char_traits::compare(const char_type* __s1, const char_type* __s2, siz return 0; } -inline _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t char_traits::length(const char_type* __s) _NOEXCEPT { +inline size_t char_traits::length(const char_type* __s) _NOEXCEPT { size_t __len = 0; for (; !eq(*__s, char_type(0)); ++__s) ++__len; @@ -345,8 +307,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t char_traits::length(const // __str_find template -inline _SizeT _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI -__str_find(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) _NOEXCEPT { +inline _SizeT _LIBCPP_HIDE_FROM_ABI __str_find(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) _NOEXCEPT { if (__pos >= __sz) return __npos; const _CharT* __r = _Traits::find(__p + __pos, __sz - __pos, __c); @@ -356,7 +317,7 @@ __str_find(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) _NOEXCEPT { } template -_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR_SINCE_CXX14 const _CharT* __search_substring( +_LIBCPP_HIDE_FROM_ABI inline const _CharT* __search_substring( const _CharT* __first1, const _CharT* __last1, const _CharT* __first2, const _CharT* __last2) _NOEXCEPT { // Take advantage of knowing source and pattern lengths. // Stop short when source is smaller than pattern. 
@@ -394,7 +355,7 @@ _LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR_SINCE_CXX14 const _CharT* __searc } template -inline _SizeT _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI +inline _SizeT _LIBCPP_HIDE_FROM_ABI __str_find(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) _NOEXCEPT { if (__pos > __sz) return __npos; @@ -412,8 +373,7 @@ __str_find(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _Siz // __str_rfind template -inline _SizeT _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI -__str_rfind(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) _NOEXCEPT { +inline _SizeT _LIBCPP_HIDE_FROM_ABI __str_rfind(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) _NOEXCEPT { if (__sz < 1) return __npos; if (__pos < __sz) @@ -428,7 +388,7 @@ __str_rfind(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) _NOEXCEPT } template -inline _SizeT _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI +inline _SizeT _LIBCPP_HIDE_FROM_ABI __str_rfind(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) _NOEXCEPT { __pos = std::min(__pos, __sz); if (__n < __sz - __pos) @@ -443,7 +403,7 @@ __str_rfind(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _Si // __str_find_first_of template -inline _SizeT _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI +inline _SizeT _LIBCPP_HIDE_FROM_ABI __str_find_first_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) _NOEXCEPT { if (__pos >= __sz || __n == 0) return __npos; @@ -455,7 +415,7 @@ __str_find_first_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __ // __str_find_last_of template -inline _SizeT _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI +inline _SizeT _LIBCPP_HIDE_FROM_ABI __str_find_last_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) _NOEXCEPT { if (__n != 0) { if (__pos < __sz) @@ -473,7 +433,7 @@ __str_find_last_of(const _CharT* __p, _SizeT 
__sz, const _CharT* __s, _SizeT __p // __str_find_first_not_of template -inline _SizeT _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI +inline _SizeT _LIBCPP_HIDE_FROM_ABI __str_find_first_not_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) _NOEXCEPT { if (__pos < __sz) { const _CharT* __pe = __p + __sz; @@ -485,7 +445,7 @@ __str_find_first_not_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _Size } template -inline _SizeT _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI +inline _SizeT _LIBCPP_HIDE_FROM_ABI __str_find_first_not_of(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) _NOEXCEPT { if (__pos < __sz) { const _CharT* __pe = __p + __sz; @@ -498,7 +458,7 @@ __str_find_first_not_of(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos // __str_find_last_not_of template -inline _SizeT _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI +inline _SizeT _LIBCPP_HIDE_FROM_ABI __str_find_last_not_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT __pos, _SizeT __n) _NOEXCEPT { if (__pos < __sz) ++__pos; @@ -511,7 +471,7 @@ __str_find_last_not_of(const _CharT* __p, _SizeT __sz, const _CharT* __s, _SizeT } template -inline _SizeT _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI +inline _SizeT _LIBCPP_HIDE_FROM_ABI __str_find_last_not_of(const _CharT* __p, _SizeT __sz, _CharT __c, _SizeT __pos) _NOEXCEPT { if (__pos < __sz) ++__pos; diff --git a/libcxx/include/__cxx03/__string/constexpr_c_functions.h b/libcxx/include/__cxx03/__string/constexpr_c_functions.h index 95ab640118464..315058dcc06e5 100644 --- a/libcxx/include/__cxx03/__string/constexpr_c_functions.h +++ b/libcxx/include/__cxx03/__string/constexpr_c_functions.h @@ -47,7 +47,7 @@ inline const bool __is_char_type = true; #endif template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 size_t __constexpr_strlen(const _Tp* __str) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI size_t __constexpr_strlen(const _Tp* __str) _NOEXCEPT { 
static_assert(__is_char_type<_Tp>, "__constexpr_strlen only works with char and char8_t"); // GCC currently doesn't support __builtin_strlen for heap-allocated memory during constant evaluation. // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70816 @@ -64,8 +64,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 size_t __constexpr_st // equivalent to a std::memcmp. Since we have multiple objects contiguously in memory, we can call memcmp once instead // of invoking it on every object individually. template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 int -__constexpr_memcmp(const _Tp* __lhs, const _Up* __rhs, __element_count __n) { +_LIBCPP_HIDE_FROM_ABI int __constexpr_memcmp(const _Tp* __lhs, const _Up* __rhs, __element_count __n) { static_assert(__libcpp_is_trivially_lexicographically_comparable<_Tp, _Up>::value, "_Tp and _Up have to be trivially lexicographically comparable"); @@ -97,8 +96,7 @@ __constexpr_memcmp(const _Tp* __lhs, const _Up* __rhs, __element_count __n) { // to a std::memcmp(...) == 0. Since we have multiple objects contiguously in memory, we can call memcmp once instead // of invoking it on every object individually. 
template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -__constexpr_memcmp_equal(const _Tp* __lhs, const _Up* __rhs, __element_count __n) { +_LIBCPP_HIDE_FROM_ABI bool __constexpr_memcmp_equal(const _Tp* __lhs, const _Up* __rhs, __element_count __n) { static_assert(__libcpp_is_trivially_equality_comparable<_Tp, _Up>::value, "_Tp and _Up have to be trivially equality comparable"); @@ -124,7 +122,7 @@ __constexpr_memcmp_equal(const _Tp* __lhs, const _Up* __rhs, __element_count __n } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __constexpr_memchr(_Tp* __str, _Up __value, size_t __count) { +_LIBCPP_HIDE_FROM_ABI _Tp* __constexpr_memchr(_Tp* __str, _Up __value, size_t __count) { static_assert(sizeof(_Tp) == 1 && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value, "Calling memchr on non-trivially equality comparable types is unsafe."); @@ -155,7 +153,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __constexpr_memchr(_Tp* // This is necessary in order to implement __constexpr_memmove below in a way that mirrors as // closely as possible what the compiler's __builtin_memmove is able to do. 
template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& __assign_trivially_copyable(_Tp& __dest, _Up const& __src) { +_LIBCPP_HIDE_FROM_ABI _Tp& __assign_trivially_copyable(_Tp& __dest, _Up const& __src) { __dest = __src; return __dest; } @@ -164,7 +162,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& __assign_trivially_copy template ::value && is_assignable<_Tp&, _Up&&>::value, int> = 0> // clang-format on -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& __assign_trivially_copyable(_Tp& __dest, _Up& __src) { +_LIBCPP_HIDE_FROM_ABI _Tp& __assign_trivially_copyable(_Tp& __dest, _Up& __src) { __dest = static_cast<_Up&&>(__src); // this is safe, we're not actually moving anything since the assignment is trivial return __dest; @@ -175,7 +173,7 @@ template :: !is_assignable<_Tp&, _Up&&>::value && is_constructible<_Tp, _Up const&>::value, int> = 0> // clang-format on -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& __assign_trivially_copyable(_Tp& __dest, _Up const& __src) { +_LIBCPP_HIDE_FROM_ABI _Tp& __assign_trivially_copyable(_Tp& __dest, _Up const& __src) { // _Tp is trivially destructible, so we don't need to call its destructor to end the lifetime of the object // that was there previously std::__construct_at(std::addressof(__dest), __src); @@ -188,7 +186,7 @@ template :: !is_constructible<_Tp, _Up const&>::value && is_constructible<_Tp, _Up&&>::value, int> = 0> // clang-format on -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& __assign_trivially_copyable(_Tp& __dest, _Up& __src) { +_LIBCPP_HIDE_FROM_ABI _Tp& __assign_trivially_copyable(_Tp& __dest, _Up& __src) { // _Tp is trivially destructible, so we don't need to call its destructor to end the lifetime of the object // that was there previously std::__construct_at( @@ -198,8 +196,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& __assign_trivially_copy } template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* -__constexpr_memmove(_Tp* __dest, _Up* __src, __element_count __n) { +_LIBCPP_HIDE_FROM_ABI _Tp* __constexpr_memmove(_Tp* __dest, _Up* __src, __element_count __n) { size_t __count = static_cast(__n); if (__libcpp_is_constant_evaluated()) { #ifdef _LIBCPP_COMPILER_CLANG_BASED diff --git a/libcxx/include/__cxx03/__system_error/error_category.h b/libcxx/include/__cxx03/__system_error/error_category.h index c5f3f79c5074c..8f04158ffc010 100644 --- a/libcxx/include/__cxx03/__system_error/error_category.h +++ b/libcxx/include/__cxx03/__system_error/error_category.h @@ -31,7 +31,7 @@ class _LIBCPP_EXPORTED_FROM_ABI error_category { #if defined(_LIBCPP_ERROR_CATEGORY_DEFINE_LEGACY_INLINE_FUNCTIONS) error_category() noexcept; #else - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 error_category() _NOEXCEPT = default; + _LIBCPP_HIDE_FROM_ABI error_category() _NOEXCEPT = default; #endif error_category(const error_category&) = delete; error_category& operator=(const error_category&) = delete; diff --git a/libcxx/include/__cxx03/__thread/poll_with_backoff.h b/libcxx/include/__cxx03/__thread/poll_with_backoff.h index b500629c85217..1d2db7728cc70 100644 --- a/libcxx/include/__cxx03/__thread/poll_with_backoff.h +++ b/libcxx/include/__cxx03/__thread/poll_with_backoff.h @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -static _LIBCPP_CONSTEXPR const int __libcpp_polling_count = 64; +static const int __libcpp_polling_count = 64; // Polls a thread for a condition given by a predicate, and backs off based on a backoff policy // before polling again. @@ -59,7 +59,7 @@ _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI bool __libcpp_thread_poll_with_b // so this should most likely only be used on single-threaded systems where there // are no other threads to compete with. 
struct __spinning_backoff_policy { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator()(chrono::nanoseconds const&) const { return false; } + _LIBCPP_HIDE_FROM_ABI bool operator()(chrono::nanoseconds const&) const { return false; } }; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__thread/this_thread.h b/libcxx/include/__cxx03/__thread/this_thread.h index ae9c37ef02fbb..4584d2d00ea9f 100644 --- a/libcxx/include/__cxx03/__thread/this_thread.h +++ b/libcxx/include/__cxx03/__thread/this_thread.h @@ -37,7 +37,7 @@ _LIBCPP_HIDE_FROM_ABI void sleep_for(const chrono::duration<_Rep, _Period>& __d) // The standard guarantees a 64bit signed integer resolution for nanoseconds, // so use INT64_MAX / 1e9 as cut-off point. Use a constant to avoid // and issues with long double folding on PowerPC with GCC. - _LIBCPP_CONSTEXPR chrono::duration __max = chrono::duration(9223372036.0L); + chrono::duration __max = chrono::duration(9223372036.0L); chrono::nanoseconds __ns; if (__d < __max) { __ns = chrono::duration_cast(__d); diff --git a/libcxx/include/__cxx03/__tree b/libcxx/include/__cxx03/__tree index 2a4a6e9864500..3773fb485f488 100644 --- a/libcxx/include/__cxx03/__tree +++ b/libcxx/include/__cxx03/__tree @@ -950,8 +950,7 @@ public: typedef __tree_iterator iterator; typedef __tree_const_iterator const_iterator; - _LIBCPP_HIDE_FROM_ABI explicit __tree(const value_compare& __comp) _NOEXCEPT_( - is_nothrow_default_constructible<__node_allocator>::value&& is_nothrow_copy_constructible::value); + _LIBCPP_HIDE_FROM_ABI explicit __tree(const value_compare& __comp); _LIBCPP_HIDE_FROM_ABI explicit __tree(const allocator_type& __a); _LIBCPP_HIDE_FROM_ABI __tree(const value_compare& __comp, const allocator_type& __a); _LIBCPP_HIDE_FROM_ABI __tree(const __tree& __t); @@ -960,12 +959,9 @@ public: _LIBCPP_HIDE_FROM_ABI void __assign_unique(_ForwardIterator __first, _ForwardIterator __last); template _LIBCPP_HIDE_FROM_ABI void __assign_multi(_InputIterator __first, 
_InputIterator __last); - _LIBCPP_HIDE_FROM_ABI __tree(__tree&& __t) _NOEXCEPT_( - is_nothrow_move_constructible<__node_allocator>::value&& is_nothrow_move_constructible::value); + _LIBCPP_HIDE_FROM_ABI __tree(__tree&& __t); _LIBCPP_HIDE_FROM_ABI __tree(__tree&& __t, const allocator_type& __a); - _LIBCPP_HIDE_FROM_ABI __tree& operator=(__tree&& __t) _NOEXCEPT_( - __node_traits::propagate_on_container_move_assignment::value&& is_nothrow_move_assignable::value&& - is_nothrow_move_assignable<__node_allocator>::value); + _LIBCPP_HIDE_FROM_ABI __tree& operator=(__tree&& __t); _LIBCPP_HIDE_FROM_ABI ~__tree(); _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__begin_node()); } @@ -979,9 +975,7 @@ public: _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI void swap(__tree& __t) - _NOEXCEPT_(__is_nothrow_swappable_v && - (!__node_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__node_allocator>)); + _LIBCPP_HIDE_FROM_ABI void swap(__tree& __t); template _LIBCPP_HIDE_FROM_ABI pair __emplace_unique_key_args(_Key const&, _Args&&... 
__args); @@ -1215,17 +1209,13 @@ private: _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __tree&, false_type) {} _LIBCPP_HIDE_FROM_ABI void __move_assign(__tree& __t, false_type); - _LIBCPP_HIDE_FROM_ABI void __move_assign(__tree& __t, true_type) _NOEXCEPT_( - is_nothrow_move_assignable::value&& is_nothrow_move_assignable<__node_allocator>::value); + _LIBCPP_HIDE_FROM_ABI void __move_assign(__tree& __t, true_type); - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__tree& __t) - _NOEXCEPT_(!__node_traits::propagate_on_container_move_assignment::value || - is_nothrow_move_assignable<__node_allocator>::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__tree& __t) { __move_assign_alloc(__t, integral_constant()); } - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__tree& __t, true_type) - _NOEXCEPT_(is_nothrow_move_assignable<__node_allocator>::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__tree& __t, true_type) { __node_alloc() = std::move(__t.__node_alloc()); } _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__tree&, false_type) _NOEXCEPT {} @@ -1274,9 +1264,7 @@ private: }; template -__tree<_Tp, _Compare, _Allocator>::__tree(const value_compare& __comp) _NOEXCEPT_( - is_nothrow_default_constructible<__node_allocator>::value&& is_nothrow_copy_constructible::value) - : __pair3_(0, __comp) { +__tree<_Tp, _Compare, _Allocator>::__tree(const value_compare& __comp) : __pair3_(0, __comp) { __begin_node() = __end_node(); } @@ -1395,8 +1383,7 @@ __tree<_Tp, _Compare, _Allocator>::__tree(const __tree& __t) } template -__tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t) _NOEXCEPT_( - is_nothrow_move_constructible<__node_allocator>::value&& is_nothrow_move_constructible::value) +__tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t) : __begin_node_(std::move(__t.__begin_node_)), __pair1_(std::move(__t.__pair1_)), __pair3_(std::move(__t.__pair3_)) { @@ -1431,8 +1418,7 @@ __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t, const allocator_type& 
__ } template -void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, true_type) - _NOEXCEPT_(is_nothrow_move_assignable::value&& is_nothrow_move_assignable<__node_allocator>::value) { +void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, true_type) { destroy(static_cast<__node_pointer>(__end_node()->__left_)); __begin_node_ = __t.__begin_node_; __pair1_.first() = __t.__pair1_.first(); @@ -1469,11 +1455,7 @@ void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, false_type) { } template -__tree<_Tp, _Compare, _Allocator>& __tree<_Tp, _Compare, _Allocator>::operator=(__tree&& __t) _NOEXCEPT_( - __node_traits::propagate_on_container_move_assignment::value&& is_nothrow_move_assignable::value&& - is_nothrow_move_assignable<__node_allocator>::value) - -{ +__tree<_Tp, _Compare, _Allocator>& __tree<_Tp, _Compare, _Allocator>::operator=(__tree&& __t) { __move_assign(__t, integral_constant()); return *this; } @@ -1496,9 +1478,7 @@ void __tree<_Tp, _Compare, _Allocator>::destroy(__node_pointer __nd) _NOEXCEPT { } template -void __tree<_Tp, _Compare, _Allocator>::swap(__tree& __t) - _NOEXCEPT_(__is_nothrow_swappable_v && - (!__node_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__node_allocator>)) { +void __tree<_Tp, _Compare, _Allocator>::swap(__tree& __t) { using std::swap; swap(__begin_node_, __t.__begin_node_); swap(__pair1_.first(), __t.__pair1_.first()); @@ -2115,8 +2095,7 @@ __tree<_Tp, _Compare, _Allocator>::remove(const_iterator __p) _NOEXCEPT { } template -inline _LIBCPP_HIDE_FROM_ABI void swap(__tree<_Tp, _Compare, _Allocator>& __x, __tree<_Tp, _Compare, _Allocator>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(__tree<_Tp, _Compare, _Allocator>& __x, __tree<_Tp, _Compare, _Allocator>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/__type_traits/aligned_storage.h b/libcxx/include/__cxx03/__type_traits/aligned_storage.h index 
9195926459472..216b8b410cc25 100644 --- a/libcxx/include/__cxx03/__type_traits/aligned_storage.h +++ b/libcxx/include/__cxx03/__type_traits/aligned_storage.h @@ -87,7 +87,7 @@ struct __find_max_align<__type_list<_Hp, _Tp>, _Len> : public integral_constant::value>::value> {}; template ::value> -struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_TEMPLATE_VIS aligned_storage { +struct _LIBCPP_TEMPLATE_VIS aligned_storage { typedef typename __find_pod<__all_types, _Align>::type _Aligner; union type { _Aligner __align; @@ -97,7 +97,7 @@ struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_TEMPLATE_VIS aligned_storage { #define _CREATE_ALIGNED_STORAGE_SPECIALIZATION(n) \ template \ - struct _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_TEMPLATE_VIS aligned_storage<_Len, n> { \ + struct _LIBCPP_TEMPLATE_VIS aligned_storage<_Len, n> { \ struct _ALIGNAS(n) type { \ unsigned char __lx[(_Len + n - 1) / n * n]; \ }; \ diff --git a/libcxx/include/__cxx03/__type_traits/aligned_union.h b/libcxx/include/__cxx03/__type_traits/aligned_union.h index b3fa2b8a56c07..22543833bb452 100644 --- a/libcxx/include/__cxx03/__type_traits/aligned_union.h +++ b/libcxx/include/__cxx03/__type_traits/aligned_union.h @@ -34,7 +34,7 @@ struct __static_max<_I0, _I1, _In...> { }; template -struct _LIBCPP_DEPRECATED_IN_CXX23 aligned_union { +struct aligned_union { static const size_t alignment_value = __static_max<_LIBCPP_PREFERRED_ALIGNOF(_Type0), _LIBCPP_PREFERRED_ALIGNOF(_Types)...>::value; static const size_t __len = __static_max<_Len, sizeof(_Type0), sizeof(_Types)...>::value; diff --git a/libcxx/include/__cxx03/__type_traits/integral_constant.h b/libcxx/include/__cxx03/__type_traits/integral_constant.h index 16fb9f60e631a..3e67e010b4da8 100644 --- a/libcxx/include/__cxx03/__type_traits/integral_constant.h +++ b/libcxx/include/__cxx03/__type_traits/integral_constant.h @@ -19,14 +19,14 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct _LIBCPP_TEMPLATE_VIS integral_constant { - static _LIBCPP_CONSTEXPR const _Tp value = __v; + 
static const _Tp value = __v; typedef _Tp value_type; typedef integral_constant type; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR operator value_type() const _NOEXCEPT { return value; } + _LIBCPP_HIDE_FROM_ABI operator value_type() const _NOEXCEPT { return value; } }; template -_LIBCPP_CONSTEXPR const _Tp integral_constant<_Tp, __v>::value; +const _Tp integral_constant<_Tp, __v>::value; typedef integral_constant true_type; typedef integral_constant false_type; diff --git a/libcxx/include/__cxx03/__type_traits/invoke.h b/libcxx/include/__cxx03/__type_traits/invoke.h index 9ea67475a4f62..1ae236d9676b0 100644 --- a/libcxx/include/__cxx03/__type_traits/invoke.h +++ b/libcxx/include/__cxx03/__type_traits/invoke.h @@ -96,56 +96,49 @@ __nat __invoke(_Args&&... __args); // clang-format off template > -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR +inline _LIBCPP_HIDE_FROM_ABI decltype((std::declval<_A0>().*std::declval<_Fp>())(std::declval<_Args>()...)) __invoke(_Fp&& __f, _A0&& __a0, _Args&&... __args) - _NOEXCEPT_(noexcept((static_cast<_A0&&>(__a0).*__f)(static_cast<_Args&&>(__args)...))) { return (static_cast<_A0&&>(__a0).*__f)(static_cast<_Args&&>(__args)...); } template > -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR +inline _LIBCPP_HIDE_FROM_ABI decltype((std::declval<_A0>().get().*std::declval<_Fp>())(std::declval<_Args>()...)) __invoke(_Fp&& __f, _A0&& __a0, _Args&&... __args) - _NOEXCEPT_(noexcept((__a0.get().*__f)(static_cast<_Args&&>(__args)...))) { return (__a0.get().*__f)(static_cast<_Args&&>(__args)...); } template > -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR +inline _LIBCPP_HIDE_FROM_ABI decltype(((*std::declval<_A0>()).*std::declval<_Fp>())(std::declval<_Args>()...)) __invoke(_Fp&& __f, _A0&& __a0, _Args&&... 
__args) - _NOEXCEPT_(noexcept(((*static_cast<_A0&&>(__a0)).*__f)(static_cast<_Args&&>(__args)...))) { return ((*static_cast<_A0&&>(__a0)).*__f)(static_cast<_Args&&>(__args)...); } // bullets 4, 5 and 6 template > -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR +inline _LIBCPP_HIDE_FROM_ABI decltype(std::declval<_A0>().*std::declval<_Fp>()) __invoke(_Fp&& __f, _A0&& __a0) - _NOEXCEPT_(noexcept(static_cast<_A0&&>(__a0).*__f)) { return static_cast<_A0&&>(__a0).*__f; } template > -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR +inline _LIBCPP_HIDE_FROM_ABI decltype(std::declval<_A0>().get().*std::declval<_Fp>()) __invoke(_Fp&& __f, _A0&& __a0) - _NOEXCEPT_(noexcept(__a0.get().*__f)) { return __a0.get().*__f; } template > -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR +inline _LIBCPP_HIDE_FROM_ABI decltype((*std::declval<_A0>()).*std::declval<_Fp>()) __invoke(_Fp&& __f, _A0&& __a0) - _NOEXCEPT_(noexcept((*static_cast<_A0&&>(__a0)).*__f)) { return (*static_cast<_A0&&>(__a0)).*__f; } // bullet 7 template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR +inline _LIBCPP_HIDE_FROM_ABI decltype(std::declval<_Fp>()(std::declval<_Args>()...)) __invoke(_Fp&& __f, _Args&&... __args) - _NOEXCEPT_(noexcept(static_cast<_Fp&&>(__f)(static_cast<_Args&&>(__args)...))) { return static_cast<_Fp&&>(__f)(static_cast<_Args&&>(__args)...); } // clang-format on @@ -203,7 +196,7 @@ struct __invoke_of template ::value> struct __invoke_void_return_wrapper { template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static _Ret __call(_Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI static _Ret __call(_Args&&... __args) { return std::__invoke(std::forward<_Args>(__args)...); } }; @@ -211,7 +204,7 @@ struct __invoke_void_return_wrapper { template struct __invoke_void_return_wrapper<_Ret, true> { template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static void __call(_Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI static void __call(_Args&&... 
__args) { std::__invoke(std::forward<_Args>(__args)...); } }; diff --git a/libcxx/include/__cxx03/__type_traits/is_constant_evaluated.h b/libcxx/include/__cxx03/__type_traits/is_constant_evaluated.h index e091b8c576025..9034a3499cbf7 100644 --- a/libcxx/include/__cxx03/__type_traits/is_constant_evaluated.h +++ b/libcxx/include/__cxx03/__type_traits/is_constant_evaluated.h @@ -17,7 +17,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR bool __libcpp_is_constant_evaluated() _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI inline bool __libcpp_is_constant_evaluated() _NOEXCEPT { return __builtin_is_constant_evaluated(); } diff --git a/libcxx/include/__cxx03/__type_traits/is_literal_type.h b/libcxx/include/__cxx03/__type_traits/is_literal_type.h index 5c15a6c395f48..6c5df6ef71d81 100644 --- a/libcxx/include/__cxx03/__type_traits/is_literal_type.h +++ b/libcxx/include/__cxx03/__type_traits/is_literal_type.h @@ -19,8 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct _LIBCPP_TEMPLATE_VIS -_LIBCPP_DEPRECATED_IN_CXX17 is_literal_type : public integral_constant {}; +struct _LIBCPP_TEMPLATE_VIS is_literal_type : public integral_constant {}; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__type_traits/is_swappable.h b/libcxx/include/__cxx03/__type_traits/is_swappable.h index 7b87a95446cc5..592c6f247c1ff 100644 --- a/libcxx/include/__cxx03/__type_traits/is_swappable.h +++ b/libcxx/include/__cxx03/__type_traits/is_swappable.h @@ -42,12 +42,10 @@ template using __swap_result_t = void; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __swap_result_t<_Tp> swap(_Tp& __x, _Tp& __y) - _NOEXCEPT_(is_nothrow_move_constructible<_Tp>::value&& is_nothrow_move_assignable<_Tp>::value); +inline _LIBCPP_HIDE_FROM_ABI __swap_result_t<_Tp> swap(_Tp& __x, _Tp& __y); template , int> = 0> -inline _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(_Tp (&__a)[_Np], _Tp (&__b)[_Np]) _NOEXCEPT_(__is_nothrow_swappable_v<_Tp>); +inline 
_LIBCPP_HIDE_FROM_ABI void swap(_Tp (&__a)[_Np], _Tp (&__b)[_Np]); // ALL generic swap overloads MUST already have a declaration available at this point. diff --git a/libcxx/include/__cxx03/__type_traits/result_of.h b/libcxx/include/__cxx03/__type_traits/result_of.h index 4efe42de04536..da1552cdebc0e 100644 --- a/libcxx/include/__cxx03/__type_traits/result_of.h +++ b/libcxx/include/__cxx03/__type_traits/result_of.h @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // result_of template -class _LIBCPP_DEPRECATED_IN_CXX17 result_of; +class result_of; template class _LIBCPP_TEMPLATE_VIS result_of<_Fp(_Args...)> : public __invoke_of<_Fp, _Args...> {}; diff --git a/libcxx/include/__cxx03/__utility/convert_to_integral.h b/libcxx/include/__cxx03/__utility/convert_to_integral.h index e1cc9195de165..dc91147298c94 100644 --- a/libcxx/include/__cxx03/__utility/convert_to_integral.h +++ b/libcxx/include/__cxx03/__utility/convert_to_integral.h @@ -21,31 +21,27 @@ _LIBCPP_BEGIN_NAMESPACE_STD -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __convert_to_integral(int __val) { return __val; } +inline _LIBCPP_HIDE_FROM_ABI int __convert_to_integral(int __val) { return __val; } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR unsigned __convert_to_integral(unsigned __val) { return __val; } +inline _LIBCPP_HIDE_FROM_ABI unsigned __convert_to_integral(unsigned __val) { return __val; } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR long __convert_to_integral(long __val) { return __val; } +inline _LIBCPP_HIDE_FROM_ABI long __convert_to_integral(long __val) { return __val; } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR unsigned long __convert_to_integral(unsigned long __val) { - return __val; -} +inline _LIBCPP_HIDE_FROM_ABI unsigned long __convert_to_integral(unsigned long __val) { return __val; } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR long long __convert_to_integral(long long __val) { return __val; } +inline _LIBCPP_HIDE_FROM_ABI long long __convert_to_integral(long long 
__val) { return __val; } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR unsigned long long __convert_to_integral(unsigned long long __val) { - return __val; -} +inline _LIBCPP_HIDE_FROM_ABI unsigned long long __convert_to_integral(unsigned long long __val) { return __val; } template ::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR long long __convert_to_integral(_Fp __val) { +inline _LIBCPP_HIDE_FROM_ABI long long __convert_to_integral(_Fp __val) { return __val; } #ifndef _LIBCPP_HAS_NO_INT128 -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __int128_t __convert_to_integral(__int128_t __val) { return __val; } +inline _LIBCPP_HIDE_FROM_ABI __int128_t __convert_to_integral(__int128_t __val) { return __val; } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __uint128_t __convert_to_integral(__uint128_t __val) { return __val; } +inline _LIBCPP_HIDE_FROM_ABI __uint128_t __convert_to_integral(__uint128_t __val) { return __val; } #endif template ::value> @@ -58,8 +54,7 @@ template struct __sfinae_underlying_type<_Tp, false> {}; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR typename __sfinae_underlying_type<_Tp>::__promoted_type -__convert_to_integral(_Tp __val) { +inline _LIBCPP_HIDE_FROM_ABI typename __sfinae_underlying_type<_Tp>::__promoted_type __convert_to_integral(_Tp __val) { return __val; } diff --git a/libcxx/include/__cxx03/__utility/exception_guard.h b/libcxx/include/__cxx03/__utility/exception_guard.h index a72f77b4edd12..3ced26c67b1c3 100644 --- a/libcxx/include/__cxx03/__utility/exception_guard.h +++ b/libcxx/include/__cxx03/__utility/exception_guard.h @@ -64,12 +64,10 @@ template struct __exception_guard_exceptions { __exception_guard_exceptions() = delete; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit __exception_guard_exceptions(_Rollback __rollback) + _LIBCPP_HIDE_FROM_ABI explicit __exception_guard_exceptions(_Rollback __rollback) : __rollback_(std::move(__rollback)), __completed_(false) {} - 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - __exception_guard_exceptions(__exception_guard_exceptions&& __other) - _NOEXCEPT_(is_nothrow_move_constructible<_Rollback>::value) + _LIBCPP_HIDE_FROM_ABI __exception_guard_exceptions(__exception_guard_exceptions&& __other) : __rollback_(std::move(__other.__rollback_)), __completed_(__other.__completed_) { __other.__completed_ = true; } @@ -78,9 +76,9 @@ struct __exception_guard_exceptions { __exception_guard_exceptions& operator=(__exception_guard_exceptions const&) = delete; __exception_guard_exceptions& operator=(__exception_guard_exceptions&&) = delete; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __complete() _NOEXCEPT { __completed_ = true; } + _LIBCPP_HIDE_FROM_ABI void __complete() _NOEXCEPT { __completed_ = true; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~__exception_guard_exceptions() { + _LIBCPP_HIDE_FROM_ABI ~__exception_guard_exceptions() { if (!__completed_) __rollback_(); } @@ -95,12 +93,9 @@ _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(__exception_guard_exceptions); template struct __exception_guard_noexceptions { __exception_guard_noexceptions() = delete; - _LIBCPP_HIDE_FROM_ABI - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG explicit __exception_guard_noexceptions(_Rollback) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_NODEBUG explicit __exception_guard_noexceptions(_Rollback) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG - __exception_guard_noexceptions(__exception_guard_noexceptions&& __other) - _NOEXCEPT_(is_nothrow_move_constructible<_Rollback>::value) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_NODEBUG __exception_guard_noexceptions(__exception_guard_noexceptions&& __other) : __completed_(__other.__completed_) { __other.__completed_ = true; } @@ -109,11 +104,9 @@ struct __exception_guard_noexceptions { __exception_guard_noexceptions& operator=(__exception_guard_noexceptions const&) = delete; __exception_guard_noexceptions& operator=(__exception_guard_noexceptions&&) = 
delete; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG void __complete() _NOEXCEPT { - __completed_ = true; - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_NODEBUG void __complete() _NOEXCEPT { __completed_ = true; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG ~__exception_guard_noexceptions() { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_NODEBUG ~__exception_guard_noexceptions() { _LIBCPP_ASSERT_INTERNAL(__completed_, "__exception_guard not completed with exceptions disabled"); } @@ -132,7 +125,7 @@ using __exception_guard = __exception_guard_exceptions<_Rollback>; #endif template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __exception_guard<_Rollback> __make_exception_guard(_Rollback __rollback) { +_LIBCPP_HIDE_FROM_ABI __exception_guard<_Rollback> __make_exception_guard(_Rollback __rollback) { return __exception_guard<_Rollback>(std::move(__rollback)); } diff --git a/libcxx/include/__cxx03/__utility/forward.h b/libcxx/include/__cxx03/__utility/forward.h index fc0b45a0ed7ce..091ce5b9dbd02 100644 --- a/libcxx/include/__cxx03/__utility/forward.h +++ b/libcxx/include/__cxx03/__utility/forward.h @@ -21,13 +21,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Tp&& +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _Tp&& forward(_LIBCPP_LIFETIMEBOUND __libcpp_remove_reference_t<_Tp>& __t) _NOEXCEPT { return static_cast<_Tp&&>(__t); } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Tp&& +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _Tp&& forward(_LIBCPP_LIFETIMEBOUND __libcpp_remove_reference_t<_Tp>&& __t) _NOEXCEPT { static_assert(!is_lvalue_reference<_Tp>::value, "cannot forward an rvalue as an lvalue"); return static_cast<_Tp&&>(__t); diff --git a/libcxx/include/__cxx03/__utility/is_pointer_in_range.h b/libcxx/include/__cxx03/__utility/is_pointer_in_range.h index 4d7d3c5e039f2..2da13106adb02 100644 --- a/libcxx/include/__cxx03/__utility/is_pointer_in_range.h 
+++ b/libcxx/include/__cxx03/__utility/is_pointer_in_range.h @@ -33,7 +33,7 @@ struct __is_less_than_comparable<_Tp, _Up, __void_t() }; template ::value, int> = 0> -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_SANITIZE("address") bool +_LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_SANITIZE("address") bool __is_pointer_in_range(const _Tp* __begin, const _Tp* __end, const _Up* __ptr) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(std::__is_valid_range(__begin, __end), "[__begin, __end) is not a valid range"); @@ -48,7 +48,7 @@ __is_pointer_in_range(const _Tp* __begin, const _Tp* __end, const _Up* __ptr) { } template ::value, int> = 0> -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_SANITIZE("address") bool +_LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_SANITIZE("address") bool __is_pointer_in_range(const _Tp* __begin, const _Tp* __end, const _Up* __ptr) { if (__libcpp_is_constant_evaluated()) return false; diff --git a/libcxx/include/__cxx03/__utility/is_valid_range.h b/libcxx/include/__cxx03/__utility/is_valid_range.h index b3770c2c428b0..0d601c75f6017 100644 --- a/libcxx/include/__cxx03/__utility/is_valid_range.h +++ b/libcxx/include/__cxx03/__utility/is_valid_range.h @@ -20,8 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_SANITIZE("address") bool -__is_valid_range(const _Tp* __first, const _Tp* __last) { +_LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_SANITIZE("address") bool __is_valid_range(const _Tp* __first, const _Tp* __last) { if (__libcpp_is_constant_evaluated()) { // If this is not a constant during constant evaluation, that is because __first and __last are not // part of the same allocation. 
If they are part of the same allocation, we must still make sure they diff --git a/libcxx/include/__cxx03/__utility/move.h b/libcxx/include/__cxx03/__utility/move.h index 18692ba9ec128..99b158b18adf2 100644 --- a/libcxx/include/__cxx03/__utility/move.h +++ b/libcxx/include/__cxx03/__utility/move.h @@ -26,7 +26,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __libcpp_remove_reference_t<_Tp>&& +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI __libcpp_remove_reference_t<_Tp>&& move(_LIBCPP_LIFETIMEBOUND _Tp&& __t) _NOEXCEPT { typedef _LIBCPP_NODEBUG __libcpp_remove_reference_t<_Tp> _Up; return static_cast<_Up&&>(__t); @@ -37,7 +37,7 @@ using __move_if_noexcept_result_t = __conditional_t::value && is_copy_constructible<_Tp>::value, const _Tp&, _Tp&&>; template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __move_if_noexcept_result_t<_Tp> +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI __move_if_noexcept_result_t<_Tp> move_if_noexcept(_LIBCPP_LIFETIMEBOUND _Tp& __x) _NOEXCEPT { return std::move(__x); } diff --git a/libcxx/include/__cxx03/__utility/no_destroy.h b/libcxx/include/__cxx03/__utility/no_destroy.h index 5c3dfc5d81e65..3512fe101880f 100644 --- a/libcxx/include/__cxx03/__utility/no_destroy.h +++ b/libcxx/include/__cxx03/__utility/no_destroy.h @@ -30,7 +30,7 @@ struct __uninitialized_tag {}; // initialization using __emplace. template struct __no_destroy { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __no_destroy(__uninitialized_tag) : __obj_() {} + _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(__uninitialized_tag) : __obj_() {} template _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(_Args&&... 
__args) { diff --git a/libcxx/include/__cxx03/__utility/pair.h b/libcxx/include/__cxx03/__utility/pair.h index 765f4b48ac05b..fcd0dccf9b5fb 100644 --- a/libcxx/include/__cxx03/__utility/pair.h +++ b/libcxx/include/__cxx03/__utility/pair.h @@ -52,9 +52,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct __non_trivially_copyable_base { - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI __non_trivially_copyable_base() _NOEXCEPT {} - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI - __non_trivially_copyable_base(__non_trivially_copyable_base const&) _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI __non_trivially_copyable_base() _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI __non_trivially_copyable_base(__non_trivially_copyable_base const&) _NOEXCEPT {} }; template @@ -128,8 +127,7 @@ struct _LIBCPP_TEMPLATE_VIS pair return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(pair& __p) - _NOEXCEPT_(__is_nothrow_swappable_v&& __is_nothrow_swappable_v) { + _LIBCPP_HIDE_FROM_ABI void swap(pair& __p) { using std::swap; swap(first, __p.first); swap(second, __p.second); @@ -139,50 +137,42 @@ struct _LIBCPP_TEMPLATE_VIS pair // [pairs.spec], specialized algorithms template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator==(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { return __x.first == __y.first && __x.second == __y.second; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator!=(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { return !(__x == __y); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator<(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator<(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { return __x.first < 
__y.first || (!(__y.first < __x.first) && __x.second < __y.second); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator>(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator>(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { return __y < __x; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator>=(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { return !(__x < __y); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator<=(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const pair<_T1, _T2>& __x, const pair<_U1, _U2>& __y) { return !(__y < __x); } template && __is_swappable_v<_T2>, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(pair<_T1, _T2>& __x, pair<_T1, _T2>& __y) - _NOEXCEPT_(__is_nothrow_swappable_v<_T1>&& __is_nothrow_swappable_v<_T2>) { +inline _LIBCPP_HIDE_FROM_ABI void swap(pair<_T1, _T2>& __x, pair<_T1, _T2>& __y) { __x.swap(__y); } template -inline _LIBCPP_HIDE_FROM_ABI -_LIBCPP_CONSTEXPR_SINCE_CXX14 pair::type, typename __unwrap_ref_decay<_T2>::type> +inline _LIBCPP_HIDE_FROM_ABI pair::type, typename __unwrap_ref_decay<_T2>::type> make_pair(_T1&& __t1, _T2&& __t2) { return pair::type, typename __unwrap_ref_decay<_T2>::type>( std::forward<_T1>(__t1), std::forward<_T2>(__t2)); @@ -212,22 +202,22 @@ struct __get_pair; template <> struct __get_pair<0> { template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _T1& get(pair<_T1, _T2>& __p) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI _T1& get(pair<_T1, _T2>& __p) _NOEXCEPT { return __p.first; } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _T1& get(const pair<_T1, _T2>& __p) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI const _T1& 
get(const pair<_T1, _T2>& __p) _NOEXCEPT { return __p.first; } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _T1&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI _T1&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { return std::forward<_T1>(__p.first); } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _T1&& get(const pair<_T1, _T2>&& __p) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI const _T1&& get(const pair<_T1, _T2>&& __p) _NOEXCEPT { return std::forward(__p.first); } }; @@ -235,46 +225,44 @@ struct __get_pair<0> { template <> struct __get_pair<1> { template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _T2& get(pair<_T1, _T2>& __p) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI _T2& get(pair<_T1, _T2>& __p) _NOEXCEPT { return __p.second; } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _T2& get(const pair<_T1, _T2>& __p) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI const _T2& get(const pair<_T1, _T2>& __p) _NOEXCEPT { return __p.second; } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _T2&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI _T2&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { return std::forward<_T2>(__p.second); } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _T2&& get(const pair<_T1, _T2>&& __p) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI const _T2&& get(const pair<_T1, _T2>&& __p) _NOEXCEPT { return std::forward(__p.second); } }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, pair<_T1, _T2> >::type& -get(pair<_T1, _T2>& __p) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI typename tuple_element<_Ip, pair<_T1, _T2> >::type& get(pair<_T1, _T2>& __p) _NOEXCEPT { return __get_pair<_Ip>::get(__p); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, pair<_T1, _T2> >::type& +inline 
_LIBCPP_HIDE_FROM_ABI const typename tuple_element<_Ip, pair<_T1, _T2> >::type& get(const pair<_T1, _T2>& __p) _NOEXCEPT { return __get_pair<_Ip>::get(__p); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, pair<_T1, _T2> >::type&& -get(pair<_T1, _T2>&& __p) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI typename tuple_element<_Ip, pair<_T1, _T2> >::type&& get(pair<_T1, _T2>&& __p) _NOEXCEPT { return __get_pair<_Ip>::get(std::move(__p)); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const typename tuple_element<_Ip, pair<_T1, _T2> >::type&& +inline _LIBCPP_HIDE_FROM_ABI const typename tuple_element<_Ip, pair<_T1, _T2> >::type&& get(const pair<_T1, _T2>&& __p) _NOEXCEPT { return __get_pair<_Ip>::get(std::move(__p)); } diff --git a/libcxx/include/__cxx03/__utility/rel_ops.h b/libcxx/include/__cxx03/__utility/rel_ops.h index b8fadd4d86227..803aaaa7b45e6 100644 --- a/libcxx/include/__cxx03/__utility/rel_ops.h +++ b/libcxx/include/__cxx03/__utility/rel_ops.h @@ -20,22 +20,22 @@ _LIBCPP_BEGIN_NAMESPACE_STD namespace rel_ops { template -inline _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI bool operator!=(const _Tp& __x, const _Tp& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const _Tp& __x, const _Tp& __y) { return !(__x == __y); } template -inline _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI bool operator>(const _Tp& __x, const _Tp& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator>(const _Tp& __x, const _Tp& __y) { return __y < __x; } template -inline _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI bool operator<=(const _Tp& __x, const _Tp& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const _Tp& __x, const _Tp& __y) { return !(__y < __x); } template -inline _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI bool operator>=(const _Tp& __x, const _Tp& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const _Tp& __x, const _Tp& __y) { return !(__x < __y); } diff --git 
a/libcxx/include/__cxx03/__utility/swap.h b/libcxx/include/__cxx03/__utility/swap.h index 6b9a8e20323ae..df29e1e388784 100644 --- a/libcxx/include/__cxx03/__utility/swap.h +++ b/libcxx/include/__cxx03/__utility/swap.h @@ -32,16 +32,14 @@ template using __swap_result_t = void; template -inline _LIBCPP_HIDE_FROM_ABI __swap_result_t<_Tp> _LIBCPP_CONSTEXPR_SINCE_CXX20 swap(_Tp& __x, _Tp& __y) - _NOEXCEPT_(is_nothrow_move_constructible<_Tp>::value&& is_nothrow_move_assignable<_Tp>::value) { +inline _LIBCPP_HIDE_FROM_ABI __swap_result_t<_Tp> swap(_Tp& __x, _Tp& __y) { _Tp __t(std::move(__x)); __x = std::move(__y); __y = std::move(__t); } template , int> > -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(_Tp (&__a)[_Np], _Tp (&__b)[_Np]) - _NOEXCEPT_(__is_nothrow_swappable_v<_Tp>) { +inline _LIBCPP_HIDE_FROM_ABI void swap(_Tp (&__a)[_Np], _Tp (&__b)[_Np]) { for (size_t __i = 0; __i != _Np; ++__i) { swap(__a[__i], __b[__i]); } diff --git a/libcxx/include/__cxx03/array b/libcxx/include/__cxx03/array index 4dfebaa9708db..cacab3e86f0a7 100644 --- a/libcxx/include/__cxx03/array +++ b/libcxx/include/__cxx03/array @@ -182,80 +182,60 @@ struct _LIBCPP_TEMPLATE_VIS array { _Tp __elems_[_Size]; // No explicit construct/copy/destroy for aggregate type - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void fill(const value_type& __u) { - std::fill_n(data(), _Size, __u); - } + _LIBCPP_HIDE_FROM_ABI void fill(const value_type& __u) { std::fill_n(data(), _Size, __u); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(array& __a) _NOEXCEPT_(__is_nothrow_swappable_v<_Tp>) { - std::swap_ranges(data(), data() + _Size, __a.data()); - } + _LIBCPP_HIDE_FROM_ABI void swap(array& __a) { std::swap_ranges(data(), data() + _Size, __a.data()); } // iterators: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator begin() _NOEXCEPT { return iterator(data()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator begin() const _NOEXCEPT { 
- return const_iterator(data()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator end() _NOEXCEPT { return iterator(data() + _Size); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator end() const _NOEXCEPT { - return const_iterator(data() + _Size); - } + _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(data()); } + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return const_iterator(data()); } + _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(data() + _Size); } + _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(data() + _Size); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rbegin() _NOEXCEPT { - return reverse_iterator(end()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rbegin() const _NOEXCEPT { - return const_reverse_iterator(end()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rend() _NOEXCEPT { - return reverse_iterator(begin()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rend() const _NOEXCEPT { - return const_reverse_iterator(begin()); - } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cbegin() const _NOEXCEPT { return begin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cend() const _NOEXCEPT { return end(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crbegin() const _NOEXCEPT { - return rbegin(); - } - 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crend() const _NOEXCEPT { return rend(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return begin(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return end(); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { return rbegin(); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { return rend(); } // capacity: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type size() const _NOEXCEPT { return _Size; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type max_size() const _NOEXCEPT { return _Size; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool empty() const _NOEXCEPT { return _Size == 0; } + _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return _Size; } + _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return _Size; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return _Size == 0; } // element access: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator[](size_type __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference operator[](size_type __n) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < _Size, "out-of-bounds access in std::array"); return __elems_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference operator[](size_type __n) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference operator[](size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < _Size, "out-of-bounds access in std::array"); return __elems_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference at(size_type __n) { + _LIBCPP_HIDE_FROM_ABI reference at(size_type __n) { if (__n >= _Size) __throw_out_of_range("array::at"); return __elems_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference at(size_type __n) const { + _LIBCPP_HIDE_FROM_ABI const_reference 
at(size_type __n) const { if (__n >= _Size) __throw_out_of_range("array::at"); return __elems_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference front() _NOEXCEPT { return (*this)[0]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference front() const _NOEXCEPT { return (*this)[0]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference back() _NOEXCEPT { return (*this)[_Size - 1]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference back() const _NOEXCEPT { - return (*this)[_Size - 1]; - } + _LIBCPP_HIDE_FROM_ABI reference front() _NOEXCEPT { return (*this)[0]; } + _LIBCPP_HIDE_FROM_ABI const_reference front() const _NOEXCEPT { return (*this)[0]; } + _LIBCPP_HIDE_FROM_ABI reference back() _NOEXCEPT { return (*this)[_Size - 1]; } + _LIBCPP_HIDE_FROM_ABI const_reference back() const _NOEXCEPT { return (*this)[_Size - 1]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 value_type* data() _NOEXCEPT { return __elems_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const value_type* data() const _NOEXCEPT { return __elems_; } + _LIBCPP_HIDE_FROM_ABI value_type* data() _NOEXCEPT { return __elems_; } + _LIBCPP_HIDE_FROM_ABI const value_type* data() const _NOEXCEPT { return __elems_; } }; template @@ -281,98 +261,83 @@ struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0> { }; _ALIGNAS_TYPE(_ArrayInStructT) _EmptyType __elems_[sizeof(_ArrayInStructT)]; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 value_type* data() _NOEXCEPT { return nullptr; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const value_type* data() const _NOEXCEPT { return nullptr; } + _LIBCPP_HIDE_FROM_ABI value_type* data() _NOEXCEPT { return nullptr; } + _LIBCPP_HIDE_FROM_ABI const value_type* data() const _NOEXCEPT { return nullptr; } // No explicit construct/copy/destroy for aggregate type - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void fill(const value_type&) { + _LIBCPP_HIDE_FROM_ABI void fill(const 
value_type&) { static_assert(!is_const<_Tp>::value, "cannot fill zero-sized array of type 'const T'"); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(array&) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void swap(array&) _NOEXCEPT { static_assert(!is_const<_Tp>::value, "cannot swap zero-sized array of type 'const T'"); } // iterators: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator begin() _NOEXCEPT { return iterator(data()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator begin() const _NOEXCEPT { - return const_iterator(data()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator end() _NOEXCEPT { return iterator(data()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator end() const _NOEXCEPT { - return const_iterator(data()); - } + _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(data()); } + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return const_iterator(data()); } + _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(data()); } + _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(data()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rbegin() _NOEXCEPT { - return reverse_iterator(end()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rbegin() const _NOEXCEPT { - return const_reverse_iterator(end()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rend() _NOEXCEPT { - return reverse_iterator(begin()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rend() const _NOEXCEPT { - return const_reverse_iterator(begin()); - } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() 
_NOEXCEPT { return reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cbegin() const _NOEXCEPT { return begin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cend() const _NOEXCEPT { return end(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crbegin() const _NOEXCEPT { - return rbegin(); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crend() const _NOEXCEPT { return rend(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return begin(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return end(); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { return rbegin(); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { return rend(); } // capacity: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type size() const _NOEXCEPT { return 0; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type max_size() const _NOEXCEPT { return 0; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool empty() const _NOEXCEPT { return true; } + _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return 0; } + _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return 0; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return true; } // element access: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator[](size_type) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference operator[](size_type) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::operator[] on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference operator[](size_type) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference operator[](size_type) 
const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::operator[] on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference at(size_type) { + _LIBCPP_HIDE_FROM_ABI reference at(size_type) { __throw_out_of_range("array::at"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference at(size_type) const { + _LIBCPP_HIDE_FROM_ABI const_reference at(size_type) const { __throw_out_of_range("array::at"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference front() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference front() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::front() on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference front() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference front() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::front() on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference back() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference back() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::back() on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference back() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference back() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::back() on a zero-sized array"); __libcpp_unreachable(); } }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool -operator==(const array<_Tp, _Size>& __x, const array<_Tp, _Size>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const array<_Tp, _Size>& __x, const array<_Tp, _Size>& __y) { return std::equal(__x.begin(), __x.end(), __y.begin()); } @@ -402,8 +367,7 @@ inline _LIBCPP_HIDE_FROM_ABI 
bool operator>=(const array<_Tp, _Size>& __x, const } template , int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(array<_Tp, _Size>& __x, array<_Tp, _Size>& __y) - _NOEXCEPT_(noexcept(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(array<_Tp, _Size>& __x, array<_Tp, _Size>& __y) { __x.swap(__y); } @@ -417,25 +381,25 @@ struct _LIBCPP_TEMPLATE_VIS tuple_element<_Ip, array<_Tp, _Size> > { }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& get(array<_Tp, _Size>& __a) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI _Tp& get(array<_Tp, _Size>& __a) _NOEXCEPT { static_assert(_Ip < _Size, "Index out of bounds in std::get<> (std::array)"); return __a.__elems_[_Ip]; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp& get(const array<_Tp, _Size>& __a) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI const _Tp& get(const array<_Tp, _Size>& __a) _NOEXCEPT { static_assert(_Ip < _Size, "Index out of bounds in std::get<> (const std::array)"); return __a.__elems_[_Ip]; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp&& get(array<_Tp, _Size>&& __a) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI _Tp&& get(array<_Tp, _Size>&& __a) _NOEXCEPT { static_assert(_Ip < _Size, "Index out of bounds in std::get<> (std::array &&)"); return std::move(__a.__elems_[_Ip]); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp&& get(const array<_Tp, _Size>&& __a) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI const _Tp&& get(const array<_Tp, _Size>&& __a) _NOEXCEPT { static_assert(_Ip < _Size, "Index out of bounds in std::get<> (const std::array &&)"); return std::move(__a.__elems_[_Ip]); } diff --git a/libcxx/include/__cxx03/bitset b/libcxx/include/__cxx03/bitset index a68c656a5153e..63d2c23a73bdc 100644 --- a/libcxx/include/__cxx03/bitset +++ b/libcxx/include/__cxx03/bitset @@ -189,51 +189,51 @@ protected: typedef __bit_iterator<__bitset, false> iterator; typedef 
__bit_iterator<__bitset, true> const_iterator; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __bitset() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR __bitset(unsigned long long __v) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI __bitset() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI explicit __bitset(unsigned long long __v) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 reference __make_ref(size_t __pos) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference __make_ref(size_t __pos) _NOEXCEPT { return reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference __make_ref(size_t __pos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference __make_ref(size_t __pos) const _NOEXCEPT { return const_reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 iterator __make_iter(size_t __pos) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI iterator __make_iter(size_t __pos) _NOEXCEPT { return iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 const_iterator __make_iter(size_t __pos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_iterator __make_iter(size_t __pos) const _NOEXCEPT { return const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator&=(const __bitset& __v) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator|=(const __bitset& __v) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator^=(const __bitset& __v) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void operator&=(const __bitset& __v) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void operator|=(const __bitset& __v) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void operator^=(const __bitset& __v) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX23 void flip() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long to_ulong() const { + _LIBCPP_HIDE_FROM_ABI void flip() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI unsigned long to_ulong() const { return to_ulong(integral_constant < bool, _Size< sizeof(unsigned long) * CHAR_BIT>()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long to_ullong() const { + _LIBCPP_HIDE_FROM_ABI unsigned long long to_ullong() const { return to_ullong(integral_constant < bool, _Size< sizeof(unsigned long long) * CHAR_BIT>()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool all() const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool any() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bool all() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bool any() const _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI size_t __hash_code() const _NOEXCEPT; private: void __init(unsigned long long __v, false_type) _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI void __init(unsigned long long __v, true_type) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long to_ulong(false_type) const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long to_ulong(true_type) const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long to_ullong(false_type) const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long to_ullong(true_type) const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long to_ullong(true_type, false_type) const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long to_ullong(true_type, true_type) const; + _LIBCPP_HIDE_FROM_ABI unsigned long to_ulong(false_type) const; + _LIBCPP_HIDE_FROM_ABI unsigned long to_ulong(true_type) const; + _LIBCPP_HIDE_FROM_ABI unsigned long long to_ullong(false_type) const; + _LIBCPP_HIDE_FROM_ABI unsigned long long to_ullong(true_type) const; + _LIBCPP_HIDE_FROM_ABI unsigned long long 
to_ullong(true_type, false_type) const; + _LIBCPP_HIDE_FROM_ABI unsigned long long to_ullong(true_type, true_type) const; }; template -inline _LIBCPP_CONSTEXPR __bitset<_N_words, _Size>::__bitset() _NOEXCEPT { +inline __bitset<_N_words, _Size>::__bitset() _NOEXCEPT { std::fill_n(__first_, _N_words, __storage_type(0)); } @@ -262,33 +262,30 @@ inline _LIBCPP_HIDE_FROM_ABI void __bitset<_N_words, _Size>::__init(unsigned lon } template -inline _LIBCPP_CONSTEXPR __bitset<_N_words, _Size>::__bitset(unsigned long long __v) _NOEXCEPT { +inline __bitset<_N_words, _Size>::__bitset(unsigned long long __v) _NOEXCEPT { __init(__v, integral_constant()); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void -__bitset<_N_words, _Size>::operator&=(const __bitset& __v) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void __bitset<_N_words, _Size>::operator&=(const __bitset& __v) _NOEXCEPT { for (size_type __i = 0; __i < _N_words; ++__i) __first_[__i] &= __v.__first_[__i]; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void -__bitset<_N_words, _Size>::operator|=(const __bitset& __v) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void __bitset<_N_words, _Size>::operator|=(const __bitset& __v) _NOEXCEPT { for (size_type __i = 0; __i < _N_words; ++__i) __first_[__i] |= __v.__first_[__i]; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void -__bitset<_N_words, _Size>::operator^=(const __bitset& __v) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void __bitset<_N_words, _Size>::operator^=(const __bitset& __v) _NOEXCEPT { for (size_type __i = 0; __i < _N_words; ++__i) __first_[__i] ^= __v.__first_[__i]; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void __bitset<_N_words, _Size>::flip() _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI void __bitset<_N_words, _Size>::flip() _NOEXCEPT { // do middle whole words size_type __n = _Size; __storage_pointer __p = __first_; @@ -304,8 +301,7 @@ _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX23 void __bitset<_N_words, _Siz } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long -__bitset<_N_words, _Size>::to_ulong(false_type) const { +_LIBCPP_HIDE_FROM_ABI unsigned long __bitset<_N_words, _Size>::to_ulong(false_type) const { const_iterator __e = __make_iter(_Size); const_iterator __i = std::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); if (__i != __e) @@ -315,14 +311,12 @@ __bitset<_N_words, _Size>::to_ulong(false_type) const { } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long -__bitset<_N_words, _Size>::to_ulong(true_type) const { +inline _LIBCPP_HIDE_FROM_ABI unsigned long __bitset<_N_words, _Size>::to_ulong(true_type) const { return __first_[0]; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long -__bitset<_N_words, _Size>::to_ullong(false_type) const { +_LIBCPP_HIDE_FROM_ABI unsigned long long __bitset<_N_words, _Size>::to_ullong(false_type) const { const_iterator __e = __make_iter(_Size); const_iterator __i = std::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); if (__i != __e) @@ -332,20 +326,17 @@ __bitset<_N_words, _Size>::to_ullong(false_type) const { } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long -__bitset<_N_words, _Size>::to_ullong(true_type) const { +inline _LIBCPP_HIDE_FROM_ABI unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type) const { return to_ullong(true_type(), integral_constant()); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long -__bitset<_N_words, _Size>::to_ullong(true_type, false_type) const { +inline _LIBCPP_HIDE_FROM_ABI unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type, false_type) const { return __first_[0]; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long -__bitset<_N_words, _Size>::to_ullong(true_type, true_type) const { 
+_LIBCPP_HIDE_FROM_ABI unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type, true_type) const { unsigned long long __r = __first_[0]; _LIBCPP_DIAGNOSTIC_PUSH _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wshift-count-overflow") @@ -356,7 +347,7 @@ __bitset<_N_words, _Size>::to_ullong(true_type, true_type) const { } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool __bitset<_N_words, _Size>::all() const _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool __bitset<_N_words, _Size>::all() const _NOEXCEPT { // do middle whole words size_type __n = _Size; __const_storage_pointer __p = __first_; @@ -373,7 +364,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool __bitset<_N_words, _Siz } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool __bitset<_N_words, _Size>::any() const _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool __bitset<_N_words, _Size>::any() const _NOEXCEPT { // do middle whole words size_type __n = _Size; __const_storage_pointer __p = __first_; @@ -423,88 +414,85 @@ protected: typedef __bit_iterator<__bitset, false> iterator; typedef __bit_iterator<__bitset, true> const_iterator; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __bitset() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR __bitset(unsigned long long __v) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI __bitset() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI explicit __bitset(unsigned long long __v) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 reference __make_ref(size_t __pos) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference __make_ref(size_t __pos) _NOEXCEPT { return reference(&__first_, __storage_type(1) << __pos); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference __make_ref(size_t __pos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference __make_ref(size_t __pos) const _NOEXCEPT { return const_reference(&__first_, __storage_type(1) << __pos); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 iterator __make_iter(size_t __pos) _NOEXCEPT { + 
_LIBCPP_HIDE_FROM_ABI iterator __make_iter(size_t __pos) _NOEXCEPT { return iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 const_iterator __make_iter(size_t __pos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_iterator __make_iter(size_t __pos) const _NOEXCEPT { return const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator&=(const __bitset& __v) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator|=(const __bitset& __v) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator^=(const __bitset& __v) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void operator&=(const __bitset& __v) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void operator|=(const __bitset& __v) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void operator^=(const __bitset& __v) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void flip() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void flip() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long to_ulong() const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long to_ullong() const; + _LIBCPP_HIDE_FROM_ABI unsigned long to_ulong() const; + _LIBCPP_HIDE_FROM_ABI unsigned long long to_ullong() const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool all() const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool any() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bool all() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bool any() const _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI size_t __hash_code() const _NOEXCEPT; }; template -inline _LIBCPP_CONSTEXPR __bitset<1, _Size>::__bitset() _NOEXCEPT : __first_(0) {} +inline __bitset<1, _Size>::__bitset() _NOEXCEPT : __first_(0) {} template -inline _LIBCPP_CONSTEXPR __bitset<1, _Size>::__bitset(unsigned long long __v) _NOEXCEPT +inline __bitset<1, 
_Size>::__bitset(unsigned long long __v) _NOEXCEPT : __first_(_Size == __bits_per_word ? static_cast<__storage_type>(__v) : static_cast<__storage_type>(__v) & ((__storage_type(1) << _Size) - 1)) {} template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void -__bitset<1, _Size>::operator&=(const __bitset& __v) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void __bitset<1, _Size>::operator&=(const __bitset& __v) _NOEXCEPT { __first_ &= __v.__first_; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void -__bitset<1, _Size>::operator|=(const __bitset& __v) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void __bitset<1, _Size>::operator|=(const __bitset& __v) _NOEXCEPT { __first_ |= __v.__first_; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void -__bitset<1, _Size>::operator^=(const __bitset& __v) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void __bitset<1, _Size>::operator^=(const __bitset& __v) _NOEXCEPT { __first_ ^= __v.__first_; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void __bitset<1, _Size>::flip() _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI void __bitset<1, _Size>::flip() _NOEXCEPT { __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); __first_ = ~__first_; __first_ &= __m; } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long __bitset<1, _Size>::to_ulong() const { +inline unsigned long __bitset<1, _Size>::to_ulong() const { return __first_; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long __bitset<1, _Size>::to_ullong() const { +inline _LIBCPP_HIDE_FROM_ABI unsigned long long __bitset<1, _Size>::to_ullong() const { return __first_; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool __bitset<1, _Size>::all() const _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bool __bitset<1, _Size>::all() const _NOEXCEPT { __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); return !(~__first_ & 
__m); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool __bitset<1, _Size>::any() const _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bool __bitset<1, _Size>::any() const _NOEXCEPT { __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); return __first_ & __m; } @@ -538,40 +526,32 @@ protected: typedef __bit_iterator<__bitset, false> iterator; typedef __bit_iterator<__bitset, true> const_iterator; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __bitset() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR __bitset(unsigned long long) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI __bitset() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI explicit __bitset(unsigned long long) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 reference __make_ref(size_t) _NOEXCEPT { - return reference(nullptr, 1); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference __make_ref(size_t) const _NOEXCEPT { - return const_reference(nullptr, 1); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 iterator __make_iter(size_t) _NOEXCEPT { - return iterator(nullptr, 0); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 const_iterator __make_iter(size_t) const _NOEXCEPT { - return const_iterator(nullptr, 0); - } + _LIBCPP_HIDE_FROM_ABI reference __make_ref(size_t) _NOEXCEPT { return reference(nullptr, 1); } + _LIBCPP_HIDE_FROM_ABI const_reference __make_ref(size_t) const _NOEXCEPT { return const_reference(nullptr, 1); } + _LIBCPP_HIDE_FROM_ABI iterator __make_iter(size_t) _NOEXCEPT { return iterator(nullptr, 0); } + _LIBCPP_HIDE_FROM_ABI const_iterator __make_iter(size_t) const _NOEXCEPT { return const_iterator(nullptr, 0); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator&=(const __bitset&) _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator|=(const __bitset&) _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator^=(const __bitset&) _NOEXCEPT {} + 
_LIBCPP_HIDE_FROM_ABI void operator&=(const __bitset&) _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI void operator|=(const __bitset&) _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI void operator^=(const __bitset&) _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void flip() _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI void flip() _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long to_ulong() const { return 0; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long to_ullong() const { return 0; } + _LIBCPP_HIDE_FROM_ABI unsigned long to_ulong() const { return 0; } + _LIBCPP_HIDE_FROM_ABI unsigned long long to_ullong() const { return 0; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool all() const _NOEXCEPT { return true; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool any() const _NOEXCEPT { return false; } + _LIBCPP_HIDE_FROM_ABI bool all() const _NOEXCEPT { return true; } + _LIBCPP_HIDE_FROM_ABI bool any() const _NOEXCEPT { return false; } _LIBCPP_HIDE_FROM_ABI size_t __hash_code() const _NOEXCEPT { return 0; } }; -inline _LIBCPP_CONSTEXPR __bitset<0, 0>::__bitset() _NOEXCEPT {} +inline __bitset<0, 0>::__bitset() _NOEXCEPT {} -inline _LIBCPP_CONSTEXPR __bitset<0, 0>::__bitset(unsigned long long) _NOEXCEPT {} +inline __bitset<0, 0>::__bitset(unsigned long long) _NOEXCEPT {} template class _LIBCPP_TEMPLATE_VIS bitset; @@ -590,10 +570,10 @@ public: typedef typename base::const_reference const_reference; // 23.3.5.1 constructors: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bitset() _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bitset(unsigned long long __v) _NOEXCEPT : base(__v) {} + _LIBCPP_HIDE_FROM_ABI bitset() _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI bitset(unsigned long long __v) _NOEXCEPT : base(__v) {} template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 explicit bitset( + _LIBCPP_HIDE_FROM_ABI explicit bitset( const _CharT* __str, typename 
basic_string<_CharT>::size_type __n = basic_string<_CharT>::npos, _CharT __zero = _CharT('0'), @@ -603,7 +583,7 @@ public: } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 explicit bitset( + _LIBCPP_HIDE_FROM_ABI explicit bitset( const basic_string<_CharT, _Traits, _Allocator>& __str, typename basic_string<_CharT, _Traits, _Allocator>::size_type __pos = 0, typename basic_string<_CharT, _Traits, _Allocator>::size_type __n = @@ -618,53 +598,53 @@ public: } // 23.3.5.2 bitset operations: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset& operator&=(const bitset& __rhs) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset& operator|=(const bitset& __rhs) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset& operator^=(const bitset& __rhs) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset& operator<<=(size_t __pos) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset& operator>>=(size_t __pos) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset& set() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset& set(size_t __pos, bool __val = true); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset& reset() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset& reset(size_t __pos); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset operator~() const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset& flip() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset& flip(size_t __pos); + _LIBCPP_HIDE_FROM_ABI bitset& operator&=(const bitset& __rhs) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bitset& operator|=(const bitset& __rhs) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bitset& operator^=(const bitset& __rhs) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bitset& operator<<=(size_t __pos) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bitset& operator>>=(size_t __pos) _NOEXCEPT; + 
_LIBCPP_HIDE_FROM_ABI bitset& set() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bitset& set(size_t __pos, bool __val = true); + _LIBCPP_HIDE_FROM_ABI bitset& reset() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bitset& reset(size_t __pos); + _LIBCPP_HIDE_FROM_ABI bitset operator~() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bitset& flip() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bitset& flip(size_t __pos); // element access: #ifdef _LIBCPP_ABI_BITSET_VECTOR_BOOL_CONST_SUBSCRIPT_RETURN_BOOL - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool operator[](size_t __p) const { return base::__make_ref(__p); } + _LIBCPP_HIDE_FROM_ABI bool operator[](size_t __p) const { return base::__make_ref(__p); } #else - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference operator[](size_t __p) const { return base::__make_ref(__p); } + _LIBCPP_HIDE_FROM_ABI const_reference operator[](size_t __p) const { return base::__make_ref(__p); } #endif - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 reference operator[](size_t __p) { return base::__make_ref(__p); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long to_ulong() const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long to_ullong() const; + _LIBCPP_HIDE_FROM_ABI reference operator[](size_t __p) { return base::__make_ref(__p); } + _LIBCPP_HIDE_FROM_ABI unsigned long to_ulong() const; + _LIBCPP_HIDE_FROM_ABI unsigned long long to_ullong() const; template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 basic_string<_CharT, _Traits, _Allocator> + _LIBCPP_HIDE_FROM_ABI basic_string<_CharT, _Traits, _Allocator> to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 basic_string<_CharT, _Traits, allocator<_CharT> > + _LIBCPP_HIDE_FROM_ABI basic_string<_CharT, _Traits, allocator<_CharT> > to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 
basic_string<_CharT, char_traits<_CharT>, allocator<_CharT> > + _LIBCPP_HIDE_FROM_ABI basic_string<_CharT, char_traits<_CharT>, allocator<_CharT> > to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 basic_string, allocator > + _LIBCPP_HIDE_FROM_ABI basic_string, allocator > to_string(char __zero = '0', char __one = '1') const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 size_t count() const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_t size() const _NOEXCEPT { return _Size; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool operator==(const bitset& __rhs) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_t count() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_t size() const _NOEXCEPT { return _Size; } + _LIBCPP_HIDE_FROM_ABI bool operator==(const bitset& __rhs) const _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI bool operator!=(const bitset& __rhs) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool test(size_t __pos) const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool all() const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool any() const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool none() const _NOEXCEPT { return !any(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset operator<<(size_t __pos) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset operator>>(size_t __pos) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bool test(size_t __pos) const; + _LIBCPP_HIDE_FROM_ABI bool all() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bool any() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bool none() const _NOEXCEPT { return !any(); } + _LIBCPP_HIDE_FROM_ABI bitset operator<<(size_t __pos) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI bitset operator>>(size_t __pos) const _NOEXCEPT; private: template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void + 
_LIBCPP_HIDE_FROM_ABI void __init_from_string_view(basic_string_view<_CharT, _Traits> __str, _CharT __zero, _CharT __one) { for (size_t __i = 0; __i < __str.size(); ++__i) if (!_Traits::eq(__str[__i], __zero) && !_Traits::eq(__str[__i], __one)) @@ -685,28 +665,25 @@ private: }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& -bitset<_Size>::operator&=(const bitset& __rhs) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bitset<_Size>& bitset<_Size>::operator&=(const bitset& __rhs) _NOEXCEPT { base::operator&=(__rhs); return *this; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& -bitset<_Size>::operator|=(const bitset& __rhs) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bitset<_Size>& bitset<_Size>::operator|=(const bitset& __rhs) _NOEXCEPT { base::operator|=(__rhs); return *this; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& -bitset<_Size>::operator^=(const bitset& __rhs) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bitset<_Size>& bitset<_Size>::operator^=(const bitset& __rhs) _NOEXCEPT { base::operator^=(__rhs); return *this; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::operator<<=(size_t __pos) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bitset<_Size>& bitset<_Size>::operator<<=(size_t __pos) _NOEXCEPT { __pos = std::min(__pos, _Size); std::copy_backward(base::__make_iter(0), base::__make_iter(_Size - __pos), base::__make_iter(_Size)); std::fill_n(base::__make_iter(0), __pos, false); @@ -714,7 +691,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size> } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::operator>>=(size_t __pos) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bitset<_Size>& bitset<_Size>::operator>>=(size_t __pos) _NOEXCEPT { __pos = std::min(__pos, _Size); std::copy(base::__make_iter(__pos), base::__make_iter(_Size), base::__make_iter(0)); 
std::fill_n(base::__make_iter(_Size - __pos), __pos, false); @@ -722,13 +699,13 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size> } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::set() _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bitset<_Size>& bitset<_Size>::set() _NOEXCEPT { std::fill_n(base::__make_iter(0), _Size, true); return *this; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::set(size_t __pos, bool __val) { +_LIBCPP_HIDE_FROM_ABI bitset<_Size>& bitset<_Size>::set(size_t __pos, bool __val) { if (__pos >= _Size) __throw_out_of_range("bitset set argument out of range"); @@ -737,13 +714,13 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size> } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::reset() _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bitset<_Size>& bitset<_Size>::reset() _NOEXCEPT { std::fill_n(base::__make_iter(0), _Size, false); return *this; } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::reset(size_t __pos) { +_LIBCPP_HIDE_FROM_ABI bitset<_Size>& bitset<_Size>::reset(size_t __pos) { if (__pos >= _Size) __throw_out_of_range("bitset reset argument out of range"); @@ -752,20 +729,20 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size> } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size> bitset<_Size>::operator~() const _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bitset<_Size> bitset<_Size>::operator~() const _NOEXCEPT { bitset __x(*this); __x.flip(); return __x; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::flip() _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bitset<_Size>& bitset<_Size>::flip() _NOEXCEPT { base::flip(); return *this; } template -_LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size>::flip(size_t __pos) { +_LIBCPP_HIDE_FROM_ABI bitset<_Size>& bitset<_Size>::flip(size_t __pos) { if (__pos >= _Size) __throw_out_of_range("bitset flip argument out of range"); @@ -775,18 +752,18 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size>& bitset<_Size> } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long bitset<_Size>::to_ulong() const { +inline _LIBCPP_HIDE_FROM_ABI unsigned long bitset<_Size>::to_ulong() const { return base::to_ulong(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long bitset<_Size>::to_ullong() const { +inline _LIBCPP_HIDE_FROM_ABI unsigned long long bitset<_Size>::to_ullong() const { return base::to_ullong(); } template template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 basic_string<_CharT, _Traits, _Allocator> +_LIBCPP_HIDE_FROM_ABI basic_string<_CharT, _Traits, _Allocator> bitset<_Size>::to_string(_CharT __zero, _CharT __one) const { basic_string<_CharT, _Traits, _Allocator> __r(_Size, __zero); for (size_t __i = 0; __i != _Size; ++__i) { @@ -798,32 +775,31 @@ bitset<_Size>::to_string(_CharT __zero, _CharT __one) const { template template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 basic_string<_CharT, _Traits, allocator<_CharT> > +inline _LIBCPP_HIDE_FROM_ABI basic_string<_CharT, _Traits, allocator<_CharT> > bitset<_Size>::to_string(_CharT __zero, _CharT __one) const { return to_string<_CharT, _Traits, allocator<_CharT> >(__zero, __one); } template template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 basic_string<_CharT, char_traits<_CharT>, allocator<_CharT> > +inline _LIBCPP_HIDE_FROM_ABI basic_string<_CharT, char_traits<_CharT>, allocator<_CharT> > bitset<_Size>::to_string(_CharT __zero, _CharT __one) const { return to_string<_CharT, char_traits<_CharT>, allocator<_CharT> >(__zero, __one); } template -inline _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX23 basic_string, allocator > +inline _LIBCPP_HIDE_FROM_ABI basic_string, allocator > bitset<_Size>::to_string(char __zero, char __one) const { return to_string, allocator >(__zero, __one); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 size_t bitset<_Size>::count() const _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI size_t bitset<_Size>::count() const _NOEXCEPT { return static_cast(std::count(base::__make_iter(0), base::__make_iter(_Size), true)); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool -bitset<_Size>::operator==(const bitset& __rhs) const _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bool bitset<_Size>::operator==(const bitset& __rhs) const _NOEXCEPT { return std::equal(base::__make_iter(0), base::__make_iter(_Size), __rhs.__make_iter(0)); } @@ -833,7 +809,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool bitset<_Size>::operator!=(const bitset& __rhs) } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool bitset<_Size>::test(size_t __pos) const { +_LIBCPP_HIDE_FROM_ABI bool bitset<_Size>::test(size_t __pos) const { if (__pos >= _Size) __throw_out_of_range("bitset test argument out of range"); @@ -841,50 +817,45 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool bitset<_Size>::test(siz } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool bitset<_Size>::all() const _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bool bitset<_Size>::all() const _NOEXCEPT { return base::all(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool bitset<_Size>::any() const _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bool bitset<_Size>::any() const _NOEXCEPT { return base::any(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size> -bitset<_Size>::operator<<(size_t __pos) const _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bitset<_Size> bitset<_Size>::operator<<(size_t __pos) const _NOEXCEPT { bitset __r = *this; __r <<= __pos; 
return __r; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size> -bitset<_Size>::operator>>(size_t __pos) const _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bitset<_Size> bitset<_Size>::operator>>(size_t __pos) const _NOEXCEPT { bitset __r = *this; __r >>= __pos; return __r; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size> -operator&(const bitset<_Size>& __x, const bitset<_Size>& __y) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bitset<_Size> operator&(const bitset<_Size>& __x, const bitset<_Size>& __y) _NOEXCEPT { bitset<_Size> __r = __x; __r &= __y; return __r; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size> -operator|(const bitset<_Size>& __x, const bitset<_Size>& __y) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bitset<_Size> operator|(const bitset<_Size>& __x, const bitset<_Size>& __y) _NOEXCEPT { bitset<_Size> __r = __x; __r |= __y; return __r; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bitset<_Size> -operator^(const bitset<_Size>& __x, const bitset<_Size>& __y) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bitset<_Size> operator^(const bitset<_Size>& __x, const bitset<_Size>& __y) _NOEXCEPT { bitset<_Size> __r = __x; __r ^= __y; return __r; diff --git a/libcxx/include/__cxx03/cmath b/libcxx/include/__cxx03/cmath index 2f515cd4fb5a9..4de9c59406c12 100644 --- a/libcxx/include/__cxx03/cmath +++ b/libcxx/include/__cxx03/cmath @@ -554,7 +554,7 @@ using ::tgammal _LIBCPP_USING_IF_EXISTS; using ::truncl _LIBCPP_USING_IF_EXISTS; template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isnan(_A1 __lcpp_x) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool __constexpr_isnan(_A1 __lcpp_x) _NOEXCEPT { #if __has_builtin(__builtin_isnan) return __builtin_isnan(__lcpp_x); #else @@ -563,12 +563,12 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isnan(_A1 __lcpp_x) _NO } template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR bool __constexpr_isnan(_A1 __lcpp_x) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool __constexpr_isnan(_A1 __lcpp_x) _NOEXCEPT { return std::isnan(__lcpp_x); } template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isinf(_A1 __lcpp_x) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool __constexpr_isinf(_A1 __lcpp_x) _NOEXCEPT { #if __has_builtin(__builtin_isinf) return __builtin_isinf(__lcpp_x); #else @@ -577,12 +577,12 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isinf(_A1 __lcpp_x) _NO } template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isinf(_A1 __lcpp_x) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool __constexpr_isinf(_A1 __lcpp_x) _NOEXCEPT { return std::isinf(__lcpp_x); } template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isfinite(_A1 __lcpp_x) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool __constexpr_isfinite(_A1 __lcpp_x) _NOEXCEPT { #if __has_builtin(__builtin_isfinite) return __builtin_isfinite(__lcpp_x); #else @@ -591,7 +591,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isfinite(_A1 __lcpp_x) } template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __constexpr_isfinite(_A1 __lcpp_x) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool __constexpr_isfinite(_A1 __lcpp_x) _NOEXCEPT { return __builtin_isfinite(__lcpp_x); } diff --git a/libcxx/include/__cxx03/codecvt b/libcxx/include/__cxx03/codecvt index 003ebfbf713e5..b996ea404b1cc 100644 --- a/libcxx/include/__cxx03/codecvt +++ b/libcxx/include/__cxx03/codecvt @@ -64,14 +64,14 @@ class codecvt_utf8_utf16 _LIBCPP_BEGIN_NAMESPACE_STD -enum _LIBCPP_DEPRECATED_IN_CXX17 codecvt_mode { consume_header = 4, generate_header = 2, little_endian = 1 }; +enum codecvt_mode { consume_header = 4, generate_header = 2, little_endian = 1 }; // codecvt_utf8 template class __codecvt_utf8; -# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template <> class 
_LIBCPP_EXPORTED_FROM_ABI __codecvt_utf8 : public codecvt { unsigned long __maxcode_; @@ -110,7 +110,7 @@ protected: int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const override; int do_max_length() const _NOEXCEPT override; }; -# endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS +#endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS _LIBCPP_SUPPRESS_DEPRECATED_PUSH template <> @@ -188,7 +188,7 @@ protected: _LIBCPP_SUPPRESS_DEPRECATED_PUSH template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX17 codecvt_utf8 : public __codecvt_utf8<_Elem> { +class _LIBCPP_TEMPLATE_VIS codecvt_utf8 : public __codecvt_utf8<_Elem> { public: _LIBCPP_HIDE_FROM_ABI explicit codecvt_utf8(size_t __refs = 0) : __codecvt_utf8<_Elem>(__refs, _Maxcode, _Mode) {} @@ -201,7 +201,7 @@ _LIBCPP_SUPPRESS_DEPRECATED_POP template class __codecvt_utf16; -# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template <> class _LIBCPP_EXPORTED_FROM_ABI __codecvt_utf16 : public codecvt { unsigned long __maxcode_; @@ -279,7 +279,7 @@ protected: int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const override; int do_max_length() const _NOEXCEPT override; }; -# endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS +#endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS _LIBCPP_SUPPRESS_DEPRECATED_PUSH template <> @@ -431,11 +431,11 @@ protected: _LIBCPP_SUPPRESS_DEPRECATED_PUSH template -class _LIBCPP_TEMPLATE_VIS -_LIBCPP_DEPRECATED_IN_CXX17 codecvt_utf16 : public __codecvt_utf16<_Elem, _Mode & little_endian> { +class _LIBCPP_TEMPLATE_VIS codecvt_utf16 : public __codecvt_utf16<_Elem, _Mode & little_endian> { public: _LIBCPP_HIDE_FROM_ABI explicit codecvt_utf16(size_t __refs = 0) - : __codecvt_utf16<_Elem, _Mode & little_endian>(__refs, _Maxcode, _Mode) {} + : __codecvt_utf16 < _Elem, + _Mode & little_endian > (__refs, _Maxcode, _Mode) {} _LIBCPP_HIDE_FROM_ABI ~codecvt_utf16() {} }; @@ -446,7 +446,7 @@ _LIBCPP_SUPPRESS_DEPRECATED_POP 
template class __codecvt_utf8_utf16; -# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS template <> class _LIBCPP_EXPORTED_FROM_ABI __codecvt_utf8_utf16 : public codecvt { unsigned long __maxcode_; @@ -485,7 +485,7 @@ protected: int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const override; int do_max_length() const _NOEXCEPT override; }; -# endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS +#endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS _LIBCPP_SUPPRESS_DEPRECATED_PUSH template <> @@ -563,7 +563,7 @@ protected: _LIBCPP_SUPPRESS_DEPRECATED_PUSH template -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX17 codecvt_utf8_utf16 : public __codecvt_utf8_utf16<_Elem> { +class _LIBCPP_TEMPLATE_VIS codecvt_utf8_utf16 : public __codecvt_utf8_utf16<_Elem> { public: _LIBCPP_HIDE_FROM_ABI explicit codecvt_utf8_utf16(size_t __refs = 0) : __codecvt_utf8_utf16<_Elem>(__refs, _Maxcode, _Mode) {} diff --git a/libcxx/include/__cxx03/complex b/libcxx/include/__cxx03/complex index d670a0b69c458..d7b9976a016d5 100644 --- a/libcxx/include/__cxx03/complex +++ b/libcxx/include/__cxx03/complex @@ -283,20 +283,16 @@ template class _LIBCPP_TEMPLATE_VIS complex; template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator*(const complex<_Tp>& __z, const complex<_Tp>& __w); +_LIBCPP_HIDE_FROM_ABI complex<_Tp> operator*(const complex<_Tp>& __z, const complex<_Tp>& __w); template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator*(const complex<_Tp>& __z, const complex<_Tp>& __w); +_LIBCPP_HIDE_FROM_ABI complex<_Tp> operator*(const complex<_Tp>& __z, const complex<_Tp>& __w); template ::value, int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator/(const complex<_Tp>& __x, const complex<_Tp>& __y); +_LIBCPP_HIDE_FROM_ABI complex<_Tp> operator/(const complex<_Tp>& __x, const complex<_Tp>& __y); template ::value, int> = 
0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator/(const complex<_Tp>& __x, const complex<_Tp>& __y); +_LIBCPP_HIDE_FROM_ABI complex<_Tp> operator/(const complex<_Tp>& __x, const complex<_Tp>& __y); template class _LIBCPP_TEMPLATE_VIS complex { @@ -308,68 +304,66 @@ private: value_type __im_; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 - complex(const value_type& __re = value_type(), const value_type& __im = value_type()) + _LIBCPP_HIDE_FROM_ABI complex(const value_type& __re = value_type(), const value_type& __im = value_type()) : __re_(__re), __im_(__im) {} template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 complex(const complex<_Xp>& __c) - : __re_(__c.real()), __im_(__c.imag()) {} + _LIBCPP_HIDE_FROM_ABI complex(const complex<_Xp>& __c) : __re_(__c.real()), __im_(__c.imag()) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 value_type real() const { return __re_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 value_type imag() const { return __im_; } + _LIBCPP_HIDE_FROM_ABI value_type real() const { return __re_; } + _LIBCPP_HIDE_FROM_ABI value_type imag() const { return __im_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void real(value_type __re) { __re_ = __re; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void imag(value_type __im) { __im_ = __im; } + _LIBCPP_HIDE_FROM_ABI void real(value_type __re) { __re_ = __re; } + _LIBCPP_HIDE_FROM_ABI void imag(value_type __im) { __im_ = __im; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator=(const value_type& __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator=(const value_type& __re) { __re_ = __re; __im_ = value_type(); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator+=(const value_type& __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator+=(const value_type& __re) { __re_ += __re; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& 
operator-=(const value_type& __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator-=(const value_type& __re) { __re_ -= __re; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator*=(const value_type& __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator*=(const value_type& __re) { __re_ *= __re; __im_ *= __re; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator/=(const value_type& __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator/=(const value_type& __re) { __re_ /= __re; __im_ /= __re; return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator=(const complex<_Xp>& __c) { __re_ = __c.real(); __im_ = __c.imag(); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator+=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator+=(const complex<_Xp>& __c) { __re_ += __c.real(); __im_ += __c.imag(); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator-=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator-=(const complex<_Xp>& __c) { __re_ -= __c.real(); __im_ -= __c.imag(); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator*=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator*=(const complex<_Xp>& __c) { *this = *this * complex(__c.real(), __c.imag()); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator/=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator/=(const complex<_Xp>& __c) { *this = *this / complex(__c.real(), __c.imag()); return *this; } @@ -389,7 +383,7 @@ using __complex_t = __conditional_t::value, _Complex double, _Complex long double> >; template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __complex_t<_Tp> __make_complex(_Tp __re, _Tp __im) { 
+_LIBCPP_HIDE_FROM_ABI __complex_t<_Tp> __make_complex(_Tp __re, _Tp __im) { #if __has_builtin(__builtin_complex) return __builtin_complex(__re, __im); #else @@ -405,76 +399,75 @@ class _LIBCPP_TEMPLATE_VIS complex { public: typedef float value_type; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(float __re = 0.0f, float __im = 0.0f) : __re_(__re), __im_(__im) {} + _LIBCPP_HIDE_FROM_ABI complex(float __re = 0.0f, float __im = 0.0f) : __re_(__re), __im_(__im) {} template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit complex(_Tag, _Complex float __v) - : __re_(__real__ __v), __im_(__imag__ __v) {} + _LIBCPP_HIDE_FROM_ABI explicit complex(_Tag, _Complex float __v) : __re_(__real__ __v), __im_(__imag__ __v) {} - _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR complex(const complex& __c); - _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR complex(const complex& __c); + _LIBCPP_HIDE_FROM_ABI explicit complex(const complex& __c); + _LIBCPP_HIDE_FROM_ABI explicit complex(const complex& __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR float real() const { return __re_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR float imag() const { return __im_; } + _LIBCPP_HIDE_FROM_ABI float real() const { return __re_; } + _LIBCPP_HIDE_FROM_ABI float imag() const { return __im_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void real(value_type __re) { __re_ = __re; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void imag(value_type __im) { __im_ = __im; } + _LIBCPP_HIDE_FROM_ABI void real(value_type __re) { __re_ = __re; } + _LIBCPP_HIDE_FROM_ABI void imag(value_type __im) { __im_ = __im; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Complex float __builtin() const { return std::__make_complex(__re_, __im_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void __builtin(_Complex float __f) { + _LIBCPP_HIDE_FROM_ABI _Complex float __builtin() const { return std::__make_complex(__re_, __im_); } + _LIBCPP_HIDE_FROM_ABI void 
__builtin(_Complex float __f) { __re_ = __real__ __f; __im_ = __imag__ __f; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator=(float __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator=(float __re) { __re_ = __re; __im_ = value_type(); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator+=(float __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator+=(float __re) { __re_ += __re; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator-=(float __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator-=(float __re) { __re_ -= __re; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator*=(float __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator*=(float __re) { __re_ *= __re; __im_ *= __re; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator/=(float __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator/=(float __re) { __re_ /= __re; __im_ /= __re; return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator=(const complex<_Xp>& __c) { __re_ = __c.real(); __im_ = __c.imag(); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator+=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator+=(const complex<_Xp>& __c) { __re_ += __c.real(); __im_ += __c.imag(); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator-=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator-=(const complex<_Xp>& __c) { __re_ -= __c.real(); __im_ -= __c.imag(); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator*=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator*=(const complex<_Xp>& __c) { *this = *this * complex(__c.real(), __c.imag()); return *this; } template - 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator/=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator/=(const complex<_Xp>& __c) { *this = *this / complex(__c.real(), __c.imag()); return *this; } @@ -488,79 +481,76 @@ class _LIBCPP_TEMPLATE_VIS complex { public: typedef double value_type; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(double __re = 0.0, double __im = 0.0) : __re_(__re), __im_(__im) {} + _LIBCPP_HIDE_FROM_ABI complex(double __re = 0.0, double __im = 0.0) : __re_(__re), __im_(__im) {} template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit complex(_Tag, _Complex double __v) - : __re_(__real__ __v), __im_(__imag__ __v) {} + _LIBCPP_HIDE_FROM_ABI explicit complex(_Tag, _Complex double __v) : __re_(__real__ __v), __im_(__imag__ __v) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(const complex& __c); - _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR complex(const complex& __c); + _LIBCPP_HIDE_FROM_ABI complex(const complex& __c); + _LIBCPP_HIDE_FROM_ABI explicit complex(const complex& __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR double real() const { return __re_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR double imag() const { return __im_; } + _LIBCPP_HIDE_FROM_ABI double real() const { return __re_; } + _LIBCPP_HIDE_FROM_ABI double imag() const { return __im_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void real(value_type __re) { __re_ = __re; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void imag(value_type __im) { __im_ = __im; } + _LIBCPP_HIDE_FROM_ABI void real(value_type __re) { __re_ = __re; } + _LIBCPP_HIDE_FROM_ABI void imag(value_type __im) { __im_ = __im; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Complex double __builtin() const { - return std::__make_complex(__re_, __im_); - } + _LIBCPP_HIDE_FROM_ABI _Complex double __builtin() const { return std::__make_complex(__re_, __im_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void 
__builtin(_Complex double __f) { + _LIBCPP_HIDE_FROM_ABI void __builtin(_Complex double __f) { __re_ = __real__ __f; __im_ = __imag__ __f; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator=(double __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator=(double __re) { __re_ = __re; __im_ = value_type(); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator+=(double __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator+=(double __re) { __re_ += __re; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator-=(double __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator-=(double __re) { __re_ -= __re; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator*=(double __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator*=(double __re) { __re_ *= __re; __im_ *= __re; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator/=(double __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator/=(double __re) { __re_ /= __re; __im_ /= __re; return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator=(const complex<_Xp>& __c) { __re_ = __c.real(); __im_ = __c.imag(); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator+=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator+=(const complex<_Xp>& __c) { __re_ += __c.real(); __im_ += __c.imag(); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator-=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator-=(const complex<_Xp>& __c) { __re_ -= __c.real(); __im_ -= __c.imag(); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator*=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator*=(const complex<_Xp>& __c) { *this = *this * 
complex(__c.real(), __c.imag()); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator/=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator/=(const complex<_Xp>& __c) { *this = *this / complex(__c.real(), __c.imag()); return *this; } @@ -574,160 +564,144 @@ class _LIBCPP_TEMPLATE_VIS complex { public: typedef long double value_type; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(long double __re = 0.0L, long double __im = 0.0L) - : __re_(__re), __im_(__im) {} + _LIBCPP_HIDE_FROM_ABI complex(long double __re = 0.0L, long double __im = 0.0L) : __re_(__re), __im_(__im) {} template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit complex(_Tag, _Complex long double __v) - : __re_(__real__ __v), __im_(__imag__ __v) {} + _LIBCPP_HIDE_FROM_ABI explicit complex(_Tag, _Complex long double __v) : __re_(__real__ __v), __im_(__imag__ __v) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(const complex& __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR complex(const complex& __c); + _LIBCPP_HIDE_FROM_ABI complex(const complex& __c); + _LIBCPP_HIDE_FROM_ABI complex(const complex& __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR long double real() const { return __re_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR long double imag() const { return __im_; } + _LIBCPP_HIDE_FROM_ABI long double real() const { return __re_; } + _LIBCPP_HIDE_FROM_ABI long double imag() const { return __im_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void real(value_type __re) { __re_ = __re; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void imag(value_type __im) { __im_ = __im; } + _LIBCPP_HIDE_FROM_ABI void real(value_type __re) { __re_ = __re; } + _LIBCPP_HIDE_FROM_ABI void imag(value_type __im) { __im_ = __im; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Complex long double __builtin() const { - return std::__make_complex(__re_, __im_); - } + _LIBCPP_HIDE_FROM_ABI _Complex long double 
__builtin() const { return std::__make_complex(__re_, __im_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void __builtin(_Complex long double __f) { + _LIBCPP_HIDE_FROM_ABI void __builtin(_Complex long double __f) { __re_ = __real__ __f; __im_ = __imag__ __f; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator=(long double __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator=(long double __re) { __re_ = __re; __im_ = value_type(); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator+=(long double __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator+=(long double __re) { __re_ += __re; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator-=(long double __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator-=(long double __re) { __re_ -= __re; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator*=(long double __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator*=(long double __re) { __re_ *= __re; __im_ *= __re; return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator/=(long double __re) { + _LIBCPP_HIDE_FROM_ABI complex& operator/=(long double __re) { __re_ /= __re; __im_ /= __re; return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator=(const complex<_Xp>& __c) { __re_ = __c.real(); __im_ = __c.imag(); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator+=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator+=(const complex<_Xp>& __c) { __re_ += __c.real(); __im_ += __c.imag(); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator-=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator-=(const complex<_Xp>& __c) { __re_ -= __c.real(); __im_ -= __c.imag(); return *this; } template - 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator*=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator*=(const complex<_Xp>& __c) { *this = *this * complex(__c.real(), __c.imag()); return *this; } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex& operator/=(const complex<_Xp>& __c) { + _LIBCPP_HIDE_FROM_ABI complex& operator/=(const complex<_Xp>& __c) { *this = *this / complex(__c.real(), __c.imag()); return *this; } }; -inline _LIBCPP_CONSTEXPR complex::complex(const complex& __c) : __re_(__c.real()), __im_(__c.imag()) {} +inline complex::complex(const complex& __c) : __re_(__c.real()), __im_(__c.imag()) {} -inline _LIBCPP_CONSTEXPR complex::complex(const complex& __c) - : __re_(__c.real()), __im_(__c.imag()) {} +inline complex::complex(const complex& __c) : __re_(__c.real()), __im_(__c.imag()) {} -inline _LIBCPP_CONSTEXPR complex::complex(const complex& __c) : __re_(__c.real()), __im_(__c.imag()) {} +inline complex::complex(const complex& __c) : __re_(__c.real()), __im_(__c.imag()) {} -inline _LIBCPP_CONSTEXPR complex::complex(const complex& __c) - : __re_(__c.real()), __im_(__c.imag()) {} +inline complex::complex(const complex& __c) : __re_(__c.real()), __im_(__c.imag()) {} -inline _LIBCPP_CONSTEXPR complex::complex(const complex& __c) - : __re_(__c.real()), __im_(__c.imag()) {} +inline complex::complex(const complex& __c) : __re_(__c.real()), __im_(__c.imag()) {} -inline _LIBCPP_CONSTEXPR complex::complex(const complex& __c) - : __re_(__c.real()), __im_(__c.imag()) {} +inline complex::complex(const complex& __c) : __re_(__c.real()), __im_(__c.imag()) {} // 26.3.6 operators: template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator+(const complex<_Tp>& __x, const complex<_Tp>& __y) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> operator+(const complex<_Tp>& __x, const complex<_Tp>& __y) { complex<_Tp> __t(__x); __t += __y; return __t; } template -inline 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator+(const complex<_Tp>& __x, const _Tp& __y) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> operator+(const complex<_Tp>& __x, const _Tp& __y) { complex<_Tp> __t(__x); __t += __y; return __t; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator+(const _Tp& __x, const complex<_Tp>& __y) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> operator+(const _Tp& __x, const complex<_Tp>& __y) { complex<_Tp> __t(__y); __t += __x; return __t; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator-(const complex<_Tp>& __x, const complex<_Tp>& __y) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> operator-(const complex<_Tp>& __x, const complex<_Tp>& __y) { complex<_Tp> __t(__x); __t -= __y; return __t; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator-(const complex<_Tp>& __x, const _Tp& __y) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> operator-(const complex<_Tp>& __x, const _Tp& __y) { complex<_Tp> __t(__x); __t -= __y; return __t; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator-(const _Tp& __x, const complex<_Tp>& __y) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> operator-(const _Tp& __x, const complex<_Tp>& __y) { complex<_Tp> __t(-__y); __t += __x; return __t; } template ::value, int> > -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator*(const complex<_Tp>& __lhs, const complex<_Tp>& __rhs) { +_LIBCPP_HIDE_FROM_ABI complex<_Tp> operator*(const complex<_Tp>& __lhs, const complex<_Tp>& __rhs) { return complex<_Tp>(__from_builtin_tag(), __lhs.__builtin() * __rhs.__builtin()); } template ::value, int> > -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) { +_LIBCPP_HIDE_FROM_ABI complex<_Tp> operator*(const complex<_Tp>& __z, const complex<_Tp>& 
__w) { _Tp __a = __z.real(); _Tp __b = __z.imag(); _Tp __c = __w.real(); @@ -737,30 +711,26 @@ operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) { } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator*(const complex<_Tp>& __x, const _Tp& __y) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> operator*(const complex<_Tp>& __x, const _Tp& __y) { complex<_Tp> __t(__x); __t *= __y; return __t; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator*(const _Tp& __x, const complex<_Tp>& __y) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> operator*(const _Tp& __x, const complex<_Tp>& __y) { complex<_Tp> __t(__y); __t *= __x; return __t; } template ::value, int> > -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator/(const complex<_Tp>& __lhs, const complex<_Tp>& __rhs) { +_LIBCPP_HIDE_FROM_ABI complex<_Tp> operator/(const complex<_Tp>& __lhs, const complex<_Tp>& __rhs) { return complex<_Tp>(__from_builtin_tag(), __lhs.__builtin() / __rhs.__builtin()); } template ::value, int> > -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) { +_LIBCPP_HIDE_FROM_ABI complex<_Tp> operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) { _Tp __a = __z.real(); _Tp __b = __z.imag(); _Tp __c = __w.real(); @@ -771,58 +741,54 @@ operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) { } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator/(const complex<_Tp>& __x, const _Tp& __y) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> operator/(const complex<_Tp>& __x, const _Tp& __y) { return complex<_Tp>(__x.real() / __y, __x.imag() / __y); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> -operator/(const _Tp& __x, const complex<_Tp>& __y) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> operator/(const _Tp& __x, const complex<_Tp>& __y) { 
complex<_Tp> __t(__x); __t /= __y; return __t; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> operator+(const complex<_Tp>& __x) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> operator+(const complex<_Tp>& __x) { return __x; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> operator-(const complex<_Tp>& __x) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> operator-(const complex<_Tp>& __x) { return complex<_Tp>(-__x.real(), -__x.imag()); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator==(const complex<_Tp>& __x, const complex<_Tp>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const complex<_Tp>& __x, const complex<_Tp>& __y) { return __x.real() == __y.real() && __x.imag() == __y.imag(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator==(const complex<_Tp>& __x, const _Tp& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const complex<_Tp>& __x, const _Tp& __y) { return __x.real() == __y && __x.imag() == 0; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator==(const _Tp& __x, const complex<_Tp>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const _Tp& __x, const complex<_Tp>& __y) { return __x == __y.real() && 0 == __y.imag(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool -operator!=(const complex<_Tp>& __x, const complex<_Tp>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const complex<_Tp>& __x, const complex<_Tp>& __y) { return !(__x == __y); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator!=(const complex<_Tp>& __x, const _Tp& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const complex<_Tp>& __x, const _Tp& __y) { return !(__x == __y); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool operator!=(const _Tp& __x, const complex<_Tp>& __y) { +inline _LIBCPP_HIDE_FROM_ABI 
bool operator!=(const _Tp& __x, const complex<_Tp>& __y) { return !(__x == __y); } @@ -848,26 +814,24 @@ struct __libcpp_complex_overload_traits<_Tp, false, true> { // real template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp real(const complex<_Tp>& __c) { +inline _LIBCPP_HIDE_FROM_ABI _Tp real(const complex<_Tp>& __c) { return __c.real(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename __libcpp_complex_overload_traits<_Tp>::_ValueType -real(_Tp __re) { +inline _LIBCPP_HIDE_FROM_ABI typename __libcpp_complex_overload_traits<_Tp>::_ValueType real(_Tp __re) { return __re; } // imag template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp imag(const complex<_Tp>& __c) { +inline _LIBCPP_HIDE_FROM_ABI _Tp imag(const complex<_Tp>& __c) { return __c.imag(); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename __libcpp_complex_overload_traits<_Tp>::_ValueType -imag(_Tp) { +inline _LIBCPP_HIDE_FROM_ABI typename __libcpp_complex_overload_traits<_Tp>::_ValueType imag(_Tp) { return 0; } @@ -903,7 +867,7 @@ inline _LIBCPP_HIDE_FROM_ABI float arg(_Tp __re) { // norm template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp norm(const complex<_Tp>& __c) { +inline _LIBCPP_HIDE_FROM_ABI _Tp norm(const complex<_Tp>& __c) { if (std::__constexpr_isinf(__c.real())) return std::abs(__c.real()); if (std::__constexpr_isinf(__c.imag())) @@ -912,8 +876,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp norm(const comple } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename __libcpp_complex_overload_traits<_Tp>::_ValueType -norm(_Tp __re) { +inline _LIBCPP_HIDE_FROM_ABI typename __libcpp_complex_overload_traits<_Tp>::_ValueType norm(_Tp __re) { typedef typename __libcpp_complex_overload_traits<_Tp>::_ValueType _ValueType; return static_cast<_ValueType>(__re) * __re; } @@ -921,13 +884,12 @@ norm(_Tp __re) { // conj template -inline 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 complex<_Tp> conj(const complex<_Tp>& __c) { +inline _LIBCPP_HIDE_FROM_ABI complex<_Tp> conj(const complex<_Tp>& __c) { return complex<_Tp>(__c.real(), -__c.imag()); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename __libcpp_complex_overload_traits<_Tp>::_ComplexType -conj(_Tp __re) { +inline _LIBCPP_HIDE_FROM_ABI typename __libcpp_complex_overload_traits<_Tp>::_ComplexType conj(_Tp __re) { typedef typename __libcpp_complex_overload_traits<_Tp>::_ComplexType _ComplexType; return _ComplexType(__re); } diff --git a/libcxx/include/__cxx03/cwchar b/libcxx/include/__cxx03/cwchar index 0d029c0e80286..43ff5fcb900a4 100644 --- a/libcxx/include/__cxx03/cwchar +++ b/libcxx/include/__cxx03/cwchar @@ -193,7 +193,7 @@ using ::putwchar _LIBCPP_USING_IF_EXISTS; using ::vwprintf _LIBCPP_USING_IF_EXISTS; using ::wprintf _LIBCPP_USING_IF_EXISTS; -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 size_t __constexpr_wcslen(const wchar_t* __str) { +inline _LIBCPP_HIDE_FROM_ABI size_t __constexpr_wcslen(const wchar_t* __str) { #if __has_builtin(__builtin_wcslen) return __builtin_wcslen(__str); #else @@ -207,8 +207,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 size_t __constexpr_wc #endif } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 int -__constexpr_wmemcmp(const wchar_t* __lhs, const wchar_t* __rhs, size_t __count) { +inline _LIBCPP_HIDE_FROM_ABI int __constexpr_wmemcmp(const wchar_t* __lhs, const wchar_t* __rhs, size_t __count) { #if __has_builtin(__builtin_wmemcmp) return __builtin_wmemcmp(__lhs, __rhs, __count); #else @@ -226,7 +225,7 @@ __constexpr_wmemcmp(const wchar_t* __lhs, const wchar_t* __rhs, size_t __count) } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __constexpr_wmemchr(_Tp* __str, _Up __value, size_t __count) { +_LIBCPP_HIDE_FROM_ABI _Tp* __constexpr_wmemchr(_Tp* __str, _Up __value, size_t __count) { static_assert(sizeof(_Tp) 
== sizeof(wchar_t)&& _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t) && __libcpp_is_trivially_equality_comparable<_Tp, _Tp>::value, "Calling wmemchr on non-trivially equality comparable types is unsafe."); diff --git a/libcxx/include/__cxx03/deque b/libcxx/include/__cxx03/deque index d37f77e2ef230..fcf9d606f408f 100644 --- a/libcxx/include/__cxx03/deque +++ b/libcxx/include/__cxx03/deque @@ -541,10 +541,7 @@ private: public: // construct/copy/destroy: - _LIBCPP_HIDE_FROM_ABI deque() _NOEXCEPT_(is_nothrow_default_constructible::value) - : __start_(0), __size_(0, __default_init_tag()) { - __annotate_new(0); - } + _LIBCPP_HIDE_FROM_ABI deque() : __start_(0), __size_(0, __default_init_tag()) { __annotate_new(0); } _LIBCPP_HIDE_FROM_ABI ~deque() { clear(); @@ -672,8 +669,7 @@ public: _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __p); _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __f, const_iterator __l); - _LIBCPP_HIDE_FROM_ABI void swap(deque& __c) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v); + _LIBCPP_HIDE_FROM_ABI void swap(deque& __c); _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI bool __invariants() const { @@ -698,22 +694,14 @@ public: return true; } - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(deque& __c) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_move_assignment::value || - is_nothrow_move_assignable::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(deque& __c) { __move_assign_alloc(__c, integral_constant()); } - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(deque& __c, true_type) - _NOEXCEPT_(is_nothrow_move_assignable::value) { - __alloc() = std::move(__c.__alloc()); - } - + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(deque& __c, true_type) { __alloc() = std::move(__c.__alloc()); } _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(deque&, false_type) _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI void __move_assign(deque& __c) - 
_NOEXCEPT_(__alloc_traits::propagate_on_container_move_assignment::value&& - is_nothrow_move_assignable::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign(deque& __c) { __map_ = std::move(__c.__map_); __start_ = __c.__start_; __size() = __c.size(); @@ -1066,13 +1054,12 @@ private: _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const deque&, false_type) {} - _LIBCPP_HIDE_FROM_ABI void __move_assign(deque& __c, true_type) - _NOEXCEPT_(is_nothrow_move_assignable::value); + _LIBCPP_HIDE_FROM_ABI void __move_assign(deque& __c, true_type); _LIBCPP_HIDE_FROM_ABI void __move_assign(deque& __c, false_type); }; template -_LIBCPP_CONSTEXPR const typename allocator_traits<_Alloc>::difference_type deque<_Tp, _Alloc>::__block_size = +const typename allocator_traits<_Alloc>::difference_type deque<_Tp, _Alloc>::__block_size = __deque_block_size::value; template @@ -2064,8 +2051,7 @@ void deque<_Tp, _Allocator>::__erase_to_end(const_iterator __f) { } template -inline void deque<_Tp, _Allocator>::swap(deque& __c) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v) { +inline void deque<_Tp, _Allocator>::swap(deque& __c) { __map_.swap(__c.__map_); std::swap(__start_, __c.__start_); std::swap(__size(), __c.__size()); @@ -2126,8 +2112,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const deque<_Tp, _Allocator>& __x, } template -inline _LIBCPP_HIDE_FROM_ABI void swap(deque<_Tp, _Allocator>& __x, deque<_Tp, _Allocator>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(deque<_Tp, _Allocator>& __x, deque<_Tp, _Allocator>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/forward_list b/libcxx/include/__cxx03/forward_list index 5090ed1ae8ed0..8a8b5f289e388 100644 --- a/libcxx/include/__cxx03/forward_list +++ b/libcxx/include/__cxx03/forward_list @@ -480,8 +480,7 @@ protected: typedef __forward_list_iterator<__node_pointer> iterator; typedef __forward_list_const_iterator<__node_pointer> 
const_iterator; - _LIBCPP_HIDE_FROM_ABI __forward_list_base() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) - : __before_begin_(__begin_node(), __default_init_tag()) {} + _LIBCPP_HIDE_FROM_ABI __forward_list_base() : __before_begin_(__begin_node(), __default_init_tag()) {} _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const allocator_type& __a) : __before_begin_(__begin_node(), __node_allocator(__a)) {} _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const __node_allocator& __a) @@ -498,9 +497,7 @@ protected: __copy_assign_alloc(__x, integral_constant()); } - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x) - _NOEXCEPT_(!__node_traits::propagate_on_container_move_assignment::value || - is_nothrow_move_assignable<__node_allocator>::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x) { __move_assign_alloc(__x, integral_constant()); } @@ -531,8 +528,7 @@ protected: } public: - _LIBCPP_HIDE_FROM_ABI void swap(__forward_list_base& __x) - _NOEXCEPT_(!__node_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__node_allocator>); + _LIBCPP_HIDE_FROM_ABI void swap(__forward_list_base& __x); protected: _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT; @@ -546,8 +542,7 @@ private: } _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base&, false_type) _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x, true_type) - _NOEXCEPT_(is_nothrow_move_assignable<__node_allocator>::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x, true_type) { __alloc() = std::move(__x.__alloc()); } }; @@ -558,8 +553,7 @@ __forward_list_base<_Tp, _Alloc>::~__forward_list_base() { } template -inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x) - _NOEXCEPT_(!__node_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__node_allocator>) { +inline void __forward_list_base<_Tp, 
_Alloc>::swap(__forward_list_base& __x) { std::__swap_allocator( __alloc(), __x.__alloc(), integral_constant()); using std::swap; @@ -608,8 +602,7 @@ public: typedef typename base::const_iterator const_iterator; typedef void __remove_return_type; - _LIBCPP_HIDE_FROM_ABI forward_list() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) { - } // = default; + _LIBCPP_HIDE_FROM_ABI forward_list() {} // = default; _LIBCPP_HIDE_FROM_ABI explicit forward_list(const allocator_type& __a); _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n); _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v); @@ -682,10 +675,7 @@ public: _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __p); _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __f, const_iterator __l); - _LIBCPP_HIDE_FROM_ABI void swap(forward_list& __x) - _NOEXCEPT_(!__node_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v<__node_allocator>) { - base::swap(__x); - } + _LIBCPP_HIDE_FROM_ABI void swap(forward_list& __x) { base::swap(__x); } _LIBCPP_HIDE_FROM_ABI void resize(size_type __n); _LIBCPP_HIDE_FROM_ABI void resize(size_type __n, const value_type& __v); @@ -1226,8 +1216,7 @@ operator<=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc> } template -inline _LIBCPP_HIDE_FROM_ABI void swap(forward_list<_Tp, _Alloc>& __x, forward_list<_Tp, _Alloc>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(forward_list<_Tp, _Alloc>& __x, forward_list<_Tp, _Alloc>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/limits b/libcxx/include/__cxx03/limits index f4f8eb70062a7..174bf8433c3b8 100644 --- a/libcxx/include/__cxx03/limits +++ b/libcxx/include/__cxx03/limits @@ -125,64 +125,60 @@ enum float_round_style { round_toward_neg_infinity = 3 }; -enum _LIBCPP_DEPRECATED_IN_CXX23 float_denorm_style { - denorm_indeterminate = -1, - denorm_absent = 0, - denorm_present = 1 -}; +enum 
float_denorm_style { denorm_indeterminate = -1, denorm_absent = 0, denorm_present = 1 }; template ::value> class __libcpp_numeric_limits { protected: typedef _Tp type; - static _LIBCPP_CONSTEXPR const bool is_specialized = false; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return type(); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return type(); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return type(); } - - static _LIBCPP_CONSTEXPR const int digits = 0; - static _LIBCPP_CONSTEXPR const int digits10 = 0; - static _LIBCPP_CONSTEXPR const int max_digits10 = 0; - static _LIBCPP_CONSTEXPR const bool is_signed = false; - static _LIBCPP_CONSTEXPR const bool is_integer = false; - static _LIBCPP_CONSTEXPR const bool is_exact = false; - static _LIBCPP_CONSTEXPR const int radix = 0; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(); } - - static _LIBCPP_CONSTEXPR const int min_exponent = 0; - static _LIBCPP_CONSTEXPR const int min_exponent10 = 0; - static _LIBCPP_CONSTEXPR const int max_exponent = 0; - static _LIBCPP_CONSTEXPR const int max_exponent10 = 0; - - static _LIBCPP_CONSTEXPR const bool has_infinity = false; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = false; - static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = false; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = false; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(); } - 
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(); } - - static _LIBCPP_CONSTEXPR const bool is_iec559 = false; - static _LIBCPP_CONSTEXPR const bool is_bounded = false; - static _LIBCPP_CONSTEXPR const bool is_modulo = false; - - static _LIBCPP_CONSTEXPR const bool traps = false; - static _LIBCPP_CONSTEXPR const bool tinyness_before = false; - static _LIBCPP_CONSTEXPR const float_round_style round_style = round_toward_zero; + static const bool is_specialized = false; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type min() _NOEXCEPT { return type(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type max() _NOEXCEPT { return type(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type lowest() _NOEXCEPT { return type(); } + + static const int digits = 0; + static const int digits10 = 0; + static const int max_digits10 = 0; + static const bool is_signed = false; + static const bool is_integer = false; + static const bool is_exact = false; + static const int radix = 0; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type epsilon() _NOEXCEPT { return type(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type round_error() _NOEXCEPT { return type(); } + + static const int min_exponent = 0; + static const int min_exponent10 = 0; + static const int max_exponent = 0; + static const int max_exponent10 = 0; + + static const bool has_infinity = false; + static const bool has_quiet_NaN = false; + static const bool has_signaling_NaN = false; + static const float_denorm_style has_denorm = denorm_absent; + static const bool has_denorm_loss = false; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type infinity() _NOEXCEPT { return type(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type quiet_NaN() _NOEXCEPT { return type(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type 
signaling_NaN() _NOEXCEPT { return type(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type denorm_min() _NOEXCEPT { return type(); } + + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + + static const bool traps = false; + static const bool tinyness_before = false; + static const float_round_style round_style = round_toward_zero; }; template struct __libcpp_compute_min { - static _LIBCPP_CONSTEXPR const _Tp value = _Tp(_Tp(1) << __digits); + static const _Tp value = _Tp(_Tp(1) << __digits); }; template struct __libcpp_compute_min<_Tp, __digits, false> { - static _LIBCPP_CONSTEXPR const _Tp value = _Tp(0); + static const _Tp value = _Tp(0); }; template @@ -190,50 +186,50 @@ class __libcpp_numeric_limits<_Tp, true> { protected: typedef _Tp type; - static _LIBCPP_CONSTEXPR const bool is_specialized = true; - - static _LIBCPP_CONSTEXPR const bool is_signed = type(-1) < type(0); - static _LIBCPP_CONSTEXPR const int digits = static_cast(sizeof(type) * __CHAR_BIT__ - is_signed); - static _LIBCPP_CONSTEXPR const int digits10 = digits * 3 / 10; - static _LIBCPP_CONSTEXPR const int max_digits10 = 0; - static _LIBCPP_CONSTEXPR const type __min = __libcpp_compute_min::value; - static _LIBCPP_CONSTEXPR const type __max = is_signed ? 
type(type(~0) ^ __min) : type(~0); - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __min; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __max; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return min(); } - - static _LIBCPP_CONSTEXPR const bool is_integer = true; - static _LIBCPP_CONSTEXPR const bool is_exact = true; - static _LIBCPP_CONSTEXPR const int radix = 2; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(0); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(0); } - - static _LIBCPP_CONSTEXPR const int min_exponent = 0; - static _LIBCPP_CONSTEXPR const int min_exponent10 = 0; - static _LIBCPP_CONSTEXPR const int max_exponent = 0; - static _LIBCPP_CONSTEXPR const int max_exponent10 = 0; - - static _LIBCPP_CONSTEXPR const bool has_infinity = false; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = false; - static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = false; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = false; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(0); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(0); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(0); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(0); } - - static _LIBCPP_CONSTEXPR const bool is_iec559 = false; - static _LIBCPP_CONSTEXPR const bool is_bounded = true; - static _LIBCPP_CONSTEXPR const bool is_modulo = 
!std::is_signed<_Tp>::value; + static const bool is_specialized = true; + + static const bool is_signed = type(-1) < type(0); + static const int digits = static_cast(sizeof(type) * __CHAR_BIT__ - is_signed); + static const int digits10 = digits * 3 / 10; + static const int max_digits10 = 0; + static const type __min = __libcpp_compute_min::value; + static const type __max = is_signed ? type(type(~0) ^ __min) : type(~0); + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type min() _NOEXCEPT { return __min; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type max() _NOEXCEPT { return __max; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type lowest() _NOEXCEPT { return min(); } + + static const bool is_integer = true; + static const bool is_exact = true; + static const int radix = 2; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type epsilon() _NOEXCEPT { return type(0); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type round_error() _NOEXCEPT { return type(0); } + + static const int min_exponent = 0; + static const int min_exponent10 = 0; + static const int max_exponent = 0; + static const int max_exponent10 = 0; + + static const bool has_infinity = false; + static const bool has_quiet_NaN = false; + static const bool has_signaling_NaN = false; + static const float_denorm_style has_denorm = denorm_absent; + static const bool has_denorm_loss = false; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type infinity() _NOEXCEPT { return type(0); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type quiet_NaN() _NOEXCEPT { return type(0); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type signaling_NaN() _NOEXCEPT { return type(0); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type denorm_min() _NOEXCEPT { return type(0); } + + static const bool is_iec559 = false; + static const bool is_bounded = true; + static const bool is_modulo = !std::is_signed<_Tp>::value; #if defined(__i386__) || defined(__x86_64__) || defined(__pnacl__) || 
defined(__wasm__) - static _LIBCPP_CONSTEXPR const bool traps = true; + static const bool traps = true; #else - static _LIBCPP_CONSTEXPR const bool traps = false; + static const bool traps = false; #endif - static _LIBCPP_CONSTEXPR const bool tinyness_before = false; - static _LIBCPP_CONSTEXPR const float_round_style round_style = round_toward_zero; + static const bool tinyness_before = false; + static const float_round_style round_style = round_toward_zero; }; template <> @@ -241,46 +237,46 @@ class __libcpp_numeric_limits { protected: typedef bool type; - static _LIBCPP_CONSTEXPR const bool is_specialized = true; - - static _LIBCPP_CONSTEXPR const bool is_signed = false; - static _LIBCPP_CONSTEXPR const int digits = 1; - static _LIBCPP_CONSTEXPR const int digits10 = 0; - static _LIBCPP_CONSTEXPR const int max_digits10 = 0; - static _LIBCPP_CONSTEXPR const type __min = false; - static _LIBCPP_CONSTEXPR const type __max = true; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __min; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __max; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return min(); } - - static _LIBCPP_CONSTEXPR const bool is_integer = true; - static _LIBCPP_CONSTEXPR const bool is_exact = true; - static _LIBCPP_CONSTEXPR const int radix = 2; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(0); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(0); } - - static _LIBCPP_CONSTEXPR const int min_exponent = 0; - static _LIBCPP_CONSTEXPR const int min_exponent10 = 0; - static _LIBCPP_CONSTEXPR const int max_exponent = 0; - static _LIBCPP_CONSTEXPR const int max_exponent10 = 0; - - static _LIBCPP_CONSTEXPR const bool has_infinity = false; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = 
false; - static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = false; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = false; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(0); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(0); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(0); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(0); } - - static _LIBCPP_CONSTEXPR const bool is_iec559 = false; - static _LIBCPP_CONSTEXPR const bool is_bounded = true; - static _LIBCPP_CONSTEXPR const bool is_modulo = false; - - static _LIBCPP_CONSTEXPR const bool traps = false; - static _LIBCPP_CONSTEXPR const bool tinyness_before = false; - static _LIBCPP_CONSTEXPR const float_round_style round_style = round_toward_zero; + static const bool is_specialized = true; + + static const bool is_signed = false; + static const int digits = 1; + static const int digits10 = 0; + static const int max_digits10 = 0; + static const type __min = false; + static const type __max = true; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type min() _NOEXCEPT { return __min; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type max() _NOEXCEPT { return __max; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type lowest() _NOEXCEPT { return min(); } + + static const bool is_integer = true; + static const bool is_exact = true; + static const int radix = 2; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type epsilon() _NOEXCEPT { return type(0); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type round_error() _NOEXCEPT { return type(0); } + + static const int min_exponent = 0; + static const int min_exponent10 
= 0; + static const int max_exponent = 0; + static const int max_exponent10 = 0; + + static const bool has_infinity = false; + static const bool has_quiet_NaN = false; + static const bool has_signaling_NaN = false; + static const float_denorm_style has_denorm = denorm_absent; + static const bool has_denorm_loss = false; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type infinity() _NOEXCEPT { return type(0); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type quiet_NaN() _NOEXCEPT { return type(0); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type signaling_NaN() _NOEXCEPT { return type(0); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type denorm_min() _NOEXCEPT { return type(0); } + + static const bool is_iec559 = false; + static const bool is_bounded = true; + static const bool is_modulo = false; + + static const bool traps = false; + static const bool tinyness_before = false; + static const float_round_style round_style = round_toward_zero; }; template <> @@ -288,56 +284,48 @@ class __libcpp_numeric_limits { protected: typedef float type; - static _LIBCPP_CONSTEXPR const bool is_specialized = true; - - static _LIBCPP_CONSTEXPR const bool is_signed = true; - static _LIBCPP_CONSTEXPR const int digits = __FLT_MANT_DIG__; - static _LIBCPP_CONSTEXPR const int digits10 = __FLT_DIG__; - static _LIBCPP_CONSTEXPR const int max_digits10 = 2 + (digits * 30103l) / 100000l; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __FLT_MIN__; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __FLT_MAX__; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); } - - static _LIBCPP_CONSTEXPR const bool is_integer = false; - static _LIBCPP_CONSTEXPR const bool is_exact = false; - static _LIBCPP_CONSTEXPR const int radix = __FLT_RADIX__; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() 
_NOEXCEPT { return __FLT_EPSILON__; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5F; } - - static _LIBCPP_CONSTEXPR const int min_exponent = __FLT_MIN_EXP__; - static _LIBCPP_CONSTEXPR const int min_exponent10 = __FLT_MIN_10_EXP__; - static _LIBCPP_CONSTEXPR const int max_exponent = __FLT_MAX_EXP__; - static _LIBCPP_CONSTEXPR const int max_exponent10 = __FLT_MAX_10_EXP__; - - static _LIBCPP_CONSTEXPR const bool has_infinity = true; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = true; - static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = true; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = false; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { - return __builtin_huge_valf(); - } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { - return __builtin_nanf(""); - } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { - return __builtin_nansf(""); - } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { - return __FLT_DENORM_MIN__; - } - - static _LIBCPP_CONSTEXPR const bool is_iec559 = true; - static _LIBCPP_CONSTEXPR const bool is_bounded = true; - static _LIBCPP_CONSTEXPR const bool is_modulo = false; - - static _LIBCPP_CONSTEXPR const bool traps = false; + static const bool is_specialized = true; + + static const bool is_signed = true; + static const int digits = __FLT_MANT_DIG__; + static const int digits10 = __FLT_DIG__; + static const int max_digits10 = 2 + (digits * 30103l) / 100000l; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type min() _NOEXCEPT { return __FLT_MIN__; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type max() _NOEXCEPT { return __FLT_MAX__; 
} + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type lowest() _NOEXCEPT { return -max(); } + + static const bool is_integer = false; + static const bool is_exact = false; + static const int radix = __FLT_RADIX__; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type epsilon() _NOEXCEPT { return __FLT_EPSILON__; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type round_error() _NOEXCEPT { return 0.5F; } + + static const int min_exponent = __FLT_MIN_EXP__; + static const int min_exponent10 = __FLT_MIN_10_EXP__; + static const int max_exponent = __FLT_MAX_EXP__; + static const int max_exponent10 = __FLT_MAX_10_EXP__; + + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type infinity() _NOEXCEPT { return __builtin_huge_valf(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type quiet_NaN() _NOEXCEPT { return __builtin_nanf(""); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type signaling_NaN() _NOEXCEPT { return __builtin_nansf(""); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type denorm_min() _NOEXCEPT { return __FLT_DENORM_MIN__; } + + static const bool is_iec559 = true; + static const bool is_bounded = true; + static const bool is_modulo = false; + + static const bool traps = false; #if (defined(__arm__) || defined(__aarch64__)) - static _LIBCPP_CONSTEXPR const bool tinyness_before = true; + static const bool tinyness_before = true; #else - static _LIBCPP_CONSTEXPR const bool tinyness_before = false; + static const bool tinyness_before = false; #endif - static _LIBCPP_CONSTEXPR const float_round_style round_style = round_to_nearest; + static const float_round_style round_style = round_to_nearest; }; template <> @@ -345,56 +333,48 @@ class __libcpp_numeric_limits { protected: typedef double type; - static 
_LIBCPP_CONSTEXPR const bool is_specialized = true; - - static _LIBCPP_CONSTEXPR const bool is_signed = true; - static _LIBCPP_CONSTEXPR const int digits = __DBL_MANT_DIG__; - static _LIBCPP_CONSTEXPR const int digits10 = __DBL_DIG__; - static _LIBCPP_CONSTEXPR const int max_digits10 = 2 + (digits * 30103l) / 100000l; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __DBL_MIN__; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __DBL_MAX__; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); } - - static _LIBCPP_CONSTEXPR const bool is_integer = false; - static _LIBCPP_CONSTEXPR const bool is_exact = false; - static _LIBCPP_CONSTEXPR const int radix = __FLT_RADIX__; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __DBL_EPSILON__; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5; } - - static _LIBCPP_CONSTEXPR const int min_exponent = __DBL_MIN_EXP__; - static _LIBCPP_CONSTEXPR const int min_exponent10 = __DBL_MIN_10_EXP__; - static _LIBCPP_CONSTEXPR const int max_exponent = __DBL_MAX_EXP__; - static _LIBCPP_CONSTEXPR const int max_exponent10 = __DBL_MAX_10_EXP__; - - static _LIBCPP_CONSTEXPR const bool has_infinity = true; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = true; - static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = true; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = false; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { - return __builtin_huge_val(); - } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { - return __builtin_nan(""); - } - 
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { - return __builtin_nans(""); - } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { - return __DBL_DENORM_MIN__; - } - - static _LIBCPP_CONSTEXPR const bool is_iec559 = true; - static _LIBCPP_CONSTEXPR const bool is_bounded = true; - static _LIBCPP_CONSTEXPR const bool is_modulo = false; - - static _LIBCPP_CONSTEXPR const bool traps = false; + static const bool is_specialized = true; + + static const bool is_signed = true; + static const int digits = __DBL_MANT_DIG__; + static const int digits10 = __DBL_DIG__; + static const int max_digits10 = 2 + (digits * 30103l) / 100000l; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type min() _NOEXCEPT { return __DBL_MIN__; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type max() _NOEXCEPT { return __DBL_MAX__; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type lowest() _NOEXCEPT { return -max(); } + + static const bool is_integer = false; + static const bool is_exact = false; + static const int radix = __FLT_RADIX__; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type epsilon() _NOEXCEPT { return __DBL_EPSILON__; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type round_error() _NOEXCEPT { return 0.5; } + + static const int min_exponent = __DBL_MIN_EXP__; + static const int min_exponent10 = __DBL_MIN_10_EXP__; + static const int max_exponent = __DBL_MAX_EXP__; + static const int max_exponent10 = __DBL_MAX_10_EXP__; + + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type infinity() _NOEXCEPT { return __builtin_huge_val(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type quiet_NaN() _NOEXCEPT { return __builtin_nan(""); } + 
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type signaling_NaN() _NOEXCEPT { return __builtin_nans(""); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type denorm_min() _NOEXCEPT { return __DBL_DENORM_MIN__; } + + static const bool is_iec559 = true; + static const bool is_bounded = true; + static const bool is_modulo = false; + + static const bool traps = false; #if (defined(__arm__) || defined(__aarch64__)) - static _LIBCPP_CONSTEXPR const bool tinyness_before = true; + static const bool tinyness_before = true; #else - static _LIBCPP_CONSTEXPR const bool tinyness_before = false; + static const bool tinyness_before = false; #endif - static _LIBCPP_CONSTEXPR const float_round_style round_style = round_to_nearest; + static const float_round_style round_style = round_to_nearest; }; template <> @@ -402,60 +382,52 @@ class __libcpp_numeric_limits { protected: typedef long double type; - static _LIBCPP_CONSTEXPR const bool is_specialized = true; - - static _LIBCPP_CONSTEXPR const bool is_signed = true; - static _LIBCPP_CONSTEXPR const int digits = __LDBL_MANT_DIG__; - static _LIBCPP_CONSTEXPR const int digits10 = __LDBL_DIG__; - static _LIBCPP_CONSTEXPR const int max_digits10 = 2 + (digits * 30103l) / 100000l; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __LDBL_MIN__; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __LDBL_MAX__; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); } - - static _LIBCPP_CONSTEXPR const bool is_integer = false; - static _LIBCPP_CONSTEXPR const bool is_exact = false; - static _LIBCPP_CONSTEXPR const int radix = __FLT_RADIX__; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __LDBL_EPSILON__; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5L; } - - static 
_LIBCPP_CONSTEXPR const int min_exponent = __LDBL_MIN_EXP__; - static _LIBCPP_CONSTEXPR const int min_exponent10 = __LDBL_MIN_10_EXP__; - static _LIBCPP_CONSTEXPR const int max_exponent = __LDBL_MAX_EXP__; - static _LIBCPP_CONSTEXPR const int max_exponent10 = __LDBL_MAX_10_EXP__; - - static _LIBCPP_CONSTEXPR const bool has_infinity = true; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = true; - static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = true; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = false; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { - return __builtin_huge_vall(); - } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { - return __builtin_nanl(""); - } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { - return __builtin_nansl(""); - } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { - return __LDBL_DENORM_MIN__; - } + static const bool is_specialized = true; + + static const bool is_signed = true; + static const int digits = __LDBL_MANT_DIG__; + static const int digits10 = __LDBL_DIG__; + static const int max_digits10 = 2 + (digits * 30103l) / 100000l; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type min() _NOEXCEPT { return __LDBL_MIN__; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type max() _NOEXCEPT { return __LDBL_MAX__; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type lowest() _NOEXCEPT { return -max(); } + + static const bool is_integer = false; + static const bool is_exact = false; + static const int radix = __FLT_RADIX__; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type epsilon() _NOEXCEPT { return __LDBL_EPSILON__; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type 
round_error() _NOEXCEPT { return 0.5L; } + + static const int min_exponent = __LDBL_MIN_EXP__; + static const int min_exponent10 = __LDBL_MIN_10_EXP__; + static const int max_exponent = __LDBL_MAX_EXP__; + static const int max_exponent10 = __LDBL_MAX_10_EXP__; + + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type infinity() _NOEXCEPT { return __builtin_huge_vall(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type quiet_NaN() _NOEXCEPT { return __builtin_nanl(""); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type signaling_NaN() _NOEXCEPT { return __builtin_nansl(""); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type denorm_min() _NOEXCEPT { return __LDBL_DENORM_MIN__; } #if defined(__powerpc__) && defined(__LONG_DOUBLE_IBM128__) - static _LIBCPP_CONSTEXPR const bool is_iec559 = false; + static const bool is_iec559 = false; #else - static _LIBCPP_CONSTEXPR const bool is_iec559 = true; + static const bool is_iec559 = true; #endif - static _LIBCPP_CONSTEXPR const bool is_bounded = true; - static _LIBCPP_CONSTEXPR const bool is_modulo = false; + static const bool is_bounded = true; + static const bool is_modulo = false; - static _LIBCPP_CONSTEXPR const bool traps = false; + static const bool traps = false; #if (defined(__arm__) || defined(__aarch64__)) - static _LIBCPP_CONSTEXPR const bool tinyness_before = true; + static const bool tinyness_before = true; #else - static _LIBCPP_CONSTEXPR const bool tinyness_before = false; + static const bool tinyness_before = false; #endif - static _LIBCPP_CONSTEXPR const float_round_style round_style = round_to_nearest; + static const float_round_style round_style = round_to_nearest; }; template @@ -464,105 +436,93 @@ class _LIBCPP_TEMPLATE_VIS numeric_limits : private 
__libcpp_numeric_limits<_Tp> typedef typename __base::type type; public: - static _LIBCPP_CONSTEXPR const bool is_specialized = __base::is_specialized; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); } - - static _LIBCPP_CONSTEXPR const int digits = __base::digits; - static _LIBCPP_CONSTEXPR const int digits10 = __base::digits10; - static _LIBCPP_CONSTEXPR const int max_digits10 = __base::max_digits10; - static _LIBCPP_CONSTEXPR const bool is_signed = __base::is_signed; - static _LIBCPP_CONSTEXPR const bool is_integer = __base::is_integer; - static _LIBCPP_CONSTEXPR const bool is_exact = __base::is_exact; - static _LIBCPP_CONSTEXPR const int radix = __base::radix; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { - return __base::epsilon(); - } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { - return __base::round_error(); - } - - static _LIBCPP_CONSTEXPR const int min_exponent = __base::min_exponent; - static _LIBCPP_CONSTEXPR const int min_exponent10 = __base::min_exponent10; - static _LIBCPP_CONSTEXPR const int max_exponent = __base::max_exponent; - static _LIBCPP_CONSTEXPR const int max_exponent10 = __base::max_exponent10; - - static _LIBCPP_CONSTEXPR const bool has_infinity = __base::has_infinity; - static _LIBCPP_CONSTEXPR const bool has_quiet_NaN = __base::has_quiet_NaN; - static _LIBCPP_CONSTEXPR const bool has_signaling_NaN = __base::has_signaling_NaN; + static const bool is_specialized = __base::is_specialized; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type min() _NOEXCEPT { return __base::min(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type max() 
_NOEXCEPT { return __base::max(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type lowest() _NOEXCEPT { return __base::lowest(); } + + static const int digits = __base::digits; + static const int digits10 = __base::digits10; + static const int max_digits10 = __base::max_digits10; + static const bool is_signed = __base::is_signed; + static const bool is_integer = __base::is_integer; + static const bool is_exact = __base::is_exact; + static const int radix = __base::radix; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type epsilon() _NOEXCEPT { return __base::epsilon(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type round_error() _NOEXCEPT { return __base::round_error(); } + + static const int min_exponent = __base::min_exponent; + static const int min_exponent10 = __base::min_exponent10; + static const int max_exponent = __base::max_exponent; + static const int max_exponent10 = __base::max_exponent10; + + static const bool has_infinity = __base::has_infinity; + static const bool has_quiet_NaN = __base::has_quiet_NaN; + static const bool has_signaling_NaN = __base::has_signaling_NaN; _LIBCPP_SUPPRESS_DEPRECATED_PUSH - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm; - static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss = __base::has_denorm_loss; + static const float_denorm_style has_denorm = __base::has_denorm; + static const bool has_denorm_loss = __base::has_denorm_loss; _LIBCPP_SUPPRESS_DEPRECATED_POP - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { - return __base::infinity(); - } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { - return __base::quiet_NaN(); - } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { - return __base::signaling_NaN(); - } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR 
type denorm_min() _NOEXCEPT { - return __base::denorm_min(); - } - - static _LIBCPP_CONSTEXPR const bool is_iec559 = __base::is_iec559; - static _LIBCPP_CONSTEXPR const bool is_bounded = __base::is_bounded; - static _LIBCPP_CONSTEXPR const bool is_modulo = __base::is_modulo; - - static _LIBCPP_CONSTEXPR const bool traps = __base::traps; - static _LIBCPP_CONSTEXPR const bool tinyness_before = __base::tinyness_before; - static _LIBCPP_CONSTEXPR const float_round_style round_style = __base::round_style; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type infinity() _NOEXCEPT { return __base::infinity(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI static type denorm_min() _NOEXCEPT { return __base::denorm_min(); } + + static const bool is_iec559 = __base::is_iec559; + static const bool is_bounded = __base::is_bounded; + static const bool is_modulo = __base::is_modulo; + + static const bool traps = __base::traps; + static const bool tinyness_before = __base::tinyness_before; + static const float_round_style round_style = __base::round_style; }; template -_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::is_specialized; +const bool numeric_limits<_Tp>::is_specialized; template -_LIBCPP_CONSTEXPR const int numeric_limits<_Tp>::digits; +const int numeric_limits<_Tp>::digits; template -_LIBCPP_CONSTEXPR const int numeric_limits<_Tp>::digits10; +const int numeric_limits<_Tp>::digits10; template -_LIBCPP_CONSTEXPR const int numeric_limits<_Tp>::max_digits10; +const int numeric_limits<_Tp>::max_digits10; template -_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::is_signed; +const bool numeric_limits<_Tp>::is_signed; template -_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::is_integer; +const bool numeric_limits<_Tp>::is_integer; template 
-_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::is_exact; +const bool numeric_limits<_Tp>::is_exact; template -_LIBCPP_CONSTEXPR const int numeric_limits<_Tp>::radix; +const int numeric_limits<_Tp>::radix; template -_LIBCPP_CONSTEXPR const int numeric_limits<_Tp>::min_exponent; +const int numeric_limits<_Tp>::min_exponent; template -_LIBCPP_CONSTEXPR const int numeric_limits<_Tp>::min_exponent10; +const int numeric_limits<_Tp>::min_exponent10; template -_LIBCPP_CONSTEXPR const int numeric_limits<_Tp>::max_exponent; +const int numeric_limits<_Tp>::max_exponent; template -_LIBCPP_CONSTEXPR const int numeric_limits<_Tp>::max_exponent10; +const int numeric_limits<_Tp>::max_exponent10; template -_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::has_infinity; +const bool numeric_limits<_Tp>::has_infinity; template -_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::has_quiet_NaN; +const bool numeric_limits<_Tp>::has_quiet_NaN; template -_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::has_signaling_NaN; +const bool numeric_limits<_Tp>::has_signaling_NaN; template -_LIBCPP_CONSTEXPR const float_denorm_style numeric_limits<_Tp>::has_denorm; +const float_denorm_style numeric_limits<_Tp>::has_denorm; template -_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::has_denorm_loss; +const bool numeric_limits<_Tp>::has_denorm_loss; template -_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::is_iec559; +const bool numeric_limits<_Tp>::is_iec559; template -_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::is_bounded; +const bool numeric_limits<_Tp>::is_bounded; template -_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::is_modulo; +const bool numeric_limits<_Tp>::is_modulo; template -_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::traps; +const bool numeric_limits<_Tp>::traps; template -_LIBCPP_CONSTEXPR const bool numeric_limits<_Tp>::tinyness_before; +const bool numeric_limits<_Tp>::tinyness_before; template -_LIBCPP_CONSTEXPR const float_round_style 
numeric_limits<_Tp>::round_style; +const float_round_style numeric_limits<_Tp>::round_style; template class _LIBCPP_TEMPLATE_VIS numeric_limits : public numeric_limits<_Tp> {}; diff --git a/libcxx/include/__cxx03/list b/libcxx/include/__cxx03/list index 037067d531ddb..f863885fbcf9e 100644 --- a/libcxx/include/__cxx03/list +++ b/libcxx/include/__cxx03/list @@ -487,7 +487,7 @@ protected: } _LIBCPP_HIDE_FROM_ABI static void __unlink_nodes(__link_pointer __f, __link_pointer __l) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI __list_imp() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value); + _LIBCPP_HIDE_FROM_ABI __list_imp(); _LIBCPP_HIDE_FROM_ABI __list_imp(const allocator_type& __a); _LIBCPP_HIDE_FROM_ABI __list_imp(const __node_allocator& __a); _LIBCPP_HIDE_FROM_ABI ~__list_imp(); @@ -499,17 +499,14 @@ protected: _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(__end_as_link()); } _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(__end_as_link()); } - _LIBCPP_HIDE_FROM_ABI void swap(__list_imp& __c) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v); + _LIBCPP_HIDE_FROM_ABI void swap(__list_imp& __c); _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __list_imp& __c) { __copy_assign_alloc( __c, integral_constant()); } - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__list_imp& __c) - _NOEXCEPT_(!__node_alloc_traits::propagate_on_container_move_assignment::value || - is_nothrow_move_assignable<__node_allocator>::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__list_imp& __c) { __move_assign_alloc( __c, integral_constant()); } @@ -550,8 +547,7 @@ private: _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __list_imp&, false_type) {} - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__list_imp& __c, true_type) - _NOEXCEPT_(is_nothrow_move_assignable<__node_allocator>::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__list_imp& __c, true_type) { 
__node_alloc() = std::move(__c.__node_alloc()); } @@ -566,8 +562,7 @@ inline void __list_imp<_Tp, _Alloc>::__unlink_nodes(__link_pointer __f, __link_p } template -inline __list_imp<_Tp, _Alloc>::__list_imp() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) - : __size_alloc_(0, __default_init_tag()) {} +inline __list_imp<_Tp, _Alloc>::__list_imp() : __size_alloc_(0, __default_init_tag()) {} template inline __list_imp<_Tp, _Alloc>::__list_imp(const allocator_type& __a) : __size_alloc_(0, __node_allocator(__a)) {} @@ -596,8 +591,7 @@ void __list_imp<_Tp, _Alloc>::clear() _NOEXCEPT { } template -void __list_imp<_Tp, _Alloc>::swap(__list_imp& __c) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v) { +void __list_imp<_Tp, _Alloc>::swap(__list_imp& __c) { _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR( __alloc_traits::propagate_on_container_swap::value || this->__node_alloc() == __c.__node_alloc(), "list::swap: Either propagate_on_container_swap must be true" @@ -645,7 +639,7 @@ public: typedef std::reverse_iterator const_reverse_iterator; typedef void __remove_return_type; - _LIBCPP_HIDE_FROM_ABI list() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) {} + _LIBCPP_HIDE_FROM_ABI list() {} _LIBCPP_HIDE_FROM_ABI explicit list(const allocator_type& __a) : base(__a) {} _LIBCPP_HIDE_FROM_ABI explicit list(size_type __n); _LIBCPP_HIDE_FROM_ABI list(size_type __n, const value_type& __x); @@ -720,11 +714,7 @@ public: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, _InpIter __f, _InpIter __l); - _LIBCPP_HIDE_FROM_ABI void swap(list& __c) - _NOEXCEPT_(!__node_alloc_traits::propagate_on_container_swap::value || - __is_nothrow_swappable_v<__node_allocator>) { - base::swap(__c); - } + _LIBCPP_HIDE_FROM_ABI void swap(list& __c) { base::swap(__c); } _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { base::clear(); } _LIBCPP_HIDE_FROM_ABI void pop_front(); @@ -773,8 +763,7 @@ 
private: template _LIBCPP_HIDDEN static iterator __sort(iterator __f1, iterator __e2, size_type __n, _Comp& __comp); - _LIBCPP_HIDE_FROM_ABI void __move_assign(list& __c, true_type) - _NOEXCEPT_(is_nothrow_move_assignable<__node_allocator>::value); + _LIBCPP_HIDE_FROM_ABI void __move_assign(list& __c, true_type); _LIBCPP_HIDE_FROM_ABI void __move_assign(list& __c, false_type); }; @@ -1355,8 +1344,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const list<_Tp, _Alloc>& __x, const } template -inline _LIBCPP_HIDE_FROM_ABI void swap(list<_Tp, _Alloc>& __x, list<_Tp, _Alloc>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(list<_Tp, _Alloc>& __x, list<_Tp, _Alloc>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/locale b/libcxx/include/__cxx03/locale index 405a482472901..64162f5a4ff2c 100644 --- a/libcxx/include/__cxx03/locale +++ b/libcxx/include/__cxx03/locale @@ -1328,7 +1328,7 @@ _LIBCPP_HIDE_FROM_ABI inline _OutputIterator num_put<_CharT, _OutputIterator>::_ // Worst case is octal, with showbase enabled. Note that octal is always // printed as an unsigned value. 
using _Unsigned = typename make_unsigned<_Integral>::type; - _LIBCPP_CONSTEXPR const unsigned __nbuf = + const unsigned __nbuf = (numeric_limits<_Unsigned>::digits / 3) // 1 char per 3 bits + ((numeric_limits<_Unsigned>::digits % 3) != 0) // round up + 2; // base prefix + terminating null character @@ -3140,7 +3140,7 @@ template , class _ByteAlloc = allocator > -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX17 wstring_convert { +class _LIBCPP_TEMPLATE_VIS wstring_convert { public: typedef basic_string, _ByteAlloc> byte_string; typedef basic_string<_Elem, char_traits<_Elem>, _WideAlloc> wide_string; @@ -3155,11 +3155,10 @@ private: size_t __cvtcount_; public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_EXPLICIT_SINCE_CXX14 wstring_convert(_Codecvt* __pcvt = new _Codecvt); + _LIBCPP_HIDE_FROM_ABI wstring_convert(_Codecvt* __pcvt = new _Codecvt); _LIBCPP_HIDE_FROM_ABI wstring_convert(_Codecvt* __pcvt, state_type __state); - _LIBCPP_EXPLICIT_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI - wstring_convert(const byte_string& __byte_err, const wide_string& __wide_err = wide_string()); + _LIBCPP_HIDE_FROM_ABI wstring_convert(const byte_string& __byte_err, const wide_string& __wide_err = wide_string()); _LIBCPP_HIDE_FROM_ABI ~wstring_convert(); wstring_convert(const wstring_convert& __wc) = delete; @@ -3329,7 +3328,7 @@ wstring_convert<_Codecvt, _Elem, _WideAlloc, _ByteAlloc>::to_bytes(const _Elem* } template > -class _LIBCPP_TEMPLATE_VIS _LIBCPP_DEPRECATED_IN_CXX17 wbuffer_convert : public basic_streambuf<_Elem, _Tr> { +class _LIBCPP_TEMPLATE_VIS wbuffer_convert : public basic_streambuf<_Elem, _Tr> { public: // types: typedef _Elem char_type; @@ -3356,7 +3355,7 @@ private: bool __always_noconv_; public: - _LIBCPP_EXPLICIT_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI wbuffer_convert(streambuf* __bytebuf = nullptr, _Codecvt* __pcvt = new _Codecvt, state_type __state = state_type()); _LIBCPP_HIDE_FROM_ABI ~wbuffer_convert(); diff --git a/libcxx/include/__cxx03/map 
b/libcxx/include/__cxx03/map index 5c648c08ed81e..f4bf7107e8281 100644 --- a/libcxx/include/__cxx03/map +++ b/libcxx/include/__cxx03/map @@ -610,10 +610,8 @@ template ::value && !__libcpp_is_final<_Compare>::value> class __map_value_compare : private _Compare { public: - _LIBCPP_HIDE_FROM_ABI __map_value_compare() _NOEXCEPT_(is_nothrow_default_constructible<_Compare>::value) - : _Compare() {} - _LIBCPP_HIDE_FROM_ABI __map_value_compare(_Compare __c) _NOEXCEPT_(is_nothrow_copy_constructible<_Compare>::value) - : _Compare(__c) {} + _LIBCPP_HIDE_FROM_ABI __map_value_compare() : _Compare() {} + _LIBCPP_HIDE_FROM_ABI __map_value_compare(_Compare __c) : _Compare(__c) {} _LIBCPP_HIDE_FROM_ABI const _Compare& key_comp() const _NOEXCEPT { return *this; } _LIBCPP_HIDE_FROM_ABI bool operator()(const _CP& __x, const _CP& __y) const { return static_cast(*this)(__x.__get_value().first, __y.__get_value().first); @@ -624,7 +622,7 @@ public: _LIBCPP_HIDE_FROM_ABI bool operator()(const _Key& __x, const _CP& __y) const { return static_cast(*this)(__x, __y.__get_value().first); } - _LIBCPP_HIDE_FROM_ABI void swap(__map_value_compare& __y) _NOEXCEPT_(__is_nothrow_swappable_v<_Compare>) { + _LIBCPP_HIDE_FROM_ABI void swap(__map_value_compare& __y) { using std::swap; swap(static_cast<_Compare&>(*this), static_cast<_Compare&>(__y)); } @@ -635,10 +633,8 @@ class __map_value_compare<_Key, _CP, _Compare, false> { _Compare __comp_; public: - _LIBCPP_HIDE_FROM_ABI __map_value_compare() _NOEXCEPT_(is_nothrow_default_constructible<_Compare>::value) - : __comp_() {} - _LIBCPP_HIDE_FROM_ABI __map_value_compare(_Compare __c) _NOEXCEPT_(is_nothrow_copy_constructible<_Compare>::value) - : __comp_(__c) {} + _LIBCPP_HIDE_FROM_ABI __map_value_compare() : __comp_() {} + _LIBCPP_HIDE_FROM_ABI __map_value_compare(_Compare __c) : __comp_(__c) {} _LIBCPP_HIDE_FROM_ABI const _Compare& key_comp() const _NOEXCEPT { return __comp_; } _LIBCPP_HIDE_FROM_ABI bool operator()(const _CP& __x, const _CP& __y) const { 
@@ -650,7 +646,7 @@ public: _LIBCPP_HIDE_FROM_ABI bool operator()(const _Key& __x, const _CP& __y) const { return __comp_(__x, __y.__get_value().first); } - void swap(__map_value_compare& __y) _NOEXCEPT_(__is_nothrow_swappable_v<_Compare>) { + void swap(__map_value_compare& __y) { using std::swap; swap(__comp_, __y.__comp_); } @@ -658,8 +654,7 @@ public: template inline _LIBCPP_HIDE_FROM_ABI void -swap(__map_value_compare<_Key, _CP, _Compare, __b>& __x, __map_value_compare<_Key, _CP, _Compare, __b>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +swap(__map_value_compare<_Key, _CP, _Compare, __b>& __x, __map_value_compare<_Key, _CP, _Compare, __b>& __y) { __x.swap(__y); } @@ -900,14 +895,9 @@ public: template friend class _LIBCPP_TEMPLATE_VIS multimap; - _LIBCPP_HIDE_FROM_ABI map() _NOEXCEPT_( - is_nothrow_default_constructible::value&& is_nothrow_default_constructible::value&& - is_nothrow_copy_constructible::value) - : __tree_(__vc(key_compare())) {} + _LIBCPP_HIDE_FROM_ABI map() : __tree_(__vc(key_compare())) {} - _LIBCPP_HIDE_FROM_ABI explicit map(const key_compare& __comp) _NOEXCEPT_( - is_nothrow_default_constructible::value&& is_nothrow_copy_constructible::value) - : __tree_(__vc(__comp)) {} + _LIBCPP_HIDE_FROM_ABI explicit map(const key_compare& __comp) : __tree_(__vc(__comp)) {} _LIBCPP_HIDE_FROM_ABI explicit map(const key_compare& __comp, const allocator_type& __a) : __tree_(__vc(__comp), typename __base::allocator_type(__a)) {} @@ -994,7 +984,7 @@ public: } _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __tree_.clear(); } - _LIBCPP_HIDE_FROM_ABI void swap(map& __m) _NOEXCEPT_(__is_nothrow_swappable_v<__base>) { __tree_.swap(__m.__tree_); } + _LIBCPP_HIDE_FROM_ABI void swap(map& __m) { __tree_.swap(__m.__tree_); } _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __k) { return __tree_.find(__k); } _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __k) const { return __tree_.find(__k); } @@ -1108,8 +1098,7 @@ operator<=(const map<_Key, _Tp, 
_Compare, _Allocator>& __x, const map<_Key, _Tp, template inline _LIBCPP_HIDE_FROM_ABI void -swap(map<_Key, _Tp, _Compare, _Allocator>& __x, map<_Key, _Tp, _Compare, _Allocator>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +swap(map<_Key, _Tp, _Compare, _Allocator>& __x, map<_Key, _Tp, _Compare, _Allocator>& __y) { __x.swap(__y); } @@ -1168,14 +1157,9 @@ public: template friend class _LIBCPP_TEMPLATE_VIS multimap; - _LIBCPP_HIDE_FROM_ABI multimap() _NOEXCEPT_( - is_nothrow_default_constructible::value&& is_nothrow_default_constructible::value&& - is_nothrow_copy_constructible::value) - : __tree_(__vc(key_compare())) {} + _LIBCPP_HIDE_FROM_ABI multimap() : __tree_(__vc(key_compare())) {} - _LIBCPP_HIDE_FROM_ABI explicit multimap(const key_compare& __comp) _NOEXCEPT_( - is_nothrow_default_constructible::value&& is_nothrow_copy_constructible::value) - : __tree_(__vc(__comp)) {} + _LIBCPP_HIDE_FROM_ABI explicit multimap(const key_compare& __comp) : __tree_(__vc(__comp)) {} _LIBCPP_HIDE_FROM_ABI explicit multimap(const key_compare& __comp, const allocator_type& __a) : __tree_(__vc(__comp), typename __base::allocator_type(__a)) {} @@ -1262,9 +1246,7 @@ public: _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __tree_.clear(); } - _LIBCPP_HIDE_FROM_ABI void swap(multimap& __m) _NOEXCEPT_(__is_nothrow_swappable_v<__base>) { - __tree_.swap(__m.__tree_); - } + _LIBCPP_HIDE_FROM_ABI void swap(multimap& __m) { __tree_.swap(__m.__tree_); } _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __k) { return __tree_.find(__k); } _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __k) const { return __tree_.find(__k); } @@ -1331,8 +1313,7 @@ operator<=(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap< template inline _LIBCPP_HIDE_FROM_ABI void -swap(multimap<_Key, _Tp, _Compare, _Allocator>& __x, multimap<_Key, _Tp, _Compare, _Allocator>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +swap(multimap<_Key, _Tp, _Compare, _Allocator>& __x, multimap<_Key, 
_Tp, _Compare, _Allocator>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/new b/libcxx/include/__cxx03/new index 5a8550c304a09..4d9ed8b4f09d7 100644 --- a/libcxx/include/__cxx03/new +++ b/libcxx/include/__cxx03/new @@ -233,7 +233,7 @@ inline _LIBCPP_HIDE_FROM_ABI void operator delete[](void*, void*) _NOEXCEPT {} _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_CONSTEXPR inline _LIBCPP_HIDE_FROM_ABI bool __is_overaligned_for_new(size_t __align) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bool __is_overaligned_for_new(size_t __align) _NOEXCEPT { #ifdef __STDCPP_DEFAULT_NEW_ALIGNMENT__ return __align > __STDCPP_DEFAULT_NEW_ALIGNMENT__; #else @@ -310,7 +310,7 @@ inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate_unsized(void* __ptr, size_ } template -_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Tp* __launder(_Tp* __p) _NOEXCEPT { +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _Tp* __launder(_Tp* __p) _NOEXCEPT { static_assert(!(is_function<_Tp>::value), "can't launder functions"); static_assert(!(is_same >::value), "can't launder cv-void"); return __builtin_launder(__p); diff --git a/libcxx/include/__cxx03/queue b/libcxx/include/__cxx03/queue index 4bff23f5e45f5..c20ac525741ff 100644 --- a/libcxx/include/__cxx03/queue +++ b/libcxx/include/__cxx03/queue @@ -298,7 +298,7 @@ protected: container_type c; public: - _LIBCPP_HIDE_FROM_ABI queue() _NOEXCEPT_(is_nothrow_default_constructible::value) : c() {} + _LIBCPP_HIDE_FROM_ABI queue() : c() {} _LIBCPP_HIDE_FROM_ABI queue(const queue& __q) : c(__q.c) {} @@ -329,7 +329,7 @@ public: _LIBCPP_HIDE_FROM_ABI void push(const value_type& __v) { c.push_back(__v); } _LIBCPP_HIDE_FROM_ABI void pop() { c.pop_front(); } - _LIBCPP_HIDE_FROM_ABI void swap(queue& __q) _NOEXCEPT_(__is_nothrow_swappable_v) { + _LIBCPP_HIDE_FROM_ABI void swap(queue& __q) { using std::swap; swap(c, __q.c); } @@ -376,8 +376,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const queue<_Tp, _Container>& __x, } template , int> = 0> 
-inline _LIBCPP_HIDE_FROM_ABI void swap(queue<_Tp, _Container>& __x, queue<_Tp, _Container>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(queue<_Tp, _Container>& __x, queue<_Tp, _Container>& __y) { __x.swap(__y); } @@ -401,9 +400,7 @@ protected: value_compare comp; public: - _LIBCPP_HIDE_FROM_ABI priority_queue() _NOEXCEPT_( - is_nothrow_default_constructible::value&& is_nothrow_default_constructible::value) - : c(), comp() {} + _LIBCPP_HIDE_FROM_ABI priority_queue() : c(), comp() {} _LIBCPP_HIDE_FROM_ABI priority_queue(const priority_queue& __q) : c(__q.c), comp(__q.comp) {} @@ -463,8 +460,7 @@ public: _LIBCPP_HIDE_FROM_ABI void push(const value_type& __v); _LIBCPP_HIDE_FROM_ABI void pop(); - _LIBCPP_HIDE_FROM_ABI void swap(priority_queue& __q) - _NOEXCEPT_(__is_nothrow_swappable_v&& __is_nothrow_swappable_v); + _LIBCPP_HIDE_FROM_ABI void swap(priority_queue& __q); _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI const _Container& __get_container() const { return c; } }; @@ -560,8 +556,7 @@ inline void priority_queue<_Tp, _Container, _Compare>::pop() { } template -inline void priority_queue<_Tp, _Container, _Compare>::swap(priority_queue& __q) - _NOEXCEPT_(__is_nothrow_swappable_v&& __is_nothrow_swappable_v) { +inline void priority_queue<_Tp, _Container, _Compare>::swap(priority_queue& __q) { using std::swap; swap(c, __q.c); swap(comp, __q.comp); @@ -572,8 +567,7 @@ template && __is_swappable_v<_Compare>, int> = 0> inline _LIBCPP_HIDE_FROM_ABI void -swap(priority_queue<_Tp, _Container, _Compare>& __x, priority_queue<_Tp, _Container, _Compare>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +swap(priority_queue<_Tp, _Container, _Compare>& __x, priority_queue<_Tp, _Container, _Compare>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/ratio b/libcxx/include/__cxx03/ratio index 1280a272d6db7..6012efd49fcb3 100644 --- a/libcxx/include/__cxx03/ratio +++ b/libcxx/include/__cxx03/ratio @@ -242,23 +242,23 @@ class 
_LIBCPP_TEMPLATE_VIS ratio { static_assert(__static_abs<_Num>::value >= 0, "ratio numerator is out of range"); static_assert(_Den != 0, "ratio divide by 0"); static_assert(__static_abs<_Den>::value > 0, "ratio denominator is out of range"); - static _LIBCPP_CONSTEXPR const intmax_t __na = __static_abs<_Num>::value; - static _LIBCPP_CONSTEXPR const intmax_t __da = __static_abs<_Den>::value; - static _LIBCPP_CONSTEXPR const intmax_t __s = __static_sign<_Num>::value * __static_sign<_Den>::value; - static _LIBCPP_CONSTEXPR const intmax_t __gcd = __static_gcd<__na, __da>::value; + static const intmax_t __na = __static_abs<_Num>::value; + static const intmax_t __da = __static_abs<_Den>::value; + static const intmax_t __s = __static_sign<_Num>::value * __static_sign<_Den>::value; + static const intmax_t __gcd = __static_gcd<__na, __da>::value; public: - static _LIBCPP_CONSTEXPR const intmax_t num = __s * __na / __gcd; - static _LIBCPP_CONSTEXPR const intmax_t den = __da / __gcd; + static const intmax_t num = __s * __na / __gcd; + static const intmax_t den = __da / __gcd; typedef ratio type; }; template -_LIBCPP_CONSTEXPR const intmax_t ratio<_Num, _Den>::num; +const intmax_t ratio<_Num, _Den>::num; template -_LIBCPP_CONSTEXPR const intmax_t ratio<_Num, _Den>::den; +const intmax_t ratio<_Num, _Den>::den; template struct __is_ratio : false_type {}; diff --git a/libcxx/include/__cxx03/regex b/libcxx/include/__cxx03/regex index 4e4cbd20e609f..b96d59d3a252a 100644 --- a/libcxx/include/__cxx03/regex +++ b/libcxx/include/__cxx03/regex @@ -847,7 +847,7 @@ enum syntax_option_type { multiline = 1 << 10 }; -_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR syntax_option_type __get_grammar(syntax_option_type __g) { +_LIBCPP_HIDE_FROM_ABI inline syntax_option_type __get_grammar(syntax_option_type __g) { #ifdef _LIBCPP_ABI_REGEX_CONSTANTS_NONZERO return static_cast(__g & 0x3F0); #else @@ -855,22 +855,19 @@ _LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR syntax_option_type __get_grammar( 
#endif } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR syntax_option_type operator~(syntax_option_type __x) { +inline _LIBCPP_HIDE_FROM_ABI syntax_option_type operator~(syntax_option_type __x) { return syntax_option_type(~int(__x) & 0x1FF); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR syntax_option_type -operator&(syntax_option_type __x, syntax_option_type __y) { +inline _LIBCPP_HIDE_FROM_ABI syntax_option_type operator&(syntax_option_type __x, syntax_option_type __y) { return syntax_option_type(int(__x) & int(__y)); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR syntax_option_type -operator|(syntax_option_type __x, syntax_option_type __y) { +inline _LIBCPP_HIDE_FROM_ABI syntax_option_type operator|(syntax_option_type __x, syntax_option_type __y) { return syntax_option_type(int(__x) | int(__y)); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR syntax_option_type -operator^(syntax_option_type __x, syntax_option_type __y) { +inline _LIBCPP_HIDE_FROM_ABI syntax_option_type operator^(syntax_option_type __x, syntax_option_type __y) { return syntax_option_type(int(__x) ^ int(__y)); } @@ -909,19 +906,19 @@ enum match_flag_type { __full_match = 1 << 12 }; -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR match_flag_type operator~(match_flag_type __x) { +inline _LIBCPP_HIDE_FROM_ABI match_flag_type operator~(match_flag_type __x) { return match_flag_type(~int(__x) & 0x0FFF); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR match_flag_type operator&(match_flag_type __x, match_flag_type __y) { +inline _LIBCPP_HIDE_FROM_ABI match_flag_type operator&(match_flag_type __x, match_flag_type __y) { return match_flag_type(int(__x) & int(__y)); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR match_flag_type operator|(match_flag_type __x, match_flag_type __y) { +inline _LIBCPP_HIDE_FROM_ABI match_flag_type operator|(match_flag_type __x, match_flag_type __y) { return match_flag_type(int(__x) | int(__y)); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR match_flag_type 
operator^(match_flag_type __x, match_flag_type __y) { +inline _LIBCPP_HIDE_FROM_ABI match_flag_type operator^(match_flag_type __x, match_flag_type __y) { return match_flag_type(int(__x) ^ int(__y)); } @@ -1822,7 +1819,7 @@ void __word_boundary<_CharT, _Traits>::__exec(__state& __s) const { // __l_anchor template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __is_eol(_CharT __c) { +_LIBCPP_HIDE_FROM_ABI bool __is_eol(_CharT __c) { return __c == '\r' || __c == '\n'; } @@ -4162,7 +4159,7 @@ public: bool matched; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR sub_match() : matched() {} + _LIBCPP_HIDE_FROM_ABI sub_match() : matched() {} _LIBCPP_HIDE_FROM_ABI difference_type length() const { return matched ? std::distance(this->first, this->second) : 0; @@ -4176,7 +4173,7 @@ public: _LIBCPP_HIDE_FROM_ABI int compare(const string_type& __s) const { return str().compare(__s); } _LIBCPP_HIDE_FROM_ABI int compare(const value_type* __s) const { return str().compare(__s); } - _LIBCPP_HIDE_FROM_ABI void swap(sub_match& __s) _NOEXCEPT_(__is_nothrow_swappable_v<_BidirectionalIterator>) { + _LIBCPP_HIDE_FROM_ABI void swap(sub_match& __s) { this->pair<_BidirectionalIterator, _BidirectionalIterator>::swap(__s); std::swap(matched, __s.matched); } diff --git a/libcxx/include/__cxx03/set b/libcxx/include/__cxx03/set index 208581cbc52b0..8ddb425333eb6 100644 --- a/libcxx/include/__cxx03/set +++ b/libcxx/include/__cxx03/set @@ -581,14 +581,9 @@ public: template friend class _LIBCPP_TEMPLATE_VIS multiset; - _LIBCPP_HIDE_FROM_ABI set() _NOEXCEPT_( - is_nothrow_default_constructible::value&& is_nothrow_default_constructible::value&& - is_nothrow_copy_constructible::value) - : __tree_(value_compare()) {} + _LIBCPP_HIDE_FROM_ABI set() : __tree_(value_compare()) {} - _LIBCPP_HIDE_FROM_ABI explicit set(const value_compare& __comp) _NOEXCEPT_( - is_nothrow_default_constructible::value&& is_nothrow_copy_constructible::value) - : __tree_(__comp) {} + _LIBCPP_HIDE_FROM_ABI explicit set(const 
value_compare& __comp) : __tree_(__comp) {} _LIBCPP_HIDE_FROM_ABI explicit set(const value_compare& __comp, const allocator_type& __a) : __tree_(__comp, __a) {} template @@ -655,7 +650,7 @@ public: _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __f, const_iterator __l) { return __tree_.erase(__f, __l); } _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __tree_.clear(); } - _LIBCPP_HIDE_FROM_ABI void swap(set& __s) _NOEXCEPT_(__is_nothrow_swappable_v<__base>) { __tree_.swap(__s.__tree_); } + _LIBCPP_HIDE_FROM_ABI void swap(set& __s) { __tree_.swap(__s.__tree_); } _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return __tree_.__alloc(); } _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __tree_.value_comp(); } @@ -719,8 +714,7 @@ operator<=(const set<_Key, _Compare, _Allocator>& __x, const set<_Key, _Compare, // specialized algorithms: template -inline _LIBCPP_HIDE_FROM_ABI void swap(set<_Key, _Compare, _Allocator>& __x, set<_Key, _Compare, _Allocator>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(set<_Key, _Compare, _Allocator>& __x, set<_Key, _Compare, _Allocator>& __y) { __x.swap(__y); } @@ -763,14 +757,9 @@ public: friend class _LIBCPP_TEMPLATE_VIS multiset; // construct/copy/destroy: - _LIBCPP_HIDE_FROM_ABI multiset() _NOEXCEPT_( - is_nothrow_default_constructible::value&& is_nothrow_default_constructible::value&& - is_nothrow_copy_constructible::value) - : __tree_(value_compare()) {} + _LIBCPP_HIDE_FROM_ABI multiset() : __tree_(value_compare()) {} - _LIBCPP_HIDE_FROM_ABI explicit multiset(const value_compare& __comp) _NOEXCEPT_( - is_nothrow_default_constructible::value&& is_nothrow_copy_constructible::value) - : __tree_(__comp) {} + _LIBCPP_HIDE_FROM_ABI explicit multiset(const value_compare& __comp) : __tree_(__comp) {} _LIBCPP_HIDE_FROM_ABI explicit multiset(const value_compare& __comp, const allocator_type& __a) : __tree_(__comp, __a) {} @@ -842,9 +831,7 @@ public: 
_LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __f, const_iterator __l) { return __tree_.erase(__f, __l); } _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __tree_.clear(); } - _LIBCPP_HIDE_FROM_ABI void swap(multiset& __s) _NOEXCEPT_(__is_nothrow_swappable_v<__base>) { - __tree_.swap(__s.__tree_); - } + _LIBCPP_HIDE_FROM_ABI void swap(multiset& __s) { __tree_.swap(__s.__tree_); } _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return __tree_.__alloc(); } _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __tree_.value_comp(); } @@ -908,8 +895,7 @@ operator<=(const multiset<_Key, _Compare, _Allocator>& __x, const multiset<_Key, template inline _LIBCPP_HIDE_FROM_ABI void -swap(multiset<_Key, _Compare, _Allocator>& __x, multiset<_Key, _Compare, _Allocator>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +swap(multiset<_Key, _Compare, _Allocator>& __x, multiset<_Key, _Compare, _Allocator>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/stack b/libcxx/include/__cxx03/stack index f216f842376ed..3c76006fd61c1 100644 --- a/libcxx/include/__cxx03/stack +++ b/libcxx/include/__cxx03/stack @@ -152,7 +152,7 @@ protected: container_type c; public: - _LIBCPP_HIDE_FROM_ABI stack() _NOEXCEPT_(is_nothrow_default_constructible::value) : c() {} + _LIBCPP_HIDE_FROM_ABI stack() : c() {} _LIBCPP_HIDE_FROM_ABI stack(const stack& __q) : c(__q.c) {} @@ -185,7 +185,7 @@ public: _LIBCPP_HIDE_FROM_ABI void pop() { c.pop_back(); } - _LIBCPP_HIDE_FROM_ABI void swap(stack& __s) _NOEXCEPT_(__is_nothrow_swappable_v) { + _LIBCPP_HIDE_FROM_ABI void swap(stack& __s) { using std::swap; swap(c, __s.c); } @@ -230,8 +230,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const stack<_Tp, _Container>& __x, } template , int> = 0> -inline _LIBCPP_HIDE_FROM_ABI void swap(stack<_Tp, _Container>& __x, stack<_Tp, _Container>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(stack<_Tp, _Container>& __x, stack<_Tp, _Container>& 
__y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/string b/libcxx/include/__cxx03/string index 574c1ac0a00e0..7d54030d0b660 100644 --- a/libcxx/include/__cxx03/string +++ b/libcxx/include/__cxx03/string @@ -672,23 +672,23 @@ _LIBCPP_BEGIN_NAMESPACE_STD // basic_string template -basic_string<_CharT, _Traits, _Allocator> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 +basic_string<_CharT, _Traits, _Allocator> _LIBCPP_HIDE_FROM_ABI operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, const basic_string<_CharT, _Traits, _Allocator>& __y); template -_LIBCPP_HIDDEN _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +_LIBCPP_HIDDEN basic_string<_CharT, _Traits, _Allocator> operator+(const _CharT* __x, const basic_string<_CharT, _Traits, _Allocator>& __y); template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +_LIBCPP_HIDE_FROM_ABI basic_string<_CharT, _Traits, _Allocator> operator+(_CharT __x, const basic_string<_CharT, _Traits, _Allocator>& __y); template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +inline _LIBCPP_HIDE_FROM_ABI basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, const _CharT* __y); template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +_LIBCPP_HIDE_FROM_ABI basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, _CharT __y); extern template _LIBCPP_EXPORTED_FROM_ABI string operator+ @@ -754,7 +754,7 @@ public: void>; #endif #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pointer __asan_volatile_wrapper(pointer const& __ptr) const { + _LIBCPP_HIDE_FROM_ABI pointer __asan_volatile_wrapper(pointer const& __ptr) const { if (__libcpp_is_constant_evaluated()) return __ptr; @@ 
-763,8 +763,7 @@ public: return const_cast(__copy_ptr); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_pointer - __asan_volatile_wrapper(const_pointer const& __ptr) const { + _LIBCPP_HIDE_FROM_ABI const_pointer __asan_volatile_wrapper(const_pointer const& __ptr) const { if (__libcpp_is_constant_evaluated()) return __ptr; @@ -885,8 +884,7 @@ private: // Construct a string with the given allocator and enough storage to hold `__size` characters, but // don't initialize the characters. The contents of the string, including the null terminator, must be // initialized separately. - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit basic_string( - __uninitialized_size_tag, size_type __size, const allocator_type& __a) + _LIBCPP_HIDE_FROM_ABI explicit basic_string(__uninitialized_size_tag, size_type __size, const allocator_type& __a) : __r_(__default_init_tag(), __a) { if (__size > max_size()) __throw_length_error(); @@ -905,13 +903,12 @@ private: } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - basic_string(__init_with_sentinel_tag, _Iter __first, _Sent __last, const allocator_type& __a) + _LIBCPP_HIDE_FROM_ABI basic_string(__init_with_sentinel_tag, _Iter __first, _Sent __last, const allocator_type& __a) : __r_(__default_init_tag(), __a) { __init_with_sentinel(std::move(__first), std::move(__last)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator __make_iterator(pointer __p) { + _LIBCPP_HIDE_FROM_ABI iterator __make_iterator(pointer __p) { #ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING // Bound the iterator according to the size (and not the capacity, unlike vector). 
// @@ -928,7 +925,7 @@ private: #endif // _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator __make_const_iterator(const_pointer __p) const { + _LIBCPP_HIDE_FROM_ABI const_iterator __make_const_iterator(const_pointer __p) const { #ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING // Bound the iterator according to the size (and not the capacity, unlike vector). return std::__make_bounded_iter( @@ -943,19 +940,13 @@ private: public: _LIBCPP_TEMPLATE_DATA_VIS static const size_type npos = -1; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string() - _NOEXCEPT_(is_nothrow_default_constructible::value) - : __r_(__value_init_tag(), __default_init_tag()) { - __annotate_new(0); - } + _LIBCPP_HIDE_FROM_ABI basic_string() : __r_(__value_init_tag(), __default_init_tag()) { __annotate_new(0); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit basic_string(const allocator_type& __a) - _NOEXCEPT_(is_nothrow_copy_constructible::value) - : __r_(__value_init_tag(), __a) { + _LIBCPP_HIDE_FROM_ABI explicit basic_string(const allocator_type& __a) : __r_(__value_init_tag(), __a) { __annotate_new(0); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS basic_string(const basic_string& __str) + _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS basic_string(const basic_string& __str) : __r_(__default_init_tag(), __alloc_traits::select_on_container_copy_construction(__str.__alloc())) { if (!__str.__is_long()) { __r_.first() = __str.__r_.first(); @@ -964,9 +955,8 @@ public: __init_copy_ctor_external(std::__to_address(__str.__get_long_pointer()), __str.__get_long_size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS - basic_string(const basic_string& __str, const allocator_type& __a) - : __r_(__default_init_tag(), __a) { + _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS + basic_string(const basic_string& __str, const allocator_type& __a) : __r_(__default_init_tag(), __a) { if 
(!__str.__is_long()) { __r_.first() = __str.__r_.first(); __annotate_new(__get_short_size()); @@ -975,44 +965,39 @@ public: } template <__enable_if_t<__is_allocator<_Allocator>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string(const _CharT* __s) - : __r_(__default_init_tag(), __default_init_tag()) { + _LIBCPP_HIDE_FROM_ABI basic_string(const _CharT* __s) : __r_(__default_init_tag(), __default_init_tag()) { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "basic_string(const char*) detected nullptr"); __init(__s, traits_type::length(__s)); } template <__enable_if_t<__is_allocator<_Allocator>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string(const _CharT* __s, const _Allocator& __a) - : __r_(__default_init_tag(), __a) { + _LIBCPP_HIDE_FROM_ABI basic_string(const _CharT* __s, const _Allocator& __a) : __r_(__default_init_tag(), __a) { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "basic_string(const char*, allocator) detected nullptr"); __init(__s, traits_type::length(__s)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string(const _CharT* __s, size_type __n) + _LIBCPP_HIDE_FROM_ABI basic_string(const _CharT* __s, size_type __n) : __r_(__default_init_tag(), __default_init_tag()) { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "basic_string(const char*, n) detected nullptr"); __init(__s, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - basic_string(const _CharT* __s, size_type __n, const _Allocator& __a) + _LIBCPP_HIDE_FROM_ABI basic_string(const _CharT* __s, size_type __n, const _Allocator& __a) : __r_(__default_init_tag(), __a) { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "basic_string(const char*, n, allocator) detected nullptr"); __init(__s, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string(size_type __n, _CharT __c) - : __r_(__default_init_tag(), __default_init_tag()) { + _LIBCPP_HIDE_FROM_ABI basic_string(size_type __n, _CharT __c) : 
__r_(__default_init_tag(), __default_init_tag()) { __init(__n, __c); } template <__enable_if_t<__is_allocator<_Allocator>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string(size_type __n, _CharT __c, const _Allocator& __a) + _LIBCPP_HIDE_FROM_ABI basic_string(size_type __n, _CharT __c, const _Allocator& __a) : __r_(__default_init_tag(), __a) { __init(__n, __c); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string(const basic_string& __str, size_type __pos, size_type __n, const _Allocator& __a = _Allocator()) : __r_(__default_init_tag(), __a) { size_type __str_sz = __str.size(); @@ -1021,8 +1006,7 @@ public: __init(__str.data() + __pos, std::min(__n, __str_sz - __pos)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - basic_string(const basic_string& __str, size_type __pos, const _Allocator& __a = _Allocator()) + _LIBCPP_HIDE_FROM_ABI basic_string(const basic_string& __str, size_type __pos, const _Allocator& __a = _Allocator()) : __r_(__default_init_tag(), __a) { size_type __str_sz = __str.size(); if (__pos > __str_sz) @@ -1034,7 +1018,7 @@ public: __enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string(const _Tp& __t, size_type __pos, size_type __n, const allocator_type& __a = allocator_type()) : __r_(__default_init_tag(), __a) { __self_view __sv0 = __t; @@ -1046,7 +1030,7 @@ public: __enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit basic_string(const _Tp& __t) + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS explicit basic_string(const _Tp& __t) : __r_(__default_init_tag(), __default_init_tag()) { __self_view __sv = __t; 
__init(__sv.data(), __sv.size()); @@ -1056,92 +1040,67 @@ public: __enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS - _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit basic_string(const _Tp& __t, const allocator_type& __a) + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS explicit basic_string(const _Tp& __t, const allocator_type& __a) : __r_(__default_init_tag(), __a) { __self_view __sv = __t; __init(__sv.data(), __sv.size()); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string(_InputIterator __first, _InputIterator __last) + _LIBCPP_HIDE_FROM_ABI basic_string(_InputIterator __first, _InputIterator __last) : __r_(__default_init_tag(), __default_init_tag()) { __init(__first, __last); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - basic_string(_InputIterator __first, _InputIterator __last, const allocator_type& __a) + _LIBCPP_HIDE_FROM_ABI basic_string(_InputIterator __first, _InputIterator __last, const allocator_type& __a) : __r_(__default_init_tag(), __a) { __init(__first, __last); } - inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { + inline ~basic_string() { __annotate_delete(); if (__is_long()) __alloc_traits::deallocate(__alloc(), __get_long_pointer(), __get_long_cap()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 operator __self_view() const _NOEXCEPT { - return __self_view(data(), size()); - } + _LIBCPP_HIDE_FROM_ABI operator __self_view() const _NOEXCEPT { return __self_view(data(), size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS basic_string& - operator=(const basic_string& __str); + _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS basic_string& operator=(const basic_string& __str); template ::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& 
operator=(const _Tp& __t) { + basic_string& operator=(const _Tp& __t) { __self_view __sv = __t; return assign(__sv); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator=(const value_type* __s) { - return assign(__s); - } - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator=(value_type __c); + _LIBCPP_HIDE_FROM_ABI basic_string& operator=(const value_type* __s) { return assign(__s); } + basic_string& operator=(value_type __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator begin() _NOEXCEPT { - return __make_iterator(__get_pointer()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator begin() const _NOEXCEPT { - return __make_const_iterator(__get_pointer()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator end() _NOEXCEPT { - return __make_iterator(__get_pointer() + size()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator end() const _NOEXCEPT { - return __make_const_iterator(__get_pointer() + size()); - } + _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __make_iterator(__get_pointer()); } + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return __make_const_iterator(__get_pointer()); } + _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return __make_iterator(__get_pointer() + size()); } + _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return __make_const_iterator(__get_pointer() + size()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reverse_iterator rbegin() _NOEXCEPT { - return reverse_iterator(end()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator rbegin() const _NOEXCEPT { - return const_reverse_iterator(end()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reverse_iterator rend() _NOEXCEPT { - return reverse_iterator(begin()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator rend() const _NOEXCEPT { - return 
const_reverse_iterator(begin()); - } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator cbegin() const _NOEXCEPT { return begin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator cend() const _NOEXCEPT { return end(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator crbegin() const _NOEXCEPT { - return rbegin(); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator crend() const _NOEXCEPT { return rend(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return begin(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return end(); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { return rbegin(); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { return rend(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type size() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __is_long() ? 
__get_long_size() : __get_short_size(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type length() const _NOEXCEPT { return size(); } + _LIBCPP_HIDE_FROM_ABI size_type length() const _NOEXCEPT { return size(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type max_size() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { size_type __m = __alloc_traits::max_size(__alloc()); if (__m <= std::numeric_limits::max() / 2) { return __m - __alignment; @@ -1151,26 +1110,24 @@ public: } } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type capacity() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type capacity() const _NOEXCEPT { return (__is_long() ? __get_long_cap() : static_cast(__min_cap)) - 1; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 void resize(size_type __n, value_type __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void resize(size_type __n) { resize(__n, value_type()); } + void resize(size_type __n, value_type __c); + _LIBCPP_HIDE_FROM_ABI void resize(size_type __n) { resize(__n, value_type()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 void reserve(size_type __requested_capacity); + void reserve(size_type __requested_capacity); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __resize_default_init(size_type __n); + _LIBCPP_HIDE_FROM_ABI void __resize_default_init(size_type __n); - _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_HIDE_FROM_ABI void reserve() _NOEXCEPT { shrink_to_fit(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void shrink_to_fit() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void clear() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void reserve() _NOEXCEPT { shrink_to_fit(); } + _LIBCPP_HIDE_FROM_ABI void shrink_to_fit() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT; - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool empty() const _NOEXCEPT { - return size() == 0; - } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool empty() 
const _NOEXCEPT { return size() == 0; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference operator[](size_type __pos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference operator[](size_type __pos) const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__pos <= size(), "string index out of bounds"); if (__builtin_constant_p(__pos) && !__fits_in_sso(__pos)) { return *(__get_long_pointer() + __pos); @@ -1178,7 +1135,7 @@ public: return *(data() + __pos); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference operator[](size_type __pos) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference operator[](size_type __pos) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__pos <= size(), "string index out of bounds"); if (__builtin_constant_p(__pos) && !__fits_in_sso(__pos)) { return *(__get_long_pointer() + __pos); @@ -1186,65 +1143,55 @@ public: return *(__get_pointer() + __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference at(size_type __n) const; - _LIBCPP_CONSTEXPR_SINCE_CXX20 reference at(size_type __n); + const_reference at(size_type __n) const; + reference at(size_type __n); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator+=(const basic_string& __str) { - return append(__str); - } + _LIBCPP_HIDE_FROM_ABI basic_string& operator+=(const basic_string& __str) { return append(__str); } template ::value && !__is_same_uncvref<_Tp, basic_string >::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - operator+=(const _Tp& __t) { + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& operator+=(const _Tp& __t) { __self_view __sv = __t; return append(__sv); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator+=(const value_type* __s) { - return append(__s); - } + _LIBCPP_HIDE_FROM_ABI basic_string& operator+=(const value_type* __s) { return append(__s); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 
basic_string& operator+=(value_type __c) { + _LIBCPP_HIDE_FROM_ABI basic_string& operator+=(value_type __c) { push_back(__c); return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const basic_string& __str) { - return append(__str.data(), __str.size()); - } + _LIBCPP_HIDE_FROM_ABI basic_string& append(const basic_string& __str) { return append(__str.data(), __str.size()); } template ::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - append(const _Tp& __t) { + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& append(const _Tp& __t) { __self_view __sv = __t; return append(__sv.data(), __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const basic_string& __str, size_type __pos, size_type __n = npos); + basic_string& append(const basic_string& __str, size_type __pos, size_type __n = npos); template ::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 - - basic_string& + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& append(const _Tp& __t, size_type __pos, size_type __n = npos); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const value_type* __s, size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const value_type* __s); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(size_type __n, value_type __c); + basic_string& append(const value_type* __s, size_type __n); + basic_string& append(const value_type* __s); + basic_string& append(size_type __n, value_type __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __append_default_init(size_type __n); + _LIBCPP_HIDE_FROM_ABI void __append_default_init(size_type __n); template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_HIDE_FROM_ABI basic_string& append(_InputIterator __first, _InputIterator __last) { const basic_string __temp(__first, __last, __alloc()); append(__temp.data(), __temp.size()); @@ -1252,71 +1199,66 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_HIDE_FROM_ABI basic_string& append(_ForwardIterator __first, _ForwardIterator __last); - _LIBCPP_CONSTEXPR_SINCE_CXX20 void push_back(value_type __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void pop_back(); + void push_back(value_type __c); + _LIBCPP_HIDE_FROM_ABI void pop_back(); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference front() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference front() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::front(): string is empty"); return *__get_pointer(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference front() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference front() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::front(): string is empty"); return *data(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference back() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference back() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::back(): string is empty"); return *(__get_pointer() + size() - 1); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference back() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference back() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::back(): string is empty"); return *(data() + size() - 1); } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - 
assign(const _Tp& __t) { + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& assign(const _Tp& __t) { __self_view __sv = __t; return assign(__sv.data(), __sv.size()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const basic_string& __str) { - return *this = __str; - } + _LIBCPP_HIDE_FROM_ABI basic_string& assign(const basic_string& __str) { return *this = __str; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const basic_string& __str, size_type __pos, size_type __n = npos); + basic_string& assign(const basic_string& __str, size_type __pos, size_type __n = npos); template ::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& assign(const _Tp& __t, size_type __pos, size_type __n = npos); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const value_type* __s, size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const value_type* __s); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(size_type __n, value_type __c); + basic_string& assign(const value_type* __s, size_type __n); + basic_string& assign(const value_type* __s); + basic_string& assign(size_type __n, value_type __c); template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& assign(_InputIterator __first, _InputIterator __last); template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& assign(_ForwardIterator __first, _ForwardIterator __last); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - insert(size_type __pos1, const basic_string& __str) { + _LIBCPP_HIDE_FROM_ABI 
basic_string& insert(size_type __pos1, const basic_string& __str) { return insert(__pos1, __str.data(), __str.size()); } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - insert(size_type __pos1, const _Tp& __t) { + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& insert(size_type __pos1, const _Tp& __t) { __self_view __sv = __t; return insert(__pos1, __sv.data(), __sv.size()); } @@ -1325,257 +1267,219 @@ public: __enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& insert(size_type __pos1, const _Tp& __t, size_type __pos2, size_type __n = npos); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - insert(size_type __pos1, const basic_string& __str, size_type __pos2, size_type __n = npos); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& insert(size_type __pos, const value_type* __s, size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& insert(size_type __pos, const value_type* __s); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& insert(size_type __pos, size_type __n, value_type __c); - _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator insert(const_iterator __pos, value_type __c); + basic_string& insert(size_type __pos1, const basic_string& __str, size_type __pos2, size_type __n = npos); + basic_string& insert(size_type __pos, const value_type* __s, size_type __n); + basic_string& insert(size_type __pos, const value_type* __s); + basic_string& insert(size_type __pos, size_type __n, value_type __c); + iterator insert(const_iterator __pos, value_type __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator - insert(const_iterator __pos, size_type __n, value_type __c) { + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __pos, 
size_type __n, value_type __c) { difference_type __p = __pos - begin(); insert(static_cast(__p), __n, __c); return begin() + __p; } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS iterator insert(const_iterator __pos, _InputIterator __first, _InputIterator __last); template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS iterator insert(const_iterator __pos, _ForwardIterator __first, _ForwardIterator __last); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& erase(size_type __pos = 0, size_type __n = npos); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator erase(const_iterator __pos); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator erase(const_iterator __first, const_iterator __last); + basic_string& erase(size_type __pos = 0, size_type __n = npos); + _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __pos); + _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - replace(size_type __pos1, size_type __n1, const basic_string& __str) { + _LIBCPP_HIDE_FROM_ABI basic_string& replace(size_type __pos1, size_type __n1, const basic_string& __str) { return replace(__pos1, __n1, __str.data(), __str.size()); } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& replace(size_type __pos1, size_type __n1, const _Tp& __t) { __self_view __sv = __t; return replace(__pos1, __n1, __sv.data(), __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + basic_string& replace(size_type __pos1, size_type __n1, const basic_string& __str, size_type __pos2, size_type __n2 = npos); template 
::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& replace(size_type __pos1, size_type __n1, const _Tp& __t, size_type __pos2, size_type __n2 = npos); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - replace(size_type __pos, size_type __n1, const value_type* __s, size_type __n2); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& replace(size_type __pos, size_type __n1, const value_type* __s); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& replace(size_type __pos, size_type __n1, size_type __n2, value_type __c); + basic_string& replace(size_type __pos, size_type __n1, const value_type* __s, size_type __n2); + basic_string& replace(size_type __pos, size_type __n1, const value_type* __s); + basic_string& replace(size_type __pos, size_type __n1, size_type __n2, value_type __c); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - replace(const_iterator __i1, const_iterator __i2, const basic_string& __str) { + _LIBCPP_HIDE_FROM_ABI basic_string& replace(const_iterator __i1, const_iterator __i2, const basic_string& __str) { return replace( static_cast(__i1 - begin()), static_cast(__i2 - __i1), __str.data(), __str.size()); } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& replace(const_iterator __i1, const_iterator __i2, const _Tp& __t) { __self_view __sv = __t; return replace(__i1 - begin(), __i2 - __i1, __sv); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_HIDE_FROM_ABI basic_string& replace(const_iterator __i1, const_iterator __i2, const value_type* __s, size_type __n) { return replace(static_cast(__i1 - begin()), static_cast(__i2 - __i1), __s, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 
basic_string& - replace(const_iterator __i1, const_iterator __i2, const value_type* __s) { + _LIBCPP_HIDE_FROM_ABI basic_string& replace(const_iterator __i1, const_iterator __i2, const value_type* __s) { return replace(static_cast(__i1 - begin()), static_cast(__i2 - __i1), __s); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - replace(const_iterator __i1, const_iterator __i2, size_type __n, value_type __c) { + _LIBCPP_HIDE_FROM_ABI basic_string& replace(const_iterator __i1, const_iterator __i2, size_type __n, value_type __c) { return replace(static_cast(__i1 - begin()), static_cast(__i2 - __i1), __n, __c); } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS basic_string& replace(const_iterator __i1, const_iterator __i2, _InputIterator __j1, _InputIterator __j2); - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type copy(value_type* __s, size_type __n, size_type __pos = 0) const; + size_type copy(value_type* __s, size_type __n, size_type __pos = 0) const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string - substr(size_type __pos = 0, size_type __n = npos) const { + _LIBCPP_HIDE_FROM_ABI basic_string substr(size_type __pos = 0, size_type __n = npos) const { return basic_string(*this, __pos, __n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(basic_string& __str) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v); + _LIBCPP_HIDE_FROM_ABI void swap(basic_string& __str); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const value_type* c_str() const _NOEXCEPT { return data(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const value_type* data() const _NOEXCEPT { - return std::__to_address(__get_pointer()); - } + _LIBCPP_HIDE_FROM_ABI const value_type* c_str() const _NOEXCEPT { return data(); } + _LIBCPP_HIDE_FROM_ABI const value_type* 
data() const _NOEXCEPT { return std::__to_address(__get_pointer()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 allocator_type get_allocator() const _NOEXCEPT { - return __alloc(); - } + _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return __alloc(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find(const basic_string& __str, size_type __pos = 0) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find(const basic_string& __str, size_type __pos = 0) const _NOEXCEPT; template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS size_type find(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find(const value_type* __s, size_type __pos = 0) const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(value_type __c, size_type __pos = 0) const _NOEXCEPT; + size_type find(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find(const value_type* __s, size_type __pos = 0) const _NOEXCEPT; + size_type find(value_type __c, size_type __pos = 0) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - rfind(const basic_string& __str, size_type __pos = npos) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type rfind(const basic_string& __str, size_type __pos = npos) const _NOEXCEPT; template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS size_type rfind(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT; 
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - rfind(const value_type* __s, size_type __pos = npos) const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(value_type __c, size_type __pos = npos) const _NOEXCEPT; + size_type rfind(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type rfind(const value_type* __s, size_type __pos = npos) const _NOEXCEPT; + size_type rfind(value_type __c, size_type __pos = npos) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_first_of(const basic_string& __str, size_type __pos = 0) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find_first_of(const basic_string& __str, size_type __pos = 0) const _NOEXCEPT; template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS size_type find_first_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_first_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_first_of(const value_type* __s, size_type __pos = 0) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_first_of(value_type __c, size_type __pos = 0) const _NOEXCEPT; + size_type find_first_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find_first_of(const value_type* __s, size_type __pos = 0) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find_first_of(value_type __c, size_type __pos = 0) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_last_of(const basic_string& __str, size_type __pos = npos) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find_last_of(const basic_string& __str, size_type __pos = npos) const _NOEXCEPT; 
template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS size_type find_last_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_last_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_last_of(const value_type* __s, size_type __pos = npos) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_last_of(value_type __c, size_type __pos = npos) const _NOEXCEPT; + size_type find_last_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find_last_of(const value_type* __s, size_type __pos = npos) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find_last_of(value_type __c, size_type __pos = npos) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_first_not_of(const basic_string& __str, size_type __pos = 0) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find_first_not_of(const basic_string& __str, size_type __pos = 0) const _NOEXCEPT; template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS size_type find_first_not_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_first_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_first_not_of(const value_type* __s, size_type __pos = 0) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_first_not_of(value_type __c, size_type __pos = 0) const _NOEXCEPT; + size_type find_first_not_of(const value_type* __s, size_type __pos, size_type __n) 
const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find_first_not_of(const value_type* __s, size_type __pos = 0) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find_first_not_of(value_type __c, size_type __pos = 0) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_last_not_of(const basic_string& __str, size_type __pos = npos) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find_last_not_of(const basic_string& __str, size_type __pos = npos) const _NOEXCEPT; template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS size_type find_last_not_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_last_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_last_not_of(const value_type* __s, size_type __pos = npos) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_last_not_of(value_type __c, size_type __pos = npos) const _NOEXCEPT; + size_type find_last_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find_last_not_of(const value_type* __s, size_type __pos = npos) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type find_last_not_of(value_type __c, size_type __pos = npos) const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const basic_string& __str) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI int compare(const basic_string& __str) const _NOEXCEPT; template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 int - compare(const _Tp& __t) const _NOEXCEPT; + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS int compare(const _Tp& __t) const _NOEXCEPT; template ::value, int> = 0> - 
_LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 int + _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS int compare(size_type __pos1, size_type __n1, const _Tp& __t) const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int - compare(size_type __pos1, size_type __n1, const basic_string& __str) const; - _LIBCPP_CONSTEXPR_SINCE_CXX20 int - compare(size_type __pos1, size_type __n1, const basic_string& __str, size_type __pos2, size_type __n2 = npos) const; + _LIBCPP_HIDE_FROM_ABI int compare(size_type __pos1, size_type __n1, const basic_string& __str) const; + int compare( + size_type __pos1, size_type __n1, const basic_string& __str, size_type __pos2, size_type __n2 = npos) const; template ::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int + inline _LIBCPP_HIDE_FROM_ABI int compare(size_type __pos1, size_type __n1, const _Tp& __t, size_type __pos2, size_type __n2 = npos) const; - _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const value_type* __s) const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const value_type* __s) const; - _LIBCPP_CONSTEXPR_SINCE_CXX20 int - compare(size_type __pos1, size_type __n1, const value_type* __s, size_type __n2) const; + int compare(const value_type* __s) const _NOEXCEPT; + int compare(size_type __pos1, size_type __n1, const value_type* __s) const; + int compare(size_type __pos1, size_type __n1, const value_type* __s, size_type __n2) const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __invariants() const; + _LIBCPP_HIDE_FROM_ABI bool __invariants() const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __clear_and_shrink() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void __clear_and_shrink() _NOEXCEPT; private: template - inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool friend + inline _LIBCPP_HIDE_FROM_ABI bool friend operator==(const 
basic_string, _Alloc>& __lhs, const basic_string, _Alloc>& __rhs) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __shrink_or_extend(size_type __target_capacity); + _LIBCPP_HIDE_FROM_ABI void __shrink_or_extend(size_type __target_capacity); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS bool - __is_long() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS bool __is_long() const _NOEXCEPT { if (__libcpp_is_constant_evaluated() && __builtin_constant_p(__r_.first().__l.__is_long_)) { return __r_.first().__l.__is_long_; } return __r_.first().__s.__is_long_; } - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __begin_lifetime(pointer __begin, size_type __n) { + static _LIBCPP_HIDE_FROM_ABI void __begin_lifetime(pointer __begin, size_type __n) { (void)__begin; (void)__n; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI static bool __fits_in_sso(size_type __sz) { return __sz < __min_cap; } + _LIBCPP_HIDE_FROM_ABI static bool __fits_in_sso(size_type __sz) { return __sz < __min_cap; } template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __assign_trivial(_Iterator __first, _Sentinel __last, size_type __n); + _LIBCPP_HIDE_FROM_ABI void __assign_trivial(_Iterator __first, _Sentinel __last, size_type __n); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iterator __first, _Sentinel __last); + _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iterator __first, _Sentinel __last); // Copy [__first, __last) into [__dest, __dest + (__last - __first)). Assumes that the ranges don't overlap. 
template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static value_type* + _LIBCPP_HIDE_FROM_ABI static value_type* __copy_non_overlapping_range(_ForwardIter __first, _Sent __last, value_type* __dest) { for (; __first != __last; ++__first) traits_type::assign(*__dest++, *__first); @@ -1583,7 +1487,7 @@ private: } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 iterator + _LIBCPP_HIDE_FROM_ABI iterator __insert_from_safe_copy(size_type __n, size_type __ip, _ForwardIterator __first, _Sentinel __last) { size_type __sz = size(); size_type __cap = capacity(); @@ -1607,74 +1511,60 @@ private: } template - _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator - __insert_with_size(const_iterator __pos, _Iterator __first, _Sentinel __last, size_type __n); + iterator __insert_with_size(const_iterator __pos, _Iterator __first, _Sentinel __last, size_type __n); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 allocator_type& __alloc() _NOEXCEPT { return __r_.second(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const allocator_type& __alloc() const _NOEXCEPT { return __r_.second(); } + _LIBCPP_HIDE_FROM_ABI allocator_type& __alloc() _NOEXCEPT { return __r_.second(); } + _LIBCPP_HIDE_FROM_ABI const allocator_type& __alloc() const _NOEXCEPT { return __r_.second(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS void - __set_short_size(size_type __s) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS void __set_short_size(size_type __s) _NOEXCEPT { _LIBCPP_ASSERT_INTERNAL(__s < __min_cap, "__s should never be greater than or equal to the short string capacity"); __r_.first().__s.__size_ = __s; __r_.first().__s.__is_long_ = false; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS size_type - __get_short_size() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS size_type __get_short_size() const _NOEXCEPT { 
_LIBCPP_ASSERT_INTERNAL(!__r_.first().__s.__is_long_, "String has to be short when trying to get the short size"); return __r_.first().__s.__size_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __set_long_size(size_type __s) _NOEXCEPT { - __r_.first().__l.__size_ = __s; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __get_long_size() const _NOEXCEPT { - return __r_.first().__l.__size_; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __set_size(size_type __s) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __set_long_size(size_type __s) _NOEXCEPT { __r_.first().__l.__size_ = __s; } + _LIBCPP_HIDE_FROM_ABI size_type __get_long_size() const _NOEXCEPT { return __r_.first().__l.__size_; } + _LIBCPP_HIDE_FROM_ABI void __set_size(size_type __s) _NOEXCEPT { if (__is_long()) __set_long_size(__s); else __set_short_size(__s); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __set_long_cap(size_type __s) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __set_long_cap(size_type __s) _NOEXCEPT { __r_.first().__l.__cap_ = __s / __endian_factor; __r_.first().__l.__is_long_ = true; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __get_long_cap() const _NOEXCEPT { - return __r_.first().__l.__cap_ * __endian_factor; - } + _LIBCPP_HIDE_FROM_ABI size_type __get_long_cap() const _NOEXCEPT { return __r_.first().__l.__cap_ * __endian_factor; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __set_long_pointer(pointer __p) _NOEXCEPT { - __r_.first().__l.__data_ = __p; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pointer __get_long_pointer() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __set_long_pointer(pointer __p) _NOEXCEPT { __r_.first().__l.__data_ = __p; } + _LIBCPP_HIDE_FROM_ABI pointer __get_long_pointer() _NOEXCEPT { return _LIBCPP_ASAN_VOLATILE_WRAPPER(__r_.first().__l.__data_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_pointer __get_long_pointer() const _NOEXCEPT { + 
_LIBCPP_HIDE_FROM_ABI const_pointer __get_long_pointer() const _NOEXCEPT { return _LIBCPP_ASAN_VOLATILE_WRAPPER(__r_.first().__l.__data_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS pointer - __get_short_pointer() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS pointer __get_short_pointer() _NOEXCEPT { return _LIBCPP_ASAN_VOLATILE_WRAPPER(pointer_traits::pointer_to(__r_.first().__s.__data_[0])); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS const_pointer - __get_short_pointer() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS const_pointer __get_short_pointer() const _NOEXCEPT { return _LIBCPP_ASAN_VOLATILE_WRAPPER(pointer_traits::pointer_to(__r_.first().__s.__data_[0])); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pointer __get_pointer() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI pointer __get_pointer() _NOEXCEPT { return __is_long() ? __get_long_pointer() : __get_short_pointer(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_pointer __get_pointer() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_pointer __get_pointer() const _NOEXCEPT { return __is_long() ? __get_long_pointer() : __get_short_pointer(); } // The following functions are no-ops outside of AddressSanitizer mode. 
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void - __annotate_contiguous_container(const void* __old_mid, const void* __new_mid) const { + _LIBCPP_HIDE_FROM_ABI void __annotate_contiguous_container(const void* __old_mid, const void* __new_mid) const { (void)__old_mid; (void)__new_mid; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) @@ -1687,7 +1577,7 @@ private: #endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_new(size_type __current_size) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __annotate_new(size_type __current_size) const _NOEXCEPT { (void)__current_size; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated()) @@ -1695,14 +1585,14 @@ private: #endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_delete() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __annotate_delete() const _NOEXCEPT { #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated()) __annotate_contiguous_container(data() + size() + 1, data() + capacity() + 1); #endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_increase(size_type __n) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __annotate_increase(size_type __n) const _NOEXCEPT { (void)__n; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated()) @@ -1710,7 +1600,7 @@ private: #endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_shrink(size_type __old_size) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __annotate_shrink(size_type __old_size) const _NOEXCEPT { (void)__old_size; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated()) @@ -1719,11 +1609,11 @@ private: } template - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type 
__align_it(size_type __s) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI size_type __align_it(size_type __s) _NOEXCEPT { return (__s + (__a - 1)) & ~(__a - 1); } enum { __alignment = 8 }; - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __recommend(size_type __s) _NOEXCEPT { + static _LIBCPP_HIDE_FROM_ABI size_type __recommend(size_type __s) _NOEXCEPT { if (__s < __min_cap) { return static_cast(__min_cap) - 1; } @@ -1734,9 +1624,9 @@ private: return __guess; } - inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(const value_type* __s, size_type __sz, size_type __reserve); - inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(const value_type* __s, size_type __sz); - inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(size_type __n, value_type __c); + inline void __init(const value_type* __s, size_type __sz, size_type __reserve); + inline void __init(const value_type* __s, size_type __sz); + inline void __init(size_type __n, value_type __c); // Slow path for the (inlined) copy constructor for 'long' strings. // Always externally instantiated and not inlined. @@ -1746,22 +1636,19 @@ private: // to call the __init() functions as those are marked as inline which may // result in over-aggressive inlining by the compiler, where our aim is // to only inline the fast path code directly in the ctor. 
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NOINLINE void __init_copy_ctor_external(const value_type* __s, size_type __sz); + _LIBCPP_NOINLINE void __init_copy_ctor_external(const value_type* __s, size_type __sz); template ::value, int> = 0> - inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(_InputIterator __first, _InputIterator __last); + inline void __init(_InputIterator __first, _InputIterator __last); template ::value, int> = 0> - inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(_ForwardIterator __first, _ForwardIterator __last); + inline void __init(_ForwardIterator __first, _ForwardIterator __last); template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void - __init_with_sentinel(_InputIterator __first, _Sentinel __last); + _LIBCPP_HIDE_FROM_ABI void __init_with_sentinel(_InputIterator __first, _Sentinel __last); template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void - __init_with_size(_InputIterator __first, _Sentinel __last, size_type __sz); + _LIBCPP_HIDE_FROM_ABI void __init_with_size(_InputIterator __first, _Sentinel __last, size_type __sz); - _LIBCPP_CONSTEXPR_SINCE_CXX20 #if _LIBCPP_ABI_VERSION >= 2 // We want to use the function in the dylib in ABIv1 _LIBCPP_HIDE_FROM_ABI #endif @@ -1772,14 +1659,14 @@ private: size_type __n_copy, size_type __n_del, size_type __n_add = 0); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __grow_by_without_replace( + _LIBCPP_HIDE_FROM_ABI void __grow_by_without_replace( size_type __old_cap, size_type __delta_cap, size_type __old_sz, size_type __n_copy, size_type __n_del, size_type __n_add = 0); - _LIBCPP_CONSTEXPR_SINCE_CXX20 void __grow_by_and_replace( + void __grow_by_and_replace( size_type __old_cap, size_type __delta_cap, size_type __old_sz, @@ -1792,22 +1679,22 @@ private: // have proof that the input does not alias the current instance. // For example, operator=(basic_string) performs a 'self' check. 
template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NOINLINE basic_string& __assign_no_alias(const value_type* __s, size_type __n); + _LIBCPP_NOINLINE basic_string& __assign_no_alias(const value_type* __s, size_type __n); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __erase_to_end(size_type __pos) { + _LIBCPP_HIDE_FROM_ABI void __erase_to_end(size_type __pos) { __null_terminate_at(std::__to_address(__get_pointer()), __pos); } // __erase_external_with_move is invoked for erase() invocations where // `n ~= npos`, likely requiring memory moves on the string data. - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NOINLINE void __erase_external_with_move(size_type __pos, size_type __n); + _LIBCPP_NOINLINE void __erase_external_with_move(size_type __pos, size_type __n); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __copy_assign_alloc(const basic_string& __str) { + _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const basic_string& __str) { __copy_assign_alloc( __str, integral_constant()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __copy_assign_alloc(const basic_string& __str, true_type) { + _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const basic_string& __str, true_type) { if (__alloc() == __str.__alloc()) __alloc() = __str.__alloc(); else { @@ -1830,28 +1717,22 @@ private: } } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void - __copy_assign_alloc(const basic_string&, false_type) _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const basic_string&, false_type) _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __move_assign_alloc(basic_string& __str) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_move_assignment::value || - is_nothrow_move_assignable::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(basic_string& __str) { __move_assign_alloc( __str, integral_constant()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __move_assign_alloc(basic_string& __c, 
true_type) - _NOEXCEPT_(is_nothrow_move_assignable::value) { - __alloc() = std::move(__c.__alloc()); - } + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(basic_string& __c, true_type) { __alloc() = std::move(__c.__alloc()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __move_assign_alloc(basic_string&, false_type) _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(basic_string&, false_type) _NOEXCEPT {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NOINLINE basic_string& __assign_external(const value_type* __s); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NOINLINE basic_string& __assign_external(const value_type* __s, size_type __n); + _LIBCPP_NOINLINE basic_string& __assign_external(const value_type* __s); + _LIBCPP_NOINLINE basic_string& __assign_external(const value_type* __s, size_type __n); // Assigns the value in __s, guaranteed to be __n < __min_cap in length. - inline _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& __assign_short(const value_type* __s, size_type __n) { + inline basic_string& __assign_short(const value_type* __s, size_type __n) { size_type __old_size = size(); if (__n > __old_size) __annotate_increase(__n - __old_size); @@ -1864,8 +1745,7 @@ private: return *this; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - __null_terminate_at(value_type* __p, size_type __newsz) { + _LIBCPP_HIDE_FROM_ABI basic_string& __null_terminate_at(value_type* __p, size_type __newsz) { size_type __old_size = size(); if (__newsz > __old_size) __annotate_increase(__newsz - __old_size); @@ -1877,7 +1757,7 @@ private: } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __addr_in_range(const _Tp& __v) const { + _LIBCPP_HIDE_FROM_ABI bool __addr_in_range(const _Tp& __v) const { return std::__is_pointer_in_range(data(), data() + size() + 1, std::addressof(__v)); } @@ -1889,11 +1769,11 @@ private: std::__throw_out_of_range("basic_string"); } - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const 
basic_string&, const basic_string&); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const value_type*, const basic_string&); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(value_type, const basic_string&); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, const value_type*); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, value_type); + friend basic_string operator+ <>(const basic_string&, const basic_string&); + friend basic_string operator+ <>(const value_type*, const basic_string&); + friend basic_string operator+ <>(value_type, const basic_string&); + friend basic_string operator+ <>(const basic_string&, const value_type*); + friend basic_string operator+ <>(const basic_string&, value_type); }; // These declarations must appear before any functions are implicitly used @@ -1913,8 +1793,7 @@ _LIBCPP_STRING_V1_EXTERN_TEMPLATE_LIST(_LIBCPP_DECLARE, wchar_t) #undef _LIBCPP_DECLARE template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -basic_string<_CharT, _Traits, _Allocator>::__init(const value_type* __s, size_type __sz, size_type __reserve) { +void basic_string<_CharT, _Traits, _Allocator>::__init(const value_type* __s, size_type __sz, size_type __reserve) { if (__libcpp_is_constant_evaluated()) __r_.first() = __rep(); if (__reserve > max_size()) @@ -1937,8 +1816,7 @@ basic_string<_CharT, _Traits, _Allocator>::__init(const value_type* __s, size_ty } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -basic_string<_CharT, _Traits, _Allocator>::__init(const value_type* __s, size_type __sz) { +void basic_string<_CharT, _Traits, _Allocator>::__init(const value_type* __s, size_type __sz) { if (__libcpp_is_constant_evaluated()) __r_.first() = __rep(); if (__sz > max_size()) @@ -1961,7 +1839,7 @@ basic_string<_CharT, _Traits, _Allocator>::__init(const value_type* __s, size_ty } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NOINLINE void +_LIBCPP_NOINLINE void 
basic_string<_CharT, _Traits, _Allocator>::__init_copy_ctor_external(const value_type* __s, size_type __sz) { if (__libcpp_is_constant_evaluated()) __r_.first() = __rep(); @@ -1985,7 +1863,7 @@ basic_string<_CharT, _Traits, _Allocator>::__init_copy_ctor_external(const value } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__init(size_type __n, value_type __c) { +void basic_string<_CharT, _Traits, _Allocator>::__init(size_type __n, value_type __c) { if (__libcpp_is_constant_evaluated()) __r_.first() = __rep(); @@ -2010,14 +1888,13 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__ template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -basic_string<_CharT, _Traits, _Allocator>::__init(_InputIterator __first, _InputIterator __last) { +void basic_string<_CharT, _Traits, _Allocator>::__init(_InputIterator __first, _InputIterator __last) { __init_with_sentinel(std::move(__first), std::move(__last)); } template template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +_LIBCPP_HIDE_FROM_ABI void basic_string<_CharT, _Traits, _Allocator>::__init_with_sentinel(_InputIterator __first, _Sentinel __last) { __r_.first() = __rep(); __annotate_new(0); @@ -2039,15 +1916,14 @@ basic_string<_CharT, _Traits, _Allocator>::__init_with_sentinel(_InputIterator _ template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -basic_string<_CharT, _Traits, _Allocator>::__init(_ForwardIterator __first, _ForwardIterator __last) { +void basic_string<_CharT, _Traits, _Allocator>::__init(_ForwardIterator __first, _ForwardIterator __last) { size_type __sz = static_cast(std::distance(__first, __last)); __init_with_size(__first, __last, __sz); } template template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +_LIBCPP_HIDE_FROM_ABI void basic_string<_CharT, _Traits, _Allocator>::__init_with_size(_InputIterator __first, _Sentinel __last, size_type __sz) { if 
(__libcpp_is_constant_evaluated()) __r_.first() = __rep(); @@ -2085,7 +1961,7 @@ basic_string<_CharT, _Traits, _Allocator>::__init_with_size(_InputIterator __fir } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__grow_by_and_replace( +void basic_string<_CharT, _Traits, _Allocator>::__grow_by_and_replace( size_type __old_cap, size_type __delta_cap, size_type __old_sz, @@ -2125,17 +2001,17 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__ // may also not set the size at all when the string was short initially. This leads to unpredictable size value. It is // not removed or changed to avoid breaking the ABI. template -void _LIBCPP_CONSTEXPR_SINCE_CXX20 +void #if _LIBCPP_ABI_VERSION >= 2 // We want to use the function in the dylib in ABIv1 -_LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI #endif -_LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Traits, _Allocator>::__grow_by( - size_type __old_cap, - size_type __delta_cap, - size_type __old_sz, - size_type __n_copy, - size_type __n_del, - size_type __n_add) { + _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Traits, _Allocator>::__grow_by( + size_type __old_cap, + size_type __delta_cap, + size_type __old_sz, + size_type __n_copy, + size_type __n_del, + size_type __n_add) { size_type __ms = max_size(); if (__delta_cap > __ms - __old_cap) __throw_length_error(); @@ -2159,8 +2035,7 @@ _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Trait } template -void _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI -basic_string<_CharT, _Traits, _Allocator>::__grow_by_without_replace( +void _LIBCPP_HIDE_FROM_ABI basic_string<_CharT, _Traits, _Allocator>::__grow_by_without_replace( size_type __old_cap, size_type __delta_cap, size_type __old_sz, @@ -2178,7 +2053,7 @@ basic_string<_CharT, _Traits, _Allocator>::__grow_by_without_replace( template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 
_LIBCPP_NOINLINE basic_string<_CharT, _Traits, _Allocator>& +_LIBCPP_NOINLINE basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::__assign_no_alias(const value_type* __s, size_type __n) { size_type __cap = __is_short ? static_cast(__min_cap) : __get_long_cap(); if (__n < __cap) { @@ -2199,7 +2074,7 @@ basic_string<_CharT, _Traits, _Allocator>::__assign_no_alias(const value_type* _ } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NOINLINE basic_string<_CharT, _Traits, _Allocator>& +_LIBCPP_NOINLINE basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::__assign_external(const value_type* __s, size_type __n) { size_type __cap = capacity(); if (__cap >= __n) { @@ -2217,14 +2092,14 @@ basic_string<_CharT, _Traits, _Allocator>::__assign_external(const value_type* _ } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::assign(const value_type* __s, size_type __n) { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::assign received nullptr"); return (__builtin_constant_p(__n) && __fits_in_sso(__n)) ? 
__assign_short(__s, __n) : __assign_external(__s, __n); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::assign(size_type __n, value_type __c) { size_type __cap = capacity(); size_type __old_size = size(); @@ -2240,8 +2115,7 @@ basic_string<_CharT, _Traits, _Allocator>::assign(size_type __n, value_type __c) } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::operator=(value_type __c) { +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::operator=(value_type __c) { pointer __p; size_type __old_size = size(); if (__old_size == 0) @@ -2261,7 +2135,7 @@ basic_string<_CharT, _Traits, _Allocator>::operator=(value_type __c) { } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS basic_string<_CharT, _Traits, _Allocator>& +_LIBCPP_STRING_INTERNAL_MEMORY_ACCESS basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::operator=(const basic_string& __str) { if (this != std::addressof(__str)) { __copy_assign_alloc(__str); @@ -2285,7 +2159,7 @@ basic_string<_CharT, _Traits, _Allocator>::operator=(const basic_string& __str) template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::assign(_InputIterator __first, _InputIterator __last) { __assign_with_sentinel(__first, __last); return *this; @@ -2293,7 +2167,7 @@ basic_string<_CharT, _Traits, _Allocator>::assign(_InputIterator __first, _Input template template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +_LIBCPP_HIDE_FROM_ABI void basic_string<_CharT, _Traits, _Allocator>::__assign_with_sentinel(_InputIterator __first, _Sentinel __last) { const basic_string __temp(__init_with_sentinel_tag(), 
std::move(__first), std::move(__last), __alloc()); assign(__temp.data(), __temp.size()); @@ -2301,7 +2175,7 @@ basic_string<_CharT, _Traits, _Allocator>::__assign_with_sentinel(_InputIterator template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::assign(_ForwardIterator __first, _ForwardIterator __last) { if (__string_is_trivial_iterator<_ForwardIterator>::value) { size_type __n = static_cast(std::distance(__first, __last)); @@ -2315,7 +2189,7 @@ basic_string<_CharT, _Traits, _Allocator>::assign(_ForwardIterator __first, _For template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void +_LIBCPP_HIDE_FROM_ABI void basic_string<_CharT, _Traits, _Allocator>::__assign_trivial(_Iterator __first, _Sentinel __last, size_type __n) { _LIBCPP_ASSERT_INTERNAL( __string_is_trivial_iterator<_Iterator>::value, "The iterator type given to `__assign_trivial` must be trivial"); @@ -2344,7 +2218,7 @@ basic_string<_CharT, _Traits, _Allocator>::__assign_trivial(_Iterator __first, _ } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::assign(const basic_string& __str, size_type __pos, size_type __n) { size_type __sz = __str.size(); if (__pos > __sz) @@ -2357,7 +2231,7 @@ template ::value && !__is_same_uncvref<_Tp, basic_string<_CharT, _Traits, _Allocator> >::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::assign(const _Tp& __t, size_type __pos, size_type __n) { __self_view __sv = __t; size_type __sz = __sv.size(); @@ -2367,14 +2241,13 @@ basic_string<_CharT, _Traits, _Allocator>::assign(const _Tp& __t, size_type __po } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NOINLINE 
basic_string<_CharT, _Traits, _Allocator>& +_LIBCPP_NOINLINE basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::__assign_external(const value_type* __s) { return __assign_external(__s, traits_type::length(__s)); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::assign(const value_type* __s) { +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::assign(const value_type* __s) { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::assign received nullptr"); return __builtin_constant_p(*__s) ? (__fits_in_sso(traits_type::length(__s)) ? __assign_short(__s, traits_type::length(__s)) @@ -2384,7 +2257,7 @@ basic_string<_CharT, _Traits, _Allocator>::assign(const value_type* __s) { // append template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::append(const value_type* __s, size_type __n) { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::append received nullptr"); size_type __cap = capacity(); @@ -2404,7 +2277,7 @@ basic_string<_CharT, _Traits, _Allocator>::append(const value_type* __s, size_ty } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::append(size_type __n, value_type __c) { if (__n) { size_type __cap = capacity(); @@ -2422,8 +2295,7 @@ basic_string<_CharT, _Traits, _Allocator>::append(size_type __n, value_type __c) } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline void -basic_string<_CharT, _Traits, _Allocator>::__append_default_init(size_type __n) { +inline void basic_string<_CharT, _Traits, _Allocator>::__append_default_init(size_type __n) { if (__n) { size_type __cap = capacity(); size_type __sz = size(); @@ -2438,7 +2310,7 @@ basic_string<_CharT, _Traits, 
_Allocator>::__append_default_init(size_type __n) } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::push_back(value_type __c) { +void basic_string<_CharT, _Traits, _Allocator>::push_back(value_type __c) { bool __is_short = !__is_long(); size_type __cap; size_type __sz; @@ -2469,7 +2341,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::pu template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::append(_ForwardIterator __first, _ForwardIterator __last) { size_type __sz = size(); size_type __cap = capacity(); @@ -2491,7 +2363,7 @@ basic_string<_CharT, _Traits, _Allocator>::append(_ForwardIterator __first, _For } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::append(const basic_string& __str, size_type __pos, size_type __n) { size_type __sz = __str.size(); if (__pos > __sz) @@ -2504,7 +2376,7 @@ template ::value && !__is_same_uncvref<_Tp, basic_string<_CharT, _Traits, _Allocator> >::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::append(const _Tp& __t, size_type __pos, size_type __n) { __self_view __sv = __t; size_type __sz = __sv.size(); @@ -2514,8 +2386,7 @@ basic_string<_CharT, _Traits, _Allocator>::append(const _Tp& __t, size_type __po } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::append(const value_type* __s) { +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::append(const value_type* __s) { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::append received nullptr"); return append(__s, 
traits_type::length(__s)); } @@ -2523,7 +2394,7 @@ basic_string<_CharT, _Traits, _Allocator>::append(const value_type* __s) { // insert template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::insert(size_type __pos, const value_type* __s, size_type __n) { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::insert received nullptr"); size_type __sz = size(); @@ -2551,7 +2422,7 @@ basic_string<_CharT, _Traits, _Allocator>::insert(size_type __pos, const value_t } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::insert(size_type __pos, size_type __n, value_type __c) { size_type __sz = size(); if (__pos > __sz) @@ -2579,7 +2450,7 @@ basic_string<_CharT, _Traits, _Allocator>::insert(size_type __pos, size_type __n template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::iterator +typename basic_string<_CharT, _Traits, _Allocator>::iterator basic_string<_CharT, _Traits, _Allocator>::insert(const_iterator __pos, _InputIterator __first, _InputIterator __last) { const basic_string __temp(__first, __last, __alloc()); return insert(__pos, __temp.data(), __temp.data() + __temp.size()); @@ -2587,8 +2458,7 @@ basic_string<_CharT, _Traits, _Allocator>::insert(const_iterator __pos, _InputIt template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::iterator -basic_string<_CharT, _Traits, _Allocator>::insert( +typename basic_string<_CharT, _Traits, _Allocator>::iterator basic_string<_CharT, _Traits, _Allocator>::insert( const_iterator __pos, _ForwardIterator __first, _ForwardIterator __last) { auto __n = static_cast(std::distance(__first, __last)); return __insert_with_size(__pos, __first, __last, __n); @@ -2596,7 +2466,7 @@ 
basic_string<_CharT, _Traits, _Allocator>::insert( template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::iterator +typename basic_string<_CharT, _Traits, _Allocator>::iterator basic_string<_CharT, _Traits, _Allocator>::__insert_with_size( const_iterator __pos, _Iterator __first, _Sentinel __last, size_type __n) { size_type __ip = static_cast(__pos - begin()); @@ -2612,8 +2482,7 @@ basic_string<_CharT, _Traits, _Allocator>::__insert_with_size( } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::insert( +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::insert( size_type __pos1, const basic_string& __str, size_type __pos2, size_type __n) { size_type __str_sz = __str.size(); if (__pos2 > __str_sz) @@ -2626,7 +2495,7 @@ template ::value && !__is_same_uncvref<_Tp, basic_string<_CharT, _Traits, _Allocator> >::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::insert(size_type __pos1, const _Tp& __t, size_type __pos2, size_type __n) { __self_view __sv = __t; size_type __str_sz = __sv.size(); @@ -2636,14 +2505,14 @@ basic_string<_CharT, _Traits, _Allocator>::insert(size_type __pos1, const _Tp& _ } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::insert(size_type __pos, const value_type* __s) { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::insert received nullptr"); return insert(__pos, __s, traits_type::length(__s)); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::iterator +typename basic_string<_CharT, _Traits, _Allocator>::iterator basic_string<_CharT, _Traits, _Allocator>::insert(const_iterator __pos, value_type __c) { 
size_type __ip = static_cast(__pos - begin()); size_type __sz = size(); @@ -2668,8 +2537,7 @@ basic_string<_CharT, _Traits, _Allocator>::insert(const_iterator __pos, value_ty // replace template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::replace( +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::replace( size_type __pos, size_type __n1, const value_type* __s, size_type __n2) _LIBCPP_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK { _LIBCPP_ASSERT_NON_NULL(__n2 == 0 || __s != nullptr, "string::replace received nullptr"); @@ -2713,7 +2581,7 @@ basic_string<_CharT, _Traits, _Allocator>::replace( } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::replace(size_type __pos, size_type __n1, size_type __n2, value_type __c) { size_type __sz = size(); if (__pos > __sz) @@ -2740,16 +2608,14 @@ basic_string<_CharT, _Traits, _Allocator>::replace(size_type __pos, size_type __ template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::replace( +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::replace( const_iterator __i1, const_iterator __i2, _InputIterator __j1, _InputIterator __j2) { const basic_string __temp(__j1, __j2, __alloc()); return replace(__i1, __i2, __temp); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::replace( +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::replace( size_type __pos1, size_type __n1, const basic_string& __str, size_type __pos2, size_type __n2) { size_type __str_sz = __str.size(); if (__pos2 > __str_sz) @@ -2762,8 +2628,7 @@ template ::value && !__is_same_uncvref<_Tp, 
basic_string<_CharT, _Traits, _Allocator> >::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::replace( +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::replace( size_type __pos1, size_type __n1, const _Tp& __t, size_type __pos2, size_type __n2) { __self_view __sv = __t; size_type __str_sz = __sv.size(); @@ -2773,7 +2638,7 @@ basic_string<_CharT, _Traits, _Allocator>::replace( } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::replace(size_type __pos, size_type __n1, const value_type* __s) { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::replace received nullptr"); return replace(__pos, __n1, __s, traits_type::length(__s)); @@ -2784,7 +2649,7 @@ basic_string<_CharT, _Traits, _Allocator>::replace(size_type __pos, size_type __ // 'externally instantiated' erase() implementation, called when __n != npos. 
// Does not check __pos against size() template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NOINLINE void +_LIBCPP_NOINLINE void basic_string<_CharT, _Traits, _Allocator>::__erase_external_with_move(size_type __pos, size_type __n) { if (__n) { size_type __sz = size(); @@ -2798,7 +2663,7 @@ basic_string<_CharT, _Traits, _Allocator>::__erase_external_with_move(size_type } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& +basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::erase(size_type __pos, size_type __n) { if (__pos > size()) __throw_out_of_range(); @@ -2811,7 +2676,7 @@ basic_string<_CharT, _Traits, _Allocator>::erase(size_type __pos, size_type __n) } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::iterator +inline typename basic_string<_CharT, _Traits, _Allocator>::iterator basic_string<_CharT, _Traits, _Allocator>::erase(const_iterator __pos) { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __pos != end(), "string::erase(iterator) called with a non-dereferenceable iterator"); @@ -2822,7 +2687,7 @@ basic_string<_CharT, _Traits, _Allocator>::erase(const_iterator __pos) { } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::iterator +inline typename basic_string<_CharT, _Traits, _Allocator>::iterator basic_string<_CharT, _Traits, _Allocator>::erase(const_iterator __first, const_iterator __last) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__first <= __last, "string::erase(first, last) called with invalid range"); iterator __b = begin(); @@ -2832,13 +2697,13 @@ basic_string<_CharT, _Traits, _Allocator>::erase(const_iterator __first, const_i } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::pop_back() { +inline void basic_string<_CharT, _Traits, _Allocator>::pop_back() { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string::pop_back(): string is already empty"); 
__erase_to_end(size() - 1); } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::clear() _NOEXCEPT { +inline void basic_string<_CharT, _Traits, _Allocator>::clear() _NOEXCEPT { size_type __old_size = size(); if (__is_long()) { traits_type::assign(*__get_long_pointer(), value_type()); @@ -2851,7 +2716,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocat } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::resize(size_type __n, value_type __c) { +void basic_string<_CharT, _Traits, _Allocator>::resize(size_type __n, value_type __c) { size_type __sz = size(); if (__n > __sz) append(__n - __sz, __c); @@ -2860,8 +2725,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::re } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline void -basic_string<_CharT, _Traits, _Allocator>::__resize_default_init(size_type __n) { +inline void basic_string<_CharT, _Traits, _Allocator>::__resize_default_init(size_type __n) { size_type __sz = size(); if (__n > __sz) { __append_default_init(__n - __sz); @@ -2870,7 +2734,7 @@ basic_string<_CharT, _Traits, _Allocator>::__resize_default_init(size_type __n) } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::reserve(size_type __requested_capacity) { +void basic_string<_CharT, _Traits, _Allocator>::reserve(size_type __requested_capacity) { if (__requested_capacity > max_size()) __throw_length_error(); @@ -2889,7 +2753,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::re } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::shrink_to_fit() _NOEXCEPT { +inline void basic_string<_CharT, _Traits, _Allocator>::shrink_to_fit() _NOEXCEPT { size_type __target_capacity = __recommend(size()); if (__target_capacity == capacity()) return; @@ -2898,8 +2762,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void 
basic_string<_CharT, _Traits, _Allocat } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void -basic_string<_CharT, _Traits, _Allocator>::__shrink_or_extend(size_type __target_capacity) { +inline void basic_string<_CharT, _Traits, _Allocator>::__shrink_or_extend(size_type __target_capacity) { __annotate_delete(); size_type __cap = capacity(); size_type __sz = size(); @@ -2961,7 +2824,7 @@ basic_string<_CharT, _Traits, _Allocator>::__shrink_or_extend(size_type __target } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::const_reference +typename basic_string<_CharT, _Traits, _Allocator>::const_reference basic_string<_CharT, _Traits, _Allocator>::at(size_type __n) const { if (__n >= size()) __throw_out_of_range(); @@ -2969,7 +2832,7 @@ basic_string<_CharT, _Traits, _Allocator>::at(size_type __n) const { } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::reference +typename basic_string<_CharT, _Traits, _Allocator>::reference basic_string<_CharT, _Traits, _Allocator>::at(size_type __n) { if (__n >= size()) __throw_out_of_range(); @@ -2977,7 +2840,7 @@ basic_string<_CharT, _Traits, _Allocator>::at(size_type __n) { } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::copy(value_type* __s, size_type __n, size_type __pos) const { size_type __sz = size(); if (__pos > __sz) @@ -2988,8 +2851,7 @@ basic_string<_CharT, _Traits, _Allocator>::copy(value_type* __s, size_type __n, } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::swap(basic_string& __str) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v) { +inline void basic_string<_CharT, _Traits, _Allocator>::swap(basic_string& __str) { _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR( 
__alloc_traits::propagate_on_container_swap::value || __alloc_traits::is_always_equal::value || __alloc() == __str.__alloc(), @@ -3017,28 +2879,28 @@ struct _LIBCPP_HIDDEN __traits_eq { }; template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find(): received nullptr"); return std::__str_find(data(), size(), __s, __pos, __n); } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find(const basic_string& __str, size_type __pos) const _NOEXCEPT { return std::__str_find(data(), size(), __str.data(), __pos, __str.size()); } template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find(const _Tp& __t, size_type __pos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find(data(), size(), __sv.data(), __pos, __sv.size()); } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find(const value_type* __s, size_type __pos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find(): received nullptr"); return std::__str_find( @@ -3046,7 +2908,7 @@ basic_string<_CharT, _Traits, _Allocator>::find(const value_type* __s, size_type } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +typename 
basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find(value_type __c, size_type __pos) const _NOEXCEPT { return std::__str_find(data(), size(), __c, __pos); } @@ -3054,29 +2916,28 @@ basic_string<_CharT, _Traits, _Allocator>::find(value_type __c, size_type __pos) // rfind template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type -basic_string<_CharT, _Traits, _Allocator>::rfind( +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::rfind( const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::rfind(): received nullptr"); return std::__str_rfind(data(), size(), __s, __pos, __n); } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::rfind(const basic_string& __str, size_type __pos) const _NOEXCEPT { return std::__str_rfind(data(), size(), __str.data(), __pos, __str.size()); } template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::rfind(const _Tp& __t, size_type __pos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_rfind(data(), size(), __sv.data(), __pos, __sv.size()); } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::rfind(const value_type* __s, size_type __pos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::rfind(): received nullptr"); return std::__str_rfind( @@ -3084,7 +2945,7 @@ basic_string<_CharT, 
_Traits, _Allocator>::rfind(const value_type* __s, size_typ } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::rfind(value_type __c, size_type __pos) const _NOEXCEPT { return std::__str_rfind(data(), size(), __c, __pos); } @@ -3092,15 +2953,14 @@ basic_string<_CharT, _Traits, _Allocator>::rfind(value_type __c, size_type __pos // find_first_of template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type -basic_string<_CharT, _Traits, _Allocator>::find_first_of( +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_first_of( const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_first_of(): received nullptr"); return std::__str_find_first_of(data(), size(), __s, __pos, __n); } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_first_of(const basic_string& __str, size_type __pos) const _NOEXCEPT { return std::__str_find_first_of( data(), size(), __str.data(), __pos, __str.size()); @@ -3108,7 +2968,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_first_of(const basic_string& __s template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_first_of(const _Tp& __t, size_type __pos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_first_of( @@ -3116,7 +2976,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_first_of(const _Tp& __t, size_ty } template -inline 
_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_first_of(const value_type* __s, size_type __pos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find_first_of(): received nullptr"); return std::__str_find_first_of( @@ -3124,7 +2984,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_first_of(const value_type* __s, } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_first_of(value_type __c, size_type __pos) const _NOEXCEPT { return find(__c, __pos); } @@ -3132,7 +2992,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_first_of(value_type __c, size_ty // find_last_of template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_last_of( const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_last_of(): received nullptr"); @@ -3140,7 +3000,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_last_of( } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_last_of(const basic_string& __str, size_type __pos) const _NOEXCEPT { return std::__str_find_last_of( data(), size(), __str.data(), __pos, __str.size()); @@ -3148,7 +3008,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_last_of(const basic_string& __st template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, 
_Traits, _Allocator>::size_type +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_last_of(const _Tp& __t, size_type __pos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_last_of( @@ -3156,7 +3016,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_last_of(const _Tp& __t, size_typ } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_last_of(const value_type* __s, size_type __pos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find_last_of(): received nullptr"); return std::__str_find_last_of( @@ -3164,7 +3024,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_last_of(const value_type* __s, s } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_last_of(value_type __c, size_type __pos) const _NOEXCEPT { return rfind(__c, __pos); } @@ -3172,7 +3032,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_last_of(value_type __c, size_typ // find_first_not_of template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_first_not_of( const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_first_not_of(): received nullptr"); @@ -3180,7 +3040,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_first_not_of( } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type 
basic_string<_CharT, _Traits, _Allocator>::find_first_not_of( const basic_string& __str, size_type __pos) const _NOEXCEPT { return std::__str_find_first_not_of( @@ -3189,7 +3049,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_first_not_of( template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_first_not_of(const _Tp& __t, size_type __pos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_first_not_of( @@ -3197,7 +3057,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_first_not_of(const _Tp& __t, siz } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_first_not_of(const value_type* __s, size_type __pos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of( @@ -3205,7 +3065,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_first_not_of(const value_type* _ } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_first_not_of(value_type __c, size_type __pos) const _NOEXCEPT { return std::__str_find_first_not_of(data(), size(), __c, __pos); } @@ -3213,7 +3073,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_first_not_of(value_type __c, siz // find_last_not_of template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_last_not_of( const value_type* __s, size_type __pos, size_type 
__n) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_last_not_of(): received nullptr"); @@ -3221,7 +3081,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_last_not_of( } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_last_not_of( const basic_string& __str, size_type __pos) const _NOEXCEPT { return std::__str_find_last_not_of( @@ -3230,7 +3090,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_last_not_of( template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_last_not_of(const _Tp& __t, size_type __pos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_last_not_of( @@ -3238,7 +3098,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_last_not_of(const _Tp& __t, size } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_last_not_of(const value_type* __s, size_type __pos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of( @@ -3246,7 +3106,7 @@ basic_string<_CharT, _Traits, _Allocator>::find_last_not_of(const value_type* __ } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::size_type +inline typename basic_string<_CharT, _Traits, _Allocator>::size_type basic_string<_CharT, _Traits, _Allocator>::find_last_not_of(value_type __c, size_type __pos) const _NOEXCEPT { return std::__str_find_last_not_of(data(), size(), __c, __pos); } @@ -3255,7 +3115,7 
@@ basic_string<_CharT, _Traits, _Allocator>::find_last_not_of(value_type __c, size template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 int basic_string<_CharT, _Traits, _Allocator>::compare(const _Tp& __t) const _NOEXCEPT { +int basic_string<_CharT, _Traits, _Allocator>::compare(const _Tp& __t) const _NOEXCEPT { __self_view __sv = __t; size_t __lhs_sz = size(); size_t __rhs_sz = __sv.size(); @@ -3270,13 +3130,12 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 int basic_string<_CharT, _Traits, _Allocator>::com } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 int -basic_string<_CharT, _Traits, _Allocator>::compare(const basic_string& __str) const _NOEXCEPT { +inline int basic_string<_CharT, _Traits, _Allocator>::compare(const basic_string& __str) const _NOEXCEPT { return compare(__self_view(__str)); } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 int basic_string<_CharT, _Traits, _Allocator>::compare( +inline int basic_string<_CharT, _Traits, _Allocator>::compare( size_type __pos1, size_type __n1, const value_type* __s, size_type __n2) const { _LIBCPP_ASSERT_NON_NULL(__n2 == 0 || __s != nullptr, "string::compare(): received nullptr"); size_type __sz = size(); @@ -3295,14 +3154,13 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 int basic_string<_CharT, _Traits, _Allocato template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 int -basic_string<_CharT, _Traits, _Allocator>::compare(size_type __pos1, size_type __n1, const _Tp& __t) const { +int basic_string<_CharT, _Traits, _Allocator>::compare(size_type __pos1, size_type __n1, const _Tp& __t) const { __self_view __sv = __t; return compare(__pos1, __n1, __sv.data(), __sv.size()); } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 int +inline int basic_string<_CharT, _Traits, _Allocator>::compare(size_type __pos1, size_type __n1, const basic_string& __str) const { return compare(__pos1, __n1, __str.data(), __str.size()); } @@ -3312,28 +3170,26 @@ template ::value && !__is_same_uncvref<_Tp, basic_string<_CharT, _Traits, 
_Allocator> >::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 int basic_string<_CharT, _Traits, _Allocator>::compare( +int basic_string<_CharT, _Traits, _Allocator>::compare( size_type __pos1, size_type __n1, const _Tp& __t, size_type __pos2, size_type __n2) const { __self_view __sv = __t; return __self_view(*this).substr(__pos1, __n1).compare(__sv.substr(__pos2, __n2)); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 int basic_string<_CharT, _Traits, _Allocator>::compare( +int basic_string<_CharT, _Traits, _Allocator>::compare( size_type __pos1, size_type __n1, const basic_string& __str, size_type __pos2, size_type __n2) const { return compare(__pos1, __n1, __self_view(__str), __pos2, __n2); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 int -basic_string<_CharT, _Traits, _Allocator>::compare(const value_type* __s) const _NOEXCEPT { +int basic_string<_CharT, _Traits, _Allocator>::compare(const value_type* __s) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::compare(): received nullptr"); return compare(0, npos, __s, traits_type::length(__s)); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 int -basic_string<_CharT, _Traits, _Allocator>::compare(size_type __pos1, size_type __n1, const value_type* __s) const { +int basic_string<_CharT, _Traits, _Allocator>::compare(size_type __pos1, size_type __n1, const value_type* __s) const { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string::compare(): received nullptr"); return compare(__pos1, __n1, __s, traits_type::length(__s)); } @@ -3341,7 +3197,7 @@ basic_string<_CharT, _Traits, _Allocator>::compare(size_type __pos1, size_type _ // __invariants template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 bool basic_string<_CharT, _Traits, _Allocator>::__invariants() const { +inline bool basic_string<_CharT, _Traits, _Allocator>::__invariants() const { if (size() > capacity()) return false; if (capacity() < __min_cap - 1) @@ -3356,7 +3212,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 bool basic_string<_CharT, _Traits, _Allocat // 
__clear_and_shrink template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__clear_and_shrink() _NOEXCEPT { +inline void basic_string<_CharT, _Traits, _Allocator>::__clear_and_shrink() _NOEXCEPT { clear(); if (__is_long()) { __annotate_delete(); @@ -3368,17 +3224,15 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocat // operator== template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool -operator==(const basic_string<_CharT, _Traits, _Allocator>& __lhs, - const basic_string<_CharT, _Traits, _Allocator>& __rhs) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const basic_string<_CharT, _Traits, _Allocator>& __lhs, + const basic_string<_CharT, _Traits, _Allocator>& __rhs) _NOEXCEPT { size_t __lhs_sz = __lhs.size(); return __lhs_sz == __rhs.size() && _Traits::compare(__lhs.data(), __rhs.data(), __lhs_sz) == 0; } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool -operator==(const basic_string, _Allocator>& __lhs, - const basic_string, _Allocator>& __rhs) _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const basic_string, _Allocator>& __lhs, + const basic_string, _Allocator>& __rhs) _NOEXCEPT { size_t __sz = __lhs.size(); if (__sz != __rhs.size()) return false; @@ -3397,7 +3251,7 @@ operator==(const _CharT* __lhs, const basic_string<_CharT, _Traits, _Allocator>& } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool +inline _LIBCPP_HIDE_FROM_ABI bool operator==(const basic_string<_CharT, _Traits, _Allocator>& __lhs, const _CharT* __rhs) _NOEXCEPT { typedef basic_string<_CharT, _Traits, _Allocator> _String; _LIBCPP_ASSERT_NON_NULL(__rhs != nullptr, "operator==(basic_string, char*): received nullptr"); @@ -3508,7 +3362,7 @@ operator>=(const _CharT* __lhs, const basic_string<_CharT, _Traits, _Allocator>& // operator + template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> 
+_LIBCPP_HIDE_FROM_ABI basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, const basic_string<_CharT, _Traits, _Allocator>& __rhs) { using _String = basic_string<_CharT, _Traits, _Allocator>; @@ -3525,7 +3379,7 @@ operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, } template -_LIBCPP_HIDDEN _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +_LIBCPP_HIDDEN basic_string<_CharT, _Traits, _Allocator> operator+(const _CharT* __lhs, const basic_string<_CharT, _Traits, _Allocator>& __rhs) { using _String = basic_string<_CharT, _Traits, _Allocator>; auto __lhs_sz = _Traits::length(__lhs); @@ -3541,7 +3395,7 @@ operator+(const _CharT* __lhs, const basic_string<_CharT, _Traits, _Allocator>& } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +_LIBCPP_HIDE_FROM_ABI basic_string<_CharT, _Traits, _Allocator> operator+(_CharT __lhs, const basic_string<_CharT, _Traits, _Allocator>& __rhs) { using _String = basic_string<_CharT, _Traits, _Allocator>; typename _String::size_type __rhs_sz = __rhs.size(); @@ -3556,7 +3410,7 @@ operator+(_CharT __lhs, const basic_string<_CharT, _Traits, _Allocator>& __rhs) } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +inline basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, const _CharT* __rhs) { using _String = basic_string<_CharT, _Traits, _Allocator>; typename _String::size_type __lhs_sz = __lhs.size(); @@ -3572,7 +3426,7 @@ operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, const _CharT* } template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +_LIBCPP_HIDE_FROM_ABI basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, _CharT __rhs) { using _String = basic_string<_CharT, _Traits, 
_Allocator>; typename _String::size_type __lhs_sz = __lhs.size(); @@ -3589,9 +3443,8 @@ operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, _CharT __rhs) // swap template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -swap(basic_string<_CharT, _Traits, _Allocator>& __lhs, basic_string<_CharT, _Traits, _Allocator>& __rhs) - _NOEXCEPT_(_NOEXCEPT_(__lhs.swap(__rhs))) { +inline _LIBCPP_HIDE_FROM_ABI void +swap(basic_string<_CharT, _Traits, _Allocator>& __lhs, basic_string<_CharT, _Traits, _Allocator>& __rhs) { __lhs.swap(__rhs); } diff --git a/libcxx/include/__cxx03/string_view b/libcxx/include/__cxx03/string_view index 9e5f0acb6495d..da9003b635bb9 100644 --- a/libcxx/include/__cxx03/string_view +++ b/libcxx/include/__cxx03/string_view @@ -249,8 +249,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // TODO: This is a workaround for some vendors to carry a downstream diff to accept `nullptr` in // string_view constructors. This can be refactored when this exact form isn't needed anymore. 
template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR inline size_t -__char_traits_length_checked(const typename _Traits::char_type* __s) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI inline size_t __char_traits_length_checked(const typename _Traits::char_type* __s) _NOEXCEPT { // This needs to be a single statement for C++11 constexpr return _LIBCPP_ASSERT_NON_NULL( __s != nullptr, "null pointer passed to non-null argument of char_traits<...>::length"), @@ -274,12 +273,12 @@ public: #else using const_iterator = const_pointer; #endif - using iterator = const_iterator; - using const_reverse_iterator = std::reverse_iterator; - using reverse_iterator = const_reverse_iterator; - using size_type = size_t; - using difference_type = ptrdiff_t; - static _LIBCPP_CONSTEXPR const size_type npos = -1; // size_type(-1); + using iterator = const_iterator; + using const_reverse_iterator = std::reverse_iterator; + using reverse_iterator = const_reverse_iterator; + using size_type = size_t; + using difference_type = ptrdiff_t; + static const size_type npos = -1; // size_type(-1); static_assert(!is_array::value, "Character type of basic_string_view must not be an array"); static_assert(is_standard_layout::value, "Character type of basic_string_view must be standard-layout"); @@ -288,25 +287,25 @@ public: "traits_type::char_type must be the same type as CharT"); // [string.view.cons], construct/copy - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI basic_string_view() _NOEXCEPT : __data_(nullptr), __size_(0) {} + _LIBCPP_HIDE_FROM_ABI basic_string_view() _NOEXCEPT : __data_(nullptr), __size_(0) {} _LIBCPP_HIDE_FROM_ABI basic_string_view(const basic_string_view&) _NOEXCEPT = default; _LIBCPP_HIDE_FROM_ABI basic_string_view& operator=(const basic_string_view&) _NOEXCEPT = default; - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI basic_string_view(const _CharT* __s, size_type __len) _NOEXCEPT + _LIBCPP_HIDE_FROM_ABI basic_string_view(const _CharT* __s, size_type __len) _NOEXCEPT : __data_(__s), __size_(__len) {} - 
_LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI basic_string_view(const _CharT* __s) + _LIBCPP_HIDE_FROM_ABI basic_string_view(const _CharT* __s) : __data_(__s), __size_(std::__char_traits_length_checked<_Traits>(__s)) {} // [string.view.iterators], iterators - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return cbegin(); } + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return cbegin(); } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return cend(); } + _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return cend(); } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { #ifdef _LIBCPP_ABI_BOUNDED_ITERATORS return std::__make_bounded_iter(data(), data(), data() + size()); #else @@ -314,7 +313,7 @@ public: #endif } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { #ifdef _LIBCPP_ABI_BOUNDED_ITERATORS return std::__make_bounded_iter(data() + size(), data(), data() + size()); #else @@ -322,65 +321,57 @@ public: #endif } - _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { - return const_reverse_iterator(cend()); - } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { return const_reverse_iterator(cend()); } - _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { - return const_reverse_iterator(cbegin()); - } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(cbegin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { - return const_reverse_iterator(cend()); - } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { return 
const_reverse_iterator(cend()); } - _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { - return const_reverse_iterator(cbegin()); - } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { return const_reverse_iterator(cbegin()); } // [string.view.capacity], capacity - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __size_; } + _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __size_; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI size_type length() const _NOEXCEPT { return __size_; } + _LIBCPP_HIDE_FROM_ABI size_type length() const _NOEXCEPT { return __size_; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return numeric_limits::max() / sizeof(value_type); } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool empty() const _NOEXCEPT { return __size_ == 0; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __size_ == 0; } // [string.view.access], element access - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_reference operator[](size_type __pos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference operator[](size_type __pos) const _NOEXCEPT { return _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__pos < size(), "string_view[] index out of bounds"), __data_[__pos]; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_reference at(size_type __pos) const { + _LIBCPP_HIDE_FROM_ABI const_reference at(size_type __pos) const { return __pos >= size() ? 
(__throw_out_of_range("string_view::at"), __data_[0]) : __data_[__pos]; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_reference front() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference front() const _NOEXCEPT { return _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string_view::front(): string is empty"), __data_[0]; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_reference back() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference back() const _NOEXCEPT { return _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string_view::back(): string is empty"), __data_[__size_ - 1]; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_pointer data() const _NOEXCEPT { return __data_; } + _LIBCPP_HIDE_FROM_ABI const_pointer data() const _NOEXCEPT { return __data_; } // [string.view.modifiers], modifiers: - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI void remove_prefix(size_type __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void remove_prefix(size_type __n) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n <= size(), "remove_prefix() can't remove more than size()"); __data_ += __n; __size_ -= __n; } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI void remove_suffix(size_type __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void remove_suffix(size_type __n) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n <= size(), "remove_suffix() can't remove more than size()"); __size_ -= __n; } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI void swap(basic_string_view& __other) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void swap(basic_string_view& __other) _NOEXCEPT { const value_type* __p = __data_; __data_ = __other.__data_; __other.__data_ = __p; @@ -390,8 +381,7 @@ public: __other.__size_ = __sz; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - copy(_CharT* __s, size_type __n, size_type __pos = 0) const { + _LIBCPP_HIDE_FROM_ABI size_type copy(_CharT* __s, size_type __n, size_type __pos = 0) const { if (__pos > size()) 
__throw_out_of_range("string_view::copy"); size_type __rlen = std::min(__n, size() - __pos); @@ -399,12 +389,12 @@ public: return __rlen; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI basic_string_view substr(size_type __pos = 0, size_type __n = npos) const { + _LIBCPP_HIDE_FROM_ABI basic_string_view substr(size_type __pos = 0, size_type __n = npos) const { return __pos > size() ? (__throw_out_of_range("string_view::substr"), basic_string_view()) : basic_string_view(data() + __pos, std::min(__n, size() - __pos)); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 int compare(basic_string_view __sv) const _NOEXCEPT { + int compare(basic_string_view __sv) const _NOEXCEPT { size_type __rlen = std::min(size(), __sv.size()); int __retval = _Traits::compare(data(), __sv.data(), __rlen); if (__retval == 0) // first __rlen chars matched @@ -412,180 +402,152 @@ public: return __retval; } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int - compare(size_type __pos1, size_type __n1, basic_string_view __sv) const { + _LIBCPP_HIDE_FROM_ABI int compare(size_type __pos1, size_type __n1, basic_string_view __sv) const { return substr(__pos1, __n1).compare(__sv); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int + _LIBCPP_HIDE_FROM_ABI int compare(size_type __pos1, size_type __n1, basic_string_view __sv, size_type __pos2, size_type __n2) const { return substr(__pos1, __n1).compare(__sv.substr(__pos2, __n2)); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int compare(const _CharT* __s) const _NOEXCEPT { - return compare(basic_string_view(__s)); - } + _LIBCPP_HIDE_FROM_ABI int compare(const _CharT* __s) const _NOEXCEPT { return compare(basic_string_view(__s)); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int - compare(size_type __pos1, size_type __n1, const _CharT* __s) const { + _LIBCPP_HIDE_FROM_ABI int compare(size_type __pos1, size_type __n1, const _CharT* __s) const { return substr(__pos1, __n1).compare(basic_string_view(__s)); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 
_LIBCPP_HIDE_FROM_ABI int - compare(size_type __pos1, size_type __n1, const _CharT* __s, size_type __n2) const { + _LIBCPP_HIDE_FROM_ABI int compare(size_type __pos1, size_type __n1, const _CharT* __s, size_type __n2) const { return substr(__pos1, __n1).compare(basic_string_view(__s, __n2)); } // find - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find(basic_string_view __s, size_type __pos = 0) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find(basic_string_view __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s.size() == 0 || __s.data() != nullptr, "string_view::find(): received nullptr"); return std::__str_find(data(), size(), __s.data(), __pos, __s.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find(_CharT __c, size_type __pos = 0) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find(_CharT __c, size_type __pos = 0) const _NOEXCEPT { return std::__str_find(data(), size(), __c, __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find(): received nullptr"); return std::__str_find(data(), size(), __s, __pos, __n); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find(const _CharT* __s, size_type __pos = 0) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find(const _CharT* __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find(): received nullptr"); return std::__str_find( data(), size(), __s, __pos, traits_type::length(__s)); } // rfind - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - rfind(basic_string_view __s, size_type __pos = npos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type rfind(basic_string_view __s, size_type __pos = 
npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s.size() == 0 || __s.data() != nullptr, "string_view::find(): received nullptr"); return std::__str_rfind(data(), size(), __s.data(), __pos, __s.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - rfind(_CharT __c, size_type __pos = npos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type rfind(_CharT __c, size_type __pos = npos) const _NOEXCEPT { return std::__str_rfind(data(), size(), __c, __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - rfind(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type rfind(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::rfind(): received nullptr"); return std::__str_rfind(data(), size(), __s, __pos, __n); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - rfind(const _CharT* __s, size_type __pos = npos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type rfind(const _CharT* __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::rfind(): received nullptr"); return std::__str_rfind( data(), size(), __s, __pos, traits_type::length(__s)); } // find_first_of - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_first_of(basic_string_view __s, size_type __pos = 0) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_first_of(basic_string_view __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s.size() == 0 || __s.data() != nullptr, "string_view::find_first_of(): received nullptr"); return std::__str_find_first_of( data(), size(), __s.data(), __pos, __s.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_first_of(_CharT __c, size_type __pos = 0) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_first_of(_CharT __c, size_type __pos = 0) const _NOEXCEPT { return 
find(__c, __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_first_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_first_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find_first_of(): received nullptr"); return std::__str_find_first_of(data(), size(), __s, __pos, __n); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_first_of(const _CharT* __s, size_type __pos = 0) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_first_of(const _CharT* __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find_first_of(): received nullptr"); return std::__str_find_first_of( data(), size(), __s, __pos, traits_type::length(__s)); } // find_last_of - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_last_of(basic_string_view __s, size_type __pos = npos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_last_of(basic_string_view __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s.size() == 0 || __s.data() != nullptr, "string_view::find_last_of(): received nullptr"); return std::__str_find_last_of( data(), size(), __s.data(), __pos, __s.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_last_of(_CharT __c, size_type __pos = npos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_last_of(_CharT __c, size_type __pos = npos) const _NOEXCEPT { return rfind(__c, __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_last_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_last_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find_last_of(): received nullptr"); return 
std::__str_find_last_of(data(), size(), __s, __pos, __n); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_last_of(const _CharT* __s, size_type __pos = npos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_last_of(const _CharT* __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find_last_of(): received nullptr"); return std::__str_find_last_of( data(), size(), __s, __pos, traits_type::length(__s)); } // find_first_not_of - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_first_not_of(basic_string_view __s, size_type __pos = 0) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_first_not_of(basic_string_view __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL( __s.size() == 0 || __s.data() != nullptr, "string_view::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of( data(), size(), __s.data(), __pos, __s.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_first_not_of(_CharT __c, size_type __pos = 0) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_first_not_of(_CharT __c, size_type __pos = 0) const _NOEXCEPT { return std::__str_find_first_not_of(data(), size(), __c, __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_first_not_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_first_not_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of(data(), size(), __s, __pos, __n); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_first_not_of(const _CharT* __s, size_type __pos = 0) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_first_not_of(const _CharT* __s, size_type __pos = 0) const _NOEXCEPT { 
_LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of( data(), size(), __s, __pos, traits_type::length(__s)); } // find_last_not_of - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_last_not_of(basic_string_view __s, size_type __pos = npos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_last_not_of(basic_string_view __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL( __s.size() == 0 || __s.data() != nullptr, "string_view::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of( data(), size(), __s.data(), __pos, __s.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_last_not_of(_CharT __c, size_type __pos = npos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_last_not_of(_CharT __c, size_type __pos = npos) const _NOEXCEPT { return std::__str_find_last_not_of(data(), size(), __c, __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_last_not_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_last_not_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of(data(), size(), __s, __pos, __n); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_last_not_of(const _CharT* __s, size_type __pos = npos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type find_last_not_of(const _CharT* __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of( data(), size(), __s, __pos, traits_type::length(__s)); @@ -600,7 +562,7 @@ _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(basic_string_view); // operator == template 
-_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_HIDE_FROM_ABI bool operator==(basic_string_view<_CharT, _Traits> __lhs, basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { if (__lhs.size() != __rhs.size()) return false; @@ -610,18 +572,16 @@ operator==(basic_string_view<_CharT, _Traits> __lhs, basic_string_view<_CharT, _ // The dummy default template parameters are used to work around a MSVC issue with mangling, see VSO-409326 for details. // This applies to the other sufficient overloads below for the other comparison operators. template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool -operator==(basic_string_view<_CharT, _Traits> __lhs, - __type_identity_t > __rhs) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator==(basic_string_view<_CharT, _Traits> __lhs, + __type_identity_t > __rhs) _NOEXCEPT { if (__lhs.size() != __rhs.size()) return false; return __lhs.compare(__rhs) == 0; } template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool -operator==(__type_identity_t > __lhs, - basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator==(__type_identity_t > __lhs, + basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { if (__lhs.size() != __rhs.size()) return false; return __lhs.compare(__rhs) == 0; @@ -629,7 +589,7 @@ operator==(__type_identity_t > __lhs, // operator != template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_HIDE_FROM_ABI bool operator!=(basic_string_view<_CharT, _Traits> __lhs, basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { if (__lhs.size() != __rhs.size()) return true; @@ -637,18 +597,16 @@ operator!=(basic_string_view<_CharT, _Traits> __lhs, basic_string_view<_CharT, _ } template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool -operator!=(basic_string_view<_CharT, _Traits> __lhs, - __type_identity_t > __rhs) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator!=(basic_string_view<_CharT, _Traits> __lhs, + __type_identity_t > __rhs) _NOEXCEPT 
{ if (__lhs.size() != __rhs.size()) return true; return __lhs.compare(__rhs) != 0; } template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool -operator!=(__type_identity_t > __lhs, - basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator!=(__type_identity_t > __lhs, + basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { if (__lhs.size() != __rhs.size()) return true; return __lhs.compare(__rhs) != 0; @@ -656,85 +614,77 @@ operator!=(__type_identity_t > __lhs, // operator < template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_HIDE_FROM_ABI bool operator<(basic_string_view<_CharT, _Traits> __lhs, basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { return __lhs.compare(__rhs) < 0; } template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool -operator<(basic_string_view<_CharT, _Traits> __lhs, - __type_identity_t > __rhs) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator<(basic_string_view<_CharT, _Traits> __lhs, + __type_identity_t > __rhs) _NOEXCEPT { return __lhs.compare(__rhs) < 0; } template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool -operator<(__type_identity_t > __lhs, - basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator<(__type_identity_t > __lhs, + basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { return __lhs.compare(__rhs) < 0; } // operator > template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_HIDE_FROM_ABI bool operator>(basic_string_view<_CharT, _Traits> __lhs, basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { return __lhs.compare(__rhs) > 0; } template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool -operator>(basic_string_view<_CharT, _Traits> __lhs, - __type_identity_t > __rhs) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator>(basic_string_view<_CharT, _Traits> __lhs, + __type_identity_t > __rhs) _NOEXCEPT { return __lhs.compare(__rhs) > 0; } template 
-_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool -operator>(__type_identity_t > __lhs, - basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator>(__type_identity_t > __lhs, + basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { return __lhs.compare(__rhs) > 0; } // operator <= template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_HIDE_FROM_ABI bool operator<=(basic_string_view<_CharT, _Traits> __lhs, basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { return __lhs.compare(__rhs) <= 0; } template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool -operator<=(basic_string_view<_CharT, _Traits> __lhs, - __type_identity_t > __rhs) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator<=(basic_string_view<_CharT, _Traits> __lhs, + __type_identity_t > __rhs) _NOEXCEPT { return __lhs.compare(__rhs) <= 0; } template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool -operator<=(__type_identity_t > __lhs, - basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator<=(__type_identity_t > __lhs, + basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { return __lhs.compare(__rhs) <= 0; } // operator >= template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_HIDE_FROM_ABI bool operator>=(basic_string_view<_CharT, _Traits> __lhs, basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { return __lhs.compare(__rhs) >= 0; } template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool -operator>=(basic_string_view<_CharT, _Traits> __lhs, - __type_identity_t > __rhs) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator>=(basic_string_view<_CharT, _Traits> __lhs, + __type_identity_t > __rhs) _NOEXCEPT { return __lhs.compare(__rhs) >= 0; } template -_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI bool -operator>=(__type_identity_t > __lhs, - basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { +_LIBCPP_HIDE_FROM_ABI bool operator>=(__type_identity_t > 
__lhs, + basic_string_view<_CharT, _Traits> __rhs) _NOEXCEPT { return __lhs.compare(__rhs) >= 0; } diff --git a/libcxx/include/__cxx03/typeinfo b/libcxx/include/__cxx03/typeinfo index ec291ccc5446e..5944d2e1926c7 100644 --- a/libcxx/include/__cxx03/typeinfo +++ b/libcxx/include/__cxx03/typeinfo @@ -96,7 +96,7 @@ public: size_t hash_code() const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool operator==(const type_info& __arg) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI bool operator==(const type_info& __arg) const _NOEXCEPT { // When evaluated in a constant expression, both type infos simply can't come // from different translation units, so it is sufficient to compare their addresses. if (__libcpp_is_constant_evaluated()) { @@ -182,12 +182,10 @@ public: struct __type_info_implementations { struct __string_impl_base { typedef const char* __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* - __type_name_to_string(__type_name_t __v) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { return __v; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t - __string_to_type_name(const char* __v) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { return __v; } }; @@ -308,7 +306,7 @@ public: _LIBCPP_HIDE_FROM_ABI size_t hash_code() const _NOEXCEPT { return __impl::__hash(__type_name); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool operator==(const type_info& __arg) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI bool operator==(const type_info& __arg) const _NOEXCEPT { // When evaluated in a constant expression, both type infos simply can't come // from different translation units, so it is sufficient to compare their addresses. 
if (__libcpp_is_constant_evaluated()) { diff --git a/libcxx/include/__cxx03/unordered_map b/libcxx/include/__cxx03/unordered_map index 233b6e58031a8..db5c8b60ab65b 100644 --- a/libcxx/include/__cxx03/unordered_map +++ b/libcxx/include/__cxx03/unordered_map @@ -620,15 +620,14 @@ template ::value && !__libcpp_is_final<_Hash>::value> class __unordered_map_hasher : private _Hash { public: - _LIBCPP_HIDE_FROM_ABI __unordered_map_hasher() _NOEXCEPT_(is_nothrow_default_constructible<_Hash>::value) : _Hash() {} - _LIBCPP_HIDE_FROM_ABI __unordered_map_hasher(const _Hash& __h) _NOEXCEPT_(is_nothrow_copy_constructible<_Hash>::value) - : _Hash(__h) {} + _LIBCPP_HIDE_FROM_ABI __unordered_map_hasher() : _Hash() {} + _LIBCPP_HIDE_FROM_ABI __unordered_map_hasher(const _Hash& __h) : _Hash(__h) {} _LIBCPP_HIDE_FROM_ABI const _Hash& hash_function() const _NOEXCEPT { return *this; } _LIBCPP_HIDE_FROM_ABI size_t operator()(const _Cp& __x) const { return static_cast(*this)(__x.__get_value().first); } _LIBCPP_HIDE_FROM_ABI size_t operator()(const _Key& __x) const { return static_cast(*this)(__x); } - _LIBCPP_HIDE_FROM_ABI void swap(__unordered_map_hasher& __y) _NOEXCEPT_(__is_nothrow_swappable_v<_Hash>) { + _LIBCPP_HIDE_FROM_ABI void swap(__unordered_map_hasher& __y) { using std::swap; swap(static_cast<_Hash&>(*this), static_cast<_Hash&>(__y)); } @@ -639,23 +638,20 @@ class __unordered_map_hasher<_Key, _Cp, _Hash, _Pred, false> { _Hash __hash_; public: - _LIBCPP_HIDE_FROM_ABI __unordered_map_hasher() _NOEXCEPT_(is_nothrow_default_constructible<_Hash>::value) - : __hash_() {} - _LIBCPP_HIDE_FROM_ABI __unordered_map_hasher(const _Hash& __h) _NOEXCEPT_(is_nothrow_copy_constructible<_Hash>::value) - : __hash_(__h) {} + _LIBCPP_HIDE_FROM_ABI __unordered_map_hasher() : __hash_() {} + _LIBCPP_HIDE_FROM_ABI __unordered_map_hasher(const _Hash& __h) : __hash_(__h) {} _LIBCPP_HIDE_FROM_ABI const _Hash& hash_function() const _NOEXCEPT { return __hash_; } _LIBCPP_HIDE_FROM_ABI size_t operator()(const 
_Cp& __x) const { return __hash_(__x.__get_value().first); } _LIBCPP_HIDE_FROM_ABI size_t operator()(const _Key& __x) const { return __hash_(__x); } - _LIBCPP_HIDE_FROM_ABI void swap(__unordered_map_hasher& __y) _NOEXCEPT_(__is_nothrow_swappable_v<_Hash>) { + _LIBCPP_HIDE_FROM_ABI void swap(__unordered_map_hasher& __y) { using std::swap; swap(__hash_, __y.__hash_); } }; template -inline _LIBCPP_HIDE_FROM_ABI void -swap(__unordered_map_hasher<_Key, _Cp, _Hash, _Pred, __b>& __x, - __unordered_map_hasher<_Key, _Cp, _Hash, _Pred, __b>& __y) _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(__unordered_map_hasher<_Key, _Cp, _Hash, _Pred, __b>& __x, + __unordered_map_hasher<_Key, _Cp, _Hash, _Pred, __b>& __y) { __x.swap(__y); } @@ -666,9 +662,8 @@ template ::value && !__libcpp_is_final<_Pred>::value> class __unordered_map_equal : private _Pred { public: - _LIBCPP_HIDE_FROM_ABI __unordered_map_equal() _NOEXCEPT_(is_nothrow_default_constructible<_Pred>::value) : _Pred() {} - _LIBCPP_HIDE_FROM_ABI __unordered_map_equal(const _Pred& __p) _NOEXCEPT_(is_nothrow_copy_constructible<_Pred>::value) - : _Pred(__p) {} + _LIBCPP_HIDE_FROM_ABI __unordered_map_equal() : _Pred() {} + _LIBCPP_HIDE_FROM_ABI __unordered_map_equal(const _Pred& __p) : _Pred(__p) {} _LIBCPP_HIDE_FROM_ABI const _Pred& key_eq() const _NOEXCEPT { return *this; } _LIBCPP_HIDE_FROM_ABI bool operator()(const _Cp& __x, const _Cp& __y) const { return static_cast(*this)(__x.__get_value().first, __y.__get_value().first); @@ -679,7 +674,7 @@ public: _LIBCPP_HIDE_FROM_ABI bool operator()(const _Key& __x, const _Cp& __y) const { return static_cast(*this)(__x, __y.__get_value().first); } - _LIBCPP_HIDE_FROM_ABI void swap(__unordered_map_equal& __y) _NOEXCEPT_(__is_nothrow_swappable_v<_Pred>) { + _LIBCPP_HIDE_FROM_ABI void swap(__unordered_map_equal& __y) { using std::swap; swap(static_cast<_Pred&>(*this), static_cast<_Pred&>(__y)); } @@ -690,10 +685,8 @@ class __unordered_map_equal<_Key, _Cp, 
_Pred, _Hash, false> { _Pred __pred_; public: - _LIBCPP_HIDE_FROM_ABI __unordered_map_equal() _NOEXCEPT_(is_nothrow_default_constructible<_Pred>::value) - : __pred_() {} - _LIBCPP_HIDE_FROM_ABI __unordered_map_equal(const _Pred& __p) _NOEXCEPT_(is_nothrow_copy_constructible<_Pred>::value) - : __pred_(__p) {} + _LIBCPP_HIDE_FROM_ABI __unordered_map_equal() : __pred_() {} + _LIBCPP_HIDE_FROM_ABI __unordered_map_equal(const _Pred& __p) : __pred_(__p) {} _LIBCPP_HIDE_FROM_ABI const _Pred& key_eq() const _NOEXCEPT { return __pred_; } _LIBCPP_HIDE_FROM_ABI bool operator()(const _Cp& __x, const _Cp& __y) const { return __pred_(__x.__get_value().first, __y.__get_value().first); @@ -704,16 +697,15 @@ public: _LIBCPP_HIDE_FROM_ABI bool operator()(const _Key& __x, const _Cp& __y) const { return __pred_(__x, __y.__get_value().first); } - _LIBCPP_HIDE_FROM_ABI void swap(__unordered_map_equal& __y) _NOEXCEPT_(__is_nothrow_swappable_v<_Pred>) { + _LIBCPP_HIDE_FROM_ABI void swap(__unordered_map_equal& __y) { using std::swap; swap(__pred_, __y.__pred_); } }; template -inline _LIBCPP_HIDE_FROM_ABI void -swap(__unordered_map_equal<_Key, _Cp, _Pred, _Hash, __b>& __x, __unordered_map_equal<_Key, _Cp, _Pred, _Hash, __b>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(__unordered_map_equal<_Key, _Cp, _Pred, _Hash, __b>& __x, + __unordered_map_equal<_Key, _Cp, _Pred, _Hash, __b>& __y) { __x.swap(__y); } @@ -934,7 +926,7 @@ public: template friend class _LIBCPP_TEMPLATE_VIS unordered_multimap; - _LIBCPP_HIDE_FROM_ABI unordered_map() _NOEXCEPT_(is_nothrow_default_constructible<__table>::value) {} + _LIBCPP_HIDE_FROM_ABI unordered_map() {} explicit _LIBCPP_HIDE_FROM_ABI unordered_map(size_type __n, const hasher& __hf = hasher(), const key_equal& __eql = key_equal()); _LIBCPP_HIDE_FROM_ABI @@ -1007,9 +999,7 @@ public: } _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __table_.clear(); } - _LIBCPP_HIDE_FROM_ABI void swap(unordered_map& __u) 
_NOEXCEPT_(__is_nothrow_swappable_v<__table>) { - __table_.swap(__u.__table_); - } + _LIBCPP_HIDE_FROM_ABI void swap(unordered_map& __u) { __table_.swap(__u.__table_); } _LIBCPP_HIDE_FROM_ABI hasher hash_function() const { return __table_.hash_function().hash_function(); } _LIBCPP_HIDE_FROM_ABI key_equal key_eq() const { return __table_.key_eq().key_eq(); } @@ -1161,8 +1151,7 @@ const _Tp& unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc>::at(const key_type& __ template inline _LIBCPP_HIDE_FROM_ABI void -swap(unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc>& __x, unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +swap(unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc>& __x, unordered_map<_Key, _Tp, _Hash, _Pred, _Alloc>& __y) { __x.swap(__y); } @@ -1242,7 +1231,7 @@ public: template friend class _LIBCPP_TEMPLATE_VIS unordered_multimap; - _LIBCPP_HIDE_FROM_ABI unordered_multimap() _NOEXCEPT_(is_nothrow_default_constructible<__table>::value) {} + _LIBCPP_HIDE_FROM_ABI unordered_multimap() {} explicit _LIBCPP_HIDE_FROM_ABI unordered_multimap(size_type __n, const hasher& __hf = hasher(), const key_equal& __eql = key_equal()); _LIBCPP_HIDE_FROM_ABI @@ -1317,9 +1306,7 @@ public: } _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __table_.clear(); } - _LIBCPP_HIDE_FROM_ABI void swap(unordered_multimap& __u) _NOEXCEPT_(__is_nothrow_swappable_v<__table>) { - __table_.swap(__u.__table_); - } + _LIBCPP_HIDE_FROM_ABI void swap(unordered_multimap& __u) { __table_.swap(__u.__table_); } _LIBCPP_HIDE_FROM_ABI hasher hash_function() const { return __table_.hash_function().hash_function(); } _LIBCPP_HIDE_FROM_ABI key_equal key_eq() const { return __table_.key_eq().key_eq(); } @@ -1426,9 +1413,8 @@ inline void unordered_multimap<_Key, _Tp, _Hash, _Pred, _Alloc>::insert(_InputIt } template -inline _LIBCPP_HIDE_FROM_ABI void -swap(unordered_multimap<_Key, _Tp, _Hash, _Pred, _Alloc>& __x, unordered_multimap<_Key, _Tp, _Hash, _Pred, _Alloc>& __y) - 
_NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(unordered_multimap<_Key, _Tp, _Hash, _Pred, _Alloc>& __x, + unordered_multimap<_Key, _Tp, _Hash, _Pred, _Alloc>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/unordered_set b/libcxx/include/__cxx03/unordered_set index cdbdfa91986c5..e354bd973aa13 100644 --- a/libcxx/include/__cxx03/unordered_set +++ b/libcxx/include/__cxx03/unordered_set @@ -598,7 +598,7 @@ public: template friend class _LIBCPP_TEMPLATE_VIS unordered_multiset; - _LIBCPP_HIDE_FROM_ABI unordered_set() _NOEXCEPT_(is_nothrow_default_constructible<__table>::value) {} + _LIBCPP_HIDE_FROM_ABI unordered_set() {} explicit _LIBCPP_HIDE_FROM_ABI unordered_set(size_type __n, const hasher& __hf = hasher(), const key_equal& __eql = key_equal()); @@ -663,9 +663,7 @@ public: } _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __table_.clear(); } - _LIBCPP_HIDE_FROM_ABI void swap(unordered_set& __u) _NOEXCEPT_(__is_nothrow_swappable_v<__table>) { - __table_.swap(__u.__table_); - } + _LIBCPP_HIDE_FROM_ABI void swap(unordered_set& __u) { __table_.swap(__u.__table_); } _LIBCPP_HIDE_FROM_ABI hasher hash_function() const { return __table_.hash_function(); } _LIBCPP_HIDE_FROM_ABI key_equal key_eq() const { return __table_.key_eq(); } @@ -769,8 +767,7 @@ inline void unordered_set<_Value, _Hash, _Pred, _Alloc>::insert(_InputIterator _ template inline _LIBCPP_HIDE_FROM_ABI void -swap(unordered_set<_Value, _Hash, _Pred, _Alloc>& __x, unordered_set<_Value, _Hash, _Pred, _Alloc>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +swap(unordered_set<_Value, _Hash, _Pred, _Alloc>& __x, unordered_set<_Value, _Hash, _Pred, _Alloc>& __y) { __x.swap(__y); } @@ -829,7 +826,7 @@ public: template friend class _LIBCPP_TEMPLATE_VIS unordered_multiset; - _LIBCPP_HIDE_FROM_ABI unordered_multiset() _NOEXCEPT_(is_nothrow_default_constructible<__table>::value) {} + _LIBCPP_HIDE_FROM_ABI unordered_multiset() {} explicit _LIBCPP_HIDE_FROM_ABI 
unordered_multiset(size_type __n, const hasher& __hf = hasher(), const key_equal& __eql = key_equal()); _LIBCPP_HIDE_FROM_ABI @@ -897,9 +894,7 @@ public: } _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __table_.clear(); } - _LIBCPP_HIDE_FROM_ABI void swap(unordered_multiset& __u) _NOEXCEPT_(__is_nothrow_swappable_v<__table>) { - __table_.swap(__u.__table_); - } + _LIBCPP_HIDE_FROM_ABI void swap(unordered_multiset& __u) { __table_.swap(__u.__table_); } _LIBCPP_HIDE_FROM_ABI hasher hash_function() const { return __table_.hash_function(); } _LIBCPP_HIDE_FROM_ABI key_equal key_eq() const { return __table_.key_eq(); } @@ -1007,8 +1002,7 @@ inline void unordered_multiset<_Value, _Hash, _Pred, _Alloc>::insert(_InputItera template inline _LIBCPP_HIDE_FROM_ABI void -swap(unordered_multiset<_Value, _Hash, _Pred, _Alloc>& __x, unordered_multiset<_Value, _Hash, _Pred, _Alloc>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +swap(unordered_multiset<_Value, _Hash, _Pred, _Alloc>& __x, unordered_multiset<_Value, _Hash, _Pred, _Alloc>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__cxx03/vector b/libcxx/include/__cxx03/vector index 5047ed7430d41..8192ffc1a0dae 100644 --- a/libcxx/include/__cxx03/vector +++ b/libcxx/include/__cxx03/vector @@ -408,13 +408,10 @@ public: static_assert(is_same::value, "Allocator::value_type must be same type as value_type"); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI vector() - _NOEXCEPT_(is_nothrow_default_constructible::value) {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit vector(const allocator_type& __a) - _NOEXCEPT_(is_nothrow_copy_constructible::value) - : __end_cap_(nullptr, __a) {} + _LIBCPP_HIDE_FROM_ABI vector() {} + _LIBCPP_HIDE_FROM_ABI explicit vector(const allocator_type& __a) : __end_cap_(nullptr, __a) {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit vector(size_type __n) { + _LIBCPP_HIDE_FROM_ABI explicit vector(size_type __n) { auto __guard = 
std::__make_exception_guard(__destroy_vector(*this)); if (__n > 0) { __vallocate(__n); @@ -423,7 +420,7 @@ public: __guard.__complete(); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI vector(size_type __n, const value_type& __x) { + _LIBCPP_HIDE_FROM_ABI vector(size_type __n, const value_type& __x) { auto __guard = std::__make_exception_guard(__destroy_vector(*this)); if (__n > 0) { __vallocate(__n); @@ -433,8 +430,7 @@ public: } template <__enable_if_t<__is_allocator<_Allocator>::value, int> = 0> - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI - vector(size_type __n, const value_type& __x, const allocator_type& __a) + _LIBCPP_HIDE_FROM_ABI vector(size_type __n, const value_type& __x, const allocator_type& __a) : __end_cap_(nullptr, __a) { if (__n > 0) { __vallocate(__n); @@ -446,35 +442,33 @@ public: __enable_if_t<__has_exactly_input_iterator_category<_InputIterator>::value && is_constructible::reference>::value, int> = 0> - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI vector(_InputIterator __first, _InputIterator __last); + _LIBCPP_HIDE_FROM_ABI vector(_InputIterator __first, _InputIterator __last); template ::value && is_constructible::reference>::value, int> = 0> - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI - vector(_InputIterator __first, _InputIterator __last, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI vector(_InputIterator __first, _InputIterator __last, const allocator_type& __a); template < class _ForwardIterator, __enable_if_t<__has_forward_iterator_category<_ForwardIterator>::value && is_constructible::reference>::value, int> = 0> - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI vector(_ForwardIterator __first, _ForwardIterator __last); + _LIBCPP_HIDE_FROM_ABI vector(_ForwardIterator __first, _ForwardIterator __last); template < class _ForwardIterator, __enable_if_t<__has_forward_iterator_category<_ForwardIterator>::value && is_constructible::reference>::value, int> = 0> - _LIBCPP_CONSTEXPR_SINCE_CXX20 
_LIBCPP_HIDE_FROM_ABI - vector(_ForwardIterator __first, _ForwardIterator __last, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI vector(_ForwardIterator __first, _ForwardIterator __last, const allocator_type& __a); private: class __destroy_vector { public: - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI __destroy_vector(vector& __vec) : __vec_(__vec) {} + _LIBCPP_HIDE_FROM_ABI __destroy_vector(vector& __vec) : __vec_(__vec) {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void operator()() { + _LIBCPP_HIDE_FROM_ABI void operator()() { if (__vec_.__begin_ != nullptr) { __vec_.__clear(); __vec_.__annotate_delete(); @@ -487,156 +481,129 @@ private: }; public: - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI ~vector() { __destroy_vector (*this)(); } + _LIBCPP_HIDE_FROM_ABI ~vector() { __destroy_vector (*this)(); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI vector(const vector& __x); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI - vector(const vector& __x, const __type_identity_t& __a); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI vector& operator=(const vector& __x); + _LIBCPP_HIDE_FROM_ABI vector(const vector& __x); + _LIBCPP_HIDE_FROM_ABI vector(const vector& __x, const __type_identity_t& __a); + _LIBCPP_HIDE_FROM_ABI vector& operator=(const vector& __x); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI vector(vector&& __x) - _NOEXCEPT_(is_nothrow_move_constructible::value); + _LIBCPP_HIDE_FROM_ABI vector(vector&& __x); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI - vector(vector&& __x, const __type_identity_t& __a); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI vector& operator=(vector&& __x) - _NOEXCEPT_(__noexcept_move_assign_container<_Allocator, __alloc_traits>::value); + _LIBCPP_HIDE_FROM_ABI vector(vector&& __x, const __type_identity_t& __a); + _LIBCPP_HIDE_FROM_ABI vector& operator=(vector&& __x); template ::value && is_constructible::reference>::value, int> = 0> - _LIBCPP_CONSTEXPR_SINCE_CXX20 
_LIBCPP_HIDE_FROM_ABI void assign(_InputIterator __first, _InputIterator __last); + _LIBCPP_HIDE_FROM_ABI void assign(_InputIterator __first, _InputIterator __last); template < class _ForwardIterator, __enable_if_t<__has_forward_iterator_category<_ForwardIterator>::value && is_constructible::reference>::value, int> = 0> - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void assign(_ForwardIterator __first, _ForwardIterator __last); + _LIBCPP_HIDE_FROM_ABI void assign(_ForwardIterator __first, _ForwardIterator __last); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const_reference __u); + _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const_reference __u); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { - return this->__alloc(); - } + _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return this->__alloc(); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { - return reverse_iterator(end()); - } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { - return const_reverse_iterator(end()); - } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() _NOEXCEPT { - return reverse_iterator(begin()); - } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator 
rend() const _NOEXCEPT { - return const_reverse_iterator(begin()); - } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return begin(); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return end(); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { - return rbegin(); - } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { return rend(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return begin(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return end(); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { return rbegin(); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { return rend(); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return static_cast(this->__end_ - this->__begin_); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type capacity() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_type capacity() const _NOEXCEPT { return static_cast(__end_cap() - this->__begin_); } - _LIBCPP_NODISCARD _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { - return this->__begin_ == this->__end_; - } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 
_LIBCPP_HIDE_FROM_ABI void reserve(size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void shrink_to_fit() _NOEXCEPT; + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return this->__begin_ == this->__end_; } + _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void reserve(size_type __n); + _LIBCPP_HIDE_FROM_ABI void shrink_to_fit() _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reference operator[](size_type __n) _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference operator[](size_type __n) const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reference at(size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference at(size_type __n) const; + _LIBCPP_HIDE_FROM_ABI reference operator[](size_type __n) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI const_reference operator[](size_type __n) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI reference at(size_type __n); + _LIBCPP_HIDE_FROM_ABI const_reference at(size_type __n) const; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reference front() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference front() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "front() called on an empty vector"); return *this->__begin_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference front() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference front() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "front() called on an empty vector"); return *this->__begin_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI reference back() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI reference back() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "back() called on an empty vector"); return *(this->__end_ - 1); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_reference back() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference back() 
const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "back() called on an empty vector"); return *(this->__end_ - 1); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI value_type* data() _NOEXCEPT { - return std::__to_address(this->__begin_); - } + _LIBCPP_HIDE_FROM_ABI value_type* data() _NOEXCEPT { return std::__to_address(this->__begin_); } + _LIBCPP_HIDE_FROM_ABI const value_type* data() const _NOEXCEPT { return std::__to_address(this->__begin_); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const value_type* data() const _NOEXCEPT { - return std::__to_address(this->__begin_); - } - - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void push_back(const_reference __x); + _LIBCPP_HIDE_FROM_ABI void push_back(const_reference __x); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void push_back(value_type&& __x); + _LIBCPP_HIDE_FROM_ABI void push_back(value_type&& __x); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void emplace_back(_Args&&... __args); + _LIBCPP_HIDE_FROM_ABI void emplace_back(_Args&&... __args); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void pop_back(); + _LIBCPP_HIDE_FROM_ABI void pop_back(); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __position, const_reference __x); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __position, const_reference __x); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __position, value_type&& __x); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __position, value_type&& __x); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator emplace(const_iterator __position, _Args&&... __args); + _LIBCPP_HIDE_FROM_ABI iterator emplace(const_iterator __position, _Args&&... 
__args); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator - insert(const_iterator __position, size_type __n, const_reference __x); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __position, size_type __n, const_reference __x); template ::value && is_constructible< value_type, typename iterator_traits<_InputIterator>::reference>::value, int> = 0> - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator - insert(const_iterator __position, _InputIterator __first, _InputIterator __last); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __position, _InputIterator __first, _InputIterator __last); template < class _ForwardIterator, __enable_if_t<__has_forward_iterator_category<_ForwardIterator>::value && is_constructible< value_type, typename iterator_traits<_ForwardIterator>::reference>::value, int> = 0> - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator - insert(const_iterator __position, _ForwardIterator __first, _ForwardIterator __last); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __position, _ForwardIterator __first, _ForwardIterator __last); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __position); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last); + _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __position); + _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { size_type __old_size = size(); __clear(); __annotate_shrink(__old_size); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void resize(size_type __sz); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void resize(size_type __sz, const_reference __x); + _LIBCPP_HIDE_FROM_ABI void resize(size_type __sz); + _LIBCPP_HIDE_FROM_ABI void resize(size_type __sz, const_reference __x); 
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void swap(vector&) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v); + _LIBCPP_HIDE_FROM_ABI void swap(vector&); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool __invariants() const; + _LIBCPP_HIDE_FROM_ABI bool __invariants() const; private: pointer __begin_ = nullptr; @@ -651,7 +618,7 @@ private: // Precondition: __n > 0 // Postcondition: capacity() >= __n // Postcondition: size() == 0 - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __vallocate(size_type __n) { + _LIBCPP_HIDE_FROM_ABI void __vallocate(size_type __n) { if (__n > max_size()) __throw_length_error(); auto __allocation = std::__allocate_at_least(__alloc(), __n); @@ -661,14 +628,13 @@ private: __annotate_new(0); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __vdeallocate() _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI size_type __recommend(size_type __new_size) const; - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __construct_at_end(size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __construct_at_end(size_type __n, const_reference __x); + _LIBCPP_HIDE_FROM_ABI void __vdeallocate() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type __recommend(size_type __new_size) const; + _LIBCPP_HIDE_FROM_ABI void __construct_at_end(size_type __n); + _LIBCPP_HIDE_FROM_ABI void __construct_at_end(size_type __n, const_reference __x); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __init_with_size(_InputIterator __first, _Sentinel __last, size_type __n) { + _LIBCPP_HIDE_FROM_ABI void __init_with_size(_InputIterator __first, _Sentinel __last, size_type __n) { auto __guard = std::__make_exception_guard(__destroy_vector(*this)); if (__n > 0) { @@ -680,8 +646,7 @@ private: } template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __init_with_sentinel(_InputIterator __first, _Sentinel __last) { + 
_LIBCPP_HIDE_FROM_ABI void __init_with_sentinel(_InputIterator __first, _Sentinel __last) { auto __guard = std::__make_exception_guard(__destroy_vector(*this)); for (; __first != __last; ++__first) @@ -691,28 +656,26 @@ private: } template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iterator __first, _Sentinel __last); + _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iterator __first, _Sentinel __last); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __assign_with_size(_ForwardIterator __first, _Sentinel __last, difference_type __n); + _LIBCPP_HIDE_FROM_ABI void __assign_with_size(_ForwardIterator __first, _Sentinel __last, difference_type __n); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator + _LIBCPP_HIDE_FROM_ABI iterator __insert_with_sentinel(const_iterator __position, _InputIterator __first, _Sentinel __last); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator + _LIBCPP_HIDE_FROM_ABI iterator __insert_with_size(const_iterator __position, _Iterator __first, _Sentinel __last, difference_type __n); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __construct_at_end(_InputIterator __first, _Sentinel __last, size_type __n); + _LIBCPP_HIDE_FROM_ABI void __construct_at_end(_InputIterator __first, _Sentinel __last, size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __append(size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __append(size_type __n, const_reference __x); + _LIBCPP_HIDE_FROM_ABI void __append(size_type __n); + _LIBCPP_HIDE_FROM_ABI void __append(size_type __n, const_reference __x); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator __make_iter(pointer __p) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI iterator __make_iter(pointer __p) _NOEXCEPT { #ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR // Bound the iterator according to the capacity, rather than the size. 
// @@ -732,7 +695,7 @@ private: #endif // _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const_iterator __make_iter(const_pointer __p) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_iterator __make_iter(const_pointer __p) const _NOEXCEPT { #ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR // Bound the iterator according to the capacity, rather than the size. return std::__make_bounded_iter( @@ -744,27 +707,23 @@ private: #endif // _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __swap_out_circular_buffer(__split_buffer& __v); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer + _LIBCPP_HIDE_FROM_ABI void __swap_out_circular_buffer(__split_buffer& __v); + _LIBCPP_HIDE_FROM_ABI pointer __swap_out_circular_buffer(__split_buffer& __v, pointer __p); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __move_range(pointer __from_s, pointer __from_e, pointer __to); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign(vector& __c, true_type) - _NOEXCEPT_(is_nothrow_move_assignable::value); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign(vector& __c, false_type) - _NOEXCEPT_(__alloc_traits::is_always_equal::value); - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __destruct_at_end(pointer __new_last) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __move_range(pointer __from_s, pointer __from_e, pointer __to); + _LIBCPP_HIDE_FROM_ABI void __move_assign(vector& __c, true_type); + _LIBCPP_HIDE_FROM_ABI void __move_assign(vector& __c, false_type); + _LIBCPP_HIDE_FROM_ABI void __destruct_at_end(pointer __new_last) _NOEXCEPT { size_type __old_size = size(); __base_destruct_at_end(__new_last); __annotate_shrink(__old_size); } template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI inline pointer __push_back_slow_path(_Up&& __x); + _LIBCPP_HIDE_FROM_ABI inline pointer __push_back_slow_path(_Up&& __x); 
template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI inline pointer __emplace_back_slow_path(_Args&&... __args); + _LIBCPP_HIDE_FROM_ABI inline pointer __emplace_back_slow_path(_Args&&... __args); // The following functions are no-ops outside of AddressSanitizer mode. // We call annotations for every allocator, unless explicitly disabled. @@ -774,32 +733,31 @@ private: // For more details, see the "Using libc++" documentation page or // the documentation for __sanitizer_annotate_contiguous_container. - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __annotate_contiguous_container(const void* __old_mid, const void* __new_mid) const { + _LIBCPP_HIDE_FROM_ABI void __annotate_contiguous_container(const void* __old_mid, const void* __new_mid) const { std::__annotate_contiguous_container<_Allocator>(data(), data() + capacity(), __old_mid, __new_mid); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_new(size_type __current_size) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __annotate_new(size_type __current_size) const _NOEXCEPT { (void)__current_size; #ifndef _LIBCPP_HAS_NO_ASAN __annotate_contiguous_container(data() + capacity(), data() + __current_size); #endif } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_delete() const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __annotate_delete() const _NOEXCEPT { #ifndef _LIBCPP_HAS_NO_ASAN __annotate_contiguous_container(data() + size(), data() + capacity()); #endif } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_increase(size_type __n) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __annotate_increase(size_type __n) const _NOEXCEPT { (void)__n; #ifndef _LIBCPP_HAS_NO_ASAN __annotate_contiguous_container(data() + size(), data() + size() + __n); #endif } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_shrink(size_type __old_size) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __annotate_shrink(size_type __old_size) 
const _NOEXCEPT { (void)__old_size; #ifndef _LIBCPP_HAS_NO_ASAN __annotate_contiguous_container(data() + __old_size, data() + size()); @@ -807,14 +765,14 @@ private: } struct _ConstructTransaction { - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit _ConstructTransaction(vector& __v, size_type __n) + _LIBCPP_HIDE_FROM_ABI explicit _ConstructTransaction(vector& __v, size_type __n) : __v_(__v), __pos_(__v.__end_), __new_end_(__v.__end_ + __n) { #ifndef _LIBCPP_HAS_NO_ASAN __v_.__annotate_increase(__n); #endif } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI ~_ConstructTransaction() { + _LIBCPP_HIDE_FROM_ABI ~_ConstructTransaction() { __v_.__end_ = __pos_; #ifndef _LIBCPP_HAS_NO_ASAN if (__pos_ != __new_end_) { @@ -832,43 +790,31 @@ private: }; template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __construct_one_at_end(_Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI void __construct_one_at_end(_Args&&... __args) { _ConstructTransaction __tx(*this, 1); __alloc_traits::construct(this->__alloc(), std::__to_address(__tx.__pos_), std::forward<_Args>(__args)...); ++__tx.__pos_; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI allocator_type& __alloc() _NOEXCEPT { - return this->__end_cap_.second(); - } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const allocator_type& __alloc() const _NOEXCEPT { - return this->__end_cap_.second(); - } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI pointer& __end_cap() _NOEXCEPT { - return this->__end_cap_.first(); - } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI const pointer& __end_cap() const _NOEXCEPT { - return this->__end_cap_.first(); - } + _LIBCPP_HIDE_FROM_ABI allocator_type& __alloc() _NOEXCEPT { return this->__end_cap_.second(); } + _LIBCPP_HIDE_FROM_ABI const allocator_type& __alloc() const _NOEXCEPT { return this->__end_cap_.second(); } + _LIBCPP_HIDE_FROM_ABI pointer& __end_cap() _NOEXCEPT { return this->__end_cap_.first(); } + _LIBCPP_HIDE_FROM_ABI const pointer& 
__end_cap() const _NOEXCEPT { return this->__end_cap_.first(); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __clear() _NOEXCEPT { - __base_destruct_at_end(this->__begin_); - } + _LIBCPP_HIDE_FROM_ABI void __clear() _NOEXCEPT { __base_destruct_at_end(this->__begin_); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __base_destruct_at_end(pointer __new_last) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __base_destruct_at_end(pointer __new_last) _NOEXCEPT { pointer __soon_to_be_end = this->__end_; while (__new_last != __soon_to_be_end) __alloc_traits::destroy(__alloc(), std::__to_address(--__soon_to_be_end)); this->__end_ = __new_last; } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector& __c) { + _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector& __c) { __copy_assign_alloc(__c, integral_constant()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector& __c) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_move_assignment::value || - is_nothrow_move_assignable::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector& __c) { __move_assign_alloc(__c, integral_constant()); } @@ -876,7 +822,7 @@ private: _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector& __c, true_type) { + _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector& __c, true_type) { if (__alloc() != __c.__alloc()) { __clear(); __annotate_delete(); @@ -886,22 +832,17 @@ private: __alloc() = __c.__alloc(); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector&, false_type) {} - - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector& __c, true_type) - _NOEXCEPT_(is_nothrow_move_assignable::value) { - __alloc() = std::move(__c.__alloc()); - } + 
_LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector&, false_type) {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector&, false_type) _NOEXCEPT {} + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector& __c, true_type) { __alloc() = std::move(__c.__alloc()); } + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector&, false_type) _NOEXCEPT {} }; // __swap_out_circular_buffer relocates the objects in [__begin_, __end_) into the front of __v and swaps the buffers of // *this and __v. It is assumed that __v provides space for exactly (__end_ - __begin_) objects in the front. This // function has a strong exception guarantee. template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v) { +void vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v) { __annotate_delete(); auto __new_begin = __v.__begin_ - (__end_ - __begin_); std::__uninitialized_allocator_relocate( @@ -920,7 +861,7 @@ vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::pointer +typename vector<_Tp, _Allocator>::pointer vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer& __v, pointer __p) { __annotate_delete(); pointer __ret = __v.__begin_; @@ -947,7 +888,7 @@ vector<_Tp, _Allocator>::__swap_out_circular_buffer(__split_buffer -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__vdeallocate() _NOEXCEPT { +void vector<_Tp, _Allocator>::__vdeallocate() _NOEXCEPT { if (this->__begin_ != nullptr) { clear(); __annotate_delete(); @@ -957,14 +898,13 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__vdeallocate() _NOE } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::size_type -vector<_Tp, _Allocator>::max_size() const _NOEXCEPT { +typename vector<_Tp, _Allocator>::size_type vector<_Tp, _Allocator>::max_size() const _NOEXCEPT { return 
std::min(__alloc_traits::max_size(this->__alloc()), numeric_limits::max()); } // Precondition: __new_size > capacity() template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::size_type +inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::size_type vector<_Tp, _Allocator>::__recommend(size_type __new_size) const { const size_type __ms = max_size(); if (__new_size > __ms) @@ -981,7 +921,7 @@ vector<_Tp, _Allocator>::__recommend(size_type __new_size) const { // Precondition: size() + __n <= capacity() // Postcondition: size() == size() + __n template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__construct_at_end(size_type __n) { +void vector<_Tp, _Allocator>::__construct_at_end(size_type __n) { _ConstructTransaction __tx(*this, __n); const_pointer __new_end = __tx.__new_end_; for (pointer __pos = __tx.__pos_; __pos != __new_end; __tx.__pos_ = ++__pos) { @@ -996,8 +936,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__construct_at_end(s // Postcondition: size() == old size() + __n // Postcondition: [i] == __x for all i in [size() - __n, __n) template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline void -vector<_Tp, _Allocator>::__construct_at_end(size_type __n, const_reference __x) { +inline void vector<_Tp, _Allocator>::__construct_at_end(size_type __n, const_reference __x) { _ConstructTransaction __tx(*this, __n); const_pointer __new_end = __tx.__new_end_; for (pointer __pos = __tx.__pos_; __pos != __new_end; __tx.__pos_ = ++__pos) { @@ -1007,8 +946,7 @@ vector<_Tp, _Allocator>::__construct_at_end(size_type __n, const_reference __x) template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -vector<_Tp, _Allocator>::__construct_at_end(_InputIterator __first, _Sentinel __last, size_type __n) { +void vector<_Tp, _Allocator>::__construct_at_end(_InputIterator __first, _Sentinel __last, size_type __n) { _ConstructTransaction __tx(*this, __n); __tx.__pos_ = 
std::__uninitialized_allocator_copy(__alloc(), __first, __last, __tx.__pos_); } @@ -1018,7 +956,7 @@ vector<_Tp, _Allocator>::__construct_at_end(_InputIterator __first, _Sentinel __ // Postcondition: size() == size() + __n // Exception safety: strong. template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__append(size_type __n) { +void vector<_Tp, _Allocator>::__append(size_type __n) { if (static_cast(this->__end_cap() - this->__end_) >= __n) this->__construct_at_end(__n); else { @@ -1034,7 +972,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__append(size_type _ // Postcondition: size() == size() + __n // Exception safety: strong. template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__append(size_type __n, const_reference __x) { +void vector<_Tp, _Allocator>::__append(size_type __n, const_reference __x) { if (static_cast(this->__end_cap() - this->__end_) >= __n) this->__construct_at_end(__n, __x); else { @@ -1050,7 +988,7 @@ template ::value && is_constructible<_Tp, typename iterator_traits<_InputIterator>::reference>::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector<_Tp, _Allocator>::vector(_InputIterator __first, _InputIterator __last) { +vector<_Tp, _Allocator>::vector(_InputIterator __first, _InputIterator __last) { __init_with_sentinel(__first, __last); } @@ -1059,7 +997,6 @@ template ::value && is_constructible<_Tp, typename iterator_traits<_InputIterator>::reference>::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector<_Tp, _Allocator>::vector(_InputIterator __first, _InputIterator __last, const allocator_type& __a) : __end_cap_(nullptr, __a) { __init_with_sentinel(__first, __last); @@ -1070,7 +1007,7 @@ template ::value && is_constructible<_Tp, typename iterator_traits<_ForwardIterator>::reference>::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector<_Tp, _Allocator>::vector(_ForwardIterator __first, _ForwardIterator __last) { +vector<_Tp, _Allocator>::vector(_ForwardIterator __first, _ForwardIterator 
__last) { size_type __n = static_cast(std::distance(__first, __last)); __init_with_size(__first, __last, __n); } @@ -1080,7 +1017,6 @@ template ::value && is_constructible<_Tp, typename iterator_traits<_ForwardIterator>::reference>::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector<_Tp, _Allocator>::vector(_ForwardIterator __first, _ForwardIterator __last, const allocator_type& __a) : __end_cap_(nullptr, __a) { size_type __n = static_cast(std::distance(__first, __last)); @@ -1088,21 +1024,19 @@ vector<_Tp, _Allocator>::vector(_ForwardIterator __first, _ForwardIterator __las } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector<_Tp, _Allocator>::vector(const vector& __x) +vector<_Tp, _Allocator>::vector(const vector& __x) : __end_cap_(nullptr, __alloc_traits::select_on_container_copy_construction(__x.__alloc())) { __init_with_size(__x.__begin_, __x.__end_, __x.size()); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector<_Tp, _Allocator>::vector(const vector& __x, const __type_identity_t& __a) : __end_cap_(nullptr, __a) { __init_with_size(__x.__begin_, __x.__end_, __x.size()); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI vector<_Tp, _Allocator>::vector(vector&& __x) - _NOEXCEPT_(is_nothrow_move_constructible::value) +inline _LIBCPP_HIDE_FROM_ABI vector<_Tp, _Allocator>::vector(vector&& __x) : __end_cap_(nullptr, std::move(__x.__alloc())) { this->__begin_ = __x.__begin_; this->__end_ = __x.__end_; @@ -1111,8 +1045,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI vector<_Tp, _Allocato } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI -vector<_Tp, _Allocator>::vector(vector&& __x, const __type_identity_t& __a) +inline _LIBCPP_HIDE_FROM_ABI vector<_Tp, _Allocator>::vector(vector&& __x, const __type_identity_t& __a) : __end_cap_(nullptr, __a) { if (__a == __x.__alloc()) { this->__begin_ = __x.__begin_; @@ -1128,16 +1061,13 @@ vector<_Tp, _Allocator>::vector(vector&& __x, const __type_identity_t 
-_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI vector<_Tp, _Allocator>& -vector<_Tp, _Allocator>::operator=(vector&& __x) - _NOEXCEPT_(__noexcept_move_assign_container<_Allocator, __alloc_traits>::value) { +inline _LIBCPP_HIDE_FROM_ABI vector<_Tp, _Allocator>& vector<_Tp, _Allocator>::operator=(vector&& __x) { __move_assign(__x, integral_constant()); return *this; } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__move_assign(vector& __c, false_type) - _NOEXCEPT_(__alloc_traits::is_always_equal::value) { +void vector<_Tp, _Allocator>::__move_assign(vector& __c, false_type) { if (__alloc() != __c.__alloc()) { typedef move_iterator _Ip; assign(_Ip(__c.begin()), _Ip(__c.end())); @@ -1146,8 +1076,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__move_assign(vector } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__move_assign(vector& __c, true_type) - _NOEXCEPT_(is_nothrow_move_assignable::value) { +void vector<_Tp, _Allocator>::__move_assign(vector& __c, true_type) { __vdeallocate(); __move_assign_alloc(__c); // this can throw this->__begin_ = __c.__begin_; @@ -1157,8 +1086,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__move_assign(vector } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI vector<_Tp, _Allocator>& -vector<_Tp, _Allocator>::operator=(const vector& __x) { +inline _LIBCPP_HIDE_FROM_ABI vector<_Tp, _Allocator>& vector<_Tp, _Allocator>::operator=(const vector& __x) { if (this != std::addressof(__x)) { __copy_assign_alloc(__x); assign(__x.__begin_, __x.__end_); @@ -1171,14 +1099,13 @@ template ::value && is_constructible<_Tp, typename iterator_traits<_InputIterator>::reference>::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::assign(_InputIterator __first, _InputIterator __last) { +void vector<_Tp, _Allocator>::assign(_InputIterator __first, _InputIterator __last) { __assign_with_sentinel(__first, __last); } template 
template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void -vector<_Tp, _Allocator>::__assign_with_sentinel(_Iterator __first, _Sentinel __last) { +_LIBCPP_HIDE_FROM_ABI void vector<_Tp, _Allocator>::__assign_with_sentinel(_Iterator __first, _Sentinel __last) { clear(); for (; __first != __last; ++__first) emplace_back(*__first); @@ -1189,13 +1116,13 @@ template ::value && is_constructible<_Tp, typename iterator_traits<_ForwardIterator>::reference>::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::assign(_ForwardIterator __first, _ForwardIterator __last) { +void vector<_Tp, _Allocator>::assign(_ForwardIterator __first, _ForwardIterator __last) { __assign_with_size(__first, __last, std::distance(__first, __last)); } template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void +_LIBCPP_HIDE_FROM_ABI void vector<_Tp, _Allocator>::__assign_with_size(_ForwardIterator __first, _Sentinel __last, difference_type __n) { size_type __new_size = static_cast(__n); if (__new_size <= capacity()) { @@ -1215,7 +1142,7 @@ vector<_Tp, _Allocator>::__assign_with_size(_ForwardIterator __first, _Sentinel } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::assign(size_type __n, const_reference __u) { +void vector<_Tp, _Allocator>::assign(size_type __n, const_reference __u) { if (__n <= capacity()) { size_type __s = size(); std::fill_n(this->__begin_, std::min(__n, __s), __u); @@ -1231,60 +1158,57 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::assign(size_type __n } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::iterator -vector<_Tp, _Allocator>::begin() _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::begin() _NOEXCEPT { return __make_iter(this->__begin_); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::const_iterator +inline 
_LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::const_iterator vector<_Tp, _Allocator>::begin() const _NOEXCEPT { return __make_iter(this->__begin_); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::iterator -vector<_Tp, _Allocator>::end() _NOEXCEPT { +inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::end() _NOEXCEPT { return __make_iter(this->__end_); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::const_iterator +inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::const_iterator vector<_Tp, _Allocator>::end() const _NOEXCEPT { return __make_iter(this->__end_); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::reference +inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::reference vector<_Tp, _Allocator>::operator[](size_type __n) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < size(), "vector[] index out of bounds"); return this->__begin_[__n]; } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::const_reference +inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::const_reference vector<_Tp, _Allocator>::operator[](size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < size(), "vector[] index out of bounds"); return this->__begin_[__n]; } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::reference vector<_Tp, _Allocator>::at(size_type __n) { +typename vector<_Tp, _Allocator>::reference vector<_Tp, _Allocator>::at(size_type __n) { if (__n >= size()) this->__throw_out_of_range(); return this->__begin_[__n]; } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::const_reference -vector<_Tp, _Allocator>::at(size_type __n) const { +typename vector<_Tp, _Allocator>::const_reference vector<_Tp, 
_Allocator>::at(size_type __n) const { if (__n >= size()) this->__throw_out_of_range(); return this->__begin_[__n]; } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::reserve(size_type __n) { +void vector<_Tp, _Allocator>::reserve(size_type __n) { if (__n > capacity()) { if (__n > max_size()) this->__throw_length_error(); @@ -1295,7 +1219,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::reserve(size_type __ } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::shrink_to_fit() _NOEXCEPT { +void vector<_Tp, _Allocator>::shrink_to_fit() _NOEXCEPT { if (capacity() > size()) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { @@ -1316,8 +1240,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::shrink_to_fit() _NOE template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::pointer -vector<_Tp, _Allocator>::__push_back_slow_path(_Up&& __x) { +typename vector<_Tp, _Allocator>::pointer vector<_Tp, _Allocator>::__push_back_slow_path(_Up&& __x) { allocator_type& __a = this->__alloc(); __split_buffer __v(__recommend(size() + 1), size(), __a); // __v.push_back(std::forward<_Up>(__x)); @@ -1328,8 +1251,7 @@ vector<_Tp, _Allocator>::__push_back_slow_path(_Up&& __x) { } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI void -vector<_Tp, _Allocator>::push_back(const_reference __x) { +inline _LIBCPP_HIDE_FROM_ABI void vector<_Tp, _Allocator>::push_back(const_reference __x) { pointer __end = this->__end_; if (__end < this->__end_cap()) { __construct_one_at_end(__x); @@ -1341,7 +1263,7 @@ vector<_Tp, _Allocator>::push_back(const_reference __x) { } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI void vector<_Tp, _Allocator>::push_back(value_type&& __x) { +inline _LIBCPP_HIDE_FROM_ABI void vector<_Tp, _Allocator>::push_back(value_type&& __x) { pointer __end = this->__end_; if (__end < this->__end_cap()) { __construct_one_at_end(std::move(__x)); @@ -1354,8 +1276,7 @@ 
_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI void vector<_Tp, _All template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::pointer -vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args) { +typename vector<_Tp, _Allocator>::pointer vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args) { allocator_type& __a = this->__alloc(); __split_buffer __v(__recommend(size() + 1), size(), __a); // __v.emplace_back(std::forward<_Args>(__args)...); @@ -1367,7 +1288,7 @@ vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args) { template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline void vector<_Tp, _Allocator>::emplace_back(_Args&&... __args) { +inline void vector<_Tp, _Allocator>::emplace_back(_Args&&... __args) { pointer __end = this->__end_; if (__end < this->__end_cap()) { __construct_one_at_end(std::forward<_Args>(__args)...); @@ -1379,13 +1300,13 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 inline void vector<_Tp, _Allocator>::emplace_back( } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline void vector<_Tp, _Allocator>::pop_back() { +inline void vector<_Tp, _Allocator>::pop_back() { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "vector::pop_back called on an empty vector"); this->__destruct_at_end(this->__end_ - 1); } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::iterator +inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::erase(const_iterator __position) { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __position != end(), "vector::erase(iterator) called with a non-dereferenceable iterator"); @@ -1396,7 +1317,7 @@ vector<_Tp, _Allocator>::erase(const_iterator __position) { } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::iterator +typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::erase(const_iterator __first, const_iterator __last) { 
_LIBCPP_ASSERT_VALID_INPUT_RANGE(__first <= __last, "vector::erase(first, last) called with invalid range"); pointer __p = this->__begin_ + (__first - begin()); @@ -1407,8 +1328,7 @@ vector<_Tp, _Allocator>::erase(const_iterator __first, const_iterator __last) { } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -vector<_Tp, _Allocator>::__move_range(pointer __from_s, pointer __from_e, pointer __to) { +void vector<_Tp, _Allocator>::__move_range(pointer __from_s, pointer __from_e, pointer __to) { pointer __old_last = this->__end_; difference_type __n = __old_last - __to; { @@ -1422,7 +1342,7 @@ vector<_Tp, _Allocator>::__move_range(pointer __from_s, pointer __from_e, pointe } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::iterator +typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::insert(const_iterator __position, const_reference __x) { pointer __p = this->__begin_ + (__position - begin()); if (this->__end_ < this->__end_cap()) { @@ -1445,7 +1365,7 @@ vector<_Tp, _Allocator>::insert(const_iterator __position, const_reference __x) } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::iterator +typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::insert(const_iterator __position, value_type&& __x) { pointer __p = this->__begin_ + (__position - begin()); if (this->__end_ < this->__end_cap()) { @@ -1466,7 +1386,7 @@ vector<_Tp, _Allocator>::insert(const_iterator __position, value_type&& __x) { template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::iterator +typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::emplace(const_iterator __position, _Args&&... __args) { pointer __p = this->__begin_ + (__position - begin()); if (this->__end_ < this->__end_cap()) { @@ -1487,7 +1407,7 @@ vector<_Tp, _Allocator>::emplace(const_iterator __position, _Args&&... 
__args) { } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::iterator +typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::insert(const_iterator __position, size_type __n, const_reference __x) { pointer __p = this->__begin_ + (__position - begin()); if (__n > 0) { @@ -1521,14 +1441,14 @@ template ::value && is_constructible<_Tp, typename iterator_traits<_InputIterator>::reference>::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::iterator +typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::insert(const_iterator __position, _InputIterator __first, _InputIterator __last) { return __insert_with_sentinel(__position, __first, __last); } template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::iterator +_LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::__insert_with_sentinel(const_iterator __position, _InputIterator __first, _Sentinel __last) { difference_type __off = __position - begin(); pointer __p = this->__begin_ + __off; @@ -1565,15 +1485,14 @@ template ::value && is_constructible<_Tp, typename iterator_traits<_ForwardIterator>::reference>::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::iterator +typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::insert(const_iterator __position, _ForwardIterator __first, _ForwardIterator __last) { return __insert_with_size(__position, __first, __last, std::distance(__first, __last)); } template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::iterator -vector<_Tp, _Allocator>::__insert_with_size( +_LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::iterator vector<_Tp, _Allocator>::__insert_with_size( const_iterator __position, _Iterator __first, _Sentinel __last, difference_type __n) { auto __insertion_size = __n; pointer __p = this->__begin_ + (__position - 
begin()); @@ -1605,7 +1524,7 @@ vector<_Tp, _Allocator>::__insert_with_size( } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __sz) { +void vector<_Tp, _Allocator>::resize(size_type __sz) { size_type __cs = size(); if (__cs < __sz) this->__append(__sz - __cs); @@ -1614,7 +1533,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __s } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __sz, const_reference __x) { +void vector<_Tp, _Allocator>::resize(size_type __sz, const_reference __x) { size_type __cs = size(); if (__cs < __sz) this->__append(__sz - __cs, __x); @@ -1623,8 +1542,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::resize(size_type __s } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::swap(vector& __x) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v) { +void vector<_Tp, _Allocator>::swap(vector& __x) { _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR( __alloc_traits::propagate_on_container_swap::value || this->__alloc() == __x.__alloc(), "vector::swap: Either propagate_on_container_swap must be true" @@ -1637,7 +1555,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::swap(vector& __x) } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 bool vector<_Tp, _Allocator>::__invariants() const { +bool vector<_Tp, _Allocator>::__invariants() const { if (this->__begin_ == nullptr) { if (this->__end_ != nullptr || this->__end_cap() != nullptr) return false; @@ -1701,41 +1619,31 @@ public: #endif private: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type& __cap() _NOEXCEPT { return __cap_alloc_.first(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const size_type& __cap() const _NOEXCEPT { - return __cap_alloc_.first(); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __storage_allocator& __alloc() _NOEXCEPT { - return __cap_alloc_.second(); - } - 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const __storage_allocator& __alloc() const _NOEXCEPT { - return __cap_alloc_.second(); - } + _LIBCPP_HIDE_FROM_ABI size_type& __cap() _NOEXCEPT { return __cap_alloc_.first(); } + _LIBCPP_HIDE_FROM_ABI const size_type& __cap() const _NOEXCEPT { return __cap_alloc_.first(); } + _LIBCPP_HIDE_FROM_ABI __storage_allocator& __alloc() _NOEXCEPT { return __cap_alloc_.second(); } + _LIBCPP_HIDE_FROM_ABI const __storage_allocator& __alloc() const _NOEXCEPT { return __cap_alloc_.second(); } static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static size_type - __internal_cap_to_external(size_type __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static size_type __internal_cap_to_external(size_type __n) _NOEXCEPT { return __n * __bits_per_word; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static size_type - __external_cap_to_internal(size_type __n) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI static size_type __external_cap_to_internal(size_type __n) _NOEXCEPT { return (__n - 1) / __bits_per_word + 1; } public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector() - _NOEXCEPT_(is_nothrow_default_constructible::value); + _LIBCPP_HIDE_FROM_ABI vector(); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit vector(const allocator_type& __a) - _NOEXCEPT_(is_nothrow_copy_constructible::value); + _LIBCPP_HIDE_FROM_ABI explicit vector(const allocator_type& __a); private: class __destroy_vector { public: - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI __destroy_vector(vector& __vec) : __vec_(__vec) {} + _LIBCPP_HIDE_FROM_ABI __destroy_vector(vector& __vec) : __vec_(__vec) {} - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void operator()() { + _LIBCPP_HIDE_FROM_ABI void operator()() { if (__vec_.__begin_ != nullptr) __storage_traits::deallocate(__vec_.__alloc(), __vec_.__begin_, __vec_.__cap()); } @@ -1745,125 +1653,91 @@ private: }; 
public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~vector() { __destroy_vector (*this)(); } + _LIBCPP_HIDE_FROM_ABI ~vector() { __destroy_vector (*this)(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit vector(size_type __n); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector(size_type __n, const value_type& __v); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - vector(size_type __n, const value_type& __v, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI explicit vector(size_type __n); + _LIBCPP_HIDE_FROM_ABI vector(size_type __n, const value_type& __v); + _LIBCPP_HIDE_FROM_ABI vector(size_type __n, const value_type& __v, const allocator_type& __a); template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector(_InputIterator __first, _InputIterator __last); + _LIBCPP_HIDE_FROM_ABI vector(_InputIterator __first, _InputIterator __last); template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - vector(_InputIterator __first, _InputIterator __last, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI vector(_InputIterator __first, _InputIterator __last, const allocator_type& __a); template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector(_ForwardIterator __first, _ForwardIterator __last); + _LIBCPP_HIDE_FROM_ABI vector(_ForwardIterator __first, _ForwardIterator __last); template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - vector(_ForwardIterator __first, _ForwardIterator __last, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI vector(_ForwardIterator __first, _ForwardIterator __last, const allocator_type& __a); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector(const vector& __v); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector(const vector& __v, const allocator_type& __a); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector& operator=(const vector& __v); + 
_LIBCPP_HIDE_FROM_ABI vector(const vector& __v); + _LIBCPP_HIDE_FROM_ABI vector(const vector& __v, const allocator_type& __a); + _LIBCPP_HIDE_FROM_ABI vector& operator=(const vector& __v); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector(vector&& __v) - _NOEXCEPT_(is_nothrow_move_constructible::value); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - vector(vector&& __v, const __type_identity_t& __a); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector& operator=(vector&& __v) - _NOEXCEPT_(__noexcept_move_assign_container<_Allocator, __alloc_traits>::value); + _LIBCPP_HIDE_FROM_ABI vector(vector&& __v); + _LIBCPP_HIDE_FROM_ABI vector(vector&& __v, const __type_identity_t& __a); + _LIBCPP_HIDE_FROM_ABI vector& operator=(vector&& __v); template ::value, int> = 0> - void _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 assign(_InputIterator __first, _InputIterator __last); + void _LIBCPP_HIDE_FROM_ABI assign(_InputIterator __first, _InputIterator __last); template ::value, int> = 0> - void _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 assign(_ForwardIterator __first, _ForwardIterator __last); + void _LIBCPP_HIDE_FROM_ABI assign(_ForwardIterator __first, _ForwardIterator __last); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void assign(size_type __n, const value_type& __x); + _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __x); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 allocator_type get_allocator() const _NOEXCEPT { - return allocator_type(this->__alloc()); - } + _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return allocator_type(this->__alloc()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type max_size() const _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type capacity() const _NOEXCEPT { - return __internal_cap_to_external(__cap()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type size() const _NOEXCEPT { 
return __size_; } - _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool empty() const _NOEXCEPT { - return __size_ == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void reserve(size_type __n); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void shrink_to_fit() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_type capacity() const _NOEXCEPT { return __internal_cap_to_external(__cap()); } + _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __size_; } + _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __size_ == 0; } + _LIBCPP_HIDE_FROM_ABI void reserve(size_type __n); + _LIBCPP_HIDE_FROM_ABI void shrink_to_fit() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator begin() _NOEXCEPT { return __make_iter(0); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator begin() const _NOEXCEPT { return __make_iter(0); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator end() _NOEXCEPT { return __make_iter(__size_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator end() const _NOEXCEPT { - return __make_iter(__size_); - } + _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __make_iter(0); } + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return __make_iter(0); } + _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return __make_iter(__size_); } + _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return __make_iter(__size_); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reverse_iterator rbegin() _NOEXCEPT { - return reverse_iterator(end()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator rbegin() const _NOEXCEPT { - return const_reverse_iterator(end()); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reverse_iterator rend() _NOEXCEPT { - return reverse_iterator(begin()); - } - 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator rend() const _NOEXCEPT { - return const_reverse_iterator(begin()); - } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator cbegin() const _NOEXCEPT { return __make_iter(0); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator cend() const _NOEXCEPT { - return __make_iter(__size_); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator crbegin() const _NOEXCEPT { - return rbegin(); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator crend() const _NOEXCEPT { return rend(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return __make_iter(0); } + _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return __make_iter(__size_); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { return rbegin(); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { return rend(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference operator[](size_type __n) { return __make_ref(__n); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference operator[](size_type __n) const { - return __make_ref(__n); - } + _LIBCPP_HIDE_FROM_ABI reference operator[](size_type __n) { return __make_ref(__n); } + _LIBCPP_HIDE_FROM_ABI const_reference operator[](size_type __n) const { return __make_ref(__n); } _LIBCPP_HIDE_FROM_ABI reference at(size_type __n); _LIBCPP_HIDE_FROM_ABI const_reference at(size_type __n) const; - 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference front() { return __make_ref(0); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference front() const { return __make_ref(0); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference back() { return __make_ref(__size_ - 1); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference back() const { return __make_ref(__size_ - 1); } + _LIBCPP_HIDE_FROM_ABI reference front() { return __make_ref(0); } + _LIBCPP_HIDE_FROM_ABI const_reference front() const { return __make_ref(0); } + _LIBCPP_HIDE_FROM_ABI reference back() { return __make_ref(__size_ - 1); } + _LIBCPP_HIDE_FROM_ABI const_reference back() const { return __make_ref(__size_ - 1); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void push_back(const value_type& __x); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void pop_back() { --__size_; } + _LIBCPP_HIDE_FROM_ABI void push_back(const value_type& __x); + _LIBCPP_HIDE_FROM_ABI void pop_back() { --__size_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator insert(const_iterator __position, const value_type& __x); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator - insert(const_iterator __position, size_type __n, const value_type& __x); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __position, const value_type& __x); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __position, size_type __n, const value_type& __x); template ::value, int> = 0> - iterator _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - insert(const_iterator __position, _InputIterator __first, _InputIterator __last); + iterator _LIBCPP_HIDE_FROM_ABI insert(const_iterator __position, _InputIterator __first, _InputIterator __last); template ::value, int> = 0> - iterator _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 - insert(const_iterator __position, _ForwardIterator __first, _ForwardIterator __last); + iterator _LIBCPP_HIDE_FROM_ABI 
insert(const_iterator __position, _ForwardIterator __first, _ForwardIterator __last); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator erase(const_iterator __position); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator erase(const_iterator __first, const_iterator __last); + _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __position); + _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void clear() _NOEXCEPT { __size_ = 0; } + _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __size_ = 0; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(vector&) - _NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static void swap(reference __x, reference __y) _NOEXCEPT { - std::swap(__x, __y); - } + _LIBCPP_HIDE_FROM_ABI void swap(vector&); + _LIBCPP_HIDE_FROM_ABI static void swap(reference __x, reference __y) _NOEXCEPT { std::swap(__x, __y); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void resize(size_type __sz, value_type __x = false); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void flip() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI void resize(size_type __sz, value_type __x = false); + _LIBCPP_HIDE_FROM_ABI void flip() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __invariants() const; + _LIBCPP_HIDE_FROM_ABI bool __invariants() const; private: _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); } @@ -1871,8 +1745,7 @@ private: _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void - __init_with_size(_InputIterator __first, _Sentinel __last, size_type __n) { + _LIBCPP_HIDE_FROM_ABI void __init_with_size(_InputIterator __first, 
_Sentinel __last, size_type __n) { auto __guard = std::__make_exception_guard(__destroy_vector(*this)); if (__n > 0) { @@ -1884,8 +1757,7 @@ private: } template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void - __init_with_sentinel(_InputIterator __first, _Sentinel __last) { + _LIBCPP_HIDE_FROM_ABI void __init_with_sentinel(_InputIterator __first, _Sentinel __last) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { #endif // _LIBCPP_HAS_NO_EXCEPTIONS @@ -1901,18 +1773,17 @@ private: } template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iterator __first, _Sentinel __last); + _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iterator __first, _Sentinel __last); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void - __assign_with_size(_ForwardIterator __first, _Sentinel __last, difference_type __ns); + _LIBCPP_HIDE_FROM_ABI void __assign_with_size(_ForwardIterator __first, _Sentinel __last, difference_type __ns); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator + _LIBCPP_HIDE_FROM_ABI iterator __insert_with_sentinel(const_iterator __position, _InputIterator __first, _Sentinel __last); template - _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator + _LIBCPP_HIDE_FROM_ABI iterator __insert_with_size(const_iterator __position, _Iterator __first, _Sentinel __last, difference_type __n); // Allocate space for __n objects @@ -1922,7 +1793,7 @@ private: // Precondition: __n > 0 // Postcondition: capacity() >= __n // Postcondition: size() == 0 - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __vallocate(size_type __n) { + _LIBCPP_HIDE_FROM_ABI void __vallocate(size_type __n) { if (__n > max_size()) __throw_length_error(); auto __allocation = std::__allocate_at_least(__alloc(), __external_cap_to_internal(__n)); @@ -1935,62 +1806,54 @@ private: } } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __vdeallocate() _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 static size_type __align_it(size_type __new_size) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __vdeallocate() _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI static size_type __align_it(size_type __new_size) _NOEXCEPT { return (__new_size + (__bits_per_word - 1)) & ~((size_type)__bits_per_word - 1); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type __recommend(size_type __new_size) const; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __construct_at_end(size_type __n, bool __x); + _LIBCPP_HIDE_FROM_ABI size_type __recommend(size_type __new_size) const; + _LIBCPP_HIDE_FROM_ABI void __construct_at_end(size_type __n, bool __x); template - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void - __construct_at_end(_InputIterator __first, _Sentinel __last, size_type __n); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __append(size_type __n, const_reference __x); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference __make_ref(size_type __pos) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI void __construct_at_end(_InputIterator __first, _Sentinel __last, size_type __n); + _LIBCPP_HIDE_FROM_ABI void __append(size_type __n, const_reference __x); + _LIBCPP_HIDE_FROM_ABI reference __make_ref(size_type __pos) _NOEXCEPT { return reference(__begin_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference __make_ref(size_type __pos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_reference __make_ref(size_type __pos) const _NOEXCEPT { return __bit_const_reference( __begin_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator __make_iter(size_type __pos) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI iterator __make_iter(size_type __pos) _NOEXCEPT { return iterator(__begin_ + __pos / __bits_per_word, static_cast(__pos % __bits_per_word)); } - _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 const_iterator __make_iter(size_type __pos) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI const_iterator __make_iter(size_type __pos) const _NOEXCEPT { return const_iterator(__begin_ + __pos / __bits_per_word, static_cast(__pos % __bits_per_word)); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator __const_iterator_cast(const_iterator __p) _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI iterator __const_iterator_cast(const_iterator __p) _NOEXCEPT { return begin() + (__p - cbegin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __copy_assign_alloc(const vector& __v) { + _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector& __v) { __copy_assign_alloc( __v, integral_constant()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __copy_assign_alloc(const vector& __c, true_type) { + _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector& __c, true_type) { if (__alloc() != __c.__alloc()) __vdeallocate(); __alloc() = __c.__alloc(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __copy_assign_alloc(const vector&, false_type) {} + _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector&, false_type) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __move_assign(vector& __c, false_type); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __move_assign(vector& __c, true_type) - _NOEXCEPT_(is_nothrow_move_assignable::value); - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __move_assign_alloc(vector& __c) - _NOEXCEPT_(!__storage_traits::propagate_on_container_move_assignment::value || - is_nothrow_move_assignable::value) { + _LIBCPP_HIDE_FROM_ABI void __move_assign(vector& __c, false_type); + _LIBCPP_HIDE_FROM_ABI void __move_assign(vector& __c, true_type); + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector& __c) { __move_assign_alloc( __c, integral_constant()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __move_assign_alloc(vector& __c, 
true_type) - _NOEXCEPT_(is_nothrow_move_assignable::value) { - __alloc() = std::move(__c.__alloc()); - } + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector& __c, true_type) { __alloc() = std::move(__c.__alloc()); } + _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(vector&, false_type) _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __move_assign_alloc(vector&, false_type) _NOEXCEPT {} - - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_t __hash_code() const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_t __hash_code() const _NOEXCEPT; friend class __bit_reference; friend class __bit_const_reference; @@ -2001,7 +1864,7 @@ private: }; template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::__vdeallocate() _NOEXCEPT { +void vector::__vdeallocate() _NOEXCEPT { if (this->__begin_ != nullptr) { __storage_traits::deallocate(this->__alloc(), this->__begin_, __cap()); this->__begin_ = nullptr; @@ -2010,8 +1873,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::__vdeallocate() _NO } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector::size_type -vector::max_size() const _NOEXCEPT { +typename vector::size_type vector::max_size() const _NOEXCEPT { size_type __amax = __storage_traits::max_size(__alloc()); size_type __nmax = numeric_limits::max() / 2; // end() >= begin(), always if (__nmax / __bits_per_word <= __amax) @@ -2021,7 +1883,7 @@ vector::max_size() const _NOEXCEPT { // Precondition: __new_size > capacity() template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector::size_type +inline _LIBCPP_HIDE_FROM_ABI typename vector::size_type vector::__recommend(size_type __new_size) const { const size_type __ms = max_size(); if (__new_size > __ms) @@ -2037,8 +1899,7 @@ vector::__recommend(size_type __new_size) const { // Precondition: size() + __n <= capacity() // Postcondition: size() == size() + __n template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void -vector::__construct_at_end(size_type __n, bool __x) 
{ +inline _LIBCPP_HIDE_FROM_ABI void vector::__construct_at_end(size_type __n, bool __x) { size_type __old_size = this->__size_; this->__size_ += __n; if (__old_size == 0 || ((__old_size - 1) / __bits_per_word) != ((this->__size_ - 1) / __bits_per_word)) { @@ -2052,8 +1913,7 @@ vector::__construct_at_end(size_type __n, bool __x) { template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void -vector::__construct_at_end(_InputIterator __first, _Sentinel __last, size_type __n) { +void vector::__construct_at_end(_InputIterator __first, _Sentinel __last, size_type __n) { size_type __old_size = this->__size_; this->__size_ += __n; if (__old_size == 0 || ((__old_size - 1) / __bits_per_word) != ((this->__size_ - 1) / __bits_per_word)) { @@ -2066,18 +1926,15 @@ vector::__construct_at_end(_InputIterator __first, _Sentinel _ } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector() - _NOEXCEPT_(is_nothrow_default_constructible::value) +inline _LIBCPP_HIDE_FROM_ABI vector::vector() : __begin_(nullptr), __size_(0), __cap_alloc_(0, __default_init_tag()) {} template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(const allocator_type& __a) - _NOEXCEPT_(is_nothrow_copy_constructible::value) +inline _LIBCPP_HIDE_FROM_ABI vector::vector(const allocator_type& __a) : __begin_(nullptr), __size_(0), __cap_alloc_(0, static_cast<__storage_allocator>(__a)) {} template -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(size_type __n) - : __begin_(nullptr), __size_(0), __cap_alloc_(0, __default_init_tag()) { +vector::vector(size_type __n) : __begin_(nullptr), __size_(0), __cap_alloc_(0, __default_init_tag()) { if (__n > 0) { __vallocate(__n); __construct_at_end(__n, false); @@ -2085,7 +1942,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(size_type __n) } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(size_type __n, const value_type& __x) +vector::vector(size_type __n, const value_type& __x) : __begin_(nullptr), __size_(0), 
__cap_alloc_(0, __default_init_tag()) { if (__n > 0) { __vallocate(__n); @@ -2094,7 +1951,6 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(size_type __n, co } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(size_type __n, const value_type& __x, const allocator_type& __a) : __begin_(nullptr), __size_(0), __cap_alloc_(0, static_cast<__storage_allocator>(__a)) { if (__n > 0) { @@ -2105,14 +1961,13 @@ vector::vector(size_type __n, const value_type& __x, const all template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(_InputIterator __first, _InputIterator __last) +vector::vector(_InputIterator __first, _InputIterator __last) : __begin_(nullptr), __size_(0), __cap_alloc_(0, __default_init_tag()) { __init_with_sentinel(__first, __last); } template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(_InputIterator __first, _InputIterator __last, const allocator_type& __a) : __begin_(nullptr), __size_(0), __cap_alloc_(0, static_cast<__storage_allocator>(__a)) { __init_with_sentinel(__first, __last); @@ -2120,7 +1975,7 @@ vector::vector(_InputIterator __first, _InputIterator __last, template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(_ForwardIterator __first, _ForwardIterator __last) +vector::vector(_ForwardIterator __first, _ForwardIterator __last) : __begin_(nullptr), __size_(0), __cap_alloc_(0, __default_init_tag()) { auto __n = static_cast(std::distance(__first, __last)); __init_with_size(__first, __last, __n); @@ -2128,7 +1983,6 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(_ForwardIterator template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(_ForwardIterator __first, _ForwardIterator __last, const allocator_type& __a) : __begin_(nullptr), __size_(0), __cap_alloc_(0, static_cast<__storage_allocator>(__a)) { auto __n = static_cast(std::distance(__first, __last)); @@ -2136,7 +1990,7 @@ vector::vector(_ForwardIterator __first, _ForwardIterator __la } template 
-_LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(const vector& __v) +vector::vector(const vector& __v) : __begin_(nullptr), __size_(0), __cap_alloc_(0, __storage_traits::select_on_container_copy_construction(__v.__alloc())) { @@ -2147,7 +2001,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(const vector& __v } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(const vector& __v, const allocator_type& __a) +vector::vector(const vector& __v, const allocator_type& __a) : __begin_(nullptr), __size_(0), __cap_alloc_(0, __a) { if (__v.size() > 0) { __vallocate(__v.size()); @@ -2156,7 +2010,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(const vector& __v } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector& vector::operator=(const vector& __v) { +vector& vector::operator=(const vector& __v) { if (this != std::addressof(__v)) { __copy_assign_alloc(__v); if (__v.__size_) { @@ -2172,8 +2026,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 vector& vector } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(vector&& __v) - _NOEXCEPT_(is_nothrow_move_constructible::value) +inline _LIBCPP_HIDE_FROM_ABI vector::vector(vector&& __v) : __begin_(__v.__begin_), __size_(__v.__size_), __cap_alloc_(std::move(__v.__cap_alloc_)) { __v.__begin_ = nullptr; __v.__size_ = 0; @@ -2181,7 +2034,6 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector -_LIBCPP_CONSTEXPR_SINCE_CXX20 vector::vector(vector&& __v, const __type_identity_t& __a) : __begin_(nullptr), __size_(0), __cap_alloc_(0, __a) { if (__a == allocator_type(__v.__alloc())) { @@ -2197,15 +2049,13 @@ vector::vector(vector&& __v, const __type_identity_t -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 vector& -vector::operator=(vector&& __v) - _NOEXCEPT_(__noexcept_move_assign_container<_Allocator, __alloc_traits>::value) { +inline _LIBCPP_HIDE_FROM_ABI vector& vector::operator=(vector&& __v) { __move_assign(__v, integral_constant()); return *this; } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 
void vector::__move_assign(vector& __c, false_type) { +void vector::__move_assign(vector& __c, false_type) { if (__alloc() != __c.__alloc()) assign(__c.begin(), __c.end()); else @@ -2213,8 +2063,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::__move_assign(vecto } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::__move_assign(vector& __c, true_type) - _NOEXCEPT_(is_nothrow_move_assignable::value) { +void vector::__move_assign(vector& __c, true_type) { __vdeallocate(); __move_assign_alloc(__c); this->__begin_ = __c.__begin_; @@ -2225,7 +2074,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::__move_assign(vecto } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::assign(size_type __n, const value_type& __x) { +void vector::assign(size_type __n, const value_type& __x) { __size_ = 0; if (__n > 0) { size_type __c = capacity(); @@ -2243,14 +2092,13 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::assign(size_type __ template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::assign(_InputIterator __first, _InputIterator __last) { +void vector::assign(_InputIterator __first, _InputIterator __last) { __assign_with_sentinel(__first, __last); } template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void -vector::__assign_with_sentinel(_Iterator __first, _Sentinel __last) { +_LIBCPP_HIDE_FROM_ABI void vector::__assign_with_sentinel(_Iterator __first, _Sentinel __last) { clear(); for (; __first != __last; ++__first) push_back(*__first); @@ -2258,13 +2106,13 @@ vector::__assign_with_sentinel(_Iterator __first, _Sentinel __ template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::assign(_ForwardIterator __first, _ForwardIterator __last) { +void vector::assign(_ForwardIterator __first, _ForwardIterator __last) { __assign_with_size(__first, __last, std::distance(__first, __last)); } template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void +_LIBCPP_HIDE_FROM_ABI void 
vector::__assign_with_size(_ForwardIterator __first, _Sentinel __last, difference_type __ns) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__ns >= 0, "invalid range specified"); @@ -2281,7 +2129,7 @@ vector::__assign_with_size(_ForwardIterator __first, _Sentinel } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::reserve(size_type __n) { +void vector::reserve(size_type __n) { if (__n > capacity()) { if (__n > max_size()) this->__throw_length_error(); @@ -2293,7 +2141,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::reserve(size_type _ } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::shrink_to_fit() _NOEXCEPT { +void vector::shrink_to_fit() _NOEXCEPT { if (__external_cap_to_internal(size()) > __cap()) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { @@ -2321,7 +2169,7 @@ typename vector::const_reference vector::at( } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::push_back(const value_type& __x) { +void vector::push_back(const value_type& __x) { if (this->__size_ == this->capacity()) reserve(__recommend(this->__size_ + 1)); ++this->__size_; @@ -2329,7 +2177,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::push_back(const val } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector::iterator +typename vector::iterator vector::insert(const_iterator __position, const value_type& __x) { iterator __r; if (size() < capacity()) { @@ -2350,7 +2198,7 @@ vector::insert(const_iterator __position, const value_type& __ } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector::iterator +typename vector::iterator vector::insert(const_iterator __position, size_type __n, const value_type& __x) { iterator __r; size_type __c = capacity(); @@ -2373,14 +2221,14 @@ vector::insert(const_iterator __position, size_type __n, const template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector::iterator +typename vector::iterator vector::insert(const_iterator __position, _InputIterator __first, _InputIterator __last) { return __insert_with_sentinel(__position, __first, __last); 
} template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI typename vector::iterator +_LIBCPP_HIDE_FROM_ABI typename vector::iterator vector::__insert_with_sentinel(const_iterator __position, _InputIterator __first, _Sentinel __last) { difference_type __off = __position - begin(); iterator __p = __const_iterator_cast(__position); @@ -2414,15 +2262,14 @@ vector::__insert_with_sentinel(const_iterator __position, _Inp template template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector::iterator +typename vector::iterator vector::insert(const_iterator __position, _ForwardIterator __first, _ForwardIterator __last) { return __insert_with_size(__position, __first, __last, std::distance(__first, __last)); } template template -_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI typename vector::iterator -vector::__insert_with_size( +_LIBCPP_HIDE_FROM_ABI typename vector::iterator vector::__insert_with_size( const_iterator __position, _ForwardIterator __first, _Sentinel __last, difference_type __n_signed) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__n_signed >= 0, "invalid range specified"); const size_type __n = static_cast(__n_signed); @@ -2446,7 +2293,7 @@ vector::__insert_with_size( } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector::iterator +inline _LIBCPP_HIDE_FROM_ABI typename vector::iterator vector::erase(const_iterator __position) { iterator __r = __const_iterator_cast(__position); std::copy(__position + 1, this->cend(), __r); @@ -2455,7 +2302,7 @@ vector::erase(const_iterator __position) { } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector::iterator +typename vector::iterator vector::erase(const_iterator __first, const_iterator __last) { iterator __r = __const_iterator_cast(__first); difference_type __d = __last - __first; @@ -2465,8 +2312,7 @@ vector::erase(const_iterator __first, const_iterator __last) { } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::swap(vector& __x) - 
_NOEXCEPT_(!__alloc_traits::propagate_on_container_swap::value || __is_nothrow_swappable_v) { +void vector::swap(vector& __x) { std::swap(this->__begin_, __x.__begin_); std::swap(this->__size_, __x.__size_); std::swap(this->__cap(), __x.__cap()); @@ -2475,7 +2321,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::swap(vector& __x) } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::resize(size_type __sz, value_type __x) { +void vector::resize(size_type __sz, value_type __x) { size_type __cs = size(); if (__cs < __sz) { iterator __r; @@ -2497,7 +2343,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::resize(size_type __ } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::flip() _NOEXCEPT { +void vector::flip() _NOEXCEPT { // do middle whole words size_type __n = __size_; __storage_pointer __p = __begin_; @@ -2513,7 +2359,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector::flip() _NOEXCEPT { } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 bool vector::__invariants() const { +bool vector::__invariants() const { if (this->__begin_ == nullptr) { if (this->__size_ != 0 || this->__cap() != 0) return false; @@ -2527,7 +2373,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 bool vector::__invariants() cons } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 size_t vector::__hash_code() const _NOEXCEPT { +size_t vector::__hash_code() const _NOEXCEPT { size_t __h = 0; // do middle whole words size_type __n = __size_; @@ -2545,15 +2391,13 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 size_t vector::__hash_code() con template struct _LIBCPP_TEMPLATE_VIS hash > : public __unary_function, size_t> { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_t - operator()(const vector& __vec) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_t operator()(const vector& __vec) const _NOEXCEPT { return __vec.__hash_code(); } }; template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI bool -operator==(const vector<_Tp, _Allocator>& __x, const vector<_Tp, _Allocator>& __y) { +inline _LIBCPP_HIDE_FROM_ABI bool 
operator==(const vector<_Tp, _Allocator>& __x, const vector<_Tp, _Allocator>& __y) { const typename vector<_Tp, _Allocator>::size_type __sz = __x.size(); return __sz == __y.size() && std::equal(__x.begin(), __x.end(), __y.begin()); } @@ -2584,8 +2428,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const vector<_Tp, _Allocator>& __x, } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI void -swap(vector<_Tp, _Allocator>& __x, vector<_Tp, _Allocator>& __y) _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +inline _LIBCPP_HIDE_FROM_ABI void swap(vector<_Tp, _Allocator>& __x, vector<_Tp, _Allocator>& __y) { __x.swap(__y); } diff --git a/libcxx/include/__flat_set/utils.h b/libcxx/include/__flat_set/utils.h index ed3b4c48580fb..542bfd886aef5 100644 --- a/libcxx/include/__flat_set/utils.h +++ b/libcxx/include/__flat_set/utils.h @@ -11,6 +11,7 @@ #define _LIBCPP___FLAT_SET_UTILS_H #include <__config> +#include <__iterator/iterator_traits.h> #include <__ranges/access.h> #include <__ranges/concepts.h> #include <__type_traits/container_traits.h> @@ -60,7 +61,8 @@ struct __flat_set_utils { // C++23 Sequence Container should have insert_range member function // Note that not all Sequence Containers provide append_range. 
__set.__keys_.insert_range(__set.__keys_.end(), std::forward<_Range>(__rng)); - } else if constexpr (ranges::common_range<_Range>) { + } else if constexpr (ranges::common_range<_Range> && + __has_input_iterator_category>::value) { __set.__keys_.insert(__set.__keys_.end(), ranges::begin(__rng), ranges::end(__rng)); } else { for (auto&& __x : __rng) { diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h index 74fec9f2761e0..873265bc17c24 100644 --- a/libcxx/include/__format/format_functions.h +++ b/libcxx/include/__format/format_functions.h @@ -11,6 +11,8 @@ #define _LIBCPP___FORMAT_FORMAT_FUNCTIONS #include <__algorithm/clamp.h> +#include <__algorithm/ranges_find_first_of.h> +#include <__chrono/statically_widen.h> #include <__concepts/convertible_to.h> #include <__concepts/same_as.h> #include <__config> @@ -36,6 +38,7 @@ #include <__iterator/iterator_traits.h> // iter_value_t #include <__variant/monostate.h> #include +#include #include #include @@ -447,10 +450,47 @@ format_to(_OutIt __out_it, wformat_string<_Args...> __fmt, _Args&&... __args) { } # endif +// Try constant folding the format string instead of going through the whole formatting machinery. If there is no +// constant folding no extra code should be emitted (with optimizations enabled) and the function returns nullopt. When +// constant folding is successful, the formatting is performed and the resulting string is returned. 
+namespace __format { +template +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional> __try_constant_folding( + basic_string_view<_CharT> __fmt, + basic_format_args>, _CharT>> __args) { + // Fold strings not containing '{' or '}' to just return the string + if (bool __is_identity = [&] [[__gnu__::__pure__]] // Make sure the compiler knows this call can be eliminated + { return std::ranges::find_first_of(__fmt, array{'{', '}'}) == __fmt.end(); }(); + __builtin_constant_p(__is_identity) && __is_identity) + return basic_string<_CharT>{__fmt}; + + // Fold '{}' to the appropriate conversion function + if (auto __only_first_arg = __fmt == _LIBCPP_STATICALLY_WIDEN(_CharT, "{}"); + __builtin_constant_p(__only_first_arg) && __only_first_arg) { + if (auto __arg = __args.get(0); __builtin_constant_p(__arg.__type_)) { + return std::__visit_format_arg( + [](_Tp&& __argument) -> optional> { + if constexpr (is_same_v, basic_string_view<_CharT>>) { + return basic_string<_CharT>{__argument}; + } else { + return nullopt; + } + }, + __arg); + } + } + + return nullopt; +} +} // namespace __format + // TODO FMT This needs to be a template or std::to_chars(floating-point) availability markup // fires too eagerly, see http://llvm.org/PR61563. 
template [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI string vformat(string_view __fmt, format_args __args) { + auto __result = __format::__try_constant_folding(__fmt, __args); + if (__result.has_value()) + return *std::move(__result); __format::__allocating_buffer __buffer; std::vformat_to(__buffer.__make_output_iterator(), __fmt, __args); return string{__buffer.__view()}; @@ -462,6 +502,9 @@ template template [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI wstring vformat(wstring_view __fmt, wformat_args __args) { + auto __result = __format::__try_constant_folding(__fmt, __args); + if (__result.has_value()) + return *std::move(__result); __format::__allocating_buffer __buffer; std::vformat_to(__buffer.__make_output_iterator(), __fmt, __args); return wstring{__buffer.__view()}; diff --git a/libcxx/include/__fwd/pair.h b/libcxx/include/__fwd/pair.h index ea81a81ef8e11..cf07eabab6903 100644 --- a/libcxx/include/__fwd/pair.h +++ b/libcxx/include/__fwd/pair.h @@ -22,6 +22,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct pair; +template +inline const bool __is_pair_v = false; + +template +inline const bool __is_pair_v > = true; + template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename tuple_element<_Ip, pair<_T1, _T2> >::type& get(pair<_T1, _T2>&) _NOEXCEPT; diff --git a/libcxx/include/__memory/uses_allocator_construction.h b/libcxx/include/__memory/uses_allocator_construction.h index 955879ffc5845..49ddf99d9cc95 100644 --- a/libcxx/include/__memory/uses_allocator_construction.h +++ b/libcxx/include/__memory/uses_allocator_construction.h @@ -14,7 +14,6 @@ #include <__memory/uses_allocator.h> #include <__tuple/tuple_like_no_subrange.h> #include <__type_traits/enable_if.h> -#include <__type_traits/is_same.h> #include <__type_traits/remove_cv.h> #include <__utility/declval.h> #include <__utility/pair.h> @@ -31,14 +30,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 -template -inline constexpr bool __is_std_pair 
= false; - -template -inline constexpr bool __is_std_pair> = true; - template -inline constexpr bool __is_cv_std_pair = __is_std_pair>; +inline constexpr bool __is_cv_std_pair = __is_pair_v>; template struct __uses_allocator_construction_args; diff --git a/libcxx/include/__node_handle b/libcxx/include/__node_handle index 08c4ffa5ff17b..5c559c657ef50 100644 --- a/libcxx/include/__node_handle +++ b/libcxx/include/__node_handle @@ -62,6 +62,7 @@ public: #include <__config> #include <__memory/allocator_traits.h> #include <__memory/pointer_traits.h> +#include <__type_traits/is_specialization.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -173,17 +174,40 @@ struct __set_node_handle_specifics { _LIBCPP_HIDE_FROM_ABI value_type& value() const { return static_cast<_Derived const*>(this)->__ptr_->__get_value(); } }; +template +struct __hash_value_type; + template struct __map_node_handle_specifics { - typedef typename _NodeType::__node_value_type::key_type key_type; - typedef typename _NodeType::__node_value_type::mapped_type mapped_type; + template + struct __get_type { + using key_type = __remove_const_t; + using mapped_type = typename _Tp::second_type; + }; + + template + struct __get_type<__hash_value_type<_Key, _Mapped> > { + using key_type = _Key; + using mapped_type = _Mapped; + }; + + using key_type = typename __get_type::key_type; + using mapped_type = typename __get_type::mapped_type; _LIBCPP_HIDE_FROM_ABI key_type& key() const { - return static_cast<_Derived const*>(this)->__ptr_->__get_value().__ref().first; + if constexpr (__is_specialization_v) { + return static_cast<_Derived const*>(this)->__ptr_->__get_value().__ref().first; + } else { + return const_cast(static_cast<_Derived const*>(this)->__ptr_->__get_value().first); + } } _LIBCPP_HIDE_FROM_ABI mapped_type& mapped() const { - return static_cast<_Derived const*>(this)->__ptr_->__get_value().__ref().second; + if constexpr (__is_specialization_v) { + return static_cast<_Derived 
const*>(this)->__ptr_->__get_value().__ref().second; + } else { + return static_cast<_Derived const*>(this)->__ptr_->__get_value().second; + } } }; diff --git a/libcxx/include/__tree b/libcxx/include/__tree index bbf7c71962e93..1903533898481 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -14,6 +14,7 @@ #include <__assert> #include <__config> #include <__fwd/map.h> +#include <__fwd/pair.h> #include <__fwd/set.h> #include <__iterator/distance.h> #include <__iterator/iterator_traits.h> @@ -25,6 +26,7 @@ #include <__memory/swap_allocator.h> #include <__memory/unique_ptr.h> #include <__type_traits/can_extract_key.h> +#include <__type_traits/copy_cvref.h> #include <__type_traits/enable_if.h> #include <__type_traits/invoke.h> #include <__type_traits/is_const.h> @@ -505,48 +507,24 @@ struct __is_tree_value_type<_One> : __is_tree_value_type_imp<__remove_cvref_t<_O template struct __tree_key_value_types { typedef _Tp key_type; - typedef _Tp __node_value_type; typedef _Tp __container_value_type; static const bool __is_map = false; _LIBCPP_HIDE_FROM_ABI static key_type const& __get_key(_Tp const& __v) { return __v; } - _LIBCPP_HIDE_FROM_ABI static __container_value_type const& __get_value(__node_value_type const& __v) { return __v; } - _LIBCPP_HIDE_FROM_ABI static __container_value_type* __get_ptr(__node_value_type& __n) { return std::addressof(__n); } - _LIBCPP_HIDE_FROM_ABI static __container_value_type&& __move(__node_value_type& __v) { return std::move(__v); } }; template struct __tree_key_value_types<__value_type<_Key, _Tp> > { typedef _Key key_type; typedef _Tp mapped_type; - typedef __value_type<_Key, _Tp> __node_value_type; typedef pair __container_value_type; typedef __container_value_type __map_value_type; static const bool __is_map = true; - _LIBCPP_HIDE_FROM_ABI static key_type const& __get_key(__node_value_type const& __t) { - return __t.__get_value().first; - } - template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI static key_type const& 
__get_key(_Up& __t) { return __t.first; } - - _LIBCPP_HIDE_FROM_ABI static __container_value_type const& __get_value(__node_value_type const& __t) { - return __t.__get_value(); - } - - template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI static __container_value_type const& __get_value(_Up& __t) { - return __t; - } - - _LIBCPP_HIDE_FROM_ABI static __container_value_type* __get_ptr(__node_value_type& __n) { - return std::addressof(__n.__get_value()); - } - - _LIBCPP_HIDE_FROM_ABI static pair __move(__node_value_type& __v) { return __v.__move(); } }; template @@ -587,6 +565,19 @@ struct __tree_map_pointer_types<_Tp, _AllocPtr, _KVTypes, true> { typedef __rebind_pointer_t<_AllocPtr, const _Mv> __const_map_value_type_pointer; }; +template +struct __get_node_value_type { + using type _LIBCPP_NODEBUG = _Tp; +}; + +template +struct __get_node_value_type<__value_type<_Key, _ValueT> > { + using type _LIBCPP_NODEBUG = pair; +}; + +template +using __get_node_value_type_t _LIBCPP_NODEBUG = typename __get_node_value_type<_Tp>::type; + template ::element_type> struct __tree_node_types; @@ -599,7 +590,7 @@ public: typedef typename pointer_traits<_NodePtr>::element_type __node_type; typedef _NodePtr __node_pointer; - typedef _Tp __node_value_type; + using __node_value_type _LIBCPP_NODEBUG = __get_node_value_type_t<_Tp>; typedef __rebind_pointer_t<_VoidPtr, __node_value_type> __node_value_type_pointer; typedef __rebind_pointer_t<_VoidPtr, const __node_value_type> __const_node_value_type_pointer; @@ -650,11 +641,11 @@ public: template class _LIBCPP_STANDALONE_DEBUG __tree_node : public __tree_node_base<_VoidPtr> { public: - typedef _Tp __node_value_type; + using __node_value_type _LIBCPP_NODEBUG = __get_node_value_type_t<_Tp>; __node_value_type __value_; - _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; } + _LIBCPP_HIDE_FROM_ABI __node_value_type& __get_value() { return __value_; } ~__tree_node() = delete; __tree_node(__tree_node const&) = delete; @@ -685,7 +676,7 @@ 
public: _LIBCPP_HIDE_FROM_ABI void operator()(pointer __p) _NOEXCEPT { if (__value_constructed) - __alloc_traits::destroy(__na_, _NodeTypes::__get_ptr(__p->__value_)); + __alloc_traits::destroy(__na_, std::addressof(__p->__value_)); if (__p) __alloc_traits::deallocate(__na_, __p, 1); } @@ -715,7 +706,7 @@ class __tree_iterator { public: typedef bidirectional_iterator_tag iterator_category; - typedef _Tp value_type; + using value_type = __get_node_value_type_t<_Tp>; typedef _DiffType difference_type; typedef value_type& reference; typedef typename _NodeTypes::__node_value_type_pointer pointer; @@ -789,7 +780,7 @@ class __tree_const_iterator { public: typedef bidirectional_iterator_tag iterator_category; - typedef _Tp value_type; + using value_type = __get_node_value_type_t<_Tp>; typedef _DiffType difference_type; typedef const value_type& reference; typedef typename _NodeTypes::__const_node_value_type_pointer pointer; @@ -802,7 +793,7 @@ public: } private: - typedef __tree_iterator __non_const_iterator; + typedef __tree_iterator<_Tp, __node_pointer, difference_type> __non_const_iterator; public: _LIBCPP_HIDE_FROM_ABI __tree_const_iterator(__non_const_iterator __p) _NOEXCEPT : __ptr_(__p.__ptr_) {} @@ -1107,6 +1098,18 @@ public: return __emplace_hint_unique(__p, std::forward<_Vp>(__v)); } + template ::value, int> = 0> + _LIBCPP_HIDE_FROM_ABI void + __insert_unique_from_orphaned_node(const_iterator __p, __get_node_value_type_t<_Tp>&& __value) { + using __key_type = typename _NodeTypes::key_type; + __emplace_hint_unique(__p, const_cast<__key_type&&>(__value.first), std::move(__value.second)); + } + + template ::value, int> = 0> + _LIBCPP_HIDE_FROM_ABI void __insert_unique_from_orphaned_node(const_iterator __p, _Tp&& __value) { + __emplace_hint_unique(__p, std::move(__value)); + } + _LIBCPP_HIDE_FROM_ABI iterator __insert_multi(__container_value_type&& __v) { return __emplace_multi(std::move(__v)); } @@ -1125,6 +1128,18 @@ public: return __emplace_hint_multi(__p, 
std::forward<_Vp>(__v)); } + template ::value, int> = 0> + _LIBCPP_HIDE_FROM_ABI void + __insert_multi_from_orphaned_node(const_iterator __p, __get_node_value_type_t<_Tp>&& __value) { + using __key_type = typename _NodeTypes::key_type; + __emplace_hint_multi(__p, const_cast<__key_type&&>(__value.first), std::move(__value.second)); + } + + template ::value, int> = 0> + _LIBCPP_HIDE_FROM_ABI void __insert_multi_from_orphaned_node(const_iterator __p, _Tp&& __value) { + __emplace_hint_multi(__p, std::move(__value)); + } + _LIBCPP_HIDE_FROM_ABI pair __node_assign_unique(const __container_value_type& __v, __node_pointer __dest); @@ -1266,6 +1281,21 @@ private: } _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__tree&, false_type) _NOEXCEPT {} + template >, int> = 0> + _LIBCPP_HIDE_FROM_ABI static void __assign_value(__get_node_value_type_t& __lhs, _From&& __rhs) { + using __key_type = typename _NodeTypes::key_type; + + // This is technically UB, since the object was constructed as `const`. + // Clang doesn't optimize on this currently though. 
+ const_cast<__key_type&>(__lhs.first) = const_cast<__copy_cvref_t<_From, __key_type>&&>(__rhs.first); + __lhs.second = std::forward<_From>(__rhs).second; + } + + template >, int> = 0> + _LIBCPP_HIDE_FROM_ABI static void __assign_value(_To& __lhs, _From&& __rhs) { + __lhs = std::forward<_From>(__rhs); + } + struct _DetachedTreeCache { _LIBCPP_HIDE_FROM_ABI explicit _DetachedTreeCache(__tree* __t) _NOEXCEPT : __t_(__t), @@ -1406,14 +1436,14 @@ void __tree<_Tp, _Compare, _Allocator>::__assign_multi(_InputIterator __first, _ if (size() != 0) { _DetachedTreeCache __cache(this); for (; __cache.__get() && __first != __last; ++__first) { - __cache.__get()->__value_ = *__first; + __assign_value(__cache.__get()->__value_, *__first); __node_insert_multi(__cache.__get()); __cache.__advance(); } } const_iterator __e = end(); for (; __first != __last; ++__first) - __insert_multi(__e, _NodeTypes::__get_value(*__first)); + __insert_multi(__e, *__first); } template @@ -1492,13 +1522,14 @@ void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, false_type) { if (size() != 0) { _DetachedTreeCache __cache(this); while (__cache.__get() != nullptr && __t.size() != 0) { - __cache.__get()->__value_ = std::move(__t.remove(__t.begin())->__value_); + __assign_value(__cache.__get()->__value_, std::move(__t.remove(__t.begin())->__value_)); __node_insert_multi(__cache.__get()); __cache.__advance(); } } - while (__t.size() != 0) - __insert_multi(__e, _NodeTypes::__move(__t.remove(__t.begin())->__value_)); + while (__t.size() != 0) { + __insert_multi_from_orphaned_node(__e, std::move(__t.remove(__t.begin())->__value_)); + } } } @@ -1524,7 +1555,7 @@ void __tree<_Tp, _Compare, _Allocator>::destroy(__node_pointer __nd) _NOEXCEPT { destroy(static_cast<__node_pointer>(__nd->__left_)); destroy(static_cast<__node_pointer>(__nd->__right_)); __node_allocator& __na = __node_alloc(); - __node_traits::destroy(__na, _NodeTypes::__get_ptr(__nd->__value_)); + __node_traits::destroy(__na, 
std::addressof(__nd->__value_)); __node_traits::deallocate(__na, __nd, 1); } } @@ -1794,10 +1825,9 @@ template template typename __tree<_Tp, _Compare, _Allocator>::__node_holder __tree<_Tp, _Compare, _Allocator>::__construct_node(_Args&&... __args) { - static_assert(!__is_tree_value_type<_Args...>::value, "Cannot construct from __value_type"); __node_allocator& __na = __node_alloc(); __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na)); - __node_traits::construct(__na, _NodeTypes::__get_ptr(__h->__value_), std::forward<_Args>(__args)...); + __node_traits::construct(__na, std::addressof(__h->__value_), std::forward<_Args>(__args)...); __h.get_deleter().__value_constructed = true; return __h; } @@ -1865,7 +1895,7 @@ __tree<_Tp, _Compare, _Allocator>::__node_assign_unique(const __container_value_ __node_pointer __r = static_cast<__node_pointer>(__child); bool __inserted = false; if (__child == nullptr) { - __nd->__value_ = __v; + __assign_value(__nd->__value_, __v); __insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__nd)); __r = __nd; __inserted = true; @@ -2027,7 +2057,7 @@ typename __tree<_Tp, _Compare, _Allocator>::iterator __tree<_Tp, _Compare, _Allo __node_pointer __np = __p.__get_np(); iterator __r = __remove_node_pointer(__np); __node_allocator& __na = __node_alloc(); - __node_traits::destroy(__na, _NodeTypes::__get_ptr(const_cast<__node_value_type&>(*__p))); + __node_traits::destroy(__na, std::addressof(const_cast<__node_value_type&>(*__p))); __node_traits::deallocate(__na, __np, 1); return __r; } diff --git a/libcxx/include/__vector/vector_bool.h b/libcxx/include/__vector/vector_bool.h index a608ae522c561..e921e651e950f 100644 --- a/libcxx/include/__vector/vector_bool.h +++ b/libcxx/include/__vector/vector_bool.h @@ -512,7 +512,7 @@ class vector { _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __move_assign_alloc(vector&, false_type) _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_t __hash_code() 
const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI size_t __hash_code() const _NOEXCEPT; friend class __bit_reference; friend class __bit_const_reference; @@ -1093,7 +1093,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 bool vector::__invariants() cons } template -_LIBCPP_CONSTEXPR_SINCE_CXX20 size_t vector::__hash_code() const _NOEXCEPT { +size_t vector::__hash_code() const _NOEXCEPT { size_t __h = 0; // do middle whole words size_type __n = __size_; @@ -1110,8 +1110,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 size_t vector::__hash_code() con template struct hash > : public __unary_function, size_t> { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_t - operator()(const vector& __vec) const _NOEXCEPT { + _LIBCPP_HIDE_FROM_ABI size_t operator()(const vector& __vec) const _NOEXCEPT { return __vec.__hash_code(); } }; diff --git a/libcxx/include/map b/libcxx/include/map index a244696295fb8..039ed86dc756f 100644 --- a/libcxx/include/map +++ b/libcxx/include/map @@ -593,7 +593,6 @@ erase_if(multimap& c, Predicate pred); // C++20 # include <__memory/pointer_traits.h> # include <__memory/unique_ptr.h> # include <__memory_resource/polymorphic_allocator.h> -# include <__new/launder.h> # include <__node_handle> # include <__ranges/concepts.h> # include <__ranges/container_compatible_range.h> @@ -645,13 +644,13 @@ public: : _Compare(__c) {} _LIBCPP_HIDE_FROM_ABI const _Compare& key_comp() const _NOEXCEPT { return *this; } _LIBCPP_HIDE_FROM_ABI bool operator()(const _CP& __x, const _CP& __y) const { - return static_cast(*this)(__x.__get_value().first, __y.__get_value().first); + return static_cast(*this)(__x.first, __y.first); } _LIBCPP_HIDE_FROM_ABI bool operator()(const _CP& __x, const _Key& __y) const { - return static_cast(*this)(__x.__get_value().first, __y); + return static_cast(*this)(__x.first, __y); } _LIBCPP_HIDE_FROM_ABI bool operator()(const _Key& __x, const _CP& __y) const { - return static_cast(*this)(__x, __y.__get_value().first); + return static_cast(*this)(__x, __y.first); } 
_LIBCPP_HIDE_FROM_ABI void swap(__map_value_compare& __y) _NOEXCEPT_(__is_nothrow_swappable_v<_Compare>) { using std::swap; @@ -661,12 +660,12 @@ public: # if _LIBCPP_STD_VER >= 14 template _LIBCPP_HIDE_FROM_ABI bool operator()(const _K2& __x, const _CP& __y) const { - return static_cast(*this)(__x, __y.__get_value().first); + return static_cast(*this)(__x, __y.first); } template _LIBCPP_HIDE_FROM_ABI bool operator()(const _CP& __x, const _K2& __y) const { - return static_cast(*this)(__x.__get_value().first, __y); + return static_cast(*this)(__x.first, __y); } # endif }; @@ -682,15 +681,9 @@ public: : __comp_(__c) {} _LIBCPP_HIDE_FROM_ABI const _Compare& key_comp() const _NOEXCEPT { return __comp_; } - _LIBCPP_HIDE_FROM_ABI bool operator()(const _CP& __x, const _CP& __y) const { - return __comp_(__x.__get_value().first, __y.__get_value().first); - } - _LIBCPP_HIDE_FROM_ABI bool operator()(const _CP& __x, const _Key& __y) const { - return __comp_(__x.__get_value().first, __y); - } - _LIBCPP_HIDE_FROM_ABI bool operator()(const _Key& __x, const _CP& __y) const { - return __comp_(__x, __y.__get_value().first); - } + _LIBCPP_HIDE_FROM_ABI bool operator()(const _CP& __x, const _CP& __y) const { return __comp_(__x.first, __y.first); } + _LIBCPP_HIDE_FROM_ABI bool operator()(const _CP& __x, const _Key& __y) const { return __comp_(__x.first, __y); } + _LIBCPP_HIDE_FROM_ABI bool operator()(const _Key& __x, const _CP& __y) const { return __comp_(__x, __y.first); } void swap(__map_value_compare& __y) _NOEXCEPT_(__is_nothrow_swappable_v<_Compare>) { using std::swap; swap(__comp_, __y.__comp_); @@ -749,9 +742,9 @@ public: _LIBCPP_HIDE_FROM_ABI void operator()(pointer __p) _NOEXCEPT { if (__second_constructed) - __alloc_traits::destroy(__na_, std::addressof(__p->__value_.__get_value().second)); + __alloc_traits::destroy(__na_, std::addressof(__p->__value_.second)); if (__first_constructed) - __alloc_traits::destroy(__na_, std::addressof(__p->__value_.__get_value().first)); + 
__alloc_traits::destroy(__na_, std::addressof(__p->__value_.first)); if (__p) __alloc_traits::deallocate(__na_, __p, 1); } @@ -760,90 +753,8 @@ public: template class __map_const_iterator; -# ifndef _LIBCPP_CXX03_LANG - -template -struct _LIBCPP_STANDALONE_DEBUG __value_type { - typedef _Key key_type; - typedef _Tp mapped_type; - typedef pair value_type; - typedef pair __nc_ref_pair_type; - typedef pair __nc_rref_pair_type; - -private: - value_type __cc_; - -public: - _LIBCPP_HIDE_FROM_ABI value_type& __get_value() { -# if _LIBCPP_STD_VER >= 17 - return *std::launder(std::addressof(__cc_)); -# else - return __cc_; -# endif - } - - _LIBCPP_HIDE_FROM_ABI const value_type& __get_value() const { -# if _LIBCPP_STD_VER >= 17 - return *std::launder(std::addressof(__cc_)); -# else - return __cc_; -# endif - } - - _LIBCPP_HIDE_FROM_ABI __nc_ref_pair_type __ref() { - value_type& __v = __get_value(); - return __nc_ref_pair_type(const_cast(__v.first), __v.second); - } - - _LIBCPP_HIDE_FROM_ABI __nc_rref_pair_type __move() { - value_type& __v = __get_value(); - return __nc_rref_pair_type(std::move(const_cast(__v.first)), std::move(__v.second)); - } - - _LIBCPP_HIDE_FROM_ABI __value_type& operator=(const __value_type& __v) { - __ref() = __v.__get_value(); - return *this; - } - - _LIBCPP_HIDE_FROM_ABI __value_type& operator=(__value_type&& __v) { - __ref() = __v.__move(); - return *this; - } - - template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI __value_type& operator=(_ValueTp&& __v) { - __ref() = std::forward<_ValueTp>(__v); - return *this; - } - - __value_type() = delete; - ~__value_type() = delete; - __value_type(const __value_type&) = delete; - __value_type(__value_type&&) = delete; -}; - -# else - template -struct __value_type { - typedef _Key key_type; - typedef _Tp mapped_type; - typedef pair value_type; - -private: - value_type __cc_; - -public: - _LIBCPP_HIDE_FROM_ABI value_type& __get_value() { return __cc_; } - _LIBCPP_HIDE_FROM_ABI const value_type& __get_value() 
const { return __cc_; } - - __value_type() = delete; - __value_type(__value_type const&) = delete; - __value_type& operator=(__value_type const&) = delete; - ~__value_type() = delete; -}; - -# endif // _LIBCPP_CXX03_LANG +struct __value_type; template struct __extract_key_value_types; @@ -872,8 +783,8 @@ public: _LIBCPP_HIDE_FROM_ABI __map_iterator(_TreeIterator __i) _NOEXCEPT : __i_(__i) {} - _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __i_->__get_value(); } - _LIBCPP_HIDE_FROM_ABI pointer operator->() const { return pointer_traits::pointer_to(__i_->__get_value()); } + _LIBCPP_HIDE_FROM_ABI reference operator*() const { return *__i_; } + _LIBCPP_HIDE_FROM_ABI pointer operator->() const { return pointer_traits::pointer_to(*__i_); } _LIBCPP_HIDE_FROM_ABI __map_iterator& operator++() { ++__i_; @@ -930,8 +841,8 @@ public: _LIBCPP_HIDE_FROM_ABI __map_const_iterator(__map_iterator< typename _TreeIterator::__non_const_iterator> __i) _NOEXCEPT : __i_(__i.__i_) {} - _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __i_->__get_value(); } - _LIBCPP_HIDE_FROM_ABI pointer operator->() const { return pointer_traits::pointer_to(__i_->__get_value()); } + _LIBCPP_HIDE_FROM_ABI reference operator*() const { return *__i_; } + _LIBCPP_HIDE_FROM_ABI pointer operator->() const { return pointer_traits::pointer_to(*__i_); } _LIBCPP_HIDE_FROM_ABI __map_const_iterator& operator++() { ++__i_; @@ -999,7 +910,7 @@ public: private: typedef std::__value_type __value_type; - typedef __map_value_compare __vc; + typedef __map_value_compare __vc; typedef __rebind_alloc, __value_type> __allocator_type; typedef __tree<__value_type, __vc, __allocator_type> __base; typedef typename __base::__node_traits __node_traits; @@ -1294,7 +1205,7 @@ public: auto [__r, __inserted] = __tree_.__emplace_hint_unique_key_args(__h.__i_, __k, __k, std::forward<_Vp>(__v)); if (!__inserted) - __r->__get_value().second = std::forward<_Vp>(__v); + __r->second = std::forward<_Vp>(__v); return __r; } 
@@ -1305,7 +1216,7 @@ public: __tree_.__emplace_hint_unique_key_args(__h.__i_, __k, std::move(__k), std::forward<_Vp>(__v)); if (!__inserted) - __r->__get_value().second = std::forward<_Vp>(__v); + __r->second = std::forward<_Vp>(__v); return __r; } @@ -1510,8 +1421,9 @@ map<_Key, _Tp, _Compare, _Allocator>::map(map&& __m, const allocator_type& __a) : __tree_(std::move(__m.__tree_), typename __base::allocator_type(__a)) { if (__a != __m.get_allocator()) { const_iterator __e = cend(); - while (!__m.empty()) - __tree_.__insert_unique(__e.__i_, __m.__tree_.remove(__m.begin().__i_)->__value_.__move()); + while (!__m.empty()) { + __tree_.__insert_unique_from_orphaned_node(__e.__i_, std::move(__m.__tree_.remove(__m.begin().__i_)->__value_)); + } } } @@ -1519,8 +1431,7 @@ template _Tp& map<_Key, _Tp, _Compare, _Allocator>::operator[](const key_type& __k) { return __tree_ .__emplace_unique_key_args(__k, std::piecewise_construct, std::forward_as_tuple(__k), std::forward_as_tuple()) - .first->__get_value() - .second; + .first->second; } template @@ -1530,8 +1441,7 @@ _Tp& map<_Key, _Tp, _Compare, _Allocator>::operator[](key_type&& __k) { return __tree_ .__emplace_unique_key_args( __k, std::piecewise_construct, std::forward_as_tuple(std::move(__k)), std::forward_as_tuple()) - .first->__get_value() - .second; + .first->second; // NOLINTEND(bugprone-use-after-move) } @@ -1542,9 +1452,9 @@ typename map<_Key, _Tp, _Compare, _Allocator>::__node_holder map<_Key, _Tp, _Compare, _Allocator>::__construct_node_with_key(const key_type& __k) { __node_allocator& __na = __tree_.__node_alloc(); __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na)); - __node_traits::construct(__na, std::addressof(__h->__value_.__get_value().first), __k); + __node_traits::construct(__na, std::addressof(__h->__value_.first), __k); __h.get_deleter().__first_constructed = true; - __node_traits::construct(__na, std::addressof(__h->__value_.__get_value().second)); + __node_traits::construct(__na, 
std::addressof(__h->__value_.second)); __h.get_deleter().__second_constructed = true; return __h; } @@ -1559,7 +1469,7 @@ _Tp& map<_Key, _Tp, _Compare, _Allocator>::operator[](const key_type& __k) { __tree_.__insert_node_at(__parent, __child, static_cast<__node_base_pointer>(__h.get())); __r = __h.release(); } - return __r->__value_.__get_value().second; + return __r->__value_.second; } # endif // _LIBCPP_CXX03_LANG @@ -1570,7 +1480,7 @@ _Tp& map<_Key, _Tp, _Compare, _Allocator>::at(const key_type& __k) { __node_base_pointer& __child = __tree_.__find_equal(__parent, __k); if (__child == nullptr) std::__throw_out_of_range("map::at: key not found"); - return static_cast<__node_pointer>(__child)->__value_.__get_value().second; + return static_cast<__node_pointer>(__child)->__value_.second; } template @@ -1579,7 +1489,7 @@ const _Tp& map<_Key, _Tp, _Compare, _Allocator>::at(const key_type& __k) const { __node_base_pointer __child = __tree_.__find_equal(__parent, __k); if (__child == nullptr) std::__throw_out_of_range("map::at: key not found"); - return static_cast<__node_pointer>(__child)->__value_.__get_value().second; + return static_cast<__node_pointer>(__child)->__value_.second; } template @@ -1685,7 +1595,7 @@ public: private: typedef std::__value_type __value_type; - typedef __map_value_compare __vc; + typedef __map_value_compare __vc; typedef __rebind_alloc, __value_type> __allocator_type; typedef __tree<__value_type, __vc, __allocator_type> __base; typedef typename __base::__node_traits __node_traits; @@ -2100,7 +2010,7 @@ multimap<_Key, _Tp, _Compare, _Allocator>::multimap(multimap&& __m, const alloca if (__a != __m.get_allocator()) { const_iterator __e = cend(); while (!__m.empty()) - __tree_.__insert_multi(__e.__i_, std::move(__m.__tree_.remove(__m.begin().__i_)->__value_.__move())); + __tree_.__insert_multi_from_orphaned_node(__e.__i_, std::move(__m.__tree_.remove(__m.begin().__i_)->__value_)); } } # endif diff --git a/libcxx/include/print 
b/libcxx/include/print index 61c3ebcd98cb8..be05d30e0147f 100644 --- a/libcxx/include/print +++ b/libcxx/include/print @@ -123,7 +123,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt& __out_it, char32_t __value _LIBCPP_ASSERT_UNCATEGORIZED(__is_scalar_value(__value), "an invalid unicode scalar value results in invalid UTF-16"); if (__value < 0x10000) { - *__out_it++ = __value; + *__out_it++ = static_cast>(__value); return; } diff --git a/libcxx/src/.clang-tidy b/libcxx/src/.clang-tidy index ec8f2e0a76a3c..3d5493d965558 100644 --- a/libcxx/src/.clang-tidy +++ b/libcxx/src/.clang-tidy @@ -1,4 +1,18 @@ InheritParentConfig: true Checks: > - -readability-identifier-naming + -clang-analyzer-*, + + -llvm-include-order, + + -modernize-loop-convert, + -modernize-use-equals-delete, + -modernize-use-nullptr, + -modernize-use-override, + + -readability-identifier-naming, + -readability-function-cognitive-complexity, + -readability-function-size, + -readability-simplify-boolean-expr, + +# TODO: Consider enabling clang-analyzer. Without the checks clang-tidy runs 18x faster on my system. diff --git a/libcxx/src/include/overridable_function.h b/libcxx/src/include/overridable_function.h index 6978a4fd59e5e..0b43f271486c1 100644 --- a/libcxx/src/include/overridable_function.h +++ b/libcxx/src/include/overridable_function.h @@ -29,14 +29,14 @@ // This is a low-level utility which does not work on all platforms, since it needs // to make assumptions about the object file format in use. Furthermore, it requires // the "base definition" of the function (the one we want to check whether it has been -// overridden) to be annotated with the _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE macro. +// overridden) to be defined using the _LIBCPP_OVERRIDABLE_FUNCTION macro. // // This currently works with Mach-O files (used on Darwin) and with ELF files (used on Linux // and others). 
On platforms where we know how to implement this detection, the macro // _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION is defined to 1, and it is defined to 0 on -// other platforms. The _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE macro is defined to -// nothing on unsupported platforms so that it can be used to decorate functions regardless -// of whether detection is actually supported. +// other platforms. The _LIBCPP_OVERRIDABLE_FUNCTION macro is defined to perform a normal +// function definition on unsupported platforms so that it can be used to define functions +// regardless of whether detection is actually supported. // // How does this work? // ------------------- @@ -44,7 +44,7 @@ // Let's say we want to check whether a weak function `f` has been overridden by the user. // The general mechanism works by placing `f`'s definition (in the libc++ built library) // inside a special section, which we do using the `__section__` attribute via the -// _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE macro. +// _LIBCPP_OVERRIDABLE_FUNCTION macro. 
// // Then, when comes the time to check whether the function has been overridden, we take // the address of the function and we check whether it falls inside the special function diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp index 9ea59a3a19ca4..b3ec28950008a 100644 --- a/libcxx/src/locale.cpp +++ b/libcxx/src/locale.cpp @@ -3957,7 +3957,7 @@ static bool is_narrow_non_breaking_space(const char* ptr) { } static bool is_non_breaking_space(const char* ptr) { - // https://www.fileformat.info/info/unicode/char/0a/index.htm + // https://www.fileformat.info/info/unicode/char/a0/index.htm return ptr[0] == '\xc2' && ptr[1] == '\xa0'; } #endif // _LIBCPP_HAS_WIDE_CHARACTERS diff --git a/libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp deleted file mode 100644 index a385185ec7fe5..0000000000000 --- a/libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp +++ /dev/null @@ -1,42 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17 -// UNSUPPORTED: libcpp-has-no-incomplete-pstl - -#include -#include - -#include "common.h" - -namespace { -template -struct StableSort { - size_t Quantity; - - void run(benchmark::State& state) const { - runOpOnCopies(state, Quantity, Order(), BatchSize::CountBatch, [](auto& Copy) { - std::stable_sort(std::execution::par, Copy.begin(), Copy.end()); - }); - } - - bool skip() const { return Order() == ::Order::Heap; } - - std::string name() const { - return "BM_pstl_stable_sort" + ValueType::name() + Order::name() + "/" + std::to_string(Quantity); - } -}; -} // namespace - -int main(int argc, char** argv) { - benchmark::Initialize(&argc, argv); - if (benchmark::ReportUnrecognizedArguments(argc, argv)) - return 1; - makeCartesianProductBenchmark(Quantities); - benchmark::RunSpecifiedBenchmarks(); -} diff --git a/libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp deleted file mode 100644 index d145a159a21fd..0000000000000 --- a/libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp +++ /dev/null @@ -1,40 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17 - -#include - -#include "common.h" - -namespace { -template -struct Sort { - size_t Quantity; - - void run(benchmark::State& state) const { - runOpOnCopies(state, Quantity, Order(), BatchSize::CountElements, [](auto& Copy) { - std::ranges::sort(Copy); - }); - } - - bool skip() const { return Order() == ::Order::Heap; } - - std::string name() const { - return "BM_RangesSort" + ValueType::name() + Order::name() + "_" + std::to_string(Quantity); - } -}; -} // namespace - -int main(int argc, char** argv) { - benchmark::Initialize(&argc, argv); - if (benchmark::ReportUnrecognizedArguments(argc, argv)) - return 1; - makeCartesianProductBenchmark(Quantities); - benchmark::RunSpecifiedBenchmarks(); -} diff --git a/libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp deleted file mode 100644 index acc2f3f755fb8..0000000000000 --- a/libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp +++ /dev/null @@ -1,40 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17 - -#include - -#include "common.h" - -namespace { -template -struct StableSort { - size_t Quantity; - - void run(benchmark::State& state) const { - runOpOnCopies(state, Quantity, Order(), BatchSize::CountElements, [](auto& Copy) { - std::ranges::stable_sort(Copy); - }); - } - - bool skip() const { return Order() == ::Order::Heap; } - - std::string name() const { - return "BM_RangesStableSort" + ValueType::name() + Order::name() + "_" + std::to_string(Quantity); - } -}; -} // namespace - -int main(int argc, char** argv) { - benchmark::Initialize(&argc, argv); - if (benchmark::ReportUnrecognizedArguments(argc, argv)) - return 1; - makeCartesianProductBenchmark(Quantities); - benchmark::RunSpecifiedBenchmarks(); -} diff --git a/libcxx/test/benchmarks/algorithms/sort.bench.cpp b/libcxx/test/benchmarks/algorithms/sort.bench.cpp deleted file mode 100644 index 7f3ce6ff7a07e..0000000000000 --- a/libcxx/test/benchmarks/algorithms/sort.bench.cpp +++ /dev/null @@ -1,38 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17 - -#include - -#include "common.h" - -namespace { -template -struct Sort { - size_t Quantity; - - void run(benchmark::State& state) const { - runOpOnCopies(state, Quantity, Order(), BatchSize::CountElements, [](auto& Copy) { - std::sort(Copy.begin(), Copy.end()); - }); - } - - bool skip() const { return Order() == ::Order::Heap; } - - std::string name() const { return "BM_Sort" + ValueType::name() + Order::name() + "_" + std::to_string(Quantity); }; -}; -} // namespace - -int main(int argc, char** argv) { - benchmark::Initialize(&argc, argv); - if (benchmark::ReportUnrecognizedArguments(argc, argv)) - return 1; - makeCartesianProductBenchmark(Quantities); - benchmark::RunSpecifiedBenchmarks(); -} diff --git a/libcxx/test/benchmarks/algorithms/sorting/common.h b/libcxx/test/benchmarks/algorithms/sorting/common.h new file mode 100644 index 0000000000000..8195e9a2dc8d0 --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/sorting/common.h @@ -0,0 +1,141 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LIBCXX_TEST_BENCHMARKS_ALGORITHMS_SORTING_COMMON_H +#define LIBCXX_TEST_BENCHMARKS_ALGORITHMS_SORTING_COMMON_H + +#include +#include +#include +#include +#include +#include + +namespace support { + +// This function creates a vector with N int-like values. +// +// These values are arranged in such a way that they would invoke O(N^2) +// behavior on any quick sort implementation that satisifies certain conditions. 
+// Details are available in the following paper: +// +// "A Killer Adversary for Quicksort", M. D. McIlroy, Software-Practice & +// Experience Volume 29 Issue 4 April 10, 1999 pp 341-344. +// https://dl.acm.org/doi/10.5555/311868.311871. +template +std::vector quicksort_adversarial_data(std::size_t n) { + static_assert(std::is_integral_v); + assert(n > 0); + + // If an element is equal to gas, it indicates that the value of the element + // is still to be decided and may change over the course of time. + T gas = n - 1; + + std::vector v; + v.resize(n); + for (unsigned int i = 0; i < n; ++i) { + v[i] = gas; + } + // Candidate for the pivot position. + int candidate = 0; + int nsolid = 0; + // Populate all positions in the generated input to gas. + std::vector ascending_values(v.size()); + + // Fill up with ascending values from 0 to v.size()-1. These will act as + // indices into v. + std::iota(ascending_values.begin(), ascending_values.end(), 0); + std::sort(ascending_values.begin(), ascending_values.end(), [&](int x, int y) { + if (v[x] == gas && v[y] == gas) { + // We are comparing two inputs whose value is still to be decided. 
+ if (x == candidate) { + v[x] = nsolid++; + } else { + v[y] = nsolid++; + } + } + if (v[x] == gas) { + candidate = x; + } else if (v[y] == gas) { + candidate = y; + } + return v[x] < v[y]; + }); + return v; +} + +// ascending sorted values +template +std::vector ascending_sorted_data(std::size_t n) { + std::vector v(n); + std::iota(v.begin(), v.end(), 0); + return v; +} + +// descending sorted values +template +std::vector descending_sorted_data(std::size_t n) { + std::vector v(n); + std::iota(v.begin(), v.end(), 0); + std::reverse(v.begin(), v.end()); + return v; +} + +// pipe-organ pattern +template +std::vector pipe_organ_data(std::size_t n) { + std::vector v(n); + std::iota(v.begin(), v.end(), 0); + auto half = v.begin() + v.size() / 2; + std::reverse(half, v.end()); + return v; +} + +// heap pattern +template +std::vector heap_data(std::size_t n) { + std::vector v(n); + std::iota(v.begin(), v.end(), 0); + std::make_heap(v.begin(), v.end()); + return v; +} + +// shuffled randomly +template +std::vector shuffled_data(std::size_t n) { + std::vector v(n); + std::iota(v.begin(), v.end(), 0); + std::mt19937 rng; + std::shuffle(v.begin(), v.end(), rng); + return v; +} + +// single element in the whole sequence +template +std::vector single_element_data(std::size_t n) { + std::vector v(n); + return v; +} + +struct NonIntegral { + NonIntegral() : value_(0) {} + NonIntegral(int i) : value_(i) {} + friend auto operator<(NonIntegral const& a, NonIntegral const& b) { return a.value_ < b.value_; } + friend auto operator>(NonIntegral const& a, NonIntegral const& b) { return a.value_ > b.value_; } + friend auto operator<=(NonIntegral const& a, NonIntegral const& b) { return a.value_ <= b.value_; } + friend auto operator>=(NonIntegral const& a, NonIntegral const& b) { return a.value_ >= b.value_; } + friend auto operator==(NonIntegral const& a, NonIntegral const& b) { return a.value_ == b.value_; } + friend auto operator!=(NonIntegral const& a, NonIntegral const& b) { return 
a.value_ != b.value_; } + +private: + int value_; +}; + +} // namespace support + +#endif // LIBCXX_TEST_BENCHMARKS_ALGORITHMS_SORTING_COMMON_H diff --git a/libcxx/test/benchmarks/algorithms/sorting/is_sorted.bench.cpp b/libcxx/test/benchmarks/algorithms/sorting/is_sorted.bench.cpp new file mode 100644 index 0000000000000..6e553e93d017c --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/sorting/is_sorted.bench.cpp @@ -0,0 +1,82 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include +#include +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "../../GenerateInput.h" + +int main(int argc, char** argv) { + auto std_is_sorted = [](auto first, auto last) { return std::is_sorted(first, last); }; + auto std_is_sorted_pred = [](auto first, auto last) { + return std::is_sorted(first, last, [](auto x, auto y) { + benchmark::DoNotOptimize(x); + benchmark::DoNotOptimize(y); + return x < y; + }); + }; + auto ranges_is_sorted_pred = [](auto first, auto last) { + return std::ranges::is_sorted(first, last, [](auto x, auto y) { + benchmark::DoNotOptimize(x); + benchmark::DoNotOptimize(y); + return x < y; + }); + }; + + // Benchmark {std,ranges}::is_sorted on a sorted sequence (the worst case). 
+ { + auto bm = [](std::string name, auto is_sorted) { + benchmark::RegisterBenchmark( + name, + [is_sorted](auto& st) { + std::size_t const size = st.range(0); + using ValueType = typename Container::value_type; + std::vector data; + std::generate_n(std::back_inserter(data), size, [] { return Generate::random(); }); + std::sort(data.begin(), data.end()); + + Container c(data.begin(), data.end()); + + for ([[maybe_unused]] auto _ : st) { + benchmark::DoNotOptimize(c); + auto result = is_sorted(c.begin(), c.end()); + benchmark::DoNotOptimize(result); + } + }) + ->Arg(8) + ->Arg(1024) + ->Arg(8192); + }; + bm.operator()>("std::is_sorted(vector)", std_is_sorted); + bm.operator()>("std::is_sorted(deque)", std_is_sorted); + bm.operator()>("std::is_sorted(list)", std_is_sorted); + bm.operator()>("rng::is_sorted(vector)", std::ranges::is_sorted); + bm.operator()>("rng::is_sorted(deque)", std::ranges::is_sorted); + bm.operator()>("rng::is_sorted(list)", std::ranges::is_sorted); + + bm.operator()>("std::is_sorted(vector, pred)", std_is_sorted_pred); + bm.operator()>("std::is_sorted(deque, pred)", std_is_sorted_pred); + bm.operator()>("std::is_sorted(list, pred)", std_is_sorted_pred); + bm.operator()>("rng::is_sorted(vector, pred)", ranges_is_sorted_pred); + bm.operator()>("rng::is_sorted(deque, pred)", ranges_is_sorted_pred); + bm.operator()>("rng::is_sorted(list, pred)", ranges_is_sorted_pred); + } + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; +} diff --git a/libcxx/test/benchmarks/algorithms/sorting/is_sorted_until.bench.cpp b/libcxx/test/benchmarks/algorithms/sorting/is_sorted_until.bench.cpp new file mode 100644 index 0000000000000..ab11ee35327c7 --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/sorting/is_sorted_until.bench.cpp @@ -0,0 +1,82 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include +#include +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "../../GenerateInput.h" + +int main(int argc, char** argv) { + auto std_is_sorted_until = [](auto first, auto last) { return std::is_sorted_until(first, last); }; + auto std_is_sorted_until_pred = [](auto first, auto last) { + return std::is_sorted_until(first, last, [](auto x, auto y) { + benchmark::DoNotOptimize(x); + benchmark::DoNotOptimize(y); + return x < y; + }); + }; + auto ranges_is_sorted_until_pred = [](auto first, auto last) { + return std::ranges::is_sorted_until(first, last, [](auto x, auto y) { + benchmark::DoNotOptimize(x); + benchmark::DoNotOptimize(y); + return x < y; + }); + }; + + // Benchmark {std,ranges}::is_sorted_until on a sorted sequence (the worst case). 
+ { + auto bm = [](std::string name, auto is_sorted_until) { + benchmark::RegisterBenchmark( + name, + [is_sorted_until](auto& st) { + std::size_t const size = st.range(0); + using ValueType = typename Container::value_type; + std::vector data; + std::generate_n(std::back_inserter(data), size, [] { return Generate::random(); }); + std::sort(data.begin(), data.end()); + + Container c(data.begin(), data.end()); + + for ([[maybe_unused]] auto _ : st) { + benchmark::DoNotOptimize(c); + auto result = is_sorted_until(c.begin(), c.end()); + benchmark::DoNotOptimize(result); + } + }) + ->Arg(8) + ->Arg(1024) + ->Arg(8192); + }; + bm.operator()>("std::is_sorted_until(vector)", std_is_sorted_until); + bm.operator()>("std::is_sorted_until(deque)", std_is_sorted_until); + bm.operator()>("std::is_sorted_until(list)", std_is_sorted_until); + bm.operator()>("rng::is_sorted_until(vector)", std::ranges::is_sorted_until); + bm.operator()>("rng::is_sorted_until(deque)", std::ranges::is_sorted_until); + bm.operator()>("rng::is_sorted_until(list)", std::ranges::is_sorted_until); + + bm.operator()>("std::is_sorted_until(vector, pred)", std_is_sorted_until_pred); + bm.operator()>("std::is_sorted_until(deque, pred)", std_is_sorted_until_pred); + bm.operator()>("std::is_sorted_until(list, pred)", std_is_sorted_until_pred); + bm.operator()>("rng::is_sorted_until(vector, pred)", ranges_is_sorted_until_pred); + bm.operator()>("rng::is_sorted_until(deque, pred)", ranges_is_sorted_until_pred); + bm.operator()>("rng::is_sorted_until(list, pred)", ranges_is_sorted_until_pred); + } + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; +} diff --git a/libcxx/test/benchmarks/algorithms/sorting/partial_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/sorting/partial_sort.bench.cpp new file mode 100644 index 0000000000000..7000be66920d0 --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/sorting/partial_sort.bench.cpp @@ -0,0 +1,95 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "common.h" + +int main(int argc, char** argv) { + auto std_partial_sort = [](auto first, auto mid, auto last) { return std::partial_sort(first, mid, last); }; + + // Benchmark {std,ranges}::partial_sort on various types of data. We always partially sort only + // half of the full range. + // + // We perform this benchmark in a batch because we need to restore the + // state of the container after the operation. + // + // Also note that we intentionally don't benchmark the predicated version of the algorithm + // because that makes the benchmark run too slowly. 
+ { + auto bm = [](std::string name, auto partial_sort, auto generate_data) { + benchmark::RegisterBenchmark( + name, + [partial_sort, generate_data](auto& st) { + std::size_t const size = st.range(0); + constexpr std::size_t BatchSize = 32; + using ValueType = typename Container::value_type; + std::vector data = generate_data(size); + std::array c; + std::fill_n(c.begin(), BatchSize, Container(data.begin(), data.end())); + + std::size_t const half = size / 2; + while (st.KeepRunningBatch(BatchSize)) { + for (std::size_t i = 0; i != BatchSize; ++i) { + benchmark::DoNotOptimize(c[i]); + partial_sort(c[i].begin(), c[i].begin() + half, c[i].end()); + benchmark::DoNotOptimize(c[i]); + } + + st.PauseTiming(); + for (std::size_t i = 0; i != BatchSize; ++i) { + std::copy(data.begin(), data.end(), c[i].begin()); + } + st.ResumeTiming(); + } + }) + ->Arg(8) + ->Arg(1024) + ->Arg(8192); + }; + + auto register_bm = [&](auto generate, std::string variant) { + auto gen2 = [generate](auto size) { + std::vector data = generate(size); + std::vector real_data(data.begin(), data.end()); + return real_data; + }; + auto name = [variant](std::string op) { return op + " (" + variant + ")"; }; + bm.operator()>(name("std::partial_sort(vector"), std_partial_sort, generate); + bm.operator()>( + name("std::partial_sort(vector"), std_partial_sort, gen2); + bm.operator()>(name("std::partial_sort(deque"), std_partial_sort, generate); + + bm.operator()>(name("rng::partial_sort(vector"), std::ranges::partial_sort, generate); + bm.operator()>( + name("rng::partial_sort(vector"), std::ranges::partial_sort, gen2); + bm.operator()>(name("rng::partial_sort(deque"), std::ranges::partial_sort, generate); + }; + + register_bm(support::quicksort_adversarial_data, "qsort adversarial"); + register_bm(support::ascending_sorted_data, "ascending"); + register_bm(support::descending_sorted_data, "descending"); + register_bm(support::pipe_organ_data, "pipe-organ"); + register_bm(support::heap_data, "heap"); + 
register_bm(support::shuffled_data, "shuffled"); + register_bm(support::single_element_data, "repeated"); + } + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; +} diff --git a/libcxx/test/benchmarks/algorithms/sorting/partial_sort_copy.bench.cpp b/libcxx/test/benchmarks/algorithms/sorting/partial_sort_copy.bench.cpp new file mode 100644 index 0000000000000..2ebc286b1c03b --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/sorting/partial_sort_copy.bench.cpp @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "common.h" + +int main(int argc, char** argv) { + auto std_partial_sort_copy = [](auto first, auto last, auto dfirst, auto dlast) { + return std::partial_sort_copy(first, last, dfirst, dlast); + }; + + // Benchmark {std,ranges}::partial_sort_copy on various types of data. We always partially + // sort only half of the full range. + // + // Also note that we intentionally don't benchmark the predicated version of the algorithm + // because that makes the benchmark run too slowly. 
+ { + auto bm = [](std::string name, auto partial_sort_copy, auto generate_data) { + benchmark::RegisterBenchmark( + name, + [partial_sort_copy, generate_data](auto& st) { + std::size_t const size = st.range(0); + using ValueType = typename Container::value_type; + std::vector data = generate_data(size); + Container c(data.begin(), data.end()); + std::vector out(size / 2); + + for ([[maybe_unused]] auto _ : st) { + benchmark::DoNotOptimize(c); + benchmark::DoNotOptimize(out); + auto result = partial_sort_copy(c.begin(), c.end(), out.begin(), out.end()); + benchmark::DoNotOptimize(result); + } + }) + ->Arg(8) + ->Arg(1024) + ->Arg(8192); + }; + + auto register_bm = [&](auto generate, std::string variant) { + auto gen2 = [generate](auto size) { + std::vector data = generate(size); + std::vector real_data(data.begin(), data.end()); + return real_data; + }; + auto name = [variant](std::string op) { return op + " (" + variant + ")"; }; + bm.operator()>(name("std::partial_sort_copy(vector)"), std_partial_sort_copy, generate); + bm.operator()>( + name("std::partial_sort_copy(vector)"), std_partial_sort_copy, gen2); + bm.operator()>(name("std::partial_sort_copy(deque)"), std_partial_sort_copy, generate); + bm.operator()>(name("std::partial_sort_copy(list)"), std_partial_sort_copy, generate); + + bm.operator()>( + name("rng::partial_sort_copy(vector)"), std::ranges::partial_sort_copy, generate); + bm.operator()>( + name("rng::partial_sort_copy(vector)"), std::ranges::partial_sort_copy, gen2); + bm.operator()>( + name("rng::partial_sort_copy(deque)"), std::ranges::partial_sort_copy, generate); + bm.operator()>( + name("rng::partial_sort_copy(list)"), std::ranges::partial_sort_copy, generate); + }; + + register_bm(support::quicksort_adversarial_data, "qsort adversarial"); + register_bm(support::ascending_sorted_data, "ascending"); + register_bm(support::descending_sorted_data, "descending"); + register_bm(support::pipe_organ_data, "pipe-organ"); + 
register_bm(support::heap_data, "heap"); + register_bm(support::shuffled_data, "shuffled"); + register_bm(support::single_element_data, "repeated"); + } + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; +} diff --git a/libcxx/test/benchmarks/algorithms/sorting/sort.bench.cpp b/libcxx/test/benchmarks/algorithms/sorting/sort.bench.cpp new file mode 100644 index 0000000000000..d12aa108fe123 --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/sorting/sort.bench.cpp @@ -0,0 +1,91 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "common.h" + +int main(int argc, char** argv) { + auto std_sort = [](auto first, auto last) { return std::sort(first, last); }; + + // Benchmark {std,ranges}::sort on various types of data + // + // We perform this benchmark in a batch because we need to restore the + // state of the container after the operation. + // + // Also note that we intentionally don't benchmark the predicated version of the algorithm + // because that makes the benchmark run too slowly. 
+ { + auto bm = [](std::string name, auto sort, auto generate_data) { + benchmark::RegisterBenchmark( + name, + [sort, generate_data](auto& st) { + std::size_t const size = st.range(0); + constexpr std::size_t BatchSize = 32; + using ValueType = typename Container::value_type; + std::vector data = generate_data(size); + std::array c; + std::fill_n(c.begin(), BatchSize, Container(data.begin(), data.end())); + + while (st.KeepRunningBatch(BatchSize)) { + for (std::size_t i = 0; i != BatchSize; ++i) { + benchmark::DoNotOptimize(c[i]); + sort(c[i].begin(), c[i].end()); + benchmark::DoNotOptimize(c[i]); + } + + st.PauseTiming(); + for (std::size_t i = 0; i != BatchSize; ++i) { + std::copy(data.begin(), data.end(), c[i].begin()); + } + st.ResumeTiming(); + } + }) + ->Arg(8) + ->Arg(1024) + ->Arg(8192); + }; + + auto register_bm = [&](auto generate, std::string variant) { + auto gen2 = [generate](auto size) { + std::vector data = generate(size); + std::vector real_data(data.begin(), data.end()); + return real_data; + }; + auto name = [variant](std::string op) { return op + " (" + variant + ")"; }; + bm.operator()>(name("std::sort(vector)"), std_sort, generate); + bm.operator()>(name("std::sort(vector)"), std_sort, gen2); + bm.operator()>(name("std::sort(deque)"), std_sort, generate); + + bm.operator()>(name("rng::sort(vector)"), std::ranges::sort, generate); + bm.operator()>(name("rng::sort(vector)"), std::ranges::sort, gen2); + bm.operator()>(name("rng::sort(deque)"), std::ranges::sort, generate); + }; + + register_bm(support::quicksort_adversarial_data, "qsort adversarial"); + register_bm(support::ascending_sorted_data, "ascending"); + register_bm(support::descending_sorted_data, "descending"); + register_bm(support::pipe_organ_data, "pipe-organ"); + register_bm(support::heap_data, "heap"); + register_bm(support::shuffled_data, "shuffled"); + register_bm(support::single_element_data, "repeated"); + } + + benchmark::Initialize(&argc, argv); + 
benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; +} diff --git a/libcxx/test/benchmarks/algorithms/sorting/stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/sorting/stable_sort.bench.cpp new file mode 100644 index 0000000000000..8040f5c12a46a --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/sorting/stable_sort.bench.cpp @@ -0,0 +1,159 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +#include +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "common.h" +#include "count_new.h" + +int main(int argc, char** argv) { + auto std_stable_sort = [](auto first, auto last) { return std::stable_sort(first, last); }; + + // Benchmark {std,ranges}::stable_sort on various types of data + // + // We perform this benchmark in a batch because we need to restore the + // state of the container after the operation. + // + // Also note that we intentionally don't benchmark the predicated version of the algorithm + // because that makes the benchmark run too slowly. 
+ { + auto bm = [](std::string name, auto stable_sort, auto generate_data) { + benchmark::RegisterBenchmark( + name, + [stable_sort, generate_data](auto& st) { + std::size_t const size = st.range(0); + constexpr std::size_t BatchSize = 32; + using ValueType = typename Container::value_type; + std::vector data = generate_data(size); + std::array c; + std::fill_n(c.begin(), BatchSize, Container(data.begin(), data.end())); + + while (st.KeepRunningBatch(BatchSize)) { + for (std::size_t i = 0; i != BatchSize; ++i) { + benchmark::DoNotOptimize(c[i]); + stable_sort(c[i].begin(), c[i].end()); + benchmark::DoNotOptimize(c[i]); + } + + st.PauseTiming(); + for (std::size_t i = 0; i != BatchSize; ++i) { + std::copy(data.begin(), data.end(), c[i].begin()); + } + st.ResumeTiming(); + } + }) + ->Arg(8) + ->Arg(1024) + ->Arg(8192); + }; + + auto register_bm = [&](auto generate, std::string variant) { + auto gen2 = [generate](auto size) { + std::vector data = generate(size); + std::vector real_data(data.begin(), data.end()); + return real_data; + }; + auto name = [variant](std::string op) { return op + " (" + variant + ")"; }; + bm.operator()>(name("std::stable_sort(vector)"), std_stable_sort, generate); + bm.operator()>( + name("std::stable_sort(vector)"), std_stable_sort, gen2); + bm.operator()>(name("std::stable_sort(deque)"), std_stable_sort, generate); + + bm.operator()>(name("rng::stable_sort(vector)"), std::ranges::stable_sort, generate); + bm.operator()>( + name("rng::stable_sort(vector)"), std::ranges::stable_sort, gen2); + bm.operator()>(name("rng::stable_sort(deque)"), std::ranges::stable_sort, generate); + }; + + register_bm(support::quicksort_adversarial_data, "qsort adversarial"); + register_bm(support::ascending_sorted_data, "ascending"); + register_bm(support::descending_sorted_data, "descending"); + register_bm(support::pipe_organ_data, "pipe-organ"); + register_bm(support::heap_data, "heap"); + register_bm(support::shuffled_data, "shuffled"); + 
register_bm(support::single_element_data, "repeated"); + } + + // Benchmark {std,ranges}::stable_sort when memory allocation fails. The algorithm must fall back to + // a different algorithm that has different complexity guarantees. + { + auto bm = [](std::string name, auto stable_sort, auto generate_data) { + benchmark::RegisterBenchmark( + name, + [stable_sort, generate_data](auto& st) { + std::size_t const size = st.range(0); + constexpr std::size_t BatchSize = 32; + using ValueType = typename Container::value_type; + std::vector data = generate_data(size); + std::array c; + std::fill_n(c.begin(), BatchSize, Container(data.begin(), data.end())); + + while (st.KeepRunningBatch(BatchSize)) { + for (std::size_t i = 0; i != BatchSize; ++i) { + benchmark::DoNotOptimize(c[i]); + // Disable the ability to allocate memory inside this block + globalMemCounter.throw_after = 0; + + stable_sort(c[i].begin(), c[i].end()); + benchmark::DoNotOptimize(c[i]); + + globalMemCounter.reset(); + } + + st.PauseTiming(); + for (std::size_t i = 0; i != BatchSize; ++i) { + std::copy(data.begin(), data.end(), c[i].begin()); + } + st.ResumeTiming(); + } + }) + ->Arg(8) + ->Arg(1024) + ->Arg(8192); + }; + + auto register_bm = [&](auto generate, std::string variant) { + auto gen2 = [generate](auto size) { + std::vector data = generate(size); + std::vector real_data(data.begin(), data.end()); + return real_data; + }; + auto name = [variant](std::string op) { return op + " (alloc fails, " + variant + ")"; }; + bm.operator()>(name("std::stable_sort(vector)"), std_stable_sort, generate); + bm.operator()>( + name("std::stable_sort(vector)"), std_stable_sort, gen2); + bm.operator()>(name("std::stable_sort(deque)"), std_stable_sort, generate); + + bm.operator()>(name("rng::stable_sort(vector)"), std::ranges::stable_sort, generate); + bm.operator()>( + name("rng::stable_sort(vector)"), std::ranges::stable_sort, gen2); + bm.operator()>(name("rng::stable_sort(deque)"), std::ranges::stable_sort, 
generate); + }; + + register_bm(support::quicksort_adversarial_data, "qsort adversarial"); + register_bm(support::ascending_sorted_data, "ascending"); + register_bm(support::descending_sorted_data, "descending"); + register_bm(support::pipe_organ_data, "pipe-organ"); + register_bm(support::heap_data, "heap"); + register_bm(support::shuffled_data, "shuffled"); + register_bm(support::single_element_data, "repeated"); + } + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; +} diff --git a/libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp deleted file mode 100644 index 26e8de935f5c5..0000000000000 --- a/libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp +++ /dev/null @@ -1,40 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17 - -#include - -#include "common.h" - -namespace { -template -struct StableSort { - size_t Quantity; - - void run(benchmark::State& state) const { - runOpOnCopies(state, Quantity, Order(), BatchSize::CountBatch, [](auto& Copy) { - std::stable_sort(Copy.begin(), Copy.end()); - }); - } - - bool skip() const { return Order() == ::Order::Heap; } - - std::string name() const { - return "BM_StableSort" + ValueType::name() + Order::name() + "_" + std::to_string(Quantity); - }; -}; -} // namespace - -int main(int argc, char** argv) { - benchmark::Initialize(&argc, argv); - if (benchmark::ReportUnrecognizedArguments(argc, argv)) - return 1; - makeCartesianProductBenchmark(Quantities); - benchmark::RunSpecifiedBenchmarks(); -} diff --git a/libcxx/test/benchmarks/format/format.bench.cpp b/libcxx/test/benchmarks/format/format.bench.cpp index 267ef22950668..65caac747cbab 100644 --- a/libcxx/test/benchmarks/format/format.bench.cpp +++ b/libcxx/test/benchmarks/format/format.bench.cpp @@ -35,4 +35,15 @@ BENCHMARK(BM_format_string)->RangeMultiplier(2)->Range(1, 1 << 20); BENCHMARK(BM_format_string)->RangeMultiplier(2)->Range(1, 1 << 20); #endif +template +static void BM_string_without_formatting(benchmark::State& state) { + for (auto _ : state) { + benchmark::DoNotOptimize(std::format(CSTR("Hello, World!"))); + } +} +BENCHMARK(BM_string_without_formatting); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS +BENCHMARK(BM_string_without_formatting); +#endif + BENCHMARK_MAIN(); diff --git a/libcxx/test/configs/cmake-bridge.cfg.in b/libcxx/test/configs/cmake-bridge.cfg.in index 61f821a7e4f6b..d7d588669032d 100644 --- a/libcxx/test/configs/cmake-bridge.cfg.in +++ b/libcxx/test/configs/cmake-bridge.cfg.in @@ -23,6 +23,7 @@ config.recursiveExpansionLimit = 10 config.test_exec_root = 
os.path.join('@LIBCXX_BINARY_DIR@', 'test') # Add substitutions for bootstrapping the test suite configuration +config.substitutions.append(('%{bin-dir}', '@LIBCXX_BINARY_DIR@')) config.substitutions.append(('%{libcxx-dir}', '@LIBCXX_SOURCE_DIR@')) config.substitutions.append(('%{install-prefix}', '@LIBCXX_TESTING_INSTALL_PREFIX@')) config.substitutions.append(('%{include-dir}', '@LIBCXX_TESTING_INSTALL_PREFIX@/@LIBCXX_INSTALL_INCLUDE_DIR@')) diff --git a/libcxx/test/libcxx/clang_tidy.sh.py b/libcxx/test/libcxx/clang_tidy.sh.py new file mode 100644 index 0000000000000..46f281f359209 --- /dev/null +++ b/libcxx/test/libcxx/clang_tidy.sh.py @@ -0,0 +1,11 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +# REQUIRES: has-clang-tidy + +# RUN: %{python} %{libcxx-dir}/../clang-tools-extra/clang-tidy/tool/run-clang-tidy.py -clang-tidy-binary %{clang-tidy} -warnings-as-errors "*" -source-filter=".*libcxx/src.*" -quiet -p %{bin-dir}/.. 
diff --git a/libcxx/test/libcxx/containers/associative/tree_key_value_traits.pass.cpp b/libcxx/test/libcxx/containers/associative/tree_key_value_traits.pass.cpp index e3a5a6f634138..04dcb8f54fafc 100644 --- a/libcxx/test/libcxx/containers/associative/tree_key_value_traits.pass.cpp +++ b/libcxx/test/libcxx/containers/associative/tree_key_value_traits.pass.cpp @@ -21,7 +21,6 @@ void testKeyValueTrait() { typedef int Tp; typedef std::__tree_key_value_types Traits; static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); static_assert(Traits::__is_map == false, ""); } @@ -29,7 +28,6 @@ void testKeyValueTrait() { typedef std::pair Tp; typedef std::__tree_key_value_types Traits; static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); static_assert(Traits::__is_map == false, ""); } @@ -37,7 +35,6 @@ void testKeyValueTrait() { typedef std::pair Tp; typedef std::__tree_key_value_types Traits; static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); static_assert(Traits::__is_map == false, ""); } @@ -46,7 +43,6 @@ void testKeyValueTrait() { typedef std::__tree_key_value_types Traits; static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); static_assert((std::is_same >::value), ""); static_assert((std::is_same >::value), ""); static_assert(Traits::__is_map == true, ""); diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp new file mode 100644 index 0000000000000..3ca021c2ac650 --- /dev/null +++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/insert_range.pass.cpp @@ -0,0 +1,43 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: no-localization + +// + +// As an extension, libc++ flat containers support inserting a non forward range into +// a pre-C++23 container that doesn't provide insert_range(...), since many containers +// out there are in that situation. +// https://github.com/llvm/llvm-project/issues/136656 + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" + +void test() { + MinSequenceContainer v; + std::flat_multiset s(v); + std::istringstream ints("0 1 1 0"); + auto r = std::ranges::subrange(std::istream_iterator(ints), std::istream_iterator()) | + std::views::transform([](int i) { return i * i; }); + static_assert( + ![](auto& t) { return requires { t.insert_range(r); }; }(v), + "This test is to test the case where the underlying container does not provide insert_range"); + s.insert_range(r); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.set/insert_range.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.set/insert_range.pass.cpp new file mode 100644 index 0000000000000..8023e251ccb17 --- /dev/null +++ b/libcxx/test/libcxx/containers/container.adaptors/flat.set/insert_range.pass.cpp @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: no-localization + +// + +// As an extension, libc++ flat containers support inserting a non forward range into +// a pre-C++23 container that doesn't provide insert_range(...), since many containers +// out there are in that situation. +// https://github.com/llvm/llvm-project/issues/136656 + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" + +void test() { + MinSequenceContainer v; + std::flat_set s(v); + std::istringstream ints("0 1 1 0"); + auto r = std::ranges::subrange(std::istream_iterator(ints), std::istream_iterator()) | + std::views::transform([](int i) { return i * i; }); + static_assert( + ![](auto& t) { return requires { t.insert_range(r); }; }(v), + "This test is to test the case where the underlying container does not provide insert_range"); + s.insert_range(r); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp index dcdce261298c1..f125cc9adc491 100644 --- a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp +++ b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp @@ -8,11 +8,12 @@ // REQUIRES: host-has-gdb-with-python // REQUIRES: locale.en_US.UTF-8 +// REQUIRES: optimization=none // UNSUPPORTED: no-localization // UNSUPPORTED: c++03 -// TODO: Investigate these failures which break the CI. -// UNSUPPORTED: clang-18, clang-19, clang-20, clang-21 +// TODO: Investigate why this fails on the arm bots +// UNSUPPORTED: target=arm{{.*}} // The Android libc++ tests are run on a non-Android host, connected to an // Android device over adb. gdb needs special support to make this work (e.g. 
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/ranges.swap_ranges.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/ranges.swap_ranges.pass.cpp index 93090ed6138f8..85557ecbbfabc 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/ranges.swap_ranges.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/ranges.swap_ranges.pass.cpp @@ -30,108 +30,40 @@ #include "test_iterators.h" #include "type_algorithms.h" -constexpr void test_different_lengths() { - using Expected = std::ranges::swap_ranges_result; - int i[3] = {1, 2, 3}; - int j[1] = {4}; - std::same_as auto r = std::ranges::swap_ranges(i, i + 3, j, j + 1); - assert(r.in1 == i + 1); - assert(r.in2 == j + 1); - assert(std::ranges::equal(i, std::array{4, 2, 3})); - assert(std::ranges::equal(j, std::array{1})); - std::same_as auto r2 = std::ranges::swap_ranges(i, j); - assert(r2.in1 == i + 1); - assert(r2.in2 == j + 1); - assert(std::ranges::equal(i, std::array{1, 2, 3})); - assert(std::ranges::equal(j, std::array{4})); - std::same_as auto r3 = std::ranges::swap_ranges(j, j + 1, i, i + 3); - assert(r3.in1 == j + 1); - assert(r3.in2 == i + 1); - assert(std::ranges::equal(i, std::array{4, 2, 3})); - assert(std::ranges::equal(j, std::array{1})); - std::same_as auto r4 = std::ranges::swap_ranges(j, i); - assert(r4.in1 == j + 1); - assert(r4.in2 == i + 1); - assert(std::ranges::equal(i, std::array{1, 2, 3})); - assert(std::ranges::equal(j, std::array{4})); -} - -constexpr void test_range() { - std::array r1 = {1, 2, 3}; - std::array r2 = {4, 5, 6}; - - std::same_as::iterator, std::array::iterator>> auto r = - std::ranges::swap_ranges(r1, r2); - assert(r.in1 == r1.end()); - assert(r.in2 == r2.end()); - assert((r1 == std::array{4, 5, 6})); - assert((r2 == std::array{1, 2, 3})); -} - -constexpr void test_borrowed_input_range() { - { - int r1[] = {1, 2, 3}; - int r2[] = {4, 5, 6}; - std::ranges::swap_ranges(std::views::all(r1), r2); 
- assert(std::ranges::equal(r1, std::array{4, 5, 6})); - assert(std::ranges::equal(r2, std::array{1, 2, 3})); - } - { - int r1[] = {1, 2, 3}; - int r2[] = {4, 5, 6}; - std::ranges::swap_ranges(r1, std::views::all(r2)); - assert(std::ranges::equal(r1, std::array{4, 5, 6})); - assert(std::ranges::equal(r2, std::array{1, 2, 3})); - } - { - int r1[] = {1, 2, 3}; - int r2[] = {4, 5, 6}; - std::ranges::swap_ranges(std::views::all(r1), std::views::all(r2)); - assert(std::ranges::equal(r1, std::array{4, 5, 6})); - assert(std::ranges::equal(r2, std::array{1, 2, 3})); - } -} - -constexpr void test_sentinel() { - int i[3] = {1, 2, 3}; - int j[3] = {4, 5, 6}; - using It = cpp17_input_iterator; - using Sent = sentinel_wrapper; - using Expected = std::ranges::swap_ranges_result; - std::same_as auto r = std::ranges::swap_ranges(It(i), Sent(It(i + 3)), It(j), Sent(It(j + 3))); - assert(base(r.in1) == i + 3); - assert(base(r.in2) == j + 3); - assert(std::ranges::equal(i, std::array{4, 5, 6})); - assert(std::ranges::equal(j, std::array{1, 2, 3})); -} - template TEST_CONSTEXPR_CXX20 void test_iterators() { using Expected = std::ranges::swap_ranges_result; - int a[3] = {1, 2, 3}; - int b[3] = {4, 5, 6}; - std::same_as auto r = - std::ranges::swap_ranges(Iter1(a), sentinel_wrapper(Iter1(a + 3)), Iter2(b), sentinel_wrapper(Iter2(b + 3))); - assert(base(r.in1) == a + 3); - assert(base(r.in2) == b + 3); - assert(std::ranges::equal(a, std::array{4, 5, 6})); - assert(std::ranges::equal(b, std::array{1, 2, 3})); -} - -constexpr void test_rval_range() { - { - using Expected = std::ranges::swap_ranges_result::iterator, std::ranges::dangling>; - std::array r = {1, 2, 3}; - std::same_as auto a = std::ranges::swap_ranges(r, std::array{4, 5, 6}); - assert((r == std::array{4, 5, 6})); - assert(a.in1 == r.begin() + 3); + { // Basic test case: swapping three elements between two arrays + int a[3] = {1, 2, 3}; + int b[3] = {4, 5, 6}; + std::same_as auto r = + std::ranges::swap_ranges(Iter1(a), 
sentinel_wrapper(Iter1(a + 3)), Iter2(b), sentinel_wrapper(Iter2(b + 3))); + assert(base(r.in1) == a + 3); + assert(base(r.in2) == b + 3); + assert(std::ranges::equal(a, std::array{4, 5, 6})); + assert(std::ranges::equal(b, std::array{1, 2, 3})); } - { - std::array r = {1, 2, 3}; - using Expected = std::ranges::swap_ranges_result::iterator>; - std::same_as auto b = std::ranges::swap_ranges(std::array{4, 5, 6}, r); - assert((r == std::array{4, 5, 6})); - assert(b.in2 == r.begin() + 3); + { // Large-scale test: swapping 100 elements between two different containers + const int N = 100; + std::array a; + std::vector b(N + 2, 42); + b.front() = 1; + b.back() = -1; + for (int i = 0; i < N; ++i) + a[i] = i * i + 1; + std::same_as auto r = std::ranges::swap_ranges( + Iter1(a.data()), + sentinel_wrapper(Iter1(a.data() + N)), + Iter2(b.data() + 1), + sentinel_wrapper(Iter2(b.data() + b.size()))); + assert(base(r.in1) == a.data() + N); + assert(base(r.in2) == b.data() + N + 1); + assert(b.front() == 1); // Ensure that the unswapped portion remains unchanged + assert(b.back() == -1); + for (int i = 0; i < N; ++i) { + assert(a[i] == 42); + assert(b[i + 1] == i * i + 1); + } } } @@ -152,11 +84,97 @@ constexpr void test_vector_bool() { } constexpr bool test() { - test_range(); - test_sentinel(); - test_different_lengths(); - test_borrowed_input_range(); - test_rval_range(); + { // Validate swapping ranges directly + std::array r1 = {1, 2, 3}; + std::array r2 = {4, 5, 6}; + + std::same_as::iterator, std::array::iterator>> auto r = + std::ranges::swap_ranges(r1, r2); + assert(r.in1 == r1.end()); + assert(r.in2 == r2.end()); + assert((r1 == std::array{4, 5, 6})); + assert((r2 == std::array{1, 2, 3})); + } + + { // Validate swapping ranges using iterator and sentinels + int i[3] = {1, 2, 3}; + int j[3] = {4, 5, 6}; + using It = cpp17_input_iterator; + using Sent = sentinel_wrapper; + using Expected = std::ranges::swap_ranges_result; + std::same_as auto r = 
std::ranges::swap_ranges(It(i), Sent(It(i + 3)), It(j), Sent(It(j + 3))); + assert(base(r.in1) == i + 3); + assert(base(r.in2) == j + 3); + assert(std::ranges::equal(i, std::array{4, 5, 6})); + assert(std::ranges::equal(j, std::array{1, 2, 3})); + } + + { // Validate swapping ranges of different lengths + using Expected = std::ranges::swap_ranges_result; + int i[3] = {1, 2, 3}; + int j[1] = {4}; + std::same_as auto r = std::ranges::swap_ranges(i, i + 3, j, j + 1); + assert(r.in1 == i + 1); + assert(r.in2 == j + 1); + assert(std::ranges::equal(i, std::array{4, 2, 3})); + assert(std::ranges::equal(j, std::array{1})); + std::same_as auto r2 = std::ranges::swap_ranges(i, j); + assert(r2.in1 == i + 1); + assert(r2.in2 == j + 1); + assert(std::ranges::equal(i, std::array{1, 2, 3})); + assert(std::ranges::equal(j, std::array{4})); + std::same_as auto r3 = std::ranges::swap_ranges(j, j + 1, i, i + 3); + assert(r3.in1 == j + 1); + assert(r3.in2 == i + 1); + assert(std::ranges::equal(i, std::array{4, 2, 3})); + assert(std::ranges::equal(j, std::array{1})); + std::same_as auto r4 = std::ranges::swap_ranges(j, i); + assert(r4.in1 == j + 1); + assert(r4.in2 == i + 1); + assert(std::ranges::equal(i, std::array{1, 2, 3})); + assert(std::ranges::equal(j, std::array{4})); + } + + { // Validate swapping when one or both are borrowed input ranges (views) + { + int r1[] = {1, 2, 3}; + int r2[] = {4, 5, 6}; + std::ranges::swap_ranges(std::views::all(r1), r2); + assert(std::ranges::equal(r1, std::array{4, 5, 6})); + assert(std::ranges::equal(r2, std::array{1, 2, 3})); + } + { + int r1[] = {1, 2, 3}; + int r2[] = {4, 5, 6}; + std::ranges::swap_ranges(r1, std::views::all(r2)); + assert(std::ranges::equal(r1, std::array{4, 5, 6})); + assert(std::ranges::equal(r2, std::array{1, 2, 3})); + } + { + int r1[] = {1, 2, 3}; + int r2[] = {4, 5, 6}; + std::ranges::swap_ranges(std::views::all(r1), std::views::all(r2)); + assert(std::ranges::equal(r1, std::array{4, 5, 6})); + 
assert(std::ranges::equal(r2, std::array{1, 2, 3})); + } + } + + { // Validate swapping involving rvalue ranges + { + using Expected = std::ranges::swap_ranges_result::iterator, std::ranges::dangling>; + std::array r = {1, 2, 3}; + std::same_as auto a = std::ranges::swap_ranges(r, std::array{4, 5, 6}); + assert((r == std::array{4, 5, 6})); + assert(a.in1 == r.begin() + 3); + } + { + std::array r = {1, 2, 3}; + using Expected = std::ranges::swap_ranges_result::iterator>; + std::same_as auto b = std::ranges::swap_ranges(std::array{4, 5, 6}, r); + assert((r == std::array{4, 5, 6})); + assert(b.in2 == r.begin() + 3); + } + } types::for_each(types::cpp20_input_iterator_list(), []() { types::for_each(types::cpp20_input_iterator_list(), []() { diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/swap_ranges.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/swap_ranges.pass.cpp index 01cd33150e236..84ebedf213f5b 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/swap_ranges.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/swap_ranges.pass.cpp @@ -35,12 +35,31 @@ struct TestPtr { struct TestImpl { template TEST_CONSTEXPR_CXX20 void operator()() { - int a[] = {1, 2, 3}; - int b[] = {4, 5, 6}; - Iter2 r = std::swap_ranges(Iter1(a), Iter1(a + 3), Iter2(b)); - assert(base(r) == b + 3); - assert(a[0] == 4 && a[1] == 5 && a[2] == 6); - assert(b[0] == 1 && b[1] == 2 && b[2] == 3); + { // Basic test case: swapping three elements between two arrays + int a[] = {1, 2, 3}; + int b[] = {4, 5, 6}; + Iter2 r = std::swap_ranges(Iter1(a), Iter1(a + 3), Iter2(b)); + assert(base(r) == b + 3); + assert(a[0] == 4 && a[1] == 5 && a[2] == 6); + assert(b[0] == 1 && b[1] == 2 && b[2] == 3); + } + { // Large-scale test: swapping 100 elements between two different containers + const int N = 100; + std::array a; + std::vector b(N + 2, 42); + b.front() = 1; + b.back() = -1; + for (int i = 0; i < N; ++i) + a[i] 
= i * i + 1; + Iter2 r = std::swap_ranges(Iter1(a.data()), Iter1(a.data() + N), Iter2(b.data() + 1)); + assert(base(r) == b.data() + N + 1); + assert(b.front() == 1); // Ensure that the unswapped portion remains unchanged + assert(b.back() == -1); + for (int i = 0; i < N; ++i) { + assert(a[i] == 42); + assert(b[i + 1] == i * i + 1); + } + } } }; }; diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp index 02cc84c288828..859532d4b79c7 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp @@ -20,6 +20,8 @@ // We test the cartesian product, so we sometimes compare differently signed types // ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare +// ADDITIONAL_COMPILE_FLAGS(character-conversion-warnings): -Wno-character-conversion + // MSVC warning C4242: 'argument': conversion from 'int' to 'const _Ty', possible loss of data // MSVC warning C4244: 'argument': conversion from 'wchar_t' to 'const _Ty', possible loss of data // MSVC warning C4389: '==': signed/unsigned mismatch diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp index 3aaeb9c2f345f..989edcb3f6eed 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp @@ -8,6 +8,7 @@ // ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare // ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare +// ADDITIONAL_COMPILE_FLAGS(character-conversion-warnings): -Wno-character-conversion // MSVC warning C4245: conversion from 'int' to 'wchar_t', signed/unsigned mismatch // MSVC warning C4305: truncation from 'int' to 'bool' // MSVC warning C4310: cast truncates constant value diff --git 
a/libcxx/test/std/containers/associative/map/map.modifiers/insert_or_assign.pass.cpp b/libcxx/test/std/containers/associative/map/map.modifiers/insert_or_assign.pass.cpp index 8a129b0295180..22d4a19a0eb44 100644 --- a/libcxx/test/std/containers/associative/map/map.modifiers/insert_or_assign.pass.cpp +++ b/libcxx/test/std/containers/associative/map/map.modifiers/insert_or_assign.pass.cpp @@ -140,14 +140,16 @@ int main(int, char**) { M::const_iterator it = m.find(2); Moveable mv1(3, 3.0); - r = m.insert_or_assign(it, 2, std::move(mv1)); + const int key1 = 2; + r = m.insert_or_assign(it, key1, std::move(mv1)); assert(m.size() == 10); assert(mv1.moved()); // was moved from assert(r->first == 2); // key assert(r->second.get() == 3); // value Moveable mv2(5, 5.0); - r = m.insert_or_assign(it, 3, std::move(mv2)); + const int key2 = 3; + r = m.insert_or_assign(it, key2, std::move(mv2)); assert(m.size() == 11); assert(mv2.moved()); // was moved from assert(r->first == 3); // key @@ -155,14 +157,16 @@ int main(int, char**) { // wrong hint: begin() Moveable mv3(7, 7.0); - r = m.insert_or_assign(m.begin(), 4, std::move(mv3)); + const int key3 = 4; + r = m.insert_or_assign(m.begin(), key3, std::move(mv3)); assert(m.size() == 11); assert(mv3.moved()); // was moved from assert(r->first == 4); // key assert(r->second.get() == 7); // value Moveable mv4(9, 9.0); - r = m.insert_or_assign(m.begin(), 5, std::move(mv4)); + const int key4 = 5; + r = m.insert_or_assign(m.begin(), key4, std::move(mv4)); assert(m.size() == 12); assert(mv4.moved()); // was moved from assert(r->first == 5); // key @@ -170,14 +174,16 @@ int main(int, char**) { // wrong hint: end() Moveable mv5(11, 11.0); - r = m.insert_or_assign(m.end(), 6, std::move(mv5)); + const int key5 = 6; + r = m.insert_or_assign(m.end(), key5, std::move(mv5)); assert(m.size() == 12); assert(mv5.moved()); // was moved from assert(r->first == 6); // key assert(r->second.get() == 11); // value Moveable mv6(13, 13.0); - r = 
m.insert_or_assign(m.end(), 7, std::move(mv6)); + const int key6 = 7; + r = m.insert_or_assign(m.end(), key6, std::move(mv6)); assert(m.size() == 13); assert(mv6.moved()); // was moved from assert(r->first == 7); // key @@ -185,14 +191,16 @@ int main(int, char**) { // wrong hint: third element Moveable mv7(15, 15.0); - r = m.insert_or_assign(std::next(m.begin(), 2), 8, std::move(mv7)); + const int key7 = 8; + r = m.insert_or_assign(std::next(m.begin(), 2), key7, std::move(mv7)); assert(m.size() == 13); assert(mv7.moved()); // was moved from assert(r->first == 8); // key assert(r->second.get() == 15); // value Moveable mv8(17, 17.0); - r = m.insert_or_assign(std::next(m.begin(), 2), 9, std::move(mv8)); + const int key8 = 9; + r = m.insert_or_assign(std::next(m.begin(), 2), key8, std::move(mv8)); assert(m.size() == 14); assert(mv8.moved()); // was moved from assert(r->first == 9); // key diff --git a/libcxx/test/std/containers/sequences/vector.bool/enabled_hash.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/enabled_hash.pass.cpp index 41cedd68fe50e..cba3101ef5009 100644 --- a/libcxx/test/std/containers/sequences/vector.bool/enabled_hash.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/enabled_hash.pass.cpp @@ -19,19 +19,14 @@ #include "test_macros.h" #include "min_allocator.h" -TEST_CONSTEXPR_CXX20 bool test() { +void test() { test_hash_enabled >(); test_hash_enabled>>(); - - return true; } int main(int, char**) { test_library_hash_specializations_available(); test(); -#if TEST_STD_VER > 17 - static_assert(test()); -#endif return 0; } diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector_bool.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector_bool.pass.cpp index e270869a8320f..670e934b5dddb 100644 --- a/libcxx/test/std/containers/sequences/vector.bool/vector_bool.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/vector_bool.pass.cpp @@ -14,8 +14,6 @@ // size_t operator()(T val) const; // }; -// 
Not very portable - #include #include #include @@ -24,35 +22,29 @@ #include "test_macros.h" #include "min_allocator.h" -TEST_CONSTEXPR_CXX20 bool tests() { - { - typedef std::vector T; - typedef std::hash H; +template +TEST_CONSTEXPR_CXX20 void test() { + typedef std::hash H; #if TEST_STD_VER <= 14 - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); + static_assert((std::is_same::value), ""); + static_assert((std::is_same::value), ""); #endif - ASSERT_NOEXCEPT(H()(T())); - - bool ba[] = {true, false, true, true, false}; - T vb(std::begin(ba), std::end(ba)); - H h; - assert(h(vb) != 0); + ASSERT_NOEXCEPT(H()(VB())); + + bool ba[] = {true, false, true, true, false}; + VB vb(std::begin(ba), std::end(ba)); + H h; + if (!TEST_IS_CONSTANT_EVALUATED) { + const std::size_t hash_value = h(vb); + assert(h(vb) == hash_value); + LIBCPP_ASSERT(hash_value != 0); } +} + +TEST_CONSTEXPR_CXX20 bool tests() { + test >(); #if TEST_STD_VER >= 11 - { - typedef std::vector> T; - typedef std::hash H; -# if TEST_STD_VER <= 14 - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); -# endif - ASSERT_NOEXCEPT(H()(T())); - bool ba[] = {true, false, true, true, false}; - T vb(std::begin(ba), std::end(ba)); - H h; - assert(h(vb) != 0); - } + test>>(); #endif return true; diff --git a/libcxx/test/std/localization/codecvt_unicode.pass.cpp b/libcxx/test/std/localization/codecvt_unicode.pass.cpp index e54c0c2a4610a..fc5625d8ce4e9 100644 --- a/libcxx/test/std/localization/codecvt_unicode.pass.cpp +++ b/libcxx/test/std/localization/codecvt_unicode.pass.cpp @@ -484,7 +484,7 @@ template void utf8_to_utf16_in_ok(const std::codecvt& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA"; - const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; static_assert(array_size(input) == 
11, ""); static_assert(array_size(expected) == 6, ""); @@ -549,7 +549,7 @@ template void utf8_to_utf16_in_partial(const std::codecvt& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA"; - const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; static_assert(array_size(input) == 11, ""); static_assert(array_size(expected) == 6, ""); @@ -618,7 +618,7 @@ template void utf8_to_utf16_in_error(const std::codecvt& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP const unsigned char input[] = "b\u0448\uD700\U0010AAAA"; - const char16_t expected[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0}; + const InternT expected[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0}; static_assert(array_size(input) == 11, ""); static_assert(array_size(expected) == 6, ""); @@ -765,7 +765,7 @@ void utf8_to_utf16_in(const std::codecvt& cvt) { template void utf16_to_utf8_out_ok(const std::codecvt& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP - const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; static_assert(array_size(input) == 6, ""); static_assert(array_size(expected) == 11, ""); @@ -801,7 +801,7 @@ void utf16_to_utf8_out_ok(const std::codecvt& cvt) template void utf16_to_utf8_out_partial(const std::codecvt& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP - const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; static_assert(array_size(input) == 6, ""); static_assert(array_size(expected) == 11, ""); @@ -860,7 +860,7 @@ void utf16_to_utf8_out_partial(const std::codecvt& template void 
utf16_to_utf8_out_error(const std::codecvt& cvt) { // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP - const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; + const InternT input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0}; const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA"; static_assert(array_size(input) == 6, ""); static_assert(array_size(expected) == 11, ""); diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp index c34e864220e12..86a08ee32cb45 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp @@ -33,6 +33,6 @@ int main(int, char**) { assert(from_next - from == 9); assert(to_next - to == 9); for (unsigned i = 0; i < 9; ++i) - assert(to[i] == from[i]); + assert(to[i] == static_cast(from[i])); return 0; } diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp index c39e64de7a59f..d5c0c3cf31244 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp @@ -34,6 +34,6 @@ int main(int, char**) { assert(from_next - from == 9); assert(to_next - to == 9); for (unsigned i = 0; i < 9; ++i) - assert(to[i] == from[i]); + assert(static_cast(to[i]) == from[i]); return 
0; } diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp index e848f8a10912e..e6af982c10e99 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp @@ -33,6 +33,6 @@ int main(int, char**) { assert(from_next - from == 9); assert(to_next - to == 9); for (unsigned i = 0; i < 9; ++i) - assert(to[i] == from[i]); + assert(to[i] == static_cast(from[i])); return 0; } diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp index 7a31c9ef10558..3cf46a436e2e7 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp @@ -34,6 +34,6 @@ int main(int, char**) { assert(from_next - from == 9); assert(to_next - to == 9); for (unsigned i = 0; i < 9; ++i) - assert(to[i] == from[i]); + assert(static_cast(to[i]) == from[i]); return 0; } diff --git a/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp b/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp index e3bc9c3c100d4..971fcd68cc8e6 100644 --- 
a/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp +++ b/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp @@ -19,9 +19,9 @@ #ifndef TEST_HAS_NO_CHAR8_T constexpr bool test_constexpr() { - char8_t c = u'1'; + char8_t c = u8'1'; std::char_traits::assign(c, u'a'); - return c == u'a'; + return c == u8'a'; } int main(int, char**) { diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 5c809fafe2cf5..d8b23be9a0323 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -147,6 +147,7 @@ function generate-cmake() { generate-cmake-base \ -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \ -DLIBCXX_CXX_ABI=libcxxabi \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ "${@}" } diff --git a/libcxx/utils/gdb/libcxx/printers.py b/libcxx/utils/gdb/libcxx/printers.py index 31c27a1959cb2..e3d5d87aca325 100644 --- a/libcxx/utils/gdb/libcxx/printers.py +++ b/libcxx/utils/gdb/libcxx/printers.py @@ -673,7 +673,7 @@ def display_hint(self): return "map" def _get_key_value(self, node): - key_value = _cc_field(node.cast(self.util.cast_type).dereference()) + key_value = node.cast(self.util.cast_type).dereference()["__value_"] return [key_value["first"], key_value["second"]] @@ -738,7 +738,7 @@ def __init__(self, val): self._initialize(val["__i_"], _remove_generics(_prettify_typename(val.type))) def _get_node_value(self, node): - return _cc_field(node) + return node["__value_"] class SetIteratorPrinter(AbstractRBTreeIteratorPrinter): diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index 10fc4b0afde6b..74746e37d3bc4 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -144,6 +144,10 @@ def _mingwSupportsModules(cfg): when=lambda cfg: hasCompileFlag(cfg, "-Wuser-defined-warnings"), actions=[AddCompileFlag("-Wuser-defined-warnings")], ), + 
Feature( + name="character-conversion-warnings", + when=lambda cfg: hasCompileFlag(cfg, "-Wcharacter-conversion"), + ), # Tests to validate whether the compiler has a way to set the maximum number # of steps during constant evaluation. Since the flag differs per compiler # store the "valid" flag as a feature. This allows passing the proper compile diff --git a/libcxx/utils/sym_diff.py b/libcxx/utils/sym_diff.py index 8eaf8b7a57591..8d2ff14c65429 100755 --- a/libcxx/utils/sym_diff.py +++ b/libcxx/utils/sym_diff.py @@ -80,6 +80,11 @@ def main(): old_syms_list, _ = util.filter_stdlib_symbols(old_syms_list) new_syms_list, _ = util.filter_stdlib_symbols(new_syms_list) + for symbol in new_syms_list: + if symbol["is_defined"] and 'B' in symbol["name"]: + print(f"Symbol {symbol['name']} contains an ABI tag!") + sys.exit(1) + added, removed, changed = diff.diff(old_syms_list, new_syms_list) if args.removed_only: added = {} diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h index 6acefeea8484b..4e7f92dd1991a 100644 --- a/libcxxabi/src/demangle/ItaniumDemangle.h +++ b/libcxxabi/src/demangle/ItaniumDemangle.h @@ -21,6 +21,7 @@ #include "Utility.h" #include #include +#include #include #include #include @@ -164,18 +165,18 @@ class NodeArray; // traversed by the printLeft/Right functions to produce a demangled string. class Node { public: - enum Kind : unsigned char { + enum Kind : uint8_t { #define NODE(NodeKind) K##NodeKind, #include "ItaniumNodes.def" }; /// Three-way bool to track a cached value. Unknown is possible if this node /// has an unexpanded parameter pack below it that may affect this cache. - enum class Cache : unsigned char { Yes, No, Unknown, }; + enum class Cache : uint8_t { Yes, No, Unknown, }; /// Operator precedence for expression nodes. Used to determine required /// parens in expression emission. 
- enum class Prec { + enum class Prec : uint8_t { Primary, Postfix, Unary, diff --git a/lld/COFF/COFFLinkerContext.h b/lld/COFF/COFFLinkerContext.h index 2c5f6415e5d4b..f45b754384ef9 100644 --- a/lld/COFF/COFFLinkerContext.h +++ b/lld/COFF/COFFLinkerContext.h @@ -50,6 +50,14 @@ class COFFLinkerContext : public CommonLinkerContext { f(symtab); } + // Invoke the specified callback for each active symbol table, + // skipping the native symbol table on pure ARM64EC targets. + void forEachActiveSymtab(std::function f) { + if (symtab.ctx.config.machine == ARM64X) + f(*hybridSymtab); + f(symtab); + } + std::vector objFileInstances; std::map pdbInputFileInstances; std::vector importFileInstances; diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index ff2bc40932c04..01752cdc6a9da 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -580,7 +580,7 @@ void SectionChunk::getBaserels(std::vector *res) { // to match the value in the EC load config, which is expected to be // a relocatable pointer to the __chpe_metadata symbol. COFFLinkerContext &ctx = file->symtab.ctx; - if (ctx.hybridSymtab && ctx.hybridSymtab->loadConfigSym && + if (ctx.config.machine == ARM64X && ctx.hybridSymtab->loadConfigSym && ctx.hybridSymtab->loadConfigSym->getChunk() == this && ctx.symtab.loadConfigSym && ctx.hybridSymtab->loadConfigSize >= diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp index 0440507b71756..c327da28ce138 100644 --- a/lld/COFF/DLL.cpp +++ b/lld/COFF/DLL.cpp @@ -560,7 +560,8 @@ class TailMergeChunkARM64 : public NonSectionCodeChunk { memcpy(buf, tailMergeARM64, sizeof(tailMergeARM64)); applyArm64Addr(buf + 44, desc->getRVA(), rva + 44, 12); applyArm64Imm(buf + 48, desc->getRVA() & 0xfff, 0); - applyArm64Branch26(buf + 52, helper->getRVA() - rva - 52); + if (helper) + applyArm64Branch26(buf + 52, helper->getRVA() - rva - 52); } Chunk *desc = nullptr; @@ -781,6 +782,7 @@ void IdataContents::create(COFFLinkerContext &ctx) { // ordinal values to the table. 
size_t base = lookups.size(); Chunk *lookupsTerminator = nullptr, *addressesTerminator = nullptr; + uint32_t nativeOnly = 0; for (DefinedImportData *s : syms) { uint16_t ord = s->getOrdinal(); HintNameChunk *hintChunk = nullptr; @@ -806,8 +808,8 @@ void IdataContents::create(COFFLinkerContext &ctx) { // the native terminator, they will be ignored in the native view. // In the EC view, they should act as terminators, so emit ZEROFILL // relocations overriding them. - if (ctx.hybridSymtab && !lookupsTerminator && s->file->isEC() && - !s->file->hybridFile) { + if (ctx.config.machine == ARM64X && !lookupsTerminator && + s->file->isEC() && !s->file->hybridFile) { lookupsTerminator = lookupsChunk; addressesTerminator = addressesChunk; lookupsChunk = make(ctx); @@ -841,6 +843,7 @@ void IdataContents::create(COFFLinkerContext &ctx) { // Fill the auxiliary IAT with null chunks for native-only imports. auxIat.push_back(make(ctx)); auxIatCopy.push_back(make(ctx)); + ++nativeOnly; } } // Terminate with null values. @@ -862,18 +865,15 @@ void IdataContents::create(COFFLinkerContext &ctx) { // Create the import table header. dllNames.push_back(make(syms[0]->getDLLName())); auto *dir = make(dllNames.back()); - dir->lookupTab = lookups[base]; - dir->addressTab = addresses[base]; - dirs.push_back(dir); - if (ctx.hybridSymtab) { - // If native-only imports exist, they will appear as a prefix to all - // imports. Emit ARM64X relocations to skip them in the EC view. - uint32_t nativeOnly = - llvm::find_if(syms, - [](DefinedImportData *s) { return s->file->isEC(); }) - - syms.begin(); - if (nativeOnly) { + if (ctx.hybridSymtab && nativeOnly) { + if (ctx.config.machine != ARM64X) + // On pure ARM64EC targets, skip native-only imports in the import + // directory. + base += nativeOnly; + else if (nativeOnly) { + // If native-only imports exist, they will appear as a prefix to all + // imports. Emit ARM64X relocations to skip them in the EC view. 
ctx.dynamicRelocs->add( IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA, 0, Arm64XRelocVal( @@ -886,6 +886,10 @@ void IdataContents::create(COFFLinkerContext &ctx) { nativeOnly * sizeof(uint64_t)); } } + + dir->lookupTab = lookups[base]; + dir->addressTab = addresses[base]; + dirs.push_back(dir); } // Add null terminator. dirs.push_back(make(sizeof(ImportDirectoryTableEntry), 4)); @@ -922,21 +926,25 @@ void DelayLoadContents::create() { size_t base = addresses.size(); ctx.forEachSymtab([&](SymbolTable &symtab) { - if (ctx.hybridSymtab && symtab.isEC()) { - // For hybrid images, emit null-terminated native import entries - // followed by null-terminated EC entries. If a view is missing imports - // for a given module, only terminators are emitted. Emit ARM64X - // relocations to skip native entries in the EC view. - ctx.dynamicRelocs->add( - IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA, 0, - Arm64XRelocVal(dir, offsetof(delay_import_directory_table_entry, - DelayImportAddressTable)), - (addresses.size() - base) * sizeof(uint64_t)); - ctx.dynamicRelocs->add( - IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA, 0, - Arm64XRelocVal(dir, offsetof(delay_import_directory_table_entry, - DelayImportNameTable)), - (addresses.size() - base) * sizeof(uint64_t)); + if (symtab.isEC()) { + if (ctx.config.machine == ARM64X) { + // For hybrid images, emit null-terminated native import entries + // followed by null-terminated EC entries. If a view is missing + // imports for a given module, only terminators are emitted. Emit + // ARM64X relocations to skip native entries in the EC view. 
+ ctx.dynamicRelocs->add( + IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA, 0, + Arm64XRelocVal(dir, offsetof(delay_import_directory_table_entry, + DelayImportAddressTable)), + (addresses.size() - base) * sizeof(uint64_t)); + ctx.dynamicRelocs->add( + IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA, 0, + Arm64XRelocVal(dir, offsetof(delay_import_directory_table_entry, + DelayImportNameTable)), + (addresses.size() - base) * sizeof(uint64_t)); + } else { + base = addresses.size(); + } } Chunk *tm = nullptr; @@ -981,7 +989,7 @@ void DelayLoadContents::create() { chunk = make(s->file); auxIatCopy.push_back(chunk); s->file->auxImpCopySym->setLocation(chunk); - } else if (ctx.hybridSymtab) { + } else if (ctx.config.machine == ARM64X) { // Fill the auxiliary IAT with null chunks for native imports. auxIat.push_back(make(ctx)); auxIatCopy.push_back(make(ctx)); @@ -995,6 +1003,10 @@ void DelayLoadContents::create() { symtab.addSynthetic(tmName, tm); } + // Skip terminators on pure ARM64EC target if there are no native imports. + if (!tm && !symtab.isEC() && ctx.config.machine != ARM64X) + return; + // Terminate with null values. 
addresses.push_back(make(ctx, 8)); names.push_back(make(ctx, 8)); @@ -1024,7 +1036,7 @@ void DelayLoadContents::create() { } Chunk *DelayLoadContents::newTailMergeChunk(SymbolTable &symtab, Chunk *dir) { - auto helper = cast(symtab.delayLoadHelper); + auto helper = cast_or_null(symtab.delayLoadHelper); switch (symtab.machine) { case AMD64: case ARM64EC: diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 4c296da35d667..13e716d9958a0 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -190,7 +190,6 @@ static bool compatibleMachineType(COFFLinkerContext &ctx, MachineTypes mt) { case ARM64: return mt == ARM64 || mt == ARM64X; case ARM64EC: - return isArm64EC(mt) || mt == AMD64; case ARM64X: return isAnyArm64(mt) || mt == AMD64; case IMAGE_FILE_MACHINE_UNKNOWN: @@ -492,6 +491,12 @@ void LinkerDriver::parseDirectives(InputFile *file) { case OPT_alternatename: file->symtab.parseAlternateName(arg->getValue()); break; + case OPT_arm64xsameaddress: + if (!file->symtab.isEC()) + Warn(ctx) << arg->getSpelling() + << " is not allowed in non-ARM64EC files (" << toString(file) + << ")"; + break; case OPT_defaultlib: if (std::optional path = findLibIfNew(arg->getValue())) enqueuePath(*path, false, false); @@ -499,7 +504,7 @@ void LinkerDriver::parseDirectives(InputFile *file) { case OPT_entry: if (!arg->getValue()[0]) Fatal(ctx) << "missing entry point symbol name"; - ctx.forEachSymtab([&](SymbolTable &symtab) { + ctx.forEachActiveSymtab([&](SymbolTable &symtab) { symtab.entry = symtab.addGCRoot(symtab.mangle(arg->getValue()), true); }); break; @@ -657,9 +662,13 @@ void LinkerDriver::setMachine(MachineTypes machine) { ctx.config.machine = machine; - if (machine != ARM64X) { + if (!isArm64EC(machine)) { ctx.symtab.machine = machine; } else { + // Set up a hybrid symbol table on ARM64EC/ARM64X. This is primarily useful + // on ARM64X, where both the native and EC symbol tables are meaningful. 
+ // However, since ARM64EC can include native object files, we also need to + // support a hybrid symbol table there. ctx.symtab.machine = ARM64EC; ctx.hybridSymtab.emplace(ctx, ARM64); } @@ -979,7 +988,7 @@ void LinkerDriver::createImportLibrary(bool asLib) { }; getExports(ctx.symtab, exports); - if (ctx.hybridSymtab) + if (ctx.config.machine == ARM64X) getExports(*ctx.hybridSymtab, nativeExports); std::string libName = getImportName(asLib); @@ -1383,13 +1392,13 @@ void LinkerDriver::maybeExportMinGWSymbols(const opt::InputArgList &args) { return; if (ctx.symtab.hadExplicitExports || - (ctx.hybridSymtab && ctx.hybridSymtab->hadExplicitExports)) + (ctx.config.machine == ARM64X && ctx.hybridSymtab->hadExplicitExports)) return; if (args.hasArg(OPT_exclude_all_symbols)) return; } - ctx.forEachSymtab([&](SymbolTable &symtab) { + ctx.forEachActiveSymtab([&](SymbolTable &symtab) { AutoExporter exporter(symtab, excludedSymbols); for (auto *arg : args.filtered(OPT_wholearchive_file)) @@ -2305,7 +2314,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (auto *arg = args.getLastArg(OPT_deffile)) { // parseModuleDefs mutates Config object. ctx.symtab.parseModuleDefs(arg->getValue()); - if (ctx.hybridSymtab) { + if (ctx.config.machine == ARM64X) { // MSVC ignores the /defArm64Native argument on non-ARM64X targets. // It is also ignored if the /def option is not specified. 
if (auto *arg = args.getLastArg(OPT_defarm64native)) @@ -2332,7 +2341,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } // Handle /entry and /dll - ctx.forEachSymtab([&](SymbolTable &symtab) { + ctx.forEachActiveSymtab([&](SymbolTable &symtab) { llvm::TimeTraceScope timeScope("Entry point"); if (auto *arg = args.getLastArg(OPT_entry)) { if (!arg->getValue()[0]) @@ -2364,7 +2373,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { llvm::TimeTraceScope timeScope("Delay load"); for (auto *arg : args.filtered(OPT_delayload)) { config->delayLoads.insert(StringRef(arg->getValue()).lower()); - ctx.forEachSymtab([&](SymbolTable &symtab) { + ctx.forEachActiveSymtab([&](SymbolTable &symtab) { if (symtab.machine == I386) { symtab.delayLoadHelper = symtab.addGCRoot("___delayLoadHelper2@8"); } else { @@ -2538,7 +2547,9 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { u->setWeakAlias(symtab.addUndefined(to)); } } + }); + ctx.forEachActiveSymtab([&](SymbolTable &symtab) { // If any inputs are bitcode files, the LTO code generator may create // references to library functions that are not explicit in the bitcode // file's symbol table. If any of those library functions are defined in @@ -2568,7 +2579,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // Handle /includeglob for (StringRef pat : args::getStrings(args, OPT_incl_glob)) - ctx.forEachSymtab( + ctx.forEachActiveSymtab( [&](SymbolTable &symtab) { symtab.addUndefinedGlob(pat); }); // Create wrapped symbols for -wrap option. @@ -2685,12 +2696,12 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // need to create a .lib file. In MinGW mode, we only do that when the // -implib option is given explicitly, for compatibility with GNU ld. 
if (config->dll || !ctx.symtab.exports.empty() || - (ctx.hybridSymtab && !ctx.hybridSymtab->exports.empty())) { + (ctx.config.machine == ARM64X && !ctx.hybridSymtab->exports.empty())) { llvm::TimeTraceScope timeScope("Create .lib exports"); - ctx.forEachSymtab([](SymbolTable &symtab) { symtab.fixupExports(); }); + ctx.forEachActiveSymtab([](SymbolTable &symtab) { symtab.fixupExports(); }); if (!config->noimplib && (!config->mingw || !config->implib.empty())) createImportLibrary(/*asLib=*/false); - ctx.forEachSymtab( + ctx.forEachActiveSymtab( [](SymbolTable &symtab) { symtab.assignExportOrdinals(); }); } @@ -2756,7 +2767,8 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (ctx.symtab.isEC()) ctx.symtab.initializeECThunks(); - ctx.forEachSymtab([](SymbolTable &symtab) { symtab.initializeLoadConfig(); }); + ctx.forEachActiveSymtab( + [](SymbolTable &symtab) { symtab.initializeLoadConfig(); }); // Identify unreferenced COMDAT sections. if (config->doGC) { diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 7fb42bb681939..e10b6419b5ad5 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -137,10 +137,8 @@ void ArchiveFile::parse() { ctx.symtab.addLazyArchive(this, sym); // Read both EC and native symbols on ARM64X. - if (!ctx.hybridSymtab) - return; archiveSymtab = &*ctx.hybridSymtab; - } else if (ctx.hybridSymtab) { + } else { // If the ECSYMBOLS section is missing in the archive, the archive could // be either a native-only ARM64 or x86_64 archive. 
Check the machine type // of the object containing a symbol to determine which symbol table to diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td index 4e401a5fd1d6d..a887d7d351e18 100644 --- a/lld/COFF/Options.td +++ b/lld/COFF/Options.td @@ -356,3 +356,4 @@ def tlbid : P_priv<"tlbid">; def tlbout : P_priv<"tlbout">; def verbose_all : P_priv<"verbose">; def guardsym : P_priv<"guardsym">; +def arm64xsameaddress : P_priv<"arm64xsameaddress">; diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 8fb0ee4e890d6..d6f771284aa83 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -551,7 +551,7 @@ void SymbolTable::initializeLoadConfig() { Warn(ctx) << "EC version of '_load_config_used' is missing"; return; } - if (ctx.hybridSymtab) { + if (ctx.config.machine == ARM64X) { Warn(ctx) << "native version of '_load_config_used' is missing for " "ARM64X target"; return; diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index f3cf4902e6ecf..db6133e20a037 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1374,7 +1374,7 @@ void Writer::createExportTable() { } } } - ctx.forEachSymtab([&](SymbolTable &symtab) { + ctx.forEachActiveSymtab([&](SymbolTable &symtab) { if (symtab.edataStart) { if (symtab.hadExplicitExports) Warn(ctx) << "literal .edata sections override exports"; @@ -1776,7 +1776,8 @@ template void Writer::writeHeader() { assert(coffHeaderOffset == buf - buffer->getBufferStart()); auto *coff = reinterpret_cast(buf); buf += sizeof(*coff); - SymbolTable &symtab = ctx.hybridSymtab ? *ctx.hybridSymtab : ctx.symtab; + SymbolTable &symtab = + ctx.config.machine == ARM64X ? *ctx.hybridSymtab : ctx.symtab; coff->Machine = symtab.isEC() ? AMD64 : symtab.machine; coff->NumberOfSections = ctx.outputSections.size(); coff->Characteristics = IMAGE_FILE_EXECUTABLE_IMAGE; @@ -2433,7 +2434,7 @@ void Writer::setECSymbols() { return a.first->getRVA() < b.first->getRVA(); }); - ChunkRange &chpePdata = ctx.hybridSymtab ? 
hybridPdata : pdata; + ChunkRange &chpePdata = ctx.config.machine == ARM64X ? hybridPdata : pdata; Symbol *rfeTableSym = ctx.symtab.findUnderscore("__arm64x_extra_rfe_table"); replaceSymbol(rfeTableSym, "__arm64x_extra_rfe_table", chpePdata.first); @@ -2478,7 +2479,7 @@ void Writer::setECSymbols() { delayIdata.getAuxIatCopy().empty() ? nullptr : delayIdata.getAuxIatCopy().front()); - if (ctx.hybridSymtab) { + if (ctx.config.machine == ARM64X) { // For the hybrid image, set the alternate entry point to the EC entry // point. In the hybrid view, it is swapped to the native entry point // using ARM64X relocations. @@ -2868,7 +2869,7 @@ void Writer::fixTlsAlignment() { } void Writer::prepareLoadConfig() { - ctx.forEachSymtab([&](SymbolTable &symtab) { + ctx.forEachActiveSymtab([&](SymbolTable &symtab) { if (!symtab.loadConfigSym) return; @@ -2928,7 +2929,7 @@ void Writer::prepareLoadConfig(SymbolTable &symtab, T *loadConfig) { IF_CONTAINS(CHPEMetadataPointer) { // On ARM64X, only the EC version of the load config contains // CHPEMetadataPointer. Copy its value to the native load config. - if (ctx.hybridSymtab && !symtab.isEC() && + if (ctx.config.machine == ARM64X && !symtab.isEC() && ctx.symtab.loadConfigSize >= offsetof(T, CHPEMetadataPointer) + sizeof(T::CHPEMetadataPointer)) { OutputSection *sec = diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index e667fdc0633c5..e45dd4d354afb 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -1489,7 +1489,7 @@ template void elf::writeARMCmseImportLib(Ctx &ctx) { const uint64_t fileSize = sectionHeaderOff + shnum * sizeof(typename ELFT::Shdr); const unsigned flags = - ctx.arg.mmapOutputFile ? 0 : (unsigned)FileOutputBuffer::F_no_mmap; + ctx.arg.mmapOutputFile ? 
(unsigned)FileOutputBuffer::F_mmap : 0; unlinkAsync(ctx.arg.cmseOutputLib); Expected> bufferOrErr = FileOutputBuffer::create(ctx.arg.cmseOutputLib, fileSize, flags); diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index e8acdbefa32bb..76a37b706c5fa 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1421,7 +1421,7 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) { ctx.arg.mergeArmExidx = args.hasFlag(OPT_merge_exidx_entries, OPT_no_merge_exidx_entries, true); ctx.arg.mmapOutputFile = - args.hasFlag(OPT_mmap_output_file, OPT_no_mmap_output_file, true); + args.hasFlag(OPT_mmap_output_file, OPT_no_mmap_output_file, false); ctx.arg.nmagic = args.hasFlag(OPT_nmagic, OPT_no_nmagic, false); ctx.arg.noinhibitExec = args.hasArg(OPT_noinhibit_exec); ctx.arg.nostdlib = args.hasArg(OPT_nostdlib); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index e2aebff20e174..6a0552e808c7b 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2908,8 +2908,8 @@ template void Writer::openFile() { unsigned flags = 0; if (!ctx.arg.relocatable) flags |= FileOutputBuffer::F_executable; - if (!ctx.arg.mmapOutputFile) - flags |= FileOutputBuffer::F_no_mmap; + if (ctx.arg.mmapOutputFile) + flags |= FileOutputBuffer::F_mmap; Expected> bufferOrErr = FileOutputBuffer::create(ctx.arg.outputFile, fileSize, flags); diff --git a/lld/docs/ELF/warn_backrefs.rst b/lld/docs/ELF/warn_backrefs.rst index fac2145cc0c0e..bb9d86ce685c0 100644 --- a/lld/docs/ELF/warn_backrefs.rst +++ b/lld/docs/ELF/warn_backrefs.rst @@ -11,8 +11,8 @@ so far. When encountering an archive or an object file surrounded by symbol definitions; this may result in input files being loaded, updating the set of undefined symbol references. When all resolving definitions have been loaded from the archive, the linker moves on the next file and will not return -to it. This means that if an input file to the right of a archive cannot have -an undefined symbol resolved by a archive to the left of it. 
For example: +to it. This means that if an input file to the right of an archive cannot have +an undefined symbol resolved by an archive to the left of it. For example: ld def.a ref.o diff --git a/lld/docs/windows_support.rst b/lld/docs/windows_support.rst index e4640b4a5259a..38688a9629227 100644 --- a/lld/docs/windows_support.rst +++ b/lld/docs/windows_support.rst @@ -86,7 +86,7 @@ MSBuild.exe had been shipped as a component of the .NET framework, but since 2013 it's part of Visual Studio. You can find it at "C:\\Program Files (x86)\\msbuild". -You can build LLD as a 64 bit application. To do that, open VS2013 x64 command +You can build LLD as a 64-bit application. To do that, open VS2013 x64 command prompt and run cmake for "Visual Studio 12 Win64" target. Using Ninja diff --git a/lld/test/COFF/arm64ec-entry-mangle.test b/lld/test/COFF/arm64ec-entry-mangle.test index 6db16ef218dc8..1f029077ba51d 100644 --- a/lld/test/COFF/arm64ec-entry-mangle.test +++ b/lld/test/COFF/arm64ec-entry-mangle.test @@ -97,7 +97,7 @@ RUN: not lld-link -machine:arm64ec -dll -out:test.dll demangled-func.obj loadcon RUN: "-entry:#func" 2>&1 | FileCheck -check-prefix=FUNC-NOT-FOUND %s RUN: not lld-link -machine:arm64ec -dll -out:test.dll demangled-func.obj loadconfig-arm64ec.obj \ RUN: -noentry "-export:#func" 2>&1 | FileCheck -check-prefix=FUNC-NOT-FOUND %s -FUNC-NOT-FOUND: undefined symbol: #func +FUNC-NOT-FOUND: undefined symbol: #func (EC symbol) Verify that the linker recognizes the demangled x86_64 _DllMainCRTStartup. 
RUN: lld-link -machine:arm64ec -dll -out:test.dll x64-dll-main.obj loadconfig-arm64ec.obj diff --git a/lld/test/COFF/arm64ec-hybmp.s b/lld/test/COFF/arm64ec-hybmp.s index 5fc24d4250704..670ee3926ab5c 100644 --- a/lld/test/COFF/arm64ec-hybmp.s +++ b/lld/test/COFF/arm64ec-hybmp.s @@ -62,7 +62,7 @@ thunk: // RUN: llvm-mc -filetype=obj -triple=arm64ec-windows undef-func.s -o undef-func.obj // RUN: not lld-link -machine:arm64ec -dll -noentry -out:test.dll undef-func.obj 2>&1 | FileCheck -check-prefix=UNDEF-FUNC %s -// UNDEF-FUNC: error: undefined symbol: func +// UNDEF-FUNC: error: undefined symbol: func (EC symbol) #--- undef-thunk.s .section .text,"xr",discard,func @@ -79,7 +79,7 @@ func: // RUN: llvm-mc -filetype=obj -triple=arm64ec-windows undef-thunk.s -o undef-thunk.obj // RUN: not lld-link -machine:arm64ec -dll -noentry -out:test.dll undef-thunk.obj 2>&1 | FileCheck -check-prefix=UNDEF-THUNK %s -// UNDEF-THUNK: error: undefined symbol: thunk +// UNDEF-THUNK: error: undefined symbol: thunk (EC symbol) #--- invalid-type.s .section .text,"xr",discard,func diff --git a/lld/test/COFF/arm64ec-lib.test b/lld/test/COFF/arm64ec-lib.test index 8698a5ceccbe7..1e6fa60209d94 100644 --- a/lld/test/COFF/arm64ec-lib.test +++ b/lld/test/COFF/arm64ec-lib.test @@ -29,11 +29,13 @@ RUN: lld-link -machine:arm64ec -dll -noentry -out:test2.dll symref-arm64ec.obj s Verify that both native and EC symbols can be referenced in a hybrid target. RUN: lld-link -machine:arm64x -dll -noentry -out:test3.dll symref-arm64ec.obj nsymref-aarch64.obj sym-arm64ec.lib \ RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj +RUN: lld-link -machine:arm64ec -dll -noentry -out:test3ec.dll symref-arm64ec.obj nsymref-aarch64.obj sym-arm64ec.lib \ +RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj Ensure that an EC symbol is not resolved using a regular symbol map. 
RUN: not lld-link -machine:arm64ec -dll -noentry -out:test-err.dll nsymref-arm64ec.obj sym-arm64ec.lib loadconfig-arm64ec.obj 2>&1 |\ RUN: FileCheck --check-prefix=ERR %s -ERR: error: undefined symbol: nsym +ERR: error: undefined symbol: nsym (EC symbol) Verify that a library symbol can be referenced, even if its name conflicts with an anti-dependency alias. RUN: lld-link -machine:arm64ec -dll -noentry -out:ref-alias-1.dll ref-alias.obj func.lib loadconfig-arm64ec.obj diff --git a/lld/test/COFF/arm64ec-patchable-thunks.test b/lld/test/COFF/arm64ec-patchable-thunks.test index 1e1ff1f7f2ee4..593405775ba87 100644 --- a/lld/test/COFF/arm64ec-patchable-thunks.test +++ b/lld/test/COFF/arm64ec-patchable-thunks.test @@ -57,7 +57,7 @@ RUN: llvm-readobj --coff-load-config test3.dll | FileCheck -check-prefix=PATCH-C RUN: not lld-link -out:test4.dll -machine:arm64ec test-sec.obj loadconfig-arm64ec.obj -dll -noentry 2>&1 | FileCheck --check-prefix=ERR %s -ERR: error: undefined symbol: EXP+#patchable_func +ERR: error: undefined symbol: EXP+#patchable_func (EC symbol) RUN: lld-link -out:testx.dll -machine:arm64x arm64ec-patchable.obj test-sec.obj \ diff --git a/lld/test/COFF/arm64ec-range-thunks.s b/lld/test/COFF/arm64ec-range-thunks.s index dcfa6365b4e3a..955e19c30c2ae 100644 --- a/lld/test/COFF/arm64ec-range-thunks.s +++ b/lld/test/COFF/arm64ec-range-thunks.s @@ -79,7 +79,11 @@ # RUN: -out:testx2.dll -verbose 2>&1 | FileCheck -check-prefix=VERBOSEX %s # VERBOSEX: Added 5 thunks with margin {{.*}} in 1 passes +# RUN: lld-link -machine:arm64ec -noentry -dll funcs-arm64ec.obj funcs-aarch64.obj loadconfig-arm64.obj loadconfig-arm64ec.obj \ +# RUN: -out:testx2ec.dll -verbose 2>&1 | FileCheck -check-prefix=VERBOSEX %s + # RUN: llvm-objdump -d testx2.dll | FileCheck --check-prefix=DISASMX %s +# RUN: llvm-objdump -d testx2ec.dll | FileCheck --check-prefix=DISASMX %s # DISASMX: Disassembly of section .code1: # DISASMX-EMPTY: @@ -126,6 +130,7 @@ # DISASMX-NEXT: 180016010: d61f0200 br x16 
# RUN: llvm-readobj --coff-load-config testx2.dll | FileCheck --check-prefix=LOADCFGX2 %s +# RUN: llvm-readobj --coff-load-config testx2ec.dll | FileCheck --check-prefix=LOADCFGX2 %s # LOADCFGX2: CodeMap [ # LOADCFGX2-NEXT: 0x4000 - 0x4014 ARM64EC diff --git a/lld/test/COFF/arm64ec.test b/lld/test/COFF/arm64ec.test index 75288e97e598d..ea92689250063 100644 --- a/lld/test/COFF/arm64ec.test +++ b/lld/test/COFF/arm64ec.test @@ -35,14 +35,15 @@ RUN: llvm-readobj --file-headers test.dll | FileCheck -check-prefix=ARM64X-HEADE RUN: llvm-readobj --hex-dump=.data test.dll | FileCheck -check-prefix=ARM64X-DATA %s ARM64X-DATA: 03030303 01010101 02020202 +RUN: lld-link -out:test.dll -machine:arm64ec x86_64-data-sym.obj arm64-data-sym.obj \ +RUN: arm64ec-data-sym.obj arm64x-resource.obj -dll -noentry +RUN: llvm-readobj --file-headers test.dll | FileCheck -check-prefix=ARM64EC-HEADER %s +RUN: llvm-readobj --hex-dump=.data test.dll | FileCheck -check-prefix=ARM64X-DATA %s + RUN: not lld-link -out:test.dll -machine:arm64 arm64-data-sym.obj arm64ec-data-sym.obj \ RUN: -dll -noentry 2>&1 | FileCheck -check-prefix=INCOMPAT1 %s INCOMPAT1: lld-link: error: arm64ec-data-sym.obj: machine type arm64ec conflicts with arm64 -RUN: not lld-link -out:test.dll -machine:arm64ec arm64ec-data-sym.obj arm64-data-sym.obj \ -RUN: -dll -noentry 2>&1 | FileCheck -check-prefix=INCOMPAT2 %s -INCOMPAT2: lld-link: error: arm64-data-sym.obj: machine type arm64 conflicts with arm64ec - RUN: not lld-link -out:test.dll -machine:arm64 arm64-data-sym.obj x86_64-data-sym.obj \ RUN: -dll -noentry 2>&1 | FileCheck -check-prefix=INCOMPAT3 %s INCOMPAT3: lld-link: error: x86_64-data-sym.obj: machine type x64 conflicts with arm64 diff --git a/lld/test/COFF/arm64x-altnames.s b/lld/test/COFF/arm64x-altnames.s index 43a3f89db9a03..db348f85b6b00 100644 --- a/lld/test/COFF/arm64x-altnames.s +++ b/lld/test/COFF/arm64x-altnames.s @@ -10,6 +10,8 @@ // RUN: not lld-link -out:out.dll -machine:arm64x -dll -noentry test-arm64.obj 
test-arm64ec.obj -alternatename:sym=altsym \ // RUN: 2>&1 | FileCheck --check-prefix=ERR-NATIVE %s +// RUN: not lld-link -out:out.dll -machine:arm64ec -dll -noentry test-arm64.obj test-arm64ec.obj -alternatename:sym=altsym \ +// RUN: 2>&1 | FileCheck --check-prefix=ERR-NATIVE %s // ERR-NATIVE-NOT: test-arm64ec.obj // ERR-NATIVE: lld-link: error: undefined symbol: sym (native symbol) @@ -20,9 +22,13 @@ // RUN: not lld-link -out:out.dll -machine:arm64x -dll -noentry test-arm64.obj test-arm64ec.obj drectve-arm64ec.obj \ // RUN: 2>&1 | FileCheck --check-prefix=ERR-NATIVE %s +// RUN: not lld-link -out:out.dll -machine:arm64ec -dll -noentry test-arm64.obj test-arm64ec.obj drectve-arm64ec.obj \ +// RUN: 2>&1 | FileCheck --check-prefix=ERR-NATIVE %s // RUN: not lld-link -out:out.dll -machine:arm64x -dll -noentry test-arm64.obj test-arm64ec.obj drectve-arm64.obj \ // RUN: 2>&1 | FileCheck --check-prefix=ERR-EC %s +// RUN: not lld-link -out:out.dll -machine:arm64ec -dll -noentry test-arm64.obj test-arm64ec.obj drectve-arm64.obj \ +// RUN: 2>&1 | FileCheck --check-prefix=ERR-EC %s // ERR-EC-NOT: test-arm64.obj // ERR-EC: lld-link: error: undefined symbol: sym (EC symbol) diff --git a/lld/test/COFF/arm64x-buildid.s b/lld/test/COFF/arm64x-buildid.s index 99f50073eaa69..857bcae2c0566 100644 --- a/lld/test/COFF/arm64x-buildid.s +++ b/lld/test/COFF/arm64x-buildid.s @@ -6,6 +6,9 @@ # RUN: llvm-readobj --hex-dump=.test %t.dll | FileCheck %s # CHECK: 0x180003000 3c100000 3c100000 +# RUN: lld-link -machine:arm64ec -dll -noentry %t-arm64.obj %t-arm64ec.obj -debug -build-id -Brepro -out:%t-ec.dll +# RUN: llvm-readobj --hex-dump=.test %t-ec.dll | FileCheck %s + .section .test,"dr" .rva __buildid diff --git a/lld/test/COFF/arm64x-comm.s b/lld/test/COFF/arm64x-comm.s index 830e3d3fdaaa1..b950af5b70a44 100644 --- a/lld/test/COFF/arm64x-comm.s +++ b/lld/test/COFF/arm64x-comm.s @@ -8,6 +8,9 @@ // RUN: llvm-readobj --hex-dump=.test %t.dll | FileCheck %s // CHECK: 0x180004000 10200000 18200000 
20200000 28200000 +// RUN: lld-link -machine:arm64ec -lldmingw -dll -noentry -out:%t-ec.dll %t-arm64.obj %t-arm64ec.obj +// RUN: llvm-readobj --hex-dump=.test %t-ec.dll | FileCheck %s + .data .word 0 diff --git a/lld/test/COFF/arm64x-crt-sec.s b/lld/test/COFF/arm64x-crt-sec.s index 5be70a1845f12..45141ec238aea 100644 --- a/lld/test/COFF/arm64x-crt-sec.s +++ b/lld/test/COFF/arm64x-crt-sec.s @@ -17,6 +17,9 @@ // RUN: lld-link -out:out3.dll -machine:arm64x -dll -noentry crt2-amd64.obj crt1-arm64ec.obj crt2-arm64.obj crt1-arm64.obj // RUN: llvm-readobj --hex-dump=.CRT out3.dll | FileCheck %s +// RUN: lld-link -out:out4.dll -machine:arm64ec -dll -noentry crt2-amd64.obj crt1-arm64ec.obj crt2-arm64.obj crt1-arm64.obj +// RUN: llvm-readobj --hex-dump=.CRT out4.dll | FileCheck %s + // CHECK: 0x180002000 01000000 00000000 02000000 00000000 // CHECK-NEXT: 0x180002010 03000000 00000000 11000000 00000000 // CHECK-NEXT: 0x180002020 12000000 00000000 13000000 00000000 diff --git a/lld/test/COFF/arm64x-ctors-sec.s b/lld/test/COFF/arm64x-ctors-sec.s index 283d5f045260d..3295b3f20b8b2 100644 --- a/lld/test/COFF/arm64x-ctors-sec.s +++ b/lld/test/COFF/arm64x-ctors-sec.s @@ -22,6 +22,10 @@ // RUN: ctor2-arm64.obj ctor1-arm64ec.obj ctor2-amd64.obj ctor1-arm64.obj // RUN: llvm-readobj --hex-dump=.rdata --hex-dump=.test out3.dll | FileCheck %s +// RUN: lld-link -out:out4.dll -machine:arm64ec -lldmingw -dll -noentry test-arm64.obj test-arm64ec.obj \ +// RUN: ctor2-arm64.obj ctor1-arm64ec.obj ctor2-amd64.obj ctor1-arm64.obj +// RUN: llvm-readobj --hex-dump=.rdata --hex-dump=.test out4.dll | FileCheck %s + // CHECK: Hex dump of section '.rdata': // CHECK-NEXT: 0x180001000 ffffffff ffffffff 01000000 00000000 // CHECK-NEXT: 0x180001010 02000000 00000000 03000000 00000000 diff --git a/lld/test/COFF/arm64x-guardcf.s b/lld/test/COFF/arm64x-guardcf.s index 750bf0b3862c5..9d307bde276e1 100644 --- a/lld/test/COFF/arm64x-guardcf.s +++ b/lld/test/COFF/arm64x-guardcf.s @@ -16,7 +16,7 @@ // RUN: 
lld-link -dll -noentry -machine:arm64x func-gfids-arm64.obj func-gfids-arm64ec.obj func-amd64.obj -guard:cf -out:out.dll \ // RUN: loadconfig-arm64ec.obj loadconfig-arm64.obj -// RUN: llvm-readobj --coff-load-config out.dll | FileCheck --check-prefix=LOADCFG %s +// RUN: llvm-readobj --coff-load-config out.dll | FileCheck --check-prefixes=LOADCFG,LOADCFGX %s // LOADCFG: LoadConfig [ // LOADCFG: GuardCFFunctionCount: 3 @@ -31,28 +31,36 @@ // LOADCFG-NEXT: 0x180002000 // LOADCFG-NEXT: 0x180003000 // LOADCFG-NEXT: ] -// LOADCFG: HybridObject { -// LOADCFG: LoadConfig [ -// LOADCFG: GuardCFFunctionCount: 3 -// LOADCFG-NEXT: GuardFlags [ (0x10500) -// LOADCFG-NEXT: CF_FUNCTION_TABLE_PRESENT (0x400) -// LOADCFG-NEXT: CF_INSTRUMENTED (0x100) -// LOADCFG-NEXT: CF_LONGJUMP_TABLE_PRESENT (0x10000) -// LOADCFG-NEXT: ] -// LOADCFG: ] -// LOADCFG: GuardFidTable [ -// LOADCFG-NEXT: 0x180001000 -// LOADCFG-NEXT: 0x180002000 -// LOADCFG-NEXT: 0x180003000 -// LOADCFG-NEXT: ] -// LOADCFG: ] +// LOADCFGX: HybridObject { +// LOADCFGX: LoadConfig [ +// LOADCFGX: GuardCFFunctionCount: 3 +// LOADCFG-NEXTX: GuardFlags [ (0x10500) +// LOADCFG-NEXTX: CF_FUNCTION_TABLE_PRESENT (0x400) +// LOADCFG-NEXTX: CF_INSTRUMENTED (0x100) +// LOADCFG-NEXTX: CF_LONGJUMP_TABLE_PRESENT (0x10000) +// LOADCFG-NEXTX: ] +// LOADCFGX: ] +// LOADCFGX: GuardFidTable [ +// LOADCFG-NEXTX: 0x180001000 +// LOADCFG-NEXTX: 0x180002000 +// LOADCFG-NEXTX: 0x180003000 +// LOADCFG-NEXTX: ] +// LOADCFGX: ] + +// RUN: lld-link -dll -noentry -machine:arm64ec func-gfids-arm64.obj func-gfids-arm64ec.obj func-amd64.obj -guard:cf -out:out-ec.dll \ +// RUN: loadconfig-arm64ec.obj loadconfig-arm64.obj +// RUN: llvm-readobj --coff-load-config out-ec.dll | FileCheck --check-prefix=LOADCFG %s // Check that exports from both views are present in CF guard tables. 
// RUN: lld-link -dll -noentry -machine:arm64x func-exp-arm64.obj func-exp-arm64ec.obj -guard:cf -out:out-exp.dll \ // RUN: loadconfig-arm64ec.obj loadconfig-arm64.obj -// RUN: llvm-readobj --coff-load-config out-exp.dll | FileCheck --check-prefix=LOADCFG %s +// RUN: llvm-readobj --coff-load-config out-exp.dll | FileCheck --check-prefixes=LOADCFG,LOADCFGX %s + +// RUN: lld-link -dll -noentry -machine:arm64ec func-exp-arm64.obj func-exp-arm64ec.obj -guard:cf -out:out-exp-ec.dll \ +// RUN: loadconfig-arm64ec.obj loadconfig-arm64.obj +// RUN: llvm-readobj --coff-load-config out-exp-ec.dll | FileCheck --check-prefixes=LOADCFG %s // Check that entry points from both views are present in CF guard tables. diff --git a/lld/test/COFF/arm64x-import.test b/lld/test/COFF/arm64x-import.test index 7441c71d87710..612b5f9b71de1 100644 --- a/lld/test/COFF/arm64x-import.test +++ b/lld/test/COFF/arm64x-import.test @@ -56,7 +56,7 @@ DISASM-12T-NEXT: 180002040: d65f03c0 ret DISASM-12T-NEXT: ... DISASM-12T-NEXT: 180003000: ff 25 fa 0f 00 00 jmpq *0xffa(%rip) # 0x180004000 -RUN: llvm-readobj --coff-imports test-12-thunks.dll | FileCheck --check-prefix=IMPORTS-12 %s +RUN: llvm-readobj --coff-imports test-12-thunks.dll | FileCheck --check-prefixes=IMPORTS-12,IMPORTS-12-EC %s IMPORTS-12: Import { IMPORTS-12-NEXT: Name: test.dll IMPORTS-12-NEXT: ImportLookupTableRVA: 0x5348 @@ -65,13 +65,13 @@ IMPORTS-12-NEXT: Symbol: func1 (0) IMPORTS-12-NEXT: Symbol: func2 (0) IMPORTS-12-NEXT: } IMPORTS-12-NEXT: HybridObject { -IMPORTS-12: Import { -IMPORTS-12-NEXT: Name: test.dll -IMPORTS-12-NEXT: ImportLookupTableRVA: 0x5348 -IMPORTS-12-NEXT: ImportAddressTableRVA: 0x4000 -IMPORTS-12-NEXT: Symbol: func1 (0) -IMPORTS-12-NEXT: Symbol: func2 (0) -IMPORTS-12-NEXT: } +IMPORTS-12-EC: Import { +IMPORTS-12-EC-NEXT: Name: test.dll +IMPORTS-12-EC-NEXT: ImportLookupTableRVA: 0x5348 +IMPORTS-12-EC-NEXT: ImportAddressTableRVA: 0x4000 +IMPORTS-12-EC-NEXT: Symbol: func1 (0) +IMPORTS-12-EC-NEXT: Symbol: func2 (0) 
+IMPORTS-12-EC-NEXT:} IMPORTS-12-NEXT: } RUN: llvm-readobj --hex-dump=.test test-12-thunks.dll | FileCheck --check-prefix=FUNC-12-THUNKS %s @@ -81,6 +81,13 @@ FUNC-12-THUNKS-NEXT: 0x180009010 08600000 08400000 RUN: llvm-readobj --hex-dump=.testa test-12-thunks.dll | FileCheck --check-prefix=FUNC-12-THUNKSA %s FUNC-12-THUNKSA: 0x18000a000 00400000 08400000 00100000 +RUN: lld-link -machine:arm64ec -dll -noentry -out:test-12-thunks-ec.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12-thunks-arm64ec.obj func12-thunks-arm64.obj imp-arm64ec.lib imp-arm64.lib +RUN: llvm-objdump -d test-12-thunks-ec.dll | FileCheck --check-prefix=DISASM-12T %s +RUN: llvm-readobj --hex-dump=.test test-12-thunks-ec.dll | FileCheck --check-prefix=FUNC-12-THUNKS %s +RUN: llvm-readobj --hex-dump=.testa test-12-thunks-ec.dll | FileCheck --check-prefix=FUNC-12-THUNKSA %s +RUN: llvm-readobj --coff-imports test-12-thunks-ec.dll | FileCheck --check-prefix=IMPORTS-12-EC %s + # If the ordinals of named imports don't match, use the EC value. 
@@ -146,7 +153,7 @@ IMPORTS-ORD2-NEXT: } RUN: lld-link -machine:arm64x -dll -noentry -out:test2.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ RUN: icall.obj func12-arm64ec.obj func123-arm64.obj imp-arm64x.lib -RUN: llvm-readobj --coff-imports test2.dll | FileCheck --check-prefix=IMPORTS-123-12 %s +RUN: llvm-readobj --coff-imports test2.dll | FileCheck --check-prefixes=IMPORTS-123-12,IMPORTS-123-12-EC %s IMPORTS-123-12: Import { IMPORTS-123-12-NEXT: Name: test.dll IMPORTS-123-12-NEXT: ImportLookupTableRVA: 0x3338 @@ -156,13 +163,13 @@ IMPORTS-123-12-NEXT: Symbol: func1 (0) IMPORTS-123-12-NEXT: Symbol: func2 (0) IMPORTS-123-12-NEXT: } IMPORTS-123-12-NEXT: HybridObject { -IMPORTS-123-12: Import { -IMPORTS-123-12-NEXT: Name: test.dll -IMPORTS-123-12-NEXT: ImportLookupTableRVA: 0x3340 -IMPORTS-123-12-NEXT: ImportAddressTableRVA: 0x2008 -IMPORTS-123-12-NEXT: Symbol: func1 (0) -IMPORTS-123-12-NEXT: Symbol: func2 (0) -IMPORTS-123-12-NEXT: } +IMPORTS-123-12-EC: Import { +IMPORTS-123-12-EC-NEXT: Name: test.dll +IMPORTS-123-12-EC-NEXT: ImportLookupTableRVA: 0x3340 +IMPORTS-123-12-EC-NEXT: ImportAddressTableRVA: 0x2008 +IMPORTS-123-12-EC-NEXT: Symbol: func1 (0) +IMPORTS-123-12-EC-NEXT: Symbol: func2 (0) +IMPORTS-123-12-EC-NEXT:} IMPORTS-123-12-NEXT: } RUN: llvm-readobj --hex-dump=.test test2.dll | FileCheck --check-prefix=TEST-123-12 %s @@ -175,13 +182,20 @@ RUN: llvm-readobj --hex-dump=.rdata test2.dll | FileCheck --check-prefix=TEST-12 TEST-123-12AUX: 0x180004000 00000000 00000000 08100080 01000000 TEST-123-12AUX-NEXT: 0x180004010 1c100080 01000000 00000000 00000000 +RUN: lld-link -machine:arm64ec -dll -noentry -out:test2-ec.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12-arm64ec.obj func123-arm64.obj imp-arm64x.lib +RUN: llvm-readobj --coff-imports test2-ec.dll | FileCheck --check-prefix=IMPORTS-123-12-EC %s +RUN: llvm-readobj --hex-dump=.test test2-ec.dll | FileCheck --check-prefix=TEST-123-12 %s +RUN: llvm-readobj --hex-dump=.testa 
test2-ec.dll | FileCheck --check-prefix=TEST-123-12A %s +RUN: llvm-readobj --hex-dump=.rdata test2-ec.dll | FileCheck --check-prefix=TEST-123-12AUX %s + # Link to the imported func1 and func2 from both native and EC code, and func3 from EC code. RUN: lld-link -machine:arm64x -dll -noentry -out:func-12-123.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ RUN: icall.obj func123-arm64ec.obj func12-arm64.obj imp-arm64x.lib -RUN: llvm-readobj --coff-imports func-12-123.dll | FileCheck --check-prefix=IMPORTS-12-123 %s +RUN: llvm-readobj --coff-imports func-12-123.dll | FileCheck --check-prefixes=IMPORTS-12-123,IMPORTS-12-123-EC %s IMPORTS-12-123: Import { IMPORTS-12-123-NEXT: Name: test.dll IMPORTS-12-123-NEXT: ImportLookupTableRVA: 0x3338 @@ -190,14 +204,14 @@ IMPORTS-12-123-NEXT: Symbol: func1 (0) IMPORTS-12-123-NEXT: Symbol: func2 (0) IMPORTS-12-123-NEXT: } IMPORTS-12-123-NEXT: HybridObject { -IMPORTS-12-123: Import { -IMPORTS-12-123-NEXT: Name: test.dll -IMPORTS-12-123-NEXT: ImportLookupTableRVA: 0x3338 -IMPORTS-12-123-NEXT: ImportAddressTableRVA: 0x2000 -IMPORTS-12-123-NEXT: Symbol: func1 (0) -IMPORTS-12-123-NEXT: Symbol: func2 (0) -IMPORTS-12-123-NEXT: Symbol: func3 (0) -IMPORTS-12-123-NEXT: } +IMPORTS-12-123-EC: Import { +IMPORTS-12-123-EC-NEXT: Name: test.dll +IMPORTS-12-123-EC-NEXT: ImportLookupTableRVA: 0x3338 +IMPORTS-12-123-EC-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-12-123-EC-NEXT: Symbol: func1 (0) +IMPORTS-12-123-EC-NEXT: Symbol: func2 (0) +IMPORTS-12-123-EC-NEXT: Symbol: func3 (0) +IMPORTS-12-123-EC-NEXT:} IMPORTS-12-123-NEXT: } RUN: llvm-readobj --hex-dump=.test func-12-123.dll | FileCheck --check-prefix=TEST-12-123 %s @@ -211,6 +225,12 @@ RUN: llvm-readobj --hex-dump=.rdata func-12-123.dll | FileCheck --check-prefix=T TEST-12-123AUX: 0x180004000 08100080 01000000 1c100080 01000000 TEST-12-123AUX-NEXT: 0x180004010 30100080 01000000 00000000 00000000 +RUN: lld-link -machine:arm64ec -dll -noentry -out:func-12-123-ec.dll loadconfig-arm64.obj 
loadconfig-arm64ec.obj \ +RUN: icall.obj func123-arm64ec.obj func12-arm64.obj imp-arm64x.lib +RUN: llvm-readobj --coff-imports func-12-123-ec.dll | FileCheck --check-prefix=IMPORTS-12-123-EC %s +RUN: llvm-readobj --hex-dump=.test func-12-123-ec.dll | FileCheck --check-prefix=TEST-12-123 %s +RUN: llvm-readobj --hex-dump=.testa func-12-123-ec.dll | FileCheck --check-prefix=TEST-12-123A %s +RUN: llvm-readobj --hex-dump=.rdata func-12-123-ec.dll | FileCheck --check-prefix=TEST-12-123AUX %s # Link to the imported func2 and func3 from both native and EC code, func4 from native code, # and func1 from EC code. @@ -218,7 +238,7 @@ TEST-12-123AUX-NEXT: 0x180004010 30100080 01000000 00000000 00000000 RUN: lld-link -machine:arm64x -dll -noentry -out:test-234-123.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ RUN: icall.obj func123-arm64ec.obj func234-arm64.obj imp-arm64x.lib -RUN: llvm-readobj --coff-imports test-234-123.dll | FileCheck --check-prefix=IMPORTS-234-123 %s +RUN: llvm-readobj --coff-imports test-234-123.dll | FileCheck --check-prefixes=IMPORTS-234-123,IMPORTS-234-123-EC %s IMPORTS-234-123: Import { IMPORTS-234-123-NEXT: Name: test.dll IMPORTS-234-123-NEXT: ImportLookupTableRVA: 0x3338 @@ -228,14 +248,14 @@ IMPORTS-234-123-NEXT: Symbol: func2 (0) IMPORTS-234-123-NEXT: Symbol: func3 (0) IMPORTS-234-123-NEXT: } IMPORTS-234-123-NEXT: HybridObject { -IMPORTS-234-123: Import { -IMPORTS-234-123-NEXT: Name: test.dll -IMPORTS-234-123-NEXT: ImportLookupTableRVA: 0x3340 -IMPORTS-234-123-NEXT: ImportAddressTableRVA: 0x2008 -IMPORTS-234-123-NEXT: Symbol: func2 (0) -IMPORTS-234-123-NEXT: Symbol: func3 (0) -IMPORTS-234-123-NEXT: Symbol: func1 (0) -IMPORTS-234-123-NEXT: } +IMPORTS-234-123-EC: Import { +IMPORTS-234-123-EC-NEXT: Name: test.dll +IMPORTS-234-123-EC-NEXT: ImportLookupTableRVA: 0x3340 +IMPORTS-234-123-EC-NEXT: ImportAddressTableRVA: 0x2008 +IMPORTS-234-123-EC-NEXT: Symbol: func2 (0) +IMPORTS-234-123-EC-NEXT: Symbol: func3 (0) +IMPORTS-234-123-EC-NEXT: Symbol: func1 
(0) +IMPORTS-234-123-EC-NEXT:} IMPORTS-234-123-NEXT: } RUN: llvm-readobj --hex-dump=.test test-234-123.dll | FileCheck --check-prefix=TEST-234-123 %s @@ -245,13 +265,19 @@ TEST-234-123-NEXT: 0x180007010 10400000 1020000 RUN: llvm-readobj --hex-dump=.testa test-234-123.dll | FileCheck --check-prefix=TEST-234-123A %s TEST-234-123A: 0x180008000 08200000 10200000 00200000 +RUN: lld-link -machine:arm64ec -dll -noentry -out:test-234-123-ec.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func123-arm64ec.obj func234-arm64.obj imp-arm64x.lib +RUN: llvm-readobj --coff-imports test-234-123-ec.dll | FileCheck --check-prefix=IMPORTS-234-123-EC %s +RUN: llvm-readobj --hex-dump=.test test-234-123-ec.dll | FileCheck --check-prefix=TEST-234-123 %s +RUN: llvm-readobj --hex-dump=.testa test-234-123-ec.dll | FileCheck --check-prefix=TEST-234-123A %s + # Link to the imported func3 and func4 from native code, and func1 and func2 from EC code. RUN: lld-link -machine:arm64x -dll -noentry -out:test-34-12.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ RUN: icall.obj func12o-arm64ec.obj func34o-arm64.obj imp-arm64x.lib imp2.lib -RUN: llvm-readobj --coff-imports test-34-12.dll | FileCheck --check-prefix=IMPORTS-34-12 %s +RUN: llvm-readobj --coff-imports test-34-12.dll | FileCheck --check-prefixes=IMPORTS-34-12,IMPORTS-34-12-EC %s IMPORTS-34-12: Import { IMPORTS-34-12-NEXT: Name: test.dll IMPORTS-34-12-NEXT: ImportLookupTableRVA: 0x3350 @@ -266,19 +292,19 @@ IMPORTS-34-12-NEXT: ImportAddressTableRVA: 0x2028 IMPORTS-34-12-NEXT: Symbol: otherfunc (0) IMPORTS-34-12-NEXT: } IMPORTS-34-12-NEXT: HybridObject { -IMPORTS-34-12: Import { -IMPORTS-34-12-NEXT: Name: test.dll -IMPORTS-34-12-NEXT: ImportLookupTableRVA: 0x3360 -IMPORTS-34-12-NEXT: ImportAddressTableRVA: 0x2010 -IMPORTS-34-12-NEXT: Symbol: func1 (0) -IMPORTS-34-12-NEXT: Symbol: func2 (0) -IMPORTS-34-12-NEXT: } -IMPORTS-34-12-NEXT: Import { -IMPORTS-34-12-NEXT: Name: test2.dll -IMPORTS-34-12-NEXT: ImportLookupTableRVA: 
0x3378 -IMPORTS-34-12-NEXT: ImportAddressTableRVA: 0x2028 -IMPORTS-34-12-NEXT: Symbol: otherfunc (0) -IMPORTS-34-12-NEXT: } +IMPORTS-34-12-EC: Import { +IMPORTS-34-12-EC-NEXT: Name: test.dll +IMPORTS-34-12-EC-NEXT: ImportLookupTableRVA: 0x3360 +IMPORTS-34-12-EC-NEXT: ImportAddressTableRVA: 0x2010 +IMPORTS-34-12-EC-NEXT: Symbol: func1 (0) +IMPORTS-34-12-EC-NEXT: Symbol: func2 (0) +IMPORTS-34-12-EC-NEXT:} +IMPORTS-34-12-EC-NEXT:Import { +IMPORTS-34-12-EC-NEXT: Name: test2.dll +IMPORTS-34-12-EC-NEXT: ImportLookupTableRVA: 0x3378 +IMPORTS-34-12-EC-NEXT: ImportAddressTableRVA: 0x2028 +IMPORTS-34-12-EC-NEXT: Symbol: otherfunc (0) +IMPORTS-34-12-EC-NEXT:} IMPORTS-34-12-NEXT: } RUN: llvm-readobj --hex-dump=.test test-34-12.dll | FileCheck --check-prefix=TEST-23-12 %s @@ -288,6 +314,12 @@ TEST-23-12-NEXT: 0x180007010 28400000 28200000 RUN: llvm-readobj --hex-dump=.testa test-34-12.dll | FileCheck --check-prefix=TEST-23-12A %s TEST-23-12A: 0x180008000 00200000 08200000 28200000 +RUN: lld-link -machine:arm64ec -dll -noentry -out:test-34-12-ec.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12o-arm64ec.obj func34o-arm64.obj imp-arm64x.lib imp2.lib +RUN: llvm-readobj --coff-imports test-34-12-ec.dll | FileCheck --check-prefix=IMPORTS-34-12-EC %s +RUN: llvm-readobj --hex-dump=.test test-34-12-ec.dll | FileCheck --check-prefix=TEST-23-12 %s +RUN: llvm-readobj --hex-dump=.testa test-34-12-ec.dll | FileCheck --check-prefix=TEST-23-12A %s + # Link only to imported EC functions, with no native imports. 
@@ -335,7 +367,7 @@ IMPORTS-EC12-NEXT: } RUN: lld-link -machine:arm64x -dll -noentry -out:test-n12.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ RUN: func12-arm64.obj imp-arm64x.lib -RUN: llvm-readobj --coff-imports test-n12.dll | FileCheck --check-prefix=IMPORTS-N12 %s +RUN: llvm-readobj --coff-imports test-n12.dll | FileCheck --check-prefixes=IMPORTS-N12,IMPORTS-N12-EC %s IMPORTS-N12: Arch: aarch64 IMPORTS-N12-NEXT: AddressSize: 64bit @@ -347,16 +379,20 @@ IMPORTS-N12-NEXT: Symbol: func1 (0) IMPORTS-N12-NEXT: Symbol: func2 (0) IMPORTS-N12-NEXT: } IMPORTS-N12-NEXT: HybridObject { -IMPORTS-N12-NEXT: Format: COFF-ARM64EC -IMPORTS-N12-NEXT: Arch: aarch64 -IMPORTS-N12-NEXT: AddressSize: 64bit -IMPORTS-N12-NEXT: Import { -IMPORTS-N12-NEXT: Name: test.dll -IMPORTS-N12-NEXT: ImportLookupTableRVA: 0x2340 -IMPORTS-N12-NEXT: ImportAddressTableRVA: 0x1010 -IMPORTS-N12-NEXT: } +IMPORTS-N12-EC: Format: COFF-ARM64EC +IMPORTS-N12-EC-NEXT: Arch: aarch64 +IMPORTS-N12-EC-NEXT: AddressSize: 64bit +IMPORTS-N12-EC-NEXT: Import { +IMPORTS-N12-EC-NEXT: Name: test.dll +IMPORTS-N12-EC-NEXT: ImportLookupTableRVA: 0x2340 +IMPORTS-N12-EC-NEXT: ImportAddressTableRVA: 0x1010 +IMPORTS-N12-EC-NEXT: } IMPORTS-N12-NEXT: } +RUN: lld-link -machine:arm64ec -dll -noentry -out:test-n12-ec.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: func12-arm64.obj imp-arm64x.lib +RUN: llvm-readobj --coff-imports test-n12-ec.dll | FileCheck --check-prefix=IMPORTS-N12-EC %s + RUN: lld-link -machine:arm64x -dll -noentry -out:test-dup.dll loadconfig-arm64.obj loadconfig-arm64ec.obj icall.obj \ RUN: func12-arm64ec.obj func34-arm64.obj dup.lib diff --git a/lld/test/COFF/arm64x-sameaddress.test b/lld/test/COFF/arm64x-sameaddress.test new file mode 100644 index 0000000000000..c69be9d268c3b --- /dev/null +++ b/lld/test/COFF/arm64x-sameaddress.test @@ -0,0 +1,56 @@ +REQUIRES: aarch64 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func-arm64ec.s -o func-arm64ec.obj 
+RUN: llvm-mc -filetype=obj -triple=aarch64-windows func-arm64.s -o func-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows drectve.s -o drectve.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows drectve.s -o drectve-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj + +RUN: lld-link -machine:arm64x -dll -noentry -out:out.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: func-arm64.obj func-arm64ec.obj drectve.obj + +RUN: lld-link -machine:arm64x -dll -noentry -out:out-cmd.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: func-arm64.obj func-arm64ec.obj -arm64xsameaddress:func + +RUN: lld-link -machine:arm64ec -dll -noentry -out:out-ec.dll loadconfig-arm64ec.obj func-arm64ec.obj drectve.obj + +RUN: lld-link -machine:arm64x -dll -noentry -out:out-warn.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: func-arm64.obj func-arm64ec.obj drectve-arm64.obj 2>&1 | FileCheck --check-prefix=WARN %s +WARN: lld-link: warning: -arm64xsameaddress: is not allowed in non-ARM64EC files (drectve-arm64.obj) + +#--- func-arm64.s + .section .text,"xr",discard,func + .globl func +func: + mov x0, #1 + ret + +#--- func-arm64ec.s + .section .text,"xr",discard,"#func" + .globl "#func" +"#func": + mov x0, #2 + ret + + .weak_anti_dep func + .set func,"#func" + + .section .wowthk,"xr",discard,entry_thunk + .globl entry_thunk +entry_thunk: + mov x0, #3 + ret + + .section .test,"dr" + .rva func + + .section .hybmp$x,"yi" + .symidx "#func" + .symidx entry_thunk + .word 1 + +#--- drectve.s + .section .drectve, "yn" + .ascii " -arm64xsameaddress:func" diff --git a/lld/test/COFF/arm64x-symtab.s b/lld/test/COFF/arm64x-symtab.s index c634f8a6ed4c5..176e81a23be16 100644 --- a/lld/test/COFF/arm64x-symtab.s +++ b/lld/test/COFF/arm64x-symtab.s @@ -18,6 +18,8 @@ // RUN: not lld-link -machine:arm64x -dll 
-noentry -out:err1.dll symref-aarch64.obj sym-arm64ec.obj \ // RUN: 2>&1 | FileCheck --check-prefix=UNDEF %s +// RUN: not lld-link -machine:arm64ec -dll -noentry -out:err1.dll symref-aarch64.obj sym-arm64ec.obj \ +// RUN: 2>&1 | FileCheck --check-prefix=UNDEF %s // UNDEF: lld-link: error: undefined symbol: sym (native symbol) // UNDEF-NEXT: >>> referenced by symref-aarch64.obj:(.data) @@ -25,25 +27,34 @@ // RUN: not lld-link -machine:arm64x -dll -noentry -out:out.dll symref-arm64ec.obj sym-aarch64.obj \ // RUN: 2>&1 | FileCheck --check-prefix=UNDEFEC %s +// RUN: not lld-link -machine:arm64ec -dll -noentry -out:out.dll symref-arm64ec.obj sym-aarch64.obj \ +// RUN: 2>&1 | FileCheck --check-prefix=UNDEFEC %s // UNDEFEC: lld-link: error: undefined symbol: sym (EC symbol) // UNDEFEC-NEXT: >>> referenced by symref-arm64ec.obj:(.data) // RUN: not lld-link -machine:arm64x -dll -noentry -out:out.dll symref-x86_64.obj sym-aarch64.obj \ // RUN: 2>&1 | FileCheck --check-prefix=UNDEFX86 %s +// RUN: not lld-link -machine:arm64ec -dll -noentry -out:out.dll symref-x86_64.obj sym-aarch64.obj \ +// RUN: 2>&1 | FileCheck --check-prefix=UNDEFX86 %s // UNDEFX86: lld-link: error: undefined symbol: sym (EC symbol) // UNDEFX86-NEXT: >>> referenced by symref-x86_64.obj:(.data) // RUN: not lld-link -machine:arm64x -dll -noentry -out:err2.dll symref-aarch64.obj sym-x86_64.obj \ // RUN: 2>&1 | FileCheck --check-prefix=UNDEF %s +// RUN: not lld-link -machine:arm64ec -dll -noentry -out:err2.dll symref-aarch64.obj sym-x86_64.obj \ +// RUN: 2>&1 | FileCheck --check-prefix=UNDEF %s // Check that ARM64X target can have the same symbol names in both native and EC namespaces. 
// RUN: lld-link -machine:arm64x -dll -noentry -out:out.dll symref-aarch64.obj sym-aarch64.obj \ // RUN: symref-arm64ec.obj sym-x86_64.obj +// RUN: lld-link -machine:arm64ec -dll -noentry -out:out.dll symref-aarch64.obj sym-aarch64.obj \ +// RUN: symref-arm64ec.obj sym-x86_64.obj // Check that ARM64X target can reference both native and EC symbols from an archive. // RUN: lld-link -machine:arm64x -dll -noentry -out:out2.dll symref-aarch64.obj symref-arm64ec.obj sym.lib +// RUN: lld-link -machine:arm64ec -dll -noentry -out:out2.dll symref-aarch64.obj symref-arm64ec.obj sym.lib // Check that EC object files can reference x86_64 library symbols. @@ -55,15 +66,20 @@ // RUN: not lld-link -machine:arm64x -dll -noentry -out:err3.dll symref-aarch64.obj sym-x86_64.lib \ // RUN: 2>&1 | FileCheck --check-prefix=UNDEF %s +// RUN: not lld-link -machine:arm64ec -dll -noentry -out:err3.dll symref-aarch64.obj sym-x86_64.lib \ +// RUN: 2>&1 | FileCheck --check-prefix=UNDEF %s // Check that native object files can reference native library symbols. // RUN: lld-link -machine:arm64x -dll -noentry -out:out6.dll symref-aarch64.obj sym-aarch64.lib +// RUN: lld-link -machine:arm64ec -dll -noentry -out:out6.dll symref-aarch64.obj sym-aarch64.lib // Check that EC object files can't reference native ARM64 library symbols. 
// RUN: not lld-link -machine:arm64x -dll -noentry -out:err4.dll symref-arm64ec.obj sym-aarch64.lib \ // RUN: 2>&1 | FileCheck --check-prefix=UNDEFEC %s +// RUN: not lld-link -machine:arm64ec -dll -noentry -out:err4.dll symref-arm64ec.obj sym-aarch64.lib \ +// RUN: 2>&1 | FileCheck --check-prefix=UNDEFEC %s #--- symref.s .data diff --git a/lld/test/COFF/arm64x-wrap.s b/lld/test/COFF/arm64x-wrap.s index 4f600e38f7a83..5530bc47c884e 100644 --- a/lld/test/COFF/arm64x-wrap.s +++ b/lld/test/COFF/arm64x-wrap.s @@ -15,6 +15,10 @@ // CHECK: 0x180004000 02000000 02000000 01000000 02000000 // CHECK: 0x180004010 02000000 01000000 +// RUN: lld-link -machine:arm64ec -dll -noentry test-arm64.obj test-arm64ec.obj other-arm64.obj other-arm64ec.obj \ +// RUN: loadconfig-arm64.obj loadconfig-arm64ec.obj -out:out-ec.dll -wrap:sym -wrap:nosuchsym +// RUN: llvm-readobj --hex-dump=.test out-ec.dll | FileCheck %s + #--- test.s .section .test,"dr" .word sym diff --git a/lld/test/COFF/autoimport-arm64ec-data.test b/lld/test/COFF/autoimport-arm64ec-data.test index 1f22ca4917557..52e64d6020a83 100644 --- a/lld/test/COFF/autoimport-arm64ec-data.test +++ b/lld/test/COFF/autoimport-arm64ec-data.test @@ -12,7 +12,7 @@ RUN: llvm-objdump -s out.dll | FileCheck --check-prefix=CONTENTS %s IMPORTS: Import { IMPORTS-NEXT: Name: test.dll -IMPORTS-NEXT: ImportLookupTableRVA: 0x40E0 +IMPORTS-NEXT: ImportLookupTableRVA: 0x4100 IMPORTS-NEXT: ImportAddressTableRVA: 0x3000 IMPORTS-NEXT: Symbol: variable (0) IMPORTS-NEXT: } diff --git a/lld/test/ELF/link-open-file.test b/lld/test/ELF/link-open-file.test index 17c7ba95e6ebe..8693a53ead5d5 100644 --- a/lld/test/ELF/link-open-file.test +++ b/lld/test/ELF/link-open-file.test @@ -10,8 +10,10 @@ ## FILE_SHARE_WRITE = 2 ## FILE_SHARE_DELETE = 4 -# RUN: %python %s %t.o 7 -# RUN: not %python %s %t.o 3 2>&1 | FileCheck %s +# RUN: %python %s %t.o 7 false +# RUN: not %python %s %t.o 3 false 2>&1 | FileCheck %s +# RUN: %python %s %t.o 7 true +# RUN: not %python %s %t.o 3 
true 2>&1 | FileCheck %s # CHECK: error: failed to write output '{{.*}}': {{.*}} import contextlib @@ -26,6 +28,7 @@ import time object_file = sys.argv[1] share_flags = int(sys.argv[2]) +use_mmap = sys.argv[3] == "true" @contextlib.contextmanager def open_with_share_flags(filename, share_flags): @@ -55,7 +58,10 @@ os.makedirs(outdir) elf = os.path.join(outdir, 'output_file.elf') open(elf, 'wb').close() with open_with_share_flags(elf, share_flags): - subprocess.check_call(['ld.lld.exe', object_file, '-o', elf]) + args = ['ld.lld.exe', object_file, '-o', elf] + if use_mmap: + args.append("--mmap-output-file") + subprocess.check_call(args) ## Check the linker wrote the output file. with open(elf, 'rb') as f: diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst index e59dcc1972418..480430fede928 100644 --- a/lldb/docs/resources/build.rst +++ b/lldb/docs/resources/build.rst @@ -100,7 +100,7 @@ Windows * The Active Template Library (ATL). * `GnuWin32 `_ for CoreUtils and Make. * `Python 3 `_. Make sure to (1) get - the x64 variant if that's what you're targetting and (2) install the debug + the x64 variant if that's what you're targeting and (2) install the debug library if you want to build a debug lldb. The standalone installer is the easiest way to get the debug library. * `Python Tools for Visual Studio diff --git a/lldb/docs/resources/contributing.rst b/lldb/docs/resources/contributing.rst index d3d467533c9ea..48fd000765f66 100644 --- a/lldb/docs/resources/contributing.rst +++ b/lldb/docs/resources/contributing.rst @@ -39,7 +39,7 @@ in a few ways. The 2 main ones are: * `Use of asserts `_: See the :ref:`section below`. -For any other contradications, consider the +For any other contradictions, consider the `golden rule `_ before choosing to update the style of existing code.
diff --git a/lldb/docs/resources/debugging.rst b/lldb/docs/resources/debugging.rst index ba23759b44cf5..ee3e45a49cbde 100644 --- a/lldb/docs/resources/debugging.rst +++ b/lldb/docs/resources/debugging.rst @@ -130,7 +130,7 @@ The inferior will stop, you place the breakpoint and then ``continue``. Go back to the inferior and input the command that should trigger the breakpoint. If you are running debugger and inferior in the same window, input ``ctrl+c`` -instead of ``process interrupt`` and then folllow the rest of the steps. +instead of ``process interrupt`` and then follow the rest of the steps. If you are doing this with ``lldb-server`` and find your breakpoint is never hit, check that you are breaking in code that is actually run by @@ -187,7 +187,7 @@ predictable way, or change the prompt of one or both copies of ``lldb``. If you are debugging a scenario where the ``lldb-server`` starts in ``platform`` mode, but you want to debug the ``gdbserver`` mode you'll have to work out what subprocess it's starting for the ``gdbserver`` part. One way is to look at the -list of runninng processes and take the command line from there. +list of running processes and take the command line from there. In theory it should be possible to use LLDB's ``target.process.follow-fork-mode`` or GDB's ``follow-fork-mode`` to @@ -387,8 +387,8 @@ an issue or asking for help. This is simply inspiration. Reduction ********* -The first step is to reduce uneeded compexity where it is cheap to do so. If -something is easily removed or frozen to a cerain value, do so. The goal is to +The first step is to reduce unneeded complexity where it is cheap to do so. If +something is easily removed or frozen to a certain value, do so. The goal is to keep the failure mode the same, with fewer dependencies. This includes, but is not limited to: @@ -396,11 +396,11 @@ This includes, but is not limited to: * Removing test cases that don't crash. * Replacing dynamic lookups with constant values. 
* Replace supporting functions with stubs that do nothing. -* Moving the test case to less unqiue system. If your machine has an exotic +* Moving the test case to less unique system. If your machine has an exotic extension, try it on a readily available commodity machine. * Removing irrelevant parts of the test program. * Reproducing the issue without using the LLDB test runner. -* Converting a remote debuging scenario into a local one. +* Converting a remote debugging scenario into a local one. Now we hopefully have a smaller reproducer than we started with. Next we need to find out what components of the software stack might be failing. @@ -578,14 +578,14 @@ Doing it this way instead of exactly copying what LLDB does will save a few ptrace calls. The AArch64 example program shows how to do this. * The inferior contains ``BRK #0`` then ``NOP``. -* 2 4 byte instructins means 8 bytes of data to replace, which matches the +* 2 4-byte instructions means 8 bytes of data to replace, which matches the minimum size you can write with ``PTRACE_POKETEXT``. * The inferior runs to the ``BRK``, which brings us into the debugger. * The debugger reads ``PC`` and writes ``NOP`` then ``NOP`` to the location pointed to by ``PC``. * The debugger then single steps the inferior to the next instruction (this is not required in this specific scenario, you could just continue but - it is included because this more cloesly matches what ``lldb`` does). + it is included because this more closely matches what ``lldb`` does). * The debugger then continues the inferior. * The inferior exits, and the whole program exits. diff --git a/lldb/docs/resources/qemu-testing.rst b/lldb/docs/resources/qemu-testing.rst index e102f84a1d31f..8571287a04262 100644 --- a/lldb/docs/resources/qemu-testing.rst +++ b/lldb/docs/resources/qemu-testing.rst @@ -156,7 +156,7 @@ certainly not forwarded. An example of this is shown below. 
:: - $ lldb-server plaform --server --listen 0.0.0.0:54321 --gdbserver-port 49140 + $ lldb-server platform --server --listen 0.0.0.0:54321 --gdbserver-port 49140 The result of this is that: diff --git a/lldb/docs/use/variable.rst b/lldb/docs/use/variable.rst index 3ad71cb93c51d..22c1fd64c4a96 100644 --- a/lldb/docs/use/variable.rst +++ b/lldb/docs/use/variable.rst @@ -961,7 +961,7 @@ printed one by one. [1] The `max_children` argument is optional (since lldb 3.8.0) and indicates the maximum number of children that lldb is interested in (at this moment). If the computation of the number of children is expensive (for example, requires -travesing a linked list to determine its size) your implementation may return +traversing a linked list to determine its size) your implementation may return `max_children` rather than the actual number. If the computation is cheap (e.g., the number is stored as a field of the object), then you can always return the true number of children (that is, ignore the `max_children` argument). diff --git a/lldb/include/lldb/Core/Address.h b/lldb/include/lldb/Core/Address.h index 9b5874f8b1fbe..85b2ab7bb3cfe 100644 --- a/lldb/include/lldb/Core/Address.h +++ b/lldb/include/lldb/Core/Address.h @@ -371,22 +371,15 @@ class Address { bool ResolveAddressUsingFileSections(lldb::addr_t addr, const SectionList *sections); - /// Resolve this address to its containing function and optionally get - /// that function's address range. + /// Resolve this address to its containing function. /// /// \param[out] sym_ctx /// The symbol context describing the function in which this address lies /// - /// \parm[out] addr_range_ptr - /// Pointer to the AddressRange to fill in with the function's address - /// range. Caller may pass null if they don't need the address range. 
- /// /// \return - /// Returns \b false if the function/symbol could not be resolved - /// or if the address range was requested and could not be resolved; + /// Returns \b false if the function/symbol could not be resolved; /// returns \b true otherwise. - bool ResolveFunctionScope(lldb_private::SymbolContext &sym_ctx, - lldb_private::AddressRange *addr_range_ptr = nullptr); + bool ResolveFunctionScope(lldb_private::SymbolContext &sym_ctx); /// Set the address to represent \a load_addr. /// diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index c974866306d2a..d3589e78b6bc7 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -8,9 +8,17 @@ import socket import string import subprocess +import signal import sys import threading import time +from typing import Any, Optional, Union, BinaryIO, TextIO + +## DAP type references +Event = dict[str, Any] +Request = dict[str, Any] +Response = dict[str, Any] +ProtocolMessage = Union[Event, Request, Response] def dump_memory(base_addr, data, num_per_line, outfile): @@ -97,55 +105,40 @@ def dump_dap_log(log_file): print("========= END =========", file=sys.stderr) -def read_packet_thread(vs_comm, log_file): - done = False - try: - while not done: - packet = read_packet(vs_comm.recv, trace_file=vs_comm.trace_file) - # `packet` will be `None` on EOF. We want to pass it down to - # handle_recv_packet anyway so the main thread can handle unexpected - # termination of lldb-dap and stop waiting for new packets. - done = not vs_comm.handle_recv_packet(packet) - finally: - # Wait for the process to fully exit before dumping the log file to - # ensure we have the entire log contents. - if vs_comm.process is not None: - try: - # Do not wait forever, some logs are better than none. 
- vs_comm.process.wait(timeout=20) - except subprocess.TimeoutExpired: - pass - dump_dap_log(log_file) - - class DebugCommunication(object): - def __init__(self, recv, send, init_commands, log_file=None): - self.trace_file = None + def __init__( + self, + recv: BinaryIO, + send: BinaryIO, + init_commands: list[str], + log_file: Optional[TextIO] = None, + ): + # For debugging test failures, try setting `trace_file = sys.stderr`. + self.trace_file: Optional[TextIO] = None + self.log_file = log_file self.send = send self.recv = recv - self.recv_packets = [] + self.recv_packets: list[Optional[ProtocolMessage]] = [] self.recv_condition = threading.Condition() - self.recv_thread = threading.Thread( - target=read_packet_thread, args=(self, log_file) - ) + self.recv_thread = threading.Thread(target=self._read_packet_thread) self.process_event_body = None - self.exit_status = None + self.exit_status: Optional[int] = None self.initialize_body = None - self.thread_stop_reasons = {} - self.progress_events = [] + self.progress_events: list[Event] = [] self.reverse_requests = [] self.sequence = 1 self.threads = None + self.thread_stop_reasons = {} self.recv_thread.start() self.output_condition = threading.Condition() - self.output = {} + self.output: dict[str, list[str]] = {} self.configuration_done_sent = False self.frame_scopes = {} self.init_commands = init_commands self.disassembled_instructions = {} @classmethod - def encode_content(cls, s): + def encode_content(cls, s: str) -> bytes: return ("Content-Length: %u\r\n\r\n%s" % (len(s), s)).encode("utf-8") @classmethod @@ -155,6 +148,18 @@ def validate_response(cls, command, response): if command["seq"] != response["request_seq"]: raise ValueError("seq mismatch in response") + def _read_packet_thread(self): + done = False + try: + while not done: + packet = read_packet(self.recv, trace_file=self.trace_file) + # `packet` will be `None` on EOF. 
We want to pass it down to + # handle_recv_packet anyway so the main thread can handle unexpected + # termination of lldb-dap and stop waiting for new packets. + done = not self._handle_recv_packet(packet) + finally: + dump_dap_log(self.log_file) + def get_modules(self): module_list = self.request_modules()["body"]["modules"] modules = {} @@ -189,13 +194,13 @@ def collect_output(self, category, timeout_secs, pattern, clear=True): break return collected_output if collected_output else None - def enqueue_recv_packet(self, packet): + def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]): self.recv_condition.acquire() self.recv_packets.append(packet) self.recv_condition.notify() self.recv_condition.release() - def handle_recv_packet(self, packet): + def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool: """Called by the read thread that is waiting for all incoming packets to store the incoming packet in "self.recv_packets" in a thread safe way. This function will then signal the "self.recv_condition" to @@ -204,7 +209,7 @@ def handle_recv_packet(self, packet): """ # If EOF, notify the read thread by enqueuing a None. if not packet: - self.enqueue_recv_packet(None) + self._enqueue_recv_packet(None) return False # Check the packet to see if is an event packet @@ -234,6 +239,18 @@ def handle_recv_packet(self, packet): # When a new process is attached or launched, remember the # details that are available in the body of the event self.process_event_body = body + elif event == "exited": + # Process exited, mark the status to indicate the process is not + # alive. + self.exit_status = body["exitCode"] + elif event == "continued": + # When the process continues, clear the known threads and + # thread_stop_reasons. 
+ all_threads_continued = body.get("allThreadsContinued", True) + tid = body["threadId"] + if tid in self.thread_stop_reasons: + del self.thread_stop_reasons[tid] + self._process_continued(all_threads_continued) elif event == "stopped": # Each thread that stops with a reason will send a # 'stopped' event. We need to remember the thread stop @@ -251,10 +268,16 @@ def handle_recv_packet(self, packet): elif packet_type == "response": if packet["command"] == "disconnect": keepGoing = False - self.enqueue_recv_packet(packet) + self._enqueue_recv_packet(packet) return keepGoing - def send_packet(self, command_dict, set_sequence=True): + def _process_continued(self, all_threads_continued: bool): + self.threads = None + self.frame_scopes = {} + if all_threads_continued: + self.thread_stop_reasons = {} + + def send_packet(self, command_dict: Request, set_sequence=True): """Take the "command_dict" python dictionary and encode it as a JSON string and send the contents as a packet to the VSCode debug adapter""" @@ -272,7 +295,12 @@ def send_packet(self, command_dict, set_sequence=True): self.send.write(self.encode_content(json_str)) self.send.flush() - def recv_packet(self, filter_type=None, filter_event=None, timeout=None): + def recv_packet( + self, + filter_type: Optional[str] = None, + filter_event: Optional[Union[str, list[str]]] = None, + timeout: Optional[float] = None, + ) -> Optional[ProtocolMessage]: """Get a JSON packet from the VSCode debug adapter. This function assumes a thread that reads packets is running and will deliver any received packets by calling handle_recv_packet(...). This @@ -308,8 +336,6 @@ def recv_packet(self, filter_type=None, filter_event=None, timeout=None): finally: self.recv_condition.release() - return None - def send_recv(self, command): """Send a command python dictionary as JSON and receive the JSON response. 
Validates that the response is the correct sequence and @@ -359,47 +385,36 @@ def send_recv(self, command): return None - def wait_for_event(self, filter=None, timeout=None): - while True: - return self.recv_packet( - filter_type="event", filter_event=filter, timeout=timeout - ) - return None - - def wait_for_events(self, events, timeout=None): - """Wait for a list of events in `events` in any order. - Return the events not hit before the timeout expired""" - events = events[:] # Make a copy to avoid modifying the input - while events: - event_dict = self.wait_for_event(filter=events, timeout=timeout) - if event_dict is None: - break - events.remove(event_dict["event"]) - return events + def wait_for_event( + self, filter: Union[str, list[str]], timeout: Optional[float] = None + ) -> Optional[Event]: + """Wait for the first event that matches the filter.""" + return self.recv_packet( + filter_type="event", filter_event=filter, timeout=timeout + ) - def wait_for_stopped(self, timeout=None): + def wait_for_stopped( + self, timeout: Optional[float] = None + ) -> Optional[list[Event]]: stopped_events = [] stopped_event = self.wait_for_event( filter=["stopped", "exited"], timeout=timeout ) - exited = False while stopped_event: stopped_events.append(stopped_event) # If we exited, then we are done if stopped_event["event"] == "exited": - self.exit_status = stopped_event["body"]["exitCode"] - exited = True break # Otherwise we stopped and there might be one or more 'stopped' # events for each thread that stopped with a reason, so keep # checking for more 'stopped' events and return all of them - stopped_event = self.wait_for_event(filter="stopped", timeout=0.25) - if exited: - self.threads = [] + stopped_event = self.wait_for_event( + filter=["stopped", "exited"], timeout=0.25 + ) return stopped_events - def wait_for_breakpoint_events(self, timeout=None): - breakpoint_events = [] + def wait_for_breakpoint_events(self, timeout: Optional[float] = None): + breakpoint_events: 
list[Event] = [] while True: event = self.wait_for_event("breakpoint", timeout=timeout) if not event: @@ -407,14 +422,14 @@ def wait_for_breakpoint_events(self, timeout=None): breakpoint_events.append(event) return breakpoint_events - def wait_for_exited(self): - event_dict = self.wait_for_event("exited") + def wait_for_exited(self, timeout: Optional[float] = None): + event_dict = self.wait_for_event("exited", timeout=timeout) if event_dict is None: raise ValueError("didn't get exited event") return event_dict - def wait_for_terminated(self): - event_dict = self.wait_for_event("terminated") + def wait_for_terminated(self, timeout: Optional[float] = None): + event_dict = self.wait_for_event("terminated", timeout) if event_dict is None: raise ValueError("didn't get terminated event") return event_dict @@ -575,32 +590,30 @@ def replay_packets(self, replay_file_path): def request_attach( self, - program=None, - pid=None, - waitFor=None, - trace=None, - initCommands=None, - preRunCommands=None, - stopCommands=None, - exitCommands=None, - attachCommands=None, - terminateCommands=None, - coreFile=None, + *, + program: Optional[str] = None, + pid: Optional[int] = None, + waitFor=False, + initCommands: Optional[list[str]] = None, + preRunCommands: Optional[list[str]] = None, + attachCommands: Optional[list[str]] = None, + postRunCommands: Optional[list[str]] = None, + stopCommands: Optional[list[str]] = None, + exitCommands: Optional[list[str]] = None, + terminateCommands: Optional[list[str]] = None, + coreFile: Optional[str] = None, stopOnAttach=True, - postRunCommands=None, - sourceMap=None, - gdbRemotePort=None, - gdbRemoteHostname=None, + sourceMap: Optional[Union[list[tuple[str, str]], dict[str, str]]] = None, + gdbRemotePort: Optional[int] = None, + gdbRemoteHostname: Optional[str] = None, ): args_dict = {} if pid is not None: args_dict["pid"] = pid if program is not None: args_dict["program"] = program - if waitFor is not None: + if waitFor: args_dict["waitFor"] = 
waitFor - if trace: - args_dict["trace"] = trace args_dict["initCommands"] = self.init_commands if initCommands: args_dict["initCommands"].extend(initCommands) @@ -670,7 +683,7 @@ def _process_stopped(self): self.threads = None self.frame_scopes = {} - def request_continue(self, threadId=None): + def request_continue(self, threadId=None, singleThread=False): if self.exit_status is not None: raise ValueError("request_continue called after process exited") # If we have launched or attached, then the first continue is done by @@ -680,13 +693,18 @@ def request_continue(self, threadId=None): args_dict = {} if threadId is None: threadId = self.get_thread_id() - args_dict["threadId"] = threadId + if threadId: + args_dict["threadId"] = threadId + if singleThread: + args_dict["singleThread"] = True command_dict = { "command": "continue", "type": "request", "arguments": args_dict, } response = self.send_recv(command_dict) + if response["success"]: + self._process_continued(response["body"]["allThreadsContinued"]) # Caller must still call wait_for_stopped. 
return response @@ -774,7 +792,7 @@ def request_exceptionInfo(self, threadId=None): } return self.send_recv(command_dict) - def request_initialize(self, sourceInitFile): + def request_initialize(self, sourceInitFile=False): command_dict = { "command": "initialize", "type": "request", @@ -801,32 +819,32 @@ def request_initialize(self, sourceInitFile): def request_launch( self, - program, - args=None, - cwd=None, - env=None, - stopOnEntry=False, + program: str, + *, + args: Optional[list[str]] = None, + cwd: Optional[str] = None, + env: Optional[dict[str, str]] = None, + stopOnEntry=True, disableASLR=True, disableSTDIO=False, shellExpandArguments=False, - trace=False, - initCommands=None, - preRunCommands=None, - stopCommands=None, - exitCommands=None, - terminateCommands=None, - sourcePath=None, - debuggerRoot=None, - launchCommands=None, - sourceMap=None, runInTerminal=False, - postRunCommands=None, enableAutoVariableSummaries=False, displayExtendedBacktrace=False, enableSyntheticChildDebugging=False, - commandEscapePrefix=None, - customFrameFormat=None, - customThreadFormat=None, + initCommands: Optional[list[str]] = None, + preRunCommands: Optional[list[str]] = None, + launchCommands: Optional[list[str]] = None, + postRunCommands: Optional[list[str]] = None, + stopCommands: Optional[list[str]] = None, + exitCommands: Optional[list[str]] = None, + terminateCommands: Optional[list[str]] = None, + sourceMap: Optional[Union[list[tuple[str, str]], dict[str, str]]] = None, + sourcePath: Optional[str] = None, + debuggerRoot: Optional[str] = None, + commandEscapePrefix: Optional[str] = None, + customFrameFormat: Optional[str] = None, + customThreadFormat: Optional[str] = None, ): args_dict = {"program": program} if args: @@ -841,8 +859,6 @@ def request_launch( args_dict["disableSTDIO"] = disableSTDIO if shellExpandArguments: args_dict["shellExpandArguments"] = shellExpandArguments - if trace: - args_dict["trace"] = trace args_dict["initCommands"] = self.init_commands if 
initCommands: args_dict["initCommands"].extend(initCommands) @@ -1189,7 +1205,8 @@ def request_testGetTargetBreakpoints(self): def terminate(self): self.send.close() - # self.recv.close() + if self.recv_thread.is_alive(): + self.recv_thread.join() def request_setInstructionBreakpoints(self, memory_reference=[]): breakpoints = [] @@ -1210,11 +1227,11 @@ def request_setInstructionBreakpoints(self, memory_reference=[]): class DebugAdapterServer(DebugCommunication): def __init__( self, - executable=None, - connection=None, - init_commands=[], - log_file=None, - env=None, + executable: Optional[str] = None, + connection: Optional[str] = None, + init_commands: list[str] = [], + log_file: Optional[TextIO] = None, + env: Optional[dict[str, str]] = None, ): self.process = None self.connection = None @@ -1246,7 +1263,14 @@ def __init__( ) @classmethod - def launch(cls, /, executable, env=None, log_file=None, connection=None): + def launch( + cls, + *, + executable: str, + env: Optional[dict[str, str]] = None, + log_file: Optional[TextIO] = None, + connection: Optional[str] = None, + ) -> tuple[subprocess.Popen, Optional[str]]: adapter_env = os.environ.copy() if env is not None: adapter_env.update(env) @@ -1263,7 +1287,7 @@ def launch(cls, /, executable, env=None, log_file=None, connection=None): args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stderr=sys.stderr, env=adapter_env, ) @@ -1288,21 +1312,49 @@ def launch(cls, /, executable, env=None, log_file=None, connection=None): return (process, connection) - def get_pid(self): + def get_pid(self) -> int: if self.process: return self.process.pid return -1 def terminate(self): - super(DebugAdapterServer, self).terminate() - if self.process is not None: - self.process.terminate() + try: + if self.process is not None: + process = self.process + self.process = None + try: + # When we close stdin it should signal the lldb-dap that no + # new messages will arrive and it should shutdown on its + # own. 
+ process.stdin.close() + process.wait(timeout=20) + except subprocess.TimeoutExpired: + process.kill() + process.wait() + if process.returncode != 0: + raise DebugAdapterProcessError(process.returncode) + finally: + super(DebugAdapterServer, self).terminate() + + +class DebugAdapterError(Exception): + pass + + +class DebugAdapterProcessError(DebugAdapterError): + """Raised when the lldb-dap process exits with a non-zero exit status.""" + + def __init__(self, returncode): + self.returncode = returncode + + def __str__(self): + if self.returncode and self.returncode < 0: try: - self.process.wait(timeout=20) - except subprocess.TimeoutExpired: - self.process.kill() - self.process.wait() - self.process = None + return f"lldb-dap died with {signal.Signals(-self.returncode).name}." + except ValueError: + return f"lldb-dap died with unknown signal {-self.returncode}." + else: + return f"lldb-dap returned non-zero exit status {self.returncode}." def attach_options_specified(options): diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index c5a7eb76a58c7..d7cf8e2864324 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -1,5 +1,6 @@ import os import time +from typing import Optional import uuid import dap_server @@ -11,10 +12,14 @@ class DAPTestCaseBase(TestBase): # set timeout based on whether ASAN was enabled or not. Increase # timeout by a factor of 10 if ASAN is enabled. 
- timeoutval = 10 * (10 if ("ASAN_OPTIONS" in os.environ) else 1) + DEFAULT_TIMEOUT = 10 * (10 if ("ASAN_OPTIONS" in os.environ) else 1) NO_DEBUG_INFO_TESTCASE = True - def create_debug_adapter(self, lldbDAPEnv=None, connection=None): + def create_debug_adapter( + self, + lldbDAPEnv: Optional[dict[str, str]] = None, + connection: Optional[str] = None, + ): """Create the Visual Studio Code debug adapter""" self.assertTrue( is_exe(self.lldbDAPExec), "lldb-dap must exist and be executable" @@ -28,7 +33,11 @@ def create_debug_adapter(self, lldbDAPEnv=None, connection=None): env=lldbDAPEnv, ) - def build_and_create_debug_adapter(self, lldbDAPEnv=None, dictionary=None): + def build_and_create_debug_adapter( + self, + lldbDAPEnv: Optional[dict[str, str]] = None, + dictionary: Optional[dict] = None, + ): self.build(dictionary=dictionary) self.create_debug_adapter(lldbDAPEnv) @@ -78,13 +87,13 @@ def waitUntil(self, condition_callback): time.sleep(0.5) return False - def verify_breakpoint_hit(self, breakpoint_ids): + def verify_breakpoint_hit(self, breakpoint_ids, timeout=DEFAULT_TIMEOUT): """Wait for the process we are debugging to stop, and verify we hit any breakpoint location in the "breakpoint_ids" array. "breakpoint_ids" should be a list of breakpoint ID strings (["1", "2"]). The return value from self.set_source_breakpoints() or self.set_function_breakpoints() can be passed to this function""" - stopped_events = self.dap_server.wait_for_stopped() + stopped_events = self.dap_server.wait_for_stopped(timeout) for stopped_event in stopped_events: if "body" in stopped_event: body = stopped_event["body"] @@ -110,16 +119,15 @@ def verify_breakpoint_hit(self, breakpoint_ids): match_desc = "breakpoint %s." 
% (breakpoint_id) if match_desc in description: return - self.assertTrue(False, "breakpoint not hit") + self.assertTrue(False, f"breakpoint not hit, stopped_events={stopped_events}") - def verify_stop_exception_info(self, expected_description, timeout=timeoutval): + def verify_stop_exception_info(self, expected_description, timeout=DEFAULT_TIMEOUT): """Wait for the process we are debugging to stop, and verify the stop reason is 'exception' and that the description matches 'expected_description' """ - stopped_events = self.dap_server.wait_for_stopped(timeout=timeout) + stopped_events = self.dap_server.wait_for_stopped(timeout) for stopped_event in stopped_events: - print("stopped_event", stopped_event) if "body" in stopped_event: body = stopped_event["body"] if "reason" not in body: @@ -263,46 +271,61 @@ def set_global(self, name, value, id=None): return self.dap_server.request_setVariable(2, name, str(value), id=id) def stepIn( - self, threadId=None, targetId=None, waitForStop=True, granularity="statement" + self, + threadId=None, + targetId=None, + waitForStop=True, + granularity="statement", + timeout=DEFAULT_TIMEOUT, ): response = self.dap_server.request_stepIn( threadId=threadId, targetId=targetId, granularity=granularity ) self.assertTrue(response["success"]) if waitForStop: - return self.dap_server.wait_for_stopped() + return self.dap_server.wait_for_stopped(timeout) return None - def stepOver(self, threadId=None, waitForStop=True, granularity="statement"): + def stepOver( + self, + threadId=None, + waitForStop=True, + granularity="statement", + timeout=DEFAULT_TIMEOUT, + ): self.dap_server.request_next(threadId=threadId, granularity=granularity) if waitForStop: - return self.dap_server.wait_for_stopped() + return self.dap_server.wait_for_stopped(timeout) return None - def stepOut(self, threadId=None, waitForStop=True): + def stepOut(self, threadId=None, waitForStop=True, timeout=DEFAULT_TIMEOUT): self.dap_server.request_stepOut(threadId=threadId) if 
waitForStop: - return self.dap_server.wait_for_stopped() + return self.dap_server.wait_for_stopped(timeout) return None - def continue_to_next_stop(self): - self.dap_server.request_continue() - return self.dap_server.wait_for_stopped() + def do_continue(self): # `continue` is a keyword. + resp = self.dap_server.request_continue() + self.assertTrue(resp["success"], f"continue request failed: {resp}") + + def continue_to_next_stop(self, timeout=DEFAULT_TIMEOUT): + self.do_continue() + return self.dap_server.wait_for_stopped(timeout) - def continue_to_breakpoints(self, breakpoint_ids): - self.dap_server.request_continue() - self.verify_breakpoint_hit(breakpoint_ids) + def continue_to_breakpoints(self, breakpoint_ids, timeout=DEFAULT_TIMEOUT): + self.do_continue() + self.verify_breakpoint_hit(breakpoint_ids, timeout) - def continue_to_exception_breakpoint(self, filter_label): - self.dap_server.request_continue() + def continue_to_exception_breakpoint(self, filter_label, timeout=DEFAULT_TIMEOUT): + self.do_continue() self.assertTrue( - self.verify_stop_exception_info(filter_label), + self.verify_stop_exception_info(filter_label, timeout), 'verify we got "%s"' % (filter_label), ) - def continue_to_exit(self, exitCode=0): - self.dap_server.request_continue() - stopped_events = self.dap_server.wait_for_stopped() + def continue_to_exit(self, exitCode=0, timeout=DEFAULT_TIMEOUT): + self.do_continue() + stopped_events = self.dap_server.wait_for_stopped(timeout) self.assertEqual( len(stopped_events), 1, "stopped_events = {}".format(stopped_events) ) @@ -330,27 +353,15 @@ def disassemble(self, threadId=None, frameIndex=None): def attach( self, - program=None, - pid=None, - waitFor=None, - trace=None, - initCommands=None, - preRunCommands=None, - stopCommands=None, - exitCommands=None, - attachCommands=None, - coreFile=None, + *, stopOnAttach=True, disconnectAutomatically=True, - terminateCommands=None, - postRunCommands=None, - sourceMap=None, sourceInitFile=False, 
expectFailure=False, - gdbRemotePort=None, - gdbRemoteHostname=None, sourceBreakpoints=None, functionBreakpoints=None, + timeout=DEFAULT_TIMEOUT, + **kwargs, ): """Build the default Makefile target, create the DAP debug adapter, and attach to the process. @@ -367,7 +378,7 @@ def cleanup(): self.addTearDownHook(cleanup) # Initialize and launch the program self.dap_server.request_initialize(sourceInitFile) - self.dap_server.wait_for_event("initialized") + self.dap_server.wait_for_event("initialized", timeout) # Set source breakpoints as part of the launch sequence. if sourceBreakpoints: @@ -389,64 +400,28 @@ def cleanup(): ) self.dap_server.request_configurationDone() - response = self.dap_server.request_attach( - program=program, - pid=pid, - waitFor=waitFor, - trace=trace, - initCommands=initCommands, - preRunCommands=preRunCommands, - stopCommands=stopCommands, - exitCommands=exitCommands, - attachCommands=attachCommands, - terminateCommands=terminateCommands, - coreFile=coreFile, - stopOnAttach=stopOnAttach, - postRunCommands=postRunCommands, - sourceMap=sourceMap, - gdbRemotePort=gdbRemotePort, - gdbRemoteHostname=gdbRemoteHostname, - ) + response = self.dap_server.request_attach(stopOnAttach=stopOnAttach, **kwargs) if expectFailure: return response if not (response and response["success"]): self.assertTrue( response["success"], "attach failed (%s)" % (response["message"]) ) + if stopOnAttach: + self.dap_server.wait_for_stopped(timeout) def launch( self, program=None, - args=None, - cwd=None, - env=None, - stopOnEntry=False, - disableASLR=False, - disableSTDIO=False, - shellExpandArguments=False, - trace=False, - initCommands=None, - preRunCommands=None, - stopCommands=None, - exitCommands=None, - terminateCommands=None, - sourcePath=None, - debuggerRoot=None, + *, sourceInitFile=False, - launchCommands=None, - sourceMap=None, disconnectAutomatically=True, - runInTerminal=False, - expectFailure=False, - postRunCommands=None, - enableAutoVariableSummaries=False, 
- displayExtendedBacktrace=False, - enableSyntheticChildDebugging=False, - commandEscapePrefix=None, - customFrameFormat=None, - customThreadFormat=None, sourceBreakpoints=None, functionBreakpoints=None, + expectFailure=False, + stopOnEntry=True, + timeout=DEFAULT_TIMEOUT, + **kwargs, ): """Sending launch request to dap""" @@ -462,7 +437,7 @@ def cleanup(): # Initialize and launch the program self.dap_server.request_initialize(sourceInitFile) - self.dap_server.wait_for_event("initialized") + self.dap_server.wait_for_event("initialized", timeout) # Set source breakpoints as part of the launch sequence. if sourceBreakpoints: @@ -487,76 +462,28 @@ def cleanup(): response = self.dap_server.request_launch( program, - args=args, - cwd=cwd, - env=env, stopOnEntry=stopOnEntry, - disableASLR=disableASLR, - disableSTDIO=disableSTDIO, - shellExpandArguments=shellExpandArguments, - trace=trace, - initCommands=initCommands, - preRunCommands=preRunCommands, - stopCommands=stopCommands, - exitCommands=exitCommands, - terminateCommands=terminateCommands, - sourcePath=sourcePath, - debuggerRoot=debuggerRoot, - launchCommands=launchCommands, - sourceMap=sourceMap, - runInTerminal=runInTerminal, - postRunCommands=postRunCommands, - enableAutoVariableSummaries=enableAutoVariableSummaries, - displayExtendedBacktrace=displayExtendedBacktrace, - enableSyntheticChildDebugging=enableSyntheticChildDebugging, - commandEscapePrefix=commandEscapePrefix, - customFrameFormat=customFrameFormat, - customThreadFormat=customThreadFormat, + **kwargs, ) if expectFailure: return response - if not (response and response["success"]): self.assertTrue( response["success"], "launch failed (%s)" % (response["body"]["error"]["format"]), ) + if stopOnEntry: + self.dap_server.wait_for_stopped(timeout) + return response def build_and_launch( self, program, - args=None, - cwd=None, - env=None, - stopOnEntry=False, - disableASLR=False, - disableSTDIO=False, - shellExpandArguments=False, - trace=False, - 
initCommands=None, - preRunCommands=None, - stopCommands=None, - exitCommands=None, - terminateCommands=None, - sourcePath=None, - debuggerRoot=None, - sourceInitFile=False, - runInTerminal=False, - disconnectAutomatically=True, - postRunCommands=None, - lldbDAPEnv=None, - enableAutoVariableSummaries=False, - displayExtendedBacktrace=False, - enableSyntheticChildDebugging=False, - commandEscapePrefix=None, - customFrameFormat=None, - customThreadFormat=None, - launchCommands=None, - expectFailure=False, - sourceBreakpoints=None, - functionBreakpoints=None, + *, + lldbDAPEnv: Optional[dict[str, str]] = None, + **kwargs, ): """Build the default Makefile target, create the DAP debug adapter, and launch the process. @@ -564,38 +491,7 @@ def build_and_launch( self.build_and_create_debug_adapter(lldbDAPEnv) self.assertTrue(os.path.exists(program), "executable must exist") - return self.launch( - program, - args, - cwd, - env, - stopOnEntry, - disableASLR, - disableSTDIO, - shellExpandArguments, - trace, - initCommands, - preRunCommands, - stopCommands, - exitCommands, - terminateCommands, - sourcePath, - debuggerRoot, - sourceInitFile, - runInTerminal=runInTerminal, - disconnectAutomatically=disconnectAutomatically, - postRunCommands=postRunCommands, - enableAutoVariableSummaries=enableAutoVariableSummaries, - enableSyntheticChildDebugging=enableSyntheticChildDebugging, - displayExtendedBacktrace=displayExtendedBacktrace, - commandEscapePrefix=commandEscapePrefix, - customFrameFormat=customFrameFormat, - customThreadFormat=customThreadFormat, - launchCommands=launchCommands, - expectFailure=expectFailure, - sourceBreakpoints=sourceBreakpoints, - functionBreakpoints=functionBreakpoints, - ) + return self.launch(program, **kwargs) def getBuiltinDebugServerTool(self): # Tries to find simulation/lldb-server/gdbserver tool path. 
diff --git a/lldb/source/Core/Address.cpp b/lldb/source/Core/Address.cpp index 1dab874a96583..a967bf5491211 100644 --- a/lldb/source/Core/Address.cpp +++ b/lldb/source/Core/Address.cpp @@ -263,22 +263,11 @@ bool Address::ResolveAddressUsingFileSections(addr_t file_addr, return false; // Failed to resolve this address to a section offset value } -/// if "addr_range_ptr" is not NULL, then fill in with the address range of the function. -bool Address::ResolveFunctionScope(SymbolContext &sym_ctx, - AddressRange *addr_range_ptr) { +bool Address::ResolveFunctionScope(SymbolContext &sym_ctx) { constexpr SymbolContextItem resolve_scope = eSymbolContextFunction | eSymbolContextSymbol; - if (!(CalculateSymbolContext(&sym_ctx, resolve_scope) & resolve_scope)) { - if (addr_range_ptr) - addr_range_ptr->Clear(); - return false; - } - - if (!addr_range_ptr) - return true; - - return sym_ctx.GetAddressRange(resolve_scope, 0, false, *addr_range_ptr); + return CalculateSymbolContext(&sym_ctx, resolve_scope) & resolve_scope; } ModuleSP Address::GetModule() const { diff --git a/lldb/source/Interpreter/Options.cpp b/lldb/source/Interpreter/Options.cpp index fdadba62987d3..4cf68db466158 100644 --- a/lldb/source/Interpreter/Options.cpp +++ b/lldb/source/Interpreter/Options.cpp @@ -1076,7 +1076,7 @@ llvm::Expected Options::ParseAlias(const Args &args, if (!input_line.empty()) { llvm::StringRef tmp_arg = args_copy[idx].ref(); - size_t pos = input_line.find(std::string(tmp_arg)); + size_t pos = input_line.find(tmp_arg); if (pos != std::string::npos) input_line.erase(pos, tmp_arg.size()); } diff --git a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp index 7d8d0a4d3d671..3bafb21f7c33a 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp +++ b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp @@ -19,6 +19,7 @@ #include using namespace lldb; +using namespace lldb_private; LLDB_PLUGIN_DEFINE(ABIAArch64) @@ -200,3 +201,44 @@ void 
ABIAArch64::AugmentRegisterInfo( lldb::eEncodingIEEE754, lldb::eFormatFloat); } } + +UnwindPlanSP ABIAArch64::CreateFunctionEntryUnwindPlan() { + UnwindPlan::Row row; + + // Our previous Call Frame Address is the stack pointer + row.GetCFAValue().SetIsRegisterPlusOffset(LLDB_REGNUM_GENERIC_SP, 0); + + // Our previous PC is in the LR, all other registers are the same. + row.SetRegisterLocationToRegister(LLDB_REGNUM_GENERIC_PC, + LLDB_REGNUM_GENERIC_RA, true); + + auto plan_sp = std::make_shared(eRegisterKindGeneric); + plan_sp->AppendRow(std::move(row)); + plan_sp->SetSourceName("arm64 at-func-entry default"); + plan_sp->SetSourcedFromCompiler(eLazyBoolNo); + plan_sp->SetUnwindPlanValidAtAllInstructions(eLazyBoolNo); + plan_sp->SetUnwindPlanForSignalTrap(eLazyBoolNo); + return plan_sp; +} + +UnwindPlanSP ABIAArch64::CreateDefaultUnwindPlan() { + UnwindPlan::Row row; + const int32_t ptr_size = 8; + + row.GetCFAValue().SetIsRegisterPlusOffset(LLDB_REGNUM_GENERIC_FP, + 2 * ptr_size); + row.SetUnspecifiedRegistersAreUndefined(true); + + row.SetRegisterLocationToAtCFAPlusOffset(LLDB_REGNUM_GENERIC_FP, + ptr_size * -2, true); + row.SetRegisterLocationToAtCFAPlusOffset(LLDB_REGNUM_GENERIC_PC, + ptr_size * -1, true); + + auto plan_sp = std::make_shared(eRegisterKindGeneric); + plan_sp->AppendRow(std::move(row)); + plan_sp->SetSourceName("arm64 default unwind plan"); + plan_sp->SetSourcedFromCompiler(eLazyBoolNo); + plan_sp->SetUnwindPlanValidAtAllInstructions(eLazyBoolNo); + plan_sp->SetUnwindPlanForSignalTrap(eLazyBoolNo); + return plan_sp; +} diff --git a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.h b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.h index 52e42f1260a83..53702f4da580d 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.h +++ b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.h @@ -19,6 +19,9 @@ class ABIAArch64 : public lldb_private::MCBasedABI { lldb::addr_t FixCodeAddress(lldb::addr_t pc) override; lldb::addr_t FixDataAddress(lldb::addr_t pc) override; + 
lldb::UnwindPlanSP CreateFunctionEntryUnwindPlan() override; + lldb::UnwindPlanSP CreateDefaultUnwindPlan() override; + protected: virtual lldb::addr_t FixAddress(lldb::addr_t pc, lldb::addr_t mask) { return pc; diff --git a/lldb/source/Plugins/ABI/AArch64/ABIMacOSX_arm64.cpp b/lldb/source/Plugins/ABI/AArch64/ABIMacOSX_arm64.cpp index f86ab8cbb1195..094e0523a4edf 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABIMacOSX_arm64.cpp +++ b/lldb/source/Plugins/ABI/AArch64/ABIMacOSX_arm64.cpp @@ -17,7 +17,6 @@ #include "lldb/Core/Module.h" #include "lldb/Core/PluginManager.h" #include "lldb/Core/Value.h" -#include "lldb/Symbol/UnwindPlan.h" #include "lldb/Target/Process.h" #include "lldb/Target/RegisterContext.h" #include "lldb/Target/Target.h" @@ -30,8 +29,6 @@ #include "lldb/Utility/Status.h" #include "lldb/ValueObject/ValueObjectConstResult.h" -#include "Utility/ARM64_DWARF_Registers.h" - using namespace lldb; using namespace lldb_private; @@ -344,48 +341,6 @@ ABIMacOSX_arm64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, return error; } -UnwindPlanSP ABIMacOSX_arm64::CreateFunctionEntryUnwindPlan() { - uint32_t lr_reg_num = arm64_dwarf::lr; - uint32_t sp_reg_num = arm64_dwarf::sp; - uint32_t pc_reg_num = arm64_dwarf::pc; - - UnwindPlan::Row row; - - // Our previous Call Frame Address is the stack pointer - row.GetCFAValue().SetIsRegisterPlusOffset(sp_reg_num, 0); - - // Our previous PC is in the LR, all other registers are the same. 
- row.SetRegisterLocationToRegister(pc_reg_num, lr_reg_num, true); - - auto plan_sp = std::make_shared(eRegisterKindDWARF); - plan_sp->AppendRow(std::move(row)); - plan_sp->SetSourceName("arm64 at-func-entry default"); - plan_sp->SetSourcedFromCompiler(eLazyBoolNo); - return plan_sp; -} - -UnwindPlanSP ABIMacOSX_arm64::CreateDefaultUnwindPlan() { - uint32_t fp_reg_num = arm64_dwarf::fp; - uint32_t pc_reg_num = arm64_dwarf::pc; - - UnwindPlan::Row row; - const int32_t ptr_size = 8; - - row.GetCFAValue().SetIsRegisterPlusOffset(fp_reg_num, 2 * ptr_size); - row.SetUnspecifiedRegistersAreUndefined(true); - - row.SetRegisterLocationToAtCFAPlusOffset(fp_reg_num, ptr_size * -2, true); - row.SetRegisterLocationToAtCFAPlusOffset(pc_reg_num, ptr_size * -1, true); - - auto plan_sp = std::make_shared(eRegisterKindDWARF); - plan_sp->AppendRow(std::move(row)); - plan_sp->SetSourceName("arm64-apple-darwin default unwind plan"); - plan_sp->SetSourcedFromCompiler(eLazyBoolNo); - plan_sp->SetUnwindPlanValidAtAllInstructions(eLazyBoolNo); - plan_sp->SetUnwindPlanForSignalTrap(eLazyBoolNo); - return plan_sp; -} - // AAPCS64 (Procedure Call Standard for the ARM 64-bit Architecture) says // registers x19 through x28 and sp are callee preserved. 
v8-v15 are non- // volatile (and specifically only the lower 8 bytes of these regs), the rest diff --git a/lldb/source/Plugins/ABI/AArch64/ABIMacOSX_arm64.h b/lldb/source/Plugins/ABI/AArch64/ABIMacOSX_arm64.h index 94a60327c6181..c8851709f50ad 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABIMacOSX_arm64.h +++ b/lldb/source/Plugins/ABI/AArch64/ABIMacOSX_arm64.h @@ -27,10 +27,6 @@ class ABIMacOSX_arm64 : public ABIAArch64 { bool GetArgumentValues(lldb_private::Thread &thread, lldb_private::ValueList &values) const override; - lldb::UnwindPlanSP CreateFunctionEntryUnwindPlan() override; - - lldb::UnwindPlanSP CreateDefaultUnwindPlan() override; - bool RegisterIsVolatile(const lldb_private::RegisterInfo *reg_info) override; // The arm64 ABI requires that stack frames be 16 byte aligned. diff --git a/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.cpp b/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.cpp index 6e07c0982be0e..aa9c20b6bb2cf 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.cpp +++ b/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.cpp @@ -30,8 +30,6 @@ #include "lldb/Utility/Status.h" #include "lldb/ValueObject/ValueObjectConstResult.h" -#include "Utility/ARM64_DWARF_Registers.h" - using namespace lldb; using namespace lldb_private; @@ -385,48 +383,6 @@ Status ABISysV_arm64::SetReturnValueObject(lldb::StackFrameSP &frame_sp, return error; } -UnwindPlanSP ABISysV_arm64::CreateFunctionEntryUnwindPlan() { - uint32_t lr_reg_num = arm64_dwarf::lr; - uint32_t sp_reg_num = arm64_dwarf::sp; - - UnwindPlan::Row row; - - // Our previous Call Frame Address is the stack pointer, all other registers - // are the same. 
- row.GetCFAValue().SetIsRegisterPlusOffset(sp_reg_num, 0); - - auto plan_sp = std::make_shared(eRegisterKindDWARF); - plan_sp->AppendRow(std::move(row)); - plan_sp->SetReturnAddressRegister(lr_reg_num); - plan_sp->SetSourceName("arm64 at-func-entry default"); - plan_sp->SetSourcedFromCompiler(eLazyBoolNo); - plan_sp->SetUnwindPlanValidAtAllInstructions(eLazyBoolNo); - plan_sp->SetUnwindPlanForSignalTrap(eLazyBoolNo); - return plan_sp; -} - -UnwindPlanSP ABISysV_arm64::CreateDefaultUnwindPlan() { - uint32_t fp_reg_num = arm64_dwarf::fp; - uint32_t pc_reg_num = arm64_dwarf::pc; - - UnwindPlan::Row row; - const int32_t ptr_size = 8; - - row.GetCFAValue().SetIsRegisterPlusOffset(fp_reg_num, 2 * ptr_size); - row.SetUnspecifiedRegistersAreUndefined(true); - - row.SetRegisterLocationToAtCFAPlusOffset(fp_reg_num, ptr_size * -2, true); - row.SetRegisterLocationToAtCFAPlusOffset(pc_reg_num, ptr_size * -1, true); - - auto plan_sp = std::make_shared(eRegisterKindDWARF); - plan_sp->AppendRow(std::move(row)); - plan_sp->SetSourceName("arm64 default unwind plan"); - plan_sp->SetSourcedFromCompiler(eLazyBoolNo); - plan_sp->SetUnwindPlanValidAtAllInstructions(eLazyBoolNo); - plan_sp->SetUnwindPlanForSignalTrap(eLazyBoolNo); - return plan_sp; -} - // AAPCS64 (Procedure Call Standard for the ARM 64-bit Architecture) says // registers x19 through x28 and sp are callee preserved. 
v8-v15 are non- // volatile (and specifically only the lower 8 bytes of these regs), the rest diff --git a/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.h b/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.h index 2b8e608d4caab..213fbf7417b2c 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.h +++ b/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.h @@ -30,10 +30,6 @@ class ABISysV_arm64 : public ABIAArch64 { SetReturnValueObject(lldb::StackFrameSP &frame_sp, lldb::ValueObjectSP &new_value) override; - lldb::UnwindPlanSP CreateFunctionEntryUnwindPlan() override; - - lldb::UnwindPlanSP CreateDefaultUnwindPlan() override; - bool RegisterIsVolatile(const lldb_private::RegisterInfo *reg_info) override; // The arm64 ABI requires that stack frames be 16 byte aligned. diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ASTStructExtractor.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ASTStructExtractor.cpp index a2722db5d24a0..451cf40e2818d 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ASTStructExtractor.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ASTStructExtractor.cpp @@ -123,8 +123,7 @@ void ASTStructExtractor::ExtractFromTopLevelDecl(Decl *D) { FunctionDecl *function_decl = dyn_cast(D); if (m_ast_context && function_decl && - !m_function.m_wrapper_function_name.compare( - function_decl->getNameAsString())) { + m_function.m_wrapper_function_name == function_decl->getNameAsString()) { ExtractFromFunctionDecl(function_decl); } } diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp index 1666677c360ba..e629355cd40b9 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp @@ -39,7 +39,6 @@ using namespace lldb; using namespace lldb_private; LLDB_PLUGIN_DEFINE(ObjectFileXCOFF) - // FIXME: target 64bit at this moment. // Static methods. 
@@ -95,10 +94,11 @@ bool ObjectFileXCOFF::CreateBinary() { Log *log = GetLog(LLDBLog::Object); - auto binary = llvm::object::ObjectFile::createObjectFile( - llvm::MemoryBufferRef(toStringRef(m_data.GetData()), - m_file.GetFilename().GetStringRef()), - file_magic::xcoff_object_64); + auto memory_ref = llvm::MemoryBufferRef(toStringRef(m_data.GetData()), + m_file.GetFilename().GetStringRef()); + llvm::file_magic magic = llvm::identify_magic(memory_ref.getBuffer()); + + auto binary = llvm::object::ObjectFile::createObjectFile(memory_ref, magic); if (!binary) { LLDB_LOG_ERROR(log, binary.takeError(), "Failed to create binary for file ({1}): {0}", m_file); @@ -143,9 +143,9 @@ size_t ObjectFileXCOFF::GetModuleSpecifications( static uint32_t XCOFFHeaderSizeFromMagic(uint32_t magic) { switch (magic) { - // TODO: 32bit not supported. - // case XCOFF::XCOFF32: - // return sizeof(struct llvm::object::XCOFFFileHeader32); + case XCOFF::XCOFF32: + return sizeof(struct llvm::object::XCOFFFileHeader32); + break; case XCOFF::XCOFF64: return sizeof(struct llvm::object::XCOFFFileHeader64); break; @@ -169,8 +169,9 @@ bool ObjectFileXCOFF::MagicBytesMatch(DataBufferSP &data_sp, } bool ObjectFileXCOFF::ParseHeader() { - // Only 64-bit is supported for now - return m_binary->fileHeader64()->Magic == XCOFF::XCOFF64; + if (m_binary->is64Bit()) + return m_binary->fileHeader64()->Magic == XCOFF::XCOFF64; + return m_binary->fileHeader32()->Magic == XCOFF::XCOFF32; } ByteOrder ObjectFileXCOFF::GetByteOrder() const { return eByteOrderBig; } @@ -178,8 +179,9 @@ ByteOrder ObjectFileXCOFF::GetByteOrder() const { return eByteOrderBig; } bool ObjectFileXCOFF::IsExecutable() const { return true; } uint32_t ObjectFileXCOFF::GetAddressByteSize() const { - // 32-bit not supported. 
return 8 for 64-bit XCOFF::XCOFF64 - return 8; + if (m_binary->is64Bit()) + return 8; + return 4; } AddressClass ObjectFileXCOFF::GetAddressClass(addr_t file_addr) { @@ -191,20 +193,37 @@ void ObjectFileXCOFF::ParseSymtab(Symtab &lldb_symtab) {} bool ObjectFileXCOFF::IsStripped() { return false; } void ObjectFileXCOFF::CreateSections(SectionList &unified_section_list) { + if (m_sections_up) return; m_sections_up = std::make_unique(); - ModuleSP module_sp(GetModule()); + if (m_binary->is64Bit()) + CreateSectionsWithBitness(unified_section_list); + else + CreateSectionsWithBitness(unified_section_list); +} +template +static auto GetSections(llvm::object::XCOFFObjectFile *binary) { + if constexpr (T::Is64Bit) + return binary->sections64(); + else + return binary->sections32(); +} + +template +void ObjectFileXCOFF::CreateSectionsWithBitness( + SectionList &unified_section_list) { + ModuleSP module_sp(GetModule()); if (!module_sp) return; std::lock_guard guard(module_sp->GetMutex()); int idx = 0; - for (const llvm::object::XCOFFSectionHeader64 §ion : - m_binary->sections64()) { + for (const typename T::SectionHeader §ion : + GetSections(m_binary.get())) { ConstString const_sect_name(section.Name); @@ -253,9 +272,13 @@ UUID ObjectFileXCOFF::GetUUID() { return UUID(); } uint32_t ObjectFileXCOFF::GetDependentModules(FileSpecList &files) { return 0; } ObjectFile::Type ObjectFileXCOFF::CalculateType() { - if (m_binary->fileHeader64()->Flags & XCOFF::F_EXEC) + + const auto flags = m_binary->is64Bit() ? 
m_binary->fileHeader64()->Flags + : m_binary->fileHeader32()->Flags; + + if (flags & XCOFF::F_EXEC) return eTypeExecutable; - else if (m_binary->fileHeader64()->Flags & XCOFF::F_SHROBJ) + else if (flags & XCOFF::F_SHROBJ) return eTypeSharedLibrary; return eTypeUnknown; } diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h index 2d4f9f3f2dab8..2cecd0315463a 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h @@ -104,6 +104,18 @@ class ObjectFileXCOFF : public lldb_private::ObjectFile { private: bool CreateBinary(); + template + void + CreateSectionsWithBitness(lldb_private::SectionList &unified_section_list); + + struct XCOFF32 { + using SectionHeader = llvm::object::XCOFFSectionHeader32; + static constexpr bool Is64Bit = false; + }; + struct XCOFF64 { + using SectionHeader = llvm::object::XCOFFSectionHeader64; + static constexpr bool Is64Bit = true; + }; std::unique_ptr m_binary; }; diff --git a/lldb/source/Plugins/Process/AIX/CMakeLists.txt b/lldb/source/Plugins/Process/AIX/CMakeLists.txt index 9a3c77bd2ffeb..911f30349ef52 100644 --- a/lldb/source/Plugins/Process/AIX/CMakeLists.txt +++ b/lldb/source/Plugins/Process/AIX/CMakeLists.txt @@ -1,5 +1,6 @@ add_lldb_library(lldbPluginProcessAIX NativeProcessAIX.cpp + NativeThreadAIX.cpp LINK_LIBS lldbCore diff --git a/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp new file mode 100644 index 0000000000000..3bb608168ce30 --- /dev/null +++ b/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp @@ -0,0 +1,58 @@ +//===-- NativeThreadAIX.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "NativeThreadAIX.h" +#include "NativeProcessAIX.h" +#include "lldb/Utility/State.h" + +using namespace lldb; +using namespace lldb_private; +using namespace lldb_private::process_aix; + +NativeThreadAIX::NativeThreadAIX(NativeProcessAIX &process, lldb::tid_t tid) + : NativeThreadProtocol(process, tid), m_state(StateType::eStateInvalid) {} + +std::string NativeThreadAIX::GetName() { return ""; } + +lldb::StateType NativeThreadAIX::GetState() { return m_state; } + +bool NativeThreadAIX::GetStopReason(ThreadStopInfo &stop_info, + std::string &description) { + return false; +} + +Status NativeThreadAIX::SetWatchpoint(lldb::addr_t addr, size_t size, + uint32_t watch_flags, bool hardware) { + return Status("Unable to Set hardware watchpoint."); +} + +Status NativeThreadAIX::RemoveWatchpoint(lldb::addr_t addr) { + return Status("Clearing hardware watchpoint failed."); +} + +Status NativeThreadAIX::SetHardwareBreakpoint(lldb::addr_t addr, size_t size) { + return Status("Unable to set hardware breakpoint."); +} + +Status NativeThreadAIX::RemoveHardwareBreakpoint(lldb::addr_t addr) { + return Status("Clearing hardware breakpoint failed."); +} + +NativeProcessAIX &NativeThreadAIX::GetProcess() { + return static_cast(m_process); +} + +const NativeProcessAIX &NativeThreadAIX::GetProcess() const { + return static_cast(m_process); +} + +llvm::Expected> +NativeThreadAIX::GetSiginfo() const { + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "Not implemented"); +} diff --git a/lldb/source/Plugins/Process/AIX/NativeThreadAIX.h b/lldb/source/Plugins/Process/AIX/NativeThreadAIX.h new file mode 100644 index 0000000000000..e32d3db2c5fa2 --- /dev/null +++ b/lldb/source/Plugins/Process/AIX/NativeThreadAIX.h @@ -0,0 +1,53 @@ +//===-- NativeThreadAIX.h ----------------------------------- -*- C++ -*-===// +// +// Part 
of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_PROCESS_AIX_NATIVETHREADAIX_H_ +#define LLDB_SOURCE_PLUGINS_PROCESS_AIX_NATIVETHREADAIX_H_ + +#include "lldb/Host/common/NativeThreadProtocol.h" + +namespace lldb_private::process_aix { + +class NativeProcessAIX; + +class NativeThreadAIX : public NativeThreadProtocol { + friend class NativeProcessAIX; + +public: + NativeThreadAIX(NativeProcessAIX &process, lldb::tid_t tid); + + // NativeThreadProtocol Interface + std::string GetName() override; + + lldb::StateType GetState() override; + + bool GetStopReason(ThreadStopInfo &stop_info, + std::string &description) override; + + Status SetWatchpoint(lldb::addr_t addr, size_t size, uint32_t watch_flags, + bool hardware) override; + + Status RemoveWatchpoint(lldb::addr_t addr) override; + + Status SetHardwareBreakpoint(lldb::addr_t addr, size_t size) override; + + Status RemoveHardwareBreakpoint(lldb::addr_t addr) override; + + NativeProcessAIX &GetProcess(); + + const NativeProcessAIX &GetProcess() const; + + llvm::Expected> + GetSiginfo() const override; + +private: + lldb::StateType m_state; +}; +} // namespace lldb_private::process_aix + +#endif // #ifndef LLDB_SOURCE_PLUGINS_PROCESS_AIX_NATIVETHREADAIX_H_ diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index a3e809f44ed23..e3a866e2b6d48 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -2479,8 +2479,8 @@ Function *DWARFASTParserClang::ParseFunctionFromDWARF( std::unique_ptr decl_up; if (decl_file || decl_line || decl_column) decl_up = std::make_unique( - 
die.GetCU()->GetFile(decl_file ? *decl_file : 0), - decl_line ? *decl_line : 0, decl_column ? *decl_column : 0); + die.GetCU()->GetFile(decl_file.value_or(0)), decl_line.value_or(0), + decl_column.value_or(0)); SymbolFileDWARF *dwarf = die.GetDWARF(); // Supply the type _only_ if it has already been parsed diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 907d63eb51afe..0fc7f79be70ec 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -1358,15 +1358,15 @@ size_t SymbolFileDWARF::ParseBlocksRecursive(CompileUnit &comp_unit, if (decl_file || decl_line || decl_column) decl_up = std::make_unique( comp_unit.GetSupportFiles().GetFileSpecAtIndex( - decl_file ? *decl_file : 0), - decl_line ? *decl_line : 0, decl_column ? *decl_column : 0); + decl_file.value_or(0)), + decl_line.value_or(0), decl_column.value_or(0)); std::unique_ptr call_up; if (call_file || call_line || call_column) call_up = std::make_unique( comp_unit.GetSupportFiles().GetFileSpecAtIndex( - call_file ? *call_file : 0), - call_line ? *call_line : 0, call_column ? 
*call_column : 0); + call_file.value_or(0)), + call_line.value_or(0), call_column.value_or(0)); block->SetInlinedFunctionInfo(name, mangled_name, decl_up.get(), call_up.get()); diff --git a/lldb/source/Symbol/FuncUnwinders.cpp b/lldb/source/Symbol/FuncUnwinders.cpp index faec24cde7fdd..12a6d101d9930 100644 --- a/lldb/source/Symbol/FuncUnwinders.cpp +++ b/lldb/source/Symbol/FuncUnwinders.cpp @@ -365,33 +365,30 @@ FuncUnwinders::GetAssemblyUnwindPlan(Target &target, Thread &thread) { LazyBool FuncUnwinders::CompareUnwindPlansForIdenticalInitialPCLocation( Thread &thread, const std::shared_ptr &a, const std::shared_ptr &b) { - LazyBool plans_are_identical = eLazyBoolCalculate; + if (!a || !b) + return eLazyBoolCalculate; - RegisterNumber pc_reg(thread, eRegisterKindGeneric, LLDB_REGNUM_GENERIC_PC); - uint32_t pc_reg_lldb_regnum = pc_reg.GetAsKind(eRegisterKindLLDB); + const UnwindPlan::Row *a_first_row = a->GetRowAtIndex(0); + const UnwindPlan::Row *b_first_row = b->GetRowAtIndex(0); + if (!a_first_row || !b_first_row) + return eLazyBoolCalculate; - if (a && b) { - const UnwindPlan::Row *a_first_row = a->GetRowAtIndex(0); - const UnwindPlan::Row *b_first_row = b->GetRowAtIndex(0); + RegisterNumber pc_reg(thread, eRegisterKindGeneric, LLDB_REGNUM_GENERIC_PC); + uint32_t a_pc_regnum = pc_reg.GetAsKind(a->GetRegisterKind()); + uint32_t b_pc_regnum = pc_reg.GetAsKind(b->GetRegisterKind()); - if (a_first_row && b_first_row) { - UnwindPlan::Row::AbstractRegisterLocation a_pc_regloc; - UnwindPlan::Row::AbstractRegisterLocation b_pc_regloc; + UnwindPlan::Row::AbstractRegisterLocation a_pc_regloc; + UnwindPlan::Row::AbstractRegisterLocation b_pc_regloc; - a_first_row->GetRegisterInfo(pc_reg_lldb_regnum, a_pc_regloc); - b_first_row->GetRegisterInfo(pc_reg_lldb_regnum, b_pc_regloc); + a_first_row->GetRegisterInfo(a_pc_regnum, a_pc_regloc); + b_first_row->GetRegisterInfo(b_pc_regnum, b_pc_regloc); - plans_are_identical = eLazyBoolYes; + if (a_first_row->GetCFAValue() != 
b_first_row->GetCFAValue()) + return eLazyBoolNo; + if (a_pc_regloc != b_pc_regloc) + return eLazyBoolNo; - if (a_first_row->GetCFAValue() != b_first_row->GetCFAValue()) { - plans_are_identical = eLazyBoolNo; - } - if (a_pc_regloc != b_pc_regloc) { - plans_are_identical = eLazyBoolNo; - } - } - } - return plans_are_identical; + return eLazyBoolYes; } std::shared_ptr diff --git a/lldb/source/Target/DynamicRegisterInfo.cpp b/lldb/source/Target/DynamicRegisterInfo.cpp index 9ad98a41c688c..b964dc5877a97 100644 --- a/lldb/source/Target/DynamicRegisterInfo.cpp +++ b/lldb/source/Target/DynamicRegisterInfo.cpp @@ -497,10 +497,7 @@ void DynamicRegisterInfo::Finalize(const ArchSpec &arch) { pos != end; ++pos) { if (pos->second.size() > 1) { llvm::sort(pos->second); - reg_num_collection::iterator unique_end = - std::unique(pos->second.begin(), pos->second.end()); - if (unique_end != pos->second.end()) - pos->second.erase(unique_end, pos->second.end()); + pos->second.erase(llvm::unique(pos->second), pos->second.end()); } assert(!pos->second.empty()); if (pos->second.back() != LLDB_INVALID_REGNUM) diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp index cf4b96c6eda9f..2958923a98010 100644 --- a/lldb/source/Target/RegisterContextUnwind.cpp +++ b/lldb/source/Target/RegisterContextUnwind.cpp @@ -160,8 +160,7 @@ void RegisterContextUnwind::InitializeZerothFrame() { UnwindLogMsg("using architectural default unwind method"); } - AddressRange addr_range; - m_sym_ctx_valid = m_current_pc.ResolveFunctionScope(m_sym_ctx, &addr_range); + m_sym_ctx_valid = m_current_pc.ResolveFunctionScope(m_sym_ctx); if (m_sym_ctx.symbol) { UnwindLogMsg("with pc value of 0x%" PRIx64 ", symbol name is '%s'", @@ -185,15 +184,9 @@ void RegisterContextUnwind::InitializeZerothFrame() { // If we were able to find a symbol/function, set addr_range to the bounds of // that symbol/function. 
else treat the current pc value as the start_pc and // record no offset. - if (addr_range.GetBaseAddress().IsValid()) { - m_start_pc = addr_range.GetBaseAddress(); - if (m_current_pc.GetSection() == m_start_pc.GetSection()) { - m_current_offset = m_current_pc.GetOffset() - m_start_pc.GetOffset(); - } else if (m_current_pc.GetModule() == m_start_pc.GetModule()) { - // This means that whatever symbol we kicked up isn't really correct --- - // we should not cross section boundaries ... We really should NULL out - // the function/symbol in this case unless there is a bad assumption here - // due to inlined functions? + if (m_sym_ctx_valid) { + m_start_pc = m_sym_ctx.GetFunctionOrSymbolAddress(); + if (m_current_pc.GetModule() == m_start_pc.GetModule()) { m_current_offset = m_current_pc.GetFileAddress() - m_start_pc.GetFileAddress(); } @@ -499,8 +492,7 @@ void RegisterContextUnwind::InitializeNonZerothFrame() { return; } - AddressRange addr_range; - m_sym_ctx_valid = m_current_pc.ResolveFunctionScope(m_sym_ctx, &addr_range); + m_sym_ctx_valid = m_current_pc.ResolveFunctionScope(m_sym_ctx); if (m_sym_ctx.symbol) { UnwindLogMsg("with pc value of 0x%" PRIx64 ", symbol name is '%s'", pc, @@ -524,9 +516,8 @@ void RegisterContextUnwind::InitializeNonZerothFrame() { // Don't decrement if we're "above" an asynchronous event like // sigtramp. decr_pc_and_recompute_addr_range = false; - } else if (!addr_range.GetBaseAddress().IsValid() || - addr_range.GetBaseAddress().GetSection() != m_current_pc.GetSection() || - addr_range.GetBaseAddress().GetOffset() != m_current_pc.GetOffset()) { + } else if (Address addr = m_sym_ctx.GetFunctionOrSymbolAddress(); + addr != m_current_pc) { // If our "current" pc isn't the start of a function, decrement the pc // if we're up the stack. 
if (m_behaves_like_zeroth_frame) @@ -559,7 +550,7 @@ void RegisterContextUnwind::InitializeNonZerothFrame() { Address temporary_pc; temporary_pc.SetLoadAddress(pc - 1, &process->GetTarget()); m_sym_ctx.Clear(false); - m_sym_ctx_valid = temporary_pc.ResolveFunctionScope(m_sym_ctx, &addr_range); + m_sym_ctx_valid = temporary_pc.ResolveFunctionScope(m_sym_ctx); UnwindLogMsg("Symbol is now %s", GetSymbolOrFunctionName(m_sym_ctx).AsCString("")); @@ -568,8 +559,8 @@ void RegisterContextUnwind::InitializeNonZerothFrame() { // If we were able to find a symbol/function, set addr_range_ptr to the // bounds of that symbol/function. else treat the current pc value as the // start_pc and record no offset. - if (addr_range.GetBaseAddress().IsValid()) { - m_start_pc = addr_range.GetBaseAddress(); + if (m_sym_ctx_valid) { + m_start_pc = m_sym_ctx.GetFunctionOrSymbolAddress(); m_current_offset = pc - m_start_pc.GetLoadAddress(&process->GetTarget()); m_current_offset_backed_up_one = m_current_offset; if (decr_pc_and_recompute_addr_range && @@ -1952,8 +1943,7 @@ void RegisterContextUnwind::PropagateTrapHandlerFlagFromUnwindPlan( GetSymbolOrFunctionName(m_sym_ctx).AsCString("")); m_current_offset_backed_up_one = m_current_offset; - AddressRange addr_range; - m_sym_ctx_valid = m_current_pc.ResolveFunctionScope(m_sym_ctx, &addr_range); + m_sym_ctx_valid = m_current_pc.ResolveFunctionScope(m_sym_ctx); UnwindLogMsg("Symbol is now %s", GetSymbolOrFunctionName(m_sym_ctx).AsCString("")); @@ -1962,9 +1952,11 @@ void RegisterContextUnwind::PropagateTrapHandlerFlagFromUnwindPlan( Process *process = exe_ctx.GetProcessPtr(); Target *target = &process->GetTarget(); - m_start_pc = addr_range.GetBaseAddress(); - m_current_offset = - m_current_pc.GetLoadAddress(target) - m_start_pc.GetLoadAddress(target); + if (m_sym_ctx_valid) { + m_start_pc = m_sym_ctx.GetFunctionOrSymbolAddress(); + m_current_offset = m_current_pc.GetLoadAddress(target) - + m_start_pc.GetLoadAddress(target); + } } } diff --git 
a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 7f61f8689fb95..9660fc97970b0 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -1511,8 +1511,7 @@ bool Target::IgnoreWatchpointByID(lldb::watch_id_t watch_id, ModuleSP Target::GetExecutableModule() { // search for the first executable in the module list - for (size_t i = 0; i < m_images.GetSize(); ++i) { - ModuleSP module_sp = m_images.GetModuleAtIndex(i); + for (ModuleSP module_sp : m_images.Modules()) { lldb_private::ObjectFile *obj = module_sp->GetObjectFile(); if (obj == nullptr) continue; diff --git a/lldb/test/API/functionalities/unwind/frameless-faulted/Makefile b/lldb/test/API/functionalities/unwind/frameless-faulted/Makefile index 954c184d433ec..de52eec91f1ab 100644 --- a/lldb/test/API/functionalities/unwind/frameless-faulted/Makefile +++ b/lldb/test/API/functionalities/unwind/frameless-faulted/Makefile @@ -1,7 +1,7 @@ C_SOURCES := main.c interrupt-and-trap-funcs.o: interrupt-and-trap-funcs.s - $(CC) $(CFLAGS) -E -o interrupt-and-trap-funcs.s $(SRCDIR)/interrupt-and-trap-funcs.s + $(CPP) -o interrupt-and-trap-funcs.s $(SRCDIR)/interrupt-and-trap-funcs.s $(CC) $(CFLAGS) -c -o interrupt-and-trap-funcs.o interrupt-and-trap-funcs.s include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py b/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py index a9218d3c3dde3..55557e6e0030e 100644 --- a/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py +++ b/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py @@ -2,15 +2,11 @@ Test lldb-dap attach request """ -import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil import lldbdap_testcase -import os -import shutil import subprocess -import tempfile import threading import time @@ -26,8 +22,6 @@ def spawn_and_wait(program, delay): class TestDAP_attach(lldbdap_testcase.DAPTestCaseBase): def 
set_and_hit_breakpoint(self, continueToExit=True): - self.dap_server.wait_for_stopped() - source = "main.c" breakpoint1_line = line_number(source, "// breakpoint 1") lines = [breakpoint1_line] @@ -36,7 +30,12 @@ def set_and_hit_breakpoint(self, continueToExit=True): self.assertEqual( len(breakpoint_ids), len(lines), "expect correct number of breakpoints" ) - self.continue_to_breakpoints(breakpoint_ids) + # Test binary will sleep for 10s, offset the breakpoint timeout + # accordingly. + timeout_offset = 10 + self.continue_to_breakpoints( + breakpoint_ids, timeout=timeout_offset + self.DEFAULT_TIMEOUT + ) if continueToExit: self.continue_to_exit() @@ -160,7 +159,7 @@ def test_commands(self): # Continue after launch and hit the "pause()" call and stop the target. # Get output from the console. This should contain both the # "stopCommands" that were run after we stop. - self.dap_server.request_continue() + self.do_continue() time.sleep(0.5) self.dap_server.request_pause() self.dap_server.wait_for_stopped() @@ -198,9 +197,6 @@ def test_attach_command_process_failures(self): ) @skipIfNetBSD # Hangs on NetBSD as well - @skipIf( - archs=["arm", "aarch64"] - ) # Example of a flaky run http://lab.llvm.org:8011/builders/lldb-aarch64-ubuntu/builds/5517/steps/test/logs/stdio def test_terminate_commands(self): """ Tests that the "terminateCommands", that can be passed during diff --git a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py index 8581f10cef22a..25f031db5cac5 100644 --- a/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py +++ b/lldb/test/API/tools/lldb-dap/breakpoint-events/TestDAP_breakpointEvents.py @@ -60,7 +60,7 @@ def test_breakpoint_events(self): response = self.dap_server.request_setBreakpoints( main_source_path, [main_bp_line] ) - self.assertTrue(response) + self.assertTrue(response["success"]) breakpoints = 
response["body"]["breakpoints"] for breakpoint in breakpoints: main_bp_id = breakpoint["id"] @@ -72,7 +72,7 @@ def test_breakpoint_events(self): response = self.dap_server.request_setBreakpoints( foo_source_path, [foo_bp1_line] ) - self.assertTrue(response) + self.assertTrue(response["success"]) breakpoints = response["body"]["breakpoints"] for breakpoint in breakpoints: foo_bp_id = breakpoint["id"] @@ -81,9 +81,6 @@ def test_breakpoint_events(self): breakpoint["verified"], "expect foo breakpoint to not be verified" ) - # Make sure we're stopped. - self.dap_server.wait_for_stopped() - # Flush the breakpoint events. self.dap_server.wait_for_breakpoint_events(timeout=5) diff --git a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py index 479a91208a66c..948c146d4da68 100644 --- a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py +++ b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py @@ -9,7 +9,7 @@ import lldbdap_testcase -class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase): +class TestDAP_cancel(lldbdap_testcase.DAPTestCaseBase): def send_async_req(self, command: str, arguments={}) -> int: seq = self.dap_server.sequence self.dap_server.send_packet( @@ -45,14 +45,13 @@ def test_pending_request(self): """ program = self.getBuildArtifact("a.out") self.build_and_launch(program, stopOnEntry=True) - self.continue_to_next_stop() # Use a relatively short timeout since this is only to ensure the # following request is queued. blocking_seq = self.async_blocking_request(duration=1.0) # Use a longer timeout to ensure we catch if the request was interrupted # properly. 
- pending_seq = self.async_blocking_request(duration=self.timeoutval / 2) + pending_seq = self.async_blocking_request(duration=self.DEFAULT_TIMEOUT / 2) cancel_seq = self.async_cancel(requestId=pending_seq) blocking_resp = self.dap_server.recv_packet(filter_type=["response"]) @@ -78,12 +77,11 @@ def test_inflight_request(self): """ program = self.getBuildArtifact("a.out") self.build_and_launch(program, stopOnEntry=True) - self.continue_to_next_stop() - blocking_seq = self.async_blocking_request(duration=self.timeoutval / 2) + blocking_seq = self.async_blocking_request(duration=self.DEFAULT_TIMEOUT / 2) # Wait for the sleep to start to cancel the inflight request. self.collect_console( - timeout_secs=self.timeoutval, + timeout_secs=self.DEFAULT_TIMEOUT, pattern="starting sleep", ) cancel_seq = self.async_cancel(requestId=blocking_seq) diff --git a/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py b/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py index 223258fbdd3dc..ea6b2ea7f28ab 100644 --- a/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py +++ b/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py @@ -75,11 +75,12 @@ def test_command_directive_abort_on_error_attach_commands(self): ) command_abort_on_error = "settings set foo bar" program = self.build_and_create_debug_adapter_for_attach() - self.attach( - program, + resp = self.attach( + program=program, attachCommands=["?!" + command_quiet, "!" 
+ command_abort_on_error], expectFailure=True, ) + self.assertFalse(resp["success"], "expected 'attach' failure") full_output = self.collect_console( timeout_secs=1.0, pattern=command_abort_on_error, diff --git a/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py b/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py index a94288c7a669e..75876c248f86c 100644 --- a/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py +++ b/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py @@ -43,12 +43,12 @@ def verify_completions(self, actual_list, expected_list, not_expected_list=[]): for not_expected_item in not_expected_list: self.assertNotIn(not_expected_item, actual_list) - def setup_debugee(self, stopOnEntry=False): + def setup_debuggee(self): program = self.getBuildArtifact("a.out") source = "main.cpp" self.build_and_launch( program, - stopOnEntry=stopOnEntry, + stopOnEntry=True, sourceBreakpoints=[ ( source, @@ -64,7 +64,7 @@ def test_command_completions(self): """ Tests completion requests for lldb commands, within "repl-mode=command" """ - self.setup_debugee() + self.setup_debuggee() self.continue_to_next_stop() res = self.dap_server.request_evaluate( @@ -143,7 +143,7 @@ def test_variable_completions(self): """ Tests completion requests in "repl-mode=variable" """ - self.setup_debugee() + self.setup_debuggee() self.continue_to_next_stop() res = self.dap_server.request_evaluate( @@ -241,7 +241,7 @@ def test_auto_completions(self): """ Tests completion requests in "repl-mode=auto" """ - self.setup_debugee(stopOnEntry=True) + self.setup_debuggee() res = self.dap_server.request_evaluate( "`lldb-dap repl-mode auto", context="repl" diff --git a/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py b/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py index 9cdb978368cc1..1f810afdbb667 100644 --- a/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py +++ b/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py @@ -176,9 
+176,12 @@ def test_diagnositcs(self): f"target create --core {core}", context="repl" ) - output = self.get_important(timeout=2.0) + diagnostics = self.collect_important( + timeout_secs=self.DEFAULT_TIMEOUT, pattern="minidump file" + ) + self.assertIn( "warning: unable to retrieve process ID from minidump file", - output, + diagnostics, "diagnostic found in important output", ) diff --git a/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py b/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py index 1896acea15a99..e678c5ee77fdc 100644 --- a/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py +++ b/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py @@ -2,7 +2,6 @@ Test lldb-dap coreFile attaching """ - import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -19,7 +18,7 @@ def test_core_file(self): core_file = os.path.join(current_dir, "linux-x86_64.core") self.create_debug_adapter() - self.attach(exe_file, coreFile=core_file) + self.attach(program=exe_file, coreFile=core_file) expected_frames = [ { @@ -51,7 +50,8 @@ def test_core_file(self): self.assertEqual(self.get_stackFrames(), expected_frames) # Resuming should have no effect and keep the process stopped - self.continue_to_next_stop() + resp = self.dap_server.request_continue() + self.assertFalse(resp["success"]) self.assertEqual(self.get_stackFrames(), expected_frames) self.dap_server.request_next(threadId=32259) @@ -67,7 +67,7 @@ def test_core_file_source_mapping_array(self): self.create_debug_adapter() source_map = [["/home/labath/test", current_dir]] - self.attach(exe_file, coreFile=core_file, sourceMap=source_map) + self.attach(program=exe_file, coreFile=core_file, sourceMap=source_map) self.assertIn(current_dir, self.get_stackFrames()[0]["source"]["path"]) @@ -81,6 +81,6 @@ def test_core_file_source_mapping_object(self): self.create_debug_adapter() source_map = {"/home/labath/test": current_dir} - self.attach(exe_file, 
coreFile=core_file, sourceMap=source_map) + self.attach(program=exe_file, coreFile=core_file, sourceMap=source_map) self.assertIn(current_dir, self.get_stackFrames()[0]["source"]["path"]) diff --git a/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py b/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py index ec7387dabb0c2..f044bcae41892 100644 --- a/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py +++ b/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py @@ -16,6 +16,7 @@ def test_stopped_description(self): """ program = self.getBuildArtifact("a.out") self.build_and_launch(program) + self.do_continue() self.assertTrue(self.verify_stop_exception_info("signal SIGABRT")) exceptionInfo = self.get_exceptionInfo() diff --git a/lldb/test/API/tools/lldb-dap/io/TestDAP_io.py b/lldb/test/API/tools/lldb-dap/io/TestDAP_io.py index f05f876e57b49..b72b98de412b4 100644 --- a/lldb/test/API/tools/lldb-dap/io/TestDAP_io.py +++ b/lldb/test/API/tools/lldb-dap/io/TestDAP_io.py @@ -22,13 +22,9 @@ def cleanup(): process.terminate() process.wait() stdout_data = process.stdout.read().decode() - stderr_data = process.stderr.read().decode() print("========= STDOUT =========", file=sys.stderr) print(stdout_data, file=sys.stderr) print("========= END =========", file=sys.stderr) - print("========= STDERR =========", file=sys.stderr) - print(stderr_data, file=sys.stderr) - print("========= END =========", file=sys.stderr) print("========= DEBUG ADAPTER PROTOCOL LOGS =========", file=sys.stderr) with open(log_file_path, "r") as file: print(file.read(), file=sys.stderr) diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py index 7c85f05c1ba45..0063954791fd5 100644 --- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py +++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py @@ -75,9 +75,7 @@ def test_termination(self): self.dap_server.request_disconnect() # Wait until the underlying 
lldb-dap process dies. - self.dap_server.process.wait( - timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval - ) + self.dap_server.process.wait(timeout=self.DEFAULT_TIMEOUT) # Check the return code self.assertEqual(self.dap_server.process.poll(), 0) @@ -90,15 +88,16 @@ def test_stopOnEntry(self): program = self.getBuildArtifact("a.out") self.build_and_launch(program, stopOnEntry=True) - stopped_events = self.dap_server.wait_for_stopped() - for stopped_event in stopped_events: - if "body" in stopped_event: - body = stopped_event["body"] - if "reason" in body: - reason = body["reason"] - self.assertNotEqual( - reason, "breakpoint", 'verify stop isn\'t "main" breakpoint' - ) + self.assertTrue( + len(self.dap_server.thread_stop_reasons) > 0, + "expected stopped event during launch", + ) + for _, body in self.dap_server.thread_stop_reasons.items(): + if "reason" in body: + reason = body["reason"] + self.assertNotEqual( + reason, "breakpoint", 'verify stop isn\'t "main" breakpoint' + ) @skipIfWindows def test_cwd(self): @@ -393,14 +392,14 @@ def test_commands(self): # Get output from the console. This should contain both the # "stopCommands" that were run after the first breakpoint was hit self.continue_to_breakpoints(breakpoint_ids) - output = self.get_console(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval) + output = self.get_console(timeout=self.DEFAULT_TIMEOUT) self.verify_commands("stopCommands", output, stopCommands) # Continue again and hit the second breakpoint. # Get output from the console. 
This should contain both the # "stopCommands" that were run after the second breakpoint was hit self.continue_to_breakpoints(breakpoint_ids) - output = self.get_console(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval) + output = self.get_console(timeout=self.DEFAULT_TIMEOUT) self.verify_commands("stopCommands", output, stopCommands) # Continue until the program exits @@ -462,21 +461,21 @@ def test_extra_launch_commands(self): self.verify_commands("launchCommands", output, launchCommands) # Verify the "stopCommands" here self.continue_to_next_stop() - output = self.get_console(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval) + output = self.get_console(timeout=self.DEFAULT_TIMEOUT) self.verify_commands("stopCommands", output, stopCommands) # Continue and hit the second breakpoint. # Get output from the console. This should contain both the # "stopCommands" that were run after the first breakpoint was hit self.continue_to_next_stop() - output = self.get_console(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval) + output = self.get_console(timeout=self.DEFAULT_TIMEOUT) self.verify_commands("stopCommands", output, stopCommands) # Continue until the program exits self.continue_to_exit() # Get output from the console. 
This should contain both the # "exitCommands" that were run after the second breakpoint was hit - output = self.get_console(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval) + output = self.get_console(timeout=self.DEFAULT_TIMEOUT) self.verify_commands("exitCommands", output, exitCommands) def test_failing_launch_commands(self): @@ -531,7 +530,7 @@ def test_terminate_commands(self): terminateCommands = ["expr 4+2"] self.launch( - program=program, + program, stopOnEntry=True, terminateCommands=terminateCommands, disconnectAutomatically=False, diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py index 3fc0f752ee39e..b333efd7bfb1f 100644 --- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py +++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py @@ -14,7 +14,7 @@ class TestDAP_module(lldbdap_testcase.DAPTestCaseBase): def run_test(self, symbol_basename, expect_debug_info_size): program_basename = "a.out.stripped" program = self.getBuildArtifact(program_basename) - self.build_and_launch(program) + self.build_and_launch(program, stopOnEntry=True) functions = ["foo"] breakpoint_ids = self.set_function_breakpoints(functions) self.assertEqual(len(breakpoint_ids), len(functions), "expect one breakpoint") @@ -108,7 +108,7 @@ def test_modules_dsym(self): @skipIfWindows def test_compile_units(self): program = self.getBuildArtifact("a.out") - self.build_and_launch(program) + self.build_and_launch(program, stopOnEntry=True) source = "main.cpp" main_source_path = self.getSourcePath(source) breakpoint1_line = line_number(source, "// breakpoint 1") diff --git a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py index 49131ad9ecb17..0425b55a5e552 100644 --- a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py +++ b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py @@ -37,14 +37,14 @@ def test_output(self): # Disconnecting from the 
server to ensure any pending IO is flushed. self.dap_server.request_disconnect() - output += self.get_stdout(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval) + output += self.get_stdout(timeout=self.DEFAULT_TIMEOUT) self.assertTrue(output and len(output) > 0, "expect program stdout") self.assertIn( "abcdefghi\r\nhello world\r\nfinally\0\0", output, "full stdout not found in: " + repr(output), ) - console = self.get_console(timeout=self.timeoutval) + console = self.get_console(timeout=self.DEFAULT_TIMEOUT) self.assertTrue(console and len(console) > 0, "expect dap messages") self.assertIn( "out\0\0\r\nerr\0\0\r\n", console, f"full console message not found" diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py index 5f95c7bfb1556..8681b31e8eb1b 100644 --- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py +++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py @@ -22,9 +22,8 @@ def test_basic_functionality(self): [bp_A, bp_B] = self.set_source_breakpoints("main.c", [line_A, line_B]) # Verify we hit A, then B. - self.verify_breakpoint_hit([bp_A]) - self.dap_server.request_continue() - self.verify_breakpoint_hit([bp_B]) + self.continue_to_breakpoints([bp_A]) + self.continue_to_breakpoints([bp_B]) # Make sure i has been modified from its initial value of 0. self.assertEqual( @@ -34,8 +33,9 @@ def test_basic_functionality(self): ) # Restart then check we stop back at A and program state has been reset. 
- self.dap_server.request_restart() - self.verify_breakpoint_hit([bp_A]) + resp = self.dap_server.request_restart() + self.assertTrue(resp["success"]) + self.continue_to_breakpoints([bp_A]) self.assertEqual( int(self.dap_server.get_local_variable_value("i")), 0, @@ -50,27 +50,26 @@ def test_stopOnEntry(self): program = self.getBuildArtifact("a.out") self.build_and_launch(program, stopOnEntry=True) [bp_main] = self.set_function_breakpoints(["main"]) - self.dap_server.request_configurationDone() - # Once the "configuration done" event is sent, we should get a stopped # event immediately because of stopOnEntry. - stopped_events = self.dap_server.wait_for_stopped() - for stopped_event in stopped_events: - if "body" in stopped_event: - body = stopped_event["body"] - if "reason" in body: - reason = body["reason"] - self.assertNotEqual( - reason, "breakpoint", 'verify stop isn\'t "main" breakpoint' - ) + self.assertTrue( + len(self.dap_server.thread_stop_reasons) > 0, + "expected stopped event during launch", + ) + for _, body in self.dap_server.thread_stop_reasons.items(): + if "reason" in body: + reason = body["reason"] + self.assertNotEqual( + reason, "breakpoint", 'verify stop isn\'t "main" breakpoint' + ) # Then, if we continue, we should hit the breakpoint at main. - self.dap_server.request_continue() - self.verify_breakpoint_hit([bp_main]) + self.continue_to_breakpoints([bp_main]) # Restart and check that we still get a stopped event before reaching # main. - self.dap_server.request_restart() + resp = self.dap_server.request_restart() + self.assertTrue(resp["success"]) stopped_events = self.dap_server.wait_for_stopped() for stopped_event in stopped_events: if "body" in stopped_event: @@ -96,8 +95,7 @@ def test_arguments(self): [bp_A] = self.set_source_breakpoints("main.c", [line_A]) # Verify we hit A, then B. 
- self.dap_server.request_configurationDone() - self.verify_breakpoint_hit([bp_A]) + self.continue_to_breakpoints([bp_A]) # We don't set any arguments in the initial launch request, so argc # should be 1. @@ -109,7 +107,7 @@ def test_arguments(self): # Restart with some extra 'args' and check that the new argc reflects # the updated launch config. - self.dap_server.request_restart( + resp = self.dap_server.request_restart( restartArguments={ "arguments": { "program": program, @@ -117,6 +115,7 @@ def test_arguments(self): } } ) + self.assertTrue(resp["success"]) self.verify_breakpoint_hit([bp_A]) self.assertEqual( int(self.dap_server.get_local_variable_value("argc")), diff --git a/lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py b/lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py index 7e28a5af4331c..33e038408fa34 100644 --- a/lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py +++ b/lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py @@ -2,7 +2,6 @@ Test stop hooks """ - from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * import lldbdap_testcase @@ -17,10 +16,6 @@ def test_stop_hooks_before_run(self): program = self.getBuildArtifact("a.out") preRunCommands = ["target stop-hook add -o help"] self.build_and_launch(program, stopOnEntry=True, preRunCommands=preRunCommands) - - # The first stop is on entry. 
- self.dap_server.wait_for_stopped() - breakpoint_ids = self.set_function_breakpoints(["main"]) # This request hangs if the race happens, because, in that case, the # command interpreter is in synchronous mode while lldb-dap expects diff --git a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py index 296e4911f4052..340be0b39010d 100644 --- a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py +++ b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py @@ -116,7 +116,7 @@ def darwin_dwarf_missing_obj(self, initCommands): self.create_debug_adapter() self.assertTrue(os.path.exists(program), "executable must exist") - self.launch(program=program, initCommands=initCommands) + self.launch(program, initCommands=initCommands) functions = ["main"] breakpoint_ids = self.set_function_breakpoints(functions) diff --git a/lldb/test/Shell/ObjectFile/XCOFF/basic-info32.yaml b/lldb/test/Shell/ObjectFile/XCOFF/basic-info32.yaml new file mode 100644 index 0000000000000..dd1569fe52994 --- /dev/null +++ b/lldb/test/Shell/ObjectFile/XCOFF/basic-info32.yaml @@ -0,0 +1,110 @@ +# RUN: yaml2obj %s -o %t +# RUN: lldb-test object-file %t | FileCheck %s + +# CHECK: Plugin name: xcoff +# CHECK: Architecture: powerpc64-ibm-aix +# CHECK: Executable: true +# CHECK: Stripped: false +# CHECK: Type: executable +# CHECK: Strata: unknown +# CHECK: Name: .text +# CHECK-NEXT: Type: code +# CHECK-NEXT: Permissions: r-x +# CHECK: Name: .data +# CHECK-NEXT: Type: data +# CHECK-NEXT: Permissions: rw- +# CHECK: Name: .bss +# CHECK-NEXT: Type: zero-fill +# CHECK-NEXT: Permissions: rw- +# CHECK: Name: .loader +# CHECK-NEXT: Type: regular +# CHECK-NEXT: Permissions: r-- +# CHECK: Name: .dwline +# CHECK-NEXT: Type: dwarf-line +# CHECK-NEXT: Permissions: r-- +# CHECK: Name: .dwinfo +# CHECK-NEXT: Type: dwarf-info +# CHECK-NEXT: Permissions: r-- +# CHECK: Name: .dwabrev +# CHECK-NEXT: Type: dwarf-abbrev +# CHECK-NEXT: Permissions: 
r-- + +--- !XCOFF +FileHeader: + MagicNumber: 0x1DF + NumberOfSections: 7 + CreationTime: 000000000 + Flags: 0x1002 +Sections: + - Name: .text + Address: 0x10000268 + Size: 0x512 + FileOffsetToData: 0x268 + FileOffsetToRelocations: 0xECC + FileOffsetToLineNumbers: 0x0 + NumberOfRelocations: 0x24 + NumberOfLineNumbers: 0x0 + Flags: [ STYP_TEXT ] + SectionData: 80C20000 + - Name: .data + Address: 0x2000077A + Size: 0x242 + FileOffsetToData: 0x77A + FileOffsetToRelocations: 0x1034 + FileOffsetToLineNumbers: 0x0 + NumberOfRelocations: 0x25 + NumberOfLineNumbers: 0x0 + Flags: [ STYP_DATA ] + SectionData: '' + - Name: .bss + Address: 0x200009BC + Size: 0x10 + FileOffsetToData: 0x0 + FileOffsetToRelocations: 0x0 + FileOffsetToLineNumbers: 0x0 + NumberOfRelocations: 0x0 + NumberOfLineNumbers: 0x0 + Flags: [ STYP_BSS ] + SectionData: '' + - Name: .loader + Address: 0x0 + Size: 0x3A4 + FileOffsetToData: 0x9BC + FileOffsetToRelocations: 0x0 + FileOffsetToLineNumbers: 0x0 + NumberOfRelocations: 0x0 + NumberOfLineNumbers: 0x0 + Flags: [ STYP_LOADER ] + SectionData: 00000001 + - Name: .dwline + Address: 0x0 + Size: 0x73 + FileOffsetToData: 0xD60 + FileOffsetToRelocations: 0x11A6 + FileOffsetToLineNumbers: 0x0 + NumberOfRelocations: 0x5 + NumberOfLineNumbers: 0x0 + Flags: [ STYP_DWARF ] + SectionData: FFFFFFFF + - Name: .dwinfo + Address: 0x0 + Size: 0xB4 + FileOffsetToData: 0xDD4 + FileOffsetToRelocations: 0x11D8 + FileOffsetToLineNumbers: 0x0 + NumberOfRelocations: 0x6 + NumberOfLineNumbers: 0x0 + Flags: [ STYP_DWARF ] + SectionData: FFFFFFFF + - Name: .dwabrev + Address: 0x0 + Size: 0x43 + FileOffsetToData: 0xE88 + FileOffsetToRelocations: 0x0 + FileOffsetToLineNumbers: 0x0 + NumberOfRelocations: 0x0 + NumberOfLineNumbers: 0x0 + Flags: [ STYP_DWARF ] + SectionData: 01110125 +StringTable: {} +... 
diff --git a/lldb/test/Shell/Unwind/Inputs/basic-block-sections-with-dwarf.s b/lldb/test/Shell/Unwind/Inputs/basic-block-sections-with-dwarf.s index ede04c88a030f..a7b5431a7afaf 100644 --- a/lldb/test/Shell/Unwind/Inputs/basic-block-sections-with-dwarf.s +++ b/lldb/test/Shell/Unwind/Inputs/basic-block-sections-with-dwarf.s @@ -19,19 +19,16 @@ baz: .Lbaz_end: .size baz, .Lbaz_end-baz - .type foo,@function -foo: +foo.__part.3: .cfi_startproc - pushq %rbx - .cfi_def_cfa_offset 16 + .cfi_def_cfa_offset 32 .cfi_offset %rbx, -16 - movl %edi, %ebx - cmpl $0, %ebx - je foo.__part.2 - jmp foo.__part.1 + addq $24, %rsp + .cfi_def_cfa %rsp, 8 + retq +.Lfoo.__part.3_end: + .size foo.__part.3, .Lfoo.__part.3_end-foo.__part.3 .cfi_endproc -.Lfoo_end: - .size foo, .Lfoo_end-foo # NB: Deliberately inserting padding to separate the two parts of the function # as we're currently only parsing a single FDE entry from a (coalesced) address @@ -40,11 +37,13 @@ foo: foo.__part.1: .cfi_startproc - .cfi_def_cfa_offset 16 + .cfi_def_cfa_offset 32 .cfi_offset %rbx, -16 subq $16, %rsp - .cfi_def_cfa_offset 32 + .cfi_def_cfa_offset 48 callq bar + addq $16, %rsp + .cfi_def_cfa_offset 32 jmp foo.__part.3 .Lfoo.__part.1_end: .size foo.__part.1, .Lfoo.__part.1_end-foo.__part.1 @@ -52,46 +51,50 @@ foo.__part.1: bar: .cfi_startproc - subq $24, %rsp - .cfi_def_cfa_offset 32 + subq $88, %rsp + .cfi_def_cfa_offset 96 xorl %edi, %edi callq foo - addq $24, %rsp + addq $88, %rsp .cfi_def_cfa %rsp, 8 retq .cfi_endproc .Lbar_end: .size bar, .Lbar_end-bar -foo.__part.2: + .type foo,@function +foo: .cfi_startproc + pushq %rbx .cfi_def_cfa_offset 16 .cfi_offset %rbx, -16 + movl %edi, %ebx + cmpl $0, %ebx + je foo.__part.2 subq $16, %rsp .cfi_def_cfa_offset 32 - callq baz - jmp foo.__part.3 -.Lfoo.__part.2_end: - .size foo.__part.2, .Lfoo.__part.2_end-foo.__part.2 + jmp foo.__part.1 .cfi_endproc +.Lfoo_end: + .size foo, .Lfoo_end-foo # NB: Deliberately inserting padding to separate the two parts of the function 
# as we're currently only parsing a single FDE entry from a (coalesced) address # range. nop -foo.__part.3: +foo.__part.2: .cfi_startproc - .cfi_def_cfa_offset 32 + .cfi_def_cfa_offset 16 .cfi_offset %rbx, -16 - addq $24, %rsp - .cfi_def_cfa %rsp, 8 - retq -.Lfoo.__part.3_end: - .size foo.__part.3, .Lfoo.__part.3_end-foo.__part.3 + subq $16, %rsp + .cfi_def_cfa_offset 32 + callq baz + jmp foo.__part.3 +.Lfoo.__part.2_end: + .size foo.__part.2, .Lfoo.__part.2_end-foo.__part.2 .cfi_endproc - .globl main .type main,@function main: diff --git a/lldb/test/Shell/Unwind/basic-block-sections-with-dwarf-static.test b/lldb/test/Shell/Unwind/basic-block-sections-with-dwarf-static.test index a4ed73e14de01..b83e388e79d21 100644 --- a/lldb/test/Shell/Unwind/basic-block-sections-with-dwarf-static.test +++ b/lldb/test/Shell/Unwind/basic-block-sections-with-dwarf-static.test @@ -22,15 +22,17 @@ image show-unwind --cached true -n foo # CHECK-NEXT: This UnwindPlan is sourced from the compiler: yes. # CHECK-NEXT: This UnwindPlan is valid at all instruction locations: no. # CHECK-NEXT: This UnwindPlan is for a trap handler function: no. 
-# CHECK-NEXT: Address range of this UnwindPlan: [{{.*}}.text + 6-0x0000000000000010)[{{.*}}.text + 17-0x000000000000001c)[{{.*}}.text + 44-0x0000000000000037)[{{.*}}.text + 56-0x000000000000003d) -# CHECK-NEXT: row[0]: 0: CFA=rsp +8 => rip=[CFA-8] -# CHECK-NEXT: row[1]: 1: CFA=rsp+16 => rbx=[CFA-16] rip=[CFA-8] -# CHECK-NEXT: row[2]: 11: CFA=rsp+16 => rbx=[CFA-16] rip=[CFA-8] -# CHECK-NEXT: row[3]: 15: CFA=rsp+32 => rbx=[CFA-16] rip=[CFA-8] -# CHECK-NEXT: row[4]: 38: CFA=rsp+16 => rbx=[CFA-16] rip=[CFA-8] -# CHECK-NEXT: row[5]: 42: CFA=rsp+32 => rbx=[CFA-16] rip=[CFA-8] -# CHECK-NEXT: row[6]: 50: CFA=rsp+32 => rbx=[CFA-16] rip=[CFA-8] -# CHECK-NEXT: row[7]: 54: CFA=rsp +8 => rbx=[CFA-16] rip=[CFA-8] +# CHECK-NEXT: Address range of this UnwindPlan: [{{.*}}.text + 6-0x000000000000000b)[{{.*}}.text + 12-0x000000000000001b)[{{.*}}.text + 43-0x0000000000000039)[{{.*}}.text + 58-0x0000000000000045) +# CHECK-NEXT: row[0]: -37: CFA=rsp+32 => rbx=[CFA-16] rip=[CFA-8] +# CHECK-NEXT: row[1]: -33: CFA=rsp +8 => rbx=[CFA-16] rip=[CFA-8] +# CHECK-NEXT: row[2]: -31: CFA=rsp+32 => rbx=[CFA-16] rip=[CFA-8] +# CHECK-NEXT: row[3]: -27: CFA=rsp+48 => rbx=[CFA-16] rip=[CFA-8] +# CHECK-NEXT: row[4]: -18: CFA=rsp+32 => rbx=[CFA-16] rip=[CFA-8] +# CHECK-NEXT: row[5]: 0: CFA=rsp +8 => rip=[CFA-8] +# CHECK-NEXT: row[6]: 1: CFA=rsp+16 => rbx=[CFA-16] rip=[CFA-8] +# CHECK-NEXT: row[7]: 12: CFA=rsp+32 => rbx=[CFA-16] rip=[CFA-8] +# CHECK-NEXT: row[8]: 15: CFA=rsp+16 => rbx=[CFA-16] rip=[CFA-8] +# CHECK-NEXT: row[9]: 19: CFA=rsp+32 => rbx=[CFA-16] rip=[CFA-8] # CHECK-EMPTY: image show-unwind --cached true -n bar @@ -41,8 +43,8 @@ image show-unwind --cached true -n bar # CHECK-NEXT: This UnwindPlan is sourced from the compiler: yes. # CHECK-NEXT: This UnwindPlan is valid at all instruction locations: no. # CHECK-NEXT: This UnwindPlan is for a trap handler function: no. 
-# CHECK-NEXT: Address range of this UnwindPlan: [{{.*}}.text + 28-0x000000000000002c) +# CHECK-NEXT: Address range of this UnwindPlan: [{{.*}}.text + 27-0x000000000000002b) # CHECK-NEXT: row[0]: 0: CFA=rsp +8 => rip=[CFA-8] -# CHECK-NEXT: row[1]: 4: CFA=rsp+32 => rip=[CFA-8] +# CHECK-NEXT: row[1]: 4: CFA=rsp+96 => rip=[CFA-8] # CHECK-NEXT: row[2]: 15: CFA=rsp +8 => rip=[CFA-8] # CHECK-EMPTY: diff --git a/lldb/tools/debugserver/source/CMakeLists.txt b/lldb/tools/debugserver/source/CMakeLists.txt index f7ff76c3e8e84..8340b5ad8948d 100644 --- a/lldb/tools/debugserver/source/CMakeLists.txt +++ b/lldb/tools/debugserver/source/CMakeLists.txt @@ -154,6 +154,21 @@ endif() add_definitions(-DLLDB_USE_OS_LOG) +# Make sure we have the macOS SDK root as mig needs it and will silently +# fail to generate its output files without it. +if(CMAKE_OSX_SYSROOT) + set(MIG_SYSROOT ${CMAKE_OSX_SYSROOT}) +else() + execute_process(COMMAND xcrun --show-sdk-path + OUTPUT_VARIABLE MIG_SYSROOT + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + +if(NOT MIG_SYSROOT) + message(FATAL_ERROR "Unable to obtain sysroot required by mig (Mach Interface Generator). 
Set CMAKE_OSX_SYSROOT to explicitly specify a sysroot.") +endif() + if(${CMAKE_OSX_SYSROOT} MATCHES ".Internal.sdk$") message(STATUS "LLDB debugserver energy support is enabled") add_definitions(-DLLDB_ENERGY) @@ -177,7 +192,7 @@ endif() separate_arguments(MIG_ARCH_FLAGS_SEPARTED NATIVE_COMMAND "${MIG_ARCH_FLAGS}") add_custom_command(OUTPUT ${generated_mach_interfaces} - VERBATIM COMMAND mig ${MIG_ARCH_FLAGS_SEPARTED} -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CURRENT_SOURCE_DIR}/MacOSX/dbgnub-mig.defs + VERBATIM COMMAND mig ${MIG_ARCH_FLAGS_SEPARTED} -isysroot ${MIG_SYSROOT} ${CMAKE_CURRENT_SOURCE_DIR}/MacOSX/dbgnub-mig.defs DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/MacOSX/dbgnub-mig.defs ) diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 4feca1253be20..56a0c38b00037 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -559,17 +559,6 @@ lldb::SBFrame DAP::GetLLDBFrame(const llvm::json::Object &arguments) { return GetLLDBFrame(frame_id); } -llvm::json::Value DAP::CreateTopLevelScopes() { - llvm::json::Array scopes; - scopes.emplace_back( - CreateScope("Locals", VARREF_LOCALS, variables.locals.GetSize(), false)); - scopes.emplace_back(CreateScope("Globals", VARREF_GLOBALS, - variables.globals.GetSize(), false)); - scopes.emplace_back(CreateScope("Registers", VARREF_REGS, - variables.registers.GetSize(), false)); - return llvm::json::Value(std::move(scopes)); -} - ReplMode DAP::DetectReplMode(lldb::SBFrame frame, std::string &expression, bool partial_expression) { // Check for the escape hatch prefix. 
@@ -1194,8 +1183,7 @@ bool SendEventRequestHandler::DoExecute(lldb::SBDebugger debugger, "exited", "initialize", "loadedSource", "module", "process", "stopped", "terminated", "thread"}; - if (std::find(internal_events.begin(), internal_events.end(), name) != - std::end(internal_events)) { + if (llvm::is_contained(internal_events, name)) { std::string msg = llvm::formatv("Invalid use of lldb-dap send-event, event \"{0}\" " "should be handled by lldb-dap internally.", diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index c2e4c2dea582e..c1a1130b1e59f 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -226,7 +226,8 @@ struct DAP { /// \param[in] default_repl_mode /// Default repl mode behavior, as configured by the binary. /// \param[in] pre_init_commands - /// LLDB commands to execute as soon as the debugger instance is allocaed. + /// LLDB commands to execute as soon as the debugger instance is + /// allocated. /// \param[in] transport /// Transport for this debug session. DAP(Log *log, const ReplMode default_repl_mode, @@ -283,10 +284,10 @@ struct DAP { lldb::SBThread GetLLDBThread(const llvm::json::Object &arguments); lldb::SBFrame GetLLDBFrame(uint64_t frame_id); + /// TODO: remove this function when we finish migrating to the + /// new protocol types. 
lldb::SBFrame GetLLDBFrame(const llvm::json::Object &arguments); - llvm::json::Value CreateTopLevelScopes(); - void PopulateExceptionBreakpoints(); /// Attempt to determine if an expression is a variable expression or diff --git a/lldb/tools/lldb-dap/DAPError.cpp b/lldb/tools/lldb-dap/DAPError.cpp index dcb955af0345f..60347d577f821 100644 --- a/lldb/tools/lldb-dap/DAPError.cpp +++ b/lldb/tools/lldb-dap/DAPError.cpp @@ -8,6 +8,7 @@ #include "DAPError.h" #include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" #include namespace lldb_dap { @@ -26,4 +27,12 @@ std::error_code DAPError::convertToErrorCode() const { return llvm::inconvertibleErrorCode(); } +char NotStoppedError::ID; + +void NotStoppedError::log(llvm::raw_ostream &OS) const { OS << "not stopped"; } + +std::error_code NotStoppedError::convertToErrorCode() const { + return llvm::inconvertibleErrorCode(); +} + } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/DAPError.h b/lldb/tools/lldb-dap/DAPError.h index 564651b1f587d..4c94bdd6ac3d6 100644 --- a/lldb/tools/lldb-dap/DAPError.h +++ b/lldb/tools/lldb-dap/DAPError.h @@ -13,7 +13,7 @@ namespace lldb_dap { -/// An Error that is reported as a DAP Error Message, which may be presented to +/// An error that is reported as a DAP Error Message, which may be presented to /// the user. class DAPError : public llvm::ErrorInfo { public: @@ -40,4 +40,13 @@ class DAPError : public llvm::ErrorInfo { std::optional m_url_label; }; +/// An error that indicates the current request handler cannot execute because +/// the process is not stopped. 
+class NotStoppedError : public llvm::ErrorInfo { +public: + static char ID; + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override; +}; + } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Handler/ContinueRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/ContinueRequestHandler.cpp index ca4c9141eca38..361c86421cf1b 100644 --- a/lldb/tools/lldb-dap/Handler/ContinueRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/ContinueRequestHandler.cpp @@ -31,6 +31,9 @@ ContinueRequestHandler::Run(const ContinueArguments &args) const { SBProcess process = dap.target.GetProcess(); SBError error; + if (!SBDebugger::StateIsStoppedState(process.GetState())) + return make_error(); + if (args.singleThread) dap.GetLLDBThread(args.threadId).Resume(error); else @@ -40,7 +43,7 @@ ContinueRequestHandler::Run(const ContinueArguments &args) const { return ToError(error); ContinueResponseBody body; - body.allThreadsContinued = args.singleThread; + body.allThreadsContinued = !args.singleThread; return body; } diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h index b0002440cf72e..383f9e24a729a 100644 --- a/lldb/tools/lldb-dap/Handler/RequestHandler.h +++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h @@ -106,11 +106,13 @@ class RequestHandler : public BaseRequestHandler { DAP_LOG(dap.log, "({0}) malformed request {1}, expected arguments but got none", dap.transport.GetClientName(), request.command); - response.success = false; - response.message = llvm::formatv("arguments required for command '{0}' " - "but none received", - request.command) - .str(); + HandleErrorResponse( + llvm::make_error( + llvm::formatv("arguments required for command '{0}' " + "but none received", + request.command) + .str()), + response); dap.Send(response); return; } @@ -123,26 +125,21 @@ class RequestHandler : public BaseRequestHandler { OS << "invalid arguments for request '" << request.command << "': " << 
llvm::toString(root.getError()) << "\n"; root.printErrorContext(*request.arguments, OS); - - response.success = false; - response.body = ToResponse(llvm::make_error(parse_failure)); - + HandleErrorResponse(llvm::make_error(parse_failure), response); dap.Send(response); return; } if constexpr (std::is_same_v) { if (llvm::Error err = Run(arguments)) { - response.success = false; - response.body = ToResponse(std::move(err)); + HandleErrorResponse(std::move(err), response); } else { response.success = true; } } else { Resp body = Run(arguments); if (llvm::Error err = body.takeError()) { - response.success = false; - response.body = ToResponse(std::move(err)); + HandleErrorResponse(std::move(err), response); } else { response.success = true; response.body = std::move(*body); @@ -172,26 +169,36 @@ class RequestHandler : public BaseRequestHandler { /// error. virtual void PostRun() const {}; - protocol::ErrorResponseBody ToResponse(llvm::Error err) const { - protocol::ErrorMessage error_message; - // Default to showing the user errors unless otherwise specified by a - // DAPError. 
- error_message.showUser = true; - error_message.sendTelemetry = false; - if (llvm::Error unhandled = llvm::handleErrors( - std::move(err), [&](const DAPError &E) -> llvm::Error { - error_message.format = E.getMessage(); - error_message.showUser = E.getShowUser(); - error_message.id = E.convertToErrorCode().value(); - error_message.url = E.getURL(); - error_message.urlLabel = E.getURLLabel(); - return llvm::Error::success(); - })) { - error_message.format = llvm::toString(std::move(unhandled)); - } - protocol::ErrorResponseBody body; - body.error = error_message; - return body; + void HandleErrorResponse(llvm::Error err, + protocol::Response &response) const { + response.success = false; + llvm::handleAllErrors( + std::move(err), + [&](const NotStoppedError &err) { + response.message = lldb_dap::protocol::eResponseMessageNotStopped; + }, + [&](const DAPError &err) { + protocol::ErrorMessage error_message; + error_message.sendTelemetry = false; + error_message.format = err.getMessage(); + error_message.showUser = err.getShowUser(); + error_message.id = err.convertToErrorCode().value(); + error_message.url = err.getURL(); + error_message.urlLabel = err.getURLLabel(); + protocol::ErrorResponseBody body; + body.error = error_message; + response.body = body; + }, + [&](const llvm::ErrorInfoBase &err) { + protocol::ErrorMessage error_message; + error_message.showUser = true; + error_message.sendTelemetry = false; + error_message.format = err.message(); + error_message.id = err.convertToErrorCode().value(); + protocol::ErrorResponseBody body; + body.error = error_message; + response.body = body; + }); } }; @@ -452,11 +459,15 @@ class PauseRequestHandler : public LegacyRequestHandler { void operator()(const llvm::json::Object &request) const override; }; -class ScopesRequestHandler : public LegacyRequestHandler { +class ScopesRequestHandler final + : public RequestHandler> { public: - using LegacyRequestHandler::LegacyRequestHandler; + using RequestHandler::RequestHandler; 
static llvm::StringLiteral GetCommand() { return "scopes"; } - void operator()(const llvm::json::Object &request) const override; + + llvm::Expected + Run(const protocol::ScopesArguments &args) const override; }; class SetVariableRequestHandler final diff --git a/lldb/tools/lldb-dap/Handler/ScopesRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/ScopesRequestHandler.cpp index 7d1608f59f9a4..aaad0e20f9c21 100644 --- a/lldb/tools/lldb-dap/Handler/ScopesRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/ScopesRequestHandler.cpp @@ -7,69 +7,56 @@ //===----------------------------------------------------------------------===// #include "DAP.h" -#include "EventHelper.h" -#include "JSONUtils.h" #include "RequestHandler.h" +using namespace lldb_dap::protocol; namespace lldb_dap { -// "ScopesRequest": { -// "allOf": [ { "$ref": "#/definitions/Request" }, { -// "type": "object", -// "description": "Scopes request; value of command field is 'scopes'. The -// request returns the variable scopes for a given stackframe ID.", -// "properties": { -// "command": { -// "type": "string", -// "enum": [ "scopes" ] -// }, -// "arguments": { -// "$ref": "#/definitions/ScopesArguments" -// } -// }, -// "required": [ "command", "arguments" ] -// }] -// }, -// "ScopesArguments": { -// "type": "object", -// "description": "Arguments for 'scopes' request.", -// "properties": { -// "frameId": { -// "type": "integer", -// "description": "Retrieve the scopes for this stackframe." -// } -// }, -// "required": [ "frameId" ] -// }, -// "ScopesResponse": { -// "allOf": [ { "$ref": "#/definitions/Response" }, { -// "type": "object", -// "description": "Response to 'scopes' request.", -// "properties": { -// "body": { -// "type": "object", -// "properties": { -// "scopes": { -// "type": "array", -// "items": { -// "$ref": "#/definitions/Scope" -// }, -// "description": "The scopes of the stackframe. If the array has -// length zero, there are no scopes available." 
-// } -// }, -// "required": [ "scopes" ] -// } -// }, -// "required": [ "body" ] -// }] -// } -void ScopesRequestHandler::operator()(const llvm::json::Object &request) const { - llvm::json::Object response; - FillResponse(request, response); - llvm::json::Object body; - const auto *arguments = request.getObject("arguments"); - lldb::SBFrame frame = dap.GetLLDBFrame(*arguments); +/// Creates a `protocol::Scope` struct. +/// +/// +/// \param[in] name +/// The value to place into the "name" key +/// +/// \param[in] variablesReference +/// The value to place into the "variablesReference" key +/// +/// \param[in] namedVariables +/// The value to place into the "namedVariables" key +/// +/// \param[in] expensive +/// The value to place into the "expensive" key +/// +/// \return +/// A `protocol::Scope` +static Scope CreateScope(const llvm::StringRef name, int64_t variablesReference, + int64_t namedVariables, bool expensive) { + Scope scope; + scope.name = name; + + // TODO: Support "arguments" and "return value" scope. + // At the moment lldb-dap includes the arguments and return_value into the + // "locals" scope. + // vscode only expands the first non-expensive scope, this causes friction + // if we add the arguments above the local scope as the locals scope will not + // be expanded if we enter a function with arguments. It becomes more + // annoying when the scope has arguments, return_value and locals. 
+ if (variablesReference == VARREF_LOCALS) + scope.presentationHint = Scope::eScopePresentationHintLocals; + else if (variablesReference == VARREF_REGS) + scope.presentationHint = Scope::eScopePresentationHintRegisters; + + scope.variablesReference = variablesReference; + scope.namedVariables = namedVariables; + scope.expensive = expensive; + + return scope; +} + +llvm::Expected +ScopesRequestHandler::Run(const ScopesArguments &args) const { + lldb::SBFrame frame = dap.GetLLDBFrame(args.frameId); + // As the user selects different stack frames in the GUI, a "scopes" request // will be sent to the DAP. This is the only way we know that the user has // selected a frame in a thread. There are no other notifications that are @@ -78,9 +65,9 @@ void ScopesRequestHandler::operator()(const llvm::json::Object &request) const { // are sent, this allows users to type commands in the debugger console // with a backtick character to run lldb commands and these lldb commands // will now have the right context selected as they are run. If the user - // types "`bt" into the debugger console and we had another thread selected + // types "`bt" into the debugger console, and we had another thread selected // in the LLDB library, we would show the wrong thing to the user. If the - // users switches threads with a lldb command like "`thread select 14", the + // users switch threads with a lldb command like "`thread select 14", the // GUI will not update as there are no "event" notification packets that // allow us to change the currently selected thread or frame in the GUI that // I am aware of. 
@@ -88,7 +75,6 @@ void ScopesRequestHandler::operator()(const llvm::json::Object &request) const { frame.GetThread().GetProcess().SetSelectedThread(frame.GetThread()); frame.GetThread().SetSelectedFrame(frame.GetFrameID()); } - dap.variables.locals = frame.GetVariables(/*arguments=*/true, /*locals=*/true, /*statics=*/false, @@ -98,9 +84,15 @@ void ScopesRequestHandler::operator()(const llvm::json::Object &request) const { /*statics=*/true, /*in_scope_only=*/true); dap.variables.registers = frame.GetRegisters(); - body.try_emplace("scopes", dap.CreateTopLevelScopes()); - response.try_emplace("body", std::move(body)); - dap.SendJSON(llvm::json::Value(std::move(response))); + + std::vector scopes = {CreateScope("Locals", VARREF_LOCALS, + dap.variables.locals.GetSize(), false), + CreateScope("Globals", VARREF_GLOBALS, + dap.variables.globals.GetSize(), false), + CreateScope("Registers", VARREF_REGS, + dap.variables.registers.GetSize(), false)}; + + return ScopesResponseBody{std::move(scopes)}; } } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 279e6d3d93814..a8bd672583a5d 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -416,9 +416,11 @@ llvm::json::Value CreateModule(lldb::SBTarget &target, lldb::SBModule &module, } else { object.try_emplace("symbolStatus", "Symbols not found."); } - std::string loaded_addr = std::to_string( - module.GetObjectFileHeaderAddress().GetLoadAddress(target)); - object.try_emplace("addressRange", loaded_addr); + std::string load_address = + llvm::formatv("{0:x}", + module.GetObjectFileHeaderAddress().GetLoadAddress(target)) + .str(); + object.try_emplace("addressRange", load_address); std::string version_str; uint32_t version_nums[3]; uint32_t num_versions = diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h index 9c4dd0584bd21..783f291338d8c 100644 --- a/lldb/tools/lldb-dap/JSONUtils.h +++ 
b/lldb/tools/lldb-dap/JSONUtils.h @@ -238,27 +238,6 @@ llvm::json::Object CreateEventObject(const llvm::StringRef event_name); protocol::ExceptionBreakpointsFilter CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp); -/// Create a "Scope" JSON object as described in the debug adapter definition. -/// -/// \param[in] name -/// The value to place into the "name" key -// -/// \param[in] variablesReference -/// The value to place into the "variablesReference" key -// -/// \param[in] namedVariables -/// The value to place into the "namedVariables" key -// -/// \param[in] expensive -/// The value to place into the "expensive" key -/// -/// \return -/// A "Scope" JSON object with that follows the formal JSON -/// definition outlined by Microsoft. -llvm::json::Value CreateScope(const llvm::StringRef name, - int64_t variablesReference, - int64_t namedVariables, bool expensive); - /// Create a "Source" JSON object as described in the debug adapter definition. /// /// \param[in] file diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp index 316e146d43a0f..7efab87d39986 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp @@ -335,6 +335,20 @@ llvm::json::Value toJSON(const SetVariableResponseBody &SVR) { return llvm::json::Value(std::move(Body)); } +bool fromJSON(const llvm::json::Value &Params, ScopesArguments &SCA, + llvm::json::Path P) { + json::ObjectMapper O(Params, P); + return O && O.map("frameId", SCA.frameId); +} + +llvm::json::Value toJSON(const ScopesResponseBody &SCR) { + llvm::json::Array scopes; + for (const Scope &scope : SCR.scopes) { + scopes.emplace_back(toJSON(scope)); + } + + return llvm::json::Object{{"scopes", std::move(scopes)}}; +} bool fromJSON(const json::Value &Params, SourceArguments &SA, json::Path P) { json::ObjectMapper O(Params, P); diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h 
b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index c6456b4113320..4e08b4728453b 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -439,6 +439,19 @@ struct SetVariableResponseBody { }; llvm::json::Value toJSON(const SetVariableResponseBody &); +struct ScopesArguments { + /// Retrieve the scopes for the stack frame identified by `frameId`. The + /// `frameId` must have been obtained in the current suspended state. See + /// 'Lifetime of Object References' in the Overview section for details. + uint64_t frameId = LLDB_INVALID_FRAME_ID; +}; +bool fromJSON(const llvm::json::Value &, ScopesArguments &, llvm::json::Path); + +struct ScopesResponseBody { + std::vector scopes; +}; +llvm::json::Value toJSON(const ScopesResponseBody &); + /// Arguments for `source` request. struct SourceArguments { /// Specifies the source content to load. Either `source.path` or @@ -693,7 +706,7 @@ llvm::json::Value toJSON(const DataBreakpointInfoResponseBody &); struct SetDataBreakpointsArguments { /// The contents of this array replaces all existing data breakpoints. An /// empty array clears all data breakpoints. 
- std::vector breakpoints; + std::vector breakpoints; }; bool fromJSON(const llvm::json::Value &, SetDataBreakpointsArguments &, llvm::json::Path); diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp index 7c2f4b20f4956..ce7519e3b16b8 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp @@ -16,17 +16,18 @@ using namespace llvm; namespace lldb_dap::protocol { -bool fromJSON(const json::Value &Params, PresentationHint &PH, json::Path P) { +bool fromJSON(const json::Value &Params, Source::PresentationHint &PH, + json::Path P) { auto rawHint = Params.getAsString(); if (!rawHint) { P.report("expected a string"); return false; } - std::optional hint = - StringSwitch>(*rawHint) - .Case("normal", ePresentationHintNormal) - .Case("emphasize", ePresentationHintEmphasize) - .Case("deemphasize", ePresentationHintDeemphasize) + std::optional hint = + StringSwitch>(*rawHint) + .Case("normal", Source::eSourcePresentationHintNormal) + .Case("emphasize", Source::eSourcePresentationHintEmphasize) + .Case("deemphasize", Source::eSourcePresentationHintDeemphasize) .Default(std::nullopt); if (!hint) { P.report("unexpected value"); @@ -43,13 +44,13 @@ bool fromJSON(const json::Value &Params, Source &S, json::Path P) { O.map("sourceReference", S.sourceReference); } -llvm::json::Value toJSON(PresentationHint hint) { +llvm::json::Value toJSON(Source::PresentationHint hint) { switch (hint) { - case ePresentationHintNormal: + case Source::eSourcePresentationHintNormal: return "normal"; - case ePresentationHintEmphasize: + case Source::eSourcePresentationHintEmphasize: return "emphasize"; - case ePresentationHintDeemphasize: + case Source::eSourcePresentationHintDeemphasize: return "deemphasize"; } llvm_unreachable("unhandled presentation hint."); @@ -105,7 +106,7 @@ bool fromJSON(const json::Value &Params, ColumnType &CT, json::Path P) { .Case("string", eColumnTypeString) 
.Case("number", eColumnTypeNumber) .Case("boolean", eColumnTypeBoolean) - .Case("unixTimestampUTC ", eColumnTypeTimestamp) + .Case("unixTimestampUTC", eColumnTypeTimestamp) .Default(std::nullopt); if (!columnType) { P.report("unexpected value, expected 'string', 'number', 'boolean', or " @@ -165,6 +166,32 @@ json::Value toJSON(const ChecksumAlgorithm &CA) { llvm_unreachable("unhandled checksum algorithm."); } +bool fromJSON(const llvm::json::Value &Params, ChecksumAlgorithm &CA, + llvm::json::Path P) { + auto rawAlgorithm = Params.getAsString(); + if (!rawAlgorithm) { + P.report("expected a string"); + return false; + } + + std::optional algorithm = + llvm::StringSwitch>(*rawAlgorithm) + .Case("MD5", eChecksumAlgorithmMD5) + .Case("SHA1", eChecksumAlgorithmSHA1) + .Case("SHA256", eChecksumAlgorithmSHA256) + .Case("timestamp", eChecksumAlgorithmTimestamp) + .Default(std::nullopt); + + if (!algorithm) { + P.report( + "unexpected value, expected 'MD5', 'SHA1', 'SHA256', or 'timestamp'"); + return false; + } + + CA = *algorithm; + return true; +} + json::Value toJSON(const BreakpointModeApplicability &BMA) { switch (BMA) { case eBreakpointModeApplicabilitySource: @@ -304,6 +331,84 @@ static llvm::StringLiteral ToString(AdapterFeature feature) { llvm_unreachable("unhandled adapter feature."); } +llvm::json::Value toJSON(const AdapterFeature &feature) { + return ToString(feature); +} + +bool fromJSON(const llvm::json::Value &Params, AdapterFeature &feature, + llvm::json::Path P) { + auto rawFeature = Params.getAsString(); + if (!rawFeature) { + P.report("expected a string"); + return false; + } + + std::optional parsedFeature = + llvm::StringSwitch>(*rawFeature) + .Case("supportsANSIStyling", eAdapterFeatureANSIStyling) + .Case("supportsBreakpointLocationsRequest", + eAdapterFeatureBreakpointLocationsRequest) + .Case("supportsCancelRequest", eAdapterFeatureCancelRequest) + .Case("supportsClipboardContext", eAdapterFeatureClipboardContext) + 
.Case("supportsCompletionsRequest", eAdapterFeatureCompletionsRequest) + .Case("supportsConditionalBreakpoints", + eAdapterFeatureConditionalBreakpoints) + .Case("supportsConfigurationDoneRequest", + eAdapterFeatureConfigurationDoneRequest) + .Case("supportsDataBreakpointBytes", + eAdapterFeatureDataBreakpointBytes) + .Case("supportsDataBreakpoints", eAdapterFeatureDataBreakpoints) + .Case("supportsDelayedStackTraceLoading", + eAdapterFeatureDelayedStackTraceLoading) + .Case("supportsDisassembleRequest", eAdapterFeatureDisassembleRequest) + .Case("supportsEvaluateForHovers", eAdapterFeatureEvaluateForHovers) + .Case("supportsExceptionFilterOptions", + eAdapterFeatureExceptionFilterOptions) + .Case("supportsExceptionInfoRequest", + eAdapterFeatureExceptionInfoRequest) + .Case("supportsExceptionOptions", eAdapterFeatureExceptionOptions) + .Case("supportsFunctionBreakpoints", + eAdapterFeatureFunctionBreakpoints) + .Case("supportsGotoTargetsRequest", eAdapterFeatureGotoTargetsRequest) + .Case("supportsHitConditionalBreakpoints", + eAdapterFeatureHitConditionalBreakpoints) + .Case("supportsInstructionBreakpoints", + eAdapterFeatureInstructionBreakpoints) + .Case("supportsLoadedSourcesRequest", + eAdapterFeatureLoadedSourcesRequest) + .Case("supportsLogPoints", eAdapterFeatureLogPoints) + .Case("supportsModulesRequest", eAdapterFeatureModulesRequest) + .Case("supportsReadMemoryRequest", eAdapterFeatureReadMemoryRequest) + .Case("supportsRestartFrame", eAdapterFeatureRestartFrame) + .Case("supportsRestartRequest", eAdapterFeatureRestartRequest) + .Case("supportsSetExpression", eAdapterFeatureSetExpression) + .Case("supportsSetVariable", eAdapterFeatureSetVariable) + .Case("supportsSingleThreadExecutionRequests", + eAdapterFeatureSingleThreadExecutionRequests) + .Case("supportsStepBack", eAdapterFeatureStepBack) + .Case("supportsStepInTargetsRequest", + eAdapterFeatureStepInTargetsRequest) + .Case("supportsSteppingGranularity", + eAdapterFeatureSteppingGranularity) + 
.Case("supportsTerminateRequest", eAdapterFeatureTerminateRequest) + .Case("supportsTerminateThreadsRequest", + eAdapterFeatureTerminateThreadsRequest) + .Case("supportSuspendDebuggee", eAdapterFeatureSuspendDebuggee) + .Case("supportsValueFormattingOptions", + eAdapterFeatureValueFormattingOptions) + .Case("supportsWriteMemoryRequest", eAdapterFeatureWriteMemoryRequest) + .Case("supportTerminateDebuggee", eAdapterFeatureTerminateDebuggee) + .Default(std::nullopt); + + if (!parsedFeature) { + P.report("unexpected value for AdapterFeature"); + return false; + } + + feature = *parsedFeature; + return true; +} + json::Value toJSON(const Capabilities &C) { json::Object result; @@ -331,6 +436,116 @@ json::Value toJSON(const Capabilities &C) { return result; } +bool fromJSON(const json::Value &Params, Scope::PresentationHint &PH, + json::Path P) { + auto rawHint = Params.getAsString(); + if (!rawHint) { + P.report("expected a string"); + return false; + } + const std::optional hint = + StringSwitch>(*rawHint) + .Case("arguments", Scope::eScopePresentationHintArguments) + .Case("locals", Scope::eScopePresentationHintLocals) + .Case("registers", Scope::eScopePresentationHintRegisters) + .Case("returnValue", Scope::eScopePresentationHintReturnValue) + .Default(std::nullopt); + if (!hint) { + P.report("unexpected value"); + return false; + } + PH = *hint; + return true; +} + +bool fromJSON(const json::Value &Params, Scope &S, json::Path P) { + json::ObjectMapper O(Params, P); + return O && O.map("name", S.name) && + O.mapOptional("presentationHint", S.presentationHint) && + O.map("variablesReference", S.variablesReference) && + O.mapOptional("namedVariables", S.namedVariables) && + O.map("indexedVariables", S.indexedVariables) && + O.mapOptional("source", S.source) && O.map("expensive", S.expensive) && + O.mapOptional("line", S.line) && O.mapOptional("column", S.column) && + O.mapOptional("endLine", S.endLine) && + O.mapOptional("endColumn", S.endColumn); +} + 
+llvm::json::Value toJSON(const Scope &SC) { + llvm::json::Object result{{"name", SC.name}, + {"variablesReference", SC.variablesReference}, + {"expensive", SC.expensive}}; + + if (SC.presentationHint.has_value()) { + llvm::StringRef presentationHint; + switch (*SC.presentationHint) { + case Scope::eScopePresentationHintArguments: + presentationHint = "arguments"; + break; + case Scope::eScopePresentationHintLocals: + presentationHint = "locals"; + break; + case Scope::eScopePresentationHintRegisters: + presentationHint = "registers"; + break; + case Scope::eScopePresentationHintReturnValue: + presentationHint = "returnValue"; + break; + } + + result.insert({"presentationHint", presentationHint}); + } + + if (SC.namedVariables.has_value()) + result.insert({"namedVariables", SC.namedVariables}); + + if (SC.indexedVariables.has_value()) + result.insert({"indexedVariables", SC.indexedVariables}); + + if (SC.source.has_value()) + result.insert({"source", SC.source}); + + if (SC.line.has_value()) + result.insert({"line", SC.line}); + + if (SC.column.has_value()) + result.insert({"column", SC.column}); + + if (SC.endLine.has_value()) + result.insert({"endLine", SC.endLine}); + + if (SC.endColumn.has_value()) + result.insert({"endColumn", SC.endColumn}); + + return result; +} + +bool fromJSON(const llvm::json::Value &Params, Capabilities &C, + llvm::json::Path P) { + auto *Object = Params.getAsObject(); + if (!Object) { + P.report("expected an object"); + return false; + } + // Check for the presence of supported features. 
+ for (unsigned i = eAdapterFeatureFirst; i <= eAdapterFeatureLast; ++i) { + AdapterFeature feature = static_cast(i); + if (Object->getBoolean(ToString(feature))) + C.supportedFeatures.insert(feature); + } + llvm::json::ObjectMapper O(Params, P); + return O && + O.mapOptional("exceptionBreakpointFilters", + C.exceptionBreakpointFilters) && + O.mapOptional("completionTriggerCharacters", + C.completionTriggerCharacters) && + O.mapOptional("additionalModuleColumns", C.additionalModuleColumns) && + O.mapOptional("supportedChecksumAlgorithms", + C.supportedChecksumAlgorithms) && + O.mapOptional("breakpointModes", C.breakpointModes) && + O.mapOptional("$__lldb_version", C.lldbExtVersion); +} + bool fromJSON(const llvm::json::Value &Params, SteppingGranularity &SG, llvm::json::Path P) { auto raw_granularity = Params.getAsString(); @@ -352,6 +567,18 @@ bool fromJSON(const llvm::json::Value &Params, SteppingGranularity &SG, return true; } +llvm::json::Value toJSON(const SteppingGranularity &SG) { + switch (SG) { + case eSteppingGranularityStatement: + return "statement"; + case eSteppingGranularityLine: + return "line"; + case eSteppingGranularityInstruction: + return "instruction"; + } + llvm_unreachable("unhandled stepping granularity."); +} + bool fromJSON(const llvm::json::Value &Params, ValueFormat &VF, llvm::json::Path P) { json::ObjectMapper O(Params, P); @@ -446,18 +673,48 @@ bool fromJSON(const llvm::json::Value &Params, Breakpoint &BP, bool fromJSON(const llvm::json::Value &Params, SourceBreakpoint &SB, llvm::json::Path P) { - json::ObjectMapper O(Params, P); - return O && O.map("line", SB.line) && O.map("column", SB.column) && - O.map("condition", SB.condition) && - O.map("hitCondition", SB.hitCondition) && - O.map("logMessage", SB.logMessage) && O.map("mode", SB.mode); + llvm::json::ObjectMapper O(Params, P); + return O && O.map("line", SB.line) && O.mapOptional("column", SB.column) && + O.mapOptional("condition", SB.condition) && + O.mapOptional("hitCondition", 
SB.hitCondition) && + O.mapOptional("logMessage", SB.logMessage) && + O.mapOptional("mode", SB.mode); +} + +llvm::json::Value toJSON(const SourceBreakpoint &SB) { + llvm::json::Object result{{"line", SB.line}}; + + if (SB.column) + result.insert({"column", *SB.column}); + if (SB.condition) + result.insert({"condition", *SB.condition}); + if (SB.hitCondition) + result.insert({"hitCondition", *SB.hitCondition}); + if (SB.logMessage) + result.insert({"logMessage", *SB.logMessage}); + if (SB.mode) + result.insert({"mode", *SB.mode}); + + return result; } bool fromJSON(const llvm::json::Value &Params, FunctionBreakpoint &FB, llvm::json::Path P) { - json::ObjectMapper O(Params, P); - return O && O.map("name", FB.name) && O.map("condition", FB.condition) && - O.map("hitCondition", FB.hitCondition); + llvm::json::ObjectMapper O(Params, P); + return O && O.map("name", FB.name) && + O.mapOptional("condition", FB.condition) && + O.mapOptional("hitCondition", FB.hitCondition); +} + +llvm::json::Value toJSON(const FunctionBreakpoint &FB) { + llvm::json::Object result{{"name", FB.name}}; + + if (FB.condition) + result.insert({"condition", *FB.condition}); + if (FB.hitCondition) + result.insert({"hitCondition", *FB.hitCondition}); + + return result; } bool fromJSON(const llvm::json::Value &Params, DataBreakpointAccessType &DBAT, @@ -493,21 +750,36 @@ llvm::json::Value toJSON(const DataBreakpointAccessType &DBAT) { llvm_unreachable("unhandled data breakpoint access type."); } -bool fromJSON(const llvm::json::Value &Params, DataBreakpointInfo &DBI, +bool fromJSON(const llvm::json::Value &Params, DataBreakpoint &DBI, llvm::json::Path P) { - json::ObjectMapper O(Params, P); + llvm::json::ObjectMapper O(Params, P); return O && O.map("dataId", DBI.dataId) && - O.map("accessType", DBI.accessType) && - O.map("condition", DBI.condition) && - O.map("hitCondition", DBI.hitCondition); + O.mapOptional("accessType", DBI.accessType) && + O.mapOptional("condition", DBI.condition) && + 
O.mapOptional("hitCondition", DBI.hitCondition); +} + +llvm::json::Value toJSON(const DataBreakpoint &DBI) { + llvm::json::Object result{{"dataId", DBI.dataId}}; + + if (DBI.accessType) + result.insert({"accessType", *DBI.accessType}); + if (DBI.condition) + result.insert({"condition", *DBI.condition}); + if (DBI.hitCondition) + result.insert({"hitCondition", *DBI.hitCondition}); + + return result; } bool fromJSON(const llvm::json::Value &Params, InstructionBreakpoint &IB, llvm::json::Path P) { - json::ObjectMapper O(Params, P); + llvm::json::ObjectMapper O(Params, P); return O && O.map("instructionReference", IB.instructionReference) && - O.map("offset", IB.offset) && O.map("condition", IB.condition) && - O.map("hitCondition", IB.hitCondition) && O.map("mode", IB.mode); + O.mapOptional("offset", IB.offset) && + O.mapOptional("condition", IB.condition) && + O.mapOptional("hitCondition", IB.hitCondition) && + O.mapOptional("mode", IB.mode); } } // namespace lldb_dap::protocol diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h index cab188637acd5..3df77ee7374a7 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h @@ -27,6 +27,8 @@ #include #include +#define LLDB_DAP_INVALID_VARRERF UINT64_MAX + namespace lldb_dap::protocol { /// An `ExceptionBreakpointsFilter` is shown in the UI as an filter option for @@ -102,6 +104,7 @@ enum ChecksumAlgorithm : unsigned { eChecksumAlgorithmSHA256, eChecksumAlgorithmTimestamp }; +bool fromJSON(const llvm::json::Value &, ChecksumAlgorithm &, llvm::json::Path); llvm::json::Value toJSON(const ChecksumAlgorithm &); /// Describes one or more type of breakpoint a BreakpointMode applies to. This @@ -237,7 +240,11 @@ enum AdapterFeature : unsigned { /// The debug adapter supports the `terminateDebuggee` attribute on the /// `disconnect` request. 
eAdapterFeatureTerminateDebuggee, + eAdapterFeatureFirst = eAdapterFeatureANSIStyling, + eAdapterFeatureLast = eAdapterFeatureTerminateDebuggee, }; +bool fromJSON(const llvm::json::Value &, AdapterFeature &, llvm::json::Path); +llvm::json::Value toJSON(const AdapterFeature &); /// Information about the capabilities of a debug adapter. struct Capabilities { @@ -275,19 +282,19 @@ struct Capabilities { /// @} }; +bool fromJSON(const llvm::json::Value &, Capabilities &, llvm::json::Path); llvm::json::Value toJSON(const Capabilities &); -enum PresentationHint : unsigned { - ePresentationHintNormal, - ePresentationHintEmphasize, - ePresentationHintDeemphasize, -}; -llvm::json::Value toJSON(PresentationHint hint); - /// A `Source` is a descriptor for source code. It is returned from the debug /// adapter as part of a `StackFrame` and it is used by clients when specifying /// breakpoints. struct Source { + enum PresentationHint : unsigned { + eSourcePresentationHintNormal, + eSourcePresentationHintEmphasize, + eSourcePresentationHintDeemphasize, + }; + /// The short name of the source. Every source returned from the debug adapter /// has a name. When sending a source to the debug adapter this name is /// optional. @@ -311,9 +318,82 @@ struct Source { // unsupported keys: origin, sources, adapterData, checksums }; +bool fromJSON(const llvm::json::Value &, Source::PresentationHint &, + llvm::json::Path); +llvm::json::Value toJSON(Source::PresentationHint); bool fromJSON(const llvm::json::Value &, Source &, llvm::json::Path); llvm::json::Value toJSON(const Source &); +/// A `Scope` is a named container for variables. Optionally a scope can map to +/// a source or a range within a source. +struct Scope { + enum PresentationHint : unsigned { + eScopePresentationHintArguments, + eScopePresentationHintLocals, + eScopePresentationHintRegisters, + eScopePresentationHintReturnValue + }; + /// Name of the scope such as 'Arguments', 'Locals', or 'Registers'. 
This + /// string is shown in the UI as is and can be translated. + /// + std::string name; + + /// A hint for how to present this scope in the UI. If this attribute is + /// missing, the scope is shown with a generic UI. + /// Values: + /// 'arguments': Scope contains method arguments. + /// 'locals': Scope contains local variables. + /// 'registers': Scope contains registers. Only a single `registers` scope + /// should be returned from a `scopes` request. + /// 'returnValue': Scope contains one or more return values. + /// etc. + std::optional presentationHint; + + /// The variables of this scope can be retrieved by passing the value of + /// `variablesReference` to the `variables` request as long as execution + /// remains suspended. See 'Lifetime of Object References' in the Overview + /// section for details. + /// + uint64_t variablesReference = LLDB_DAP_INVALID_VARRERF; + + /// The number of named variables in this scope. + /// The client can use this information to present the variables in a paged UI + /// and fetch them in chunks. + std::optional namedVariables; + + /// The number of indexed variables in this scope. + /// The client can use this information to present the variables in a paged UI + /// and fetch them in chunks. + std::optional indexedVariables; + + /// The source for this scope. + std::optional source; + + /// If true, the number of variables in this scope is large or expensive to + /// retrieve. + bool expensive = false; + + /// The start line of the range covered by this scope. + std::optional line; + + /// Start position of the range covered by the scope. It is measured in UTF-16 + /// code units and the client capability `columnsStartAt1` determines whether + /// it is 0- or 1-based. + std::optional column; + + /// The end line of the range covered by this scope. + std::optional endLine; + + /// End position of the range covered by the scope. 
It is measured in UTF-16 + /// code units and the client capability `columnsStartAt1` determines whether + /// it is 0- or 1-based. + std::optional endColumn; +}; +bool fromJSON(const llvm::json::Value &Params, Scope::PresentationHint &PH, + llvm::json::Path); +bool fromJSON(const llvm::json::Value &, Scope &, llvm::json::Path); +llvm::json::Value toJSON(const Scope &); + /// The granularity of one `step` in the stepping requests `next`, `stepIn`, /// `stepOut` and `stepBack`. enum SteppingGranularity : unsigned { @@ -332,6 +412,7 @@ enum SteppingGranularity : unsigned { }; bool fromJSON(const llvm::json::Value &, SteppingGranularity &, llvm::json::Path); +llvm::json::Value toJSON(const SteppingGranularity &); /// Provides formatting information for a value. struct ValueFormat { @@ -464,6 +545,7 @@ struct SourceBreakpoint { std::optional mode; }; bool fromJSON(const llvm::json::Value &, SourceBreakpoint &, llvm::json::Path); +llvm::json::Value toJSON(const SourceBreakpoint &); /// Properties of a breakpoint passed to the `setFunctionBreakpoints` request. struct FunctionBreakpoint { @@ -483,6 +565,7 @@ struct FunctionBreakpoint { }; bool fromJSON(const llvm::json::Value &, FunctionBreakpoint &, llvm::json::Path); +llvm::json::Value toJSON(const FunctionBreakpoint &); /// This enumeration defines all possible access types for data breakpoints. /// Values: ‘read’, ‘write’, ‘readWrite’ @@ -496,7 +579,7 @@ bool fromJSON(const llvm::json::Value &, DataBreakpointAccessType &, llvm::json::Value toJSON(const DataBreakpointAccessType &); /// Properties of a data breakpoint passed to the `setDataBreakpoints` request. -struct DataBreakpointInfo { +struct DataBreakpoint { /// An id representing the data. This id is returned from the /// `dataBreakpointInfo` request. std::string dataId; @@ -511,8 +594,8 @@ struct DataBreakpointInfo { /// The debug adapter is expected to interpret the expression as needed. 
std::optional hitCondition; }; -bool fromJSON(const llvm::json::Value &, DataBreakpointInfo &, - llvm::json::Path); +bool fromJSON(const llvm::json::Value &, DataBreakpoint &, llvm::json::Path); +llvm::json::Value toJSON(const DataBreakpoint &); /// Properties of a breakpoint passed to the `setInstructionBreakpoints` request struct InstructionBreakpoint { diff --git a/lldb/tools/lldb-dap/Watchpoint.cpp b/lldb/tools/lldb-dap/Watchpoint.cpp index 73ed4fdbae1b8..0acc980890be8 100644 --- a/lldb/tools/lldb-dap/Watchpoint.cpp +++ b/lldb/tools/lldb-dap/Watchpoint.cpp @@ -17,7 +17,7 @@ #include namespace lldb_dap { -Watchpoint::Watchpoint(DAP &d, const protocol::DataBreakpointInfo &breakpoint) +Watchpoint::Watchpoint(DAP &d, const protocol::DataBreakpoint &breakpoint) : BreakpointBase(d, breakpoint.condition, breakpoint.hitCondition) { llvm::StringRef dataId = breakpoint.dataId; auto [addr_str, size_str] = dataId.split('/'); diff --git a/lldb/tools/lldb-dap/Watchpoint.h b/lldb/tools/lldb-dap/Watchpoint.h index b7fe58fe73501..d943e1218bdcd 100644 --- a/lldb/tools/lldb-dap/Watchpoint.h +++ b/lldb/tools/lldb-dap/Watchpoint.h @@ -22,7 +22,7 @@ namespace lldb_dap { class Watchpoint : public BreakpointBase { public: - Watchpoint(DAP &d, const protocol::DataBreakpointInfo &breakpoint); + Watchpoint(DAP &d, const protocol::DataBreakpoint &breakpoint); Watchpoint(DAP &d, lldb::SBWatchpoint wp) : BreakpointBase(d), m_wp(wp) {} void SetCondition() override; diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json index e3e46526f379f..d5ca604798799 100644 --- a/lldb/tools/lldb-dap/package.json +++ b/lldb/tools/lldb-dap/package.json @@ -244,6 +244,26 @@ } } ], + "commands": [ + { + "command": "lldb-dap.modules.copyProperty", + "title": "Copy Value" + } + ], + "menus": { + "commandPalette": [ + { + "command": "lldb-dap.modules.copyProperty", + "when": "false" + } + ], + "view/item/context": [ + { + "command": "lldb-dap.modules.copyProperty", + "when": "view == 
lldb-dap.modules && viewItem == property" + } + ] + }, "breakpoints": [ { "language": "ada" diff --git a/lldb/tools/lldb-dap/src-ts/extension.ts b/lldb/tools/lldb-dap/src-ts/extension.ts index a5c0a09ae60cf..c8e5146e29cea 100644 --- a/lldb/tools/lldb-dap/src-ts/extension.ts +++ b/lldb/tools/lldb-dap/src-ts/extension.ts @@ -6,7 +6,10 @@ import { LaunchUriHandler } from "./uri-launch-handler"; import { LLDBDapConfigurationProvider } from "./debug-configuration-provider"; import { LLDBDapServer } from "./lldb-dap-server"; import { DebugSessionTracker } from "./debug-session-tracker"; -import { ModulesDataProvider } from "./ui/modules-data-provider"; +import { + ModulesDataProvider, + ModuleProperty, +} from "./ui/modules-data-provider"; /** * This class represents the extension and manages its life cycle. Other extensions @@ -40,6 +43,11 @@ export class LLDBDapExtension extends DisposableContext { ), vscode.window.registerUriHandler(new LaunchUriHandler()), ); + + vscode.commands.registerCommand( + "lldb-dap.modules.copyProperty", + (node: ModuleProperty) => vscode.env.clipboard.writeText(node.value), + ); } } diff --git a/lldb/tools/lldb-dap/src-ts/ui/modules-data-provider.ts b/lldb/tools/lldb-dap/src-ts/ui/modules-data-provider.ts index 478c162de8878..091c1d69ac647 100644 --- a/lldb/tools/lldb-dap/src-ts/ui/modules-data-provider.ts +++ b/lldb/tools/lldb-dap/src-ts/ui/modules-data-provider.ts @@ -2,60 +2,86 @@ import * as vscode from "vscode"; import { DebugProtocol } from "@vscode/debugprotocol"; import { DebugSessionTracker } from "../debug-session-tracker"; -/** A tree data provider for listing loaded modules for the active debug session. 
*/ -export class ModulesDataProvider - implements vscode.TreeDataProvider -{ - private changeTreeData = new vscode.EventEmitter(); - readonly onDidChangeTreeData = this.changeTreeData.event; +export interface ModuleProperty { + key: string; + value: string; +} - constructor(private readonly tracker: DebugSessionTracker) { - tracker.onDidChangeModules(() => this.changeTreeData.fire()); - vscode.debug.onDidChangeActiveDebugSession(() => - this.changeTreeData.fire(), - ); +/** Type to represent both Module and ModuleProperty since TreeDataProvider + * expects one concrete type */ +type TreeData = DebugProtocol.Module | ModuleProperty; + +function isModule(type: TreeData): type is DebugProtocol.Module { + return (type as DebugProtocol.Module).id !== undefined; +} + +class ModuleItem extends vscode.TreeItem { + constructor(module: DebugProtocol.Module) { + super(module.name, vscode.TreeItemCollapsibleState.Collapsed); + this.description = module.symbolStatus; } - getTreeItem(module: DebugProtocol.Module): vscode.TreeItem { - let treeItem = new vscode.TreeItem(/*label=*/ module.name); - if (module.path) { - treeItem.description = `${module.id} -- ${module.path}`; - } else { - treeItem.description = `${module.id}`; - } + static getProperties(module: DebugProtocol.Module): ModuleProperty[] { + // does not include the name and symbol status as it is shown in the parent.
+ let children: ModuleProperty[] = []; + children.push({ key: "id:", value: module.id.toString() }); - const tooltip = new vscode.MarkdownString(); - tooltip.appendMarkdown(`# ${module.name}\n\n`); - tooltip.appendMarkdown(`- **ID**: ${module.id}\n`); if (module.addressRange) { - tooltip.appendMarkdown( - `- **Load address**: 0x${Number(module.addressRange).toString(16)}\n`, - ); + children.push({ + key: "load address:", + value: module.addressRange, + }); } if (module.path) { - tooltip.appendMarkdown(`- **Path**: ${module.path}\n`); + children.push({ key: "path:", value: module.path }); } if (module.version) { - tooltip.appendMarkdown(`- **Version**: ${module.version}\n`); - } - if (module.symbolStatus) { - tooltip.appendMarkdown(`- **Symbol status**: ${module.symbolStatus}\n`); + children.push({ key: "version:", value: module.version }); } if (module.symbolFilePath) { - tooltip.appendMarkdown( - `- **Symbol file path**: ${module.symbolFilePath}\n`, - ); + children.push({ key: "symbol filepath:", value: module.symbolFilePath }); + } + return children; + } +} + +/** A tree data provider for listing loaded modules for the active debug session. 
*/ +export class ModulesDataProvider implements vscode.TreeDataProvider { + private changeTreeData = new vscode.EventEmitter(); + readonly onDidChangeTreeData = this.changeTreeData.event; + + constructor(private readonly tracker: DebugSessionTracker) { + tracker.onDidChangeModules(() => this.changeTreeData.fire()); + vscode.debug.onDidChangeActiveDebugSession(() => + this.changeTreeData.fire(), + ); + } + + getTreeItem(module: TreeData): vscode.TreeItem { + if (isModule(module)) { + return new ModuleItem(module); } - treeItem.tooltip = tooltip; - return treeItem; + let item = new vscode.TreeItem(module.key); + item.description = module.value; + item.tooltip = `${module.key} ${module.value}`; + item.contextValue = "property"; + return item; } - getChildren(): DebugProtocol.Module[] { + getChildren(element?: TreeData): TreeData[] { if (!vscode.debug.activeDebugSession) { return []; } - return this.tracker.debugSessionModules(vscode.debug.activeDebugSession); + if (!element) { + return this.tracker.debugSessionModules(vscode.debug.activeDebugSession); + } + + if (isModule(element)) { + return ModuleItem.getProperties(element); + } + + return []; } } diff --git a/lldb/unittests/DAP/CMakeLists.txt b/lldb/unittests/DAP/CMakeLists.txt index 8b240654046e2..af7d11e2e95e2 100644 --- a/lldb/unittests/DAP/CMakeLists.txt +++ b/lldb/unittests/DAP/CMakeLists.txt @@ -1,7 +1,11 @@ add_lldb_unittest(DAPTests + DAPTest.cpp + Handler/DisconnectTest.cpp JSONUtilsTest.cpp LLDBUtilsTest.cpp ProtocolTypesTest.cpp + TestBase.cpp + TransportTest.cpp LINK_LIBS lldbDAP diff --git a/lldb/unittests/DAP/DAPTest.cpp b/lldb/unittests/DAP/DAPTest.cpp new file mode 100644 index 0000000000000..5fb6bf7e564ab --- /dev/null +++ b/lldb/unittests/DAP/DAPTest.cpp @@ -0,0 +1,38 @@ +//===-- DAPTest.cpp -------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DAP.h" +#include "Protocol/ProtocolBase.h" +#include "TestBase.h" +#include "Transport.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" +#include +#include +#include + +using namespace llvm; +using namespace lldb; +using namespace lldb_dap; +using namespace lldb_dap_tests; +using namespace lldb_dap::protocol; + +class DAPTest : public TransportBase {}; + +TEST_F(DAPTest, SendProtocolMessages) { + DAP dap{ + /*log=*/nullptr, + /*default_repl_mode=*/ReplMode::Auto, + /*pre_init_commands=*/{}, + /*transport=*/*to_dap, + }; + dap.Send(Event{/*event=*/"my-event", /*body=*/std::nullopt}); + ASSERT_THAT_EXPECTED(from_dap->Read(std::chrono::milliseconds(1)), + HasValue(testing::VariantWith(testing::FieldsAre( + /*event=*/"my-event", /*body=*/std::nullopt)))); +} diff --git a/lldb/unittests/DAP/Handler/DisconnectTest.cpp b/lldb/unittests/DAP/Handler/DisconnectTest.cpp new file mode 100644 index 0000000000000..6f3470239e974 --- /dev/null +++ b/lldb/unittests/DAP/Handler/DisconnectTest.cpp @@ -0,0 +1,35 @@ +//===-- DisconnectTest.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DAP.h" +#include "Handler/RequestHandler.h" +#include "Protocol/ProtocolBase.h" +#include "TestBase.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" +#include +#include + +using namespace llvm; +using namespace lldb; +using namespace lldb_dap; +using namespace lldb_dap_tests; +using namespace lldb_dap::protocol; + +class DisconnectRequestHandlerTest : public DAPTestBase {}; + +TEST_F(DisconnectRequestHandlerTest, DisconnectingTriggersTerminated) { + DisconnectRequestHandler handler(*dap); + EXPECT_FALSE(dap->disconnecting); + ASSERT_THAT_ERROR(handler.Run(std::nullopt), Succeeded()); + EXPECT_TRUE(dap->disconnecting); + std::vector messages = DrainOutput(); + EXPECT_THAT(messages, + testing::Contains(testing::VariantWith(testing::FieldsAre( + /*event=*/"terminated", /*body=*/std::nullopt)))); +} diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp index 56b21f18fa7cd..5d5125dc78fba 100644 --- a/lldb/unittests/DAP/ProtocolTypesTest.cpp +++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp @@ -50,7 +50,7 @@ TEST(ProtocolTypesTest, Source) { source.name = "testName"; source.path = "/path/to/source"; source.sourceReference = 12345; - source.presentationHint = ePresentationHintEmphasize; + source.presentationHint = Source::eSourcePresentationHintEmphasize; llvm::Expected deserialized_source = roundtrip(source); ASSERT_THAT_EXPECTED(deserialized_source, llvm::Succeeded()); @@ -101,8 +101,8 @@ TEST(ProtocolTypesTest, Breakpoint) { breakpoint.id = 42; breakpoint.verified = true; breakpoint.message = "Breakpoint set successfully"; - breakpoint.source = - Source{"test.cpp", "/path/to/test.cpp", 123, ePresentationHintNormal}; + breakpoint.source = Source{"test.cpp", "/path/to/test.cpp", 123, + Source::eSourcePresentationHintNormal}; breakpoint.line = 10; 
breakpoint.column = 5; breakpoint.endLine = 15; @@ -132,3 +132,401 @@ TEST(ProtocolTypesTest, Breakpoint) { EXPECT_EQ(breakpoint.offset, deserialized_breakpoint->offset); EXPECT_EQ(breakpoint.reason, deserialized_breakpoint->reason); } + +TEST(ProtocolTypesTest, SourceBreakpoint) { + SourceBreakpoint source_breakpoint; + source_breakpoint.line = 42; + source_breakpoint.column = 5; + source_breakpoint.condition = "x > 10"; + source_breakpoint.hitCondition = "5"; + source_breakpoint.logMessage = "Breakpoint hit at line 42"; + source_breakpoint.mode = "hardware"; + + llvm::Expected deserialized_source_breakpoint = + roundtrip(source_breakpoint); + ASSERT_THAT_EXPECTED(deserialized_source_breakpoint, llvm::Succeeded()); + + EXPECT_EQ(source_breakpoint.line, deserialized_source_breakpoint->line); + EXPECT_EQ(source_breakpoint.column, deserialized_source_breakpoint->column); + EXPECT_EQ(source_breakpoint.condition, + deserialized_source_breakpoint->condition); + EXPECT_EQ(source_breakpoint.hitCondition, + deserialized_source_breakpoint->hitCondition); + EXPECT_EQ(source_breakpoint.logMessage, + deserialized_source_breakpoint->logMessage); + EXPECT_EQ(source_breakpoint.mode, deserialized_source_breakpoint->mode); +} + +TEST(ProtocolTypesTest, FunctionBreakpoint) { + FunctionBreakpoint function_breakpoint; + function_breakpoint.name = "myFunction"; + function_breakpoint.condition = "x == 0"; + function_breakpoint.hitCondition = "3"; + + llvm::Expected deserialized_function_breakpoint = + roundtrip(function_breakpoint); + ASSERT_THAT_EXPECTED(deserialized_function_breakpoint, llvm::Succeeded()); + + EXPECT_EQ(function_breakpoint.name, deserialized_function_breakpoint->name); + EXPECT_EQ(function_breakpoint.condition, + deserialized_function_breakpoint->condition); + EXPECT_EQ(function_breakpoint.hitCondition, + deserialized_function_breakpoint->hitCondition); +} + +TEST(ProtocolTypesTest, DataBreakpoint) { + DataBreakpoint data_breakpoint_info; + data_breakpoint_info.dataId 
= "variable1"; + data_breakpoint_info.accessType = eDataBreakpointAccessTypeReadWrite; + data_breakpoint_info.condition = "x > 100"; + data_breakpoint_info.hitCondition = "10"; + + llvm::Expected deserialized_data_breakpoint_info = + roundtrip(data_breakpoint_info); + ASSERT_THAT_EXPECTED(deserialized_data_breakpoint_info, llvm::Succeeded()); + + EXPECT_EQ(data_breakpoint_info.dataId, + deserialized_data_breakpoint_info->dataId); + EXPECT_EQ(data_breakpoint_info.accessType, + deserialized_data_breakpoint_info->accessType); + EXPECT_EQ(data_breakpoint_info.condition, + deserialized_data_breakpoint_info->condition); + EXPECT_EQ(data_breakpoint_info.hitCondition, + deserialized_data_breakpoint_info->hitCondition); +} + +TEST(ProtocolTypesTest, Capabilities) { + Capabilities capabilities; + + // Populate supported features. + capabilities.supportedFeatures.insert(eAdapterFeatureANSIStyling); + capabilities.supportedFeatures.insert( + eAdapterFeatureBreakpointLocationsRequest); + + // Populate optional fields. + capabilities.exceptionBreakpointFilters = { + {{"filter1", "Filter 1", "Description 1", true, true, "Condition 1"}, + {"filter2", "Filter 2", "Description 2", false, false, "Condition 2"}}}; + + capabilities.completionTriggerCharacters = {".", "->"}; + capabilities.additionalModuleColumns = { + {"moduleName", "Module Name", "uppercase", eColumnTypeString, 20}}; + capabilities.supportedChecksumAlgorithms = {eChecksumAlgorithmMD5, + eChecksumAlgorithmSHA256}; + capabilities.breakpointModes = {{"hardware", + "Hardware Breakpoint", + "Description", + {eBreakpointModeApplicabilitySource}}}; + capabilities.lldbExtVersion = "1.0.0"; + + // Perform roundtrip serialization and deserialization. + llvm::Expected deserialized_capabilities = + roundtrip(capabilities); + ASSERT_THAT_EXPECTED(deserialized_capabilities, llvm::Succeeded()); + + // Verify supported features. 
+ EXPECT_EQ(capabilities.supportedFeatures, + deserialized_capabilities->supportedFeatures); + + // Verify exception breakpoint filters. + ASSERT_TRUE( + deserialized_capabilities->exceptionBreakpointFilters.has_value()); + EXPECT_EQ(capabilities.exceptionBreakpointFilters->size(), + deserialized_capabilities->exceptionBreakpointFilters->size()); + for (size_t i = 0; i < capabilities.exceptionBreakpointFilters->size(); ++i) { + const auto &original = capabilities.exceptionBreakpointFilters->at(i); + const auto &deserialized = + deserialized_capabilities->exceptionBreakpointFilters->at(i); + EXPECT_EQ(original.filter, deserialized.filter); + EXPECT_EQ(original.label, deserialized.label); + EXPECT_EQ(original.description, deserialized.description); + EXPECT_EQ(original.defaultState, deserialized.defaultState); + EXPECT_EQ(original.supportsCondition, deserialized.supportsCondition); + EXPECT_EQ(original.conditionDescription, deserialized.conditionDescription); + } + + // Verify completion trigger characters. + ASSERT_TRUE( + deserialized_capabilities->completionTriggerCharacters.has_value()); + EXPECT_EQ(capabilities.completionTriggerCharacters, + deserialized_capabilities->completionTriggerCharacters); + + // Verify additional module columns. 
+ ASSERT_TRUE(deserialized_capabilities->additionalModuleColumns.has_value()); + EXPECT_EQ(capabilities.additionalModuleColumns->size(), + deserialized_capabilities->additionalModuleColumns->size()); + for (size_t i = 0; i < capabilities.additionalModuleColumns->size(); ++i) { + const auto &original = capabilities.additionalModuleColumns->at(i); + const auto &deserialized = + deserialized_capabilities->additionalModuleColumns->at(i); + EXPECT_EQ(original.attributeName, deserialized.attributeName); + EXPECT_EQ(original.label, deserialized.label); + EXPECT_EQ(original.format, deserialized.format); + EXPECT_EQ(original.type, deserialized.type); + EXPECT_EQ(original.width, deserialized.width); + } + + // Verify supported checksum algorithms. + ASSERT_TRUE( + deserialized_capabilities->supportedChecksumAlgorithms.has_value()); + EXPECT_EQ(capabilities.supportedChecksumAlgorithms, + deserialized_capabilities->supportedChecksumAlgorithms); + + // Verify breakpoint modes. + ASSERT_TRUE(deserialized_capabilities->breakpointModes.has_value()); + EXPECT_EQ(capabilities.breakpointModes->size(), + deserialized_capabilities->breakpointModes->size()); + for (size_t i = 0; i < capabilities.breakpointModes->size(); ++i) { + const auto &original = capabilities.breakpointModes->at(i); + const auto &deserialized = + deserialized_capabilities->breakpointModes->at(i); + EXPECT_EQ(original.mode, deserialized.mode); + EXPECT_EQ(original.label, deserialized.label); + EXPECT_EQ(original.description, deserialized.description); + EXPECT_EQ(original.appliesTo, deserialized.appliesTo); + } + + // Verify lldb extension version. 
+ ASSERT_TRUE(deserialized_capabilities->lldbExtVersion.has_value()); + EXPECT_EQ(capabilities.lldbExtVersion, + deserialized_capabilities->lldbExtVersion); +} + +TEST(ProtocolTypesTest, Scope) { + Scope scope; + scope.name = "Locals"; + scope.presentationHint = Scope::eScopePresentationHintLocals; + scope.variablesReference = 1; + scope.namedVariables = 2; + scope.indexedVariables = std::nullopt; + scope.expensive = false; + scope.line = 2; + scope.column = 3; + scope.endLine = 10; + scope.endColumn = 20; + + Source source; + source.name = "testName"; + source.path = "/path/to/source"; + source.sourceReference = 12345; + source.presentationHint = Source::eSourcePresentationHintNormal; + scope.source = source; + + llvm::Expected deserialized_scope = roundtrip(scope); + ASSERT_THAT_EXPECTED(deserialized_scope, llvm::Succeeded()); + EXPECT_EQ(scope.name, deserialized_scope->name); + EXPECT_EQ(scope.presentationHint, deserialized_scope->presentationHint); + EXPECT_EQ(scope.variablesReference, deserialized_scope->variablesReference); + EXPECT_EQ(scope.namedVariables, deserialized_scope->namedVariables); + EXPECT_EQ(scope.indexedVariables, deserialized_scope->indexedVariables); + EXPECT_EQ(scope.expensive, deserialized_scope->expensive); + EXPECT_EQ(scope.line, deserialized_scope->line); + EXPECT_EQ(scope.column, deserialized_scope->column); + EXPECT_EQ(scope.endLine, deserialized_scope->endLine); + EXPECT_EQ(scope.endColumn, deserialized_scope->endColumn); + + EXPECT_THAT(deserialized_scope->source.has_value(), true); + const Source &deserialized_source = deserialized_scope->source.value(); + + EXPECT_EQ(source.path, deserialized_source.path); + EXPECT_EQ(source.sourceReference, deserialized_source.sourceReference); + EXPECT_EQ(source.presentationHint, deserialized_source.presentationHint); +} + +TEST(ProtocolTypesTest, PresentationHint) { + // Test all PresentationHint values. 
+ std::vector> test_cases = + {{Source::eSourcePresentationHintNormal, "normal"}, + {Source::eSourcePresentationHintEmphasize, "emphasize"}, + {Source::eSourcePresentationHintDeemphasize, "deemphasize"}}; + + for (const auto &test_case : test_cases) { + // Serialize the PresentationHint to JSON. + llvm::json::Value serialized = toJSON(test_case.first); + ASSERT_EQ(serialized.kind(), llvm::json::Value::Kind::String); + EXPECT_EQ(serialized.getAsString(), test_case.second); + + // Deserialize the JSON back to PresentationHint. + Source::PresentationHint deserialized; + llvm::json::Path::Root root; + ASSERT_TRUE(fromJSON(serialized, deserialized, root)) + << llvm::toString(root.getError()); + EXPECT_EQ(deserialized, test_case.first); + } + + // Test invalid value. + llvm::json::Value invalid_value = "invalid_hint"; + Source::PresentationHint deserialized_invalid; + llvm::json::Path::Root root; + EXPECT_FALSE(fromJSON(invalid_value, deserialized_invalid, root)); +} + +TEST(ProtocolTypesTest, SteppingGranularity) { + // Test all SteppingGranularity values. + std::vector> test_cases = { + {eSteppingGranularityStatement, "statement"}, + {eSteppingGranularityLine, "line"}, + {eSteppingGranularityInstruction, "instruction"}}; + + for (const auto &test_case : test_cases) { + // Serialize the SteppingGranularity to JSON. + llvm::json::Value serialized = toJSON(test_case.first); + ASSERT_EQ(serialized.kind(), llvm::json::Value::Kind::String); + EXPECT_EQ(serialized.getAsString(), test_case.second); + + // Deserialize the JSON back to SteppingGranularity. + SteppingGranularity deserialized; + llvm::json::Path::Root root; + ASSERT_TRUE(fromJSON(serialized, deserialized, root)) + << llvm::toString(root.getError()); + EXPECT_EQ(deserialized, test_case.first); + } + + // Test invalid value. 
+ llvm::json::Value invalid_value = "invalid_granularity"; + SteppingGranularity deserialized_invalid; + llvm::json::Path::Root root; + EXPECT_FALSE(fromJSON(invalid_value, deserialized_invalid, root)); +} + +TEST(ProtocolTypesTest, BreakpointReason) { + // Test all BreakpointReason values. + std::vector> test_cases = { + {BreakpointReason::eBreakpointReasonPending, "pending"}, + {BreakpointReason::eBreakpointReasonFailed, "failed"}}; + + for (const auto &test_case : test_cases) { + // Serialize the BreakpointReason to JSON. + llvm::json::Value serialized = toJSON(test_case.first); + ASSERT_EQ(serialized.kind(), llvm::json::Value::Kind::String); + EXPECT_EQ(serialized.getAsString(), test_case.second); + + // Deserialize the JSON back to BreakpointReason. + BreakpointReason deserialized; + llvm::json::Path::Root root; + ASSERT_TRUE(fromJSON(serialized, deserialized, root)) + << llvm::toString(root.getError()); + EXPECT_EQ(deserialized, test_case.first); + } + + // Test invalid value. + llvm::json::Value invalid_value = "invalid_reason"; + BreakpointReason deserialized_invalid; + llvm::json::Path::Root root; + EXPECT_FALSE(fromJSON(invalid_value, deserialized_invalid, root)); +} + +TEST(ProtocolTypesTest, DataBreakpointAccessType) { + // Test all DataBreakpointAccessType values. + std::vector> test_cases = + {{eDataBreakpointAccessTypeRead, "read"}, + {eDataBreakpointAccessTypeWrite, "write"}, + {eDataBreakpointAccessTypeReadWrite, "readWrite"}}; + + for (const auto &test_case : test_cases) { + // Serialize the DataBreakpointAccessType to JSON. + llvm::json::Value serialized = toJSON(test_case.first); + ASSERT_EQ(serialized.kind(), llvm::json::Value::Kind::String); + EXPECT_EQ(serialized.getAsString(), test_case.second); + + // Deserialize the JSON back to DataBreakpointAccessType. 
+ DataBreakpointAccessType deserialized; + llvm::json::Path::Root root; + ASSERT_TRUE(fromJSON(serialized, deserialized, root)) + << llvm::toString(root.getError()); + EXPECT_EQ(deserialized, test_case.first); + } + + // Test invalid value + llvm::json::Value invalid_value = "invalid_access_type"; + DataBreakpointAccessType deserialized_invalid; + llvm::json::Path::Root root; + EXPECT_FALSE(fromJSON(invalid_value, deserialized_invalid, root)); +} + +TEST(ProtocolTypesTest, ColumnType) { + // Test all ColumnType values. + std::vector> test_cases = { + {eColumnTypeString, "string"}, + {eColumnTypeNumber, "number"}, + {eColumnTypeBoolean, "boolean"}, + {eColumnTypeTimestamp, "unixTimestampUTC"}}; + + for (const auto &test_case : test_cases) { + // Serialize the ColumnType to JSON. + llvm::json::Value serialized = toJSON(test_case.first); + ASSERT_EQ(serialized.kind(), llvm::json::Value::Kind::String); + EXPECT_EQ(serialized.getAsString(), test_case.second); + + // Deserialize the JSON back to ColumnType. + ColumnType deserialized; + llvm::json::Path::Root root; + ASSERT_TRUE(fromJSON(serialized, deserialized, root)) + << llvm::toString(root.getError()); + EXPECT_EQ(deserialized, test_case.first); + } + + // Test invalid value. + llvm::json::Value invalid_value = "invalid_column_type"; + ColumnType deserialized_invalid; + llvm::json::Path::Root root; + EXPECT_FALSE(fromJSON(invalid_value, deserialized_invalid, root)); +} + +TEST(ProtocolTypesTest, BreakpointModeApplicability) { + // Test all BreakpointModeApplicability values. + std::vector> + test_cases = {{eBreakpointModeApplicabilitySource, "source"}, + {eBreakpointModeApplicabilityException, "exception"}, + {eBreakpointModeApplicabilityData, "data"}, + {eBreakpointModeApplicabilityInstruction, "instruction"}}; + + for (const auto &test_case : test_cases) { + // Serialize the BreakpointModeApplicability to JSON. 
+ llvm::json::Value serialized = toJSON(test_case.first); + ASSERT_EQ(serialized.kind(), llvm::json::Value::Kind::String); + EXPECT_EQ(serialized.getAsString(), test_case.second); + + // Deserialize the JSON back to BreakpointModeApplicability. + BreakpointModeApplicability deserialized; + llvm::json::Path::Root root; + ASSERT_TRUE(fromJSON(serialized, deserialized, root)) + << llvm::toString(root.getError()); + EXPECT_EQ(deserialized, test_case.first); + } + + // Test invalid value. + llvm::json::Value invalid_value = "invalid_applicability"; + BreakpointModeApplicability deserialized_invalid; + llvm::json::Path::Root root; + EXPECT_FALSE(fromJSON(invalid_value, deserialized_invalid, root)); +} + +TEST(ProtocolTypesTest, ChecksumAlgorithm) { + // Test all ChecksumAlgorithm values. + std::vector> test_cases = { + {eChecksumAlgorithmMD5, "MD5"}, + {eChecksumAlgorithmSHA1, "SHA1"}, + {eChecksumAlgorithmSHA256, "SHA256"}, + {eChecksumAlgorithmTimestamp, "timestamp"}}; + + for (const auto &test_case : test_cases) { + // Serialize the ChecksumAlgorithm to JSON. + llvm::json::Value serialized = toJSON(test_case.first); + ASSERT_EQ(serialized.kind(), llvm::json::Value::Kind::String); + EXPECT_EQ(serialized.getAsString(), test_case.second); + + // Deserialize the JSON back to ChecksumAlgorithm. + ChecksumAlgorithm deserialized; + llvm::json::Path::Root root; + ASSERT_TRUE(fromJSON(serialized, deserialized, root)) + << llvm::toString(root.getError()); + EXPECT_EQ(deserialized, test_case.first); + } + + // Test invalid value. 
+ llvm::json::Value invalid_value = "invalid_algorithm"; + ChecksumAlgorithm deserialized_invalid; + llvm::json::Path::Root root; + EXPECT_FALSE(fromJSON(invalid_value, deserialized_invalid, root)); +} diff --git a/lldb/unittests/DAP/TestBase.cpp b/lldb/unittests/DAP/TestBase.cpp new file mode 100644 index 0000000000000..eb146cb2fa9f4 --- /dev/null +++ b/lldb/unittests/DAP/TestBase.cpp @@ -0,0 +1,70 @@ +//===-- TestBase.cpp ------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TestBase.h" +#include "Protocol/ProtocolBase.h" +#include "lldb/Host/File.h" +#include "lldb/Host/Pipe.h" +#include "llvm/Testing/Support/Error.h" + +using namespace llvm; +using namespace lldb; +using namespace lldb_dap; +using namespace lldb_dap::protocol; +using namespace lldb_dap_tests; +using lldb_private::File; +using lldb_private::NativeFile; +using lldb_private::Pipe; + +void PipeBase::SetUp() { + ASSERT_THAT_ERROR(input.CreateNew(false).ToError(), Succeeded()); + ASSERT_THAT_ERROR(output.CreateNew(false).ToError(), Succeeded()); +} + +void TransportBase::SetUp() { + PipeBase::SetUp(); + to_dap = std::make_unique( + "to_dap", nullptr, + std::make_shared(input.GetReadFileDescriptor(), + File::eOpenOptionReadOnly, + NativeFile::Unowned), + std::make_shared(output.GetWriteFileDescriptor(), + File::eOpenOptionWriteOnly, + NativeFile::Unowned)); + from_dap = std::make_unique( + "from_dap", nullptr, + std::make_shared(output.GetReadFileDescriptor(), + File::eOpenOptionReadOnly, + NativeFile::Unowned), + std::make_shared(input.GetWriteFileDescriptor(), + File::eOpenOptionWriteOnly, + NativeFile::Unowned)); +} + +void DAPTestBase::SetUp() { + TransportBase::SetUp(); + dap = 
std::make_unique( + /*log=*/nullptr, + /*default_repl_mode=*/ReplMode::Auto, + /*pre_init_commands=*/std::vector(), + /*transport=*/*to_dap); +} + +std::vector DAPTestBase::DrainOutput() { + std::vector msgs; + output.CloseWriteFileDescriptor(); + while (true) { + Expected next = from_dap->Read(std::chrono::milliseconds(1)); + if (!next) { + consumeError(next.takeError()); + break; + } + msgs.push_back(*next); + } + return msgs; +} diff --git a/lldb/unittests/DAP/TestBase.h b/lldb/unittests/DAP/TestBase.h new file mode 100644 index 0000000000000..c789adf53c225 --- /dev/null +++ b/lldb/unittests/DAP/TestBase.h @@ -0,0 +1,48 @@ +//===-- TestBase.h --------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DAP.h" +#include "Protocol/ProtocolBase.h" +#include "Transport.h" +#include "lldb/Host/Pipe.h" +#include "gtest/gtest.h" + +namespace lldb_dap_tests { + +/// A base class for tests that need a pair of pipes for communication. +class PipeBase : public testing::Test { +protected: + lldb_private::Pipe input; + lldb_private::Pipe output; + + void SetUp() override; +}; + +/// A base class for tests that need transport configured for communicating DAP +/// messages. +class TransportBase : public PipeBase { +protected: + std::unique_ptr to_dap; + std::unique_ptr from_dap; + + void SetUp() override; +}; + +/// A base class for tests that interact with a `lldb_dap::DAP` instance. +class DAPTestBase : public TransportBase { +protected: + std::unique_ptr dap; + + void SetUp() override; + + /// Closes the DAP output pipe and returns the remaining protocol messages in + /// the buffer.
+ std::vector DrainOutput(); +}; + +} // namespace lldb_dap_tests diff --git a/lldb/unittests/DAP/TransportTest.cpp b/lldb/unittests/DAP/TransportTest.cpp new file mode 100644 index 0000000000000..e6dab42e30941 --- /dev/null +++ b/lldb/unittests/DAP/TransportTest.cpp @@ -0,0 +1,94 @@ +//===-- TransportTest.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Transport.h" +#include "Protocol/ProtocolBase.h" +#include "TestBase.h" +#include "lldb/Host/File.h" +#include "lldb/Host/Pipe.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" +#include +#include +#include + +using namespace llvm; +using namespace lldb; +using namespace lldb_dap; +using namespace lldb_dap_tests; +using namespace lldb_dap::protocol; +using lldb_private::File; +using lldb_private::NativeFile; +using lldb_private::Pipe; + +class TransportTest : public PipeBase { +protected: + std::unique_ptr transport; + + void SetUp() override { + PipeBase::SetUp(); + transport = std::make_unique( + "stdio", nullptr, + std::make_shared(input.GetReadFileDescriptor(), + File::eOpenOptionReadOnly, + NativeFile::Unowned), + std::make_shared(output.GetWriteFileDescriptor(), + File::eOpenOptionWriteOnly, + NativeFile::Unowned)); + } +}; + +TEST_F(TransportTest, MalformedRequests) { + std::string malformed_header = "COnTent-LenGth: -1{}\r\n\r\nnotjosn"; + ASSERT_THAT_EXPECTED( + input.Write(malformed_header.data(), malformed_header.size()), + Succeeded()); + ASSERT_THAT_EXPECTED( + transport->Read(std::chrono::milliseconds(1)), + FailedWithMessage( + "expected 'Content-Length: ' and got 'COnTent-LenGth: '")); +} + +TEST_F(TransportTest, Read) { + std::string 
json = + R"json({"seq": 1, "type": "request", "command": "abc"})json"; + std::string message = + formatv("Content-Length: {0}\r\n\r\n{1}", json.size(), json).str(); + ASSERT_THAT_EXPECTED(input.Write(message.data(), message.size()), + Succeeded()); + ASSERT_THAT_EXPECTED( + transport->Read(std::chrono::milliseconds(1)), + HasValue(testing::VariantWith(testing::FieldsAre( + /*seq=*/1, /*command=*/"abc", /*arguments=*/std::nullopt)))); +} + +TEST_F(TransportTest, ReadWithTimeout) { + ASSERT_THAT_EXPECTED(transport->Read(std::chrono::milliseconds(1)), + Failed()); +} + +TEST_F(TransportTest, ReadWithEOF) { + input.CloseWriteFileDescriptor(); + ASSERT_THAT_EXPECTED(transport->Read(std::chrono::milliseconds(1)), + Failed()); +} + +TEST_F(TransportTest, Write) { + ASSERT_THAT_ERROR(transport->Write(Event{"my-event", std::nullopt}), + Succeeded()); + output.CloseWriteFileDescriptor(); + char buf[1024]; + Expected bytes_read = + output.Read(buf, sizeof(buf), std::chrono::milliseconds(1)); + ASSERT_THAT_EXPECTED(bytes_read, Succeeded()); + ASSERT_EQ( + StringRef(buf, *bytes_read), + StringRef("Content-Length: 43\r\n\r\n" + R"json({"event":"my-event","seq":0,"type":"event"})json")); +} diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index c427a65ee030c..2912f45953c41 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -1240,6 +1240,8 @@ if(LLVM_PROFDATA_FILE AND EXISTS ${LLVM_PROFDATA_FILE}) else() message(FATAL_ERROR "LLVM_PROFDATA_FILE can only be specified when compiling with clang") endif() +elseif(LLVM_PROFDATA_FILE) + message(WARNING "LLVM_PROFDATA_FILE specified, but ${LLVM_PROFDATA_FILE} not found") endif() option(LLVM_BUILD_INSTRUMENTED_COVERAGE "Build LLVM and tools with Code Coverage instrumentation" Off) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index d1535960a0257..3751bb3a332da 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ 
b/llvm/docs/AMDGPUUsage.rst @@ -394,12 +394,12 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following **GCN GFX10.1 (RDNA 1)** [AMD-GCN-GFX10-RDNA1]_ ----------------------------------------------------------------------------------------------------------------------- - ``gfx1010`` ``amdgcn`` dGPU - cumode - Absolute - *rocm-amdhsa* - Radeon RX 5700 - - wavefrontsize64 flat - *pal-amdhsa* - Radeon RX 5700 XT - - xnack scratch - *pal-amdpal* - Radeon Pro 5600 XT - - Radeon Pro 5600M + ``gfx1010`` ``amdgcn`` dGPU - cumode - Absolute - *rocm-amdhsa* - Radeon Pro 5600 XT + - wavefrontsize64 flat - *pal-amdhsa* - Radeon RX 5600M + - xnack scratch - *pal-amdpal* - Radeon RX 5700 + - Radeon RX 5700 XT ``gfx1011`` ``amdgcn`` dGPU - cumode - *rocm-amdhsa* - Radeon Pro V520 - - wavefrontsize64 - Absolute - *pal-amdhsa* + - wavefrontsize64 - Absolute - *pal-amdhsa* - Radeon Pro 5600M - xnack flat - *pal-amdpal* scratch ``gfx1012`` ``amdgcn`` dGPU - cumode - Absolute - *rocm-amdhsa* - Radeon RX 5500 @@ -4547,7 +4547,7 @@ same *vendor-name*. Code Object V4 Metadata +++++++++++++++++++++++ -. warning:: +.. warning:: Code object V4 is not the default code object version emitted by this version of LLVM. diff --git a/llvm/docs/Coroutines.rst b/llvm/docs/Coroutines.rst index 60e32dc467d27..f64029547e648 100644 --- a/llvm/docs/Coroutines.rst +++ b/llvm/docs/Coroutines.rst @@ -2121,10 +2121,11 @@ Coroutine Transformation Passes =============================== CoroEarly --------- -The pass CoroEarly lowers coroutine intrinsics that hide the details of the -structure of the coroutine frame, but, otherwise not needed to be preserved to -help later coroutine passes. This pass lowers `coro.frame`_, `coro.done`_, -and `coro.promise`_ intrinsics. +The CoroEarly pass ensures later middle end passes correctly interpret coroutine +semantics and lowers coroutine intrinsics that not needed to be preserved to +help later coroutine passes. 
This pass lowers `coro.promise`_, `coro.frame`_ and +`coro.done`_ intrinsics. Afterwards, it replace uses of promise alloca with +`coro.promise`_ intrinsic. .. _CoroSplit: diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst index de0022e518cc8..31dee0bbb17ba 100644 --- a/llvm/docs/GitHub.rst +++ b/llvm/docs/GitHub.rst @@ -292,9 +292,9 @@ Your options are as follows: subsequent PRs in a stack. Instead, it will show a combined diff that includes all commits from earlier PRs. - As described in the first option above, in such cases it is the PR author’s - responsibility to clearly indicate which commits are relevant to the - current PR. For example: “The first N commits are from the base PR.” + As described above, it is the PR author’s responsibility to clearly indicate + which commits are relevant to the current PR. + For example: “The first N commits are from the base PR.” You can avoid this issue by using user branches directly in the ``llvm/llvm-project`` repository. diff --git a/llvm/docs/GlobalISel/KnownBits.rst b/llvm/docs/GlobalISel/KnownBits.rst index c01faa5f08f0f..3c61a58626e84 100644 --- a/llvm/docs/GlobalISel/KnownBits.rst +++ b/llvm/docs/GlobalISel/KnownBits.rst @@ -66,7 +66,7 @@ dependency with ``INITIALIZE_PASS_DEPENDENCY``. ... INITIALIZE_PASS_BEGIN(...) - INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) + INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_END(...) and require the pass in ``getAnalysisUsage``. @@ -74,10 +74,10 @@ and require the pass in ``getAnalysisUsage``. .. code-block:: c++ void MyPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); + AU.addRequired(); // Optional: If your pass preserves known bits analysis (many do) then // indicate that it's preserved for re-use by another pass here. 
- AU.addPreserved(); + AU.addPreserved(); } Then it's just a matter of fetching the analysis and using it: @@ -86,7 +86,7 @@ Then it's just a matter of fetching the analysis and using it: bool MyPass::runOnMachineFunction(MachineFunction &MF) { ... - GISelValueTracking &VT = getAnalysis().get(MF); + GISelValueTracking &VT = getAnalysis().get(MF); ... MachineInstr *MI = ...; KnownBits Known = VT->getKnownBits(MI->getOperand(0).getReg()); diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 5f14726c36672..343ca743c74f8 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -410,8 +410,8 @@ added in the future: calling convention: on most platforms, they are not preserved and need to be saved by the caller, but on Windows, xmm6-xmm15 are preserved. - - On AArch64 the callee preserve all general purpose registers, except X0-X8 - and X16-X18. + - On AArch64 the callee preserve all general purpose registers, except + X0-X8 and X16-X18. Not allowed with ``nest``. The idea behind this convention is to support calls to runtime functions that have a hot path and a cold path. The hot path is usually a small piece @@ -447,9 +447,9 @@ added in the future: R11. R11 can be used as a scratch register. Furthermore it also preserves all floating-point registers (XMMs/YMMs). - - On AArch64 the callee preserve all general purpose registers, except X0-X8 - and X16-X18. Furthermore it also preserves lower 128 bits of V8-V31 SIMD - - floating point registers. + - On AArch64 the callee preserve all general purpose registers, except + X0-X8 and X16-X18. Furthermore it also preserves lower 128 bits of V8-V31 + SIMD floating point registers. Not allowed with ``nest``. The idea behind this convention is to support calls to runtime functions that don't need to call out to any other functions. 
@@ -3133,6 +3133,9 @@ as follows: program memory space defaults to the default address space of 0, which corresponds to a Von Neumann architecture that has code and data in the same space. + +.. _globals_addrspace: + ``G
`` Specifies the address space to be used by default when creating global variables. If omitted, the globals address space defaults to the default @@ -3147,14 +3150,21 @@ as follows: ``A
`` Specifies the address space of objects created by '``alloca``'. Defaults to the default address space of 0. -``p[n]::[:][:]`` - This specifies the *size* of a pointer and its ```` and - ````\erred alignments for address space ``n``. - The fourth parameter ```` is the size of the - index that used for address calculation, which must be less than or equal - to the pointer size. If not - specified, the default index size is equal to the pointer size. All sizes - are in bits. The address space, ``n``, is optional, and if not specified, +``p[n]::[:[:]]`` + This specifies the properties of a pointer in address space ``n``. + The ```` parameter specifies the size of the bitwise representation. + For :ref:`non-integral pointers ` the representation size may + be larger than the address width of the underlying address space (e.g. to + accommodate additional metadata). + The alignment requirements are specified via the ```` and + ````\erred alignments parameters. + The fourth parameter ```` is the size of the index that used for + address calculations such as :ref:`getelementptr `. + It must be less than or equal to the pointer size. If not specified, the + default index size is equal to the pointer size. + The index size also specifies the width of addresses in this address space. + All sizes are in bits. + The address space, ``n``, is optional, and if not specified, denotes the default address space 0. The value of ``n`` must be in the range [1,2^24). ``i:[:]`` @@ -4266,6 +4276,16 @@ address spaces defined in the :ref:`datalayout string`. the default globals address space and ``addrspace("P")`` the program address space. +The representation of pointers can be different for each address space and does +not necessarily need to be a plain integer address (e.g. for +:ref:`non-integral pointers `). 
In addition to a representation +bits size, pointers in each address space also have an index size which defines +the bitwidth of indexing operations as well as the size of `integer addresses` +in this address space. For example, CHERI capabilities are twice the size of the +underlying addresses to accommodate for additional metadata such as bounds and +permissions: on a 32-bit system the bitwidth of the pointer representation size +is 64, but the underlying address width remains 32 bits. + The default address space is number zero. The semantics of non-zero address spaces are target-specific. Memory @@ -15061,7 +15081,8 @@ Syntax: :: - declare ptr @llvm.thread.pointer() + declare ptr @llvm.thread.pointer.p0() + declare ptr addrspace(5) @llvm.thread.pointer.p5() Overview: """"""""" @@ -15078,7 +15099,8 @@ specific: it may point to the start of TLS area, to the end, or somewhere in the middle. Depending on the target, this intrinsic may read a register, call a helper function, read from an alternate memory space, or perform other operations necessary to locate the TLS area. Not all targets support -this intrinsic. +this intrinsic. The address space must be the :ref:`globals address space +`. '``llvm.call.preallocated.setup``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -17195,12 +17217,14 @@ type. Semantics: """""""""" -If both operands are NaNs (including sNaN), returns qNaN. If one operand -is NaN (including sNaN) and another operand is a number, return the number. -Otherwise returns the lesser of the two arguments. -0.0 is considered to -be less than +0.0 for this intrinsic. -Note that these are the semantics of minimumNumber specified in IEEE 754-2019. +If both operands are NaNs (including sNaN), returns a :ref:`NaN `. If +one operand is NaN (including sNaN) and another operand is a number, +return the number. Otherwise returns the lesser of the two +arguments. -0.0 is considered to be less than +0.0 for this intrinsic. 
+ +Note that these are the semantics of minimumNumber specified in +IEEE-754-2019 with the usual :ref:`signaling NaN ` exception. It has some differences with '``llvm.minnum.*``': 1)'``llvm.minnum.*``' will return qNaN if either operand is sNaN. @@ -17241,12 +17265,15 @@ type. Semantics: """""""""" -If both operands are NaNs (including sNaN), returns qNaN. If one operand -is NaN (including sNaN) and another operand is a number, return the number. -Otherwise returns the greater of the two arguments. -0.0 is considered to -be less than +0.0 for this intrinsic. -Note that these are the semantics of maximumNumber specified in IEEE 754-2019. +If both operands are NaNs (including sNaN), returns a +:ref:`NaN `. If one operand is NaN (including sNaN) and +another operand is a number, return the number. Otherwise returns the +greater of the two arguments. -0.0 is considered to be less than +0.0 +for this intrinsic. + +Note that these are the semantics of maximumNumber specified in +IEEE-754-2019 with the usual :ref:`signaling NaN ` exception. It has some differences with '``llvm.maxnum.*``': 1)'``llvm.maxnum.*``' will return qNaN if either operand is sNaN. @@ -21120,7 +21147,12 @@ sufficiently aligned block of memory; this memory is written to by the intrinsic. Note that the size and the alignment are target-specific - LLVM currently provides no portable way of determining them, so a front-end that generates this intrinsic needs to have some -target-specific knowledge. The ``func`` argument must hold a function. +target-specific knowledge. + +The ``func`` argument must be a constant (potentially bitcasted) pointer to a +function declaration or definition, since the calling convention may affect the +content of the trampoline that is created. + Semantics: """""""""" diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst index 692c9861d8caa..78e63a2fa8cc3 100644 --- a/llvm/docs/MLGO.rst +++ b/llvm/docs/MLGO.rst @@ -27,8 +27,180 @@ of models during training. 
Corpus Tooling ============== -.. - TODO(boomanaiden154): Write this section. +Within the LLVM monorepo, there is the ``mlgo-utils`` python packages that +lives at ``llvm/utils/mlgo-utils``. This package primarily contains tooling +for working with corpora, or collections of LLVM bitcode. We use these corpora +to train and evaluate ML models. Corpora consist of a description in JSON +format at ``corpus_description.json`` in the root of the corpus, and then +a bitcode file and command line flags file for each extracted module. The +corpus structure is designed to contain sufficient information to fully +compile the bitcode to bit-identical object files. + +.. program:: extract_ir.py + +Synopsis +-------- + +Extracts a corpus from some form of a structured compilation database. This +tool supports a variety of different scenarios and input types. + +Options +------- + +.. option:: --input + + The path to the input. This should be a path to a supported structured + compilation database. Currently only ``compile_commands.json`` files, linker + parameter files, a directory containing object files (for the local + ThinLTO case only), or a JSON file containing a bazel aquery result are + supported. + +.. option:: --input_type + + The type of input that has been passed to the ``--input`` flag. + +.. option:: --output_dir + + The output directory to place the corpus in. + +.. option:: --num_workers + + The number of workers to use for extracting bitcode into the corpus. This + defaults to the number of hardware threads available on the host system. + +.. option:: --llvm_objcopy_path + + The path to the llvm-objcopy binary to use when extracting bitcode. + +.. option:: --obj_base_dir + + The base directory for object files. Bitcode files that get extracted into + the corpus will be placed into the output directory based on where their + source object files are placed relative to this path. + +.. option:: --cmd_filter + + Allows filtering of modules by command line. 
If set, only modules that match + the filter will be extracted into the corpus. Regular expressions are + supported in some instances. + +.. option:: --thinlto_build + + If the build was performed with ThinLTO, this should be set to either + ``distributed`` or ``local`` depending upon how the build was performed. + +.. option:: --cmd_section_name + + This flag allows specifying the command line section name. This is needed + on non-ELF platforms where the section name might differ. + +.. option:: --bitcode_section_name + + This flag allows specifying the bitcode section name. This is needed on + non-ELF platforms where the section name might differ. + +Example: CMake +-------------- + +CMake can output a ``compile_commands.json`` compilation database if the +``CMAKE_EXPORT_COMPILE_COMMANDS`` switch is turned on at compile time. It is +also necessary to enable bitcode embedding (done by passing +``-Xclang -fembed-bitcode=all`` to all C/C++ compilation actions in the +non-ThinLTO case). For example, to extract a corpus from clang, you would +run the following commands (assuming that the system C/C++ compiler is clang): + +.. code-block:: bash + + cmake -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -DCMAKE_C_FLAGS="-Xclang -fembed-bitcode=all" \ + -DCMAKE_CXX_FLAGS="-Xclang -fembed-bitcode=all" + ../llvm + ninja + +After running CMake and building the project, there should be a + ``compile_commands.json`` file within the build directory. You can then + run the following command to create a corpus: + +.. code-block:: bash + + python3 ./extract_ir.py \ + --input=./build/compile_commands.json \ + --input_type=json \ + --output_dir=./corpus + +After running the above command, there should be a full +corpus of bitcode within the ``./corpus`` directory. + +Example: Bazel Aquery +--------------------- + +This tool also supports extracting bitcode from bazel in multiple ways +depending upon the exact configuration. 
For ThinLTO, a linker parameters file +is preferred. For the non-ThinLTO case, the script will accept the output of +``bazel aquery`` which it will use to find all the object files that are linked +into a specific target and then extract bitcode from them. First, you need +to generate the aquery output: + +.. code-block:: bash + + bazel aquery --output=jsonproto //path/to:target > /path/to/aquery.json + +Afterwards, assuming that the build is already complete, you can run this +script to create a corpus: + +.. code-block:: bash + + python3 ./extract_ir.py \ + --input=/path/to/aquery.json \ + --input_type=bazel_aquery \ + --output_dir=./corpus \ + --obj_base_dir=./bazel-bin + +This will again leave a corpus that contains all the bitcode files. This mode +does not capture all object files in the build however, only the ones that +are involved in the link for the binary passed to the ``bazel aquery`` +invocation. + +.. program:: make_corpus.py + +Synopsis +-------- + +Creates a corpus from a collection of bitcode files. + +Options +------- + +.. option:: --input_dir + + The input directory to search for bitcode files in. + +.. option:: --output_dir + + The output directory to place the constructed corpus in. + +.. option:: --default_args + + A list of space separated flags that are put into the corpus description. + These are used by some tooling when compiling the modules within the corpus. + +.. program:: combine_training_corpus.py + +Synopsis +-------- + +Combines two training corpora that share the same parent folder by generating +a new ``corpus_description.json`` that contains all the modules in both corpora. + +Options +------- + +.. option:: --root_dir + + The root directory that contains subfolders consisting of the corpora that + should be combined. Interacting with ML models ========================== @@ -61,6 +233,7 @@ call, where the parameters and result are bound by name and are described by name, scalar type, and shape tuples. 
The main types in LLVM are: + - ``MLModelRunner`` - an abstraction for the decision making mechanism - ``TensorSpec`` which describes a tensor. diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 51bbfd0a5c88d..957cccc6268e6 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -672,6 +672,7 @@ Syntax: .. code-block:: llvm declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(..., i32 %size, i64 %ch, i1 %flag_ch, i16 %mask) Overview: """"""""" @@ -680,10 +681,13 @@ The '``@llvm.nvvm.cp.async.bulk.shared.cta.to.global``' intrinsic corresponds to the ``cp.async.bulk.global.shared::cta.*`` set of PTX instructions. These instructions initiate an asynchronous copy from shared::cta to global memory. The 32-bit operand ``%size`` specifies -the amount of memory to be copied and it must be a multiple of 16. +the amount of memory to be copied (in bytes) and it must be a multiple +of 16. For the ``.bytemask`` variant, the 16-bit wide mask operand +specifies whether the i-th byte of each 16-byte wide chunk of source +data is copied to the destination. -* The last argument to these intrinsics is a boolean flag - indicating support for cache_hint. This flag argument must +* The ``i1 %flag_ch`` argument to these intrinsics is a boolean + flag indicating support for cache_hint. This flag argument must be a compile-time constant. When set, it indicates a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint`` variant of the PTX instruction. diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 0ebe1764c6502..cf11d3878a745 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -119,6 +119,7 @@ on support follow. 
``E`` Supported (`See note <#riscv-rve-note>`__) ``H`` Assembly Support ``M`` Supported + ``Q`` Assembly Support ``Sha`` Supported ``Shcounterenw`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__) ``Shgatpa`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__) @@ -511,6 +512,9 @@ The current vendor extensions supported are: ``XAndesVPackFPH`` LLVM implements `version 5.0.0 of the Andes Vector Packed FP16 Extension specification `__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification. +``XAndesVDot`` + LLVM implements `version 5.0.0 of the Andes Vector Dot Product Extension specification `__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification. + Experimental C Intrinsics ========================= diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index f4bec50cfca46..9c8cc599a8daf 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -186,7 +186,10 @@ Changes to the RISC-V Backend * Adds assembler support for the Andes `XAndesperf` (Andes Performance extension). * `-mcpu=sifive-p870` was added. * Adds assembler support for the Andes `XAndesvpackfph` (Andes Vector Packed FP16 extension). - +* Adds assembler support for the Andes `XAndesvdot` (Andes Vector Dot Product extension). +* Adds assembler support for the standard `Q` (Quad-Precision Floating Point) + extension. 
+ Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/include/llvm-c/Orc.h b/llvm/include/llvm-c/Orc.h index 8609a8a6d9e9d..743ba1d581782 100644 --- a/llvm/include/llvm-c/Orc.h +++ b/llvm/include/llvm-c/Orc.h @@ -1059,7 +1059,7 @@ LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForPath( */ LLVMErrorRef LLVMOrcCreateStaticLibrarySearchGeneratorForPath( LLVMOrcDefinitionGeneratorRef *Result, LLVMOrcObjectLayerRef ObjLayer, - const char *FileName, const char *TargetTriple); + const char *FileName); /** * Create a ThreadSafeContext containing a new LLVMContext. diff --git a/llvm/include/llvm/ADT/APFixedPoint.h b/llvm/include/llvm/ADT/APFixedPoint.h index 70d7f325702cf..89d2a93a06a26 100644 --- a/llvm/include/llvm/ADT/APFixedPoint.h +++ b/llvm/include/llvm/ADT/APFixedPoint.h @@ -249,7 +249,10 @@ class APFixedPoint { } void print(raw_ostream &) const; - void dump() const; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const; +#endif // If LHS > RHS, return 1. If LHS == RHS, return 0. If LHS < RHS, return -1. int compare(const APFixedPoint &Other) const; diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index ed49380cfc05f..b88cbc56c105c 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -1483,7 +1483,10 @@ class APFloat : public APFloatBase { } void print(raw_ostream &) const; - void dump() const; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const; +#endif bool getExactInverse(APFloat *inv) const { APFLOAT_DISPATCH_ON_SEMANTICS(getExactInverse(inv)); diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 7fbf09b44e6c4..44260c7eca309 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -1896,8 +1896,10 @@ class [[nodiscard]] APInt { /// FoldingSets. 
void Profile(FoldingSetNodeID &id) const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// debug method - void dump() const; + LLVM_DUMP_METHOD void dump() const; +#endif /// Returns whether this instance allocated memory. bool needsCleanup() const { return !isSingleWord(); } diff --git a/llvm/include/llvm/ADT/BitmaskEnum.h b/llvm/include/llvm/ADT/BitmaskEnum.h index dcb13bd8ba51a..7214f25b0aa10 100644 --- a/llvm/include/llvm/ADT/BitmaskEnum.h +++ b/llvm/include/llvm/ADT/BitmaskEnum.h @@ -92,6 +92,7 @@ using ::llvm::BitmaskEnumDetail::operator^=; \ using ::llvm::BitmaskEnumDetail::operator<<=; \ using ::llvm::BitmaskEnumDetail::operator>>=; \ + using ::llvm::BitmaskEnumDetail::operator!; \ /* Force a semicolon at the end of this macro. */ \ using ::llvm::BitmaskEnumDetail::any @@ -141,6 +142,11 @@ constexpr unsigned bitWidth(uint64_t Value) { return Value ? 1 + bitWidth(Value >> 1) : 0; } +template ::value>> +constexpr bool operator!(E Val) { + return Val == static_cast(0); +} + template ::value>> constexpr bool any(E Val) { return Val != static_cast(0); diff --git a/llvm/include/llvm/ADT/DynamicAPInt.h b/llvm/include/llvm/ADT/DynamicAPInt.h index ff958d48e7731..bb65a08a968d9 100644 --- a/llvm/include/llvm/ADT/DynamicAPInt.h +++ b/llvm/include/llvm/ADT/DynamicAPInt.h @@ -216,7 +216,9 @@ class DynamicAPInt { void static_assert_layout(); // NOLINT raw_ostream &print(raw_ostream &OS) const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void dump() const; +#endif }; inline raw_ostream &operator<<(raw_ostream &OS, const DynamicAPInt &X) { diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index fed012ce56005..b1009f8b49992 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -60,6 +60,7 @@ namespace llvm { /// 5 1 2 /// template class EquivalenceClasses { +public: /// ECValue - The EquivalenceClasses data structure is just a set of these. 
/// Each of these represents a relation for a value. First it stores the /// value itself. Next, it provides a "next pointer", which is used to @@ -122,6 +123,7 @@ template class EquivalenceClasses { } }; +private: /// TheMapping - This implicitly provides a mapping from ElemTy values to the /// ECValues, it just keeps the key as part of the value. DenseMap TheMapping; diff --git a/llvm/include/llvm/ADT/ImmutableSet.h b/llvm/include/llvm/ADT/ImmutableSet.h index 5bee746688ce4..ac86f43b2048e 100644 --- a/llvm/include/llvm/ADT/ImmutableSet.h +++ b/llvm/include/llvm/ADT/ImmutableSet.h @@ -20,6 +20,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include #include @@ -213,9 +214,12 @@ class ImutAVLTree { ImutAVLTree *next = nullptr; unsigned height : 28; - bool IsMutable : 1; - bool IsDigestCached : 1; - bool IsCanonicalized : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IsMutable : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IsDigestCached : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IsCanonicalized : 1; value_type value; uint32_t digest = 0; diff --git a/llvm/include/llvm/ADT/SlowDynamicAPInt.h b/llvm/include/llvm/ADT/SlowDynamicAPInt.h index ec1021892cf4d..c9aef96b9e1c3 100644 --- a/llvm/include/llvm/ADT/SlowDynamicAPInt.h +++ b/llvm/include/llvm/ADT/SlowDynamicAPInt.h @@ -79,7 +79,10 @@ class SlowDynamicAPInt { unsigned getBitWidth() const { return Val.getBitWidth(); } void print(raw_ostream &OS) const; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void dump() const; +#endif }; inline raw_ostream &operator<<(raw_ostream &OS, const SlowDynamicAPInt &X) { diff --git a/llvm/include/llvm/ADT/TrieRawHashMap.h b/llvm/include/llvm/ADT/TrieRawHashMap.h index e312967edeb58..1382eac1c768f 100644 --- a/llvm/include/llvm/ADT/TrieRawHashMap.h +++ b/llvm/include/llvm/ADT/TrieRawHashMap.h @@ -90,7 +90,10 @@ class 
ThreadSafeTrieRawHashMapBase { static void *operator new(size_t Size) { return ::operator new(Size); } void operator delete(void *Ptr) { ::operator delete(Ptr); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void dump() const; +#endif + void print(raw_ostream &OS) const; protected: @@ -214,7 +217,10 @@ class ThreadSafeTrieRawHashMap : public ThreadSafeTrieRawHashMapBase { using ThreadSafeTrieRawHashMapBase::operator delete; using HashType = HashT; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) using ThreadSafeTrieRawHashMapBase::dump; +#endif + using ThreadSafeTrieRawHashMapBase::print; private: diff --git a/llvm/include/llvm/ADT/Twine.h b/llvm/include/llvm/ADT/Twine.h index 1f1fd1967efbc..d9e553a8a8c77 100644 --- a/llvm/include/llvm/ADT/Twine.h +++ b/llvm/include/llvm/ADT/Twine.h @@ -507,14 +507,16 @@ namespace llvm { /// stream \p OS. void print(raw_ostream &OS) const; - /// Dump the concatenated string represented by this twine to stderr. - void dump() const; - /// Write the representation of this twine to the stream \p OS. void printRepr(raw_ostream &OS) const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Dump the concatenated string represented by this twine to stderr. + LLVM_DUMP_METHOD void dump() const; + /// Dump the representation of this twine to stderr. - void dumpRepr() const; + LLVM_DUMP_METHOD void dumpRepr() const; +#endif /// @} }; diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h index d23b81854c9ea..16f54c394788d 100644 --- a/llvm/include/llvm/Analysis/AliasAnalysis.h +++ b/llvm/include/llvm/Analysis/AliasAnalysis.h @@ -1011,19 +1011,24 @@ struct ExternalAAWrapperPass : ImmutablePass { ExternalAAWrapperPass(); - explicit ExternalAAWrapperPass(CallbackT CB); + explicit ExternalAAWrapperPass(CallbackT CB, bool RunEarly = false); - /// Returns whether this external AA should run before Basic AA. 
+ /// Flag indicating whether this external AA should run before Basic AA. /// - /// By default, external AA passes are run after Basic AA. If this returns - /// true, the external AA will be run before Basic AA during alias analysis. + /// This flag is for LegacyPassManager only. To run an external AA early + /// with the NewPassManager, override the registerEarlyDefaultAliasAnalyses + /// method on the target machine. + /// + /// By default, external AA passes are run after Basic AA. If this flag is + /// set to true, the external AA will be run before Basic AA during alias + /// analysis. /// /// For some targets, we prefer to run the external AA early to improve /// compile time as it has more target-specific information. This is /// particularly useful when the external AA can provide more precise results /// than Basic AA so that Basic AA does not need to spend time recomputing /// them. - virtual bool runEarly() { return false; } + bool RunEarly = false; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index f715e0ec8dbb4..fea2ede8b5ab4 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -853,11 +853,10 @@ getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, /// is a simple API that does not depend on the analysis pass. /// \param StrictCheck Ensure that the calculated distance matches the /// type-based one after all the bitcasts removal in the provided pointers. 
-std::optional getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, - Value *PtrB, const DataLayout &DL, - ScalarEvolution &SE, - bool StrictCheck = false, - bool CheckType = true); +std::optional +getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, + const DataLayout &DL, ScalarEvolution &SE, + bool StrictCheck = false, bool CheckType = true); /// Attempt to sort the pointers in \p VL and return the sorted indices /// in \p SortedIndices, if reordering is required. diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 4fbc0cf1e5954..82890bf814935 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -602,7 +602,7 @@ struct RootDescriptor : public v1::RootDescriptor { uint32_t Flags; RootDescriptor() = default; - RootDescriptor(v1::RootDescriptor &Base) + explicit RootDescriptor(v1::RootDescriptor &Base) : v1::RootDescriptor(Base), Flags(0u) {} void swapBytes() { diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 1645018aebedb..81d2c54b6e07c 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -34,7 +34,7 @@ SHADER_FEATURE_FLAG(14, 19, WaveOps, "Wave level operations") SHADER_FEATURE_FLAG(15, 20, Int64Ops, "64-Bit integer") SHADER_FEATURE_FLAG(16, 21, ViewID, "View Instancing") SHADER_FEATURE_FLAG(17, 22, Barycentrics, "Barycentrics") -SHADER_FEATURE_FLAG(18, -1, NativeLowPrecision, "Use native low precision") +SHADER_FEATURE_FLAG(18, -1, NativeLowPrecision, "Native low-precision data types") SHADER_FEATURE_FLAG(19, 24, ShadingRate, "Shading Rate") SHADER_FEATURE_FLAG(20, 25, Raytracing_Tier_1_1, "Raytracing tier 1.1 features") SHADER_FEATURE_FLAG(21, 26, SamplerFeedback, "Sampler feedback") @@ -115,9 +115,9 @@ DXIL_MODULE_FLAG( 0, DisableOptimizations, "Disable 
shader optimizations") DXIL_MODULE_FLAG( 1, DisableMathRefactoring, "Disable math refactoring") DXIL_MODULE_FLAG( 3, ForceEarlyDepthStencil, "Force early depth-stencil test") DXIL_MODULE_FLAG( 4, EnableRawAndStructuredBuffers, "Raw and structured buffers") -DXIL_MODULE_FLAG( 5, LowPrecisionPresent, "Low-precision data types") +DXIL_MODULE_FLAG( 5, LowPrecisionPresent, "Low-precision data types present") DXIL_MODULE_FLAG( 8, AllResourcesBound, "All resources bound for the duration of shader execution") -DXIL_MODULE_FLAG(23, UseNativeLowPrecision, "Use native low precision") +DXIL_MODULE_FLAG(23, NativeLowPrecisionMode, "Enable native low-precision data types") DXIL_MODULE_FLAG(33, ResMayNotAlias, "Any UAV may not alias any other UAV") #undef DXIL_MODULE_FLAG diff --git a/llvm/include/llvm/Bitstream/BitCodes.h b/llvm/include/llvm/Bitstream/BitCodes.h index 93888f7d3b335..205024f754dfb 100644 --- a/llvm/include/llvm/Bitstream/BitCodes.h +++ b/llvm/include/llvm/Bitstream/BitCodes.h @@ -20,6 +20,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Bitstream/BitCodeEnums.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" #include @@ -31,9 +32,6 @@ namespace llvm { /// 2. It could be an encoding specification ("this operand encoded like so"). /// class BitCodeAbbrevOp { - uint64_t Val; // A literal value or data for an encoding. - bool IsLiteral : 1; // Indicate whether this is a literal value or not. - unsigned Enc : 3; // The encoding to use. public: enum Encoding { Fixed = 1, // A fixed width field, Val specifies number of bits. @@ -43,6 +41,14 @@ class BitCodeAbbrevOp { Blob = 5 // 32-bit aligned array of 8-bit characters. }; +protected: + uint64_t Val; // A literal value or data for an encoding. + LLVM_PREFERRED_TYPE(bool) + uint64_t IsLiteral : 1; // Indicate whether this is a literal value or not. 
+ LLVM_PREFERRED_TYPE(Encoding) + uint64_t Enc : 3; // The encoding to use. + +public: static bool isValidEncoding(uint64_t E) { return E >= 1 && E <= 5; } diff --git a/llvm/include/llvm/CodeGen/GCMetadata.h b/llvm/include/llvm/CodeGen/GCMetadata.h index ca6a511185c7c..88e3377dd77b8 100644 --- a/llvm/include/llvm/CodeGen/GCMetadata.h +++ b/llvm/include/llvm/CodeGen/GCMetadata.h @@ -33,6 +33,7 @@ #define LLVM_CODEGEN_GCMETADATA_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -151,15 +152,49 @@ class GCFunctionInfo { size_t live_size(const iterator &p) const { return roots_size(); } }; -struct GCStrategyMap { - StringMap> StrategyMap; +class GCStrategyMap { + using MapT = + MapVector, StringMap>; + MapT Strategies; +public: GCStrategyMap() = default; GCStrategyMap(GCStrategyMap &&) = default; /// Handle invalidation explicitly. bool invalidate(Module &M, const PreservedAnalyses &PA, ModuleAnalysisManager::Invalidator &Inv); + + using iterator = MapT::iterator; + using const_iterator = MapT::const_iterator; + using reverse_iterator = MapT::reverse_iterator; + using const_reverse_iterator = MapT::const_reverse_iterator; + + iterator begin() { return Strategies.begin(); } + const_iterator begin() const { return Strategies.begin(); } + iterator end() { return Strategies.end(); } + const_iterator end() const { return Strategies.end(); } + + reverse_iterator rbegin() { return Strategies.rbegin(); } + const_reverse_iterator rbegin() const { return Strategies.rbegin(); } + reverse_iterator rend() { return Strategies.rend(); } + const_reverse_iterator rend() const { return Strategies.rend(); } + + bool empty() const { return Strategies.empty(); } + + const GCStrategy &operator[](StringRef GCName) const { + auto I = Strategies.find(GCName); + assert(I != Strategies.end() && "Required strategy doesn't exist!"); + return *I->second; + } + + std::pair 
try_emplace(StringRef GCName) { + return Strategies.try_emplace(GCName); + } + + bool contains(StringRef GCName) const { + return Strategies.find(GCName) != Strategies.end(); + } }; /// An analysis pass which caches information about the entire Module. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h index aa99bf321d2b1..d4b4a4e731da7 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h @@ -18,6 +18,7 @@ #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/Register.h" +#include "llvm/IR/PassManager.h" #include "llvm/InitializePasses.h" #include "llvm/Support/KnownBits.h" @@ -104,19 +105,42 @@ class GISelValueTracking : public GISelChangeObserver { /// Eventually add other features such as caching/ser/deserializing /// to MIR etc. Those implementations can derive from GISelValueTracking /// and override computeKnownBitsImpl. 
-class GISelValueTrackingAnalysis : public MachineFunctionPass { +class GISelValueTrackingAnalysisLegacy : public MachineFunctionPass { std::unique_ptr Info; public: static char ID; - GISelValueTrackingAnalysis() : MachineFunctionPass(ID) { - initializeGISelValueTrackingAnalysisPass(*PassRegistry::getPassRegistry()); + GISelValueTrackingAnalysisLegacy() : MachineFunctionPass(ID) { + initializeGISelValueTrackingAnalysisLegacyPass( + *PassRegistry::getPassRegistry()); } GISelValueTracking &get(MachineFunction &MF); void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnMachineFunction(MachineFunction &MF) override; void releaseMemory() override { Info.reset(); } }; + +class GISelValueTrackingAnalysis + : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + using Result = GISelValueTracking; + + Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); +}; + +class GISelValueTrackingPrinterPass + : public PassInfoMixin { + raw_ostream &OS; + +public: + GISelValueTrackingPrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; } // namespace llvm #endif // LLVM_CODEGEN_GLOBALISEL_GISELVALUETRACKING_H diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 80ef32aff62ae..9f66402e4c820 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -67,6 +67,15 @@ enum NodeType { /// poisoned the assertion will not be true for that value. AssertAlign, + /// AssertNoFPClass - These nodes record if a register contains a float + /// value that is known to be not some type. + /// This node takes two operands. The first is the node that is known + /// never to be some float types; the second is a constant value with + /// the value of FPClassTest (casted to uint32_t). 
+ /// NOTE: In case of the source value (or any vector element value) is + /// poisoned the assertion will not be true for that value. + AssertNoFPClass, + /// Various leaf nodes. BasicBlock, VALUETYPE, @@ -1524,6 +1533,15 @@ enum NodeType { // Operands: Mask VECTOR_FIND_LAST_ACTIVE, + // GET_ACTIVE_LANE_MASK - this corresponds to the llvm.get.active.lane.mask + // intrinsic. It creates a mask representing active and inactive vector + // lanes, active while Base + index < Trip Count. As with the intrinsic, + // the operands Base and Trip Count have the same scalar integer type and + // the internal addition of Base + index cannot overflow. However, the ISD + // node supports result types which are wider than i1, where the high + // bits conform to getBooleanContents similar to the SETCC operator. + GET_ACTIVE_LANE_MASK, + // llvm.clear_cache intrinsic // Operands: Input Chain, Start Addres, End Address // Outputs: Output Chain diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index 2b7fda1878e35..de88f330855bc 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -149,7 +149,7 @@ class MachineInstr /// Various bits of information used by the AsmPrinter to emit helpful /// comments. This is *not* semantic information. Do not use this for /// anything other than to convey comment information to AsmPrinter.
- uint8_t AsmPrinterFlags : LLVM_MI_ASMPRINTERFLAGS_BITS; + uint32_t AsmPrinterFlags : LLVM_MI_ASMPRINTERFLAGS_BITS; /// Internal implementation detail class that provides out-of-line storage for /// extra info used by the machine instruction when this info cannot be stored diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h index b4e704654495c..5a06ad02ca3a3 100644 --- a/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/llvm/include/llvm/CodeGen/MachineOperand.h @@ -279,9 +279,8 @@ class MachineOperand { static void printIRSlotNumber(raw_ostream &OS, int Slot); /// Print the MachineOperand to \p os. - /// Providing a valid \p TRI and \p IntrinsicInfo results in a more - /// target-specific printing. If \p TRI and \p IntrinsicInfo are null, the - /// function will try to pick it up from the parent. + /// Providing a valid \p TRI results in a more target-specific printing. If + /// \p TRI is null, the function will try to pick it up from the parent. void print(raw_ostream &os, const TargetRegisterInfo *TRI = nullptr) const; /// More complex way of printing a MachineOperand. @@ -304,14 +303,13 @@ class MachineOperand { /// \param TRI - provide more target-specific information to the printer. /// Unlike the previous function, this one will not try and get the /// information from it's parent. - /// \param IntrinsicInfo - same as \p TRI. void print(raw_ostream &os, ModuleSlotTracker &MST, LLT TypeToPrint, std::optional OpIdx, bool PrintDef, bool IsStandalone, bool ShouldPrintRegisterTies, unsigned TiedOperandIdx, const TargetRegisterInfo *TRI) const; - /// Same as print(os, TRI, IntrinsicInfo), but allows to specify the low-level - /// type to be printed the same way the full version of print(...) does it. + /// Same as print(os, TRI), but allows to specify the low-level type to be + /// printed the same way the full version of print(...) does it. 
void print(raw_ostream &os, LLT TypeToPrint, const TargetRegisterInfo *TRI = nullptr) const; diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index d214ab9306c2f..428fc35f8a400 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -14,9 +14,9 @@ #ifndef LLVM_CODEGEN_PASSES_H #define LLVM_CODEGEN_PASSES_H +#include "llvm/CodeGen/RegAllocCommon.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Discriminator.h" -#include "llvm/CodeGen/RegAllocCommon.h" #include #include @@ -36,587 +36,587 @@ namespace vfs { class FileSystem; } // namespace vfs -} // End llvm namespace +} // namespace llvm // List of target independent CodeGen pass IDs. namespace llvm { - /// AtomicExpandPass - At IR level this pass replace atomic instructions with - /// __atomic_* library calls, or target specific instruction which implement the - /// same semantics in a way which better fits the target backend. - FunctionPass *createAtomicExpandLegacyPass(); - - /// createUnreachableBlockEliminationPass - The LLVM code generator does not - /// work well with unreachable basic blocks (what live ranges make sense for a - /// block that cannot be reached?). As such, a code generator should either - /// not instruction select unreachable blocks, or run this pass as its - /// last LLVM modifying pass to clean up blocks that are not reachable from - /// the entry block. - FunctionPass *createUnreachableBlockEliminationPass(); - - /// createGCEmptyBasicblocksPass - Empty basic blocks (basic blocks without - /// real code) appear as the result of optimization passes removing - /// instructions. These blocks confuscate profile analysis (e.g., basic block - /// sections) since they will share the address of their fallthrough blocks. - /// This pass garbage-collects such basic blocks. 
- MachineFunctionPass *createGCEmptyBasicBlocksPass(); - - /// createBasicBlockSections Pass - This pass assigns sections to machine - /// basic blocks and is enabled with -fbasic-block-sections. - MachineFunctionPass *createBasicBlockSectionsPass(); - - MachineFunctionPass *createBasicBlockPathCloningPass(); +/// AtomicExpandPass - At IR level this pass replace atomic instructions with +/// __atomic_* library calls, or target specific instruction which implement the +/// same semantics in a way which better fits the target backend. +FunctionPass *createAtomicExpandLegacyPass(); + +/// createUnreachableBlockEliminationPass - The LLVM code generator does not +/// work well with unreachable basic blocks (what live ranges make sense for a +/// block that cannot be reached?). As such, a code generator should either +/// not instruction select unreachable blocks, or run this pass as its +/// last LLVM modifying pass to clean up blocks that are not reachable from +/// the entry block. +FunctionPass *createUnreachableBlockEliminationPass(); + +/// createGCEmptyBasicblocksPass - Empty basic blocks (basic blocks without +/// real code) appear as the result of optimization passes removing +/// instructions. These blocks confuscate profile analysis (e.g., basic block +/// sections) since they will share the address of their fallthrough blocks. +/// This pass garbage-collects such basic blocks. +MachineFunctionPass *createGCEmptyBasicBlocksPass(); + +/// createBasicBlockSections Pass - This pass assigns sections to machine +/// basic blocks and is enabled with -fbasic-block-sections. +MachineFunctionPass *createBasicBlockSectionsPass(); + +MachineFunctionPass *createBasicBlockPathCloningPass(); - /// createMachineFunctionSplitterPass - This pass splits machine functions - /// using profile information. - MachineFunctionPass *createMachineFunctionSplitterPass(); +/// createMachineFunctionSplitterPass - This pass splits machine functions +/// using profile information. 
+MachineFunctionPass *createMachineFunctionSplitterPass(); - /// createStaticDataSplitterPass - This is a machine-function pass that - /// categorizes static data hotness using profile information. - MachineFunctionPass *createStaticDataSplitterPass(); +/// createStaticDataSplitterPass - This is a machine-function pass that +/// categorizes static data hotness using profile information. +MachineFunctionPass *createStaticDataSplitterPass(); - /// createStaticDataAnnotatorPASS - This is a module pass that reads from - /// StaticDataProfileInfoWrapperPass and annotates the section prefix of - /// global variables. - ModulePass *createStaticDataAnnotatorPass(); - - /// MachineFunctionPrinter pass - This pass prints out the machine function to - /// the given stream as a debugging tool. - MachineFunctionPass * - createMachineFunctionPrinterPass(raw_ostream &OS, - const std::string &Banner =""); +/// createStaticDataAnnotatorPASS - This is a module pass that reads from +/// StaticDataProfileInfoWrapperPass and annotates the section prefix of +/// global variables. +ModulePass *createStaticDataAnnotatorPass(); + +/// MachineFunctionPrinter pass - This pass prints out the machine function to +/// the given stream as a debugging tool. +MachineFunctionPass * +createMachineFunctionPrinterPass(raw_ostream &OS, + const std::string &Banner = ""); - /// StackFramePrinter pass - This pass prints out the machine function's - /// stack frame to the given stream as a debugging tool. - MachineFunctionPass *createStackFrameLayoutAnalysisPass(); +/// StackFramePrinter pass - This pass prints out the machine function's +/// stack frame to the given stream as a debugging tool. +MachineFunctionPass *createStackFrameLayoutAnalysisPass(); - /// MIRPrinting pass - this pass prints out the LLVM IR into the given stream - /// using the MIR serialization format. 
- MachineFunctionPass *createPrintMIRPass(raw_ostream &OS); +/// MIRPrinting pass - this pass prints out the LLVM IR into the given stream +/// using the MIR serialization format. +MachineFunctionPass *createPrintMIRPass(raw_ostream &OS); - /// This pass resets a MachineFunction when it has the FailedISel property - /// as if it was just created. - /// If EmitFallbackDiag is true, the pass will emit a - /// DiagnosticInfoISelFallback for every MachineFunction it resets. - /// If AbortOnFailedISel is true, abort compilation instead of resetting. - MachineFunctionPass *createResetMachineFunctionPass(bool EmitFallbackDiag, - bool AbortOnFailedISel); +/// This pass resets a MachineFunction when it has the FailedISel property +/// as if it was just created. +/// If EmitFallbackDiag is true, the pass will emit a +/// DiagnosticInfoISelFallback for every MachineFunction it resets. +/// If AbortOnFailedISel is true, abort compilation instead of resetting. +MachineFunctionPass *createResetMachineFunctionPass(bool EmitFallbackDiag, + bool AbortOnFailedISel); - /// createCodeGenPrepareLegacyPass - Transform the code to expose more pattern - /// matching during instruction selection. - FunctionPass *createCodeGenPrepareLegacyPass(); +/// createCodeGenPrepareLegacyPass - Transform the code to expose more pattern +/// matching during instruction selection. +FunctionPass *createCodeGenPrepareLegacyPass(); - /// This pass implements generation of target-specific intrinsics to support - /// handling of complex number arithmetic - FunctionPass *createComplexDeinterleavingPass(const TargetMachine *TM); +/// This pass implements generation of target-specific intrinsics to support +/// handling of complex number arithmetic +FunctionPass *createComplexDeinterleavingPass(const TargetMachine *TM); - /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg - /// load-linked/store-conditional loops. 
- extern char &AtomicExpandID; +/// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg +/// load-linked/store-conditional loops. +extern char &AtomicExpandID; - /// MachineLoopInfo - This pass is a loop analysis pass. - extern char &MachineLoopInfoID; +/// MachineLoopInfo - This pass is a loop analysis pass. +extern char &MachineLoopInfoID; - /// MachineDominators - This pass is a machine dominators analysis pass. - extern char &MachineDominatorsID; - - /// MachineDominanaceFrontier - This pass is a machine dominators analysis. - extern char &MachineDominanceFrontierID; +/// MachineDominators - This pass is a machine dominators analysis pass. +extern char &MachineDominatorsID; + +/// MachineDominanaceFrontier - This pass is a machine dominators analysis. +extern char &MachineDominanceFrontierID; - /// MachineRegionInfo - This pass computes SESE regions for machine functions. - extern char &MachineRegionInfoPassID; - - /// EdgeBundles analysis - Bundle machine CFG edges. - extern char &EdgeBundlesWrapperLegacyID; - - /// LiveVariables pass - This pass computes the set of blocks in which each - /// variable is life and sets machine operand kill flags. - extern char &LiveVariablesID; - - /// PHIElimination - This pass eliminates machine instruction PHI nodes - /// by inserting copy instructions. This destroys SSA information, but is the - /// desired input for some register allocators. This pass is "required" by - /// these register allocator like this: AU.addRequiredID(PHIEliminationID); - extern char &PHIEliminationID; - - /// LiveIntervals - This analysis keeps track of the live ranges of virtual - /// and physical registers. - extern char &LiveIntervalsID; - - /// LiveStacks pass. An analysis keeping track of the liveness of stack slots. - extern char &LiveStacksID; - - /// TwoAddressInstruction - This pass reduces two-address instructions to - /// use two operands. This destroys SSA information but it is desired by - /// register allocators. 
- extern char &TwoAddressInstructionPassID; - - /// ProcessImpicitDefs pass - This pass removes IMPLICIT_DEFs. - extern char &ProcessImplicitDefsID; - - /// RegisterCoalescer - This pass merges live ranges to eliminate copies. - extern char &RegisterCoalescerID; - - /// MachineScheduler - This pass schedules machine instructions. - extern char &MachineSchedulerID; - - /// PostMachineScheduler - This pass schedules machine instructions postRA. - extern char &PostMachineSchedulerID; - - /// SpillPlacement analysis. Suggest optimal placement of spill code between - /// basic blocks. - extern char &SpillPlacementID; - - /// ShrinkWrap pass. Look for the best place to insert save and restore - // instruction and update the MachineFunctionInfo with that information. - extern char &ShrinkWrapID; - - /// LiveRangeShrink pass. Move instruction close to its definition to shrink - /// the definition's live range. - extern char &LiveRangeShrinkID; - - /// Greedy register allocator. - extern char &RAGreedyLegacyID; - - /// Basic register allocator. - extern char &RABasicID; - - /// VirtRegRewriter pass. Rewrite virtual registers to physical registers as - /// assigned in VirtRegMap. - extern char &VirtRegRewriterID; - FunctionPass *createVirtRegRewriter(bool ClearVirtRegs = true); +/// MachineRegionInfo - This pass computes SESE regions for machine functions. +extern char &MachineRegionInfoPassID; + +/// EdgeBundles analysis - Bundle machine CFG edges. +extern char &EdgeBundlesWrapperLegacyID; + +/// LiveVariables pass - This pass computes the set of blocks in which each +/// variable is life and sets machine operand kill flags. +extern char &LiveVariablesID; + +/// PHIElimination - This pass eliminates machine instruction PHI nodes +/// by inserting copy instructions. This destroys SSA information, but is the +/// desired input for some register allocators. 
This pass is "required" by +/// these register allocator like this: AU.addRequiredID(PHIEliminationID); +extern char &PHIEliminationID; + +/// LiveIntervals - This analysis keeps track of the live ranges of virtual +/// and physical registers. +extern char &LiveIntervalsID; + +/// LiveStacks pass. An analysis keeping track of the liveness of stack slots. +extern char &LiveStacksID; + +/// TwoAddressInstruction - This pass reduces two-address instructions to +/// use two operands. This destroys SSA information but it is desired by +/// register allocators. +extern char &TwoAddressInstructionPassID; + +/// ProcessImpicitDefs pass - This pass removes IMPLICIT_DEFs. +extern char &ProcessImplicitDefsID; + +/// RegisterCoalescer - This pass merges live ranges to eliminate copies. +extern char &RegisterCoalescerID; + +/// MachineScheduler - This pass schedules machine instructions. +extern char &MachineSchedulerID; + +/// PostMachineScheduler - This pass schedules machine instructions postRA. +extern char &PostMachineSchedulerID; + +/// SpillPlacement analysis. Suggest optimal placement of spill code between +/// basic blocks. +extern char &SpillPlacementID; + +/// ShrinkWrap pass. Look for the best place to insert save and restore +// instruction and update the MachineFunctionInfo with that information. +extern char &ShrinkWrapID; + +/// LiveRangeShrink pass. Move instruction close to its definition to shrink +/// the definition's live range. +extern char &LiveRangeShrinkID; + +/// Greedy register allocator. +extern char &RAGreedyLegacyID; + +/// Basic register allocator. +extern char &RABasicID; + +/// VirtRegRewriter pass. Rewrite virtual registers to physical registers as +/// assigned in VirtRegMap. +extern char &VirtRegRewriterID; +FunctionPass *createVirtRegRewriter(bool ClearVirtRegs = true); - /// UnreachableMachineBlockElimination - This pass removes unreachable - /// machine basic blocks. 
- extern char &UnreachableMachineBlockElimID; +/// UnreachableMachineBlockElimination - This pass removes unreachable +/// machine basic blocks. +extern char &UnreachableMachineBlockElimID; - /// DeadMachineInstructionElim - This pass removes dead machine instructions. - extern char &DeadMachineInstructionElimID; +/// DeadMachineInstructionElim - This pass removes dead machine instructions. +extern char &DeadMachineInstructionElimID; - /// This pass adds dead/undef flags after analyzing subregister lanes. - extern char &DetectDeadLanesID; +/// This pass adds dead/undef flags after analyzing subregister lanes. +extern char &DetectDeadLanesID; - /// This pass perform post-ra machine sink for COPY instructions. - extern char &PostRAMachineSinkingID; +/// This pass perform post-ra machine sink for COPY instructions. +extern char &PostRAMachineSinkingID; - /// This pass adds flow sensitive discriminators. - extern char &MIRAddFSDiscriminatorsID; +/// This pass adds flow sensitive discriminators. +extern char &MIRAddFSDiscriminatorsID; - /// This pass reads flow sensitive profile. - extern char &MIRProfileLoaderPassID; +/// This pass reads flow sensitive profile. +extern char &MIRProfileLoaderPassID; - // This pass gives undef values a Pseudo Instruction definition for - // Instructions to ensure early-clobber is followed when using the greedy - // register allocator. - extern char &InitUndefID; +// This pass gives undef values a Pseudo Instruction definition for +// Instructions to ensure early-clobber is followed when using the greedy +// register allocator. +extern char &InitUndefID; - /// FastRegisterAllocation Pass - This pass register allocates as fast as - /// possible. It is best suited for debug code where live ranges are short. - /// - FunctionPass *createFastRegisterAllocator(); - FunctionPass *createFastRegisterAllocator(RegAllocFilterFunc F, - bool ClearVirtRegs); +/// FastRegisterAllocation Pass - This pass register allocates as fast as +/// possible. 
It is best suited for debug code where live ranges are short. +/// +FunctionPass *createFastRegisterAllocator(); +FunctionPass *createFastRegisterAllocator(RegAllocFilterFunc F, + bool ClearVirtRegs); - /// BasicRegisterAllocation Pass - This pass implements a degenerate global - /// register allocator using the basic regalloc framework. - /// - FunctionPass *createBasicRegisterAllocator(); - FunctionPass *createBasicRegisterAllocator(RegAllocFilterFunc F); +/// BasicRegisterAllocation Pass - This pass implements a degenerate global +/// register allocator using the basic regalloc framework. +/// +FunctionPass *createBasicRegisterAllocator(); +FunctionPass *createBasicRegisterAllocator(RegAllocFilterFunc F); - /// Greedy register allocation pass - This pass implements a global register - /// allocator for optimized builds. - /// - FunctionPass *createGreedyRegisterAllocator(); - FunctionPass *createGreedyRegisterAllocator(RegAllocFilterFunc F); +/// Greedy register allocation pass - This pass implements a global register +/// allocator for optimized builds. +/// +FunctionPass *createGreedyRegisterAllocator(); +FunctionPass *createGreedyRegisterAllocator(RegAllocFilterFunc F); - /// PBQPRegisterAllocation Pass - This pass implements the Partitioned Boolean - /// Quadratic Prograaming (PBQP) based register allocator. - /// - FunctionPass *createDefaultPBQPRegisterAllocator(); - - /// PrologEpilogCodeInserter - This pass inserts prolog and epilog code, - /// and eliminates abstract frame references. - extern char &PrologEpilogCodeInserterID; - MachineFunctionPass *createPrologEpilogInserterPass(); +/// PBQPRegisterAllocation Pass - This pass implements the Partitioned Boolean +/// Quadratic Prograaming (PBQP) based register allocator. +/// +FunctionPass *createDefaultPBQPRegisterAllocator(); + +/// PrologEpilogCodeInserter - This pass inserts prolog and epilog code, +/// and eliminates abstract frame references. 
+extern char &PrologEpilogCodeInserterID; +MachineFunctionPass *createPrologEpilogInserterPass(); - /// ExpandPostRAPseudos - This pass expands pseudo instructions after - /// register allocation. - extern char &ExpandPostRAPseudosID; +/// ExpandPostRAPseudos - This pass expands pseudo instructions after +/// register allocation. +extern char &ExpandPostRAPseudosID; - /// PostRAHazardRecognizer - This pass runs the post-ra hazard - /// recognizer. - extern char &PostRAHazardRecognizerID; - - /// PostRAScheduler - This pass performs post register allocation - /// scheduling. - extern char &PostRASchedulerID; +/// PostRAHazardRecognizer - This pass runs the post-ra hazard +/// recognizer. +extern char &PostRAHazardRecognizerID; + +/// PostRAScheduler - This pass performs post register allocation +/// scheduling. +extern char &PostRASchedulerID; - /// BranchFolding - This pass performs machine code CFG based - /// optimizations to delete branches to branches, eliminate branches to - /// successor blocks (creating fall throughs), and eliminating branches over - /// branches. - extern char &BranchFolderPassID; +/// BranchFolding - This pass performs machine code CFG based +/// optimizations to delete branches to branches, eliminate branches to +/// successor blocks (creating fall throughs), and eliminating branches over +/// branches. +extern char &BranchFolderPassID; - /// BranchRelaxation - This pass replaces branches that need to jump further - /// than is supported by a branch instruction. - extern char &BranchRelaxationPassID; - - /// MachineFunctionPrinterPass - This pass prints out MachineInstr's. - extern char &MachineFunctionPrinterPassID; - - /// MIRPrintingPass - this pass prints out the LLVM IR using the MIR - /// serialization format. - extern char &MIRPrintingPassID; - - /// TailDuplicate - Duplicate blocks with unconditional branches - /// into tails of their predecessors. 
- extern char &TailDuplicateLegacyID; - - /// Duplicate blocks with unconditional branches into tails of their - /// predecessors. Variant that works before register allocation. - extern char &EarlyTailDuplicateLegacyID; - - /// MachineTraceMetrics - This pass computes critical path and CPU resource - /// usage in an ensemble of traces. - extern char &MachineTraceMetricsID; - - /// EarlyIfConverter - This pass performs if-conversion on SSA form by - /// inserting cmov instructions. - extern char &EarlyIfConverterLegacyID; +/// BranchRelaxation - This pass replaces branches that need to jump further +/// than is supported by a branch instruction. +extern char &BranchRelaxationPassID; + +/// MachineFunctionPrinterPass - This pass prints out MachineInstr's. +extern char &MachineFunctionPrinterPassID; + +/// MIRPrintingPass - this pass prints out the LLVM IR using the MIR +/// serialization format. +extern char &MIRPrintingPassID; + +/// TailDuplicate - Duplicate blocks with unconditional branches +/// into tails of their predecessors. +extern char &TailDuplicateLegacyID; + +/// Duplicate blocks with unconditional branches into tails of their +/// predecessors. Variant that works before register allocation. +extern char &EarlyTailDuplicateLegacyID; + +/// MachineTraceMetrics - This pass computes critical path and CPU resource +/// usage in an ensemble of traces. +extern char &MachineTraceMetricsID; + +/// EarlyIfConverter - This pass performs if-conversion on SSA form by +/// inserting cmov instructions. +extern char &EarlyIfConverterLegacyID; - /// EarlyIfPredicator - This pass performs if-conversion on SSA form by - /// predicating if/else block and insert select at the join point. - extern char &EarlyIfPredicatorID; +/// EarlyIfPredicator - This pass performs if-conversion on SSA form by +/// predicating if/else block and insert select at the join point. 
+extern char &EarlyIfPredicatorID; - /// This pass performs instruction combining using trace metrics to estimate - /// critical-path and resource depth. - extern char &MachineCombinerID; +/// This pass performs instruction combining using trace metrics to estimate +/// critical-path and resource depth. +extern char &MachineCombinerID; - /// StackSlotColoring - This pass performs stack coloring and merging. - /// It merges disjoint allocas to reduce the stack size. - extern char &StackColoringLegacyID; +/// StackSlotColoring - This pass performs stack coloring and merging. +/// It merges disjoint allocas to reduce the stack size. +extern char &StackColoringLegacyID; - /// StackFramePrinter - This pass prints the stack frame layout and variable - /// mappings. - extern char &StackFrameLayoutAnalysisPassID; - - /// IfConverter - This pass performs machine code if conversion. - extern char &IfConverterID; - - FunctionPass *createIfConverter( - std::function Ftor); - - /// MachineBlockPlacement - This pass places basic blocks based on branch - /// probabilities. - extern char &MachineBlockPlacementID; - - /// MachineBlockPlacementStats - This pass collects statistics about the - /// basic block placement using branch probabilities and block frequency - /// information. - extern char &MachineBlockPlacementStatsID; - - /// GCLowering Pass - Used by gc.root to perform its default lowering - /// operations. - FunctionPass *createGCLoweringPass(); - - /// GCLowering Pass - Used by gc.root to perform its default lowering - /// operations. - extern char &GCLoweringID; +/// StackFramePrinter - This pass prints the stack frame layout and variable +/// mappings. +extern char &StackFrameLayoutAnalysisPassID; + +/// IfConverter - This pass performs machine code if conversion. +extern char &IfConverterID; + +FunctionPass * +createIfConverter(std::function Ftor); + +/// MachineBlockPlacement - This pass places basic blocks based on branch +/// probabilities. 
+extern char &MachineBlockPlacementID; + +/// MachineBlockPlacementStats - This pass collects statistics about the +/// basic block placement using branch probabilities and block frequency +/// information. +extern char &MachineBlockPlacementStatsID; + +/// GCLowering Pass - Used by gc.root to perform its default lowering +/// operations. +FunctionPass *createGCLoweringPass(); + +/// GCLowering Pass - Used by gc.root to perform its default lowering +/// operations. +extern char &GCLoweringID; - /// ShadowStackGCLowering - Implements the custom lowering mechanism - /// used by the shadow stack GC. Only runs on functions which opt in to - /// the shadow stack collector. - FunctionPass *createShadowStackGCLoweringPass(); +/// ShadowStackGCLowering - Implements the custom lowering mechanism +/// used by the shadow stack GC. Only runs on functions which opt in to +/// the shadow stack collector. +FunctionPass *createShadowStackGCLoweringPass(); - /// ShadowStackGCLowering - Implements the custom lowering mechanism - /// used by the shadow stack GC. - extern char &ShadowStackGCLoweringID; +/// ShadowStackGCLowering - Implements the custom lowering mechanism +/// used by the shadow stack GC. +extern char &ShadowStackGCLoweringID; - /// GCMachineCodeAnalysis - Target-independent pass to mark safe points - /// in machine code. Must be added very late during code generation, just - /// prior to output, and importantly after all CFG transformations (such as - /// branch folding). - extern char &GCMachineCodeAnalysisID; +/// GCMachineCodeAnalysis - Target-independent pass to mark safe points +/// in machine code. Must be added very late during code generation, just +/// prior to output, and importantly after all CFG transformations (such as +/// branch folding). +extern char &GCMachineCodeAnalysisID; - /// MachineCSE - This pass performs global CSE on machine instructions. 
- extern char &MachineCSELegacyID; +/// MachineCSE - This pass performs global CSE on machine instructions. +extern char &MachineCSELegacyID; - /// MIRCanonicalizer - This pass canonicalizes MIR by renaming vregs - /// according to the semantics of the instruction as well as hoists - /// code. - extern char &MIRCanonicalizerID; +/// MIRCanonicalizer - This pass canonicalizes MIR by renaming vregs +/// according to the semantics of the instruction as well as hoists +/// code. +extern char &MIRCanonicalizerID; - /// ImplicitNullChecks - This pass folds null pointer checks into nearby - /// memory operations. - extern char &ImplicitNullChecksID; +/// ImplicitNullChecks - This pass folds null pointer checks into nearby +/// memory operations. +extern char &ImplicitNullChecksID; - /// This pass performs loop invariant code motion on machine instructions. - extern char &MachineLICMID; +/// This pass performs loop invariant code motion on machine instructions. +extern char &MachineLICMID; - /// This pass performs loop invariant code motion on machine instructions. - /// This variant works before register allocation. \see MachineLICMID. - extern char &EarlyMachineLICMID; +/// This pass performs loop invariant code motion on machine instructions. +/// This variant works before register allocation. \see MachineLICMID. +extern char &EarlyMachineLICMID; - /// MachineSinking - This pass performs sinking on machine instructions. - extern char &MachineSinkingLegacyID; +/// MachineSinking - This pass performs sinking on machine instructions. +extern char &MachineSinkingLegacyID; - /// MachineCopyPropagation - This pass performs copy propagation on - /// machine instructions. - extern char &MachineCopyPropagationID; +/// MachineCopyPropagation - This pass performs copy propagation on +/// machine instructions. 
+extern char &MachineCopyPropagationID; - MachineFunctionPass *createMachineCopyPropagationPass(bool UseCopyInstr); +MachineFunctionPass *createMachineCopyPropagationPass(bool UseCopyInstr); - /// MachineLateInstrsCleanup - This pass removes redundant identical - /// instructions after register allocation and rematerialization. - extern char &MachineLateInstrsCleanupID; +/// MachineLateInstrsCleanup - This pass removes redundant identical +/// instructions after register allocation and rematerialization. +extern char &MachineLateInstrsCleanupID; - /// PeepholeOptimizer - This pass performs peephole optimizations - - /// like extension and comparison eliminations. - extern char &PeepholeOptimizerLegacyID; +/// PeepholeOptimizer - This pass performs peephole optimizations - +/// like extension and comparison eliminations. +extern char &PeepholeOptimizerLegacyID; - /// OptimizePHIs - This pass optimizes machine instruction PHIs - /// to take advantage of opportunities created during DAG legalization. - extern char &OptimizePHIsLegacyID; +/// OptimizePHIs - This pass optimizes machine instruction PHIs +/// to take advantage of opportunities created during DAG legalization. +extern char &OptimizePHIsLegacyID; - /// StackSlotColoring - This pass performs stack slot coloring. - extern char &StackSlotColoringID; +/// StackSlotColoring - This pass performs stack slot coloring. +extern char &StackSlotColoringID; - /// This pass lays out funclets contiguously. - extern char &FuncletLayoutID; +/// This pass lays out funclets contiguously. +extern char &FuncletLayoutID; - /// This pass inserts the XRay instrumentation sleds if they are supported by - /// the target platform. - extern char &XRayInstrumentationID; +/// This pass inserts the XRay instrumentation sleds if they are supported by +/// the target platform. 
+extern char &XRayInstrumentationID; - /// This pass inserts FEntry calls - extern char &FEntryInserterID; - - /// This pass implements the "patchable-function" attribute. - extern char &PatchableFunctionID; - - /// createStackProtectorPass - This pass adds stack protectors to functions. - /// - FunctionPass *createStackProtectorPass(); - - /// createMachineVerifierPass - This pass verifies cenerated machine code - /// instructions for correctness. - /// - FunctionPass *createMachineVerifierPass(const std::string& Banner); - - /// createDwarfEHPass - This pass mulches exception handling code into a form - /// adapted to code generation. Required if using dwarf exception handling. - FunctionPass *createDwarfEHPass(CodeGenOptLevel OptLevel); - - /// createWinEHPass - Prepares personality functions used by MSVC on Windows, - /// in addition to the Itanium LSDA based personalities. - FunctionPass *createWinEHPass(bool DemoteCatchSwitchPHIOnly = false); - - /// createSjLjEHPreparePass - This pass adapts exception handling code to use - /// the GCC-style builtin setjmp/longjmp (sjlj) to handling EH control flow. - /// - FunctionPass *createSjLjEHPreparePass(const TargetMachine *TM); - - /// createWasmEHPass - This pass adapts exception handling code to use - /// WebAssembly's exception handling scheme. - FunctionPass *createWasmEHPass(); - - /// LocalStackSlotAllocation - This pass assigns local frame indices to stack - /// slots relative to one another and allocates base registers to access them - /// when it is estimated by the target to be out of range of normal frame - /// pointer or stack pointer index addressing. - extern char &LocalStackSlotAllocationID; +/// This pass inserts FEntry calls +extern char &FEntryInserterID; + +/// This pass implements the "patchable-function" attribute. +extern char &PatchableFunctionID; + +/// createStackProtectorPass - This pass adds stack protectors to functions. 
+/// +FunctionPass *createStackProtectorPass(); + +/// createMachineVerifierPass - This pass verifies generated machine code +/// instructions for correctness. +/// +FunctionPass *createMachineVerifierPass(const std::string &Banner); + +/// createDwarfEHPass - This pass mulches exception handling code into a form +/// adapted to code generation. Required if using dwarf exception handling. +FunctionPass *createDwarfEHPass(CodeGenOptLevel OptLevel); + +/// createWinEHPass - Prepares personality functions used by MSVC on Windows, +/// in addition to the Itanium LSDA based personalities. +FunctionPass *createWinEHPass(bool DemoteCatchSwitchPHIOnly = false); + +/// createSjLjEHPreparePass - This pass adapts exception handling code to use +/// the GCC-style builtin setjmp/longjmp (sjlj) to handling EH control flow. +/// +FunctionPass *createSjLjEHPreparePass(const TargetMachine *TM); + +/// createWasmEHPass - This pass adapts exception handling code to use +/// WebAssembly's exception handling scheme. +FunctionPass *createWasmEHPass(); + +/// LocalStackSlotAllocation - This pass assigns local frame indices to stack +/// slots relative to one another and allocates base registers to access them +/// when it is estimated by the target to be out of range of normal frame +/// pointer or stack pointer index addressing. +extern char &LocalStackSlotAllocationID; - /// This pass expands pseudo-instructions, reserves registers and adjusts - /// machine frame information. - extern char &FinalizeISelID; +/// This pass expands pseudo-instructions, reserves registers and adjusts +/// machine frame information. +extern char &FinalizeISelID; - /// UnpackMachineBundles - This pass unpack machine instruction bundles. +/// UnpackMachineBundles - This pass unpack machine instruction bundles. 
+extern char &UnpackMachineBundlesID; - FunctionPass * - createUnpackMachineBundles(std::function Ftor); +FunctionPass * +createUnpackMachineBundles(std::function Ftor); - /// FinalizeMachineBundles - This pass finalize machine instruction - /// bundles (created earlier, e.g. during pre-RA scheduling). - extern char &FinalizeMachineBundlesID; +/// FinalizeMachineBundles - This pass finalize machine instruction +/// bundles (created earlier, e.g. during pre-RA scheduling). +extern char &FinalizeMachineBundlesID; - /// StackMapLiveness - This pass analyses the register live-out set of - /// stackmap/patchpoint intrinsics and attaches the calculated information to - /// the intrinsic for later emission to the StackMap. - extern char &StackMapLivenessID; +/// StackMapLiveness - This pass analyses the register live-out set of +/// stackmap/patchpoint intrinsics and attaches the calculated information to +/// the intrinsic for later emission to the StackMap. +extern char &StackMapLivenessID; - // MachineSanitizerBinaryMetadata - appends/finalizes sanitizer binary - // metadata after llvm SanitizerBinaryMetadata pass. - extern char &MachineSanitizerBinaryMetadataID; +// MachineSanitizerBinaryMetadata - appends/finalizes sanitizer binary +// metadata after llvm SanitizerBinaryMetadata pass. +extern char &MachineSanitizerBinaryMetadataID; - /// RemoveLoadsIntoFakeUses pass. - extern char &RemoveLoadsIntoFakeUsesID; +/// RemoveLoadsIntoFakeUses pass. +extern char &RemoveLoadsIntoFakeUsesID; - /// RemoveRedundantDebugValues pass. - extern char &RemoveRedundantDebugValuesID; +/// RemoveRedundantDebugValues pass. +extern char &RemoveRedundantDebugValuesID; - /// MachineCFGPrinter pass. - extern char &MachineCFGPrinterID; +/// MachineCFGPrinter pass. 
+extern char &MachineCFGPrinterID; - /// LiveDebugValues pass - extern char &LiveDebugValuesID; +/// LiveDebugValues pass +extern char &LiveDebugValuesID; - /// InterleavedAccess Pass - This pass identifies and matches interleaved - /// memory accesses to target specific intrinsics. - /// - FunctionPass *createInterleavedAccessPass(); +/// InterleavedAccess Pass - This pass identifies and matches interleaved +/// memory accesses to target specific intrinsics. +/// +FunctionPass *createInterleavedAccessPass(); - /// InterleavedLoadCombines Pass - This pass identifies interleaved loads and - /// combines them into wide loads detectable by InterleavedAccessPass - /// - FunctionPass *createInterleavedLoadCombinePass(); +/// InterleavedLoadCombines Pass - This pass identifies interleaved loads and +/// combines them into wide loads detectable by InterleavedAccessPass +/// +FunctionPass *createInterleavedLoadCombinePass(); - /// LowerEmuTLS - This pass generates __emutls_[vt].xyz variables for all - /// TLS variables for the emulated TLS model. - /// - ModulePass *createLowerEmuTLSPass(); +/// LowerEmuTLS - This pass generates __emutls_[vt].xyz variables for all +/// TLS variables for the emulated TLS model. +/// +ModulePass *createLowerEmuTLSPass(); - /// This pass lowers the \@llvm.load.relative and \@llvm.objc.* intrinsics to - /// instructions. This is unsafe to do earlier because a pass may combine the - /// constant initializer into the load, which may result in an overflowing - /// evaluation. - ModulePass *createPreISelIntrinsicLoweringPass(); +/// This pass lowers the \@llvm.load.relative and \@llvm.objc.* intrinsics to +/// instructions. This is unsafe to do earlier because a pass may combine the +/// constant initializer into the load, which may result in an overflowing +/// evaluation. 
+ModulePass *createPreISelIntrinsicLoweringPass(); - /// GlobalMerge - This pass merges internal (by default) globals into structs - /// to enable reuse of a base pointer by indexed addressing modes. - /// It can also be configured to focus on size optimizations only. - /// - Pass *createGlobalMergePass(const TargetMachine *TM, unsigned MaximalOffset, - bool OnlyOptimizeForSize = false, - bool MergeExternalByDefault = false, - bool MergeConstantByDefault = false, - bool MergeConstAggressiveByDefault = false); +/// GlobalMerge - This pass merges internal (by default) globals into structs +/// to enable reuse of a base pointer by indexed addressing modes. +/// It can also be configured to focus on size optimizations only. +/// +Pass *createGlobalMergePass(const TargetMachine *TM, unsigned MaximalOffset, + bool OnlyOptimizeForSize = false, + bool MergeExternalByDefault = false, + bool MergeConstantByDefault = false, + bool MergeConstAggressiveByDefault = false); - /// This pass splits the stack into a safe stack and an unsafe stack to - /// protect against stack-based overflow vulnerabilities. - FunctionPass *createSafeStackPass(); +/// This pass splits the stack into a safe stack and an unsafe stack to +/// protect against stack-based overflow vulnerabilities. +FunctionPass *createSafeStackPass(); - /// This pass detects subregister lanes in a virtual register that are used - /// independently of other lanes and splits them into separate virtual - /// registers. - extern char &RenameIndependentSubregsID; +/// This pass detects subregister lanes in a virtual register that are used +/// independently of other lanes and splits them into separate virtual +/// registers. +extern char &RenameIndependentSubregsID; - /// This pass is executed POST-RA to collect which physical registers are - /// preserved by given machine function. 
- FunctionPass *createRegUsageInfoCollector(); +/// This pass is executed POST-RA to collect which physical registers are +/// preserved by given machine function. +FunctionPass *createRegUsageInfoCollector(); - /// Return a MachineFunction pass that identifies call sites - /// and propagates register usage information of callee to caller - /// if available with PysicalRegisterUsageInfo pass. - FunctionPass *createRegUsageInfoPropPass(); +/// Return a MachineFunction pass that identifies call sites +/// and propagates register usage information of callee to caller +/// if available with PhysicalRegisterUsageInfo pass. +FunctionPass *createRegUsageInfoPropPass(); - /// This pass performs software pipelining on machine instructions. - extern char &MachinePipelinerID; +/// This pass performs software pipelining on machine instructions. +extern char &MachinePipelinerID; - /// This pass frees the memory occupied by the MachineFunction. - FunctionPass *createFreeMachineFunctionPass(); +/// This pass frees the memory occupied by the MachineFunction. +FunctionPass *createFreeMachineFunctionPass(); - /// This pass performs merging similar functions globally. - ModulePass *createGlobalMergeFuncPass(); +/// This pass performs merging similar functions globally. +ModulePass *createGlobalMergeFuncPass(); - /// This pass performs outlining on machine instructions directly before - /// printing assembly. - ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions = true); +/// This pass performs outlining on machine instructions directly before +/// printing assembly. +ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions = true); - /// This pass expands the reduction intrinsics into sequences of shuffles. +/// This pass expands the reduction intrinsics into sequences of shuffles. 
+FunctionPass *createExpandReductionsPass(); - // This pass replaces intrinsics operating on vector operands with calls to - // the corresponding function in a vector library (e.g., SVML, libmvec). - FunctionPass *createReplaceWithVeclibLegacyPass(); +// This pass replaces intrinsics operating on vector operands with calls to +// the corresponding function in a vector library (e.g., SVML, libmvec). +FunctionPass *createReplaceWithVeclibLegacyPass(); - // Expands large div/rem instructions. - FunctionPass *createExpandLargeDivRemPass(); +// Expands large div/rem instructions. +FunctionPass *createExpandLargeDivRemPass(); - // Expands large div/rem instructions. - FunctionPass *createExpandFpPass(); +// Expands large fp instructions. +FunctionPass *createExpandFpPass(); - // This pass expands memcmp() to load/stores. - FunctionPass *createExpandMemCmpLegacyPass(); +// This pass expands memcmp() to load/stores. +FunctionPass *createExpandMemCmpLegacyPass(); - /// Creates Break False Dependencies pass. \see BreakFalseDeps.cpp - FunctionPass *createBreakFalseDeps(); +/// Creates Break False Dependencies pass. \see BreakFalseDeps.cpp +FunctionPass *createBreakFalseDeps(); - // This pass expands indirectbr instructions. - FunctionPass *createIndirectBrExpandPass(); +// This pass expands indirectbr instructions. +FunctionPass *createIndirectBrExpandPass(); - /// Creates CFI Fixup pass. \see CFIFixup.cpp - FunctionPass *createCFIFixup(); +/// Creates CFI Fixup pass. \see CFIFixup.cpp +FunctionPass *createCFIFixup(); - /// Creates CFI Instruction Inserter pass. \see CFIInstrInserter.cpp - FunctionPass *createCFIInstrInserter(); +/// Creates CFI Instruction Inserter pass. \see CFIInstrInserter.cpp +FunctionPass *createCFIInstrInserter(); - /// Creates CFGuard longjmp target identification pass. - /// \see CFGuardLongjmp.cpp - FunctionPass *createCFGuardLongjmpPass(); +/// Creates CFGuard longjmp target identification pass. 
+/// \see CFGuardLongjmp.cpp +FunctionPass *createCFGuardLongjmpPass(); - /// Creates Windows EH Continuation Guard target identification pass. - /// \see EHContGuardTargets.cpp - FunctionPass *createEHContGuardTargetsPass(); +/// Creates Windows EH Continuation Guard target identification pass. +/// \see EHContGuardTargets.cpp +FunctionPass *createEHContGuardTargetsPass(); - /// Create Hardware Loop pass. \see HardwareLoops.cpp - FunctionPass *createHardwareLoopsLegacyPass(); +/// Create Hardware Loop pass. \see HardwareLoops.cpp +FunctionPass *createHardwareLoopsLegacyPass(); - /// This pass inserts pseudo probe annotation for callsite profiling. - FunctionPass *createPseudoProbeInserter(); +/// This pass inserts pseudo probe annotation for callsite profiling. +FunctionPass *createPseudoProbeInserter(); - /// Create IR Type Promotion pass. \see TypePromotion.cpp - FunctionPass *createTypePromotionLegacyPass(); +/// Create IR Type Promotion pass. \see TypePromotion.cpp +FunctionPass *createTypePromotionLegacyPass(); - /// Add Flow Sensitive Discriminators. PassNum specifies the - /// sequence number of this pass (starting from 1). - FunctionPass * - createMIRAddFSDiscriminatorsPass(sampleprof::FSDiscriminatorPass P); +/// Add Flow Sensitive Discriminators. PassNum specifies the +/// sequence number of this pass (starting from 1). +FunctionPass * +createMIRAddFSDiscriminatorsPass(sampleprof::FSDiscriminatorPass P); - /// Read Flow Sensitive Profile. - FunctionPass * - createMIRProfileLoaderPass(std::string File, std::string RemappingFile, - sampleprof::FSDiscriminatorPass P, - IntrusiveRefCntPtr FS); +/// Read Flow Sensitive Profile. +FunctionPass * +createMIRProfileLoaderPass(std::string File, std::string RemappingFile, + sampleprof::FSDiscriminatorPass P, + IntrusiveRefCntPtr FS); - /// Creates MIR Debugify pass. \see MachineDebugify.cpp - ModulePass *createDebugifyMachineModulePass(); +/// Creates MIR Debugify pass. 
\see MachineDebugify.cpp +ModulePass *createDebugifyMachineModulePass(); - /// Creates MIR Strip Debug pass. \see MachineStripDebug.cpp - /// If OnlyDebugified is true then it will only strip debug info if it was - /// added by a Debugify pass. The module will be left unchanged if the debug - /// info was generated by another source such as clang. - ModulePass *createStripDebugMachineModulePass(bool OnlyDebugified); +/// Creates MIR Strip Debug pass. \see MachineStripDebug.cpp +/// If OnlyDebugified is true then it will only strip debug info if it was +/// added by a Debugify pass. The module will be left unchanged if the debug +/// info was generated by another source such as clang. +ModulePass *createStripDebugMachineModulePass(bool OnlyDebugified); - /// Creates MIR Check Debug pass. \see MachineCheckDebugify.cpp - ModulePass *createCheckDebugMachineModulePass(); +/// Creates MIR Check Debug pass. \see MachineCheckDebugify.cpp +ModulePass *createCheckDebugMachineModulePass(); - /// The pass fixups statepoint machine instruction to replace usage of - /// caller saved registers with stack slots. - extern char &FixupStatepointCallerSavedID; +/// The pass fixups statepoint machine instruction to replace usage of +/// caller saved registers with stack slots. +extern char &FixupStatepointCallerSavedID; - /// The pass transforms load/store <256 x i32> to AMX load/store intrinsics - /// or split the data to two <128 x i32>. - FunctionPass *createX86LowerAMXTypePass(); +/// The pass transforms load/store <256 x i32> to AMX load/store intrinsics +/// or split the data to two <128 x i32>. +FunctionPass *createX86LowerAMXTypePass(); - /// The pass transforms amx intrinsics to scalar operation if the function has - /// optnone attribute or it is O0. - FunctionPass *createX86LowerAMXIntrinsicsPass(); +/// The pass transforms amx intrinsics to scalar operation if the function has +/// optnone attribute or it is O0. 
+FunctionPass *createX86LowerAMXIntrinsicsPass(); - /// When learning an eviction policy, extract score(reward) information, - /// otherwise this does nothing - FunctionPass *createRegAllocScoringPass(); +/// When learning an eviction policy, extract score(reward) information, +/// otherwise this does nothing +FunctionPass *createRegAllocScoringPass(); - /// JMC instrument pass. - ModulePass *createJMCInstrumenterPass(); +/// JMC instrument pass. +ModulePass *createJMCInstrumenterPass(); - /// This pass converts conditional moves to conditional jumps when profitable. - FunctionPass *createSelectOptimizePass(); +/// This pass converts conditional moves to conditional jumps when profitable. +FunctionPass *createSelectOptimizePass(); - FunctionPass *createCallBrPass(); +FunctionPass *createCallBrPass(); - /// Lowers KCFI operand bundles for indirect calls. - FunctionPass *createKCFIPass(); -} // End llvm namespace +/// Lowers KCFI operand bundles for indirect calls. +FunctionPass *createKCFIPass(); +} // namespace llvm #endif diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index cfefceea8f0fe..1a373f4c10e78 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1386,7 +1386,7 @@ class MemSDNode : public SDNode { bool writeMem() const { return MMO->isStore(); } /// Returns alignment and volatility of the memory access - Align getOriginalAlign() const { return MMO->getBaseAlign(); } + Align getBaseAlign() const { return MMO->getBaseAlign(); } Align getAlign() const { return MMO->getAlign(); } /// Return the SubclassData value, without HasDebugValue. 
This contains an diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h index 6df3f5066e327..b39fb6852d700 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -103,6 +103,7 @@ class DWARFContext : public DIContext { std::unique_ptr parseMacroOrMacinfo(MacroSecType SectionType); + virtual Error doWorkThreadSafely(function_ref Work) = 0; }; friend class DWARFContextState; @@ -491,6 +492,10 @@ class DWARFContext : public DIContext { /// manually only for DWARF5. void setParseCUTUIndexManually(bool PCUTU) { ParseCUTUIndexManually = PCUTU; } + Error doWorkThreadSafely(function_ref Work) { + return State->doWorkThreadSafely(Work); + } + private: void addLocalsForDie(DWARFCompileUnit *CU, DWARFDie Subprogram, DWARFDie Die, std::vector &Result); diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h index 80c27aea89312..0f7958f28065d 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -566,6 +566,9 @@ class DWARFUnit { Error tryExtractDIEsIfNeeded(bool CUDieOnly); + /// clearDIEs - Clear parsed DIEs to keep memory usage low. + void clearDIEs(bool KeepCUDie, bool KeepDWODies = false); + private: /// Size in bytes of the .debug_info data associated with this compile unit. size_t getDebugInfoSize() const { @@ -581,9 +584,6 @@ class DWARFUnit { void extractDIEsToVector(bool AppendCUDie, bool AppendNonCUDIEs, std::vector &DIEs) const; - /// clearDIEs - Clear parsed DIEs to keep memory usage low. - void clearDIEs(bool KeepCUDie); - /// parseDWO - Parses .dwo file for current compile unit. Returns true if /// it was actually constructed. 
/// The \p AlternativeLocation specifies an alternative location to get diff --git a/llvm/include/llvm/DebugInfo/GSYM/GsymDIContext.h b/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h similarity index 81% rename from llvm/include/llvm/DebugInfo/GSYM/GsymDIContext.h rename to llvm/include/llvm/DebugInfo/GSYM/GsymContext.h index 396c08c608d25..9c04ff63c8059 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/GsymDIContext.h +++ b/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h @@ -1,4 +1,4 @@ -//===-- GsymDIContext.h --------------------------------------------------===// +//===-- GsymContext.h --------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===/ -#ifndef LLVM_DEBUGINFO_GSYM_GSYMDICONTEXT_H -#define LLVM_DEBUGINFO_GSYM_GSYMDICONTEXT_H +#ifndef LLVM_DEBUGINFO_GSYM_GSYMCONTEXT_H +#define LLVM_DEBUGINFO_GSYM_GSYMCONTEXT_H #include "llvm/DebugInfo/DIContext.h" #include @@ -27,12 +27,12 @@ class GsymReader; /// interface to different symbolication formats (e.g. GSYM, PDB and DWARF). /// More control and power over the debug information access can be had by using /// the GSYM interfaces directly. 
-class GsymDIContext : public DIContext { +class GsymContext : public DIContext { public: - GsymDIContext(std::unique_ptr Reader); + GsymContext(std::unique_ptr Reader); - GsymDIContext(GsymDIContext &) = delete; - GsymDIContext &operator=(GsymDIContext &) = delete; + GsymContext(GsymContext &) = delete; + GsymContext &operator=(GsymContext &) = delete; static bool classof(const DIContext *DICtx) { return DICtx->getKind() == CK_GSYM; @@ -63,4 +63,4 @@ class GsymDIContext : public DIContext { } // end namespace llvm -#endif // LLVM_DEBUGINFO_PDB_PDBCONTEXT_H +#endif // LLVM_DEBUGINFO_GSYM_GSYMCONTEXT_H diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h index 295c12ab24916..f4569850b093c 100644 --- a/llvm/include/llvm/Demangle/ItaniumDemangle.h +++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h @@ -21,6 +21,7 @@ #include "Utility.h" #include #include +#include #include #include #include @@ -164,18 +165,18 @@ class NodeArray; // traversed by the printLeft/Right functions to produce a demangled string. class Node { public: - enum Kind : unsigned char { + enum Kind : uint8_t { #define NODE(NodeKind) K##NodeKind, #include "ItaniumNodes.def" }; /// Three-way bool to track a cached value. Unknown is possible if this node /// has an unexpanded parameter pack below it that may affect this cache. - enum class Cache : unsigned char { Yes, No, Unknown, }; + enum class Cache : uint8_t { Yes, No, Unknown, }; /// Operator precedence for expression nodes. Used to determine required /// parens in expression emission. 
- enum class Prec { + enum class Prec : uint8_t { Primary, Postfix, Unary, diff --git a/llvm/include/llvm/Frontend/Directive/DirectiveBase.td b/llvm/include/llvm/Frontend/Directive/DirectiveBase.td index 4faea18324cb7..3e2744dea8d14 100644 --- a/llvm/include/llvm/Frontend/Directive/DirectiveBase.td +++ b/llvm/include/llvm/Frontend/Directive/DirectiveBase.td @@ -172,6 +172,15 @@ def CA_Meta: Category<"Meta"> {} def CA_Subsidiary: Category<"Subsidiary"> {} def CA_Utility: Category<"Utility"> {} +class SourceLanguage { + string name = n; // Name of the enum value in enum class Association. +} + +// The C languages also implies C++ until there is a reason to add C++ +// separately. +def L_C : SourceLanguage<"C"> {} +def L_Fortran : SourceLanguage<"Fortran"> {} + // Information about a specific directive. class Directive { // Name of the directive. Can be composite directive sepearted by whitespace. @@ -205,4 +214,7 @@ class Directive { // The category of the directive. Category category = ?; + + // The languages that allow this directive. Default: all languages. + list languages = [L_C, L_Fortran]; } diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h index 37f3d9ad61d3e..9fdb40db9c23d 100644 --- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h +++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h @@ -20,6 +20,10 @@ #include namespace llvm { +class LLVMContext; +class MDNode; +class Metadata; + namespace hlsl { namespace rootsig { @@ -84,7 +88,9 @@ struct RootConstants { // Models the end of a descriptor table and stores its visibility struct DescriptorTable { ShaderVisibility Visibility = ShaderVisibility::All; - uint32_t NumClauses = 0; // The number of clauses in the table + // Denotes that the previous NumClauses in the RootElement array + // are the clauses in the table. 
+ uint32_t NumClauses = 0; void dump(raw_ostream &OS) const; }; @@ -119,12 +125,47 @@ struct DescriptorTableClause { void dump(raw_ostream &OS) const; }; -// Models RootElement : RootConstants | DescriptorTable | DescriptorTableClause +/// Models RootElement : RootFlags | RootConstants | DescriptorTable +/// | DescriptorTableClause +/// +/// A Root Signature is modeled in-memory by an array of RootElements. These +/// aim to map closely to their DSL grammar reprsentation defined in the spec. +/// +/// Each optional parameter has its default value defined in the struct, and, +/// each mandatory parameter does not have a default initialization. +/// +/// For the variants RootFlags, RootConstants and DescriptorTableClause: each +/// data member maps directly to a parameter in the grammar. +/// +/// The DescriptorTable is modelled by having its Clauses as the previous +/// RootElements in the array, and it holds a data member for the Visibility +/// parameter. using RootElement = std::variant; void dumpRootElements(raw_ostream &OS, ArrayRef Elements); +class MetadataBuilder { +public: + MetadataBuilder(llvm::LLVMContext &Ctx, ArrayRef Elements) + : Ctx(Ctx), Elements(Elements) {} + + /// Iterates through the elements and dispatches onto the correct Build method + /// + /// Accumulates the root signature and returns the Metadata node that is just + /// a list of all the elements + MDNode *BuildRootSignature(); + +private: + /// Define the various builders for the different metadata types + MDNode *BuildDescriptorTable(const DescriptorTable &Table); + MDNode *BuildDescriptorTableClause(const DescriptorTableClause &Clause); + + llvm::LLVMContext &Ctx; + ArrayRef Elements; + SmallVector GeneratedMetadata; +}; + } // namespace rootsig } // namespace hlsl } // namespace llvm diff --git a/llvm/include/llvm/Frontend/OpenACC/ACC.td b/llvm/include/llvm/Frontend/OpenACC/ACC.td index d372fc221e4b4..46cba9f2400e1 100644 --- a/llvm/include/llvm/Frontend/OpenACC/ACC.td +++ 
b/llvm/include/llvm/Frontend/OpenACC/ACC.td @@ -556,35 +556,31 @@ def ACC_HostData : Directive<"host_data"> { // 2.11 def ACC_KernelsLoop : Directive<"kernels loop"> { - let allowedClauses = [ - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause - ]; - let allowedOnceClauses = [ - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause - ]; + let allowedClauses = [VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause]; + let allowedOnceClauses = [VersionedClause, + VersionedClause, + VersionedClause]; let allowedExclusiveClauses = [ VersionedClause, VersionedClause, @@ -596,36 +592,32 @@ def ACC_KernelsLoop : Directive<"kernels loop"> { // 2.11 def ACC_ParallelLoop : Directive<"parallel loop"> { - let allowedClauses = [ - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause - ]; - let allowedOnceClauses = [ - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause - ]; + let 
allowedClauses = [VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause]; + let allowedOnceClauses = [VersionedClause, + VersionedClause, + VersionedClause]; let allowedExclusiveClauses = [ VersionedClause, VersionedClause, @@ -637,33 +629,29 @@ def ACC_ParallelLoop : Directive<"parallel loop"> { // 2.11 def ACC_SerialLoop : Directive<"serial loop"> { - let allowedClauses = [ - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause - ]; - let allowedOnceClauses = [ - VersionedClause, - VersionedClause, - VersionedClause, - VersionedClause - ]; + let allowedClauses = [VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause, + VersionedClause]; + let allowedOnceClauses = [VersionedClause, + VersionedClause, + VersionedClause]; let allowedExclusiveClauses = [ VersionedClause, VersionedClause, diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index 194b1e657c493..0af4b436649a3 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ 
b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -573,6 +573,7 @@ def OMP_Allocators : Directive<"allocators"> { ]; let association = AS_Block; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_Assumes : Directive<"assumes"> { let association = AS_None; @@ -586,10 +587,6 @@ def OMP_Assumes : Directive<"assumes"> { VersionedClause, ]; } -def OMP_EndAssumes : Directive<"end assumes"> { - let association = AS_Delimited; - let category = OMP_Assumes.category; -} def OMP_Assume : Directive<"assume"> { let association = AS_Block; let category = CA_Informational; @@ -637,6 +634,12 @@ def OMP_BeginAssumes : Directive<"begin assumes"> { VersionedClause, VersionedClause, ]; + let languages = [L_C]; +} +def OMP_EndAssumes : Directive<"end assumes"> { + let association = AS_Delimited; + let category = OMP_BeginAssumes.category; + let languages = OMP_BeginAssumes.languages; } def OMP_BeginDeclareTarget : Directive<"begin declare target"> { let allowedClauses = [ @@ -647,10 +650,22 @@ def OMP_BeginDeclareTarget : Directive<"begin declare target"> { ]; let association = AS_Delimited; let category = CA_Declarative; + let languages = [L_C]; +} +def OMP_EndDeclareTarget : Directive<"end declare target"> { + let association = AS_Delimited; + let category = OMP_BeginDeclareTarget.category; + let languages = OMP_BeginDeclareTarget.languages; } def OMP_BeginDeclareVariant : Directive<"begin declare variant"> { let association = AS_Delimited; let category = CA_Declarative; + let languages = [L_C]; +} +def OMP_EndDeclareVariant : Directive<"end declare variant"> { + let association = AS_Delimited; + let category = OMP_BeginDeclareVariant.category; + let languages = OMP_BeginDeclareVariant.languages; } def OMP_Cancel : Directive<"cancel"> { let allowedOnceClauses = [ @@ -717,10 +732,6 @@ def OMP_DeclareTarget : Directive<"declare target"> { let association = AS_None; let category = CA_Declarative; } -def OMP_EndDeclareTarget : Directive<"end declare target"> { - let 
association = AS_Delimited; - let category = OMP_DeclareTarget.category; -} def OMP_DeclareVariant : Directive<"declare variant"> { let allowedClauses = [ VersionedClause, @@ -731,10 +742,7 @@ def OMP_DeclareVariant : Directive<"declare variant"> { ]; let association = AS_Declaration; let category = CA_Declarative; -} -def OMP_EndDeclareVariant : Directive<"end declare variant"> { - let association = AS_Delimited; - let category = OMP_DeclareVariant.category; + let languages = [L_C]; } def OMP_Depobj : Directive<"depobj"> { let allowedClauses = [ @@ -793,15 +801,16 @@ def OMP_Do : Directive<"do"> { ]; let association = AS_Loop; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_EndDo : Directive<"end do"> { let allowedOnceClauses = [ VersionedClause, ]; - // Needed for association computation, since OMP_Do has it "from leafConstructs". let leafConstructs = OMP_Do.leafConstructs; let association = OMP_Do.association; let category = OMP_Do.category; + let languages = OMP_Do.languages; } def OMP_Error : Directive<"error"> { let allowedClauses = [ @@ -841,6 +850,7 @@ def OMP_For : Directive<"for"> { ]; let association = AS_Loop; let category = CA_Executable; + let languages = [L_C]; } def OMP_Interchange : Directive<"interchange"> { let allowedOnceClauses = [ @@ -984,6 +994,7 @@ def OMP_EndScope : Directive<"end scope"> { let leafConstructs = OMP_Scope.leafConstructs; let association = OMP_Scope.association; let category = OMP_Scope.category; + let languages = [L_Fortran]; } def OMP_Section : Directive<"section"> { let association = AS_Separating; @@ -1008,6 +1019,7 @@ def OMP_EndSections : Directive<"end sections"> { let leafConstructs = OMP_Sections.leafConstructs; let association = OMP_Sections.association; let category = OMP_Sections.category; + let languages = [L_Fortran]; } def OMP_Simd : Directive<"simd"> { let allowedClauses = [ @@ -1052,6 +1064,7 @@ def OMP_EndSingle : Directive<"end single"> { let leafConstructs = 
OMP_Single.leafConstructs; let association = OMP_Single.association; let category = OMP_Single.category; + let languages = [L_Fortran]; } def OMP_Target : Directive<"target"> { let allowedClauses = [ @@ -1259,6 +1272,7 @@ def OMP_Workshare : Directive<"workshare"> { ]; let association = AS_Block; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_EndWorkshare : Directive<"end workshare"> { let allowedOnceClauses = [ @@ -1267,6 +1281,7 @@ def OMP_EndWorkshare : Directive<"end workshare"> { let leafConstructs = OMP_Workshare.leafConstructs; let association = OMP_Workshare.association; let category = OMP_Workshare.category; + let languages = [L_Fortran]; } //===----------------------------------------------------------------------===// @@ -1298,6 +1313,7 @@ def OMP_DistributeParallelDo : Directive<"distribute parallel do"> { ]; let leafConstructs = [OMP_Distribute, OMP_Parallel, OMP_Do]; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_DistributeParallelDoSimd : Directive<"distribute parallel do simd"> { let allowedClauses = [ @@ -1324,6 +1340,7 @@ def OMP_DistributeParallelDoSimd : Directive<"distribute parallel do simd"> { ]; let leafConstructs = [OMP_Distribute, OMP_Parallel, OMP_Do, OMP_Simd]; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_DistributeParallelFor : Directive<"distribute parallel for"> { let allowedClauses = [ @@ -1346,6 +1363,7 @@ def OMP_DistributeParallelFor : Directive<"distribute parallel for"> { ]; let leafConstructs = [OMP_Distribute, OMP_Parallel, OMP_For]; let category = CA_Executable; + let languages = [L_C]; } def OMP_DistributeParallelForSimd : Directive<"distribute parallel for simd"> { let allowedClauses = [ @@ -1373,6 +1391,7 @@ def OMP_DistributeParallelForSimd : Directive<"distribute parallel for simd"> { ]; let leafConstructs = [OMP_Distribute, OMP_Parallel, OMP_For, OMP_Simd]; let category = CA_Executable; + let languages = [L_C]; } def OMP_DistributeSimd : 
Directive<"distribute simd"> { let allowedClauses = [ @@ -1422,6 +1441,7 @@ def OMP_DoSimd : Directive<"do simd"> { ]; let leafConstructs = [OMP_Do, OMP_Simd]; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_EndDoSimd : Directive<"end do simd"> { let allowedOnceClauses = [ @@ -1430,6 +1450,7 @@ def OMP_EndDoSimd : Directive<"end do simd"> { let leafConstructs = OMP_DoSimd.leafConstructs; let association = OMP_DoSimd.association; let category = OMP_DoSimd.category; + let languages = [L_Fortran]; } def OMP_ForSimd : Directive<"for simd"> { let allowedClauses = [ @@ -1611,6 +1632,7 @@ def OMP_ParallelDo : Directive<"parallel do"> { ]; let leafConstructs = [OMP_Parallel, OMP_Do]; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_ParallelDoSimd : Directive<"parallel do simd"> { let allowedClauses = [ @@ -1639,6 +1661,7 @@ def OMP_ParallelDoSimd : Directive<"parallel do simd"> { ]; let leafConstructs = [OMP_Parallel, OMP_Do, OMP_Simd]; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_ParallelFor : Directive<"parallel for"> { let allowedClauses = [ @@ -1662,6 +1685,7 @@ def OMP_ParallelFor : Directive<"parallel for"> { ]; let leafConstructs = [OMP_Parallel, OMP_For]; let category = CA_Executable; + let languages = [L_C]; } def OMP_ParallelForSimd : Directive<"parallel for simd"> { let allowedClauses = [ @@ -1689,6 +1713,7 @@ def OMP_ParallelForSimd : Directive<"parallel for simd"> { ]; let leafConstructs = [OMP_Parallel, OMP_For, OMP_Simd]; let category = CA_Executable; + let languages = [L_C]; } def OMP_parallel_loop : Directive<"parallel loop"> { let allowedClauses = [ @@ -1907,6 +1932,7 @@ def OMP_ParallelWorkshare : Directive<"parallel workshare"> { ]; let leafConstructs = [OMP_Parallel, OMP_Workshare]; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_TargetParallel : Directive<"target parallel"> { let allowedClauses = [ @@ -1966,6 +1992,7 @@ def OMP_TargetParallelDo : 
Directive<"target parallel do"> { ]; let leafConstructs = [OMP_Target, OMP_Parallel, OMP_Do]; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_TargetParallelDoSimd : Directive<"target parallel do simd"> { let allowedClauses = [ @@ -1999,6 +2026,7 @@ def OMP_TargetParallelDoSimd : Directive<"target parallel do simd"> { ]; let leafConstructs = [OMP_Target, OMP_Parallel, OMP_Do, OMP_Simd]; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_TargetParallelFor : Directive<"target parallel for"> { let allowedClauses = [ @@ -2033,6 +2061,7 @@ def OMP_TargetParallelFor : Directive<"target parallel for"> { ]; let leafConstructs = [OMP_Target, OMP_Parallel, OMP_For]; let category = CA_Executable; + let languages = [L_C]; } def OMP_TargetParallelForSimd : Directive<"target parallel for simd"> { let allowedClauses = [ @@ -2071,6 +2100,7 @@ def OMP_TargetParallelForSimd : Directive<"target parallel for simd"> { ]; let leafConstructs = [OMP_Target, OMP_Parallel, OMP_For, OMP_Simd]; let category = CA_Executable; + let languages = [L_C]; } def OMP_target_parallel_loop : Directive<"target parallel loop"> { let allowedClauses = [ @@ -2230,8 +2260,10 @@ def OMP_TargetTeamsDistributeParallelDo : VersionedClause, VersionedClause, ]; - let leafConstructs = [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_Do]; + let leafConstructs = + [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_Do]; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_TargetTeamsDistributeParallelDoSimd : Directive<"target teams distribute parallel do simd"> { @@ -2268,8 +2300,10 @@ def OMP_TargetTeamsDistributeParallelDoSimd : VersionedClause, VersionedClause, ]; - let leafConstructs = [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_Do, OMP_Simd]; + let leafConstructs = + [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_Do, OMP_Simd]; let category = CA_Executable; + let languages = [L_Fortran]; } def 
OMP_TargetTeamsDistributeParallelFor : Directive<"target teams distribute parallel for"> { @@ -2303,8 +2337,10 @@ def OMP_TargetTeamsDistributeParallelFor : let allowedOnceClauses = [ VersionedClause, ]; - let leafConstructs = [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For]; + let leafConstructs = + [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For]; let category = CA_Executable; + let languages = [L_C]; } def OMP_TargetTeamsDistributeParallelForSimd : Directive<"target teams distribute parallel for simd"> { @@ -2343,8 +2379,10 @@ def OMP_TargetTeamsDistributeParallelForSimd : let allowedOnceClauses = [ VersionedClause, ]; - let leafConstructs = [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For, OMP_Simd]; + let leafConstructs = + [OMP_Target, OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For, OMP_Simd]; let category = CA_Executable; + let languages = [L_C]; } def OMP_TargetTeamsDistributeSimd : Directive<"target teams distribute simd"> { @@ -2494,6 +2532,7 @@ def OMP_TeamsDistributeParallelDo : ]; let leafConstructs = [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_Do]; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_TeamsDistributeParallelDoSimd : Directive<"teams distribute parallel do simd"> { @@ -2522,8 +2561,10 @@ def OMP_TeamsDistributeParallelDoSimd : VersionedClause, VersionedClause, ]; - let leafConstructs = [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_Do, OMP_Simd]; + let leafConstructs = + [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_Do, OMP_Simd]; let category = CA_Executable; + let languages = [L_Fortran]; } def OMP_TeamsDistributeParallelFor : Directive<"teams distribute parallel for"> { @@ -2549,6 +2590,7 @@ def OMP_TeamsDistributeParallelFor : ]; let leafConstructs = [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For]; let category = CA_Executable; + let languages = [L_C]; } def OMP_TeamsDistributeParallelForSimd : Directive<"teams distribute parallel for simd"> { @@ -2576,8 +2618,10 @@ 
def OMP_TeamsDistributeParallelForSimd : VersionedClause, VersionedClause, ]; - let leafConstructs = [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For, OMP_Simd]; + let leafConstructs = + [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For, OMP_Simd]; let category = CA_Executable; + let languages = [L_C]; } def OMP_TeamsDistributeSimd : Directive<"teams distribute simd"> { let allowedClauses = [ diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h index d83fe1299237b..e399a2f29e545 100644 --- a/llvm/include/llvm/IR/DataLayout.h +++ b/llvm/include/llvm/IR/DataLayout.h @@ -92,6 +92,7 @@ class DataLayout { /// The function pointer alignment is a multiple of the function alignment. MultipleOfFunctionAlign, }; + private: bool BigEndian = false; @@ -324,16 +325,38 @@ class DataLayout { /// the backends/clients are updated. Align getPointerPrefAlignment(unsigned AS = 0) const; - /// Layout pointer size in bytes, rounded up to a whole - /// number of bytes. + /// The pointer representation size in bytes, rounded up to a whole number of + /// bytes. The difference between this function and getAddressSize() is that + /// this one returns the size of the entire pointer representation (including + /// metadata bits for fat pointers) and the latter only returns the number of + /// address bits. + /// \sa DataLayout::getAddressSizeInBits /// FIXME: The defaults need to be removed once all of /// the backends/clients are updated. unsigned getPointerSize(unsigned AS = 0) const; - // Index size in bytes used for address calculation, - /// rounded up to a whole number of bytes. + /// The index size in bytes used for address calculation, rounded up to a + /// whole number of bytes. This not only defines the size used in + /// getelementptr operations, but also the size of addresses in this \p AS. 
+ /// For example, a 64-bit CHERI-enabled target has 128-bit pointers of which + /// only 64 are used to represent the address and the remaining ones are used + /// for metadata such as bounds and access permissions. In this case + /// getPointerSize() returns 16, but getIndexSize() returns 8. + /// To help with code understanding, the alias getAddressSize() can be used + /// instead of getIndexSize() to clarify that an address width is needed. unsigned getIndexSize(unsigned AS) const; + /// The integral size of a pointer in a given address space in bytes, which + /// is defined to be the same as getIndexSize(). This exists as a separate + /// function to make it clearer when reading code that the size of an address + /// is being requested. While targets exist where index size and the + /// underlying address width are not identical (e.g. AMDGPU fat pointers with + /// 48-bit addresses and 32-bit offsets indexing), there is currently no need + /// to differentiate these properties in LLVM. + /// \sa DataLayout::getIndexSize + /// \sa DataLayout::getAddressSizeInBits + unsigned getAddressSize(unsigned AS) const { return getIndexSize(AS); } + /// Return the address spaces containing non-integral pointers. Pointers in /// this address space don't have a well-defined bitwise representation. SmallVector getNonIntegralAddressSpaces() const { @@ -358,29 +381,53 @@ class DataLayout { return PTy && isNonIntegralPointerType(PTy); } - /// Layout pointer size, in bits + /// The size in bits of the pointer representation in a given address space. + /// This is not necessarily the same as the integer address of a pointer (e.g. + /// for fat pointers). + /// \sa DataLayout::getAddressSizeInBits() /// FIXME: The defaults need to be removed once all of /// the backends/clients are updated. unsigned getPointerSizeInBits(unsigned AS = 0) const { return getPointerSpec(AS).BitWidth; } - /// Size in bits of index used for address calculation in getelementptr. 
+ /// The size in bits of indices used for address calculation in getelementptr + /// and for addresses in the given AS. See getIndexSize() for more + /// information. + /// \sa DataLayout::getAddressSizeInBits() unsigned getIndexSizeInBits(unsigned AS) const { return getPointerSpec(AS).IndexBitWidth; } - /// Layout pointer size, in bits, based on the type. If this function is + /// The size in bits of an address in for the given AS. This is defined to + /// return the same value as getIndexSizeInBits() since there is currently no + /// target that requires these two properties to have different values. See + /// getIndexSize() for more information. + /// \sa DataLayout::getIndexSizeInBits() + unsigned getAddressSizeInBits(unsigned AS) const { + return getIndexSizeInBits(AS); + } + + /// The pointer representation size in bits for this type. If this function is /// called with a pointer type, then the type size of the pointer is returned. /// If this function is called with a vector of pointers, then the type size /// of the pointer is returned. This should only be called with a pointer or /// vector of pointers. unsigned getPointerTypeSizeInBits(Type *) const; - /// Layout size of the index used in GEP calculation. + /// The size in bits of the index used in GEP calculation for this type. /// The function should be called with pointer or vector of pointers type. + /// This is defined to return the same value as getAddressSizeInBits(), + /// but separate functions exist for code clarity. unsigned getIndexTypeSizeInBits(Type *Ty) const; + /// The size in bits of an address for this type. + /// This is defined to return the same value as getIndexTypeSizeInBits(), + /// but separate functions exist for code clarity. + unsigned getAddressSizeInBits(Type *Ty) const { + return getIndexTypeSizeInBits(Ty); + } + unsigned getPointerTypeSize(Type *Ty) const { return getPointerTypeSizeInBits(Ty) / 8; } @@ -515,15 +562,21 @@ class DataLayout { /// are set. 
unsigned getLargestLegalIntTypeSizeInBits() const; - /// Returns the type of a GEP index in AddressSpace. + /// Returns the type of a GEP index in \p AddressSpace. /// If it was not specified explicitly, it will be the integer type of the /// pointer width - IntPtrType. IntegerType *getIndexType(LLVMContext &C, unsigned AddressSpace) const; + /// Returns the type of an address in \p AddressSpace + IntegerType *getAddressType(LLVMContext &C, unsigned AddressSpace) const { + return getIndexType(C, AddressSpace); + } /// Returns the type of a GEP index. /// If it was not specified explicitly, it will be the integer type of the /// pointer width - IntPtrType. Type *getIndexType(Type *PtrTy) const; + /// Returns the type of an address in \p AddressSpace + Type *getAddressType(Type *PtrTy) const { return getIndexType(PtrTy); } /// Returns the offset from the beginning of the type for the specified /// indices. diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index d0dffa9de616a..fba6f45d37d1d 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -554,6 +554,23 @@ class VectorType : public Type { return VectorType::get(VTy->getElementType(), EltCnt * 2); } + /// This static method attempts to construct a VectorType with the same + /// size-in-bits as SizeTy but with an element type that matches the scalar + /// type of EltTy. The VectorType is returned on success, nullptr otherwise. 
+ static VectorType *getWithSizeAndScalar(VectorType *SizeTy, Type *EltTy) { + if (SizeTy->getScalarType() == EltTy->getScalarType()) + return SizeTy; + + unsigned EltSize = EltTy->getScalarSizeInBits(); + if (!SizeTy->getPrimitiveSizeInBits().isKnownMultipleOf(EltSize)) + return nullptr; + + ElementCount EC = SizeTy->getElementCount() + .multiplyCoefficientBy(SizeTy->getScalarSizeInBits()) + .divideCoefficientBy(EltSize); + return VectorType::get(EltTy->getScalarType(), EC); + } + /// Return true if the specified type is valid as a element type. static bool isValidElementType(Type *ElemTy); diff --git a/llvm/include/llvm/IR/GCStrategy.h b/llvm/include/llvm/IR/GCStrategy.h index cbfbe23aaa068..6b813554d6544 100644 --- a/llvm/include/llvm/IR/GCStrategy.h +++ b/llvm/include/llvm/IR/GCStrategy.h @@ -63,6 +63,7 @@ class Type; class GCStrategy { private: friend class GCModuleInfo; + friend class CollectorMetadataAnalysis; std::string Name; diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index f650c06590ef2..7e0521e72ceb2 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -608,43 +608,33 @@ class IRBuilderBase { /// Create and insert a memset to the specified pointer and the /// specified value. /// - /// If the pointer isn't an i8*, it will be converted. If a TBAA tag is - /// specified, it will be added to the instruction. Likewise with alias.scope - /// and noalias tags. + /// If the pointer isn't an i8*, it will be converted. If alias metadata is + /// specified, it will be added to the instruction. 
CallInst *CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, MaybeAlign Align, bool isVolatile = false, - MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr) { - return CreateMemSet(Ptr, Val, getInt64(Size), Align, isVolatile, - TBAATag, ScopeTag, NoAliasTag); + const AAMDNodes &AAInfo = AAMDNodes()) { + return CreateMemSet(Ptr, Val, getInt64(Size), Align, isVolatile, AAInfo); } CallInst *CreateMemSet(Value *Ptr, Value *Val, Value *Size, MaybeAlign Align, - bool isVolatile = false, MDNode *TBAATag = nullptr, - MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr); + bool isVolatile = false, + const AAMDNodes &AAInfo = AAMDNodes()); CallInst *CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, Value *Val, Value *Size, bool IsVolatile = false, - MDNode *TBAATag = nullptr, - MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr); + const AAMDNodes &AAInfo = AAMDNodes()); /// Create and insert an element unordered-atomic memset of the region of /// memory starting at the given pointer to the given value. /// - /// If the pointer isn't an i8*, it will be converted. If a TBAA tag is - /// specified, it will be added to the instruction. Likewise with alias.scope - /// and noalias tags. - CallInst *CreateElementUnorderedAtomicMemSet(Value *Ptr, Value *Val, - uint64_t Size, Align Alignment, - uint32_t ElementSize, - MDNode *TBAATag = nullptr, - MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr) { - return CreateElementUnorderedAtomicMemSet(Ptr, Val, getInt64(Size), - Align(Alignment), ElementSize, - TBAATag, ScopeTag, NoAliasTag); + /// If the pointer isn't an i8*, it will be converted. If alias metadata is + /// specified, it will be added to the instruction. 
+ CallInst * + CreateElementUnorderedAtomicMemSet(Value *Ptr, Value *Val, uint64_t Size, + Align Alignment, uint32_t ElementSize, + const AAMDNodes &AAInfo = AAMDNodes()) { + return CreateElementUnorderedAtomicMemSet( + Ptr, Val, getInt64(Size), Align(Alignment), ElementSize, AAInfo); } CallInst *CreateMalloc(Type *IntPtrTy, Type *AllocTy, Value *AllocSize, @@ -662,88 +652,72 @@ class IRBuilderBase { /// Generate the IR for a call to the builtin free function. CallInst *CreateFree(Value *Source, ArrayRef Bundles = {}); - CallInst *CreateElementUnorderedAtomicMemSet(Value *Ptr, Value *Val, - Value *Size, Align Alignment, - uint32_t ElementSize, - MDNode *TBAATag = nullptr, - MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr); + CallInst * + CreateElementUnorderedAtomicMemSet(Value *Ptr, Value *Val, Value *Size, + Align Alignment, uint32_t ElementSize, + const AAMDNodes &AAInfo = AAMDNodes()); /// Create and insert a memcpy between the specified pointers. /// - /// If the pointers aren't i8*, they will be converted. If a TBAA tag is - /// specified, it will be added to the instruction. Likewise with alias.scope + /// If the pointers aren't i8*, they will be converted. If alias metadata is + /// specified, it will be added to the instruction. /// and noalias tags. 
CallInst *CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, - bool isVolatile = false, MDNode *TBAATag = nullptr, - MDNode *TBAAStructTag = nullptr, - MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr) { + bool isVolatile = false, + const AAMDNodes &AAInfo = AAMDNodes()) { return CreateMemCpy(Dst, DstAlign, Src, SrcAlign, getInt64(Size), - isVolatile, TBAATag, TBAAStructTag, ScopeTag, - NoAliasTag); + isVolatile, AAInfo); } - CallInst *CreateMemTransferInst( - Intrinsic::ID IntrID, Value *Dst, MaybeAlign DstAlign, Value *Src, - MaybeAlign SrcAlign, Value *Size, bool isVolatile = false, - MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, - MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); + CallInst *CreateMemTransferInst(Intrinsic::ID IntrID, Value *Dst, + MaybeAlign DstAlign, Value *Src, + MaybeAlign SrcAlign, Value *Size, + bool isVolatile = false, + const AAMDNodes &AAInfo = AAMDNodes()); CallInst *CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, Value *Size, - bool isVolatile = false, MDNode *TBAATag = nullptr, - MDNode *TBAAStructTag = nullptr, - MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr) { + bool isVolatile = false, + const AAMDNodes &AAInfo = AAMDNodes()) { return CreateMemTransferInst(Intrinsic::memcpy, Dst, DstAlign, Src, - SrcAlign, Size, isVolatile, TBAATag, - TBAAStructTag, ScopeTag, NoAliasTag); + SrcAlign, Size, isVolatile, AAInfo); } - CallInst * - CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign, Value *Src, - MaybeAlign SrcAlign, Value *Size, bool isVolatile = false, - MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, - MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) { + CallInst *CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign, Value *Src, + MaybeAlign SrcAlign, Value *Size, + bool isVolatile = false, + const AAMDNodes &AAInfo = AAMDNodes()) { return CreateMemTransferInst(Intrinsic::memcpy_inline, Dst, 
DstAlign, Src, - SrcAlign, Size, isVolatile, TBAATag, - TBAAStructTag, ScopeTag, NoAliasTag); + SrcAlign, Size, isVolatile, AAInfo); } /// Create and insert an element unordered-atomic memcpy between the /// specified pointers. /// - /// DstAlign/SrcAlign are the alignments of the Dst/Src pointers, respectively. + /// DstAlign/SrcAlign are the alignments of the Dst/Src pointers, + /// respectively. /// - /// If the pointers aren't i8*, they will be converted. If a TBAA tag is - /// specified, it will be added to the instruction. Likewise with alias.scope - /// and noalias tags. + /// If the pointers aren't i8*, they will be converted. If alias metadata is + /// specified, it will be added to the instruction. CallInst *CreateElementUnorderedAtomicMemCpy( Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size, - uint32_t ElementSize, MDNode *TBAATag = nullptr, - MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr); + uint32_t ElementSize, const AAMDNodes &AAInfo = AAMDNodes()); CallInst *CreateMemMove(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, - bool isVolatile = false, MDNode *TBAATag = nullptr, - MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr) { + bool isVolatile = false, + const AAMDNodes &AAInfo = AAMDNodes()) { return CreateMemMove(Dst, DstAlign, Src, SrcAlign, getInt64(Size), - isVolatile, TBAATag, ScopeTag, NoAliasTag); + isVolatile, AAInfo); } CallInst *CreateMemMove(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, Value *Size, - bool isVolatile = false, MDNode *TBAATag = nullptr, - MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr) { + bool isVolatile = false, + const AAMDNodes &AAInfo = AAMDNodes()) { return CreateMemTransferInst(Intrinsic::memmove, Dst, DstAlign, Src, - SrcAlign, Size, isVolatile, TBAATag, - /*TBAAStructTag=*/nullptr, ScopeTag, - NoAliasTag); + SrcAlign, Size, isVolatile, AAInfo); } /// \brief Create and insert 
an element unordered-atomic memmove between the @@ -752,14 +726,11 @@ class IRBuilderBase { /// DstAlign/SrcAlign are the alignments of the Dst/Src pointers, /// respectively. /// - /// If the pointers aren't i8*, they will be converted. If a TBAA tag is - /// specified, it will be added to the instruction. Likewise with alias.scope - /// and noalias tags. + /// If the pointers aren't i8*, they will be converted. If alias metadata is + /// specified, it will be added to the instruction. CallInst *CreateElementUnorderedAtomicMemMove( Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size, - uint32_t ElementSize, MDNode *TBAATag = nullptr, - MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr); + uint32_t ElementSize, const AAMDNodes &AAInfo = AAMDNodes()); private: CallInst *getReductionIntrinsic(Intrinsic::ID ID, Value *Src); diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 28450f03b7619..e1a135a5ad48e 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -903,7 +903,7 @@ def int_stackrestore : DefaultAttrsIntrinsic<[], [llvm_anyptr_ty]>, def int_get_dynamic_area_offset : DefaultAttrsIntrinsic<[llvm_anyint_ty]>; -def int_thread_pointer : DefaultAttrsIntrinsic<[llvm_ptr_ty], [], [IntrNoMem]>, +def int_thread_pointer : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [], [IntrNoMem]>, ClangBuiltin<"__builtin_thread_pointer">; // IntrInaccessibleMemOrArgMemOnly is a little more pessimistic than strictly diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 5be1a915a06a7..64ac84dcd7c50 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -357,38 +357,33 @@ class MMA_SIGNATURE { !ne(A.ptx_elt_type, B.ptx_elt_type): [A, B], true: [A] ); - string ret = !foldl("", id_frags, a, b, !strconcat(a, ".", b.ptx_elt_type)); + string ret = !foldl("", id_frags, a, b, 
!strconcat(a, "_", b.ptx_elt_type)); } class WMMA_NAME { string signature = MMA_SIGNATURE.ret; - string llvm = "llvm.nvvm.wmma." - # A.geom - # ".mma" - # b1op - # "." # ALayout - # "." # BLayout - # !if(!ne(Rnd, ""), !strconcat(".", Rnd), "") - # signature - # !if(Satfinite, ".satfinite", ""); - - string record = !subst(".", "_", - !subst("llvm.", "int_", llvm)); + string record = "int_nvvm_wmma_" + # A.geom + # "_mma" + # !subst(".", "_", b1op) + # "_" # ALayout + # "_" # BLayout + # !if(!ne(Rnd, ""), !strconcat("_", Rnd), "") + # signature + # !if(Satfinite, "_satfinite", ""); } class MMA_NAME { string signature = MMA_SIGNATURE.ret; - string llvm = "llvm.nvvm.mma" - # b1op - # "." # A.geom - # "." # ALayout - # "." # BLayout - # !if(Satfinite, ".satfinite", "") - # signature; - string record = !subst(".", "_", - !subst("llvm.", "int_", llvm)); + string record = "int_nvvm_mma" + # !subst(".", "_", b1op) + # "_" # A.geom + # "_" # ALayout + # "_" # BLayout + # !if(Satfinite, "_satfinite", "") + # signature; } class LDMATRIX_NAME { @@ -696,101 +691,6 @@ class SHFL_INFO { [OpType, llvm_i32_ty, llvm_i32_ty]); } -class CP_ASYNC_BULK_TENSOR_G2S_INTR { - string Name = "int_nvvm_cp_async_bulk_tensor_g2s_" # mode # "_" # dim # "d"; - - bit IsIm2Col = !if(!eq(mode, "im2col"), 1, 0); - int NumIm2ColOffsets = !if(IsIm2Col, !add(dim, -2), 0); - list Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets); - list TensorDimsTy = !listsplat(llvm_i32_ty, dim); - list ArgsTy = !listconcat( - [llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr - llvm_shared_ptr_ty, // mbarrier_smem_ptr - llvm_ptr_ty], // tensormap_ptr - TensorDimsTy, // actual tensor dims - Im2ColOffsetsTy, // im2col offsets - [llvm_i16_ty, // cta_mask - llvm_i64_ty, // cache_hint - llvm_i1_ty, // Flag for cta_mask - llvm_i1_ty] // Flag for cache_hint - ); - - int TempFlagsStartIdx = !add(dim, 5); - int FlagsStartIdx = !add(TempFlagsStartIdx, NumIm2ColOffsets); - list IntrProp = [IntrConvergent, - 
WriteOnly>, ReadOnly>, - NoCapture>, NoCapture>, NoCapture>, - ImmArg>, - ImmArg>]; -} - -class CP_ASYNC_BULK_TENSOR_S2G_INTR { - string Name = "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim # "d"; - - list TensorDimsTy = !listsplat(llvm_i32_ty, dim); - list ArgsTy = !listconcat( - [llvm_shared_ptr_ty, // src_smem_ptr - llvm_ptr_ty], // tensormap_ptr - TensorDimsTy, // actual tensor dims - [llvm_i64_ty, // cache_hint - llvm_i1_ty] // Flag for cache_hint - ); - int FlagsStartIdx = !add(dim, 3); - list IntrProp = [IntrConvergent, - ReadOnly>, ReadOnly>, - NoCapture>, NoCapture>, - ImmArg>]; -} - -class CP_ASYNC_BULK_TENSOR_PREFETCH_INTR { - string Name = "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # "d"; - - bit IsIm2Col = !if(!eq(mode, "im2col"), 1, 0); - int NumIm2ColOffsets = !if(IsIm2Col, !add(dim, -2), 0); - list Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets); - list TensorDimsTy = !listsplat(llvm_i32_ty, dim); - list ArgsTy = !listconcat( - [llvm_ptr_ty], // tensormap_ptr - TensorDimsTy, // actual tensor dims - Im2ColOffsetsTy, // im2col offsets - [llvm_i64_ty, // cache_hint - llvm_i1_ty] // Flag for cache_hint - ); - - int TempFlagsStartIdx = !add(dim, 2); - int FlagsStartIdx = !add(TempFlagsStartIdx, NumIm2ColOffsets); - list IntrProp = [IntrConvergent, - ReadOnly>, NoCapture>, - ImmArg>]; -} - -class CP_ASYNC_BULK_TENSOR_REDUCE_INTR { - string Suffix = op # "_" # mode # "_" # dim # "d"; - string Name = "int_nvvm_cp_async_bulk_tensor_reduce_" # Suffix; - - list TensorDimsTy = !listsplat(llvm_i32_ty, dim); - list ArgsTy = !listconcat( - [llvm_shared_ptr_ty, // src_smem_ptr - llvm_ptr_ty], // tensormap_ptr - TensorDimsTy, // actual tensor dims - [llvm_i64_ty, // cache_hint - llvm_i1_ty] // Flag for cache_hint - ); - int FlagsStartIdx = !add(dim, 3); - list IntrProp = [IntrConvergent, - ReadOnly>, ReadOnly>, - NoCapture>, NoCapture>, - ImmArg>]; -} - -class NVVM_TCGEN05_LDST_NAME { - string intr = "llvm.nvvm.tcgen05." 
# Op - # "." # Shape - # "." # "x" # !shl(1, Num); - - string record = !subst(".", "_", - !subst("llvm.", "int_", intr)); -} class NVVM_TCGEN05_LDST_ACCESS_SIZE { int shift = !cond(!eq(Shape, "16x128b"): 1, !eq(Shape, "16x256b"): 2, @@ -810,6 +710,28 @@ class NVVM_TCGEN05_LDST_ACCESS_SIZE { true : llvm_void_ty); } +class TexVector types> { + string Name = name; + list Types = types; +} + +def TV_I8 : TexVector<"i8", [llvm_i16_ty]>; +def TV_I16 : TexVector<"i16", [llvm_i16_ty]>; +def TV_I32 : TexVector<"i32", [llvm_i32_ty]>; +def TV_I64 : TexVector<"i64", [llvm_i64_ty]>; +def TV_V2I8 : TexVector<"v2i8", !listsplat(llvm_i16_ty, 2)>; +def TV_V2I16 : TexVector<"v2i16", !listsplat(llvm_i16_ty, 2)>; +def TV_V2I32 : TexVector<"v2i32", !listsplat(llvm_i32_ty, 2)>; +def TV_V2I64 : TexVector<"v2i64", !listsplat(llvm_i64_ty, 2)>; +def TV_V4I8 : TexVector<"v4i8", !listsplat(llvm_i16_ty, 4)>; +def TV_V4I16 : TexVector<"v4i16", !listsplat(llvm_i16_ty, 4)>; +def TV_V4I32 : TexVector<"v4i32", !listsplat(llvm_i32_ty, 4)>; + + +def V4F32 : TexVector<"v4f32", !listsplat(llvm_float_ty, 4)>; +def V4S32 : TexVector<"v4s32", !listsplat(llvm_i32_ty, 4)>; +def V4U32 : TexVector<"v4u32", !listsplat(llvm_i32_ty, 4)>; + class NVVMBuiltin : ClangBuiltin { assert !eq(!substr(NAME, 0, !size("int_nvvm_")), "int_nvvm_"), @@ -828,131 +750,116 @@ let TargetPrefix = "nvvm" in { // // Min Max // + let IntrProperties = [IntrNoMem, IntrSpeculatable, Commutative] in { + foreach operation = ["min", "max"] in { + def int_nvvm_f # operation # _d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty]>; - foreach operation = ["min", "max"] in { - def int_nvvm_f # operation # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; - - foreach variant = ["", "_xorsign_abs"] in { - foreach nan = ["", "_nan"] in { - foreach ftz = ["", "_ftz"] in { - def int_nvvm_f # operation # ftz # nan # 
variant # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + foreach variant = ["", "_xorsign_abs"] in { + foreach nan = ["", "_nan"] in { + foreach ftz = ["", "_ftz"] in { + def int_nvvm_f # operation # ftz # nan # variant # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_f # operation # ftz # nan # variant # _f16 : - DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + def int_nvvm_f # operation # ftz # nan # variant # _f16 : + DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty]>; - def int_nvvm_f # operation # ftz # nan # variant # _f16x2 : - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + def int_nvvm_f # operation # ftz # nan # variant # _f16x2 : + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty]>; - def int_nvvm_f # operation # ftz # nan # variant # _bf16 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_bfloat_ty, llvm_bfloat_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + def int_nvvm_f # operation # ftz # nan # variant # _bf16 : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_bfloat_ty, llvm_bfloat_ty]>; - def int_nvvm_f # operation # ftz # nan # variant # _bf16x2 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2bf16_ty], [llvm_v2bf16_ty, llvm_v2bf16_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; - } // ftz - } // nan - } // variant - } // operation + def int_nvvm_f # operation # ftz # nan # variant # _bf16x2 : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_v2bf16_ty], [llvm_v2bf16_ty, llvm_v2bf16_ty]>; + } // ftz + } // nan + } // variant + } // operation + } // // Multiplication // + let IntrProperties = [IntrNoMem, IntrSpeculatable, Commutative] in { + foreach sign = ["", "u"] in { + def int_nvvm_mulhi_ # 
sign # s : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty]>; - foreach sign = ["", "u"] in { - def int_nvvm_mulhi_ # sign # s : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; - - def int_nvvm_mulhi_ # sign # i : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + def int_nvvm_mulhi_ # sign # i : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>; - def int_nvvm_mulhi_ # sign # ll : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + def int_nvvm_mulhi_ # sign # ll : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty]>; - def int_nvvm_mul24_ # sign # i : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; - } + def int_nvvm_mul24_ # sign # i : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>; + } - foreach rnd = ["rn", "rz", "rm", "rp"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_mul_ # rnd # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + foreach rnd = ["rn", "rz", "rm", "rp"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_mul_ # rnd # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_mul_ # rnd # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + def int_nvvm_mul_ # rnd # _d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty]>; + } } // // Div // + let IntrProperties = [IntrNoMem] in { + foreach ftz = ["", "_ftz"] in { + def int_nvvm_div_approx # 
ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]>; - foreach ftz = ["", "_ftz"] in { - def int_nvvm_div_approx # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], - [IntrNoMem]>; - - def int_nvvm_div_full # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], - [IntrNoMem]>; - } + def int_nvvm_div_full # ftz : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]>; + } - foreach rnd = ["rn", "rz", "rm", "rp"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_div_ # rnd # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], - [IntrNoMem]>; + foreach rnd = ["rn", "rz", "rm", "rp"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_div_ # rnd # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_div_ # rnd # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], - [IntrNoMem]>; + def int_nvvm_div_ # rnd # _d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty]>; + } } // // Sad // + let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + foreach sign = ["", "u"] in { + def int_nvvm_sad_ # sign # s : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty]>; - foreach sign = ["", "u"] in { - def int_nvvm_sad_ # sign # s : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [IntrNoMem, Commutative, IntrSpeculatable]>; - - def int_nvvm_sad_ # sign # i : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, Commutative, IntrSpeculatable]>; + def int_nvvm_sad_ # sign # i : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; - def int_nvvm_sad_ # sign # ll : 
NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty], - [IntrNoMem, Commutative, IntrSpeculatable]>; + def int_nvvm_sad_ # sign # ll : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty]>; + } } // // Floor Ceil // - - foreach op = ["floor", "ceil"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_ # op # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_ # op # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; + let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + foreach op = ["floor", "ceil"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_ # op # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + def int_nvvm_ # op # _d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; + } } // // Abs // - foreach ftz = ["", "_ftz"] in def int_nvvm_fabs # ftz : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], @@ -961,7 +868,6 @@ let TargetPrefix = "nvvm" in { // // Abs, Neg bf16, bf16x2 // - def int_nvvm_neg_bf16 : NVVMBuiltin, DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_bfloat_ty], [IntrNoMem]>; def int_nvvm_neg_bf16x2 : NVVMBuiltin, @@ -970,62 +876,65 @@ let TargetPrefix = "nvvm" in { // // Round // + let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_round # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - foreach ftz = ["", "_ftz"] in - def int_nvvm_round # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - - def int_nvvm_round_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; + def int_nvvm_round_d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], 
[llvm_double_ty]>; + } // // Trunc // + let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_trunc # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - foreach ftz = ["", "_ftz"] in - def int_nvvm_trunc # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - - def int_nvvm_trunc_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; + def int_nvvm_trunc_d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; + } // // Saturate // + let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_saturate # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - foreach ftz = ["", "_ftz"] in - def int_nvvm_saturate # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - - def int_nvvm_saturate_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; + def int_nvvm_saturate_d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; + } // // Exp2 Log2 // + let IntrProperties = [IntrNoMem] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_ex2_approx # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - foreach ftz = ["", "_ftz"] in - def int_nvvm_ex2_approx # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - - def int_nvvm_ex2_approx_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; - def int_nvvm_ex2_approx_f16 : - DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>; - def int_nvvm_ex2_approx_f16x2 : - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>; + def int_nvvm_ex2_approx_d : NVVMBuiltin, + 
DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; + def int_nvvm_ex2_approx_f16 : + DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty]>; + def int_nvvm_ex2_approx_f16x2 : + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty]>; - foreach ftz = ["", "_ftz"] in - def int_nvvm_lg2_approx # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + foreach ftz = ["", "_ftz"] in + def int_nvvm_lg2_approx # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - def int_nvvm_lg2_approx_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; + def int_nvvm_lg2_approx_d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; + } // // Sin Cos // - foreach op = ["sin", "cos"] in foreach ftz = ["", "_ftz"] in def int_nvvm_ # op # _approx # ftz # _f : NVVMBuiltin, @@ -1034,105 +943,103 @@ let TargetPrefix = "nvvm" in { // // Fma // + let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + foreach variant = ["", "_sat", "_relu"] in { + foreach ftz = ["", "_ftz"] in { + def int_nvvm_fma_rn # ftz # variant # _f16 : + DefaultAttrsIntrinsic<[llvm_half_ty], + [llvm_half_ty, llvm_half_ty, llvm_half_ty]>; + + def int_nvvm_fma_rn # ftz # variant # _f16x2 : + DefaultAttrsIntrinsic<[llvm_v2f16_ty], + [llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty]>; + + def int_nvvm_fma_rn # ftz # variant # _bf16 : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_bfloat_ty], + [llvm_bfloat_ty, llvm_bfloat_ty, llvm_bfloat_ty]>; + + def int_nvvm_fma_rn # ftz # variant # _bf16x2 : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_v2bf16_ty], + [llvm_v2bf16_ty, llvm_v2bf16_ty, llvm_v2bf16_ty]>; + } // ftz + } // variant - foreach variant = ["", "_sat", "_relu"] in { - foreach ftz = ["", "_ftz"] in { - def int_nvvm_fma_rn # ftz # variant # _f16 : - DefaultAttrsIntrinsic<[llvm_half_ty], - [llvm_half_ty, llvm_half_ty, llvm_half_ty], - [IntrNoMem, IntrSpeculatable]>; - - def int_nvvm_fma_rn 
# ftz # variant # _f16x2 : - DefaultAttrsIntrinsic<[llvm_v2f16_ty], - [llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty], - [IntrNoMem, IntrSpeculatable]>; - - def int_nvvm_fma_rn # ftz # variant # _bf16 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_bfloat_ty], - [llvm_bfloat_ty, llvm_bfloat_ty, llvm_bfloat_ty], - [IntrNoMem, IntrSpeculatable]>; - - def int_nvvm_fma_rn # ftz # variant # _bf16x2 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2bf16_ty], - [llvm_v2bf16_ty, llvm_v2bf16_ty, llvm_v2bf16_ty], - [IntrNoMem, IntrSpeculatable]>; - } // ftz - } // variant - - foreach rnd = ["rn", "rz", "rm", "rp"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_fma_ # rnd # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], - [llvm_float_ty, llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable]>; + foreach rnd = ["rn", "rz", "rm", "rp"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_fma_ # rnd # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], + [llvm_float_ty, llvm_float_ty, llvm_float_ty]>; - def int_nvvm_fma_ # rnd # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], - [llvm_double_ty, llvm_double_ty, llvm_double_ty], - [IntrNoMem, IntrSpeculatable]>; + def int_nvvm_fma_ # rnd # _d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], + [llvm_double_ty, llvm_double_ty, llvm_double_ty]>; + } } // // Rcp // + let IntrProperties = [IntrNoMem] in { + foreach rnd = ["rn", "rz", "rm", "rp"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_rcp_ # rnd # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - foreach rnd = ["rn", "rz", "rm", "rp"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_rcp_ # rnd # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_nvvm_rcp_ # rnd # _d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; + } - def int_nvvm_rcp_ # rnd # _d : NVVMBuiltin, - 
DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; + def int_nvvm_rcp_approx_ftz_f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + def int_nvvm_rcp_approx_ftz_d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; } - def int_nvvm_rcp_approx_ftz_f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_rcp_approx_ftz_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; - // // Sqrt // - foreach rnd = ["rn", "rz", "rm", "rp"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_sqrt_ # rnd # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + let IntrProperties = [IntrNoMem] in { + foreach rnd = ["rn", "rz", "rm", "rp"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_sqrt_ # rnd # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - def int_nvvm_sqrt_ # rnd # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; - } + def int_nvvm_sqrt_ # rnd # _d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; + } - def int_nvvm_sqrt_f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_nvvm_sqrt_f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - foreach ftz = ["", "_ftz"] in - def int_nvvm_sqrt_approx # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + foreach ftz = ["", "_ftz"] in + def int_nvvm_sqrt_approx # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + } // // Rsqrt // - - foreach ftz = ["", "_ftz"] in { - def int_nvvm_rsqrt_approx # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_nvvm_rsqrt_approx # ftz # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], 
[llvm_double_ty], [IntrNoMem]>; + let IntrProperties = [IntrNoMem] in { + foreach ftz = ["", "_ftz"] in { + def int_nvvm_rsqrt_approx # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + def int_nvvm_rsqrt_approx # ftz # _d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; + } } // // Add // - - foreach rnd = ["rn", "rz", "rm", "rp"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_add_ # rnd # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + let IntrProperties = [IntrNoMem, IntrSpeculatable, Commutative] in { + foreach rnd = ["rn", "rz", "rm", "rp"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_add_ # rnd # ftz # _f : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]>; def int_nvvm_add_ # rnd # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty]>; + } } // @@ -1191,126 +1098,134 @@ let TargetPrefix = "nvvm" in { // // Convert // + let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + def int_nvvm_lohi_i2d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty]>; - def int_nvvm_lohi_i2d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, Commutative]>; + def int_nvvm_d2i_lo : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; + def int_nvvm_d2i_hi : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; - def int_nvvm_d2i_lo : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_d2i_hi : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; + foreach rnd = ["rn", 
"rz", "rm", "rp"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_d2f_ # rnd # ftz : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty]>; + foreach sign = ["", "u"] in { - foreach rnd = ["rn", "rz", "rm", "rp"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_d2f_ # rnd # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; + def int_nvvm_d2 # sign # i_ # rnd : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; - foreach sign = ["", "u"] in { + def int_nvvm_ # sign # i2d_ # rnd : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty]>; - def int_nvvm_d2 # sign # i_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; + foreach ftz = ["", "_ftz"] in + def int_nvvm_f2 # sign # i_ # rnd # ftz : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; - def int_nvvm_ # sign # i2d_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; + def int_nvvm_ # sign # i2f_ # rnd : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty]>; - foreach ftz = ["", "_ftz"] in - def int_nvvm_f2 # sign # i_ # rnd # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; + foreach ftz = ["", "_ftz"] in + def int_nvvm_f2 # sign # ll_ # rnd # ftz : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty]>; - def int_nvvm_ # sign # i2f_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; + def int_nvvm_d2 # sign # ll_ # rnd : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty]>; - foreach ftz = ["", "_ftz"] in - def int_nvvm_f2 # sign # ll_ # rnd # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; + def int_nvvm_ # sign # ll2f_ # rnd : NVVMBuiltin, + 
DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty]>; - def int_nvvm_d2 # sign # ll_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; + def int_nvvm_ # sign # ll2d_ # rnd : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty]>; - def int_nvvm_ # sign # ll2f_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; + } // sign + } // rnd - def int_nvvm_ # sign # ll2d_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>; - } // sign - } // rnd + foreach ftz = ["", "_ftz"] in { + def int_nvvm_f2h_rn # ftz : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty]>; - foreach ftz = ["", "_ftz"] in { - def int_nvvm_f2h_rn # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; + def int_nvvm_bf2h_rn # ftz : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_bfloat_ty]>; + } - def int_nvvm_bf2h_rn # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_bfloat_ty], [IntrNoMem, IntrSpeculatable]>; - } + foreach rnd = ["rn", "rz"] in { + foreach relu = ["", "_relu"] in { + def int_nvvm_ff2bf16x2_ # rnd # relu : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>; - foreach rnd = ["rn", "rz"] in { - foreach relu = ["", "_relu"] in { - def int_nvvm_ff2bf16x2_ # rnd # relu : NVVMBuiltin, - Intrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff2f16x2_ # rnd # relu : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_ff2f16x2_ # rnd # relu : NVVMBuiltin, - Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f2bf16_ # rnd # relu : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>; + } + } + + foreach 
satfinite = ["", "_satfinite"] in { + def int_nvvm_f2tf32_rna # satfinite : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; - def int_nvvm_f2bf16_ # rnd # relu : NVVMBuiltin, - Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + foreach rnd = ["rn", "rz"] in + foreach relu = ["", "_relu"] in + def int_nvvm_f2tf32_ # rnd # relu # satfinite : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; } - } + foreach type = ["e4m3x2", "e5m2x2"] in { + foreach relu = ["", "_relu"] in { + def int_nvvm_ff_to_ # type # _rn # relu : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; - foreach satfinite = ["", "_satfinite"] in { - def int_nvvm_f2tf32_rna # satfinite : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_f16x2_to_ # type # _rn # relu : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty]>; - foreach rnd = ["rn", "rz"] in - foreach relu = ["", "_relu"] in - def int_nvvm_f2tf32_ # rnd # relu # satfinite : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>; - } + def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; + } + } - foreach type = ["e4m3x2", "e5m2x2"] in { + // FP4 conversions. 
foreach relu = ["", "_relu"] in { - def int_nvvm_ff_to_ # type # _rn # relu : NVVMBuiltin, - Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff_to_e2m1x2_rn # relu # _satfinite : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_f16x2_to_ # type # _rn # relu : NVVMBuiltin, - Intrinsic<[llvm_i16_ty], [llvm_v2f16_ty], [IntrNoMem, IntrNoCallback]>; - - def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin, - Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_e2m1x2_to_f16x2_rn # relu : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; } - } - // FP6 conversions. - foreach type = ["e2m3x2", "e3m2x2"] in { - foreach relu = ["", "_relu"] in { - def int_nvvm_ff_to_ # type # _rn # relu # _satfinite : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + // FP6 conversions. + foreach type = ["e2m3x2", "e3m2x2"] in { + foreach relu = ["", "_relu"] in { + def int_nvvm_ff_to_ # type # _rn # relu # _satfinite : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; + } } - } - // UE8M0x2 conversions. - foreach rmode = ["_rz", "_rp"] in { - foreach satmode = ["", "_satfinite"] in { - defvar suffix = !strconcat(rmode, satmode); - def int_nvvm_ff_to_ue8m0x2 # suffix : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + // UE8M0x2 conversions. 
+ foreach rmode = ["_rz", "_rp"] in { + foreach satmode = ["", "_satfinite"] in { + defvar suffix = rmode # satmode; + def int_nvvm_ff_to_ue8m0x2 # suffix : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_bf16x2_to_ue8m0x2 # suffix : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_bf16x2_to_ue8m0x2 # suffix : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>; + } } - } - def int_nvvm_ue8m0x2_to_bf16x2 : NVVMBuiltin, - Intrinsic<[llvm_v2bf16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ue8m0x2_to_bf16x2 : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty]>; -// FNS + } // IntrProperties = [IntrNoMem, IntrSpeculatable] +// FNS def int_nvvm_fns : NVVMBuiltin, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; @@ -1414,14 +1329,16 @@ foreach scope = ["cta", "cluster", "gpu", "sys"] in { } // Async Copy -def int_nvvm_cp_async_mbarrier_arrive : NVVMBuiltin, - Intrinsic<[], [llvm_ptr_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_cp_async_mbarrier_arrive_shared : NVVMBuiltin, - Intrinsic<[], [llvm_shared_ptr_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_cp_async_mbarrier_arrive_noinc : NVVMBuiltin, - Intrinsic<[], [llvm_ptr_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_cp_async_mbarrier_arrive_noinc_shared : NVVMBuiltin, - Intrinsic<[], [llvm_shared_ptr_ty], [IntrConvergent, IntrNoCallback]>; +let IntrProperties = [IntrConvergent, IntrNoCallback] in { + def int_nvvm_cp_async_mbarrier_arrive : NVVMBuiltin, + Intrinsic<[],[llvm_ptr_ty]>; + def int_nvvm_cp_async_mbarrier_arrive_shared : NVVMBuiltin, + Intrinsic<[],[llvm_shared_ptr_ty]>; + def int_nvvm_cp_async_mbarrier_arrive_noinc : NVVMBuiltin, + Intrinsic<[],[llvm_ptr_ty]>; + def int_nvvm_cp_async_mbarrier_arrive_noinc_shared : NVVMBuiltin, + Intrinsic<[],[llvm_shared_ptr_ty]>; +} 
multiclass CP_ASYNC_SHARED_GLOBAL { def NAME : Intrinsic<[], [llvm_shared_ptr_ty, llvm_global_ptr_ty], @@ -1499,15 +1416,11 @@ def int_nvvm_mbarrier_pending_count : NVVMBuiltin, // Generated within nvvm. Use for ldu on sm_20 or later. Second arg is the // pointer's alignment. -def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty], - [llvm_anyptr_ty, llvm_i32_ty], - [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture>]>; -def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty], - [llvm_anyptr_ty, llvm_i32_ty], - [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture>]>; -def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty, llvm_i32_ty], - [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture>]>; +let IntrProperties = [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture>] in { + def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty], [llvm_anyptr_ty, llvm_i32_ty]>; + def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty], [llvm_anyptr_ty, llvm_i32_ty]>; + def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty, llvm_i32_ty]>; +} // Represents an explicit hole in the LLVM IR type system. It may be inserted by // the compiler in cases where a pointer is of the wrong type. 
In the backend @@ -1527,22 +1440,26 @@ def int_nvvm_internal_addrspace_wrap : // Move intrinsics, used in nvvm internally -def int_nvvm_move_i16 : Intrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem]>; -def int_nvvm_move_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_nvvm_move_i64 : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem]>; -def int_nvvm_move_float : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; -def int_nvvm_move_double : Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>; -def int_nvvm_move_ptr : Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty], [IntrNoMem, NoCapture>]>; +let IntrProperties = [IntrNoMem] in { + def int_nvvm_move_i16 : DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty]>; + def int_nvvm_move_i32 : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty]>; + def int_nvvm_move_i64 : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty]>; + def int_nvvm_move_float : DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + def int_nvvm_move_double : DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; + def int_nvvm_move_ptr : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty]>; +} // For getting the handle from a texture or surface variable -def int_nvvm_texsurf_handle - : Intrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyptr_ty], [IntrNoMem]>; -def int_nvvm_texsurf_handle_internal - : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty], [IntrNoMem]>; +let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + def int_nvvm_texsurf_handle + : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyptr_ty]>; + def int_nvvm_texsurf_handle_internal + : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; +} /// Error / Warn -def int_nvvm_compiler_error : Intrinsic<[], [llvm_anyptr_ty], []>; -def int_nvvm_compiler_warn : Intrinsic<[], [llvm_anyptr_ty], []>; +def int_nvvm_compiler_error : Intrinsic<[], [llvm_anyptr_ty]>; +def int_nvvm_compiler_warn : Intrinsic<[], [llvm_anyptr_ty]>; def 
int_nvvm_reflect : NVVMBuiltin, Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; @@ -1559,1792 +1476,163 @@ foreach i = 0...31 in DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef]>; +// // Texture Fetch -// texmode_independent -def int_nvvm_tex_1d_v4f32_s32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_tex_1d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty], []>; -def int_nvvm_tex_1d_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_1d_grad_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_1d_v4s32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_tex_1d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty], []>; -def int_nvvm_tex_1d_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_1d_grad_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_1d_v4u32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_tex_1d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty], []>; -def int_nvvm_tex_1d_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 
- [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_1d_grad_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; - -def int_nvvm_tex_1d_array_v4f32_s32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_tex_1d_array_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty], []>; -def int_nvvm_tex_1d_array_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_1d_array_grad_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_1d_array_v4s32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_tex_1d_array_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty], []>; -def int_nvvm_tex_1d_array_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_1d_array_grad_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_1d_array_v4u32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_tex_1d_array_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, 
llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty], []>; -def int_nvvm_tex_1d_array_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_1d_array_grad_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; - -def int_nvvm_tex_2d_v4f32_s32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_tex_2d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_2d_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_2d_grad_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_2d_v4s32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_tex_2d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_2d_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_2d_grad_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], 
[]>; -def int_nvvm_tex_2d_v4u32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_tex_2d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_2d_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_2d_grad_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - -def int_nvvm_tex_2d_array_v4f32_s32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty], []>; -def int_nvvm_tex_2d_array_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_2d_array_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_2d_array_grad_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_2d_array_v4s32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty], []>; -def int_nvvm_tex_2d_array_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty], []>; -def 
int_nvvm_tex_2d_array_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_2d_array_grad_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_2d_array_v4u32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty], []>; -def int_nvvm_tex_2d_array_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_2d_array_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_2d_array_grad_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; - -def int_nvvm_tex_3d_v4f32_s32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - []>; -def int_nvvm_tex_3d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_3d_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_3d_grad_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, 
llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_3d_v4s32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - []>; -def int_nvvm_tex_3d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_3d_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_3d_grad_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_3d_v4u32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - []>; -def int_nvvm_tex_3d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_3d_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_3d_grad_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - -def int_nvvm_tex_cube_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, - llvm_float_ty, 
llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_cube_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_cube_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_cube_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_cube_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_cube_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - -def int_nvvm_tex_cube_array_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_cube_array_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_cube_array_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_cube_array_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_cube_array_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - 
[llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_cube_array_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - -def int_nvvm_tld4_r_2d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_g_2d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_b_2d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_a_2d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_r_2d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_g_2d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_b_2d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_a_2d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_r_2d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_g_2d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, 
llvm_float_ty], []>; -def int_nvvm_tld4_b_2d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_a_2d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -// texmode_unified -def int_nvvm_tex_unified_1d_v4f32_s32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_tex_unified_1d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_grad_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_v4s32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_tex_unified_1d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_grad_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_v4u32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_tex_unified_1d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty], []>; -def 
int_nvvm_tex_unified_1d_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_grad_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; - -def int_nvvm_tex_unified_1d_array_v4f32_s32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_tex_unified_1d_array_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_array_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_array_grad_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_array_v4s32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_tex_unified_1d_array_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_array_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_array_grad_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_array_v4u32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def 
int_nvvm_tex_unified_1d_array_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_array_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_1d_array_grad_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; - -def int_nvvm_tex_unified_2d_v4f32_s32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_tex_unified_2d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_grad_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_v4s32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_tex_unified_2d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_grad_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, 
llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_v4u32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_tex_unified_2d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_grad_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - -def int_nvvm_tex_unified_2d_array_v4f32_s32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty], []>; -def int_nvvm_tex_unified_2d_array_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_array_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_array_grad_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_array_v4s32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty], []>; -def int_nvvm_tex_unified_2d_array_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty], []>; -def 
int_nvvm_tex_unified_2d_array_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_array_grad_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_array_v4u32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty], []>; -def int_nvvm_tex_unified_2d_array_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_array_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_2d_array_grad_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; - -def int_nvvm_tex_unified_3d_v4f32_s32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - []>; -def int_nvvm_tex_unified_3d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_3d_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_3d_grad_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, 
llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_3d_v4s32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - []>; -def int_nvvm_tex_unified_3d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_3d_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_3d_grad_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_3d_v4u32_s32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - []>; -def int_nvvm_tex_unified_3d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty], []>; -def int_nvvm_tex_unified_3d_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_3d_grad_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - -def int_nvvm_tex_unified_cube_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, 
llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_level_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - -def int_nvvm_tex_unified_cube_array_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_array_level_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_array_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_array_level_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_array_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_array_level_v4u32_f32 - : 
Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - -def int_nvvm_tex_unified_cube_grad_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_grad_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_grad_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - -def int_nvvm_tex_unified_cube_array_grad_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_array_grad_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tex_unified_cube_array_grad_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, - llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, - llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - -def int_nvvm_tld4_unified_r_2d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, 
llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_unified_g_2d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_unified_b_2d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_unified_a_2d_v4f32_f32 - : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_unified_r_2d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_unified_g_2d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_unified_b_2d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_unified_a_2d_v4s32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_unified_r_2d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_unified_g_2d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_unified_b_2d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; -def int_nvvm_tld4_unified_a_2d_v4u32_f32 - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_float_ty, llvm_float_ty], []>; +// +let IntrProperties = [IntrReadMem] in { + foreach is_unified = [true, false] in { + defvar mode = !if(is_unified, 
"_unified", ""); + defvar addr_args = !if(is_unified, [llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty]); + + foreach vec = [V4F32, V4S32, V4U32] in { + foreach is_array = [true, false] in { + defvar array = !if(is_array, "_array", ""); + defvar array_args = !if(is_array, [llvm_i32_ty], []); + + def int_nvvm_tex # mode # _1d # array # _ # vec.Name # _s32 + : Intrinsic; + def int_nvvm_tex # mode # _1d # array # _ # vec.Name # _f32 + : Intrinsic; + def int_nvvm_tex # mode # _1d # array # _level_ # vec.Name # _f32 + : Intrinsic; + def int_nvvm_tex # mode # _1d # array # _grad_ # vec.Name # _f32 + : Intrinsic; + + def int_nvvm_tex # mode # _2d # array # _ # vec.Name # _s32 + : Intrinsic; + def int_nvvm_tex # mode # _2d # array # _ # vec.Name # _f32 + : Intrinsic; + def int_nvvm_tex # mode # _2d # array # _level_ # vec.Name # _f32 + : Intrinsic; + def int_nvvm_tex # mode # _2d # array # _grad_ # vec.Name # _f32 + : Intrinsic; + + if !not(is_array) then { + def int_nvvm_tex # mode # _3d_ # vec.Name # _s32 + : Intrinsic; + def int_nvvm_tex # mode # _3d_ # vec.Name # _f32 + : Intrinsic; + def int_nvvm_tex # mode # _3d_level_ # vec.Name # _f32 + : Intrinsic; + def int_nvvm_tex # mode # _3d_grad_ # vec.Name # _f32 + : Intrinsic; + } + + def int_nvvm_tex # mode # _cube # array # _ # vec.Name # _f32 + : Intrinsic; + def int_nvvm_tex # mode # _cube # array # _level_ # vec.Name # _f32 + : Intrinsic; + + if is_unified then + def int_nvvm_tex # mode # _cube # array # _grad_ # vec.Name # _f32 + : Intrinsic; + } // is_array + + foreach comp = ["r", "g", "b", "a"] in { + def int_nvvm_tld4 # mode # _ # comp # _2d_ # vec.Name # _f32 + : Intrinsic; + } // comp + } // vec + } // is_unified +} // IntrProperties = [IntrReadMem] + //=== Surface Load -// .clamp variants -def int_nvvm_suld_1d_i8_clamp - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_i16_clamp - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_i32_clamp - : 
Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_i64_clamp - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v2i8_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v2i16_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v2i32_clamp - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v2i64_clamp - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v4i8_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v4i16_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v4i32_clamp - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], []>; - -def int_nvvm_suld_1d_array_i8_clamp - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_i16_clamp - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_i32_clamp - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_i64_clamp - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v2i8_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v2i16_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v2i32_clamp - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v2i64_clamp - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def 
int_nvvm_suld_1d_array_v4i8_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v4i16_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v4i32_clamp - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; - -def int_nvvm_suld_2d_i8_clamp - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_i16_clamp - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_i32_clamp - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_i64_clamp - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v2i8_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v2i16_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v2i32_clamp - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v2i64_clamp - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v4i8_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v4i16_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v4i32_clamp - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; - -def int_nvvm_suld_2d_array_i8_clamp - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def 
int_nvvm_suld_2d_array_i16_clamp - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_i32_clamp - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_i64_clamp - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v2i8_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v2i16_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v2i32_clamp - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v2i64_clamp - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v4i8_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v4i16_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v4i32_clamp - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; - -def int_nvvm_suld_3d_i8_clamp - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_i16_clamp - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_i32_clamp - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_i64_clamp - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v2i8_clamp - : 
Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v2i16_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v2i32_clamp - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v2i64_clamp - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v4i8_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v4i16_clamp - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v4i32_clamp - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; - -// .trap variants -def int_nvvm_suld_1d_i8_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_i16_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_i32_trap - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_i64_trap - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v2i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v2i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v2i32_trap - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v2i64_trap - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v4i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def 
int_nvvm_suld_1d_v4i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v4i32_trap - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], []>; - -def int_nvvm_suld_1d_array_i8_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_i16_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_i32_trap - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_i64_trap - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v2i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v2i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v2i32_trap - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v2i64_trap - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v4i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v4i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v4i32_trap - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; - -def int_nvvm_suld_2d_i8_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_i16_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_i32_trap - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, 
llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_i64_trap - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v2i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v2i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v2i32_trap - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v2i64_trap - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v4i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v4i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v4i32_trap - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; - -def int_nvvm_suld_2d_array_i8_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_i16_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_i32_trap - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_i64_trap - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v2i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v2i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v2i32_trap - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, 
llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v2i64_trap - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v4i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v4i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v4i32_trap - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; - -def int_nvvm_suld_3d_i8_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_i16_trap - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_i32_trap - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_i64_trap - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v2i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v2i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v2i32_trap - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v2i64_trap - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v4i8_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v4i16_trap - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, 
llvm_i32_ty], []>; -def int_nvvm_suld_3d_v4i32_trap - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; - -// .zero variants -def int_nvvm_suld_1d_i8_zero - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_i16_zero - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_i32_zero - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_i64_zero - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v2i8_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v2i16_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v2i32_zero - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v2i64_zero - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v4i8_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v4i16_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_v4i32_zero - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty], []>; - -def int_nvvm_suld_1d_array_i8_zero - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_i16_zero - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_i32_zero - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_i64_zero - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v2i8_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, 
llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v2i16_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v2i32_zero - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v2i64_zero - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v4i8_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v4i16_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_1d_array_v4i32_zero - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; - -def int_nvvm_suld_2d_i8_zero - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_i16_zero - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_i32_zero - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_i64_zero - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v2i8_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v2i16_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v2i32_zero - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v2i64_zero - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v4i8_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v4i16_zero - : 
Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_v4i32_zero - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; - -def int_nvvm_suld_2d_array_i8_zero - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_i16_zero - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_i32_zero - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_i64_zero - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v2i8_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v2i16_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v2i32_zero - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v2i64_zero - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v4i8_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v4i16_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_2d_array_v4i32_zero - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; - -def int_nvvm_suld_3d_i8_zero - : Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_i16_zero - : 
Intrinsic<[llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_i32_zero - : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_i64_zero - : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v2i8_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v2i16_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v2i32_zero - : Intrinsic<[llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v2i64_zero - : Intrinsic<[llvm_i64_ty, llvm_i64_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v4i8_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v4i16_zero - : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_suld_3d_v4i32_zero - : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; +let IntrProperties = [IntrReadMem] in { + foreach clamp = ["clamp", "trap", "zero"] in { + foreach vec = [TV_I8, TV_I16, TV_I32, TV_I64, + TV_V2I8, TV_V2I16, TV_V2I32, TV_V2I64, + TV_V4I8, TV_V4I16, TV_V4I32] in { + + def int_nvvm_suld_1d_ # vec.Name # _ # clamp + : Intrinsic; + + def int_nvvm_suld_1d_array_ # vec.Name # _ # clamp + : Intrinsic; + + def int_nvvm_suld_2d_ # vec.Name # _ # clamp + : Intrinsic; + + def int_nvvm_suld_2d_array_ # vec.Name # _ # clamp + : Intrinsic; + + def int_nvvm_suld_3d_ # vec.Name # _ # clamp + : Intrinsic; + } // vec + } // clamp +} // IntrProperties = [IntrReadMem] //===- Texture Query 
------------------------------------------------------===// foreach query = ["channel_order", "channel_data_type", "width", "height", - "depth", "array_size", "num_samples", "num_mipmap_levels"] in { + "depth", "array_size", "num_samples", "num_mipmap_levels"] in def int_nvvm_txq_ # query : NVVMBuiltin, Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; -} //===- Surface Query ------------------------------------------------------===// -foreach query = ["channel_order", "channel_data_type", "width", "height", - "depth", "array_size"] in { +foreach query = ["channel_order", "channel_data_type", "width", "height", + "depth", "array_size"] in def int_nvvm_suq_ # query : NVVMBuiltin, Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; -} //===- Handle Query -------------------------------------------------------===// -foreach type = ["sampler", "surface", "texture"] in { +foreach type = ["sampler", "surface", "texture"] in def int_nvvm_istypep_ # type : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem]>; -} //===- Surface Stores -----------------------------------------------------===// +multiclass SurfaceStoreIntrinsics { + def _1d_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty], vec.Types)>; + + def _1d_array_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], vec.Types)>; + + def _2d_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], vec.Types)>; + + def _2d_array_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], vec.Types)>; + + def _3d_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], vec.Types)>; +} + // Unformatted -// .clamp variant -def int_nvvm_sust_b_1d_i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], []>; -def 
int_nvvm_sust_b_1d_i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_i64_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_1d_v2i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_v2i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_v2i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_v2i64_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_1d_v4i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_v4i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_v4i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_array_i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_array_i64_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_1d_array_v2i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_v2i16_clamp 
: NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_v2i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_array_v2i64_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_1d_array_v4i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_v4i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_v4i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_i64_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_2d_v2i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_v2i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_v2i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_v2i64_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i64_ty, llvm_i64_ty], []>; -def 
int_nvvm_sust_b_2d_v4i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_v4i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_v4i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_array_i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_array_i64_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_2d_array_v2i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_v2i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_v2i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_array_v2i64_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_2d_array_v4i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_v4i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, 
llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_v4i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_3d_i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_3d_i64_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_3d_v2i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_v2i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_v2i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_3d_v2i64_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_3d_v4i8_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_v4i16_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_v4i32_clamp : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -// .trap variant 
-def int_nvvm_sust_b_1d_i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_i64_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_1d_v2i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_v2i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_v2i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_v2i64_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_1d_v4i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_v4i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_v4i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_array_i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_array_i64_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_1d_array_v2i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, 
llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_v2i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_v2i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_array_v2i64_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_1d_array_v4i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_v4i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_v4i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_i64_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_2d_v2i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_v2i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_v2i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_v2i64_trap : NVVMBuiltin, - Intrinsic<[], 
[llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_2d_v4i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_v4i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_v4i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_array_i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_array_i64_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_2d_array_v2i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_v2i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_v2i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_array_v2i64_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_2d_array_v4i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def 
int_nvvm_sust_b_2d_array_v4i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_v4i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_3d_i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_3d_i64_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_3d_v2i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_v2i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_v2i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_3d_v2i64_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_3d_v4i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_v4i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_v4i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, 
llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -// .zero variant -def int_nvvm_sust_b_1d_i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_i64_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_1d_v2i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_v2i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_v2i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_v2i64_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_1d_v4i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_v4i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_v4i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_array_i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_array_i64_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], 
[]>; -def int_nvvm_sust_b_1d_array_v2i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_v2i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_v2i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_1d_array_v2i64_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_1d_array_v4i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_v4i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_1d_array_v4i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_i64_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_2d_v2i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_v2i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_v2i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, 
llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_v2i64_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_2d_v4i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_v4i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_v4i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_array_i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_array_i64_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_2d_array_v2i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_v2i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_v2i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_2d_array_v2i64_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_2d_array_v4i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - 
llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_v4i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_2d_array_v4i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_3d_i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_3d_i64_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_3d_v2i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_v2i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_v2i32_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_b_3d_v2i64_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i64_ty, llvm_i64_ty], []>; -def int_nvvm_sust_b_3d_v4i8_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_v4i16_zero : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_b_3d_v4i32_zero : NVVMBuiltin, - 
Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; +foreach clamp = ["clamp", "trap", "zero"] in + foreach vec = [TV_I8, TV_I16, TV_I32, TV_I64, + TV_V2I8, TV_V2I16, TV_V2I32, TV_V2I64, + TV_V4I8, TV_V4I16, TV_V4I32] in + defm int_nvvm_sust_b : SurfaceStoreIntrinsics; // Formatted - -def int_nvvm_sust_p_1d_i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_1d_i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_1d_i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_1d_v2i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_1d_v2i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_1d_v2i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_1d_v4i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_1d_v4i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_1d_v4i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_1d_array_i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_1d_array_i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_1d_array_i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_1d_array_v2i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, 
llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_1d_array_v2i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_1d_array_v2i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_1d_array_v4i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_1d_array_v4i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_1d_array_v4i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_2d_i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_2d_i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_2d_i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_2d_v2i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_2d_v2i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_2d_v2i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_2d_v4i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_2d_v4i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def 
int_nvvm_sust_p_2d_v4i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_2d_array_i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_2d_array_i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_2d_array_i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_2d_array_v2i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_2d_array_v2i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_2d_array_v2i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_2d_array_v4i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_2d_array_v4i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_2d_array_v4i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_3d_i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_3d_i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_3d_i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, 
llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_3d_v2i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_3d_v2i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_3d_v2i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty], []>; -def int_nvvm_sust_p_3d_v4i8_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_3d_v4i16_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], []>; -def int_nvvm_sust_p_3d_v4i32_trap : NVVMBuiltin, - Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, - llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; +foreach vec = [TV_I8, TV_I16, TV_I32, + TV_V2I8, TV_V2I16, TV_V2I32, + TV_V4I8, TV_V4I16, TV_V4I32] in + defm int_nvvm_sust_p : SurfaceStoreIntrinsics<"trap", vec>; // Accessing special registers. @@ -3466,19 +1754,16 @@ def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32; // SHUFFLE // // Generate intrinsics for all variants of shfl instruction. 
-foreach sync = [false, true] in { - foreach mode = ["up", "down", "bfly", "idx"] in { - foreach type = ["i32", "f32"] in { - foreach return_pred = [false, true] in { - defvar i = SHFL_INFO; - if i.withGccBuiltin then { - def i.Name : NVVMBuiltin, - Intrinsic; - } else { - def i.Name : - Intrinsic; +let IntrProperties = [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback] in { + foreach sync = [false, true] in { + foreach mode = ["up", "down", "bfly", "idx"] in { + foreach type = ["i32", "f32"] in { + foreach return_pred = [false, true] in { + defvar i = SHFL_INFO; + if i.withGccBuiltin then + def i.Name : NVVMBuiltin, Intrinsic; + else + def i.Name : Intrinsic; } } } @@ -3489,43 +1774,21 @@ foreach sync = [false, true] in { // VOTE // -// vote.all pred -def int_nvvm_vote_all : NVVMBuiltin, - Intrinsic<[llvm_i1_ty], [llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback]>; -// vote.any pred -def int_nvvm_vote_any : NVVMBuiltin, - Intrinsic<[llvm_i1_ty], [llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback]>; -// vote.uni pred -def int_nvvm_vote_uni : NVVMBuiltin, - Intrinsic<[llvm_i1_ty], [llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback]>; -// vote.ballot pred -def int_nvvm_vote_ballot : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback]>; - +let IntrProperties = [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback] in { + def int_nvvm_vote_all : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i1_ty]>; + def int_nvvm_vote_any : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i1_ty]>; + def int_nvvm_vote_uni : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i1_ty]>; + def int_nvvm_vote_ballot : NVVMBuiltin, Intrinsic<[llvm_i32_ty], [llvm_i1_ty]>; +} // // VOTE.SYNC // - -// vote.sync.all mask, pred -def int_nvvm_vote_all_sync : NVVMBuiltin, - Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent, 
IntrNoCallback]>; -// vote.sync.any mask, pred -def int_nvvm_vote_any_sync : NVVMBuiltin, - Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback]>; -// vote.sync.uni mask, pred -def int_nvvm_vote_uni_sync : NVVMBuiltin, - Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback]>; -// vote.sync.ballot mask, pred -def int_nvvm_vote_ballot_sync : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i1_ty], - [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback]>; +let IntrProperties = [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback] in { + def int_nvvm_vote_all_sync : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty]>; + def int_nvvm_vote_any_sync : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty]>; + def int_nvvm_vote_uni_sync : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty]>; + def int_nvvm_vote_ballot_sync : NVVMBuiltin, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i1_ty]>; +} // // ACTIVEMASK @@ -3537,28 +1800,25 @@ def int_nvvm_activemask : NVVMBuiltin, // // MATCH.SYNC // -// match.any.sync.b32 mask, value -def int_nvvm_match_any_sync_i32 : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback]>; -// match.any.sync.b64 mask, value -def int_nvvm_match_any_sync_i64 : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback]>; - -// match.all instruction have two variants -- one returns a single value, another -// returns a pair {value, predicate}. We currently only implement the latter as -// that's the variant exposed by CUDA API. 
- -// match.all.sync.b32p mask, value -def int_nvvm_match_all_sync_i32p : - Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback]>; -// match.all.sync.b64p mask, value -def int_nvvm_match_all_sync_i64p : - Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback]>; - +let IntrProperties = [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback] in { + // match.any.sync.b32 mask, value + def int_nvvm_match_any_sync_i32 : NVVMBuiltin, + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>; + // match.any.sync.b64 mask, value + def int_nvvm_match_any_sync_i64 : NVVMBuiltin, + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty]>; + + // match.all instruction have two variants -- one returns a single value, another + // returns a pair {value, predicate}. We currently only implement the latter as + // that's the variant exposed by CUDA API. + + // match.all.sync.b32p mask, value + def int_nvvm_match_all_sync_i32p : + Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i32_ty]>; + // match.all.sync.b64p mask, value + def int_nvvm_match_all_sync_i64p : + Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty]>; +} // // ELECT.SYNC // @@ -3572,21 +1832,17 @@ def int_nvvm_elect_sync : // // redux.sync.op.u32 dst, src, membermask; -foreach op = ["umin", "umax", "add", "min", "max", "and", "xor", "or"] in { - def int_nvvm_redux_sync_ # op : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>; -} - -// redux.sync.op.{abs}.{NaN}.f32 dst, src, membermask; -foreach binOp = ["min", "max"] in { - foreach abs = ["", "_abs"] in { - foreach NaN = ["", "_NaN"] in { - def int_nvvm_redux_sync_f # binOp # abs # NaN : NVVMBuiltin, - Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty], - [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>; - } - } +let 
IntrProperties = [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback] in { + foreach op = ["umin", "umax", "add", "min", "max", "and", "xor", "or"] in + def int_nvvm_redux_sync_ # op : NVVMBuiltin, + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>; + + // redux.sync.op.{abs}.{NaN}.f32 dst, src, membermask; + foreach binOp = ["min", "max"] in + foreach abs = ["", "_abs"] in + foreach NaN = ["", "_NaN"] in + def int_nvvm_redux_sync_f # binOp # abs # NaN : NVVMBuiltin, + Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty]>; } // @@ -3638,13 +1894,10 @@ foreach layout = ["row", "col"] in { } // WMMA.MMA -class NVVM_WMMA_MMA +class NVVM_MMA : Intrinsic.llvm>; + [IntrNoMem, IntrNoCallback]>; foreach layout_a = ["row", "col"] in { foreach layout_b = ["row", "col"] in { @@ -3655,8 +1908,7 @@ foreach layout_a = ["row", "col"] in { if NVVM_WMMA_SUPPORTED.ret then { def WMMA_NAME.record - : NVVM_WMMA_MMA; + : NVVM_MMA; } } // b1op } // op @@ -3665,14 +1917,6 @@ foreach layout_a = ["row", "col"] in { } // layout_b } // layout_a -// MMA -class NVVM_MMA - : Intrinsic.llvm>; - foreach layout_a = ["row", "col"] in { foreach layout_b = ["row", "col"] in { foreach satf = [0, 1] in { @@ -3680,7 +1924,7 @@ foreach layout_a = ["row", "col"] in { foreach b1op = NVVM_MMA_B1OPS.ret in { if NVVM_MMA_SUPPORTED.ret then { def MMA_NAME.record - : NVVM_MMA; + : NVVM_MMA; } } // b1op } // op @@ -3704,18 +1948,22 @@ foreach transposed = [0, 1] in { } } -def int_nvvm_mapa - : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, NoCapture>]>; -def int_nvvm_mapa_shared_cluster - : DefaultAttrsIntrinsic<[llvm_shared_cluster_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, NoCapture>]>; -def int_nvvm_getctarank - : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_ptr_ty], - [IntrNoMem, IntrSpeculatable, NoCapture>]>; -def int_nvvm_getctarank_shared_cluster - : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_shared_ptr_ty], - 
[IntrNoMem, IntrSpeculatable, NoCapture>]>; +// MAPA +let IntrProperties = [IntrNoMem, IntrSpeculatable, NoCapture>] in { + def int_nvvm_mapa + : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i32_ty]>; + def int_nvvm_mapa_shared_cluster + : DefaultAttrsIntrinsic<[llvm_shared_cluster_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty]>; +} + +// GETCTARANK +let IntrProperties = [IntrNoMem, IntrSpeculatable, NoCapture>] in { + def int_nvvm_getctarank + : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_ptr_ty]>; + def int_nvvm_getctarank_shared_cluster + : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_shared_ptr_ty]>; +} + def int_nvvm_is_explicit_cluster : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef], @@ -3731,87 +1979,123 @@ foreach op = ["dec", "inc"] in def int_nvvm_exit : NVVMBuiltin, Intrinsic<[], [], [IntrConvergent, IntrInaccessibleMemOnly, IntrNoReturn]>; +class DefaultAttrsIntrinsicFlags ret_types, + list param_types, + list flags, + list intr_properties> + : DefaultAttrsIntrinsic< + ret_types, + !listconcat(param_types, flags), + !listconcat(intr_properties, + !foreach(i, !range(flags), + ImmArg>))>; + // Intrinsics for Tensor Copy using TMA // G2S -> From Global to Shared memory variants // S2G -> From Shared to Global memory variants -foreach dim = [1, 2, 3, 4, 5] in { +foreach dim = 1...5 in { + defvar tensor_dim_args = !listsplat(llvm_i32_ty, dim); + foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { - foreach g2s = [CP_ASYNC_BULK_TENSOR_G2S_INTR] in - def g2s.Name : DefaultAttrsIntrinsic<[], g2s.ArgsTy, g2s.IntrProp>; - foreach s2g = [CP_ASYNC_BULK_TENSOR_S2G_INTR] in - def s2g.Name : DefaultAttrsIntrinsic<[], s2g.ArgsTy, s2g.IntrProp>; - foreach prefetch = [CP_ASYNC_BULK_TENSOR_PREFETCH_INTR] in - def prefetch.Name : DefaultAttrsIntrinsic<[], prefetch.ArgsTy, prefetch.IntrProp>; + defvar is_im2col = !eq(mode, "im2col"); + defvar num_im2col_offsets = !if(is_im2col, !add(dim, -2), 0); + defvar 
im2col_offsets_args = !listsplat(llvm_i16_ty, num_im2col_offsets); + + def int_nvvm_cp_async_bulk_tensor_g2s_ # mode # _ # dim # d : + DefaultAttrsIntrinsicFlags<[], + !listconcat([llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr + llvm_shared_ptr_ty, // mbarrier_smem_ptr + llvm_ptr_ty], // tensormap_ptr + tensor_dim_args, // actual tensor dims + im2col_offsets_args, // im2col offsets + [llvm_i16_ty, // cta_mask + llvm_i64_ty]), // cache_hint + [llvm_i1_ty, // Flag for cta_mask + llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, + WriteOnly>, ReadOnly>, + NoCapture>, NoCapture>, NoCapture>]>; + + def int_nvvm_cp_async_bulk_tensor_s2g_ # mode # _ # dim # d : + DefaultAttrsIntrinsicFlags<[], + !listconcat([llvm_shared_ptr_ty, // src_smem_ptr + llvm_ptr_ty], // tensormap_ptr + tensor_dim_args, // actual tensor dims + [llvm_i64_ty]), // cache_hint + [llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, + ReadOnly>, ReadOnly>, + NoCapture>, NoCapture>]>; + + def int_nvvm_cp_async_bulk_tensor_prefetch_ # mode # _ # dim # d : + DefaultAttrsIntrinsicFlags<[], + !listconcat([llvm_ptr_ty], // tensormap_ptr + tensor_dim_args, // actual tensor dims + im2col_offsets_args, // im2col offsets + [llvm_i64_ty]), // cache_hint + [llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, + ReadOnly>, NoCapture>]>; + + // Intrinsics for TMA Copy with reduction + foreach red_op = ["add", "min", "max", "inc", "dec", "and", "or", "xor"] in + def int_nvvm_cp_async_bulk_tensor_reduce_ # red_op # _ # mode # _ # dim # d : + DefaultAttrsIntrinsicFlags<[], + !listconcat([llvm_shared_ptr_ty, // src_smem_ptr + llvm_ptr_ty], // tensormap_ptr + tensor_dim_args, // actual tensor dims + [llvm_i64_ty]), // cache_hint + [llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, ReadOnly>, ReadOnly>, + NoCapture>, NoCapture>]>; } } -// Intrinsics for TMA Copy with reduction -foreach dim = [1, 2, 3, 4, 5] in { - foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { - foreach red_op = 
["add", "min", "max", "inc", "dec", "and", "or", "xor"] in { - foreach reduce = [CP_ASYNC_BULK_TENSOR_REDUCE_INTR] in - def reduce.Name : DefaultAttrsIntrinsic<[], reduce.ArgsTy, reduce.IntrProp>; - } +// Intrinsics for Prefetch and Prefetchu +let IntrProperties = [IntrArgMemOnly, ReadOnly>, NoCapture>] in { + foreach level = ["L1", "L2"] in { + def int_nvvm_prefetch_ # level : Intrinsic<[], [llvm_ptr_ty]>; + def int_nvvm_prefetch_global_ # level : Intrinsic<[], [llvm_global_ptr_ty]>; + def int_nvvm_prefetch_local_ # level : Intrinsic<[], [llvm_local_ptr_ty]>; } + + foreach eviction_priority = ["evict_normal", "evict_last"] in + def int_nvvm_prefetch_global_L2_ # eviction_priority : Intrinsic<[], [llvm_global_ptr_ty]>; + + def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty]>; } -// Intrinsics for Prefetch and Prefetchu -def int_nvvm_prefetch_L1 : Intrinsic<[], [llvm_ptr_ty], - [IntrArgMemOnly, ReadOnly>, NoCapture>]>; -def int_nvvm_prefetch_L2 : Intrinsic<[], [llvm_ptr_ty], - [IntrArgMemOnly, ReadOnly>, NoCapture>]>; -def int_nvvm_prefetch_global_L1 : Intrinsic<[], [llvm_global_ptr_ty], - [IntrArgMemOnly, ReadOnly>, NoCapture>]>; -def int_nvvm_prefetch_global_L2 : Intrinsic<[], [llvm_global_ptr_ty], - [IntrArgMemOnly, ReadOnly>, NoCapture>]>; -def int_nvvm_prefetch_local_L1 : Intrinsic<[], [llvm_local_ptr_ty], - [IntrArgMemOnly, ReadOnly>, NoCapture>]>; -def int_nvvm_prefetch_local_L2 : Intrinsic<[], [llvm_local_ptr_ty], - [IntrArgMemOnly, ReadOnly>, NoCapture>]>; - -def int_nvvm_prefetch_global_L2_evict_normal : Intrinsic<[], [llvm_global_ptr_ty], - [IntrArgMemOnly, ReadOnly>, NoCapture>]>; -def int_nvvm_prefetch_global_L2_evict_last : Intrinsic<[], [llvm_global_ptr_ty], - [IntrArgMemOnly, ReadOnly>, NoCapture>]>; -def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty], - [IntrArgMemOnly, ReadOnly>, NoCapture>]>; - -def int_nvvm_applypriority_global_L2_evict_normal - : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty, llvm_i64_ty], - [IntrArgMemOnly, ReadOnly>, 
NoCapture>, - ImmArg>]>; - -def int_nvvm_applypriority_L2_evict_normal - : DefaultAttrsIntrinsic<[], [llvm_ptr_ty, llvm_i64_ty], - [IntrArgMemOnly, ReadOnly>, NoCapture>, - ImmArg>]>; - -// Intrinsics for discard -def int_nvvm_discard_global_L2 : DefaultAttrsIntrinsic<[], - [llvm_global_ptr_ty, llvm_i64_ty], [NoCapture>, - ImmArg>, IntrHasSideEffects]>; - -def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[], - [llvm_ptr_ty, llvm_i64_ty], [NoCapture>, - ImmArg>, IntrHasSideEffects]>; +// applypriority +let IntrProperties = [IntrArgMemOnly, ReadOnly>, NoCapture>, + ImmArg>] in { + def int_nvvm_applypriority_global_L2_evict_normal + : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty, llvm_i64_ty]>; + + def int_nvvm_applypriority_L2_evict_normal + : DefaultAttrsIntrinsic<[], [llvm_ptr_ty, llvm_i64_ty]>; +} + +// discard +let IntrProperties = [NoCapture>, ImmArg>, IntrHasSideEffects] in { + def int_nvvm_discard_global_L2 : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty, llvm_i64_ty]>; + def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[], [llvm_ptr_ty, llvm_i64_ty]>; +} // Intrinsics for Bulk Copy using TMA (non-tensor) // From Global to Shared Cluster def int_nvvm_cp_async_bulk_global_to_shared_cluster - : DefaultAttrsIntrinsic<[], + : DefaultAttrsIntrinsicFlags<[], [llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr llvm_shared_ptr_ty, // mbarrier_ptr llvm_global_ptr_ty, // src_gmem_ptr llvm_i32_ty, // copy_size llvm_i16_ty, // cta_mask - llvm_i64_ty, // cache_hint - llvm_i1_ty, // Flag for cta_mask + llvm_i64_ty], // cache_hint + [llvm_i1_ty, // Flag for cta_mask llvm_i1_ty], // Flag for cache_hint [IntrConvergent, IntrArgMemOnly, WriteOnly>, ReadOnly>, - NoCapture>, NoCapture>, - NoCapture>, ImmArg>, - ImmArg>]>; + NoCapture>, NoCapture>, NoCapture>]>; // From Shared CTA to Shared Cluster def int_nvvm_cp_async_bulk_shared_cta_to_cluster @@ -3827,27 +2111,38 @@ def int_nvvm_cp_async_bulk_shared_cta_to_cluster // From Shared CTA to Global memory def 
int_nvvm_cp_async_bulk_shared_cta_to_global + : DefaultAttrsIntrinsicFlags<[], + [llvm_global_ptr_ty, // dst_gmem_ptr + llvm_shared_ptr_ty, // src_smem_ptr + llvm_i32_ty, // copy_size + llvm_i64_ty], // cache_hint + [llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, IntrArgMemOnly, + WriteOnly>, ReadOnly>, + NoCapture>, NoCapture>]>; + +// From Shared CTA to Global memory with bytemask +def int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty, // dst_gmem_ptr llvm_shared_ptr_ty, // src_smem_ptr llvm_i32_ty, // copy_size llvm_i64_ty, // cache_hint - llvm_i1_ty], // Flag for cache_hint + llvm_i1_ty, // Flag for cache_hint + llvm_i16_ty], // byte_mask [IntrConvergent, IntrArgMemOnly, WriteOnly>, ReadOnly>, - NoCapture>, NoCapture>, ImmArg>]>; // Intrinsics for Bulk Copy Prefetch L2 def int_nvvm_cp_async_bulk_prefetch_L2 - : DefaultAttrsIntrinsic<[], + : DefaultAttrsIntrinsicFlags<[], [llvm_global_ptr_ty, // src_gmem_ptr llvm_i32_ty, // copy_size - llvm_i64_ty, // cache_hint - llvm_i1_ty], // Flag for cache_hint + llvm_i64_ty], // cache_hint + [llvm_i1_ty], // Flag for cache_hint [IntrConvergent, IntrArgMemOnly, - NoCapture>, ReadOnly>, - ImmArg>]>; + NoCapture>, ReadOnly>]>; def int_nvvm_griddepcontrol_launch_dependents : Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; def int_nvvm_griddepcontrol_wait : Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; @@ -3946,8 +2241,7 @@ class NVVM_TCGEN05_LD : !listconcat([IntrConvergent, IntrArgMemOnly, NoCapture>], !if(!eq(Shape, "16x32bx2"), [ImmArg>, ImmArg>], - [ImmArg>])), - NVVM_TCGEN05_LDST_NAME<"ld", Shape, Num>.intr>; + [ImmArg>]))>; // Tcgen05 st intrinsics class NVVM_TCGEN05_ST : @@ -3959,32 +2253,28 @@ class NVVM_TCGEN05_ST : !listconcat([IntrConvergent, IntrArgMemOnly, NoCapture>], !if(!eq(Shape, "16x32bx2"), [ImmArg>, ImmArg>], - [ImmArg>])), - NVVM_TCGEN05_LDST_NAME<"st", Shape, Num>.intr>; + [ImmArg>]))>; foreach shape = ["16x64b", "16x128b", "16x256b", 
"32x32b", "16x32bx2"] in { - foreach num = !range(0, 8) in { + foreach num = 0...8 in { if NVVM_TCGEN05_LDST_ACCESS_SIZE.valid then { - def NVVM_TCGEN05_LDST_NAME<"ld", shape, num>.record : + def int_nvvm_tcgen05_ld_ # shape # _x # !shl(1, num) : NVVM_TCGEN05_LD; - def NVVM_TCGEN05_LDST_NAME<"st", shape, num>.record : + def int_nvvm_tcgen05_st_ # shape # _x # !shl(1, num) : NVVM_TCGEN05_ST; - } + } } } // // Bulk store intrinsics // +let IntrProperties = [IntrArgMemOnly, IntrWriteMem, WriteOnly>, + NoCapture>, ImmArg>] in { + def int_nvvm_st_bulk : + DefaultAttrsIntrinsic<[], [llvm_ptr_ty, llvm_i64_ty, llvm_i64_ty]>; -def int_nvvm_st_bulk : DefaultAttrsIntrinsic<[], - [llvm_ptr_ty, llvm_i64_ty, llvm_i64_ty], - [IntrArgMemOnly, IntrWriteMem, - WriteOnly>, NoCapture>, ImmArg>]>; - -def int_nvvm_st_bulk_shared_cta : DefaultAttrsIntrinsic<[], - [llvm_shared_ptr_ty, llvm_i64_ty, llvm_i64_ty], - [IntrArgMemOnly, IntrWriteMem, - WriteOnly>, NoCapture>, ImmArg>]>; - + def int_nvvm_st_bulk_shared_cta : + DefaultAttrsIntrinsic<[], [llvm_shared_ptr_ty, llvm_i64_ty, llvm_i64_ty]>; +} } // let TargetPrefix = "nvvm" diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index 18b2883eb00e7..622a96cafb128 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -1886,8 +1886,14 @@ let TargetPrefix = "riscv" in { def int_riscv_vsm3me : RISCVBinaryAAXUnMasked; } // TargetPrefix = "riscv" +// Zihintpause extensions +//===----------------------------------------------------------------------===// +let TargetPrefix = "riscv" in +def int_riscv_pause : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; + // Vendor extensions //===----------------------------------------------------------------------===// include "llvm/IR/IntrinsicsRISCVXTHead.td" include "llvm/IR/IntrinsicsRISCVXsf.td" include "llvm/IR/IntrinsicsRISCVXCV.td" +include "llvm/IR/IntrinsicsRISCVXAndes.td" diff --git 
a/llvm/include/llvm/IR/IntrinsicsRISCVXAndes.td b/llvm/include/llvm/IR/IntrinsicsRISCVXAndes.td new file mode 100644 index 0000000000000..d90fe2cd0e6f3 --- /dev/null +++ b/llvm/include/llvm/IR/IntrinsicsRISCVXAndes.td @@ -0,0 +1,17 @@ +//===- IntrinsicsRISCVXAndes.td - Andes intrinsics ---------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the Andes vendor intrinsics for RISC-V. +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "riscv" in { + // Andes Vector Packed FP16 Extension + defm nds_vfpmadt : RISCVBinaryAAXRoundingMode; + defm nds_vfpmadb : RISCVBinaryAAXRoundingMode; +} diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 404467781b4d0..8d984d6ce58df 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -119,6 +119,11 @@ let TargetPrefix = "spv" in { [llvm_any_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], [IntrNoMem]>; + def int_spv_resource_handlefromimplicitbinding + : DefaultAttrsIntrinsic< + [llvm_any_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], + [IntrNoMem]>; def int_spv_firstbituhigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; def int_spv_firstbitshigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h index 22ab59be55eb2..3d06edeed6c46 100644 --- a/llvm/include/llvm/IR/Metadata.h +++ b/llvm/include/llvm/IR/Metadata.h @@ -1076,8 +1076,8 @@ class MDNode : public 
Metadata { /// Explicity set alignment because bitfields by default have an /// alignment of 1 on z/OS. struct alignas(alignof(size_t)) Header { - bool IsResizable : 1; - bool IsLarge : 1; + size_t IsResizable : 1; + size_t IsLarge : 1; size_t SmallSize : 4; size_t SmallNumOps : 4; size_t : sizeof(size_t) * CHAR_BIT - 10; diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 5080fa235905d..65e428a3adea7 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -28,6 +28,7 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Module.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/InterleavedRange.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/ScaledNumber.h" @@ -72,7 +73,8 @@ struct CalleeInfo { uint32_t Hotness : 3; // True if at least one of the calls to the callee is a tail call. - bool HasTailCall : 1; + LLVM_PREFERRED_TYPE(bool) + uint32_t HasTailCall : 1; /// The value stored in RelBlockFreq has to be interpreted as the digits of /// a scaled number with a scale of \p -ScaleShift. 
diff --git a/llvm/include/llvm/IR/User.h b/llvm/include/llvm/IR/User.h index 39e1314bd8130..25ca8d744a591 100644 --- a/llvm/include/llvm/IR/User.h +++ b/llvm/include/llvm/IR/User.h @@ -79,8 +79,10 @@ class User : public Value { struct AllocInfo { public: const unsigned NumOps : NumUserOperandsBits; - const bool HasHungOffUses : 1; - const bool HasDescriptor : 1; + LLVM_PREFERRED_TYPE(bool) + const unsigned HasHungOffUses : 1; + LLVM_PREFERRED_TYPE(bool) + const unsigned HasDescriptor : 1; AllocInfo() = delete; diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 0e58caf6478a4..42610d505c2bd 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -155,7 +155,7 @@ void initializeLazyValueInfoWrapperPassPass(PassRegistry &); void initializeLegacyLICMPassPass(PassRegistry &); void initializeLegalizerPass(PassRegistry &); void initializeGISelCSEAnalysisWrapperPassPass(PassRegistry &); -void initializeGISelValueTrackingAnalysisPass(PassRegistry &); +void initializeGISelValueTrackingAnalysisLegacyPass(PassRegistry &); void initializeLiveDebugValuesLegacyPass(PassRegistry &); void initializeLiveDebugVariablesWrapperLegacyPass(PassRegistry &); void initializeLiveIntervalsWrapperPassPass(PassRegistry &); diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index e3c4900ba175c..3496b5fff398f 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -15,12 +15,69 @@ namespace llvm { class raw_ostream; namespace mcdxbc { -struct RootParameter { +struct RootParameterInfo { dxbc::RootParameterHeader Header; - union { - dxbc::RootConstants Constants; - dxbc::RTS0::v2::RootDescriptor Descriptor; - }; + size_t Location; + + RootParameterInfo() = default; + + RootParameterInfo(dxbc::RootParameterHeader Header, size_t Location) + : Header(Header), Location(Location) {} +}; + +struct 
RootParametersContainer { + SmallVector ParametersInfo; + + SmallVector Constants; + SmallVector Descriptors; + + void addInfo(dxbc::RootParameterHeader Header, size_t Location) { + ParametersInfo.push_back(RootParameterInfo(Header, Location)); + } + + void addParameter(dxbc::RootParameterHeader Header, + dxbc::RootConstants Constant) { + addInfo(Header, Constants.size()); + Constants.push_back(Constant); + } + + void addInvalidParameter(dxbc::RootParameterHeader Header) { + addInfo(Header, -1); + } + + void addParameter(dxbc::RootParameterHeader Header, + dxbc::RTS0::v2::RootDescriptor Descriptor) { + addInfo(Header, Descriptors.size()); + Descriptors.push_back(Descriptor); + } + + const std::pair + getTypeAndLocForParameter(uint32_t Location) const { + const RootParameterInfo &Info = ParametersInfo[Location]; + return {Info.Header.ParameterType, Info.Location}; + } + + const dxbc::RootParameterHeader &getHeader(size_t Location) const { + const RootParameterInfo &Info = ParametersInfo[Location]; + return Info.Header; + } + + const dxbc::RootConstants &getConstant(size_t Index) const { + return Constants[Index]; + } + + const dxbc::RTS0::v2::RootDescriptor &getRootDescriptor(size_t Index) const { + return Descriptors[Index]; + } + + size_t size() const { return ParametersInfo.size(); } + + SmallVector::const_iterator begin() const { + return ParametersInfo.begin(); + } + SmallVector::const_iterator end() const { + return ParametersInfo.end(); + } }; struct RootSignatureDesc { @@ -29,7 +86,7 @@ struct RootSignatureDesc { uint32_t RootParameterOffset = 0U; uint32_t StaticSamplersOffset = 0u; uint32_t NumStaticSamplers = 0u; - SmallVector Parameters; + mcdxbc::RootParametersContainer ParametersContainer; void write(raw_ostream &OS) const; diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index c69573ee3ed97..518dc55acb99b 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ 
b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -98,6 +98,7 @@ LOOP_PASS("loop-term-fold", LoopTermFoldPass()) // computed. (We still either need to regenerate kill flags after regalloc, or // preferably fix the scavenger to not depend on them). MACHINE_FUNCTION_ANALYSIS("edge-bundles", EdgeBundlesAnalysis()) +MACHINE_FUNCTION_ANALYSIS("gisel-value-tracking", GISelValueTrackingAnalysis()) MACHINE_FUNCTION_ANALYSIS("livedebugvars", LiveDebugVariablesAnalysis()) MACHINE_FUNCTION_ANALYSIS("live-intervals", LiveIntervalsAnalysis()) MACHINE_FUNCTION_ANALYSIS("live-reg-matrix", LiveRegMatrixAnalysis()) @@ -165,6 +166,7 @@ MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass(TM)) MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass(TM)) MACHINE_FUNCTION_PASS("post-ra-pseudos", ExpandPostRAPseudosPass()) MACHINE_FUNCTION_PASS("print", PrintMIRPass()) +MACHINE_FUNCTION_PASS("print", GISelValueTrackingPrinterPass(errs())) MACHINE_FUNCTION_PASS("print", LiveDebugVariablesPrinterPass(errs())) MACHINE_FUNCTION_PASS("print", LiveIntervalsPrinterPass(errs())) MACHINE_FUNCTION_PASS("print", LiveStacksPrinterPass(errs())) diff --git a/llvm/include/llvm/ProfileData/DataAccessProf.h b/llvm/include/llvm/ProfileData/DataAccessProf.h new file mode 100644 index 0000000000000..3cc8835a776dd --- /dev/null +++ b/llvm/include/llvm/ProfileData/DataAccessProf.h @@ -0,0 +1,214 @@ +//===- DataAccessProf.h - Data access profile format support ---------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains support to construct and use data access profiles. 
+// +// For the original RFC of this pass please see +// https://discourse.llvm.org/t/rfc-profile-guided-static-data-partitioning/83744 +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_PROFILEDATA_DATAACCESSPROF_H_ +#define LLVM_PROFILEDATA_DATAACCESSPROF_H_ + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfoVariant.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/StringSaver.h" + +#include +#include +#include + +namespace llvm { + +namespace data_access_prof { + +/// The location of data in the source code. Used by profile lookup API. +struct SourceLocation { + SourceLocation(StringRef FileNameRef, uint32_t Line) + : FileName(FileNameRef.str()), Line(Line) {} + /// The filename where the data is located. + std::string FileName; + /// The line number in the source code. + uint32_t Line; +}; + +namespace internal { + +// Conceptually similar to SourceLocation except that FileNames are StringRef of +// which strings are owned by `DataAccessProfData`. Used by `DataAccessProfData` +// to represent data locations internally. +struct SourceLocationRef { + // The filename where the data is located. + StringRef FileName; + // The line number in the source code. + uint32_t Line; +}; + +// The data access profiles for a symbol. Used by `DataAccessProfData` +// to represent records internally. +struct DataAccessProfRecordRef { + DataAccessProfRecordRef(uint64_t SymbolID, uint64_t AccessCount, + bool IsStringLiteral) + : SymbolID(SymbolID), AccessCount(AccessCount), + IsStringLiteral(IsStringLiteral) {} + + // Represents a data symbol. 
The semantic comes in two forms: a symbol index + // for symbol name if `IsStringLiteral` is false, or the hash of a string + // content if `IsStringLiteral` is true. For most of the symbolizable static + // data, the mangled symbol names remain stable relative to the source code + // and therefore used to identify symbols across binary releases. String + // literals have unstable name patterns like `.str.N[.llvm.hash]`, so we use + // the content hash instead. This is a required field. + uint64_t SymbolID; + + // The access count of symbol. Required. + uint64_t AccessCount; + + // True iff this is a record for string literal (symbols with name pattern + // `.str.*` in the symbol table). Required. + bool IsStringLiteral; + + // The locations of data in the source code. Optional. + llvm::SmallVector Locations; +}; +} // namespace internal + +// SymbolID is either a string representing symbol name if the symbol has +// stable mangled name relative to source code, or a uint64_t representing the +// content hash of a string literal (with unstable name patterns like +// `.str.N[.llvm.hash]`). The StringRef is owned by the class's saver object. +using SymbolHandleRef = std::variant; + +// The senamtic is the same as `SymbolHandleRef` above. The strings are owned. +using SymbolHandle = std::variant; + +/// The data access profiles for a symbol. +struct DataAccessProfRecord { +public: + DataAccessProfRecord(SymbolHandleRef SymHandleRef, + ArrayRef LocRefs) { + if (std::holds_alternative(SymHandleRef)) { + SymHandle = std::get(SymHandleRef).str(); + } else + SymHandle = std::get(SymHandleRef); + + for (auto Loc : LocRefs) + Locations.push_back(SourceLocation(Loc.FileName, Loc.Line)); + } + SymbolHandle SymHandle; + + // The locations of data in the source code. Optional. + SmallVector Locations; +}; + +/// Encapsulates the data access profile data and the methods to operate on +/// it. This class provides profile look-up, serialization and +/// deserialization. 
+class DataAccessProfData { +public: + // Use MapVector to keep input order of strings for serialization and + // deserialization. + using StringToIndexMap = llvm::MapVector; + + DataAccessProfData() : Saver(Allocator) {} + + /// Serialize profile data to the output stream. + /// Storage layout: + /// - Serialized strings. + /// - The encoded hashes. + /// - Records. + Error serialize(ProfOStream &OS) const; + + /// Deserialize this class from the given buffer. + Error deserialize(const unsigned char *&Ptr); + + /// Returns a profile record for \p SymbolID, or std::nullopt if there + /// isn't a record. Internally, this function will canonicalize the symbol + /// name before the lookup. + std::optional + getProfileRecord(const SymbolHandleRef SymID) const; + + /// Returns true if \p SymID is seen in profiled binaries and cold. + bool isKnownColdSymbol(const SymbolHandleRef SymID) const; + + /// Methods to set symbolized data access profile. Returns error if + /// duplicated symbol names or content hashes are seen. The user of this + /// class should aggregate counters that correspond to the same symbol name + /// or with the same string literal hash before calling 'set*' methods. + Error setDataAccessProfile(SymbolHandleRef SymbolID, uint64_t AccessCount); + /// Similar to the method above, for records with \p Locations representing + /// the `filename:line` where this symbol shows up. Note because of linker's + /// merge of identical symbols (e.g., unnamed_addr string literals), one + /// symbol is likely to have multiple locations. + Error setDataAccessProfile(SymbolHandleRef SymbolID, uint64_t AccessCount, + ArrayRef Locations); + /// Add a symbol that's seen in the profiled binary without samples. + Error addKnownSymbolWithoutSamples(SymbolHandleRef SymbolID); + + /// The following methods return array reference for various internal data + /// structures. 
+ ArrayRef getStrToIndexMapRef() const { + return StrToIndexMap.getArrayRef(); + } + ArrayRef< + MapVector::value_type> + getRecords() const { + return Records.getArrayRef(); + } + ArrayRef getKnownColdSymbols() const { + return KnownColdSymbols.getArrayRef(); + } + ArrayRef getKnownColdHashes() const { + return KnownColdHashes.getArrayRef(); + } + +private: + /// Serialize the symbol strings into the output stream. + Error serializeSymbolsAndFilenames(ProfOStream &OS) const; + + /// Deserialize the symbol strings from \p Ptr and increment \p Ptr to the + /// start of the next payload. + Error deserializeSymbolsAndFilenames(const unsigned char *&Ptr, + const uint64_t NumSampledSymbols, + const uint64_t NumColdKnownSymbols); + + /// Decode the records and increment \p Ptr to the start of the next + /// payload. + Error deserializeRecords(const unsigned char *&Ptr); + + /// A helper function to compute a storage index for \p SymbolID. + uint64_t getEncodedIndex(const SymbolHandleRef SymbolID) const; + + // Keeps owned copies of the input strings. + // NOTE: Keep `Saver` initialized before other class members that reference + // its string copies and destructed after they are destructed. + llvm::BumpPtrAllocator Allocator; + llvm::UniqueStringSaver Saver; + + // `Records` stores the records. + MapVector Records; + + StringToIndexMap StrToIndexMap; + llvm::SetVector KnownColdHashes; + llvm::SetVector KnownColdSymbols; +}; + +} // namespace data_access_prof +} // namespace llvm + +#endif // LLVM_PROFILEDATA_DATAACCESSPROF_H_ diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 2d011c89f27cb..544a59df43ed3 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -357,6 +357,13 @@ void createPGONameMetadata(GlobalObject &GO, StringRef PGOName); /// the duplicated profile variables for Comdat functions. 
bool needsComdatForCounter(const GlobalObject &GV, const Module &M); +/// \c NameStrings is a string composed of one or more possibly encoded +/// sub-strings. The substrings are separated by `\01` (returned by +/// InstrProf.h:getInstrProfNameSeparator). This method decodes the string and +/// calls `NameCallback` for each substring. +Error readAndDecodeStrings(StringRef NameStrings, + std::function NameCallback); + /// An enum describing the attributes of an instrumented profile. enum class InstrProfKind { Unknown = 0x0, @@ -493,6 +500,11 @@ class InstrProfSymtab { public: using AddrHashMap = std::vector>; + // Returns the canonical name of the given PGOName. In a canonical name, all + // suffixes that begins with "." except ".__uniq." are stripped. + // FIXME: Unify this with `FunctionSamples::getCanonicalFnName`. + static StringRef getCanonicalName(StringRef PGOName); + private: using AddrIntervalMap = IntervalMap>; @@ -528,11 +540,6 @@ class InstrProfSymtab { static StringRef getExternalSymbol() { return "** External Symbol **"; } - // Returns the canonial name of the given PGOName. In a canonical name, all - // suffixes that begins with "." except ".__uniq." are stripped. - // FIXME: Unify this with `FunctionSamples::getCanonicalFnName`. - static StringRef getCanonicalName(StringRef PGOName); - // Add the function into the symbol table, by creating the following // map entries: // name-set = {PGOFuncName} union {getCanonicalName(PGOFuncName)} diff --git a/llvm/include/llvm/Support/BranchProbability.h b/llvm/include/llvm/Support/BranchProbability.h index 570531e6b9e92..42fe225709ef8 100644 --- a/llvm/include/llvm/Support/BranchProbability.h +++ b/llvm/include/llvm/Support/BranchProbability.h @@ -77,7 +77,9 @@ class BranchProbability { LLVM_ABI raw_ostream &print(raw_ostream &OS) const; - LLVM_ABI void dump() const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const; +#endif /// Scale a large integer. 
/// diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h index dd446f280a483..3bb238e7df2ed 100644 --- a/llvm/include/llvm/Support/ConvertUTF.h +++ b/llvm/include/llvm/Support/ConvertUTF.h @@ -346,6 +346,10 @@ LLVM_ABI bool convertUTF32ToUTF8String(ArrayRef Src, std::string &Out); LLVM_ABI bool convertUTF8ToUTF16String(StringRef SrcUTF8, SmallVectorImpl &DstUTF16); +bool IsSingleCodeUnitUTF8Codepoint(unsigned); +bool IsSingleCodeUnitUTF16Codepoint(unsigned); +bool IsSingleCodeUnitUTF32Codepoint(unsigned); + #if defined(_WIN32) namespace sys { namespace windows { diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h index 529a9f86f2e34..9611586a92c3b 100644 --- a/llvm/include/llvm/Support/DebugCounter.h +++ b/llvm/include/llvm/Support/DebugCounter.h @@ -119,8 +119,10 @@ class DebugCounter { Counter.CurrChunkIdx = State.ChunkIdx; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) // Dump or print the current counter set into llvm::dbgs(). - LLVM_ABI LLVM_DUMP_METHOD void dump() const; + LLVM_DUMP_METHOD void dump() const; +#endif LLVM_ABI void print(raw_ostream &OS) const; diff --git a/llvm/include/llvm/Support/FileOutputBuffer.h b/llvm/include/llvm/Support/FileOutputBuffer.h index d5b731522c11e..20999f408e325 100644 --- a/llvm/include/llvm/Support/FileOutputBuffer.h +++ b/llvm/include/llvm/Support/FileOutputBuffer.h @@ -32,9 +32,8 @@ class FileOutputBuffer { /// Set the 'x' bit on the resulting file. F_executable = 1, - /// Don't use mmap and instead write an in-memory buffer to a file when this - /// buffer is closed. - F_no_mmap = 2, + /// Use mmap for in-memory file buffer. 
+ F_mmap = 2, }; /// Factory method to create an OutputBuffer object which manages a read/write diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h index 6a14328d431a4..e8dc1c2422646 100644 --- a/llvm/include/llvm/Support/KnownBits.h +++ b/llvm/include/llvm/Support/KnownBits.h @@ -513,7 +513,10 @@ struct KnownBits { bool operator!=(const KnownBits &Other) const { return !(*this == Other); } LLVM_ABI void print(raw_ostream &OS) const; - LLVM_ABI void dump() const; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const; +#endif private: // Internal helper for getting the initial KnownBits for an `srem` or `urem` diff --git a/llvm/include/llvm/Support/SMTAPI.h b/llvm/include/llvm/Support/SMTAPI.h index f1bb86cf81f1c..aed6241219c39 100644 --- a/llvm/include/llvm/Support/SMTAPI.h +++ b/llvm/include/llvm/Support/SMTAPI.h @@ -71,7 +71,9 @@ class SMTSort { virtual void print(raw_ostream &OS) const = 0; - LLVM_ABI LLVM_DUMP_METHOD void dump() const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const; +#endif protected: /// Query the SMT solver and returns true if two sorts are equal (same kind @@ -118,7 +120,9 @@ class SMTExpr { virtual void print(raw_ostream &OS) const = 0; - LLVM_ABI LLVM_DUMP_METHOD void dump() const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const; +#endif protected: /// Query the SMT solver and returns true if two sorts are equal (same kind @@ -136,7 +140,9 @@ class SMTSolverStatistics { virtual void print(raw_ostream &OS) const = 0; - LLVM_ABI LLVM_DUMP_METHOD void dump() const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const; +#endif }; /// Shared pointer for SMTExprs, used by SMTSolver API. 
@@ -152,7 +158,9 @@ class SMTSolver { SMTSolver() = default; virtual ~SMTSolver() = default; - LLVM_ABI LLVM_DUMP_METHOD void dump() const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const; +#endif // Returns an appropriate floating-point sort for the given bitwidth. SMTSortRef getFloatSort(unsigned BitWidth) { diff --git a/llvm/include/llvm/Support/ScaledNumber.h b/llvm/include/llvm/Support/ScaledNumber.h index 87a56809976a3..3d38677f0eb61 100644 --- a/llvm/include/llvm/Support/ScaledNumber.h +++ b/llvm/include/llvm/Support/ScaledNumber.h @@ -424,7 +424,10 @@ class ScaledNumberBase { public: static constexpr int DefaultPrecision = 10; - LLVM_ABI static void dump(uint64_t D, int16_t E, int Width); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD static void dump(uint64_t D, int16_t E, int Width); +#endif + LLVM_ABI static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width, unsigned Precision); LLVM_ABI static std::string toString(uint64_t D, int16_t E, int Width, @@ -607,7 +610,12 @@ template class ScaledNumber : ScaledNumberBase { unsigned Precision = DefaultPrecision) const { return ScaledNumberBase::print(OS, Digits, Scale, Width, Precision); } - void dump() const { return ScaledNumberBase::dump(Digits, Scale, Width); } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void dump() const { + return ScaledNumberBase::dump(Digits, Scale, Width); + } +#endif ScaledNumber &operator+=(const ScaledNumber &X) { std::tie(Digits, Scale) = diff --git a/llvm/include/llvm/TableGen/DirectiveEmitter.h b/llvm/include/llvm/TableGen/DirectiveEmitter.h index e7f712451d482..234979eebc881 100644 --- a/llvm/include/llvm/TableGen/DirectiveEmitter.h +++ b/llvm/include/llvm/TableGen/DirectiveEmitter.h @@ -71,6 +71,10 @@ class DirectiveLanguage { return Records.getAllDerivedDefinitions("Category"); } + ArrayRef getSourceLanguages() const { + return 
Records.getAllDerivedDefinitions("SourceLanguage"); + } + ArrayRef getDirectives() const { return Records.getAllDerivedDefinitions("Directive"); } @@ -109,13 +113,15 @@ class BaseRecord { // Returns the name of the directive formatted for output. Whitespace are // replaced with underscores. - std::string getFormattedName() const { - StringRef Name = Def->getValueAsString("name"); + static std::string getFormattedName(const Record *R) { + StringRef Name = R->getValueAsString("name"); std::string N = Name.str(); llvm::replace(N, ' ', '_'); return N; } + std::string getFormattedName() const { return getFormattedName(Def); } + bool isDefault() const { return Def->getValueAsBit("isDefault"); } // Returns the record name. @@ -157,6 +163,10 @@ class Directive : public BaseRecord { const Record *getCategory() const { return Def->getValueAsDef("category"); } + std::vector getSourceLanguages() const { + return Def->getValueAsListOfDefs("languages"); + } + // Clang uses a different format for names of its directives enum. std::string getClangAccSpelling() const { std::string Name = Def->getValueAsString("name").str(); diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index 7205617988205..8b8abb6d52d0a 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -19,6 +19,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" @@ -244,7 +245,7 @@ class RecordRecTy final : public RecTy, RecordRecTy &operator=(const RecordRecTy &) = delete; // Do not use sized deallocation due to trailing objects. 
- void operator delete(void *p) { ::operator delete(p); } + void operator delete(void *Ptr) { ::operator delete(Ptr); } static bool classof(const RecTy *RT) { return RT->getRecTyKind() == RecordRecTyKind; @@ -598,7 +599,7 @@ class BitsInit final : public TypedInit, BitsInit &operator=(const BitsInit &) = delete; // Do not use sized deallocation due to trailing objects. - void operator delete(void *p) { ::operator delete(p); } + void operator delete(void *Ptr) { ::operator delete(Ptr); } static bool classof(const Init *I) { return I->getKind() == IK_BitsInit; @@ -615,18 +616,8 @@ class BitsInit final : public TypedInit, convertInitializerBitRange(ArrayRef Bits) const override; std::optional convertInitializerToInt() const; - bool isComplete() const override { - for (unsigned i = 0; i != getNumBits(); ++i) - if (!getBit(i)->isComplete()) return false; - return true; - } - - bool allInComplete() const { - for (unsigned i = 0; i != getNumBits(); ++i) - if (getBit(i)->isComplete()) return false; - return true; - } - + bool isComplete() const override; + bool allInComplete() const; bool isConcrete() const override; std::string getAsString() const override; @@ -769,7 +760,7 @@ class ListInit final : public TypedInit, ListInit &operator=(const ListInit &) = delete; // Do not use sized deallocation due to trailing objects. 
- void operator delete(void *p) { ::operator delete(p); } + void operator delete(void *Ptr) { ::operator delete(Ptr); } static bool classof(const Init *I) { return I->getKind() == IK_ListInit; @@ -782,13 +773,13 @@ class ListInit final : public TypedInit, return ArrayRef(getTrailingObjects(), NumValues); } - const Init *getElement(unsigned Index) const { return getValues()[Index]; } + const Init *getElement(unsigned Idx) const { return getValues()[Idx]; } const RecTy *getElementType() const { return cast(getType())->getElementType(); } - const Record *getElementAsRecord(unsigned i) const; + const Record *getElementAsRecord(unsigned Idx) const; const Init *convertInitializerTo(const RecTy *Ty) const override; @@ -1052,6 +1043,8 @@ class CondOpInit final : public TypedInit, return ArrayRef(getTrailingObjects() + NumConds, NumConds); } + auto getCondAndVals() const { return zip_equal(getConds(), getVals()); } + const Init *Fold(const Record *CurRec) const; const Init *resolveReferences(Resolver &R) const override; @@ -1341,7 +1334,7 @@ class VarDefInit final VarDefInit &operator=(const VarDefInit &) = delete; // Do not use sized deallocation due to trailing objects. 
- void operator delete(void *p) { ::operator delete(p); } + void operator delete(void *Ptr) { ::operator delete(Ptr); } static bool classof(const Init *I) { return I->getKind() == IK_VarDefInit; @@ -1445,11 +1438,23 @@ class DagInit final } static const DagInit *get(const Init *V, const StringInit *VN, - ArrayRef ArgRange, - ArrayRef NameRange); + ArrayRef Args, + ArrayRef ArgNames); + + static const DagInit *get(const Init *V, ArrayRef Args, + ArrayRef ArgNames) { + return DagInit::get(V, nullptr, Args, ArgNames); + } + static const DagInit * get(const Init *V, const StringInit *VN, - ArrayRef> Args); + ArrayRef> ArgAndNames); + + static const DagInit * + get(const Init *V, + ArrayRef> ArgAndNames) { + return DagInit::get(V, nullptr, ArgAndNames); + } void Profile(FoldingSetNodeID &ID) const; @@ -1487,6 +1492,15 @@ class DagInit final return getTrailingObjects(NumArgs); } + // Return a range of std::pair. + auto getArgAndNames() const { + auto Zip = llvm::zip_equal(getArgs(), getArgNames()); + using EltTy = decltype(*adl_begin(Zip)); + return llvm::map_range(Zip, [](const EltTy &E) { + return std::make_pair(std::get<0>(E), std::get<1>(E)); + }); + } + const Init *resolveReferences(Resolver &R) const override; bool isConcrete() const override; @@ -1790,12 +1804,11 @@ class Record { } void removeValue(const Init *Name) { - for (unsigned i = 0, e = Values.size(); i != e; ++i) - if (Values[i].getNameInit() == Name) { - Values.erase(Values.begin()+i); - return; - } - llvm_unreachable("Cannot remove an entry that does not exist!"); + auto It = llvm::find_if( + Values, [Name](const RecordVal &V) { return V.getNameInit() == Name; }); + if (It == Values.end()) + llvm_unreachable("Cannot remove an entry that does not exist!"); + Values.erase(It); } void removeValue(StringRef Name) { @@ -2115,10 +2128,7 @@ struct LessRecordRegister { size_t size() { return Parts.size(); } - std::pair getPart(size_t i) { - assert (i < Parts.size() && "Invalid idx!"); - return Parts[i]; - } + 
std::pair getPart(size_t Idx) { return Parts[Idx]; } }; bool operator()(const Record *Rec1, const Record *Rec2) const { diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 41fed692c7025..406baa4f5fdaa 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -860,6 +860,12 @@ def find_last_active : SDNode<"ISD::VECTOR_FIND_LAST_ACTIVE", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>, []>; +def get_active_lane_mask + : SDNode< + "ISD::GET_ACTIVE_LANE_MASK", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>]>, + []>; + // Nodes for intrinsics, you should use the intrinsic itself and let tblgen use // these internally. Don't reference these directly. def intrinsic_void : SDNode<"ISD::INTRINSIC_VOID", @@ -875,6 +881,7 @@ def SDT_assert : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 0>]>; def assertsext : SDNode<"ISD::AssertSext", SDT_assert>; def assertzext : SDNode<"ISD::AssertZext", SDT_assert>; +def assertnofpclass : SDNode<"ISD::AssertNoFPClass", SDTFPUnaryOp>; def assertalign : SDNode<"ISD::AssertAlign", SDT_assert>; def convergencectrl_anchor : SDNode<"ISD::CONVERGENCECTRL_ANCHOR", diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroShape.h b/llvm/include/llvm/Transforms/Coroutines/CoroShape.h index ea93ced1ce29e..891774b446571 100644 --- a/llvm/include/llvm/Transforms/Coroutines/CoroShape.h +++ b/llvm/include/llvm/Transforms/Coroutines/CoroShape.h @@ -79,7 +79,8 @@ struct Shape { // Scan the function and collect the above intrinsics for later processing void analyze(Function &F, SmallVectorImpl &CoroFrames, - SmallVectorImpl &UnusedCoroSaves); + SmallVectorImpl &UnusedCoroSaves, + CoroPromiseInst *&CoroPromise); // If for some reason, we were not able to find coro.begin, bailout. 
void invalidateCoroutine(Function &F, SmallVectorImpl &CoroFrames); @@ -87,7 +88,8 @@ struct Shape { void initABI(); // Remove orphaned and unnecessary intrinsics void cleanCoroutine(SmallVectorImpl &CoroFrames, - SmallVectorImpl &UnusedCoroSaves); + SmallVectorImpl &UnusedCoroSaves, + CoroPromiseInst *CoroPromise); // Field indexes for special fields in the switch lowering. struct SwitchFieldIndex { @@ -265,13 +267,14 @@ struct Shape { explicit Shape(Function &F) { SmallVector CoroFrames; SmallVector UnusedCoroSaves; + CoroPromiseInst *CoroPromise = nullptr; - analyze(F, CoroFrames, UnusedCoroSaves); + analyze(F, CoroFrames, UnusedCoroSaves, CoroPromise); if (!CoroBegin) { invalidateCoroutine(F, CoroFrames); return; } - cleanCoroutine(CoroFrames, UnusedCoroSaves); + cleanCoroutine(CoroFrames, UnusedCoroSaves, CoroPromise); } }; diff --git a/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h b/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h new file mode 100644 index 0000000000000..3178dc762a195 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h @@ -0,0 +1,31 @@ +//===------ EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass optimizes a vectorized loop with canonical IV to using EVL-based +// IV if it was tail-folded by predicated EVL. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H +#define LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H + +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { +class Loop; +class LPMUpdater; + +/// Turn vectorized loops with canonical induction variables into loops that +/// only use a single EVL-based induction variable. +struct EVLIndVarSimplifyPass : public PassInfoMixin { + PreservedAnalyses run(Loop &L, LoopAnalysisManager &LAM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; +} // namespace llvm +#endif diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 27bd179a58ede..2afabb75c7cc5 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -693,8 +693,8 @@ AnalysisKey AAManager::Key; ExternalAAWrapperPass::ExternalAAWrapperPass() : ImmutablePass(ID) {} -ExternalAAWrapperPass::ExternalAAWrapperPass(CallbackT CB) - : ImmutablePass(ID), CB(std::move(CB)) {} +ExternalAAWrapperPass::ExternalAAWrapperPass(CallbackT CB, bool RunEarly) + : ImmutablePass(ID), CB(std::move(CB)), RunEarly(RunEarly) {} char ExternalAAWrapperPass::ID = 0; @@ -741,7 +741,7 @@ bool AAResultsWrapperPass::runOnFunction(Function &F) { // Add any target-specific alias analyses that should be run early. auto *ExtWrapperPass = getAnalysisIfAvailable(); - if (ExtWrapperPass && ExtWrapperPass->runEarly() && ExtWrapperPass->CB) { + if (ExtWrapperPass && ExtWrapperPass->RunEarly && ExtWrapperPass->CB) { LLVM_DEBUG(dbgs() << "AAResults register Early ExternalAA: " << ExtWrapperPass->getPassName() << "\n"); ExtWrapperPass->CB(*this, F, *AAR); @@ -777,7 +777,7 @@ bool AAResultsWrapperPass::runOnFunction(Function &F) { // If available, run an external AA providing callback over the results as // well. 
- if (ExtWrapperPass && !ExtWrapperPass->runEarly() && ExtWrapperPass->CB) { + if (ExtWrapperPass && !ExtWrapperPass->RunEarly && ExtWrapperPass->CB) { LLVM_DEBUG(dbgs() << "AAResults register Late ExternalAA: " << ExtWrapperPass->getPassName() << "\n"); ExtWrapperPass->CB(*this, F, *AAR); diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index af1a3c593c514..ab407e945bc53 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1541,11 +1541,11 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, return std::nullopt; } -std::optional llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, - Type *ElemTyB, Value *PtrB, - const DataLayout &DL, - ScalarEvolution &SE, bool StrictCheck, - bool CheckType) { +std::optional llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, + Type *ElemTyB, Value *PtrB, + const DataLayout &DL, + ScalarEvolution &SE, + bool StrictCheck, bool CheckType) { assert(PtrA && PtrB && "Expected non-nullptr pointers."); // Make sure that A and B are different pointers. @@ -1570,7 +1570,7 @@ std::optional llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, const Value *PtrB1 = PtrB->stripAndAccumulateConstantOffsets( DL, OffsetB, /*AllowNonInbounds=*/true); - int Val; + std::optional Val; if (PtrA1 == PtrB1) { // Retrieve the address space again as pointer stripping now tracks through // `addrspacecast`. @@ -1585,7 +1585,7 @@ std::optional llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, OffsetB = OffsetB.sextOrTrunc(IdxWidth); OffsetB -= OffsetA; - Val = OffsetB.getSExtValue(); + Val = OffsetB.trySExtValue(); } else { // Otherwise compute the distance with SCEV between the base pointers. 
const SCEV *PtrSCEVA = SE.getSCEV(PtrA); @@ -1594,10 +1594,14 @@ std::optional llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, SE.computeConstantDifference(PtrSCEVB, PtrSCEVA); if (!Diff) return std::nullopt; - Val = Diff->getSExtValue(); + Val = Diff->trySExtValue(); } - int Size = DL.getTypeStoreSize(ElemTyA); - int Dist = Val / Size; + + if (!Val) + return std::nullopt; + + int64_t Size = DL.getTypeStoreSize(ElemTyA); + int64_t Dist = *Val / Size; // Ensure that the calculated distance matches the type-based one after all // the bitcasts removal in the provided pointers. @@ -1616,14 +1620,15 @@ bool llvm::sortPtrAccesses(ArrayRef VL, Type *ElemTy, // first pointer in the array. Value *Ptr0 = VL[0]; - using DistOrdPair = std::pair; + using DistOrdPair = std::pair; auto Compare = llvm::less_first(); std::set Offsets(Compare); Offsets.emplace(0, 0); bool IsConsecutive = true; for (auto [Idx, Ptr] : drop_begin(enumerate(VL))) { - std::optional Diff = getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, DL, SE, - /*StrictCheck=*/true); + std::optional Diff = + getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, DL, SE, + /*StrictCheck=*/true); if (!Diff) return false; @@ -1654,7 +1659,7 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, return false; Type *ElemTyA = getLoadStoreType(A); Type *ElemTyB = getLoadStoreType(B); - std::optional Diff = + std::optional Diff = getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE, /*StrictCheck=*/true, CheckType); return Diff && *Diff == 1; diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 3d403531cea2f..8405678aa9680 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7786,10 +7786,7 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, case Instruction::FMul: case Instruction::FDiv: case Instruction::FRem: - return false; case Instruction::GetElementPtr: - // inbounds is handled above - // TODO: what about 
inrange on constexpr? return false; default: { const auto *CE = dyn_cast(Op); diff --git a/llvm/lib/CGData/StableFunctionMap.cpp b/llvm/lib/CGData/StableFunctionMap.cpp index 4afe77d78a4fe..87f1e76afb60b 100644 --- a/llvm/lib/CGData/StableFunctionMap.cpp +++ b/llvm/lib/CGData/StableFunctionMap.cpp @@ -206,12 +206,10 @@ void StableFunctionMap::finalize(bool SkipTrim) { auto &[StableHash, SFS] = *It; // Group stable functions by ModuleIdentifier. - std::stable_sort(SFS.begin(), SFS.end(), - [&](const std::unique_ptr &L, - const std::unique_ptr &R) { - return *getNameForId(L->ModuleNameId) < - *getNameForId(R->ModuleNameId); - }); + llvm::stable_sort(SFS, [&](const std::unique_ptr &L, + const std::unique_ptr &R) { + return *getNameForId(L->ModuleNameId) < *getNameForId(R->ModuleNameId); + }); // Consider the first function as the root function. auto &RSF = SFS[0]; diff --git a/llvm/lib/CGData/StableFunctionMapRecord.cpp b/llvm/lib/CGData/StableFunctionMapRecord.cpp index 8eb667a651ebe..e23b0e072c9a3 100644 --- a/llvm/lib/CGData/StableFunctionMapRecord.cpp +++ b/llvm/lib/CGData/StableFunctionMapRecord.cpp @@ -56,8 +56,8 @@ getStableFunctionEntries(const StableFunctionMap &SFM) { for (auto &Func : P.second) FuncEntries.emplace_back(Func.get()); - std::stable_sort( - FuncEntries.begin(), FuncEntries.end(), [&](auto &A, auto &B) { + llvm::stable_sort( + FuncEntries, [&](auto &A, auto &B) { return std::tuple(A->Hash, SFM.getNameForId(A->ModuleNameId), SFM.getNameForId(A->FunctionNameId)) < std::tuple(B->Hash, SFM.getNameForId(B->ModuleNameId), diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h index 7a138a0332b6d..d13b315135ad9 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -63,14 +63,14 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { int DataOffset : 31; /// Non-zero if this is a piece of an aggregate. 
- uint16_t IsSubfield : 1; + uint32_t IsSubfield : 1; /// Offset into aggregate. - uint16_t StructOffset : 15; + uint32_t StructOffset : 15; /// Register containing the data or the register base of the memory /// location containing the data. - uint16_t CVRegister; + uint32_t CVRegister : 16; uint64_t static toOpaqueValue(const LocalVarDef DR) { uint64_t Val = 0; diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 2c53a9c27ccb2..76f27623c8656 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -5771,6 +5771,35 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) { return false; } +// Find an insert position of Addr for MemoryInst. We can't guarantee MemoryInst +// is the first instruction that will use Addr. So we need to find the first +// user of Addr in current BB. +static BasicBlock::iterator findInsertPos(Value *Addr, Instruction *MemoryInst, + Value *SunkAddr) { + if (Addr->hasOneUse()) + return MemoryInst->getIterator(); + + // We already have a SunkAddr in current BB, but we may need to insert cast + // instruction after it. + if (SunkAddr) { + if (Instruction *AddrInst = dyn_cast(SunkAddr)) + return std::next(AddrInst->getIterator()); + } + + // Find the first user of Addr in current BB. + Instruction *Earliest = MemoryInst; + for (User *U : Addr->users()) { + Instruction *UserInst = dyn_cast(U); + if (UserInst && UserInst->getParent() == MemoryInst->getParent()) { + if (isa(UserInst) || UserInst->isDebugOrPseudoInst()) + continue; + if (UserInst->comesBefore(Earliest)) + Earliest = UserInst; + } + } + return Earliest->getIterator(); +} + /// Sink addressing mode computation immediate before MemoryInst if doing so /// can be done without increasing register pressure. 
The need for the /// register pressure constraint means this can end up being an all or nothing @@ -5895,11 +5924,6 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, return Modified; } - // Insert this computation right after this user. Since our caller is - // scanning from the top of the BB to the bottom, reuse of the expr are - // guaranteed to happen later. - IRBuilder<> Builder(MemoryInst); - // Now that we determined the addressing expression we want to use and know // that we have to sink it into this block. Check to see if we have already // done this for some other load/store instr in this block. If so, reuse @@ -5910,6 +5934,13 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr; Type *IntPtrTy = DL->getIntPtrType(Addr->getType()); + + // The current BB may be optimized multiple times, we can't guarantee the + // reuse of Addr happens later, call findInsertPos to find an appropriate + // insert position. 
+ IRBuilder<> Builder(MemoryInst->getParent(), + findInsertPos(Addr, MemoryInst, SunkAddr)); + if (SunkAddr) { LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); diff --git a/llvm/lib/CodeGen/GCMetadata.cpp b/llvm/lib/CodeGen/GCMetadata.cpp index fa87b14e708e1..85e07efcdcfc7 100644 --- a/llvm/lib/CodeGen/GCMetadata.cpp +++ b/llvm/lib/CodeGen/GCMetadata.cpp @@ -26,7 +26,7 @@ bool GCStrategyMap::invalidate(Module &M, const PreservedAnalyses &PA, for (const auto &F : M) { if (F.isDeclaration() || !F.hasGC()) continue; - if (!StrategyMap.contains(F.getGC())) + if (!contains(F.getGC())) return true; } return false; @@ -36,17 +36,18 @@ AnalysisKey CollectorMetadataAnalysis::Key; CollectorMetadataAnalysis::Result CollectorMetadataAnalysis::run(Module &M, ModuleAnalysisManager &MAM) { - Result R; - auto &Map = R.StrategyMap; + Result StrategyMap; for (auto &F : M) { if (F.isDeclaration() || !F.hasGC()) continue; - auto GCName = F.getGC(); - auto [It, Inserted] = Map.try_emplace(GCName); - if (Inserted) + StringRef GCName = F.getGC(); + auto [It, Inserted] = StrategyMap.try_emplace(GCName); + if (Inserted) { It->second = getGCStrategy(GCName); + It->second->Name = GCName; + } } - return R; + return StrategyMap; } AnalysisKey GCFunctionAnalysis::Key; @@ -61,9 +62,9 @@ GCFunctionAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { MAMProxy.cachedResultExists(*F.getParent()) && "This pass need module analysis `collector-metadata`!"); auto &Map = - MAMProxy.getCachedResult(*F.getParent()) - ->StrategyMap; - GCFunctionInfo Info(F, *Map[F.getGC()]); + *MAMProxy.getCachedResult(*F.getParent()); + GCStrategy &S = *Map.try_emplace(F.getGC()).first->second; + GCFunctionInfo Info(F, S); return Info; } diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 89c3801dc203e..589936b6c260f 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ 
b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -27,9 +27,9 @@ using namespace llvm; -char llvm::GISelValueTrackingAnalysis::ID = 0; +char llvm::GISelValueTrackingAnalysisLegacy::ID = 0; -INITIALIZE_PASS(GISelValueTrackingAnalysis, DEBUG_TYPE, +INITIALIZE_PASS(GISelValueTrackingAnalysisLegacy, DEBUG_TYPE, "Analysis for ComputingKnownBits", false, true) GISelValueTracking::GISelValueTracking(MachineFunction &MF, unsigned MaxDepth) @@ -893,16 +893,18 @@ unsigned GISelValueTracking::computeNumSignBits(Register R, unsigned Depth) { return computeNumSignBits(R, DemandedElts, Depth); } -void GISelValueTrackingAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { +void GISelValueTrackingAnalysisLegacy::getAnalysisUsage( + AnalysisUsage &AU) const { AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } -bool GISelValueTrackingAnalysis::runOnMachineFunction(MachineFunction &MF) { +bool GISelValueTrackingAnalysisLegacy::runOnMachineFunction( + MachineFunction &MF) { return false; } -GISelValueTracking &GISelValueTrackingAnalysis::get(MachineFunction &MF) { +GISelValueTracking &GISelValueTrackingAnalysisLegacy::get(MachineFunction &MF) { if (!Info) { unsigned MaxDepth = MF.getTarget().getOptLevel() == CodeGenOptLevel::None ? 
2 : 6; @@ -910,3 +912,38 @@ GISelValueTracking &GISelValueTrackingAnalysis::get(MachineFunction &MF) { } return *Info; } + +AnalysisKey GISelValueTrackingAnalysis::Key; + +GISelValueTracking +GISelValueTrackingAnalysis::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + return Result(MF); +} + +PreservedAnalyses +GISelValueTrackingPrinterPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + auto &VTA = MFAM.getResult(MF); + const auto &MRI = MF.getRegInfo(); + OS << "name: "; + MF.getFunction().printAsOperand(OS, /*PrintType=*/false); + OS << '\n'; + + for (MachineBasicBlock &BB : MF) { + for (MachineInstr &MI : BB) { + for (MachineOperand &MO : MI.defs()) { + if (!MO.isReg() || MO.getReg().isPhysical()) + continue; + Register Reg = MO.getReg(); + if (!MRI.getType(Reg).isValid()) + continue; + KnownBits Known = VTA.getKnownBits(Reg); + unsigned SignedBits = VTA.computeNumSignBits(Reg); + OS << " " << MO << " KnownBits:" << Known << " SignBits:" << SignedBits + << '\n'; + }; + } + } + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp index 5842f204febf2..194cbc5b2ac87 100644 --- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -56,7 +56,7 @@ INITIALIZE_PASS_BEGIN(InstructionSelect, DEBUG_TYPE, "Select target instructions out of generic instructions", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass) INITIALIZE_PASS_END(InstructionSelect, DEBUG_TYPE, @@ -120,8 +120,8 @@ class InstructionSelect::MIIteratorMaintainer : public GISelChangeObserver { void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); - 
AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); if (OptLevel != CodeGenOptLevel::None) { AU.addRequired(); @@ -146,7 +146,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { OptLevel = MF.getFunction().hasOptNone() ? CodeGenOptLevel::None : MF.getTarget().getOptLevel(); - VT = &getAnalysis().get(MF); + VT = &getAnalysis().get(MF); if (OptLevel != CodeGenOptLevel::None) { PSI = &getAnalysis().getPSI(); if (PSI && PSI->hasProfileSummary()) diff --git a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp index e4bce16f230b8..1bb3f4bcc9b1b 100644 --- a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -75,7 +75,7 @@ INITIALIZE_PASS_BEGIN(Legalizer, DEBUG_TYPE, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_END(Legalizer, DEBUG_TYPE, "Legalize the Machine IR a function's Machine IR", false, false) @@ -86,8 +86,8 @@ void Legalizer::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); getSelectionDAGFallbackAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } @@ -341,7 +341,8 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { AuxObservers.push_back(&LocObserver); // This allows Known Bits Analysis in the legalizer. 
- GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); const LegalizerInfo &LI = *MF.getSubtarget().getLegalizerInfo(); MFResult Result = legalizeMachineFunction(MF, LI, AuxObservers, LocObserver, diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 906048679553c..d8dc00f7a61f7 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -575,9 +575,8 @@ void MIRPrinter::convertCallSiteObjects(yaml::MachineFunction &YMF, // Sort call info by position of call instructions. llvm::sort(YMF.CallSitesInfo.begin(), YMF.CallSitesInfo.end(), [](yaml::CallSiteInfo A, yaml::CallSiteInfo B) { - if (A.CallLocation.BlockNum == B.CallLocation.BlockNum) - return A.CallLocation.Offset < B.CallLocation.Offset; - return A.CallLocation.BlockNum < B.CallLocation.BlockNum; + return std::tie(A.CallLocation.BlockNum, A.CallLocation.Offset) < + std::tie(B.CallLocation.BlockNum, B.CallLocation.Offset); }); } diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 6eab87c1292e0..6af3154b9ed13 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -553,9 +553,12 @@ void MachineCopyPropagation::readSuccessorLiveIns( // If a copy result is livein to a successor, it is not dead. 
for (const MachineBasicBlock *Succ : MBB.successors()) { for (const auto &LI : Succ->liveins()) { - for (MCRegUnit Unit : TRI->regunits(LI.PhysReg)) { - if (MachineInstr *Copy = Tracker.findCopyForUnit(Unit, *TRI)) - MaybeDeadCopies.remove(Copy); + for (MCRegUnitMaskIterator U(LI.PhysReg, TRI); U.isValid(); ++U) { + auto [Unit, Mask] = *U; + if ((Mask & LI.LaneMask).any()) { + if (MachineInstr *Copy = Tracker.findCopyForUnit(Unit, *TRI)) + MaybeDeadCopies.remove(Copy); + } } } } diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 07bffc6c3de90..3d161ffbe40a4 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -884,6 +884,63 @@ bool SUnitWithMemInfo::getUnderlyingObjects() { return true; } +/// Returns true if there is a loop-carried order dependency from \p Src to \p +/// Dst. +static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, + const SUnitWithMemInfo &Dst, + BatchAAResults &BAA, + const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI) { + if (Src.isTriviallyDisjoint(Dst)) + return false; + if (isSuccOrder(Src.SU, Dst.SU)) + return false; + + MachineInstr &SrcMI = *Src.SU->getInstr(); + MachineInstr &DstMI = *Dst.SU->getInstr(); + // First, perform the cheaper check that compares the base register. + // If they are the same and the load offset is less than the store + // offset, then mark the dependence as loop carried potentially. 
+ const MachineOperand *BaseOp1, *BaseOp2; + int64_t Offset1, Offset2; + bool Offset1IsScalable, Offset2IsScalable; + if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1, Offset1IsScalable, + TRI) && + TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2, Offset2IsScalable, + TRI)) { + if (BaseOp1->isIdenticalTo(*BaseOp2) && + Offset1IsScalable == Offset2IsScalable && (int)Offset1 < (int)Offset2) { + assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) && + "What happened to the chain edge?"); + return true; + } + } + + // Second, the more expensive check that uses alias analysis on the + // base registers. If they alias, and the load offset is less than + // the store offset, the mark the dependence as loop carried. + if (Src.isUnknown() || Dst.isUnknown()) + return true; + if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset) + return true; + + if (BAA.isNoAlias( + MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags), + MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags))) + return false; + + // AliasAnalysis sometimes gives up on following the underlying + // object. In such a case, separate checks for underlying objects may + // prove that there are no aliases between two accesses. + for (const Value *SrcObj : Src.UnderlyingObjs) + for (const Value *DstObj : Dst.UnderlyingObjs) + if (!BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags), + MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags))) + return true; + + return false; +} + /// Add a chain edge between a load and store if the store can be an /// alias of the load on a subsequent iteration, i.e., a loop carried /// dependence. 
This code is very similar to the code in ScheduleDAGInstrs @@ -898,76 +955,12 @@ void SwingSchedulerDAG::addLoopCarriedDependences() { PendingLoads.emplace_back(&SU); } else if (MI.mayStore()) { SUnitWithMemInfo Store(&SU); - for (const SUnitWithMemInfo &Load : PendingLoads) { - if (Load.isTriviallyDisjoint(Store)) - continue; - if (isSuccOrder(Load.SU, Store.SU)) - continue; - MachineInstr &LdMI = *Load.SU->getInstr(); - // First, perform the cheaper check that compares the base register. - // If they are the same and the load offset is less than the store - // offset, then mark the dependence as loop carried potentially. - const MachineOperand *BaseOp1, *BaseOp2; - int64_t Offset1, Offset2; - bool Offset1IsScalable, Offset2IsScalable; - if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, - Offset1IsScalable, TRI) && - TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, - Offset2IsScalable, TRI)) { - if (BaseOp1->isIdenticalTo(*BaseOp2) && - Offset1IsScalable == Offset2IsScalable && - (int)Offset1 < (int)Offset2) { - assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) && - "What happened to the chain edge?"); - SDep Dep(Load.SU, SDep::Barrier); - Dep.setLatency(1); - SU.addPred(Dep); - continue; - } - } - // Second, the more expensive check that uses alias analysis on the - // base registers. If they alias, and the load offset is less than - // the store offset, the mark the dependence as loop carried. 
- if (Load.isUnknown() || Store.isUnknown()) { - SDep Dep(Load.SU, SDep::Barrier); - Dep.setLatency(1); - SU.addPred(Dep); - continue; - } - if (Load.MemOpValue == Store.MemOpValue && - Load.MemOpOffset <= Store.MemOpOffset) { - SDep Dep(Load.SU, SDep::Barrier); - Dep.setLatency(1); - SU.addPred(Dep); - continue; - } - - bool IsNoAlias = [&] { - if (BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(Load.MemOpValue, - Load.AATags), - MemoryLocation::getBeforeOrAfter(Store.MemOpValue, - Store.AATags))) - return true; - - // AliasAnalysis sometimes gives up on following the underlying - // object. In such a case, separate checks for underlying objects may - // prove that there are no aliases between two accesses. - for (const Value *LoadObj : Load.UnderlyingObjs) - for (const Value *StoreObj : Store.UnderlyingObjs) - if (!BAA.isNoAlias( - MemoryLocation::getBeforeOrAfter(LoadObj, Load.AATags), - MemoryLocation::getBeforeOrAfter(StoreObj, Store.AATags))) - return false; - - return true; - }(); - - if (!IsNoAlias) { + for (const SUnitWithMemInfo &Load : PendingLoads) + if (hasLoopCarriedMemDep(Load, Store, BAA, TII, TRI)) { SDep Dep(Load.SU, SDep::Barrier); Dep.setLatency(1); SU.addPred(Dep); } - } } } } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d6e288a59b2ee..4be3363e56987 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10972,6 +10972,22 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return DAG.getNode(ISD::SRL, DL, VT, N0, NewOp1); } + // fold (srl (logic_op x, (shl (zext y), c1)), c1) + // -> (logic_op (srl x, c1), (zext y)) + // c1 <= leadingzeros(zext(y)) + SDValue X, ZExtY; + if (N1C && sd_match(N0, m_OneUse(m_BitwiseLogic( + m_Value(X), + m_OneUse(m_Shl(m_AllOf(m_Value(ZExtY), + m_Opc(ISD::ZERO_EXTEND)), + m_Specific(N1))))))) { + unsigned NumLeadingZeros = ZExtY.getScalarValueSizeInBits() - + 
ZExtY.getOperand(0).getScalarValueSizeInBits(); + if (N1C->getZExtValue() <= NumLeadingZeros) + return DAG.getNode(N0.getOpcode(), SDLoc(N0), VT, + DAG.getNode(ISD::SRL, SDLoc(N0), VT, X, N1), ZExtY); + } + // fold operands of srl based on knowledge that the low bits are not // demanded. if (SimplifyDemandedBits(SDValue(N, 0))) @@ -12377,8 +12393,8 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { !MST->isCompressingStore() && !MST->isTruncatingStore()) return DAG.getStore(MST->getChain(), SDLoc(N), MST->getValue(), MST->getBasePtr(), MST->getPointerInfo(), - MST->getOriginalAlign(), - MST->getMemOperand()->getFlags(), MST->getAAInfo()); + MST->getBaseAlign(), MST->getMemOperand()->getFlags(), + MST->getAAInfo()); // Try transforming N to an indexed store. if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) @@ -12562,7 +12578,7 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { !MLD->isExpandingLoad() && MLD->getExtensionType() == ISD::NON_EXTLOAD) { SDValue NewLd = DAG.getLoad( N->getValueType(0), SDLoc(N), MLD->getChain(), MLD->getBasePtr(), - MLD->getPointerInfo(), MLD->getOriginalAlign(), + MLD->getPointerInfo(), MLD->getBaseAlign(), MLD->getMemOperand()->getFlags(), MLD->getAAInfo(), MLD->getRanges()); return CombineTo(N, NewLd, NewLd.getValue(1)); } @@ -13602,7 +13618,7 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { SDValue SplitLoad = DAG.getExtLoad(ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr, LN0->getPointerInfo().getWithOffset(Offset), - SplitSrcVT, LN0->getOriginalAlign(), + SplitSrcVT, LN0->getBaseAlign(), LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); BasePtr = DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(Stride), DL); @@ -14101,7 +14117,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { } // If the trunc wasn't legal, try to fold to (sext_inreg (anyext x)) - if ((!LegalTypes || TLI.isTypeLegal(VT)) && N0.hasOneUse()) { + if (!LegalTypes || TLI.isTypeLegal(VT)) { SDValue ExtSrc = 
DAG.getAnyExtOrTrunc(N00, DL, VT); return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, ExtSrc, N0->getOperand(1)); @@ -15159,15 +15175,15 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) { } else if (CR.getBitWidth() == BitSize) NewRanges = OldRanges; } - Load = DAG.getLoad( - VT, DL, LN0->getChain(), NewPtr, - LN0->getPointerInfo().getWithOffset(PtrOff), LN0->getOriginalAlign(), - LN0->getMemOperand()->getFlags(), LN0->getAAInfo(), NewRanges); + Load = DAG.getLoad(VT, DL, LN0->getChain(), NewPtr, + LN0->getPointerInfo().getWithOffset(PtrOff), + LN0->getBaseAlign(), LN0->getMemOperand()->getFlags(), + LN0->getAAInfo(), NewRanges); } else Load = DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT, - LN0->getOriginalAlign(), - LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); + LN0->getBaseAlign(), LN0->getMemOperand()->getFlags(), + LN0->getAAInfo()); // Replace the old load's chain with the new load's chain. WorklistRemover DeadNodes(*this); @@ -20583,16 +20599,15 @@ ShrinkLoadReplaceStoreWithStore(const std::pair &MaskInfo, ++OpsNarrowed; if (UseTruncStore) return DAG.getTruncStore(St->getChain(), SDLoc(St), IVal, Ptr, - St->getPointerInfo().getWithOffset(StOffset), - VT, St->getOriginalAlign()); + St->getPointerInfo().getWithOffset(StOffset), VT, + St->getBaseAlign()); // Truncate down to the new size. 
IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal); - return DAG - .getStore(St->getChain(), SDLoc(St), IVal, Ptr, - St->getPointerInfo().getWithOffset(StOffset), - St->getOriginalAlign()); + return DAG.getStore(St->getChain(), SDLoc(St), IVal, Ptr, + St->getPointerInfo().getWithOffset(StOffset), + St->getBaseAlign()); } /// Look for sequence of load / op / store where op is one of 'or', 'xor', and @@ -22113,11 +22128,11 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { AAMDNodes AAInfo = ST->getAAInfo(); SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), - ST->getOriginalAlign(), MMOFlags, AAInfo); + ST->getBaseAlign(), MMOFlags, AAInfo); Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(4), DL); SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(4), - ST->getOriginalAlign(), MMOFlags, AAInfo); + ST->getBaseAlign(), MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1); } @@ -22588,13 +22603,13 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { SDValue Ptr = ST->getBasePtr(); // Lower value store. SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), - ST->getOriginalAlign(), MMOFlags, AAInfo); + ST->getBaseAlign(), MMOFlags, AAInfo); Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(HalfValBitSize / 8), DL); // Higher value store. 
SDValue St1 = DAG.getStore( St0, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(HalfValBitSize / 8), - ST->getOriginalAlign(), MMOFlags, AAInfo); + ST->getBaseAlign(), MMOFlags, AAInfo); return St1; } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 3e47136edbefc..528c07cc5549d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -441,7 +441,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { bitcastToAPInt().zextOrTrunc(32), SDLoc(CFP), MVT::i32); return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), - ST->getOriginalAlign(), MMOFlags, AAInfo); + ST->getBaseAlign(), MMOFlags, AAInfo); } if (CFP->getValueType(0) == MVT::f64 && @@ -451,7 +451,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). zextOrTrunc(64), SDLoc(CFP), MVT::i64); return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), - ST->getOriginalAlign(), MMOFlags, AAInfo); + ST->getBaseAlign(), MMOFlags, AAInfo); } if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) { @@ -465,11 +465,11 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { std::swap(Lo, Hi); Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), - ST->getOriginalAlign(), MMOFlags, AAInfo); + ST->getBaseAlign(), MMOFlags, AAInfo); Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(4), dl); Hi = DAG.getStore(Chain, dl, Hi, Ptr, ST->getPointerInfo().getWithOffset(4), - ST->getOriginalAlign(), MMOFlags, AAInfo); + ST->getBaseAlign(), MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } @@ -525,7 +525,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { "Can only promote stores to same size type"); Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value); SDValue Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), - 
ST->getOriginalAlign(), MMOFlags, AAInfo); + ST->getBaseAlign(), MMOFlags, AAInfo); ReplaceNode(SDValue(Node, 0), Result); break; } @@ -548,7 +548,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { Value = DAG.getZeroExtendInReg(Value, dl, StVT); SDValue Result = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), NVT, - ST->getOriginalAlign(), MMOFlags, AAInfo); + ST->getBaseAlign(), MMOFlags, AAInfo); ReplaceNode(SDValue(Node, 0), Result); } else if (!StVT.isVector() && !isPowerOf2_64(StWidth.getFixedValue())) { // If not storing a power-of-2 number of bits, expand as two stores. @@ -571,7 +571,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16) // Store the bottom RoundWidth bits. Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), - RoundVT, ST->getOriginalAlign(), MMOFlags, AAInfo); + RoundVT, ST->getBaseAlign(), MMOFlags, AAInfo); // Store the remaining ExtraWidth bits. IncrementSize = RoundWidth / 8; @@ -583,7 +583,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { TLI.getShiftAmountTy(Value.getValueType(), DL))); Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, ST->getOriginalAlign(), MMOFlags, AAInfo); + ExtraVT, ST->getBaseAlign(), MMOFlags, AAInfo); } else { // Big endian - avoid unaligned stores. // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X @@ -593,7 +593,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { DAG.getConstant(ExtraWidth, dl, TLI.getShiftAmountTy(Value.getValueType(), DL))); Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(), RoundVT, - ST->getOriginalAlign(), MMOFlags, AAInfo); + ST->getBaseAlign(), MMOFlags, AAInfo); // Store the remaining ExtraWidth bits. 
IncrementSize = RoundWidth / 8; @@ -602,7 +602,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { Ptr.getValueType())); Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, ST->getOriginalAlign(), MMOFlags, AAInfo); + ExtraVT, ST->getBaseAlign(), MMOFlags, AAInfo); } // The order of the stores doesn't matter. @@ -638,16 +638,15 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { if (TLI.isTypeLegal(StVT)) { Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value); Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), - ST->getOriginalAlign(), MMOFlags, AAInfo); + ST->getBaseAlign(), MMOFlags, AAInfo); } else { // The in-memory type isn't legal. Truncate to the type it would promote // to, and then do a truncstore. Value = DAG.getNode(ISD::TRUNCATE, dl, TLI.getTypeToTransformTo(*DAG.getContext(), StVT), Value); - Result = - DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), StVT, - ST->getOriginalAlign(), MMOFlags, AAInfo); + Result = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), + StVT, ST->getBaseAlign(), MMOFlags, AAInfo); } ReplaceNode(SDValue(Node, 0), Result); @@ -753,7 +752,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { SDValue Result = DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo(), NVT, - LD->getOriginalAlign(), MMOFlags, AAInfo); + LD->getBaseAlign(), MMOFlags, AAInfo); Ch = Result.getValue(1); // The chain. @@ -792,7 +791,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16) // Load the bottom RoundWidth bits. Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr, - LD->getPointerInfo(), RoundVT, LD->getOriginalAlign(), + LD->getPointerInfo(), RoundVT, LD->getBaseAlign(), MMOFlags, AAInfo); // Load the remaining ExtraWidth bits. 
@@ -801,7 +800,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(IncrementSize), dl); Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, LD->getOriginalAlign(), MMOFlags, AAInfo); + ExtraVT, LD->getBaseAlign(), MMOFlags, AAInfo); // Build a factor node to remember that this load is independent of // the other one. @@ -821,7 +820,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8 // Load the top RoundWidth bits. Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr, - LD->getPointerInfo(), RoundVT, LD->getOriginalAlign(), + LD->getPointerInfo(), RoundVT, LD->getBaseAlign(), MMOFlags, AAInfo); // Load the remaining ExtraWidth bits. @@ -830,7 +829,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(IncrementSize), dl); Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, LD->getOriginalAlign(), MMOFlags, AAInfo); + ExtraVT, LD->getBaseAlign(), MMOFlags, AAInfo); // Build a factor node to remember that this load is independent of // the other one. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 432209e8ecb0a..41e85521b41ea 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -168,6 +168,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::POISON: case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break; case ISD::VAARG: R = SoftenFloatRes_VAARG(N); break; + case ISD::AssertNoFPClass: R = GetSoftenedFloat(N->getOperand(0)); break; case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_FMUL: case ISD::VECREDUCE_FMIN: @@ -967,8 +968,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { if (L->getExtensionType() == ISD::NON_EXTLOAD) { NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), NVT, dl, L->getChain(), L->getBasePtr(), L->getOffset(), - L->getPointerInfo(), NVT, L->getOriginalAlign(), - MMOFlags, L->getAAInfo()); + L->getPointerInfo(), NVT, L->getBaseAlign(), MMOFlags, + L->getAAInfo()); // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); @@ -978,8 +979,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { // Do a non-extending load followed by FP_EXTEND. NewL = DAG.getLoad(L->getAddressingMode(), ISD::NON_EXTLOAD, L->getMemoryVT(), dl, L->getChain(), L->getBasePtr(), L->getOffset(), - L->getPointerInfo(), L->getMemoryVT(), - L->getOriginalAlign(), MMOFlags, L->getAAInfo()); + L->getPointerInfo(), L->getMemoryVT(), L->getBaseAlign(), + MMOFlags, L->getAAInfo()); // Legalized the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); @@ -2582,6 +2583,7 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) { case ISD::LLROUND: case ISD::LRINT: case ISD::LLRINT: R = PromoteFloatOp_UnaryOp(N, OpNo); break; + case ISD::AssertNoFPClass: R = PromoteFloatOp_AssertNoFPClass(N, OpNo); break; case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: R = PromoteFloatOp_FP_TO_XINT_SAT(N, OpNo); break; @@ -2640,6 +2642,12 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_UnaryOp(SDNode *N, unsigned OpNo) { return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), Op); } +// Convert the promoted float value to the desired integer type +SDValue DAGTypeLegalizer::PromoteFloatOp_AssertNoFPClass(SDNode *N, + unsigned OpNo) { + return GetPromotedFloat(N->getOperand(0)); +} + SDValue DAGTypeLegalizer::PromoteFloatOp_FP_TO_XINT_SAT(SDNode *N, unsigned OpNo) { SDValue Op = GetPromotedFloat(N->getOperand(0)); @@ -2804,6 +2812,9 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::FTAN: case ISD::FTANH: case ISD::FCANONICALIZE: R = PromoteFloatRes_UnaryOp(N); break; + case ISD::AssertNoFPClass: + R = PromoteFloatRes_AssertNoFPClass(N); + break; // Binary FP Operations case ISD::FADD: @@ -2996,10 +3007,16 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_UnaryOp(SDNode *N) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Op = GetPromotedFloat(N->getOperand(0)); - return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op); } +// Unary operation with a more non-float operand where the result and the +// operand have PromoteFloat type action. Construct a new SDNode with the +// promoted float value of the old operand. +SDValue DAGTypeLegalizer::PromoteFloatRes_AssertNoFPClass(SDNode *N) { + return GetPromotedFloat(N->getOperand(0)); +} + // Binary operations where the result and both operands have PromoteFloat type // action. 
Construct a new SDNode with the promoted float values of the old // operands. @@ -3105,7 +3122,7 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) { SDValue newL = DAG.getLoad( L->getAddressingMode(), L->getExtensionType(), IVT, SDLoc(N), L->getChain(), L->getBasePtr(), L->getOffset(), L->getPointerInfo(), IVT, - L->getOriginalAlign(), L->getMemOperand()->getFlags(), L->getAAInfo()); + L->getBaseAlign(), L->getMemOperand()->getFlags(), L->getAAInfo()); // Legalize the chain result by replacing uses of the old value chain with the // new one ReplaceValueWith(SDValue(N, 1), newL.getValue(1)); @@ -3281,6 +3298,9 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FTAN: case ISD::FTANH: case ISD::FCANONICALIZE: R = SoftPromoteHalfRes_UnaryOp(N); break; + case ISD::AssertNoFPClass: + R = SoftPromoteHalfRes_AssertNoFPClass(N); + break; // Binary FP Operations case ISD::FADD: @@ -3531,7 +3551,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_LOAD(SDNode *N) { SDValue NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), MVT::i16, SDLoc(N), L->getChain(), L->getBasePtr(), L->getOffset(), - L->getPointerInfo(), MVT::i16, L->getOriginalAlign(), + L->getPointerInfo(), MVT::i16, L->getBaseAlign(), L->getMemOperand()->getFlags(), L->getAAInfo()); // Legalize the chain result by replacing uses of the old value chain with the // new one @@ -3607,6 +3627,10 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UnaryOp(SDNode *N) { return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res); } +SDValue DAGTypeLegalizer::SoftPromoteHalfRes_AssertNoFPClass(SDNode *N) { + return GetSoftPromotedHalf(N->getOperand(0)); +} + SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BinOp(SDNode *N) { EVT OVT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 
25e74a2ae5b71..90af5f2cd8e70 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -160,6 +160,10 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(N); break; + case ISD::GET_ACTIVE_LANE_MASK: + Res = PromoteIntRes_GET_ACTIVE_LANE_MASK(N); + break; + case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: Res = PromoteIntRes_PARTIAL_REDUCE_MLA(N); @@ -4249,7 +4253,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, EVT MemVT = N->getMemoryVT(); Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), MemVT, - N->getOriginalAlign(), MMOFlags, AAInfo); + N->getBaseAlign(), MMOFlags, AAInfo); // Remember the chain. Ch = Lo.getValue(1); @@ -4271,8 +4275,8 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, } } else if (DAG.getDataLayout().isLittleEndian()) { // Little-endian - low bits are at low addresses. - Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(), - N->getOriginalAlign(), MMOFlags, AAInfo); + Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(), N->getBaseAlign(), + MMOFlags, AAInfo); unsigned ExcessBits = N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits(); @@ -4283,7 +4287,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(IncrementSize), dl); Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), NEVT, - N->getOriginalAlign(), MMOFlags, AAInfo); + N->getBaseAlign(), MMOFlags, AAInfo); // Build a factor node to remember that this load is independent of the // other one. 
@@ -4301,7 +4305,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits() - ExcessBits), - N->getOriginalAlign(), MMOFlags, AAInfo); + N->getBaseAlign(), MMOFlags, AAInfo); // Increment the pointer to the other half. Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(IncrementSize), dl); @@ -4309,7 +4313,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), EVT::getIntegerVT(*DAG.getContext(), ExcessBits), - N->getOriginalAlign(), MMOFlags, AAInfo); + N->getBaseAlign(), MMOFlags, AAInfo); // Build a factor node to remember that this load is independent of the // other one. @@ -5804,7 +5808,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { if (N->getMemoryVT().bitsLE(NVT)) { GetExpandedInteger(N->getValue(), Lo, Hi); return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), - N->getMemoryVT(), N->getOriginalAlign(), MMOFlags, + N->getMemoryVT(), N->getBaseAlign(), MMOFlags, AAInfo); } @@ -5812,8 +5816,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { // Little-endian - low bits are at low addresses. 
GetExpandedInteger(N->getValue(), Lo, Hi); - Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), - N->getOriginalAlign(), MMOFlags, AAInfo); + Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), N->getBaseAlign(), + MMOFlags, AAInfo); unsigned ExcessBits = N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits(); @@ -5824,7 +5828,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize)); Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), - NEVT, N->getOriginalAlign(), MMOFlags, AAInfo); + NEVT, N->getBaseAlign(), MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } @@ -5853,7 +5857,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { // Store both the high bits and maybe some of the low bits. Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(), HiVT, - N->getOriginalAlign(), MMOFlags, AAInfo); + N->getBaseAlign(), MMOFlags, AAInfo); // Increment the pointer to the other half. 
Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize)); @@ -5861,7 +5865,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), EVT::getIntegerVT(*DAG.getContext(), ExcessBits), - N->getOriginalAlign(), MMOFlags, AAInfo); + N->getBaseAlign(), MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } @@ -6222,6 +6226,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N) { return DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, SDLoc(N), NVT, N->ops()); } +SDValue DAGTypeLegalizer::PromoteIntRes_GET_ACTIVE_LANE_MASK(SDNode *N) { + EVT VT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + return DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, SDLoc(N), NVT, N->ops()); +} + SDValue DAGTypeLegalizer::PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N) { SDLoc DL(N); EVT VT = N->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 720393158aa5e..cf3a9e23f4878 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -379,6 +379,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_IS_FPCLASS(SDNode *N); SDValue PromoteIntRes_PATCHPOINT(SDNode *N); SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N); + SDValue PromoteIntRes_GET_ACTIVE_LANE_MASK(SDNode *N); SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N); // Integer Operand Promotion. 
@@ -772,6 +773,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteFloatRes_SELECT(SDNode *N); SDValue PromoteFloatRes_SELECT_CC(SDNode *N); SDValue PromoteFloatRes_UnaryOp(SDNode *N); + SDValue PromoteFloatRes_AssertNoFPClass(SDNode *N); SDValue PromoteFloatRes_UNDEF(SDNode *N); SDValue BitcastToInt_ATOMIC_SWAP(SDNode *N); SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N); @@ -785,6 +787,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_UnaryOp(SDNode *N, unsigned OpNo); + SDValue PromoteFloatOp_AssertNoFPClass(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_FP_TO_XINT_SAT(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_STORE(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_ATOMIC_STORE(SDNode *N, unsigned OpNo); @@ -820,6 +823,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftPromoteHalfRes_SELECT(SDNode *N); SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N); SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N); + SDValue SoftPromoteHalfRes_AssertNoFPClass(SDNode *N); SDValue SoftPromoteHalfRes_XINT_TO_FP(SDNode *N); SDValue SoftPromoteHalfRes_UNDEF(SDNode *N); SDValue SoftPromoteHalfRes_VECREDUCE(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 113a3bc0bbea6..88c1af20a321e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -260,15 +260,14 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo, assert(NVT.isByteSized() && "Expanded type not byte sized!"); Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(), - LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), - AAInfo); + LD->getBaseAlign(), LD->getMemOperand()->getFlags(), AAInfo); // Increment the pointer to the other half. 
unsigned IncrementSize = NVT.getSizeInBits() / 8; Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize)); - Hi = DAG.getLoad( - NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize), - LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), AAInfo); + Hi = DAG.getLoad(NVT, dl, Chain, Ptr, + LD->getPointerInfo().getWithOffset(IncrementSize), + LD->getBaseAlign(), LD->getMemOperand()->getFlags(), AAInfo); // Build a factor node to remember that this load is independent of the // other one. @@ -495,14 +494,14 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) { if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout())) std::swap(Lo, Hi); - Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(), - St->getOriginalAlign(), St->getMemOperand()->getFlags(), - AAInfo); + Lo = + DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(), St->getBaseAlign(), + St->getMemOperand()->getFlags(), AAInfo); Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize)); Hi = DAG.getStore( Chain, dl, Hi, Ptr, St->getPointerInfo().getWithOffset(IncrementSize), - St->getOriginalAlign(), St->getMemOperand()->getFlags(), AAInfo); + St->getBaseAlign(), St->getMemOperand()->getFlags(), AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index ee31baac7b321..f9fbd303d5e89 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -61,6 +61,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::AssertZext: case ISD::AssertSext: case ISD::FPOWI: + case ISD::AssertNoFPClass: R = ScalarizeVecRes_UnaryOpWithExtraInput(N); break; case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break; @@ -466,7 +467,7 @@ SDValue 
DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { N->getValueType(0).getVectorElementType(), SDLoc(N), N->getChain(), N->getBasePtr(), DAG.getUNDEF(N->getBasePtr().getValueType()), N->getPointerInfo(), N->getMemoryVT().getVectorElementType(), - N->getOriginalAlign(), N->getMemOperand()->getFlags(), N->getAAInfo()); + N->getBaseAlign(), N->getMemOperand()->getFlags(), N->getAAInfo()); // Legalize the chain result - switch anything that used the old chain to // use the new one. @@ -1006,13 +1007,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){ return DAG.getTruncStore( N->getChain(), dl, GetScalarizedVector(N->getOperand(1)), N->getBasePtr(), N->getPointerInfo(), - N->getMemoryVT().getVectorElementType(), N->getOriginalAlign(), + N->getMemoryVT().getVectorElementType(), N->getBaseAlign(), N->getMemOperand()->getFlags(), N->getAAInfo()); return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)), - N->getBasePtr(), N->getPointerInfo(), - N->getOriginalAlign(), N->getMemOperand()->getFlags(), - N->getAAInfo()); + N->getBasePtr(), N->getPointerInfo(), N->getBaseAlign(), + N->getMemOperand()->getFlags(), N->getAAInfo()); } /// If the value to round is a vector that needs to be scalarized, it must be @@ -1276,6 +1276,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::UINT_TO_FP: case ISD::VP_UINT_TO_FP: case ISD::FCANONICALIZE: + case ISD::AssertNoFPClass: SplitVecRes_UnaryOp(N, Lo, Hi); break; case ISD::ADDRSPACECAST: @@ -2132,14 +2133,14 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, } Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset, - LD->getPointerInfo(), LoMemVT, LD->getOriginalAlign(), - MMOFlags, AAInfo); + LD->getPointerInfo(), LoMemVT, LD->getBaseAlign(), MMOFlags, + AAInfo); MachinePointerInfo MPI; IncrementPointer(LD, LoMemVT, MPI, Ptr); Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset, MPI, - HiMemVT, 
LD->getOriginalAlign(), MMOFlags, AAInfo); + HiMemVT, LD->getBaseAlign(), MMOFlags, AAInfo); // Build a factor node to remember that this load is independent of the // other one. @@ -2163,7 +2164,7 @@ void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue Ptr = LD->getBasePtr(); SDValue Offset = LD->getOffset(); assert(Offset.isUndef() && "Unexpected indexed variable-length load offset"); - Align Alignment = LD->getOriginalAlign(); + Align Alignment = LD->getBaseAlign(); SDValue Mask = LD->getMask(); SDValue EVL = LD->getVectorLength(); EVT MemoryVT = LD->getMemoryVT(); @@ -2287,7 +2288,7 @@ void DAGTypeLegalizer::SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD, SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, SLD->getBasePtr(), Increment); - Align Alignment = SLD->getOriginalAlign(); + Align Alignment = SLD->getBaseAlign(); if (LoMemVT.isScalableVector()) Alignment = commonAlignment( Alignment, LoMemVT.getSizeInBits().getKnownMinValue() / 8); @@ -2326,7 +2327,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, assert(Offset.isUndef() && "Unexpected indexed masked load offset"); SDValue Mask = MLD->getMask(); SDValue PassThru = MLD->getPassThru(); - Align Alignment = MLD->getOriginalAlign(); + Align Alignment = MLD->getBaseAlign(); ISD::LoadExtType ExtType = MLD->getExtensionType(); // Split Mask operand @@ -2418,7 +2419,7 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo, }(); EVT MemoryVT = N->getMemoryVT(); - Align Alignment = N->getOriginalAlign(); + Align Alignment = N->getBaseAlign(); // Split Mask operand SDValue MaskLo, MaskHi; @@ -2614,7 +2615,7 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, const SDNodeFlags Flags = N->getFlags(); unsigned Opcode = N->getOpcode(); if (N->getNumOperands() <= 2) { - if (Opcode == ISD::FP_ROUND) { + if (Opcode == ISD::FP_ROUND || Opcode == ISD::AssertNoFPClass) { Lo = DAG.getNode(Opcode, dl, LoVT, Lo, N->getOperand(1), Flags); Hi = 
DAG.getNode(Opcode, dl, HiVT, Hi, N->getOperand(1), Flags); } else { @@ -3853,7 +3854,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo) { SDValue Mask = N->getMask(); SDValue EVL = N->getVectorLength(); SDValue Data = N->getValue(); - Align Alignment = N->getOriginalAlign(); + Align Alignment = N->getBaseAlign(); SDLoc DL(N); SDValue DataLo, DataHi; @@ -3977,7 +3978,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_VP_STRIDED_STORE(VPStridedStoreSDNode *N, DAG.getSExtOrTrunc(N->getStride(), DL, PtrVT)); SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, N->getBasePtr(), Increment); - Align Alignment = N->getOriginalAlign(); + Align Alignment = N->getBaseAlign(); if (LoMemVT.isScalableVector()) Alignment = commonAlignment(Alignment, LoMemVT.getSizeInBits().getKnownMinValue() / 8); @@ -4006,7 +4007,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, assert(Offset.isUndef() && "Unexpected indexed masked store offset"); SDValue Mask = N->getMask(); SDValue Data = N->getValue(); - Align Alignment = N->getOriginalAlign(); + Align Alignment = N->getBaseAlign(); SDLoc DL(N); SDValue DataLo, DataHi; @@ -4081,7 +4082,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo) { SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); EVT MemoryVT = N->getMemoryVT(); - Align Alignment = N->getOriginalAlign(); + Align Alignment = N->getBaseAlign(); SDLoc DL(N); struct Operands { SDValue Mask; @@ -4170,7 +4171,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); EVT MemoryVT = N->getMemoryVT(); - Align Alignment = N->getOriginalAlign(); + Align Alignment = N->getBaseAlign(); MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags(); AAMDNodes AAInfo = N->getAAInfo(); SDValue Lo, Hi; @@ -4872,6 +4873,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::FREEZE: case ISD::ARITH_FENCE: case 
ISD::FCANONICALIZE: + case ISD::AssertNoFPClass: Res = WidenVecRes_Unary(N); break; case ISD::FMA: case ISD::VP_FMA: @@ -5616,6 +5618,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) { SDValue InOp = GetWidenedVector(N->getOperand(0)); if (N->getNumOperands() == 1) return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, N->getFlags()); + if (N->getOpcode() == ISD::AssertNoFPClass) + return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, + N->getOperand(1), N->getFlags()); assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); assert(N->isVPOpcode() && "Expected VP opcode"); @@ -7870,7 +7875,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, } SDValue LdOp = DAG.getLoad(*FirstVT, dl, Chain, BasePtr, LD->getPointerInfo(), - LD->getOriginalAlign(), MMOFlags, AAInfo); + LD->getBaseAlign(), MMOFlags, AAInfo); LdChain.push_back(LdOp.getValue(1)); // Check if we can load the element with one instruction. @@ -7911,7 +7916,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, for (EVT MemVT : MemVTs) { Align NewAlign = ScaledOffset == 0 - ? LD->getOriginalAlign() + ? 
LD->getBaseAlign() : commonAlignment(LD->getAlign(), ScaledOffset); SDValue L = DAG.getLoad(MemVT, dl, Chain, BasePtr, MPI, NewAlign, MMOFlags, AAInfo); @@ -8021,7 +8026,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl &LdChain, unsigned Increment = LdEltVT.getSizeInBits() / 8; Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, LD->getPointerInfo(), - LdEltVT, LD->getOriginalAlign(), MMOFlags, AAInfo); + LdEltVT, LD->getBaseAlign(), MMOFlags, AAInfo); LdChain.push_back(Ops[0].getValue(1)); unsigned i = 0, Offset = Increment; for (i=1; i < NumElts; ++i, Offset += Increment) { @@ -8029,7 +8034,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl &LdChain, DAG.getObjectPtrOffset(dl, BasePtr, TypeSize::getFixed(Offset)); Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr, LD->getPointerInfo().getWithOffset(Offset), LdEltVT, - LD->getOriginalAlign(), MMOFlags, AAInfo); + LD->getBaseAlign(), MMOFlags, AAInfo); LdChain.push_back(Ops[i].getValue(1)); } @@ -8097,7 +8102,7 @@ bool DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl &StChain, unsigned NumVTElts = NewVT.getVectorMinNumElements(); do { Align NewAlign = ScaledOffset == 0 - ? ST->getOriginalAlign() + ? 
ST->getBaseAlign() : commonAlignment(ST->getAlign(), ScaledOffset); SDValue EOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp, DAG.getVectorIdxConstant(Idx, dl)); @@ -8119,9 +8124,8 @@ bool DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl &StChain, do { SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp, DAG.getVectorIdxConstant(Idx++, dl)); - SDValue PartStore = - DAG.getStore(Chain, dl, EOp, BasePtr, MPI, ST->getOriginalAlign(), - MMOFlags, AAInfo); + SDValue PartStore = DAG.getStore(Chain, dl, EOp, BasePtr, MPI, + ST->getBaseAlign(), MMOFlags, AAInfo); StChain.push_back(PartStore); IncrementPointer(cast(PartStore), NewVT, MPI, BasePtr); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index bbf1b0fd590ef..5d640c39a56d5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5831,6 +5831,15 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, return false; return true; } + case ISD::AssertNoFPClass: { + FPClassTest NoFPClass = + static_cast(Op.getConstantOperandVal(1)); + if ((NoFPClass & fcNan) == fcNan) + return true; + if (SNaN && (NoFPClass & fcSNan) == fcSNan) + return true; + return isKnownNeverNaN(Op.getOperand(0), DemandedElts, SNaN, Depth + 1); + } default: if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN || Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::INTRINSIC_VOID) { @@ -7490,6 +7499,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, N2.getOpcode() == ISD::TargetConstant && "Invalid FP_ROUND!"); if (N1.getValueType() == VT) return N1; // noop conversion. 
break; + case ISD::AssertNoFPClass: { + assert(N1.getValueType().isFloatingPoint() && + "AssertNoFPClass is used for a non-floating type"); + assert(isa(N2) && "NoFPClass is not Constant"); + FPClassTest NoFPClass = static_cast(N2->getAsZExtVal()); + assert(llvm::to_underlying(NoFPClass) <= + BitmaskEnumDetail::Mask() && + "FPClassTest value too large"); + (void)NoFPClass; + break; + } case ISD::AssertSext: case ISD::AssertZext: { EVT EVT = cast(N2)->getVT(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 8e74a076cc013..3ebd3a4b88097 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7987,14 +7987,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::get_active_lane_mask: { EVT CCVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); SDValue Index = getValue(I.getOperand(0)); + SDValue TripCount = getValue(I.getOperand(1)); EVT ElementVT = Index.getValueType(); if (!TLI.shouldExpandGetActiveLaneMask(CCVT, ElementVT)) { - visitTargetIntrinsic(I, Intrinsic); + setValue(&I, DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, sdl, CCVT, Index, + TripCount)); return; } - SDValue TripCount = getValue(I.getOperand(1)); EVT VecTy = EVT::getVectorVT(*DAG.getContext(), ElementVT, CCVT.getVectorElementCount()); @@ -11803,9 +11804,18 @@ void SelectionDAGISel::LowerArguments(const Function &F) { else if (Arg.hasAttribute(Attribute::ZExt)) AssertOp = ISD::AssertZext; - ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts, - PartVT, VT, nullptr, NewRoot, - F.getCallingConv(), AssertOp)); + SDValue OutVal = + getCopyFromParts(DAG, dl, &InVals[i], NumParts, PartVT, VT, nullptr, + NewRoot, F.getCallingConv(), AssertOp); + + FPClassTest NoFPClass = Arg.getNoFPClass(); + if (NoFPClass != fcNone) { + SDValue SDNoFPClass = DAG.getTargetConstant( + static_cast(NoFPClass), dl, MVT::i32); + 
OutVal = DAG.getNode(ISD::AssertNoFPClass, dl, OutVal.getValueType(), + OutVal, SDNoFPClass); + } + ArgValues.push_back(OutVal); } i += NumParts; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 8faf97271d99e..803894e298dd5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -124,6 +124,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::TokenFactor: return "TokenFactor"; case ISD::AssertSext: return "AssertSext"; case ISD::AssertZext: return "AssertZext"; + case ISD::AssertNoFPClass: return "AssertNoFPClass"; case ISD::AssertAlign: return "AssertAlign"; case ISD::BasicBlock: return "BasicBlock"; @@ -576,6 +577,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::VECTOR_FIND_LAST_ACTIVE: return "find_last_active"; + case ISD::GET_ACTIVE_LANE_MASK: + return "get_active_lane_mask"; + case ISD::PARTIAL_REDUCE_UMLA: return "partial_reduce_umla"; case ISD::PARTIAL_REDUCE_SMLA: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 1bc30336a02bf..586728a44571e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -3264,6 +3264,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, return; case ISD::AssertSext: case ISD::AssertZext: + case ISD::AssertNoFPClass: case ISD::AssertAlign: ReplaceUses(SDValue(NodeToMatch, 0), NodeToMatch->getOperand(0)); CurDAG->RemoveDeadNode(NodeToMatch); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a476b191abf62..da999b5057d49 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4766,7 +4766,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, 
SDValue N0, SDValue N1, SDValue NewLoad = DAG.getLoad(newVT, dl, Lod->getChain(), Ptr, Lod->getPointerInfo().getWithOffset(bestOffset), - Lod->getOriginalAlign()); + Lod->getBaseAlign()); SDValue And = DAG.getNode(ISD::AND, dl, newVT, NewLoad, DAG.getConstant(bestMask.trunc(bestWidth), dl, newVT)); @@ -6071,10 +6071,9 @@ TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences( Ret.emplace_back(Code, CType); } - std::stable_sort( - Ret.begin(), Ret.end(), [](ConstraintPair a, ConstraintPair b) { - return getConstraintPiority(a.second) > getConstraintPiority(b.second); - }); + llvm::stable_sort(Ret, [](ConstraintPair a, ConstraintPair b) { + return getConstraintPiority(a.second) > getConstraintPiority(b.second); + }); return Ret; } @@ -10008,7 +10007,7 @@ TargetLowering::scalarizeVectorLoad(LoadSDNode *LD, // the codegen worse. SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, SL, LoadVT, Chain, BasePTR, - LD->getPointerInfo(), SrcIntVT, LD->getOriginalAlign(), + LD->getPointerInfo(), SrcIntVT, LD->getBaseAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo()); SmallVector Vals; @@ -10041,11 +10040,10 @@ TargetLowering::scalarizeVectorLoad(LoadSDNode *LD, SmallVector LoadChains; for (unsigned Idx = 0; Idx < NumElem; ++Idx) { - SDValue ScalarLoad = - DAG.getExtLoad(ExtType, SL, DstEltVT, Chain, BasePTR, - LD->getPointerInfo().getWithOffset(Idx * Stride), - SrcEltVT, LD->getOriginalAlign(), - LD->getMemOperand()->getFlags(), LD->getAAInfo()); + SDValue ScalarLoad = DAG.getExtLoad( + ExtType, SL, DstEltVT, Chain, BasePTR, + LD->getPointerInfo().getWithOffset(Idx * Stride), SrcEltVT, + LD->getBaseAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo()); BasePTR = DAG.getObjectPtrOffset(SL, BasePTR, TypeSize::getFixed(Stride)); @@ -10107,7 +10105,7 @@ SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST, } return DAG.getStore(Chain, SL, CurrVal, BasePtr, ST->getPointerInfo(), - ST->getOriginalAlign(), ST->getMemOperand()->getFlags(), + 
ST->getBaseAlign(), ST->getMemOperand()->getFlags(), ST->getAAInfo()); } @@ -10127,7 +10125,7 @@ SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST, // This scalar TruncStore may be illegal, but we legalize it later. SDValue Store = DAG.getTruncStore( Chain, SL, Elt, Ptr, ST->getPointerInfo().getWithOffset(Idx * Stride), - MemSclVT, ST->getOriginalAlign(), ST->getMemOperand()->getFlags(), + MemSclVT, ST->getBaseAlign(), ST->getMemOperand()->getFlags(), ST->getAAInfo()); Stores.push_back(Store); @@ -10193,8 +10191,7 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const { // Load one integer register's worth from the original location. SDValue Load = DAG.getLoad( RegVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(Offset), - LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), - LD->getAAInfo()); + LD->getBaseAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo()); // Follow the load with a store to the stack slot. Remember the store. Stores.push_back(DAG.getStore( Load.getValue(1), dl, Load, StackPtr, @@ -10209,11 +10206,10 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const { // The last copy may be partial. Do an extending load. EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), 8 * (LoadedBytes - Offset)); - SDValue Load = - DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr, - LD->getPointerInfo().getWithOffset(Offset), MemVT, - LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), - LD->getAAInfo()); + SDValue Load = DAG.getExtLoad( + ISD::EXTLOAD, dl, RegVT, Chain, Ptr, + LD->getPointerInfo().getWithOffset(Offset), MemVT, LD->getBaseAlign(), + LD->getMemOperand()->getFlags(), LD->getAAInfo()); // Follow the load with a store to the stack slot. Remember the store. // On big-endian machines this requires a truncating store to ensure // that the bits end up in the right place. 
@@ -10243,7 +10239,7 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const { NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2); NumBits >>= 1; - Align Alignment = LD->getOriginalAlign(); + Align Alignment = LD->getBaseAlign(); unsigned IncrementSize = NumBits / 8; ISD::LoadExtType HiExtType = LD->getExtensionType(); @@ -10294,7 +10290,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST, SDValue Ptr = ST->getBasePtr(); SDValue Val = ST->getValue(); EVT VT = Val.getValueType(); - Align Alignment = ST->getOriginalAlign(); + Align Alignment = ST->getBaseAlign(); auto &MF = DAG.getMachineFunction(); EVT StoreMemVT = ST->getMemoryVT(); @@ -10351,7 +10347,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST, // Store it to the final location. Remember the store. Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr, ST->getPointerInfo().getWithOffset(Offset), - ST->getOriginalAlign(), + ST->getBaseAlign(), ST->getMemOperand()->getFlags())); // Increment the pointers. Offset += RegBytes; @@ -10370,11 +10366,10 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST, ISD::EXTLOAD, dl, RegVT, Store, StackPtr, MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset), LoadMemVT); - Stores.push_back( - DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr, - ST->getPointerInfo().getWithOffset(Offset), LoadMemVT, - ST->getOriginalAlign(), - ST->getMemOperand()->getFlags(), ST->getAAInfo())); + Stores.push_back(DAG.getTruncStore( + Load.getValue(1), dl, Load, Ptr, + ST->getPointerInfo().getWithOffset(Offset), LoadMemVT, + ST->getBaseAlign(), ST->getMemOperand()->getFlags(), ST->getAAInfo())); // The order of the stores doesn't matter - say it with a TokenFactor. 
SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); return Result; diff --git a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index 60c8372577a93..1f9beb84ef62d 100644 --- a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -109,7 +109,7 @@ class ShadowStackGCLowering : public FunctionPass { PreservedAnalyses ShadowStackGCLoweringPass::run(Module &M, ModuleAnalysisManager &MAM) { auto &Map = MAM.getResult(M); - if (Map.StrategyMap.contains("shadow-stack")) + if (!Map.contains("shadow-stack")) return PreservedAnalyses::all(); ShadowStackGCLoweringImpl Impl; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index e76e518ef8595..1d2f379d1509b 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -622,7 +622,9 @@ class ThreadUnsafeDWARFContextState : public DWARFContext::DWARFContextState { return getNormalTypeUnitMap(); } - + Error doWorkThreadSafely(function_ref Work) override { + return Work(); + } }; class ThreadSafeState : public ThreadUnsafeDWARFContextState { @@ -738,6 +740,11 @@ class ThreadSafeState : public ThreadUnsafeDWARFContextState { std::unique_lock LockGuard(Mutex); return ThreadUnsafeDWARFContextState::getTypeUnitMap(IsDWO); } + + Error doWorkThreadSafely(function_ref Work) override { + std::unique_lock LockGuard(Mutex); + return ThreadUnsafeDWARFContextState::doWorkThreadSafely(Work); + } }; } // namespace diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp index bdd04b00f557b..8dc4050e2d8a2 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -496,108 +496,111 @@ void DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) { } Error DWARFUnit::tryExtractDIEsIfNeeded(bool CUDieOnly) { - if ((CUDieOnly && !DieArray.empty()) || - DieArray.size() > 1) - return 
Error::success(); // Already parsed. - - bool HasCUDie = !DieArray.empty(); - extractDIEsToVector(!HasCUDie, !CUDieOnly, DieArray); - - if (DieArray.empty()) - return Error::success(); + return Context.doWorkThreadSafely([&]() -> Error { + if ((CUDieOnly && !DieArray.empty()) || DieArray.size() > 1) + return Error::success(); // Already parsed. + + bool HasCUDie = !DieArray.empty(); + extractDIEsToVector(!HasCUDie, !CUDieOnly, DieArray); + + if (DieArray.empty()) + return Error::success(); + + // If CU DIE was just parsed, copy several attribute values from it. + if (HasCUDie) + return Error::success(); + + DWARFDie UnitDie(this, &DieArray[0]); + if (std::optional DWOId = + toUnsigned(UnitDie.find(DW_AT_GNU_dwo_id))) + Header.setDWOId(*DWOId); + if (!IsDWO) { + assert(AddrOffsetSectionBase == std::nullopt); + assert(RangeSectionBase == 0); + assert(LocSectionBase == 0); + AddrOffsetSectionBase = toSectionOffset(UnitDie.find(DW_AT_addr_base)); + if (!AddrOffsetSectionBase) + AddrOffsetSectionBase = + toSectionOffset(UnitDie.find(DW_AT_GNU_addr_base)); + RangeSectionBase = toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0); + LocSectionBase = toSectionOffset(UnitDie.find(DW_AT_loclists_base), 0); + } - // If CU DIE was just parsed, copy several attribute values from it. - if (HasCUDie) - return Error::success(); + // In general, in DWARF v5 and beyond we derive the start of the unit's + // contribution to the string offsets table from the unit DIE's + // DW_AT_str_offsets_base attribute. Split DWARF units do not use this + // attribute, so we assume that there is a contribution to the string + // offsets table starting at offset 0 of the debug_str_offsets.dwo section. + // In both cases we need to determine the format of the contribution, + // which may differ from the unit's format. + DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection, + IsLittleEndian, 0); + if (IsDWO || getVersion() >= 5) { + auto StringOffsetOrError = + IsDWO ? 
determineStringOffsetsTableContributionDWO(DA) + : determineStringOffsetsTableContribution(DA); + if (!StringOffsetOrError) { + return createStringError(errc::invalid_argument, + "invalid reference to or invalid content in " + ".debug_str_offsets[.dwo]: " + + toString(StringOffsetOrError.takeError())); + } - DWARFDie UnitDie(this, &DieArray[0]); - if (std::optional DWOId = - toUnsigned(UnitDie.find(DW_AT_GNU_dwo_id))) - Header.setDWOId(*DWOId); - if (!IsDWO) { - assert(AddrOffsetSectionBase == std::nullopt); - assert(RangeSectionBase == 0); - assert(LocSectionBase == 0); - AddrOffsetSectionBase = toSectionOffset(UnitDie.find(DW_AT_addr_base)); - if (!AddrOffsetSectionBase) - AddrOffsetSectionBase = - toSectionOffset(UnitDie.find(DW_AT_GNU_addr_base)); - RangeSectionBase = toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0); - LocSectionBase = toSectionOffset(UnitDie.find(DW_AT_loclists_base), 0); - } + StringOffsetsTableContribution = *StringOffsetOrError; + } - // In general, in DWARF v5 and beyond we derive the start of the unit's - // contribution to the string offsets table from the unit DIE's - // DW_AT_str_offsets_base attribute. Split DWARF units do not use this - // attribute, so we assume that there is a contribution to the string - // offsets table starting at offset 0 of the debug_str_offsets.dwo section. - // In both cases we need to determine the format of the contribution, - // which may differ from the unit's format. - DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection, - IsLittleEndian, 0); - if (IsDWO || getVersion() >= 5) { - auto StringOffsetOrError = - IsDWO ? 
determineStringOffsetsTableContributionDWO(DA) - : determineStringOffsetsTableContribution(DA); - if (!StringOffsetOrError) - return createStringError(errc::invalid_argument, - "invalid reference to or invalid content in " - ".debug_str_offsets[.dwo]: " + - toString(StringOffsetOrError.takeError())); - - StringOffsetsTableContribution = *StringOffsetOrError; - } + // DWARF v5 uses the .debug_rnglists and .debug_rnglists.dwo sections to + // describe address ranges. + if (getVersion() >= 5) { + // In case of DWP, the base offset from the index has to be added. + if (IsDWO) { + uint64_t ContributionBaseOffset = 0; + if (auto *IndexEntry = Header.getIndexEntry()) + if (auto *Contrib = IndexEntry->getContribution(DW_SECT_RNGLISTS)) + ContributionBaseOffset = Contrib->getOffset(); + setRangesSection( + &Context.getDWARFObj().getRnglistsDWOSection(), + ContributionBaseOffset + + DWARFListTableHeader::getHeaderSize(Header.getFormat())); + } else + setRangesSection(&Context.getDWARFObj().getRnglistsSection(), + toSectionOffset(UnitDie.find(DW_AT_rnglists_base), + DWARFListTableHeader::getHeaderSize( + Header.getFormat()))); + } - // DWARF v5 uses the .debug_rnglists and .debug_rnglists.dwo sections to - // describe address ranges. - if (getVersion() >= 5) { - // In case of DWP, the base offset from the index has to be added. if (IsDWO) { - uint64_t ContributionBaseOffset = 0; + // If we are reading a package file, we need to adjust the location list + // data based on the index entries. + StringRef Data = Header.getVersion() >= 5 + ? 
Context.getDWARFObj().getLoclistsDWOSection().Data + : Context.getDWARFObj().getLocDWOSection().Data; if (auto *IndexEntry = Header.getIndexEntry()) - if (auto *Contrib = IndexEntry->getContribution(DW_SECT_RNGLISTS)) - ContributionBaseOffset = Contrib->getOffset(); - setRangesSection( - &Context.getDWARFObj().getRnglistsDWOSection(), - ContributionBaseOffset + - DWARFListTableHeader::getHeaderSize(Header.getFormat())); - } else - setRangesSection(&Context.getDWARFObj().getRnglistsSection(), - toSectionOffset(UnitDie.find(DW_AT_rnglists_base), - DWARFListTableHeader::getHeaderSize( - Header.getFormat()))); - } + if (const auto *C = IndexEntry->getContribution( + Header.getVersion() >= 5 ? DW_SECT_LOCLISTS : DW_SECT_EXT_LOC)) + Data = Data.substr(C->getOffset(), C->getLength()); + + DWARFDataExtractor DWARFData(Data, IsLittleEndian, getAddressByteSize()); + LocTable = + std::make_unique(DWARFData, Header.getVersion()); + LocSectionBase = DWARFListTableHeader::getHeaderSize(Header.getFormat()); + } else if (getVersion() >= 5) { + LocTable = std::make_unique( + DWARFDataExtractor(Context.getDWARFObj(), + Context.getDWARFObj().getLoclistsSection(), + IsLittleEndian, getAddressByteSize()), + getVersion()); + } else { + LocTable = std::make_unique(DWARFDataExtractor( + Context.getDWARFObj(), Context.getDWARFObj().getLocSection(), + IsLittleEndian, getAddressByteSize())); + } - if (IsDWO) { - // If we are reading a package file, we need to adjust the location list - // data based on the index entries. - StringRef Data = Header.getVersion() >= 5 - ? Context.getDWARFObj().getLoclistsDWOSection().Data - : Context.getDWARFObj().getLocDWOSection().Data; - if (auto *IndexEntry = Header.getIndexEntry()) - if (const auto *C = IndexEntry->getContribution( - Header.getVersion() >= 5 ? 
DW_SECT_LOCLISTS : DW_SECT_EXT_LOC)) - Data = Data.substr(C->getOffset(), C->getLength()); - - DWARFDataExtractor DWARFData(Data, IsLittleEndian, getAddressByteSize()); - LocTable = - std::make_unique(DWARFData, Header.getVersion()); - LocSectionBase = DWARFListTableHeader::getHeaderSize(Header.getFormat()); - } else if (getVersion() >= 5) { - LocTable = std::make_unique( - DWARFDataExtractor(Context.getDWARFObj(), - Context.getDWARFObj().getLoclistsSection(), - IsLittleEndian, getAddressByteSize()), - getVersion()); - } else { - LocTable = std::make_unique(DWARFDataExtractor( - Context.getDWARFObj(), Context.getDWARFObj().getLocSection(), - IsLittleEndian, getAddressByteSize())); - } + // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for + // skeleton CU DIE, so that DWARF users not aware of it are not broken. - // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for - // skeleton CU DIE, so that DWARF users not aware of it are not broken. - return Error::success(); + return Error::success(); + }); } bool DWARFUnit::parseDWO(StringRef DWOAlternativeLocation) { @@ -652,15 +655,21 @@ bool DWARFUnit::parseDWO(StringRef DWOAlternativeLocation) { return true; } -void DWARFUnit::clearDIEs(bool KeepCUDie) { - // Do not use resize() + shrink_to_fit() to free memory occupied by dies. - // shrink_to_fit() is a *non-binding* request to reduce capacity() to size(). - // It depends on the implementation whether the request is fulfilled. - // Create a new vector with a small capacity and assign it to the DieArray to - // have previous contents freed. - DieArray = (KeepCUDie && !DieArray.empty()) - ? std::vector({DieArray[0]}) - : std::vector(); +void DWARFUnit::clearDIEs(bool KeepCUDie, bool KeepDWODies) { + assert(!Context.doWorkThreadSafely([&] { + if (!KeepDWODies && DWO) { + DWO->clearDIEs(KeepCUDie, KeepDWODies); + } + // Do not use resize() + shrink_to_fit() to free memory occupied by dies. 
+ // shrink_to_fit() is a *non-binding* request to reduce capacity() to + // size(). It depends on the implementation whether the request is + // fulfilled. Create a new vector with a small capacity and assign it to the + // DieArray to have previous contents freed. + DieArray = (KeepCUDie && !DieArray.empty()) + ? std::vector({DieArray[0]}) + : std::vector(); + return Error::success(); + })); } Expected diff --git a/llvm/lib/DebugInfo/GSYM/CMakeLists.txt b/llvm/lib/DebugInfo/GSYM/CMakeLists.txt index 724b5b213d643..eb610a6b34f51 100644 --- a/llvm/lib/DebugInfo/GSYM/CMakeLists.txt +++ b/llvm/lib/DebugInfo/GSYM/CMakeLists.txt @@ -4,7 +4,7 @@ add_llvm_component_library(LLVMDebugInfoGSYM FileWriter.cpp FunctionInfo.cpp GsymCreator.cpp - GsymDIContext.cpp + GsymContext.cpp GsymReader.cpp InlineInfo.cpp LineTable.cpp diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp index 7a0256f10ea60..1f70d273a9d9d 100644 --- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp +++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp @@ -656,6 +656,11 @@ Error DwarfTransformer::convert(uint32_t NumThreads, OutputAggregator &Out) { DWARFDie Die = getDie(*CU); CUInfo CUI(DICtx, dyn_cast(CU.get())); handleDie(Out, CUI, Die); + // Release the line table, once we're done. + DICtx.clearLineTableForUnit(CU.get()); + // Free any DIEs that were allocated by the DWARF parser. + // If/when they're needed by other CU's, they'll be recreated. + CU->clearDIEs(/*KeepCUDie=*/false, /*KeepDWODIEs=*/false); } } else { // LLVM Dwarf parser is not thread-safe and we need to parse all DWARF up @@ -668,12 +673,7 @@ Error DwarfTransformer::convert(uint32_t NumThreads, OutputAggregator &Out) { for (const auto &CU : DICtx.compile_units()) CU->getAbbreviations(); - // Now parse all DIEs in case we have cross compile unit references in a - // thread pool. 
DefaultThreadPool pool(hardware_concurrency(NumThreads)); - for (const auto &CU : DICtx.compile_units()) - pool.async([&CU]() { CU->getUnitDIE(false /*CUDieOnly*/); }); - pool.wait(); // Now convert all DWARF to GSYM in a thread pool. std::mutex LogMutex; @@ -681,11 +681,15 @@ Error DwarfTransformer::convert(uint32_t NumThreads, OutputAggregator &Out) { DWARFDie Die = getDie(*CU); if (Die) { CUInfo CUI(DICtx, dyn_cast(CU.get())); - pool.async([this, CUI, &LogMutex, &Out, Die]() mutable { + pool.async([this, CUI, &CU, &LogMutex, &Out, Die]() mutable { std::string storage; raw_string_ostream StrStream(storage); OutputAggregator ThreadOut(Out.GetOS() ? &StrStream : nullptr); handleDie(ThreadOut, CUI, Die); + DICtx.clearLineTableForUnit(CU.get()); + // Free any DIEs that were allocated by the DWARF parser. + // If/when they're needed by other CU's, they'll be recreated. + CU->clearDIEs(/*KeepCUDie=*/false, /*KeepDWODIEs=*/false); // Print ThreadLogStorage lines into an actual stream under a lock std::lock_guard guard(LogMutex); if (Out.GetOS()) { @@ -697,6 +701,9 @@ Error DwarfTransformer::convert(uint32_t NumThreads, OutputAggregator &Out) { } pool.wait(); } + // Now get rid of all the DIEs that may have been recreated + for (const auto &CU : DICtx.compile_units()) + CU->clearDIEs(/*KeepCUDie=*/false, /*KeepDWODIEs=*/false); size_t FunctionsAddedCount = Gsym.getNumFunctionInfos() - NumBefore; Out << "Loaded " << FunctionsAddedCount << " functions from DWARF.\n"; return Error::success(); diff --git a/llvm/lib/DebugInfo/GSYM/GsymDIContext.cpp b/llvm/lib/DebugInfo/GSYM/GsymContext.cpp similarity index 82% rename from llvm/lib/DebugInfo/GSYM/GsymDIContext.cpp rename to llvm/lib/DebugInfo/GSYM/GsymContext.cpp index 68024a9c9e782..18be6d0985462 100644 --- a/llvm/lib/DebugInfo/GSYM/GsymDIContext.cpp +++ b/llvm/lib/DebugInfo/GSYM/GsymContext.cpp @@ -1,4 +1,4 @@ -//===-- GsymDIContext.cpp ------------------------------------------------===// +//===-- GsymContext.cpp 
------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===/ -#include "llvm/DebugInfo/GSYM/GsymDIContext.h" +#include "llvm/DebugInfo/GSYM/GsymContext.h" #include "llvm/DebugInfo/GSYM/GsymReader.h" #include "llvm/Support/Path.h" @@ -14,10 +14,10 @@ using namespace llvm; using namespace llvm::gsym; -GsymDIContext::GsymDIContext(std::unique_ptr Reader) +GsymContext::GsymContext(std::unique_ptr Reader) : DIContext(CK_GSYM), Reader(std::move(Reader)) {} -void GsymDIContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {} +void GsymContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {} static bool fillLineInfoFromLocation(const SourceLocation &Location, DILineInfoSpecifier Specifier, @@ -61,8 +61,8 @@ static bool fillLineInfoFromLocation(const SourceLocation &Location, } std::optional -GsymDIContext::getLineInfoForAddress(object::SectionedAddress Address, - DILineInfoSpecifier Specifier) { +GsymContext::getLineInfoForAddress(object::SectionedAddress Address, + DILineInfoSpecifier Specifier) { if (Address.SectionIndex != object::SectionedAddress::UndefSection) return {}; @@ -93,16 +93,16 @@ GsymDIContext::getLineInfoForAddress(object::SectionedAddress Address, } std::optional -GsymDIContext::getLineInfoForDataAddress(object::SectionedAddress Address) { +GsymContext::getLineInfoForDataAddress(object::SectionedAddress Address) { // We can't implement this, there's no such information in the GSYM file. 
return {}; } DILineInfoTable -GsymDIContext::getLineInfoForAddressRange(object::SectionedAddress Address, - uint64_t Size, - DILineInfoSpecifier Specifier) { +GsymContext::getLineInfoForAddressRange(object::SectionedAddress Address, + uint64_t Size, + DILineInfoSpecifier Specifier) { if (Size == 0) return DILineInfoTable(); @@ -131,8 +131,8 @@ GsymDIContext::getLineInfoForAddressRange(object::SectionedAddress Address, } DIInliningInfo -GsymDIContext::getInliningInfoForAddress(object::SectionedAddress Address, - DILineInfoSpecifier Specifier) { +GsymContext::getInliningInfoForAddress(object::SectionedAddress Address, + DILineInfoSpecifier Specifier) { auto ResultOrErr = Reader->lookup(Address.Address); if (!ResultOrErr) @@ -159,7 +159,7 @@ GsymDIContext::getInliningInfoForAddress(object::SectionedAddress Address, } std::vector -GsymDIContext::getLocalsForAddress(object::SectionedAddress Address) { +GsymContext::getLocalsForAddress(object::SectionedAddress Address) { // We can't implement this, there's no such information in the GSYM file. return {}; diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVRange.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVRange.cpp index b6003fcb8b93b..4dc4b588ad60d 100644 --- a/llvm/lib/DebugInfo/LogicalView/Core/LVRange.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Core/LVRange.cpp @@ -139,7 +139,7 @@ void LVRange::sort() { }; // Sort the ranges using low address and range size. 
- std::stable_sort(RangeEntries.begin(), RangeEntries.end(), CompareRangeEntry); + llvm::stable_sort(RangeEntries, CompareRangeEntry); } void LVRange::print(raw_ostream &OS, bool Full) const { diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp index c3f1d6843957f..328ced9f4eb66 100644 --- a/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp @@ -65,10 +65,9 @@ bool checkIntegrityScopesTree(LVScope *Root) { TraverseScope(Root); bool PassIntegrity = true; if (Duplicate.size()) { - std::stable_sort(begin(Duplicate), end(Duplicate), - [](const auto &l, const auto &r) { - return std::get<0>(l)->getID() < std::get<0>(r)->getID(); - }); + llvm::stable_sort(Duplicate, [](const auto &l, const auto &r) { + return std::get<0>(l)->getID() < std::get<0>(r)->getID(); + }); auto PrintIndex = [](unsigned Index) { if (Index) diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp index 8bbaf93db0caa..e1ea5da4dc51a 100644 --- a/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp @@ -679,7 +679,7 @@ void LVScope::sort() { [&](LVScope *Parent, LVSortFunction SortFunction) { auto Traverse = [&](auto &Set, LVSortFunction SortFunction) { if (Set) - std::stable_sort(Set->begin(), Set->end(), SortFunction); + llvm::stable_sort(*Set, SortFunction); }; Traverse(Parent->Types, SortFunction); Traverse(Parent->Symbols, SortFunction); @@ -1627,8 +1627,7 @@ void LVScopeCompileUnit::printMatchedElements(raw_ostream &OS, bool UseMatchedElements) { LVSortFunction SortFunction = getSortFunction(); if (SortFunction) - std::stable_sort(MatchedElements.begin(), MatchedElements.end(), - SortFunction); + llvm::stable_sort(MatchedElements, SortFunction); // Check the type of elements required to be printed. 'MatchedElements' // contains generic elements (lines, scopes, symbols, types). 
If we have a diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp index 42da957233667..37bc60d4045c7 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp @@ -913,7 +913,7 @@ Error LVDWARFReader::createScopes() { LT->getFileNameByIndex( 1, None, DILineInfoSpecifier::FileLineInfoKind::RawValue, FileOne); - return FileZero.compare(FileOne); + return FileZero != FileOne; } } diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp index 78a1421005de2..56527719da51f 100644 --- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp +++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -15,7 +15,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/BTF/BTFContext.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" -#include "llvm/DebugInfo/GSYM/GsymDIContext.h" +#include "llvm/DebugInfo/GSYM/GsymContext.h" #include "llvm/DebugInfo/GSYM/GsymReader.h" #include "llvm/DebugInfo/PDB/PDB.h" #include "llvm/DebugInfo/PDB/PDBContext.h" @@ -665,7 +665,7 @@ LLVMSymbolizer::getOrCreateModuleInfo(StringRef ModuleName) { // If this is a COFF object containing PDB info and not containing DWARF // section, use a PDBContext to symbolize. Otherwise, use DWARF. // Create a DIContext to symbolize as follows: - // - If there is a GSYM file, create a GsymDIContext. + // - If there is a GSYM file, create a GsymContext. // - Otherwise, if this is a COFF object containing PDB info, create a // PDBContext. // - Otherwise, create a DWARFContext. 
@@ -677,7 +677,7 @@ LLVMSymbolizer::getOrCreateModuleInfo(StringRef ModuleName) { std::unique_ptr Reader = std::make_unique(std::move(*ReaderOrErr)); - Context = std::make_unique(std::move(Reader)); + Context = std::make_unique(std::move(Reader)); } } if (!Context) { diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 90194d7fcc119..369a047f65076 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -487,7 +487,7 @@ MachOPlatform::MachOPlatform( if ((Err = ES.getBootstrapMapValue("darwin-use-ehframes-only", ForceEHFrames))) return; - this->ForceEHFrames = ForceEHFrames.has_value() ? *ForceEHFrames : false; + this->ForceEHFrames = ForceEHFrames.value_or(false); } BootstrapInfo BI; diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp index cd3c6f8dde8be..abf076944b273 100644 --- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp +++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp @@ -12,6 +12,9 @@ #include "llvm/Frontend/HLSL/HLSLRootSignature.h" #include "llvm/ADT/bit.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" namespace llvm { namespace hlsl { @@ -160,6 +163,65 @@ void dumpRootElements(raw_ostream &OS, ArrayRef Elements) { OS << "}"; } +MDNode *MetadataBuilder::BuildRootSignature() { + for (const RootElement &Element : Elements) { + MDNode *ElementMD = nullptr; + if (const auto &Clause = std::get_if(&Element)) + ElementMD = BuildDescriptorTableClause(*Clause); + if (const auto &Table = std::get_if(&Element)) + ElementMD = BuildDescriptorTable(*Table); + + // FIXME(#126586): remove once all RootElemnt variants are handled in a + // visit or otherwise + assert(ElementMD != nullptr && + "Constructed an unhandled root element type."); + + GeneratedMetadata.push_back(ElementMD); + } + + return MDNode::get(Ctx, GeneratedMetadata); +} + +MDNode 
*MetadataBuilder::BuildDescriptorTable(const DescriptorTable &Table) { + IRBuilder<> Builder(Ctx); + SmallVector TableOperands; + // Set the mandatory arguments + TableOperands.push_back(MDString::get(Ctx, "DescriptorTable")); + TableOperands.push_back(ConstantAsMetadata::get( + Builder.getInt32(llvm::to_underlying(Table.Visibility)))); + + // Remaining operands are references to the table's clauses. The in-memory + // representation of the Root Elements created from parsing will ensure that + // the previous N elements are the clauses for this table. + assert(Table.NumClauses <= GeneratedMetadata.size() && + "Table expected all owned clauses to be generated already"); + // So, add a refence to each clause to our operands + TableOperands.append(GeneratedMetadata.end() - Table.NumClauses, + GeneratedMetadata.end()); + // Then, remove those clauses from the general list of Root Elements + GeneratedMetadata.pop_back_n(Table.NumClauses); + + return MDNode::get(Ctx, TableOperands); +} + +MDNode *MetadataBuilder::BuildDescriptorTableClause( + const DescriptorTableClause &Clause) { + IRBuilder<> Builder(Ctx); + std::string Name; + llvm::raw_string_ostream OS(Name); + OS << Clause.Type; + return MDNode::get( + Ctx, { + MDString::get(Ctx, OS.str()), + ConstantAsMetadata::get(Builder.getInt32(Clause.NumDescriptors)), + ConstantAsMetadata::get(Builder.getInt32(Clause.Reg.Number)), + ConstantAsMetadata::get(Builder.getInt32(Clause.Space)), + ConstantAsMetadata::get(Builder.getInt32(Clause.Offset)), + ConstantAsMetadata::get( + Builder.getInt32(llvm::to_underlying(Clause.Flags))), + }); +} + } // namespace rootsig } // namespace hlsl } // namespace llvm diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 8717cd092b0b5..41aa06add6aba 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -646,8 +646,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, if (Name == "thread.pointer") { // 
'(arm|aarch64).thread.pointer'. - NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), - Intrinsic::thread_pointer); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::thread_pointer, F->getReturnType()); return true; } @@ -1475,6 +1475,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, } break; + case 't': + if (Name == "thread.pointer") { + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::thread_pointer, F->getReturnType()); + return true; + } + break; + case 'v': { if (Name == "var.annotation" && F->arg_size() == 4) { rename(F); @@ -1605,7 +1613,7 @@ GlobalVariable *llvm::UpgradeGlobalVariable(GlobalVariable *GV) { auto Ctor = cast(Init->getOperand(i)); NewCtors[i] = ConstantStruct::get(EltTy, Ctor->getAggregateElement(0u), Ctor->getAggregateElement(1), - Constant::getNullValue(IRB.getPtrTy())); + ConstantPointerNull::get(IRB.getPtrTy())); } Constant *NewInit = ConstantArray::get(ArrayType::get(EltTy, N), NewCtors); @@ -4713,10 +4721,10 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } // Create a new call with an added null annotation attribute argument. - NewCall = - Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), CI->getArgOperand(3), - Constant::getNullValue(Builder.getPtrTy())}); + NewCall = Builder.CreateCall( + NewFn, + {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), + CI->getArgOperand(3), ConstantPointerNull::get(Builder.getPtrTy())}); NewCall->takeName(CI); CI->replaceAllUsesWith(NewCall); CI->eraseFromParent(); @@ -4729,10 +4737,10 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { return; } // Create a new call with an added null annotation attribute argument. 
- NewCall = - Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), CI->getArgOperand(3), - Constant::getNullValue(Builder.getPtrTy())}); + NewCall = Builder.CreateCall( + NewFn, + {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), + CI->getArgOperand(3), ConstantPointerNull::get(Builder.getPtrTy())}); NewCall->takeName(CI); CI->replaceAllUsesWith(NewCall); CI->eraseFromParent(); diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 8adb85ec6091a..580b0af709337 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -169,8 +169,7 @@ Value *IRBuilderBase::CreateStepVector(Type *DstType, const Twine &Name) { CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size, MaybeAlign Align, bool isVolatile, - MDNode *TBAATag, MDNode *ScopeTag, - MDNode *NoAliasTag) { + const AAMDNodes &AAInfo) { Value *Ops[] = {Ptr, Val, Size, getInt1(isVolatile)}; Type *Tys[] = {Ptr->getType(), Size->getType()}; @@ -178,25 +177,14 @@ CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size, if (Align) cast(CI)->setDestAlignment(*Align); - - // Set the TBAA info if present. - if (TBAATag) - CI->setMetadata(LLVMContext::MD_tbaa, TBAATag); - - if (ScopeTag) - CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag); - - if (NoAliasTag) - CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag); - + CI->setAAMetadata(AAInfo); return CI; } CallInst *IRBuilderBase::CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, Value *Val, Value *Size, - bool IsVolatile, MDNode *TBAATag, - MDNode *ScopeTag, - MDNode *NoAliasTag) { + bool IsVolatile, + const AAMDNodes &AAInfo) { Value *Ops[] = {Dst, Val, Size, getInt1(IsVolatile)}; Type *Tys[] = {Dst->getType(), Size->getType()}; @@ -204,23 +192,13 @@ CallInst *IRBuilderBase::CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, if (DstAlign) cast(CI)->setDestAlignment(*DstAlign); - - // Set the TBAA info if present. 
- if (TBAATag) - CI->setMetadata(LLVMContext::MD_tbaa, TBAATag); - - if (ScopeTag) - CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag); - - if (NoAliasTag) - CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag); - + CI->setAAMetadata(AAInfo); return CI; } CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet( Value *Ptr, Value *Val, Value *Size, Align Alignment, uint32_t ElementSize, - MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) { + const AAMDNodes &AAInfo) { Value *Ops[] = {Ptr, Val, Size, getInt32(ElementSize)}; Type *Tys[] = {Ptr->getType(), Size->getType()}; @@ -229,24 +207,15 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet( CreateIntrinsic(Intrinsic::memset_element_unordered_atomic, Tys, Ops); cast(CI)->setDestAlignment(Alignment); - - // Set the TBAA info if present. - if (TBAATag) - CI->setMetadata(LLVMContext::MD_tbaa, TBAATag); - - if (ScopeTag) - CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag); - - if (NoAliasTag) - CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag); - + CI->setAAMetadata(AAInfo); return CI; } -CallInst *IRBuilderBase::CreateMemTransferInst( - Intrinsic::ID IntrID, Value *Dst, MaybeAlign DstAlign, Value *Src, - MaybeAlign SrcAlign, Value *Size, bool isVolatile, MDNode *TBAATag, - MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) { +CallInst *IRBuilderBase::CreateMemTransferInst(Intrinsic::ID IntrID, Value *Dst, + MaybeAlign DstAlign, Value *Src, + MaybeAlign SrcAlign, Value *Size, + bool isVolatile, + const AAMDNodes &AAInfo) { assert((IntrID == Intrinsic::memcpy || IntrID == Intrinsic::memcpy_inline || IntrID == Intrinsic::memmove) && "Unexpected intrinsic ID"); @@ -260,28 +229,13 @@ CallInst *IRBuilderBase::CreateMemTransferInst( MCI->setDestAlignment(*DstAlign); if (SrcAlign) MCI->setSourceAlignment(*SrcAlign); - - // Set the TBAA info if present. - if (TBAATag) - CI->setMetadata(LLVMContext::MD_tbaa, TBAATag); - - // Set the TBAA Struct info if present. 
- if (TBAAStructTag) - CI->setMetadata(LLVMContext::MD_tbaa_struct, TBAAStructTag); - - if (ScopeTag) - CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag); - - if (NoAliasTag) - CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag); - + MCI->setAAMetadata(AAInfo); return CI; } CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy( Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size, - uint32_t ElementSize, MDNode *TBAATag, MDNode *TBAAStructTag, - MDNode *ScopeTag, MDNode *NoAliasTag) { + uint32_t ElementSize, const AAMDNodes &AAInfo) { assert(DstAlign >= ElementSize && "Pointer alignment must be at least element size"); assert(SrcAlign >= ElementSize && @@ -296,21 +250,7 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy( auto *AMCI = cast(CI); AMCI->setDestAlignment(DstAlign); AMCI->setSourceAlignment(SrcAlign); - - // Set the TBAA info if present. - if (TBAATag) - CI->setMetadata(LLVMContext::MD_tbaa, TBAATag); - - // Set the TBAA Struct info if present. - if (TBAAStructTag) - CI->setMetadata(LLVMContext::MD_tbaa_struct, TBAAStructTag); - - if (ScopeTag) - CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag); - - if (NoAliasTag) - CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag); - + AMCI->setAAMetadata(AAInfo); return CI; } @@ -394,8 +334,7 @@ CallInst *IRBuilderBase::CreateFree(Value *Source, CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove( Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size, - uint32_t ElementSize, MDNode *TBAATag, MDNode *TBAAStructTag, - MDNode *ScopeTag, MDNode *NoAliasTag) { + uint32_t ElementSize, const AAMDNodes &AAInfo) { assert(DstAlign >= ElementSize && "Pointer alignment must be at least element size"); assert(SrcAlign >= ElementSize && @@ -409,21 +348,7 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove( // Set the alignment of the pointer args. 
CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), DstAlign)); CI->addParamAttr(1, Attribute::getWithAlignment(CI->getContext(), SrcAlign)); - - // Set the TBAA info if present. - if (TBAATag) - CI->setMetadata(LLVMContext::MD_tbaa, TBAATag); - - // Set the TBAA Struct info if present. - if (TBAAStructTag) - CI->setMetadata(LLVMContext::MD_tbaa_struct, TBAAStructTag); - - if (ScopeTag) - CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag); - - if (NoAliasTag) - CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag); - + CI->setAAMetadata(AAInfo); return CI; } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 81cf53f0f492e..73b4274a41ee6 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6418,7 +6418,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "SGPR arguments must have the `inreg` attribute", &Call); Check(!Call.paramHasAttr(3, Attribute::InReg), "VGPR arguments must not have the `inreg` attribute", &Call); - Check(isa_and_present(Call.getNextNode()), + + auto *Next = Call.getNextNonDebugInstruction(); + bool IsAMDUnreachable = Next && isa(Next) && + cast(Next)->getIntrinsicID() == + Intrinsic::amdgcn_unreachable; + Check(Next && (isa(Next) || IsAMDUnreachable), "llvm.amdgcn.cs.chain must be followed by unreachable", &Call); break; } @@ -6548,6 +6553,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { &Call); break; } + case Intrinsic::thread_pointer: { + Check(Call.getType()->getPointerAddressSpace() == + DL.getDefaultGlobalsAddressSpace(), + "llvm.thread.pointer intrinsic return type must be for the globals " + "address space", + &Call); + break; + } case Intrinsic::threadlocal_address: { const Value &Arg0 = *Call.getArgOperand(0); Check(isa(Arg0), diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 8a85ac835000a..b7db70b99bcbc 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -439,27 +439,33 @@ 
static void codegen(const Config &Conf, TargetMachine *TM, std::unique_ptr &Stream = *StreamOrErr; TM->Options.ObjectFilenameForDebug = Stream->ObjectPathName; - legacy::PassManager CodeGenPasses; - TargetLibraryInfoImpl TLII(Mod.getTargetTriple()); - CodeGenPasses.add(new TargetLibraryInfoWrapperPass(TLII)); - // No need to make index available if the module is empty. - // In theory these passes should not use the index for an empty - // module, however, this guards against doing any unnecessary summary-based - // analysis in the case of a ThinLTO build where this might be an empty - // regular LTO combined module, with a large combined index from ThinLTO. - if (!isEmptyModule(Mod)) - CodeGenPasses.add( - createImmutableModuleSummaryIndexWrapperPass(&CombinedIndex)); - if (Conf.PreCodeGenPassesHook) - Conf.PreCodeGenPassesHook(CodeGenPasses); - if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS, - DwoOut ? &DwoOut->os() : nullptr, - Conf.CGFileType)) - report_fatal_error("Failed to setup codegen"); - CodeGenPasses.run(Mod); - - if (DwoOut) - DwoOut->keep(); + // Create the codegen pipeline in its own scope so it gets deleted before + // Stream->commit() is called. The commit function of CacheStream deletes + // the raw stream, which is too early as streamers (e.g. MCAsmStreamer) + // keep the pointer and may use it until their destruction. See #138194. + { + legacy::PassManager CodeGenPasses; + TargetLibraryInfoImpl TLII(Mod.getTargetTriple()); + CodeGenPasses.add(new TargetLibraryInfoWrapperPass(TLII)); + // No need to make index available if the module is empty. + // In theory these passes should not use the index for an empty + // module, however, this guards against doing any unnecessary summary-based + // analysis in the case of a ThinLTO build where this might be an empty + // regular LTO combined module, with a large combined index from ThinLTO. 
+ if (!isEmptyModule(Mod)) + CodeGenPasses.add( + createImmutableModuleSummaryIndexWrapperPass(&CombinedIndex)); + if (Conf.PreCodeGenPassesHook) + Conf.PreCodeGenPassesHook(CodeGenPasses); + if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS, + DwoOut ? &DwoOut->os() : nullptr, + Conf.CGFileType)) + report_fatal_error("Failed to setup codegen"); + CodeGenPasses.run(Mod); + + if (DwoOut) + DwoOut->keep(); + } if (Error Err = Stream->commit()) report_fatal_error(std::move(Err)); diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 161711a79e467..a9394541d18da 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -30,10 +30,10 @@ static void rewriteOffsetToCurrentByte(raw_svector_ostream &Stream, size_t RootSignatureDesc::getSize() const { size_t Size = sizeof(dxbc::RootSignatureHeader) + - Parameters.size() * sizeof(dxbc::RootParameterHeader); + ParametersContainer.size() * sizeof(dxbc::RootParameterHeader); - for (const mcdxbc::RootParameter &P : Parameters) { - switch (P.Header.ParameterType) { + for (const RootParameterInfo &I : ParametersContainer) { + switch (I.Header.ParameterType) { case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): Size += sizeof(dxbc::RootConstants); break; @@ -56,7 +56,7 @@ void RootSignatureDesc::write(raw_ostream &OS) const { raw_svector_ostream BOS(Storage); BOS.reserveExtraSpace(getSize()); - const uint32_t NumParameters = Parameters.size(); + const uint32_t NumParameters = ParametersContainer.size(); support::endian::write(BOS, Version, llvm::endianness::little); support::endian::write(BOS, NumParameters, llvm::endianness::little); @@ -66,7 +66,7 @@ void RootSignatureDesc::write(raw_ostream &OS) const { support::endian::write(BOS, Flags, llvm::endianness::little); SmallVector ParamsOffsets; - for (const mcdxbc::RootParameter &P : Parameters) { + for (const RootParameterInfo &P : ParametersContainer) { 
support::endian::write(BOS, P.Header.ParameterType, llvm::endianness::little); support::endian::write(BOS, P.Header.ShaderVisibility, @@ -78,27 +78,33 @@ void RootSignatureDesc::write(raw_ostream &OS) const { assert(NumParameters == ParamsOffsets.size()); for (size_t I = 0; I < NumParameters; ++I) { rewriteOffsetToCurrentByte(BOS, ParamsOffsets[I]); - const mcdxbc::RootParameter &P = Parameters[I]; - - switch (P.Header.ParameterType) { - case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): - support::endian::write(BOS, P.Constants.ShaderRegister, + const auto &[Type, Loc] = ParametersContainer.getTypeAndLocForParameter(I); + switch (Type) { + case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): { + const dxbc::RootConstants &Constants = + ParametersContainer.getConstant(Loc); + support::endian::write(BOS, Constants.ShaderRegister, llvm::endianness::little); - support::endian::write(BOS, P.Constants.RegisterSpace, + support::endian::write(BOS, Constants.RegisterSpace, llvm::endianness::little); - support::endian::write(BOS, P.Constants.Num32BitValues, + support::endian::write(BOS, Constants.Num32BitValues, llvm::endianness::little); break; + } case llvm::to_underlying(dxbc::RootParameterType::CBV): case llvm::to_underlying(dxbc::RootParameterType::SRV): - case llvm::to_underlying(dxbc::RootParameterType::UAV): - support::endian::write(BOS, P.Descriptor.ShaderRegister, + case llvm::to_underlying(dxbc::RootParameterType::UAV): { + const dxbc::RTS0::v2::RootDescriptor &Descriptor = + ParametersContainer.getRootDescriptor(Loc); + + support::endian::write(BOS, Descriptor.ShaderRegister, llvm::endianness::little); - support::endian::write(BOS, P.Descriptor.RegisterSpace, + support::endian::write(BOS, Descriptor.RegisterSpace, llvm::endianness::little); if (Version > 1) - support::endian::write(BOS, P.Descriptor.Flags, - llvm::endianness::little); + support::endian::write(BOS, Descriptor.Flags, llvm::endianness::little); + break; + } } } 
assert(Storage.size() == getSize()); diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index f27a27833858a..857985199cc48 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -1760,15 +1760,6 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, // Treat '.' as a valid identifier in this context. Lex(); IDVal = "."; - } else if (Lexer.is(AsmToken::LCurly)) { - // Treat '{' as a valid identifier in this context. - Lex(); - IDVal = "{"; - - } else if (Lexer.is(AsmToken::RCurly)) { - // Treat '}' as a valid identifier in this context. - Lex(); - IDVal = "}"; } else if (getTargetParser().tokenIsStartOfStatement(ID.getKind())) { Lex(); IDVal = ID.getString(); diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index 239ee9e3de9b1..c00cd3e08d59d 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -274,27 +274,33 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { RS.StaticSamplersOffset = P.RootSignature->StaticSamplersOffset; for (const auto &Param : P.RootSignature->Parameters) { - mcdxbc::RootParameter NewParam; - NewParam.Header = dxbc::RootParameterHeader{ - Param.Type, Param.Visibility, Param.Offset}; + dxbc::RootParameterHeader Header{Param.Type, Param.Visibility, + Param.Offset}; switch (Param.Type) { case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): - NewParam.Constants.Num32BitValues = Param.Constants.Num32BitValues; - NewParam.Constants.RegisterSpace = Param.Constants.RegisterSpace; - NewParam.Constants.ShaderRegister = Param.Constants.ShaderRegister; + dxbc::RootConstants Constants; + Constants.Num32BitValues = Param.Constants.Num32BitValues; + Constants.RegisterSpace = Param.Constants.RegisterSpace; + Constants.ShaderRegister = Param.Constants.ShaderRegister; + RS.ParametersContainer.addParameter(Header, Constants); break; case 
llvm::to_underlying(dxbc::RootParameterType::SRV): case llvm::to_underlying(dxbc::RootParameterType::UAV): case llvm::to_underlying(dxbc::RootParameterType::CBV): - NewParam.Descriptor.RegisterSpace = Param.Descriptor.RegisterSpace; - NewParam.Descriptor.ShaderRegister = Param.Descriptor.ShaderRegister; - if (P.RootSignature->Version > 1) - NewParam.Descriptor.Flags = Param.Descriptor.getEncodedFlags(); + dxbc::RTS0::v2::RootDescriptor Descriptor; + Descriptor.RegisterSpace = Param.Descriptor.RegisterSpace; + Descriptor.ShaderRegister = Param.Descriptor.ShaderRegister; + if (RS.Version > 1) + Descriptor.Flags = Param.Descriptor.getEncodedFlags(); + RS.ParametersContainer.addParameter(Header, Descriptor); break; + default: + // Handling invalid parameter type edge case. We intentionally let + // obj2yaml/yaml2obj parse and emit invalid dxcontainer data, in order + // for that to be used as a testing tool more effectively. + RS.ParametersContainer.addInvalidParameter(Header); } - - RS.Parameters.push_back(NewParam); } RS.write(OS); diff --git a/llvm/lib/Passes/CMakeLists.txt b/llvm/lib/Passes/CMakeLists.txt index 6425f4934b210..91c8c4f67074d 100644 --- a/llvm/lib/Passes/CMakeLists.txt +++ b/llvm/lib/Passes/CMakeLists.txt @@ -19,6 +19,7 @@ add_llvm_component_library(LLVMPasses Analysis CFGuard CodeGen + GlobalISel Core Coroutines HipStdPar diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 7740f622ede7c..56e91703cb019 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -99,6 +99,7 @@ #include "llvm/CodeGen/FinalizeISel.h" #include "llvm/CodeGen/FixupStatepointCallerSaved.h" #include "llvm/CodeGen/GCMetadata.h" +#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h" #include "llvm/CodeGen/GlobalMerge.h" #include "llvm/CodeGen/GlobalMergeFunctions.h" #include "llvm/CodeGen/HardwareLoops.h" @@ -370,6 +371,7 @@ #include "llvm/Transforms/Utils/SymbolRewriter.h" #include 
"llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include "llvm/Transforms/Utils/UnifyLoopExits.h" +#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 5a85b308925a6..f3654600c5abb 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -142,6 +142,7 @@ #include "llvm/Transforms/Utils/NameAnonGlobals.h" #include "llvm/Transforms/Utils/RelLookupTableConverter.h" #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" +#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index ea792280ed975..94dabe290213d 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -673,6 +673,7 @@ LOOP_ANALYSIS("should-run-extra-simple-loop-unswitch", #endif LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass()) LOOP_PASS("dot-ddg", DDGDotPrinterPass()) +LOOP_PASS("evl-iv-simplify", EVLIndVarSimplifyPass()) LOOP_PASS("guard-widening", GuardWideningPass()) LOOP_PASS("extra-simple-loop-unswitch-passes", ExtraLoopPassManager()) diff --git a/llvm/lib/ProfileData/CMakeLists.txt b/llvm/lib/ProfileData/CMakeLists.txt index eb7c2a3c1a28a..67a69d7761b2c 100644 --- a/llvm/lib/ProfileData/CMakeLists.txt +++ b/llvm/lib/ProfileData/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_component_library(LLVMProfileData + DataAccessProf.cpp GCOV.cpp IndexedMemProfData.cpp InstrProf.cpp diff --git a/llvm/lib/ProfileData/DataAccessProf.cpp b/llvm/lib/ProfileData/DataAccessProf.cpp new file mode 100644 index 
0000000000000..a31f3db0621fb --- /dev/null +++ b/llvm/lib/ProfileData/DataAccessProf.cpp @@ -0,0 +1,265 @@ +#include "llvm/ProfileData/DataAccessProf.h" +#include "llvm/ADT/DenseMapInfoVariant.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/Compression.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace llvm { +namespace data_access_prof { + +// If `Map` has an entry keyed by `Str`, returns the entry iterator. Otherwise, +// creates an owned copy of `Str`, adds a map entry for it and returns the +// iterator. +static std::pair +saveStringToMap(DataAccessProfData::StringToIndexMap &Map, + llvm::UniqueStringSaver &Saver, StringRef Str) { + auto [Iter, Inserted] = Map.try_emplace(Saver.save(Str), Map.size()); + return *Iter; +} + +// Returns the canonical name or error. +static Expected getCanonicalName(StringRef Name) { + if (Name.empty()) + return make_error("Empty symbol name", + llvm::errc::invalid_argument); + return InstrProfSymtab::getCanonicalName(Name); +} + +std::optional +DataAccessProfData::getProfileRecord(const SymbolHandleRef SymbolID) const { + auto Key = SymbolID; + if (std::holds_alternative(SymbolID)) { + auto NameOrErr = getCanonicalName(std::get(SymbolID)); + // If name canonicalization fails, suppress the error inside. 
+ if (!NameOrErr) { + assert( + std::get(SymbolID).empty() && + "Name canonicalization only fails when stringified string is empty."); + return std::nullopt; + } + Key = *NameOrErr; + } + + auto It = Records.find(Key); + if (It != Records.end()) { + return DataAccessProfRecord(Key, It->second.Locations); + } + + return std::nullopt; +} + +bool DataAccessProfData::isKnownColdSymbol(const SymbolHandleRef SymID) const { + if (std::holds_alternative(SymID)) + return KnownColdHashes.contains(std::get(SymID)); + return KnownColdSymbols.contains(std::get(SymID)); +} + +Error DataAccessProfData::setDataAccessProfile(SymbolHandleRef Symbol, + uint64_t AccessCount) { + uint64_t RecordID = -1; + const bool IsStringLiteral = std::holds_alternative(Symbol); + SymbolHandleRef Key; + if (IsStringLiteral) { + RecordID = std::get(Symbol); + Key = RecordID; + } else { + auto CanonicalName = getCanonicalName(std::get(Symbol)); + if (!CanonicalName) + return CanonicalName.takeError(); + std::tie(Key, RecordID) = + saveStringToMap(StrToIndexMap, Saver, *CanonicalName); + } + + auto [Iter, Inserted] = + Records.try_emplace(Key, RecordID, AccessCount, IsStringLiteral); + if (!Inserted) + return make_error("Duplicate symbol or string literal added. " + "User of DataAccessProfData should " + "aggregate count for the same symbol. 
", + llvm::errc::invalid_argument); + + return Error::success(); +} + +Error DataAccessProfData::setDataAccessProfile( + SymbolHandleRef SymbolID, uint64_t AccessCount, + ArrayRef Locations) { + if (Error E = setDataAccessProfile(SymbolID, AccessCount)) + return E; + + auto &Record = Records.back().second; + for (const auto &Location : Locations) + Record.Locations.push_back( + {saveStringToMap(StrToIndexMap, Saver, Location.FileName).first, + Location.Line}); + + return Error::success(); +} + +Error DataAccessProfData::addKnownSymbolWithoutSamples( + SymbolHandleRef SymbolID) { + if (std::holds_alternative(SymbolID)) { + KnownColdHashes.insert(std::get(SymbolID)); + return Error::success(); + } + auto CanonicalName = getCanonicalName(std::get(SymbolID)); + if (!CanonicalName) + return CanonicalName.takeError(); + KnownColdSymbols.insert( + saveStringToMap(StrToIndexMap, Saver, *CanonicalName).first); + return Error::success(); +} + +Error DataAccessProfData::deserialize(const unsigned char *&Ptr) { + uint64_t NumSampledSymbols = + support::endian::readNext(Ptr); + uint64_t NumColdKnownSymbols = + support::endian::readNext(Ptr); + if (Error E = deserializeSymbolsAndFilenames(Ptr, NumSampledSymbols, + NumColdKnownSymbols)) + return E; + + uint64_t Num = + support::endian::readNext(Ptr); + for (uint64_t I = 0; I < Num; ++I) + KnownColdHashes.insert( + support::endian::readNext(Ptr)); + + return deserializeRecords(Ptr); +} + +Error DataAccessProfData::serializeSymbolsAndFilenames(ProfOStream &OS) const { + OS.write(StrToIndexMap.size()); + OS.write(KnownColdSymbols.size()); + + std::vector Strs; + Strs.reserve(StrToIndexMap.size() + KnownColdSymbols.size()); + for (const auto &Str : StrToIndexMap) + Strs.push_back(Str.first.str()); + for (const auto &Str : KnownColdSymbols) + Strs.push_back(Str.str()); + + std::string CompressedStrings; + if (!Strs.empty()) + if (Error E = collectGlobalObjectNameStrings( + Strs, compression::zlib::isAvailable(), CompressedStrings)) + 
return E; + const uint64_t CompressedStringLen = CompressedStrings.length(); + // Record the length of compressed string. + OS.write(CompressedStringLen); + // Write the chars in compressed strings. + for (char C : CompressedStrings) + OS.writeByte(static_cast(C)); + // Pad up to a multiple of 8. + // InstrProfReader could read bytes according to 'CompressedStringLen'. + const uint64_t PaddedLength = alignTo(CompressedStringLen, 8); + for (uint64_t K = CompressedStringLen; K < PaddedLength; K++) + OS.writeByte(0); + return Error::success(); +} + +uint64_t +DataAccessProfData::getEncodedIndex(const SymbolHandleRef SymbolID) const { + if (std::holds_alternative(SymbolID)) + return std::get(SymbolID); + + auto Iter = StrToIndexMap.find(std::get(SymbolID)); + assert(Iter != StrToIndexMap.end() && + "String literals not found in StrToIndexMap"); + return Iter->second; +} + +Error DataAccessProfData::serialize(ProfOStream &OS) const { + if (Error E = serializeSymbolsAndFilenames(OS)) + return E; + OS.write(KnownColdHashes.size()); + for (const auto &Hash : KnownColdHashes) + OS.write(Hash); + OS.write((uint64_t)(Records.size())); + for (const auto &[Key, Rec] : Records) { + OS.write(getEncodedIndex(Rec.SymbolID)); + OS.writeByte(Rec.IsStringLiteral); + OS.write(Rec.AccessCount); + OS.write(Rec.Locations.size()); + for (const auto &Loc : Rec.Locations) { + OS.write(getEncodedIndex(Loc.FileName)); + OS.write32(Loc.Line); + } + } + return Error::success(); +} + +Error DataAccessProfData::deserializeSymbolsAndFilenames( + const unsigned char *&Ptr, const uint64_t NumSampledSymbols, + const uint64_t NumColdKnownSymbols) { + uint64_t Len = + support::endian::readNext(Ptr); + + // The first NumSampledSymbols strings are symbols with samples, and next + // NumColdKnownSymbols strings are known cold symbols. 
+ uint64_t StringCnt = 0; + std::function addName = [&](StringRef Name) { + if (StringCnt < NumSampledSymbols) + saveStringToMap(StrToIndexMap, Saver, Name); + else + KnownColdSymbols.insert(Saver.save(Name)); + ++StringCnt; + return Error::success(); + }; + if (Error E = + readAndDecodeStrings(StringRef((const char *)Ptr, Len), addName)) + return E; + + Ptr += alignTo(Len, 8); + return Error::success(); +} + +Error DataAccessProfData::deserializeRecords(const unsigned char *&Ptr) { + SmallVector Strings = + llvm::to_vector(llvm::make_first_range(getStrToIndexMapRef())); + + uint64_t NumRecords = + support::endian::readNext(Ptr); + + for (uint64_t I = 0; I < NumRecords; ++I) { + uint64_t ID = + support::endian::readNext(Ptr); + + bool IsStringLiteral = + support::endian::readNext(Ptr); + + uint64_t AccessCount = + support::endian::readNext(Ptr); + + SymbolHandleRef SymbolID; + if (IsStringLiteral) + SymbolID = ID; + else + SymbolID = Strings[ID]; + if (Error E = setDataAccessProfile(SymbolID, AccessCount)) + return E; + + auto &Record = Records.back().second; + + uint64_t NumLocations = + support::endian::readNext(Ptr); + + Record.Locations.reserve(NumLocations); + for (uint64_t J = 0; J < NumLocations; ++J) { + uint64_t FileNameIndex = + support::endian::readNext(Ptr); + uint32_t Line = + support::endian::readNext(Ptr); + Record.Locations.push_back({Strings[FileNameIndex], Line}); + } + } + return Error::success(); +} +} // namespace data_access_prof +} // namespace llvm diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 76e8ca6a67590..368e3535fe905 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -572,12 +572,8 @@ Error InstrProfSymtab::addVTableWithName(GlobalVariable &VTable, return Error::success(); } -/// \c NameStrings is a string composed of one of more possibly encoded -/// sub-strings. The substrings are separated by 0 or more zero bytes. 
This -/// method decodes the string and calls `NameCallback` for each substring. -static Error -readAndDecodeStrings(StringRef NameStrings, - std::function NameCallback) { +Error readAndDecodeStrings(StringRef NameStrings, + std::function NameCallback) { const uint8_t *P = NameStrings.bytes_begin(); const uint8_t *EndP = NameStrings.bytes_end(); while (P < EndP) { diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 2759346935b14..9dc1a0d0b4678 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -152,9 +152,7 @@ void InstrProfWriter::setValueProfDataEndianness(llvm::endianness Endianness) { InfoObj->ValueProfDataEndianness = Endianness; } -void InstrProfWriter::setOutputSparse(bool Sparse) { - this->Sparse = Sparse; -} +void InstrProfWriter::setOutputSparse(bool Sparse) { this->Sparse = Sparse; } void InstrProfWriter::addRecord(NamedInstrProfRecord &&I, uint64_t Weight, function_ref Warn) { diff --git a/llvm/lib/Support/APFixedPoint.cpp b/llvm/lib/Support/APFixedPoint.cpp index f395919287b72..9a7caa4112625 100644 --- a/llvm/lib/Support/APFixedPoint.cpp +++ b/llvm/lib/Support/APFixedPoint.cpp @@ -439,7 +439,10 @@ void APFixedPoint::print(raw_ostream &OS) const { Sema.print(OS); OS << "})"; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void APFixedPoint::dump() const { print(llvm::errs()); } +#endif APFixedPoint APFixedPoint::negate(bool *Overflow) const { if (!isSaturated()) { diff --git a/llvm/lib/Support/ConvertUTFWrapper.cpp b/llvm/lib/Support/ConvertUTFWrapper.cpp index 4952fe65d7767..76ead00c977bd 100644 --- a/llvm/lib/Support/ConvertUTFWrapper.cpp +++ b/llvm/lib/Support/ConvertUTFWrapper.cpp @@ -303,5 +303,15 @@ bool convertWideToUTF8(const std::wstring &Source, std::string &Result) { } } +bool IsSingleCodeUnitUTF8Codepoint(unsigned V) { return V <= 0x7F; } + +bool IsSingleCodeUnitUTF16Codepoint(unsigned V) { + return V <= 0xD7FF || 
(V >= 0xE000 && V <= 0xFFFF); +} + +bool IsSingleCodeUnitUTF32Codepoint(unsigned V) { + return V <= 0xD7FF || (V >= 0xE000 && V <= 0x10FFFF); +} + } // end namespace llvm diff --git a/llvm/lib/Support/DebugCounter.cpp b/llvm/lib/Support/DebugCounter.cpp index a6de07a55482a..9c4a4429ca0ee 100644 --- a/llvm/lib/Support/DebugCounter.cpp +++ b/llvm/lib/Support/DebugCounter.cpp @@ -248,6 +248,8 @@ bool DebugCounter::shouldExecuteImpl(unsigned CounterName) { return true; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void DebugCounter::dump() const { print(dbgs()); } +#endif diff --git a/llvm/lib/Support/DynamicAPInt.cpp b/llvm/lib/Support/DynamicAPInt.cpp index bfcb97e0cc96a..9def5c782af4c 100644 --- a/llvm/lib/Support/DynamicAPInt.cpp +++ b/llvm/lib/Support/DynamicAPInt.cpp @@ -32,4 +32,6 @@ raw_ostream &DynamicAPInt::print(raw_ostream &OS) const { return OS << ValLarge; } -void DynamicAPInt::dump() const { print(dbgs()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DynamicAPInt::dump() const { print(dbgs()); } +#endif diff --git a/llvm/lib/Support/FileOutputBuffer.cpp b/llvm/lib/Support/FileOutputBuffer.cpp index 58a06a34e8cf3..c8b035986fb5b 100644 --- a/llvm/lib/Support/FileOutputBuffer.cpp +++ b/llvm/lib/Support/FileOutputBuffer.cpp @@ -99,7 +99,7 @@ class InMemoryBuffer : public FileOutputBuffer { int FD; std::error_code EC; if (auto EC = - openFileForWrite(FinalPath, FD, CD_CreateAlways, OF_None, Mode)) + openFileForWrite(FinalPath, FD, CD_CreateAlways, OF_Delete, Mode)) return errorCodeToError(EC); raw_fd_ostream OS(FD, /*shouldClose=*/true, /*unbuffered=*/true); OS << StringRef((const char *)Buffer.base(), BufferSize); @@ -186,7 +186,7 @@ FileOutputBuffer::create(StringRef Path, size_t Size, unsigned Flags) { case fs::file_type::regular_file: case fs::file_type::file_not_found: case fs::file_type::status_error: - if (Flags & F_no_mmap) + if (Flags & F_mmap) return createInMemoryBuffer(Path, Size, Mode); else 
return createOnDiskBuffer(Path, Size, Mode); diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp index 16229598b612a..94a04ab90987a 100644 --- a/llvm/lib/Support/KnownBits.cpp +++ b/llvm/lib/Support/KnownBits.cpp @@ -1152,7 +1152,10 @@ void KnownBits::print(raw_ostream &OS) const { OS << "?"; } } -void KnownBits::dump() const { + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void KnownBits::dump() const { print(dbgs()); dbgs() << "\n"; } +#endif diff --git a/llvm/lib/Support/ScaledNumber.cpp b/llvm/lib/Support/ScaledNumber.cpp index 85d7afbea5c69..33e8cc3030873 100644 --- a/llvm/lib/Support/ScaledNumber.cpp +++ b/llvm/lib/Support/ScaledNumber.cpp @@ -317,7 +317,9 @@ raw_ostream &ScaledNumberBase::print(raw_ostream &OS, uint64_t D, int16_t E, return OS << toString(D, E, Width, Precision); } -void ScaledNumberBase::dump(uint64_t D, int16_t E, int Width) { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void ScaledNumberBase::dump(uint64_t D, int16_t E, int Width) { print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E << "]"; } +#endif diff --git a/llvm/lib/Support/SlowDynamicAPInt.cpp b/llvm/lib/Support/SlowDynamicAPInt.cpp index 8b4030ddf9fc4..a57fec2f824e1 100644 --- a/llvm/lib/Support/SlowDynamicAPInt.cpp +++ b/llvm/lib/Support/SlowDynamicAPInt.cpp @@ -283,4 +283,6 @@ SlowDynamicAPInt &SlowDynamicAPInt::operator--() { /// --------------------------------------------------------------------------- void SlowDynamicAPInt::print(raw_ostream &OS) const { OS << Val; } -void SlowDynamicAPInt::dump() const { print(dbgs()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void SlowDynamicAPInt::dump() const { print(dbgs()); } +#endif diff --git a/llvm/lib/Support/Z3Solver.cpp b/llvm/lib/Support/Z3Solver.cpp index 9aece099b0629..27027093a0c6f 100644 --- a/llvm/lib/Support/Z3Solver.cpp +++ b/llvm/lib/Support/Z3Solver.cpp @@ -989,7 +989,9 @@ llvm::SMTSolverRef 
llvm::CreateZ3Solver() { #endif } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void SMTSort::dump() const { print(llvm::errs()); } LLVM_DUMP_METHOD void SMTExpr::dump() const { print(llvm::errs()); } LLVM_DUMP_METHOD void SMTSolver::dump() const { print(llvm::errs()); } LLVM_DUMP_METHOD void SMTSolverStatistics::dump() const { print(llvm::errs()); } +#endif diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 0d9fcb0e63dae..97e185bbd1267 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -294,11 +294,9 @@ std::string RecordRecTy::getAsString() const { return getClasses()[0]->getNameInitAsString(); std::string Str = "{"; - bool First = true; + ListSeparator LS; for (const Record *R : getClasses()) { - if (!First) - Str += ", "; - First = false; + Str += LS; Str += R->getNameInitAsString(); } Str += "}"; @@ -520,9 +518,9 @@ const Init *BitsInit::convertInitializerTo(const RecTy *Ty) const { std::optional BitsInit::convertInitializerToInt() const { int64_t Result = 0; - for (unsigned i = 0, e = getNumBits(); i != e; ++i) - if (auto *Bit = dyn_cast(getBit(i))) - Result |= static_cast(Bit->getValue()) << i; + for (auto [Idx, InitV] : enumerate(getBits())) + if (auto *Bit = dyn_cast(InitV)) + Result |= static_cast(Bit->getValue()) << Idx; else return std::nullopt; return Result; @@ -532,27 +530,30 @@ const Init * BitsInit::convertInitializerBitRange(ArrayRef Bits) const { SmallVector NewBits(Bits.size()); - for (unsigned i = 0, e = Bits.size(); i != e; ++i) { - if (Bits[i] >= getNumBits()) + for (auto [Bit, NewBit] : zip_equal(Bits, NewBits)) { + if (Bit >= getNumBits()) return nullptr; - NewBits[i] = getBit(Bits[i]); + NewBit = getBit(Bit); } return BitsInit::get(getRecordKeeper(), NewBits); } +bool BitsInit::isComplete() const { + return all_of(getBits(), [](const Init *Bit) { return Bit->isComplete(); }); +} +bool BitsInit::allInComplete() const { + return all_of(getBits(), [](const Init 
*Bit) { return !Bit->isComplete(); }); +} bool BitsInit::isConcrete() const { - for (unsigned i = 0, e = getNumBits(); i != e; ++i) { - if (!getBit(i)->isConcrete()) - return false; - } - return true; + return all_of(getBits(), [](const Init *Bit) { return Bit->isConcrete(); }); } std::string BitsInit::getAsString() const { std::string Result = "{ "; - for (unsigned i = 0, e = getNumBits(); i != e; ++i) { - if (i) Result += ", "; - if (const Init *Bit = getBit(e - i - 1)) + ListSeparator LS; + for (const Init *Bit : reverse(getBits())) { + Result += LS; + if (Bit) Result += Bit->getAsString(); else Result += "*"; @@ -569,9 +570,8 @@ const Init *BitsInit::resolveReferences(Resolver &R) const { const Init *CachedBitVarRef = nullptr; const Init *CachedBitVarResolved = nullptr; - for (unsigned i = 0, e = getNumBits(); i != e; ++i) { - const Init *CurBit = getBit(i); - const Init *NewBit = CurBit; + for (auto [CurBit, NewBit] : zip_equal(getBits(), NewBits)) { + NewBit = CurBit; if (const auto *CurBitVar = dyn_cast(CurBit)) { if (CurBitVar->getBitVar() != CachedBitVarRef) { @@ -587,7 +587,6 @@ const Init *BitsInit::resolveReferences(Resolver &R) const { if (isa(NewBit) && R.keepUnsetBits()) NewBit = CurBit; - NewBits[i] = NewBit; Changed |= CurBit != NewBit; } @@ -644,12 +643,11 @@ const Init *IntInit::convertInitializerTo(const RecTy *Ty) const { const Init *IntInit::convertInitializerBitRange(ArrayRef Bits) const { SmallVector NewBits(Bits.size()); - for (unsigned i = 0, e = Bits.size(); i != e; ++i) { - if (Bits[i] >= 64) + for (auto [Bit, NewBit] : zip_equal(Bits, NewBits)) { + if (Bit >= 64) return nullptr; - NewBits[i] = - BitInit::get(getRecordKeeper(), Value & (INT64_C(1) << Bits[i])); + NewBit = BitInit::get(getRecordKeeper(), Value & (INT64_C(1) << Bit)); } return BitsInit::get(getRecordKeeper(), NewBits); } @@ -763,9 +761,8 @@ const Init *ListInit::convertInitializerTo(const RecTy *Ty) const { return nullptr; } -const Record 
*ListInit::getElementAsRecord(unsigned i) const { - assert(i < NumValues && "List element index out of range!"); - const auto *DI = dyn_cast(getElement(i)); +const Record *ListInit::getElementAsRecord(unsigned Idx) const { + const auto *DI = dyn_cast(getElement(Idx)); if (!DI) PrintFatalError("Expected record in list!"); return DI->getDef(); @@ -788,27 +785,20 @@ const Init *ListInit::resolveReferences(Resolver &R) const { } bool ListInit::isComplete() const { - for (const Init *Element : *this) { - if (!Element->isComplete()) - return false; - } - return true; + return all_of(*this, + [](const Init *Element) { return Element->isComplete(); }); } bool ListInit::isConcrete() const { - for (const Init *Element : *this) { - if (!Element->isConcrete()) - return false; - } - return true; + return all_of(*this, + [](const Init *Element) { return Element->isConcrete(); }); } std::string ListInit::getAsString() const { std::string Result = "["; - const char *sep = ""; + ListSeparator LS; for (const Init *Element : *this) { - Result += sep; - sep = ", "; + Result += LS; Result += Element->getAsString(); } return Result + "]"; @@ -1126,9 +1116,9 @@ static const StringInit *interleaveStringList(const ListInit *List, SmallString<80> Result(Element->getValue()); StringInit::StringFormat Fmt = StringInit::SF_String; - for (unsigned I = 1, E = List->size(); I < E; ++I) { + for (const Init *Elem : List->getValues().drop_front()) { Result.append(Delim->getValue()); - const auto *Element = dyn_cast(List->getElement(I)); + const auto *Element = dyn_cast(Elem); if (!Element) return nullptr; Result.append(Element->getValue()); @@ -1148,10 +1138,10 @@ static const StringInit *interleaveIntList(const ListInit *List, return nullptr; SmallString<80> Result(Element->getAsString()); - for (unsigned I = 1, E = List->size(); I < E; ++I) { + for (const Init *Elem : List->getValues().drop_front()) { Result.append(Delim->getValue()); const auto *Element = dyn_cast_or_null( - 
List->getElement(I)->convertInitializerTo(IntRecTy::get(RK))); + Elem->convertInitializerTo(IntRecTy::get(RK))); if (!Element) return nullptr; Result.append(Element->getAsString()); @@ -1316,17 +1306,10 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { if (!Op) Op = UnsetInit::get(getRecordKeeper()); - SmallVector Args; - SmallVector ArgNames; - for (unsigned i = 0, e = LHSs->getNumArgs(); i != e; ++i) { - Args.push_back(LHSs->getArg(i)); - ArgNames.push_back(LHSs->getArgName(i)); - } - for (unsigned i = 0, e = RHSs->getNumArgs(); i != e; ++i) { - Args.push_back(RHSs->getArg(i)); - ArgNames.push_back(RHSs->getArgName(i)); - } - return DagInit::get(Op, nullptr, Args, ArgNames); + SmallVector, 8> Args; + llvm::append_range(Args, LHSs->getArgAndNames()); + llvm::append_range(Args, RHSs->getArgAndNames()); + return DagInit::get(Op, Args); } break; } @@ -1423,8 +1406,8 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { if (!LHSi || !RHSi) break; - auto Start = LHSi->getValue(); - auto End = RHSi->getValue(); + int64_t Start = LHSi->getValue(); + int64_t End = RHSi->getValue(); SmallVector Args; if (getOpcode() == RANGEC) { // Closed interval @@ -1520,15 +1503,8 @@ const Init *BinOpInit::Fold(const Record *CurRec) const { case SETDAGOP: { const auto *Dag = dyn_cast(LHS); const auto *Op = dyn_cast(RHS); - if (Dag && Op) { - SmallVector Args; - SmallVector ArgNames; - for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) { - Args.push_back(Dag->getArg(i)); - ArgNames.push_back(Dag->getArgName(i)); - } - return DagInit::get(Op, nullptr, Args, ArgNames); - } + if (Dag && Op) + return DagInit::get(Op, Dag->getArgs(), Dag->getArgNames()); break; } case ADD: @@ -1697,10 +1673,8 @@ static const Init *ForeachDagApply(const Init *LHS, const DagInit *MHSd, Change = true; SmallVector, 8> NewArgs; - for (unsigned int i = 0; i < MHSd->getNumArgs(); ++i) { - const Init *Arg = MHSd->getArg(i); + for (auto [Arg, ArgName] : MHSd->getArgAndNames()) { const Init 
*NewArg; - const StringInit *ArgName = MHSd->getArgName(i); if (const auto *Argd = dyn_cast(Arg)) NewArg = ForeachDagApply(LHS, Argd, RHS, CurRec); @@ -1713,7 +1687,7 @@ static const Init *ForeachDagApply(const Init *LHS, const DagInit *MHSd, } if (Change) - return DagInit::get(Val, nullptr, NewArgs); + return DagInit::get(Val, NewArgs); return MHSd; } @@ -1796,14 +1770,13 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { if (LHSs && MHSs && RHSs) { std::string Val = RHSs->getValue().str(); - StringRef::size_type found; - StringRef::size_type idx = 0; + std::string::size_type Idx = 0; while (true) { - found = StringRef(Val).find(LHSs->getValue(), idx); - if (found == StringRef::npos) + std::string::size_type Found = Val.find(LHSs->getValue(), Idx); + if (Found == std::string::npos) break; - Val.replace(found, LHSs->getValue().size(), MHSs->getValue().str()); - idx = found + MHSs->getValue().size(); + Val.replace(Found, LHSs->getValue().size(), MHSs->getValue().str()); + Idx = Found + MHSs->getValue().size(); } return StringInit::get(RK, Val); @@ -1852,7 +1825,7 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { return this; Children.emplace_back(Node, dyn_cast(Name)); } - return DagInit::get(LHS, nullptr, Children); + return DagInit::get(LHS, Children); } break; } @@ -1937,9 +1910,9 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { assert(*ArgNo < Dag->getNumArgs()); SmallVector Args(Dag->getArgs()); - SmallVector Names(Dag->getArgNames()); Args[*ArgNo] = RHS; - return DagInit::get(Dag->getOperator(), Dag->getName(), Args, Names); + return DagInit::get(Dag->getOperator(), Dag->getName(), Args, + Dag->getArgNames()); } break; } @@ -1954,10 +1927,10 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { assert(*ArgNo < Dag->getNumArgs()); - SmallVector Args(Dag->getArgs()); SmallVector Names(Dag->getArgNames()); Names[*ArgNo] = dyn_cast(RHS); - return DagInit::get(Dag->getOperator(), Dag->getName(), Args, Names); + return 
DagInit::get(Dag->getOperator(), Dag->getName(), Dag->getArgs(), + Names); } break; } @@ -2546,10 +2519,9 @@ const Init *VarDefInit::Fold() const { std::string VarDefInit::getAsString() const { std::string Result = Class->getNameInitAsString() + "<"; - const char *sep = ""; + ListSeparator LS; for (const Init *Arg : args()) { - Result += sep; - sep = ", "; + Result += LS; Result += Arg->getAsString(); } return Result + ">"; @@ -2648,15 +2620,14 @@ const CondOpInit *CondOpInit::get(ArrayRef Conds, const Init *CondOpInit::resolveReferences(Resolver &R) const { SmallVector NewConds; + SmallVector NewVals; + bool Changed = false; - for (const Init *Case : getConds()) { - const Init *NewCase = Case->resolveReferences(R); - NewConds.push_back(NewCase); - Changed |= NewCase != Case; - } + for (auto [Cond, Val] : getCondAndVals()) { + const Init *NewCond = Cond->resolveReferences(R); + NewConds.push_back(NewCond); + Changed |= NewCond != Cond; - SmallVector NewVals; - for (const Init *Val : getVals()) { const Init *NewVal = Val->resolveReferences(R); NewVals.push_back(NewVal); Changed |= NewVal != Val; @@ -2671,10 +2642,7 @@ const Init *CondOpInit::resolveReferences(Resolver &R) const { const Init *CondOpInit::Fold(const Record *CurRec) const { RecordKeeper &RK = getRecordKeeper(); - for (unsigned i = 0; i < NumConds; ++i) { - const Init *Cond = getCond(i); - const Init *Val = getVal(i); - + for (auto [Cond, Val] : getCondAndVals()) { if (const auto *CondI = dyn_cast_or_null( Cond->convertInitializerTo(IntRecTy::get(RK)))) { if (CondI->getValue()) @@ -2692,36 +2660,24 @@ const Init *CondOpInit::Fold(const Record *CurRec) const { } bool CondOpInit::isConcrete() const { - for (const Init *Case : getConds()) - if (!Case->isConcrete()) - return false; - - for (const Init *Val : getVals()) - if (!Val->isConcrete()) - return false; - - return true; + return all_of(getCondAndVals(), [](const auto &Pair) { + return std::get<0>(Pair)->isConcrete() && 
std::get<1>(Pair)->isConcrete(); + }); } bool CondOpInit::isComplete() const { - for (const Init *Case : getConds()) - if (!Case->isComplete()) - return false; - - for (const Init *Val : getVals()) - if (!Val->isConcrete()) - return false; - - return true; + return all_of(getCondAndVals(), [](const auto &Pair) { + return std::get<0>(Pair)->isComplete() && std::get<1>(Pair)->isComplete(); + }); } std::string CondOpInit::getAsString() const { std::string Result = "!cond("; - for (unsigned i = 0; i < getNumConds(); i++) { - Result += getCond(i)->getAsString() + ": "; - Result += getVal(i)->getAsString(); - if (i != getNumConds()-1) - Result += ", "; + ListSeparator LS; + for (auto [Cond, Val] : getCondAndVals()) { + Result += LS; + Result += Cond->getAsString() + ": "; + Result += Val->getAsString(); } return Result + ")"; } @@ -2731,20 +2687,15 @@ const Init *CondOpInit::getBit(unsigned Bit) const { } static void ProfileDagInit(FoldingSetNodeID &ID, const Init *V, - const StringInit *VN, - ArrayRef ArgRange, - ArrayRef NameRange) { + const StringInit *VN, ArrayRef Args, + ArrayRef ArgNames) { ID.AddPointer(V); ID.AddPointer(VN); - ArrayRef::iterator Arg = ArgRange.begin(); - ArrayRef::iterator Name = NameRange.begin(); - while (Arg != ArgRange.end()) { - assert(Name != NameRange.end() && "Arg name underflow!"); - ID.AddPointer(*Arg++); - ID.AddPointer(*Name++); + for (auto [Arg, Name] : zip_equal(Args, ArgNames)) { + ID.AddPointer(Arg); + ID.AddPointer(Name); } - assert(Name == NameRange.end() && "Arg name overflow!"); } DagInit::DagInit(const Init *V, const StringInit *VN, @@ -2779,17 +2730,14 @@ const DagInit *DagInit::get(const Init *V, const StringInit *VN, return I; } -const DagInit * -DagInit::get(const Init *V, const StringInit *VN, - ArrayRef> args) { +const DagInit *DagInit::get( + const Init *V, const StringInit *VN, + ArrayRef> ArgAndNames) { SmallVector Args; SmallVector Names; - for (const auto &[Arg, Name] : args) { - Args.push_back(Arg); - 
Names.push_back(Name); - } - + llvm::append_range(Args, make_first_range(ArgAndNames)); + llvm::append_range(Names, make_second_range(ArgAndNames)); return DagInit::get(V, VN, Args, Names); } @@ -2805,12 +2753,13 @@ const Record *DagInit::getOperatorAsDef(ArrayRef Loc) const { } std::optional DagInit::getArgNo(StringRef Name) const { - for (unsigned i = 0, e = getNumArgs(); i < e; ++i) { - const StringInit *ArgName = getArgName(i); - if (ArgName && ArgName->getValue() == Name) - return i; - } - return std::nullopt; + ArrayRef ArgNames = getArgNames(); + auto It = llvm::find_if(ArgNames, [Name](const StringInit *ArgName) { + return ArgName && ArgName->getValue() == Name; + }); + if (It == ArgNames.end()) + return std::nullopt; + return std::distance(ArgNames.begin(), It); } const Init *DagInit::resolveReferences(Resolver &R) const { @@ -2833,11 +2782,7 @@ const Init *DagInit::resolveReferences(Resolver &R) const { bool DagInit::isConcrete() const { if (!Val->isConcrete()) return false; - for (const Init *Elt : getArgs()) { - if (!Elt->isConcrete()) - return false; - } - return true; + return all_of(getArgs(), [](const Init *Elt) { return Elt->isConcrete(); }); } std::string DagInit::getAsString() const { @@ -2845,11 +2790,13 @@ std::string DagInit::getAsString() const { if (ValName) Result += ":" + ValName->getAsUnquotedString(); if (!arg_empty()) { - Result += " " + getArg(0)->getAsString(); - if (getArgName(0)) Result += ":$" + getArgName(0)->getAsUnquotedString(); - for (unsigned i = 1, e = getNumArgs(); i != e; ++i) { - Result += ", " + getArg(i)->getAsString(); - if (getArgName(i)) Result += ":$" + getArgName(i)->getAsUnquotedString(); + Result += " "; + ListSeparator LS; + for (auto [Arg, Name] : getArgAndNames()) { + Result += LS; + Result += Arg->getAsString(); + if (Name) + Result += ":$" + Name->getAsUnquotedString(); } } return Result + ")"; @@ -2893,24 +2840,26 @@ std::string RecordVal::getPrintType() const { } bool RecordVal::setValue(const Init *V) { - 
if (V) { - Value = V->getCastTo(getType()); - if (Value) { - assert(!isa(Value) || - cast(Value)->getType()->typeIsA(getType())); - if (const auto *BTy = dyn_cast(getType())) { - if (!isa(Value)) { - SmallVector Bits; - Bits.reserve(BTy->getNumBits()); - for (unsigned I = 0, E = BTy->getNumBits(); I < E; ++I) - Bits.push_back(Value->getBit(I)); - Value = BitsInit::get(V->getRecordKeeper(), Bits); - } - } - } - return Value == nullptr; + if (!V) { + Value = nullptr; + return false; + } + + Value = V->getCastTo(getType()); + if (!Value) + return true; + + assert(!isa(Value) || + cast(Value)->getType()->typeIsA(getType())); + if (const auto *BTy = dyn_cast(getType())) { + if (isa(Value)) + return false; + SmallVector Bits(BTy->getNumBits()); + for (unsigned I = 0, E = BTy->getNumBits(); I < E; ++I) + Bits[I] = Value->getBit(I); + Value = BitsInit::get(V->getRecordKeeper(), Bits); } - Value = nullptr; + return false; } @@ -2918,29 +2867,10 @@ bool RecordVal::setValue(const Init *V) { // location in the RecordVal. bool RecordVal::setValue(const Init *V, SMLoc NewLoc) { Loc = NewLoc; - if (V) { - Value = V->getCastTo(getType()); - if (Value) { - assert(!isa(Value) || - cast(Value)->getType()->typeIsA(getType())); - if (const auto *BTy = dyn_cast(getType())) { - if (!isa(Value)) { - SmallVector Bits; - Bits.reserve(BTy->getNumBits()); - for (unsigned I = 0, E = BTy->getNumBits(); I < E; ++I) - Bits.push_back(Value->getBit(I)); - Value = BitsInit::get(getRecordKeeper(), Bits); - } - } - } - return Value == nullptr; - } - Value = nullptr; - return false; + return setValue(V); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -#include "llvm/TableGen/Record.h" LLVM_DUMP_METHOD void RecordVal::dump() const { errs() << *this; } #endif @@ -3035,14 +2965,14 @@ void Record::resolveReferences(Resolver &R, const RecordVal *SkipVal) { } // Resolve the assertion expressions. 
- for (auto &Assertion : Assertions) { + for (AssertionInfo &Assertion : Assertions) { const Init *Value = Assertion.Condition->resolveReferences(R); Assertion.Condition = Value; Value = Assertion.Message->resolveReferences(R); Assertion.Message = Value; } // Resolve the dump expressions. - for (auto &Dump : Dumps) { + for (DumpInfo &Dump : Dumps) { const Init *Value = Dump.Message->resolveReferences(R); Dump.Message = Value; } @@ -3065,12 +2995,11 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) { ArrayRef TArgs = R.getTemplateArgs(); if (!TArgs.empty()) { OS << "<"; - bool NeedComma = false; + ListSeparator LS; for (const Init *TA : TArgs) { - if (NeedComma) OS << ", "; - NeedComma = true; const RecordVal *RV = R.getValue(TA); assert(RV && "Template argument record not found??"); + OS << LS; RV->print(OS, false); } OS << ">"; @@ -3314,7 +3243,7 @@ void Record::emitRecordDumps() { RecordResolver R(*this); R.setFinal(true); - for (const auto &Dump : getDumps()) { + for (const DumpInfo &Dump : getDumps()) { const Init *Message = Dump.Message->resolveReferences(R); dumpMessage(Dump.Loc, Message); } @@ -3342,12 +3271,12 @@ LLVM_DUMP_METHOD void RecordKeeper::dump() const { errs() << *this; } raw_ostream &llvm::operator<<(raw_ostream &OS, const RecordKeeper &RK) { OS << "------------- Classes -----------------\n"; - for (const auto &C : RK.getClasses()) - OS << "class " << *C.second; + for (const auto &[_, C] : RK.getClasses()) + OS << "class " << *C; OS << "------------- Defs -----------------\n"; - for (const auto &D : RK.getDefs()) - OS << "def " << *D.second; + for (const auto &[_, D] : RK.getDefs()) + OS << "def " << *D; return OS; } @@ -3373,7 +3302,7 @@ RecordKeeper::getAllDerivedDefinitions(ArrayRef ClassNames) const { std::vector Defs; assert(ClassNames.size() > 0 && "At least one class must be passed."); - for (const auto &ClassName : ClassNames) { + for (StringRef ClassName : ClassNames) { const Record *Class = getClass(ClassName); if 
(!Class) PrintFatalError("The class '" + ClassName + "' is not defined\n"); diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index a53606851d0a2..f55b7ef7c20bb 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1344,10 +1344,12 @@ AArch64AsmPrinter::getCodeViewJumpTableInfo(int JTI, } void AArch64AsmPrinter::emitFunctionEntryLabel() { - if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall || - MF->getFunction().getCallingConv() == - CallingConv::AArch64_SVE_VectorCall || - MF->getInfo()->isSVECC()) { + const Triple &TT = TM.getTargetTriple(); + if (TT.isOSBinFormatELF() && + (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall || + MF->getFunction().getCallingConv() == + CallingConv::AArch64_SVE_VectorCall || + MF->getInfo()->isSVECC())) { auto *TS = static_cast(OutStreamer->getTargetStreamer()); TS->emitDirectiveVariantPCS(CurrentFnSym); @@ -1355,8 +1357,7 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() { AsmPrinter::emitFunctionEntryLabel(); - if (TM.getTargetTriple().isWindowsArm64EC() && - !MF->getFunction().hasLocalLinkage()) { + if (TT.isWindowsArm64EC() && !MF->getFunction().hasLocalLinkage()) { // For ARM64EC targets, a function definition's name is mangled differently // from the normal symbol, emit required aliases here. auto emitFunctionAlias = [&](MCSymbol *Src, MCSymbol *Dst) { diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index 4d0d99bce258a..c3370cd6e946c 100644 --- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -272,9 +272,12 @@ static int mapRegToGPRIndex(MCRegister Reg) { /// datastructure for each tracked general purpose register. struct LOHInfo { MCLOHType Type : 8; ///< "Best" type of LOH possible. - bool IsCandidate : 1; ///< Possible LOH candidate. 
- bool OneUser : 1; ///< Found exactly one user (yet). - bool MultiUsers : 1; ///< Found multiple users. + LLVM_PREFERRED_TYPE(bool) + unsigned IsCandidate : 1; ///< Possible LOH candidate. + LLVM_PREFERRED_TYPE(bool) + unsigned OneUser : 1; ///< Found exactly one user (yet). + LLVM_PREFERRED_TYPE(bool) + unsigned MultiUsers : 1; ///< Found multiple users. const MachineInstr *MI0; ///< First instruction involved in the LOH. const MachineInstr *MI1; ///< Second instruction involved in the LOH /// (if any). diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 040662a5f11dd..bcff151fe62e7 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -3611,6 +3611,9 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned ExtraCSSpill = 0; bool HasUnpairedGPR64 = false; bool HasPairZReg = false; + BitVector UserReservedRegs = RegInfo->getUserReservedRegs(MF); + BitVector ReservedRegs = RegInfo->getReservedRegs(MF); + // Figure out which callee-saved registers to save/restore. for (unsigned i = 0; CSRegs[i]; ++i) { const unsigned Reg = CSRegs[i]; @@ -3621,7 +3624,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // Don't save manually reserved registers set through +reserve-x#i, // even for callee-saved registers, as per GCC's behavior. 
- if (RegInfo->isUserReservedReg(MF, Reg)) { + if (UserReservedRegs[Reg]) { SavedRegs.reset(Reg); continue; } @@ -3653,8 +3656,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, AArch64::FPR128RegClass.contains(Reg, PairedReg)); if (!RegUsed) { - if (AArch64::GPR64RegClass.contains(Reg) && - !RegInfo->isReservedReg(MF, Reg)) { + if (AArch64::GPR64RegClass.contains(Reg) && !ReservedRegs[Reg]) { UnspilledCSGPR = Reg; UnspilledCSGPRPaired = PairedReg; } @@ -3676,7 +3678,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, !SavedRegs.test(PairedReg)) { SavedRegs.set(PairedReg); if (AArch64::GPR64RegClass.contains(PairedReg) && - !RegInfo->isReservedReg(MF, PairedReg)) + !ReservedRegs[PairedReg]) ExtraCSSpill = PairedReg; } // Check if there is a pair of ZRegs, so it can select PReg for spill/fill @@ -3699,7 +3701,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, AFI->setPredicateRegForFillSpill(AArch64::PN8); } - assert(!RegInfo->isReservedReg(MF, AFI->getPredicateRegForFillSpill()) && + assert(!ReservedRegs[AFI->getPredicateRegForFillSpill()] && "Predicate cannot be a reserved register"); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 13fb6a32233fe..293292d47dd48 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -290,6 +290,7 @@ static bool isZeroingInactiveLanes(SDValue Op) { return false; // We guarantee i1 splat_vectors to zero the other lanes case ISD::SPLAT_VECTOR: + case ISD::GET_ACTIVE_LANE_MASK: case AArch64ISD::PTRUE: case AArch64ISD::SETCC_MERGE_ZERO: return true; @@ -1178,6 +1179,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::CTLZ); + setTargetDAGCombine(ISD::GET_ACTIVE_LANE_MASK); + setTargetDAGCombine(ISD::VECREDUCE_AND); setTargetDAGCombine(ISD::VECREDUCE_OR); 
setTargetDAGCombine(ISD::VECREDUCE_XOR); @@ -1493,8 +1496,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); } - for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) + for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) { setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Legal); + setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Legal); + } + + for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32}) + setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Custom); } if (Subtarget->isSVEorStreamingSVEAvailable()) { @@ -5731,21 +5739,27 @@ static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, DAG.getTargetConstant(Pattern, DL, MVT::i32)); } -static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG, +static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG, bool IsSigned, bool IsEqual) { - if (!isa(Op.getOperand(1)) || - !isa(Op.getOperand(2))) + unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0; + unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1; + + if (!isa(N->getOperand(Op1))) return SDValue(); - SDLoc dl(Op); - APInt X = Op.getConstantOperandAPInt(1); - APInt Y = Op.getConstantOperandAPInt(2); + SDLoc dl(N); + APInt Y = N->getConstantOperandAPInt(Op1); // When the second operand is the maximum value, comparisons that include // equality can never fail and thus we can return an all active predicate. if (IsEqual) if (IsSigned ? 
Y.isMaxSignedValue() : Y.isMaxValue()) - return DAG.getConstant(1, dl, Op.getValueType()); + return DAG.getConstant(1, dl, N->getValueType(0)); + + if (!isa(N->getOperand(Op0))) + return SDValue(); + + APInt X = N->getConstantOperandAPInt(Op0); bool Overflow; APInt NumActiveElems = @@ -5766,10 +5780,10 @@ static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG, getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue()); unsigned MinSVEVectorSize = std::max( DAG.getSubtarget().getMinSVEVectorSizeInBits(), 128u); - unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements(); + unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements(); if (PredPattern != std::nullopt && NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize)) - return getPTrue(DAG, dl, Op.getValueType(), *PredPattern); + return getPTrue(DAG, dl, N->getValueType(0), *PredPattern); return SDValue(); } @@ -6221,17 +6235,14 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getNode( AArch64ISD::URSHR_I, dl, Op.getOperand(1).getValueType(), Op.getOperand(1), Op.getOperand(2))); return SDValue(); - case Intrinsic::aarch64_sve_whilelo: - return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false, - /*IsEqual=*/false); case Intrinsic::aarch64_sve_whilelt: - return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true, + return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true, /*IsEqual=*/false); case Intrinsic::aarch64_sve_whilels: - return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false, + return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false, /*IsEqual=*/true); case Intrinsic::aarch64_sve_whilele: - return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true, + return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true, /*IsEqual=*/true); case Intrinsic::aarch64_sve_sunpkhi: return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(), @@ -6532,28 +6543,6 @@ SDValue 
AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AArch64ISD::USDOT, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } - case Intrinsic::get_active_lane_mask: { - SDValue ID = - DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64); - - EVT VT = Op.getValueType(); - if (VT.isScalableVector()) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1), - Op.getOperand(2)); - - // We can use the SVE whilelo instruction to lower this intrinsic by - // creating the appropriate sequence of scalable vector operations and - // then extracting a fixed-width subvector from the scalable vector. - - EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); - EVT WhileVT = ContainerVT.changeElementType(MVT::i1); - - SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID, - Op.getOperand(1), Op.getOperand(2)); - SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt, - DAG.getVectorIdxConstant(0, dl)); - } case Intrinsic::aarch64_neon_saddlv: case Intrinsic::aarch64_neon_uaddlv: { EVT OpVT = Op.getOperand(1).getValueType(); @@ -7039,7 +7028,7 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base, DAG.getConstant(i * 8, Dl, PtrVT)); Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(), - StoreNode->getOriginalAlign()); + StoreNode->getBaseAlign()); } return Chain; } @@ -7093,9 +7082,9 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, for (unsigned i = 0; i < 8; i++) { SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, DAG.getConstant(i * 8, DL, PtrVT)); - SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr, - LoadNode->getPointerInfo(), - LoadNode->getOriginalAlign()); + SDValue Part = + DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(), + LoadNode->getBaseAlign()); Ops.push_back(Part); Chain = 
SDValue(Part.getNode(), 1); } @@ -7692,6 +7681,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerVECTOR_DEINTERLEAVE(Op, DAG); case ISD::VECTOR_INTERLEAVE: return LowerVECTOR_INTERLEAVE(Op, DAG); + case ISD::GET_ACTIVE_LANE_MASK: + return LowerGET_ACTIVE_LANE_MASK(Op, DAG); case ISD::LRINT: case ISD::LLRINT: if (Op.getValueType().isVector()) @@ -8240,26 +8231,26 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } // varargs - // Note that IsWin64 part is required to prevent odd miscompilations on arm64 - // windows platforms. For more info refer to GH#126780 PR comments. - if (isVarArg && - (DAG.getMachineFunction().getFrameInfo().hasVAStart() || IsWin64)) { - if (!Subtarget->isTargetDarwin() || IsWin64) { - // The AAPCS variadic function ABI is identical to the non-variadic - // one. As a result there may be more arguments in registers and we should - // save them for future reference. - // Win64 variadic functions also pass arguments in registers, but all float - // arguments are passed in integer registers. - saveVarArgRegisters(CCInfo, DAG, DL, Chain); - } - - // This will point to the next argument passed via stack. - unsigned VarArgsOffset = CCInfo.getStackSize(); - // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 - VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8); - FuncInfo->setVarArgsStackOffset(VarArgsOffset); - FuncInfo->setVarArgsStackIndex( - MFI.CreateFixedObject(4, VarArgsOffset, true)); + if (isVarArg) { + if (DAG.getMachineFunction().getFrameInfo().hasVAStart()) { + if (!Subtarget->isTargetDarwin() || IsWin64) { + // The AAPCS variadic function ABI is identical to the non-variadic + // one. As a result there may be more arguments in registers and we + // should save them for future reference. + // Win64 variadic functions also pass arguments in registers, but all + // float arguments are passed in integer registers. 
+ saveVarArgRegisters(CCInfo, DAG, DL, Chain); + } + + // This will point to the next argument passed via stack. + unsigned VarArgsOffset = CCInfo.getStackSize(); + // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 + VarArgsOffset = + alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8); + FuncInfo->setVarArgsStackOffset(VarArgsOffset); + FuncInfo->setVarArgsStackIndex( + MFI.CreateFixedObject(4, VarArgsOffset, true)); + } if (MFI.hasMustTailInVarArgFunc()) { SmallVector RegParmTypes; @@ -8641,6 +8632,16 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI, } } +static SMECallAttrs +getSMECallAttrs(const Function &Caller, + const TargetLowering::CallLoweringInfo &CLI) { + if (CLI.CB) + return SMECallAttrs(*CLI.CB); + if (auto *ES = dyn_cast(CLI.Callee)) + return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol())); + return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal)); +} + bool AArch64TargetLowering::isEligibleForTailCallOptimization( const CallLoweringInfo &CLI) const { CallingConv::ID CalleeCC = CLI.CallConv; @@ -8659,12 +8660,10 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // SME Streaming functions are not eligible for TCO as they may require // the streaming mode or ZA to be restored after returning from the call. - SMEAttrs CallerAttrs(MF.getFunction()); - auto CalleeAttrs = CLI.CB ? 
SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal); - if (CallerAttrs.requiresSMChange(CalleeAttrs) || - CallerAttrs.requiresLazySave(CalleeAttrs) || - CallerAttrs.requiresPreservingAllZAState(CalleeAttrs) || - CallerAttrs.hasStreamingBody()) + SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, CLI); + if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || + CallAttrs.requiresPreservingAllZAState() || + CallAttrs.caller().hasStreamingBody()) return false; // Functions using the C or Fast calling convention that have an SVE signature @@ -8956,14 +8955,14 @@ static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, return TLI.LowerCallTo(CLI).second; } -static unsigned getSMCondition(const SMEAttrs &CallerAttrs, - const SMEAttrs &CalleeAttrs) { - if (!CallerAttrs.hasStreamingCompatibleInterface() || - CallerAttrs.hasStreamingBody()) +static AArch64SME::ToggleCondition +getSMToggleCondition(const SMECallAttrs &CallAttrs) { + if (!CallAttrs.caller().hasStreamingCompatibleInterface() || + CallAttrs.caller().hasStreamingBody()) return AArch64SME::Always; - if (CalleeAttrs.hasNonStreamingInterface()) + if (CallAttrs.callee().hasNonStreamingInterface()) return AArch64SME::IfCallerIsStreaming; - if (CalleeAttrs.hasStreamingInterface()) + if (CallAttrs.callee().hasStreamingInterface()) return AArch64SME::IfCallerIsNonStreaming; llvm_unreachable("Unsupported attributes"); @@ -9096,11 +9095,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } // Determine whether we need any streaming mode changes. 
- SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction()); - if (CLI.CB) - CalleeAttrs = SMEAttrs(*CLI.CB); - else if (auto *ES = dyn_cast(CLI.Callee)) - CalleeAttrs = SMEAttrs(ES->getSymbol()); + SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), CLI); auto DescribeCallsite = [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & { @@ -9115,9 +9110,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, return R; }; - bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); - bool RequiresSaveAllZA = - CallerAttrs.requiresPreservingAllZAState(CalleeAttrs); + bool RequiresLazySave = CallAttrs.requiresLazySave(); + bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState(); if (RequiresLazySave) { const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); MachinePointerInfo MPI = @@ -9145,18 +9139,18 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, return DescribeCallsite(R) << " sets up a lazy save for ZA"; }); } else if (RequiresSaveAllZA) { - assert(!CalleeAttrs.hasSharedZAInterface() && + assert(!CallAttrs.callee().hasSharedZAInterface() && "Cannot share state that may not exist"); Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain, /*IsSave=*/true); } SDValue PStateSM; - bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs); + bool RequiresSMChange = CallAttrs.requiresSMChange(); if (RequiresSMChange) { - if (CallerAttrs.hasStreamingInterfaceOrBody()) + if (CallAttrs.caller().hasStreamingInterfaceOrBody()) PStateSM = DAG.getConstant(1, DL, MVT::i64); - else if (CallerAttrs.hasNonStreamingInterface()) + else if (CallAttrs.caller().hasNonStreamingInterface()) PStateSM = DAG.getConstant(0, DL, MVT::i64); else PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64); @@ -9173,7 +9167,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue ZTFrameIdx; MachineFrameInfo &MFI = MF.getFrameInfo(); - bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs); + bool 
ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0(); // If the caller has ZT0 state which will not be preserved by the callee, // spill ZT0 before the call. @@ -9189,7 +9183,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If caller shares ZT0 but the callee is not shared ZA, we need to stop // PSTATE.ZA before the call if there is no lazy-save active. - bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs); + bool DisableZA = CallAttrs.requiresDisablingZABeforeCall(); assert((!DisableZA || !RequiresLazySave) && "Lazy-save should have PSTATE.SM=1 on entry to the function"); @@ -9442,7 +9436,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } } - if (IsVarArg && Subtarget->isWindowsArm64EC()) { + if (IsVarArg && Subtarget->isWindowsArm64EC() && + !(CLI.CB && CLI.CB->isMustTailCall())) { SDValue ParamPtr = StackPtr; if (IsTailCall) { // Create a dummy object at the top of the stack that can be used to get @@ -9472,8 +9467,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } SDValue NewChain = changeStreamingMode( - DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue, - getSMCondition(CallerAttrs, CalleeAttrs), PStateSM); + DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue, + getSMToggleCondition(CallAttrs), PStateSM); Chain = NewChain.getValue(0); InGlue = NewChain.getValue(1); } @@ -9659,8 +9654,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (RequiresSMChange) { assert(PStateSM && "Expected a PStateSM to be set"); Result = changeStreamingMode( - DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue, - getSMCondition(CallerAttrs, CalleeAttrs), PStateSM); + DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue, + getSMToggleCondition(CallAttrs), PStateSM); if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) { InGlue = Result.getValue(1); @@ -9670,7 +9665,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } } - if 
(CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs)) + if (CallAttrs.requiresEnablingZAAfterCall()) // Unconditionally resume ZA. Result = DAG.getNode( AArch64ISD::SMSTART, DL, MVT::Other, Result, @@ -18149,6 +18144,70 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP); } +static SDValue +performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *ST) { + if (DCI.isBeforeLegalize()) + return SDValue(); + + if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false, + /*IsEqual=*/false)) + return While; + + if (!ST->hasSVE2p1()) + return SDValue(); + + if (!N->hasNUsesOfValue(2, 0)) + return SDValue(); + + const uint64_t HalfSize = N->getValueType(0).getVectorMinNumElements() / 2; + if (HalfSize < 2) + return SDValue(); + + auto It = N->user_begin(); + SDNode *Lo = *It++; + SDNode *Hi = *It; + + if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR || + Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR) + return SDValue(); + + uint64_t OffLo = Lo->getConstantOperandVal(1); + uint64_t OffHi = Hi->getConstantOperandVal(1); + + if (OffLo > OffHi) { + std::swap(Lo, Hi); + std::swap(OffLo, OffHi); + } + + if (OffLo != 0 || OffHi != HalfSize) + return SDValue(); + + EVT HalfVec = Lo->getValueType(0); + if (HalfVec != Hi->getValueType(0) || + HalfVec.getVectorElementCount() != ElementCount::getScalable(HalfSize)) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + SDValue ID = + DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64); + SDValue Idx = N->getOperand(0); + SDValue TC = N->getOperand(1); + if (Idx.getValueType() != MVT::i64) { + Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64); + TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64); + } + auto R = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, + {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC}); + + DCI.CombineTo(Lo, R.getValue(0)); + DCI.CombineTo(Hi, 
R.getValue(1)); + + return SDValue(N, 0); +} + // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one)) // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B)) @@ -19679,6 +19738,8 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, static bool isPredicateCCSettingOp(SDValue N) { if ((N.getOpcode() == ISD::SETCC) || + // get_active_lane_mask is lowered to a whilelo instruction. + (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) || (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN && (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt || @@ -19687,9 +19748,7 @@ static bool isPredicateCCSettingOp(SDValue N) { N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo || N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels || - N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt || - // get_active_lane_mask is lowered to a whilelo instruction. 
- N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask))) + N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt))) return true; return false; @@ -21319,7 +21378,7 @@ static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) { for (const auto &[L0, L1] : zip(Loads0, Loads1)) { SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(), L0->getBasePtr(), L0->getPointerInfo(), - L0->getOriginalAlign()); + L0->getBaseAlign()); DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1)); DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1)); NewLoads.push_back(Load); @@ -21803,66 +21862,6 @@ static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, return SDValue(); } -static SDValue tryCombineWhileLo(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const AArch64Subtarget *Subtarget) { - if (DCI.isBeforeLegalize()) - return SDValue(); - - if (!Subtarget->hasSVE2p1()) - return SDValue(); - - if (!N->hasNUsesOfValue(2, 0)) - return SDValue(); - - const uint64_t HalfSize = N->getValueType(0).getVectorMinNumElements() / 2; - if (HalfSize < 2) - return SDValue(); - - auto It = N->user_begin(); - SDNode *Lo = *It++; - SDNode *Hi = *It; - - if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR || - Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR) - return SDValue(); - - uint64_t OffLo = Lo->getConstantOperandVal(1); - uint64_t OffHi = Hi->getConstantOperandVal(1); - - if (OffLo > OffHi) { - std::swap(Lo, Hi); - std::swap(OffLo, OffHi); - } - - if (OffLo != 0 || OffHi != HalfSize) - return SDValue(); - - EVT HalfVec = Lo->getValueType(0); - if (HalfVec != Hi->getValueType(0) || - HalfVec.getVectorElementCount() != ElementCount::getScalable(HalfSize)) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - SDValue ID = - DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64); - SDValue Idx = N->getOperand(1); - SDValue TC = N->getOperand(2); - if (Idx.getValueType() != MVT::i64) { - Idx = DAG.getZExtOrTrunc(Idx, DL, 
MVT::i64); - TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64); - } - auto R = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, - {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC}); - - DCI.CombineTo(Lo, R.getValue(0)); - DCI.CombineTo(Hi, R.getValue(1)); - - return SDValue(N, 0); -} - SDValue tryLowerPartialReductionToDot(SDNode *N, const AArch64Subtarget *Subtarget, SelectionDAG &DAG) { @@ -22342,7 +22341,8 @@ static SDValue performIntrinsicCombine(SDNode *N, return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), AArch64CC::LAST_ACTIVE); case Intrinsic::aarch64_sve_whilelo: - return tryCombineWhileLo(N, DCI, Subtarget); + return DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_sve_bsl: case Intrinsic::aarch64_sve_bsl1n: case Intrinsic::aarch64_sve_bsl2n: @@ -23581,7 +23581,7 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) { static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { EVT MemVT = LD->getMemoryVT(); if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) || - LD->getOriginalAlign() >= 4) + LD->getBaseAlign() >= 4) return SDValue(); SDLoc DL(LD); @@ -23643,7 +23643,7 @@ static SDValue performLOADCombine(SDNode *N, DAG.getAddrSpaceCast(DL, PtrVT, LD->getBasePtr(), AddrSpace, 0); return DAG.getExtLoad(LD->getExtensionType(), DL, RegVT, LD->getChain(), Cast, LD->getPointerInfo(), MemVT, - LD->getOriginalAlign(), + LD->getBaseAlign(), LD->getMemOperand()->getFlags()); } } @@ -23954,8 +23954,8 @@ static SDValue performSTORECombine(SDNode *N, if (PtrVT != Ptr.getSimpleValueType()) { SDValue Cast = DAG.getAddrSpaceCast(DL, PtrVT, Ptr, AddrSpace, 0); return DAG.getStore(Chain, DL, Value, Cast, ST->getPointerInfo(), - ST->getOriginalAlign(), - ST->getMemOperand()->getFlags(), ST->getAAInfo()); + ST->getBaseAlign(), ST->getMemOperand()->getFlags(), + ST->getAAInfo()); } } @@ -26774,6 +26774,8 @@ SDValue 
AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performExtractVectorEltCombine(N, DCI, Subtarget); case ISD::VECREDUCE_ADD: return performVecReduceAddCombine(N, DCI.DAG, Subtarget); + case ISD::GET_ACTIVE_LANE_MASK: + return performActiveLaneMaskCombine(N, DCI, Subtarget); case AArch64ISD::UADDV: return performUADDVCombine(N, DAG); case AArch64ISD::SMULL: @@ -27756,8 +27758,7 @@ void AArch64TargetLowering::ReplaceNodeResults( DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM)); return; } - case Intrinsic::experimental_vector_match: - case Intrinsic::get_active_lane_mask: { + case Intrinsic::experimental_vector_match: { if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1) return; @@ -28169,8 +28170,8 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - Function *ThreadPointerFunc = - Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer); + Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::thread_pointer, IRB.getPtrTy()); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), Offset), @@ -28559,12 +28560,10 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { // Checks to allow the use of SME instructions if (auto *Base = dyn_cast(&Inst)) { - auto CallerAttrs = SMEAttrs(*Inst.getFunction()); - auto CalleeAttrs = SMEAttrs(*Base); - if (CallerAttrs.requiresSMChange(CalleeAttrs) || - CallerAttrs.requiresLazySave(CalleeAttrs) || - CallerAttrs.requiresPreservingZT0(CalleeAttrs) || - CallerAttrs.requiresPreservingAllZAState(CalleeAttrs)) + auto CallAttrs = SMECallAttrs(*Base); + if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || + CallAttrs.requiresPreservingZT0() || + CallAttrs.requiresPreservingAllZAState()) return true; } return false; @@ -29551,6 
+29550,29 @@ AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op, return DAG.getNode(ISD::ADD, DL, ResultVT, Acc, Extended); } +SDValue +AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); + + assert(Subtarget->isSVEorStreamingSVEAvailable() && + "Lowering fixed length get_active_lane_mask requires SVE!"); + + // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK, + // but we can use SVE when available. + + SDLoc DL(Op); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + EVT WhileVT = ContainerVT.changeElementType(MVT::i1); + + SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WhileVT, + Op.getOperand(0), Op.getOperand(1)); + SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt, + DAG.getVectorIdxConstant(0, DL)); +} + SDValue AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 9d8d1c22258be..c1e6d70099fa5 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -875,9 +875,10 @@ class AArch64TargetLowering : public TargetLowering { if (!VT.isVector()) return hasAndNotCompare(Y); - TypeSize TS = VT.getSizeInBits(); - // TODO: We should be able to use bic/bif too for SVE. 
- return !TS.isScalable() && TS.getFixedValue() >= 64; // vector 'bic' + if (VT.isScalableVector()) + return true; + + return VT.getFixedSizeInBits() >= 64; // vector 'bic' } bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( @@ -1182,6 +1183,7 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_HISTOGRAM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGET_ACTIVE_LANE_MASK(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 33241c65a4a37..5489541fcb318 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -10188,7 +10188,7 @@ multiclass SIMDScalarLShiftDTied opc, string asm, def d : BaseSIMDScalarShiftTied { + (i32 vecshiftL64:$imm)))]> { let Inst{21-16} = imm{5-0}; } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index b02a907f7439f..010c7c391527f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -7358,7 +7358,8 @@ def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))), // Patterns for i8/i16 -> v2i32/v4i16 lane moves via insert and extract that go via i32. 
multiclass Neon_INS_elt_ext_pattern { + Instruction INS, Instruction DUP, SubRegIndex DUPSub, + SDNodeXForm VecIndexMult> { // VT64->OutVT def : Pat<(OutVT (vector_insert (OutVT V64:$src), (i32 (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))), @@ -7369,8 +7370,10 @@ multiclass Neon_INS_elt_ext_pattern; def : Pat<(OutVT (scalar_to_vector (i32 (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))))), (EXTRACT_SUBREG - (INS (IMPLICIT_DEF), 0, - (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immn), + (VT128 (SUBREG_TO_REG + (i64 0), + (DUP (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immn), + DUPSub)), dsub)>; // VT128->OutVT @@ -7383,25 +7386,38 @@ multiclass Neon_INS_elt_ext_pattern; def : Pat<(OutVT (scalar_to_vector (i32 (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))))), (EXTRACT_SUBREG - (INS (IMPLICIT_DEF), 0, V128:$Rn, imm:$Immn), + (VT128 (SUBREG_TO_REG + (i64 0), + (DUP V128:$Rn, imm:$Immn), + DUPSub)), dsub)>; } -defm : Neon_INS_elt_ext_pattern; -defm : Neon_INS_elt_ext_pattern; -defm : Neon_INS_elt_ext_pattern; +defm : Neon_INS_elt_ext_pattern; +defm : Neon_INS_elt_ext_pattern; +defm : Neon_INS_elt_ext_pattern; // bitcast of an extract -// f32 bitcast(vector_extract(v4i32 src, lane)) -> EXTRACT_SUBREG(INSvi32lane(-, 0, src, lane)) -def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))), - (EXTRACT_SUBREG (INSvi32lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), ssub)>; +// f32 bitcast(vector_extract(v4i32 src, 0)) -> EXTRACT_SUBREG(src) +def : Pat<(f32 (bitconvert (i32 (vector_extract v16i8:$src, (i64 0))))), + (EXTRACT_SUBREG V128:$src, bsub)>; +def : Pat<(f32 (bitconvert (i32 (vector_extract v8i16:$src, (i64 0))))), + (EXTRACT_SUBREG V128:$src, hsub)>; def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, (i64 0))))), (EXTRACT_SUBREG V128:$src, ssub)>; -def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, imm:$Immd)))), - (EXTRACT_SUBREG (INSvi64lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), dsub)>; 
def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, (i64 0))))), (EXTRACT_SUBREG V128:$src, dsub)>; +// f32 bitcast(vector_extract(v4i32 src, lane)) -> DUPi32(src, lane) +def : Pat<(f32 (bitconvert (i32 (vector_extract v16i8:$src, imm:$Immd)))), + (EXTRACT_SUBREG (v16i8 (SUBREG_TO_REG (i64 0), (DUPi8 V128:$src, imm:$Immd), bsub)), ssub)>; +def : Pat<(f32 (bitconvert (i32 (vector_extract v8i16:$src, imm:$Immd)))), + (EXTRACT_SUBREG (v8i16 (SUBREG_TO_REG (i64 0), (DUPi16 V128:$src, imm:$Immd), hsub)), ssub)>; +def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))), + (DUPi32 V128:$src, imm:$Immd)>; +def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, imm:$Immd)))), + (DUPi64 V128:$src, imm:$Immd)>; + // Floating point vector extractions are codegen'd as either a sequence of // subregister extractions, or a MOV (aka DUP here) if // the lane number is anything other than zero. diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index d6bd59adef03b..a4bcd6847c4f0 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2141,12 +2141,12 @@ let Predicates = [HasSVE_or_SME] in { defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt, int_aarch64_sve_whilegt>; defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele, null_frag>; - defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo", int_aarch64_sve_whilelo, int_aarch64_sve_whilehi>; + defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo", get_active_lane_mask, int_aarch64_sve_whilehi>; defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels", int_aarch64_sve_whilels, null_frag>; defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt", int_aarch64_sve_whilelt, int_aarch64_sve_whilegt>; defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele", int_aarch64_sve_whilele, null_frag>; - defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo", 
int_aarch64_sve_whilelo, int_aarch64_sve_whilehi>; + defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo", get_active_lane_mask, int_aarch64_sve_whilehi>; defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels", int_aarch64_sve_whilels, null_frag>; def CTERMEQ_WW : sve_int_cterm<0b0, 0b0, "ctermeq", GPR32>; @@ -3459,16 +3459,10 @@ let Predicates = [HasSVE_or_SME] in { // Alternative case where insertelement is just scalar_to_vector rather than vector_insert. def : Pat<(v1f64 (scalar_to_vector (f64 (vector_extract nxv2f64:$vec, VectorIndexD:$index)))), - (EXTRACT_SUBREG - (INSvi64lane (IMPLICIT_DEF), (i64 0), - (EXTRACT_SUBREG nxv2f64:$vec, zsub), VectorIndexD:$index), - dsub)>; + (DUPi64 (EXTRACT_SUBREG nxv2f64:$vec, zsub), VectorIndexD:$index)>; def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)))), - (EXTRACT_SUBREG - (INSvi64lane (IMPLICIT_DEF), (i64 0), - (EXTRACT_SUBREG nxv2i64:$vec, zsub), VectorIndexD:$index), - dsub)>; + (DUPi64 (EXTRACT_SUBREG nxv2i64:$vec, zsub), VectorIndexD:$index)>; } // End HasNEON let Predicates = [HasNEON] in { @@ -3998,12 +3992,12 @@ let Predicates = [HasSVE2_or_SME] in { defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", int_aarch64_sve_whilege, null_frag>; defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt", int_aarch64_sve_whilegt, int_aarch64_sve_whilelt>; defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs", int_aarch64_sve_whilehs, null_frag>; - defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi", int_aarch64_sve_whilehi, int_aarch64_sve_whilelo>; + defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi", int_aarch64_sve_whilehi, get_active_lane_mask>; defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege", int_aarch64_sve_whilege, null_frag>; defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt", int_aarch64_sve_whilegt, int_aarch64_sve_whilelt>; defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs", int_aarch64_sve_whilehs, null_frag>; - defm WHILEHI_PXX : 
sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi, int_aarch64_sve_whilelo>; + defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi, get_active_lane_mask>; // SVE2 pointer conflict compare defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 97e4993d52b4f..a2ce9c97bb50e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -268,22 +268,21 @@ const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = { bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { - SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); + SMECallAttrs CallAttrs(*Caller, *Callee); // When inlining, we should consider the body of the function, not the // interface. - if (CalleeAttrs.hasStreamingBody()) { - CalleeAttrs.set(SMEAttrs::SM_Compatible, false); - CalleeAttrs.set(SMEAttrs::SM_Enabled, true); + if (CallAttrs.callee().hasStreamingBody()) { + CallAttrs.callee().set(SMEAttrs::SM_Compatible, false); + CallAttrs.callee().set(SMEAttrs::SM_Enabled, true); } - if (CalleeAttrs.isNewZA() || CalleeAttrs.isNewZT0()) + if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0()) return false; - if (CallerAttrs.requiresLazySave(CalleeAttrs) || - CallerAttrs.requiresSMChange(CalleeAttrs) || - CallerAttrs.requiresPreservingZT0(CalleeAttrs) || - CallerAttrs.requiresPreservingAllZAState(CalleeAttrs)) { + if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() || + CallAttrs.requiresPreservingZT0() || + CallAttrs.requiresPreservingAllZAState()) { if (hasPossibleIncompatibleOps(Callee)) return false; } @@ -349,12 +348,14 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, // streaming-mode change, and the call to G from F would also require a // 
streaming-mode change, then there is benefit to do the streaming-mode // change only once and avoid inlining of G into F. + SMEAttrs FAttrs(*F); - SMEAttrs CalleeAttrs(Call); - if (FAttrs.requiresSMChange(CalleeAttrs)) { + SMECallAttrs CallAttrs(Call); + + if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) { if (F == Call.getCaller()) // (1) return CallPenaltyChangeSM * DefaultCallPenalty; - if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2) + if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2) return InlineCallPenaltyChangeSM * DefaultCallPenalty; } @@ -5478,6 +5479,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost( VectorType *NTp = VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); InstructionCost Cost; + std::map>, InstructionCost> + PreviousCosts; for (unsigned N = 0; N < NumVecs; N++) { SmallVector NMask; // Split the existing mask into chunks of size LTNumElts. Track the source @@ -5514,15 +5517,26 @@ InstructionCost AArch64TTIImpl::getShuffleCost( else NMask.push_back(MaskElt % LTNumElts); } + // Check if we have already generated this sub-shuffle, which means we + // will have already generated the output. For example a <16 x i32> splat + // will be the same sub-splat 4 times, which only needs to be generated + // once and reused. + auto Result = + PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0}); + // Check if it was already in the map (already costed). + if (!Result.second) + continue; // If the sub-mask has at most 2 input sub-vectors then re-cost it using // getShuffleCost. If not then cost it using the worst case as the number // of element moves into a new vector. - if (NumSources <= 2) - Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc + InstructionCost NCost = + NumSources <= 2 + ? getShuffleCost(NumSources <= 1 ? 
TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, - NTp, NMask, CostKind, 0, nullptr, Args, CxtI); - else - Cost += LTNumElts; + NTp, NMask, CostKind, 0, nullptr, Args, CxtI) + : LTNumElts; + Result.first->second = NCost; + Cost += NCost; } return Cost; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp index 460902c67fe35..cca0adc84f6f6 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp @@ -135,8 +135,8 @@ void AArch64O0PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -153,7 +153,8 @@ bool AArch64O0PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { auto &TPC = getAnalysis(); const Function &F = MF.getFunction(); - GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); const AArch64Subtarget &ST = MF.getSubtarget(); @@ -174,7 +175,7 @@ INITIALIZE_PASS_BEGIN(AArch64O0PreLegalizerCombiner, DEBUG_TYPE, "Combine AArch64 machine instrs before legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) INITIALIZE_PASS_END(AArch64O0PreLegalizerCombiner, DEBUG_TYPE, "Combine AArch64 machine instrs before legalization", false, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index 32c33990ad348..1c3d2b4166309 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ 
b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -636,8 +636,8 @@ void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); if (!IsOptNone) { AU.addRequired(); AU.addPreserved(); @@ -668,7 +668,8 @@ bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const AArch64Subtarget &ST = MF.getSubtarget(); const auto *LI = ST.getLegalizerInfo(); - GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); MachineDominatorTree *MDT = IsOptNone ? nullptr : &getAnalysis().getDomTree(); @@ -883,7 +884,7 @@ INITIALIZE_PASS_BEGIN(AArch64PostLegalizerCombiner, DEBUG_TYPE, "Combine AArch64 MachineInstrs after legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE, "Combine AArch64 MachineInstrs after legalization", false, false) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 416386555dc0e..37a7d2206b180 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -820,8 +820,8 @@ void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); @@ -852,7 +852,8 @@ bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const Function &F = MF.getFunction(); bool EnableOpt = 
MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); - GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); MachineDominatorTree *MDT = &getAnalysis().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, @@ -874,7 +875,7 @@ INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE, "Combine AArch64 machine instrs before legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE, "Combine AArch64 machine instrs before legalization", false, diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index 76d2ac6a601e5..271094f935e0e 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -27,15 +27,14 @@ void SMEAttrs::set(unsigned M, bool Enable) { "ZA_New and SME_ABI_Routine are mutually exclusive"); assert( - (!sharesZA() || - (isNewZA() ^ isInZA() ^ isInOutZA() ^ isOutZA() ^ isPreservesZA())) && + (isNewZA() + isInZA() + isOutZA() + isInOutZA() + isPreservesZA()) <= 1 && "Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', " "'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive"); // ZT0 Attrs assert( - (!sharesZT0() || (isNewZT0() ^ isInZT0() ^ isInOutZT0() ^ isOutZT0() ^ - isPreservesZT0())) && + (isNewZT0() + isInZT0() + isOutZT0() + isInOutZT0() + isPreservesZT0()) <= + 1 && "Attributes 'aarch64_new_zt0', 'aarch64_in_zt0', 'aarch64_out_zt0', " "'aarch64_inout_zt0' and 'aarch64_preserves_zt0' are mutually exclusive"); @@ -44,27 +43,6 @@ void SMEAttrs::set(unsigned M, bool Enable) { "interface"); } -SMEAttrs::SMEAttrs(const CallBase &CB) 
{ - *this = SMEAttrs(CB.getAttributes()); - if (auto *F = CB.getCalledFunction()) { - set(SMEAttrs(*F).Bitmask | SMEAttrs(F->getName()).Bitmask); - } -} - -SMEAttrs::SMEAttrs(StringRef FuncName) : Bitmask(0) { - if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state") - Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine); - if (FuncName == "__arm_tpidr2_restore") - Bitmask |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) | - SMEAttrs::SME_ABI_Routine; - if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" || - FuncName == "__arm_sc_memmove" || FuncName == "__arm_sc_memchr") - Bitmask |= SMEAttrs::SM_Compatible; - if (FuncName == "__arm_sme_save" || FuncName == "__arm_sme_restore" || - FuncName == "__arm_sme_state_size") - Bitmask |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine; -} - SMEAttrs::SMEAttrs(const AttributeList &Attrs) { Bitmask = 0; if (Attrs.hasFnAttr("aarch64_pstate_sm_enabled")) @@ -99,17 +77,48 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) { Bitmask |= encodeZT0State(StateValue::New); } -bool SMEAttrs::requiresSMChange(const SMEAttrs &Callee) const { - if (Callee.hasStreamingCompatibleInterface()) +void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName) { + unsigned KnownAttrs = SMEAttrs::Normal; + if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state") + KnownAttrs |= (SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine); + if (FuncName == "__arm_tpidr2_restore") + KnownAttrs |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) | + SMEAttrs::SME_ABI_Routine; + if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" || + FuncName == "__arm_sc_memmove" || FuncName == "__arm_sc_memchr") + KnownAttrs |= SMEAttrs::SM_Compatible; + if (FuncName == "__arm_sme_save" || FuncName == "__arm_sme_restore" || + FuncName == "__arm_sme_state_size") + KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine; + set(KnownAttrs); +} + +bool 
SMECallAttrs::requiresSMChange() const { + if (callee().hasStreamingCompatibleInterface()) return false; // Both non-streaming - if (hasNonStreamingInterfaceAndBody() && Callee.hasNonStreamingInterface()) + if (caller().hasNonStreamingInterfaceAndBody() && + callee().hasNonStreamingInterface()) return false; // Both streaming - if (hasStreamingInterfaceOrBody() && Callee.hasStreamingInterface()) + if (caller().hasStreamingInterfaceOrBody() && + callee().hasStreamingInterface()) return false; return true; } + +SMECallAttrs::SMECallAttrs(const CallBase &CB) + : CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal), + Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) { + if (auto *CalledFunction = CB.getCalledFunction()) + CalledFn = SMEAttrs(*CalledFunction, SMEAttrs::InferAttrsFromName::Yes); + + // FIXME: We probably should not allow SME attributes on direct calls but + // clang duplicates streaming mode attributes at each callsite. + assert((IsIndirect || + ((Callsite.withoutPerCallsiteFlags() | CalledFn) == CalledFn)) && + "SME attributes at callsite do not match declaration"); +} diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index 1691d4fec8b68..f1be0ecbee7ed 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -18,12 +18,9 @@ class CallBase; class AttributeList; /// SMEAttrs is a utility class to parse the SME ACLE attributes on functions. -/// It helps determine a function's requirements for PSTATE.ZA and PSTATE.SM. It -/// has interfaces to query whether a streaming mode change or lazy-save -/// mechanism is required when going from one function to another (e.g. through -/// a call). +/// It helps determine a function's requirements for PSTATE.ZA and PSTATE.SM. 
class SMEAttrs { - unsigned Bitmask; + unsigned Bitmask = Normal; public: enum class StateValue { @@ -43,18 +40,25 @@ class SMEAttrs { SM_Body = 1 << 2, // aarch64_pstate_sm_body SME_ABI_Routine = 1 << 3, // Used for SME ABI routines to avoid lazy saves ZA_State_Agnostic = 1 << 4, - ZT0_Undef = 1 << 5, // Use to mark ZT0 as undef to avoid spills + ZT0_Undef = 1 << 5, // Use to mark ZT0 as undef to avoid spills ZA_Shift = 6, ZA_Mask = 0b111 << ZA_Shift, ZT0_Shift = 9, - ZT0_Mask = 0b111 << ZT0_Shift + ZT0_Mask = 0b111 << ZT0_Shift, + CallSiteFlags_Mask = ZT0_Undef }; - SMEAttrs(unsigned Mask = Normal) : Bitmask(0) { set(Mask); } - SMEAttrs(const Function &F) : SMEAttrs(F.getAttributes()) {} - SMEAttrs(const CallBase &CB); + enum class InferAttrsFromName { No, Yes }; + + SMEAttrs() = default; + SMEAttrs(unsigned Mask) { set(Mask); } + SMEAttrs(const Function &F, InferAttrsFromName Infer = InferAttrsFromName::No) + : SMEAttrs(F.getAttributes()) { + if (Infer == InferAttrsFromName::Yes) + addKnownFunctionAttrs(F.getName()); + } SMEAttrs(const AttributeList &L); - SMEAttrs(StringRef FuncName); + SMEAttrs(StringRef FuncName) { addKnownFunctionAttrs(FuncName); }; void set(unsigned M, bool Enable = true); @@ -74,10 +78,6 @@ class SMEAttrs { return hasNonStreamingInterface() && !hasStreamingBody(); } - /// \return true if a call from Caller -> Callee requires a change in - /// streaming mode. 
- bool requiresSMChange(const SMEAttrs &Callee) const; - // Interfaces to query ZA static StateValue decodeZAState(unsigned Bitmask) { return static_cast((Bitmask & ZA_Mask) >> ZA_Shift); @@ -104,10 +104,7 @@ class SMEAttrs { return !hasSharedZAInterface() && !hasAgnosticZAInterface(); } bool hasZAState() const { return isNewZA() || sharesZA(); } - bool requiresLazySave(const SMEAttrs &Callee) const { - return hasZAState() && Callee.hasPrivateZAInterface() && - !(Callee.Bitmask & SME_ABI_Routine); - } + bool isSMEABIRoutine() const { return Bitmask & SME_ABI_Routine; } // Interfaces to query ZT0 State static StateValue decodeZT0State(unsigned Bitmask) { @@ -126,27 +123,83 @@ class SMEAttrs { bool isPreservesZT0() const { return decodeZT0State(Bitmask) == StateValue::Preserved; } - bool isUndefZT0() const { return Bitmask & ZT0_Undef; } + bool hasUndefZT0() const { return Bitmask & ZT0_Undef; } bool sharesZT0() const { StateValue State = decodeZT0State(Bitmask); return State == StateValue::In || State == StateValue::Out || State == StateValue::InOut || State == StateValue::Preserved; } bool hasZT0State() const { return isNewZT0() || sharesZT0(); } - bool requiresPreservingZT0(const SMEAttrs &Callee) const { - return hasZT0State() && !Callee.isUndefZT0() && !Callee.sharesZT0() && - !Callee.hasAgnosticZAInterface(); + + SMEAttrs operator|(SMEAttrs Other) const { + SMEAttrs Merged(*this); + Merged.set(Other.Bitmask); + return Merged; } - bool requiresDisablingZABeforeCall(const SMEAttrs &Callee) const { - return hasZT0State() && !hasZAState() && Callee.hasPrivateZAInterface() && - !(Callee.Bitmask & SME_ABI_Routine); + + SMEAttrs withoutPerCallsiteFlags() const { + return (Bitmask & ~CallSiteFlags_Mask); } - bool requiresEnablingZAAfterCall(const SMEAttrs &Callee) const { - return requiresLazySave(Callee) || requiresDisablingZABeforeCall(Callee); + + bool operator==(SMEAttrs const &Other) const { + return Bitmask == Other.Bitmask; } - bool 
requiresPreservingAllZAState(const SMEAttrs &Callee) const { - return hasAgnosticZAInterface() && !Callee.hasAgnosticZAInterface() && - !(Callee.Bitmask & SME_ABI_Routine); + +private: + void addKnownFunctionAttrs(StringRef FuncName); +}; + +/// SMECallAttrs is a utility class to hold the SMEAttrs for a callsite. It has +/// interfaces to query whether a streaming mode change or lazy-save mechanism +/// is required when going from one function to another (e.g. through a call). +class SMECallAttrs { + SMEAttrs CallerFn; + SMEAttrs CalledFn; + SMEAttrs Callsite; + bool IsIndirect = false; + +public: + SMECallAttrs(SMEAttrs Caller, SMEAttrs Callee, + SMEAttrs Callsite = SMEAttrs::Normal) + : CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {} + + SMECallAttrs(const CallBase &CB); + + SMEAttrs &caller() { return CallerFn; } + SMEAttrs &callee() { return IsIndirect ? Callsite : CalledFn; } + SMEAttrs &callsite() { return Callsite; } + SMEAttrs const &caller() const { return CallerFn; } + SMEAttrs const &callee() const { + return const_cast(this)->callee(); + } + SMEAttrs const &callsite() const { return Callsite; } + + /// \return true if a call from Caller -> Callee requires a change in + /// streaming mode. 
+ bool requiresSMChange() const; + + bool requiresLazySave() const { + return caller().hasZAState() && callee().hasPrivateZAInterface() && + !callee().isSMEABIRoutine(); + } + + bool requiresPreservingZT0() const { + return caller().hasZT0State() && !callsite().hasUndefZT0() && + !callee().sharesZT0() && !callee().hasAgnosticZAInterface(); + } + + bool requiresDisablingZABeforeCall() const { + return caller().hasZT0State() && !caller().hasZAState() && + callee().hasPrivateZAInterface() && !callee().isSMEABIRoutine(); + } + + bool requiresEnablingZAAfterCall() const { + return requiresLazySave() || requiresDisablingZABeforeCall(); + } + + bool requiresPreservingAllZAState() const { + return caller().hasAgnosticZAInterface() && + !callee().hasAgnosticZAInterface() && !callee().isSMEABIRoutine(); } }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index efb2894aaf642..70e80f9fe14ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -47,10 +47,10 @@ static cl::opt WidenLoads( cl::init(false)); static cl::opt Widen16BitOps( - "amdgpu-codegenprepare-widen-16-bit-ops", - cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"), - cl::ReallyHidden, - cl::init(true)); + "amdgpu-codegenprepare-widen-16-bit-ops", + cl::desc( + "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, cl::init(false)); static cl::opt BreakLargePHIs("amdgpu-codegenprepare-break-large-phis", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index e591879060f30..9587fad1ecd63 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -134,6 +134,22 @@ def combine_fmul_with_select_to_fldexp : GICombineRule< [{ return Helper.matchCombineFmulWithSelectToFldexp(*${root}, *${sel}, ${matchinfo}); }]), (apply [{ 
Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; +// (shift x, (zext amt)) -> (shift x, (and (anyext amt), mask) +// +// The pattern is longer, but is better for matching during ISel. +class canonicalize_zext_shift_amt : GICombineRule< + (defs root:$dst), + (match (G_ZEXT $amt, $amtsrc):$zext, + (opc $dst, $src, $amt):$shift), + (apply [{ applyCanonicalizeZextShiftAmt(*${shift}, *${zext}); }])>; + +def canonicalize_zext_lshr : canonicalize_zext_shift_amt; +def canonicalize_zext_ashr : canonicalize_zext_shift_amt; +def canonicalize_zext_shl : canonicalize_zext_shift_amt; + +def zext_of_shift_amount_combines : GICombineGroup<[ + canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl +]>; let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This @@ -182,5 +198,5 @@ def AMDGPURegBankCombiner : GICombiner< zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, identity_combines, redundant_and, constant_fold_cast_op, - cast_of_cast_combines, sext_trunc]> { + cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> { } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 8b93ed342c64a..7ed055e8da2b6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1042,6 +1042,10 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT, case ISD::MUL: case ISD::SETCC: case ISD::SELECT: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: if (Subtarget->has16BitInsts() && (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) { // Don't narrow back down to i16 if promoted to i32 already. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 34ba53cbe0f9e..f79069bd6d78b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -1038,6 +1038,12 @@ class AMDGPULowerModuleLDS { } bool runOnModule(Module &M) { + // Check if we've already lowered this module. The pass may run more + // than once in the LTO pipeline, and multiple runs aren't supported. + if (M.getModuleFlag("amdgpu.lowered_lds")) + return false; + M.addModuleFlag(Module::ModFlagBehavior::Error, "amdgpu.lowered_lds", 1); + CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index a52a6aef2bc39..0c6122cce78e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -462,8 +462,8 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); if (!IsOptNone) { AU.addRequired(); AU.addPreserved(); @@ -490,7 +490,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const AMDGPULegalizerInfo *LI = static_cast(ST.getLegalizerInfo()); - GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); MachineDominatorTree *MDT = IsOptNone ? 
nullptr : &getAnalysis().getDomTree(); @@ -512,7 +513,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, "Combine AMDGPU machine instrs after legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, "Combine AMDGPU machine instrs after legalization", false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index ca97591a87110..4aec2ba35ae5d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -234,8 +234,8 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); if (!IsOptNone) { AU.addRequired(); AU.addPreserved(); @@ -260,7 +260,8 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const Function &F = MF.getFunction(); bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); - GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); // Enable CSE. 
GISelCSEAnalysisWrapper &Wrapper = @@ -289,7 +290,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, "Combine AMDGPU machine instrs before legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, "Combine AMDGPU machine instrs before legalization", false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 8f9ad38d101a1..f08502fb3d928 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -87,6 +87,8 @@ class AMDGPURegBankCombinerImpl : public Combiner { void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const; void applyClamp(MachineInstr &MI, Register &Reg) const; + void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const; + private: SIModeRegisterDefaults getMode() const; bool getIEEE() const; @@ -362,6 +364,34 @@ void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI, MI.eraseFromParent(); } +void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt( + MachineInstr &MI, MachineInstr &Ext) const { + unsigned ShOpc = MI.getOpcode(); + assert(ShOpc == AMDGPU::G_SHL || ShOpc == AMDGPU::G_LSHR || + ShOpc == AMDGPU::G_ASHR); + assert(Ext.getOpcode() == AMDGPU::G_ZEXT); + + Register AmtReg = Ext.getOperand(1).getReg(); + Register ShDst = MI.getOperand(0).getReg(); + Register ShSrc = MI.getOperand(1).getReg(); + + LLT ExtAmtTy = MRI.getType(Ext.getOperand(0).getReg()); + LLT AmtTy = MRI.getType(AmtReg); + + auto &RB = *MRI.getRegBank(AmtReg); + + auto NewExt = B.buildAnyExt(ExtAmtTy, AmtReg); + auto Mask = B.buildConstant( + ExtAmtTy, maskTrailingOnes(AmtTy.getScalarSizeInBits())); + auto And = B.buildAnd(ExtAmtTy, NewExt, Mask); + B.buildInstr(ShOpc, {ShDst}, {ShSrc, And}); + + 
MRI.setRegBank(NewExt.getReg(0), RB); + MRI.setRegBank(Mask.getReg(0), RB); + MRI.setRegBank(And.getReg(0), RB); + MI.eraseFromParent(); +} + SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const { return MF.getInfo()->getMode(); } @@ -416,8 +446,8 @@ void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); if (!IsOptNone) { AU.addRequired(); AU.addPreserved(); @@ -441,7 +471,8 @@ bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) { MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); const GCNSubtarget &ST = MF.getSubtarget(); - GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); const auto *LI = ST.getLegalizerInfo(); MachineDominatorTree *MDT = @@ -466,7 +497,7 @@ INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner, DEBUG_TYPE, "Combine AMDGPU machine instrs after regbankselect", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_END(AMDGPURegBankCombiner, DEBUG_TYPE, "Combine AMDGPU machine instrs after regbankselect", false, false) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5cd6561914364..70f9485c3e5b4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -919,6 +919,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal); } + if (Subtarget->hasCvtPkF16F32Inst()) + setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom); + setTargetDAGCombine({ISD::ADD, ISD::UADDO_CARRY, ISD::SUB, @@ -6899,10 +6902,16 @@ SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue 
Op, SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); + EVT DstVT = Op.getValueType(); + + if (DstVT == MVT::v2f16) { + assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32"); + return SrcVT == MVT::v2f32 ? Op : SDValue(); + } + if (SrcVT.getScalarType() != MVT::f64) return Op; - EVT DstVT = Op.getValueType(); SDLoc DL(Op); if (DstVT == MVT::f16) { // TODO: Handle strictfp diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 79667e5ff9285..84a6aeacc226a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1293,6 +1293,7 @@ def WaitVMVSrc : NamedIntOperand<"wait_vm_vsrc"> { def ByteSel : NamedIntOperand<"byte_sel"> { let Validator = "isUInt<2>"; } +def ByteSel0 : DefaultOperand; let PrintMethod = "printBitOp3" in def BitOp3 : NamedIntOperand<"bitop3">; @@ -1971,92 +1972,46 @@ class getIns32 class getIns64 { - - dag ret = - !if (!eq(NumSrcArgs, 0), - // VOP1 without input operands (V_NOP, V_CLREXCP) - (ins), - /* else */ - !if (!eq(NumSrcArgs, 1), - !if (HasModifiers, - // VOP1 with modifiers - !if(HasOMod, - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Clamp0:$clamp, omod0:$omod), - !if (HasClamp, - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Clamp0:$clamp), - (ins Src0Mod:$src0_modifiers, Src0RC:$src0))) - /* else */, - // VOP1 without modifiers - !if(HasOMod, - (ins Src0RC:$src0, Clamp0:$clamp, omod0:$omod), - !if (HasClamp, - (ins Src0RC:$src0, Clamp0:$clamp), - (ins Src0RC:$src0))) - /* endif */ ), - !if (!eq(NumSrcArgs, 2), - !if (HasModifiers, - // VOP 2 with modifiers - !if(HasOMod, - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Clamp0:$clamp, omod0:$omod), - !con((ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1), - !if(HasClamp, (ins Clamp0:$clamp), (ins)))) - /* else */, - // VOP2 
without modifiers - !if (HasClamp, - (ins Src0RC:$src0, Src1RC:$src1, Clamp0:$clamp), - (ins Src0RC:$src0, Src1RC:$src1)) - - /* endif */ ) - /* NumSrcArgs == 3 */, - !if (HasModifiers, - !if (HasSrc2Mods, - // VOP3 with modifiers - !if (HasOMod, - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Src2Mod:$src2_modifiers, Src2RC:$src2, - Clamp0:$clamp, omod0:$omod), - !if (HasClamp, - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Src2Mod:$src2_modifiers, Src2RC:$src2, - Clamp0:$clamp), - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Src2Mod:$src2_modifiers, Src2RC:$src2))), - // VOP3 with modifiers except src2 - !if (HasOMod, - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Src2RC:$src2, Clamp0:$clamp, omod0:$omod), - !if (HasClamp, - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Src2RC:$src2, Clamp0:$clamp), - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - Src1Mod:$src1_modifiers, Src1RC:$src1, - Src2RC:$src2)))) - /* else */, - // VOP3 without modifiers - !if (HasClamp, - (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2, Clamp0:$clamp), - (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2)) - /* endif */ )))); + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, + bit HasFP8ByteSel = 0, bit HasFP8DstByteSel = 0> { + dag src0 = !if(!ge(NumSrcArgs, 1), + !if (HasModifiers, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0), + (ins Src0RC:$src0)), + (ins)); + dag src1 = !if(!ge(NumSrcArgs, 2), + !if (HasModifiers, + (ins Src1Mod:$src1_modifiers, Src1RC:$src1), + (ins Src1RC:$src1)), + (ins)); + dag src2 = !if(!ge(NumSrcArgs, 3), + !if (HasSrc2Mods, + (ins Src2Mod:$src2_modifiers, Src2RC:$src2), + (ins Src2RC:$src2)), + (ins)); + // If there is vdst_in after clamp with HasFP8DstByteSel we cannot use + // Clamp0 with default value, all default operands must be at the end. 
+ dag clamp = !if(HasClamp, !if(HasFP8DstByteSel, (ins Clamp:$clamp), + (ins Clamp0:$clamp)), + (ins)); + dag omod = !if(HasOMod, (ins omod0:$omod), (ins)); + dag bytesel = !if(HasFP8ByteSel, + !con(!if(HasFP8DstByteSel, (ins VGPR_32:$vdst_in), (ins)), + (ins ByteSel0:$byte_sel)), + (ins)); + + dag ret = !con(src0, src1, src2, clamp, omod, bytesel); } class getInsVOP3Base { + Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOpSel, + bit HasFP8ByteSel = 0, bit HasFP8DstByteSel = 0> { // getInst64 handles clamp and omod. implicit mutex between vop3p and omod dag base = getIns64 .ret; + Src0Mod, Src1Mod, Src2Mod, HasFP8ByteSel, HasFP8DstByteSel>.ret; dag opsel = (ins op_sel0:$op_sel); dag ret = !con(base, !if(HasOpSel, opsel, (ins))); } @@ -2595,11 +2550,10 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field bit IsWMMA = 0; field bit IsSWMMAC = 0; - field bit IsFP8SrcByteSel = 0; - field bit IsFP8DstByteSel = 0; + field bit HasFP8SrcByteSel = 0; field bit HasFP8DstByteSel = 0; field bit HasFP4DstByteSel = 0; - field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel); + field bit HasFP8ByteSel = !or(HasFP8SrcByteSel, HasFP8DstByteSel); field bit HasDst = !ne(DstVT.Value, untyped.Value); field bit HasDst32 = HasDst; @@ -2669,7 +2623,8 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field dag Ins32 = getIns32.ret; field dag Ins64 = getIns64.ret; + HasOMod, Src0Mod, Src1Mod, Src2Mod, + HasFP8ByteSel, HasFP8DstByteSel>.ret; field dag InsVOP3P = getInsVOP3P.ret; @@ -2687,7 +2642,8 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { Src0ModDPP, Src1ModDPP, Src2ModDPP>.ret; defvar InsVOP3DPPBase = getInsVOP3Base.ret; + Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP, HasOpSel, + HasFP8ByteSel, HasFP8DstByteSel>.ret; defvar InsVOP3PDPPBase = getInsVOP3P.ret; @@ -2716,7 +2672,7 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field string AsmDPP8 = getAsmDPP8.ret; field string AsmVOP3Base = getAsmVOP3Base.ret; + HasModifiers, DstVT, 
HasFP8ByteSel>.ret; field string Asm64 = AsmVOP3Base; field string AsmVOP3P = getAsmVOP3P.ret; field string AsmVOP3OpSel = getAsmVOP3OpSel; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg16Types.types in { + foreach st = Reg16Types.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + def : BitConvert ; + } + } +} // 32-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg32DataTypes.types in { + foreach st = Reg32DataTypes.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + def : BitConvert ; + } + } +} // 64-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - - -// FIXME: Make SGPR -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : 
BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg64DataTypes.types in { + foreach st = Reg64DataTypes.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + } + } +} + // 96-bit bitcast -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg96Types.types in { + foreach st = Reg96Types.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + } + } +} + // 128-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg128Types.types in { + foreach st = Reg128Types.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + def : BitConvert ; + } + } +} // 160-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg160Types.types 
in { + foreach st = Reg160Types.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + def : BitConvert ; + } + } +} // 192-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg192Types.types in { + foreach st = Reg192Types.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + def : BitConvert ; + } + } +} // 224-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -// 256-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - - - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg224Types.types in { + foreach st = Reg224Types.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + def : BitConvert ; + } + } +} +// 256-bit bitcast +foreach vt = Reg256Types.types in { + foreach st = Reg256Types.types in { + if !not(!eq (vt, st)) 
then { + def : BitConvert ; + def : BitConvert ; + } + } +} // 288-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg288Types.types in { + foreach st = Reg288Types.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + def : BitConvert ; + } + } +} // 320-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg320Types.types in { + foreach st = Reg320Types.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + def : BitConvert ; + } + } +} // 320-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg352Types.types in { + foreach st = Reg352Types.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + def : BitConvert ; + } + } +} // 384-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg384Types.types in { + foreach st = Reg384Types.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + def : BitConvert ; + } + } +} // 512-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert 
; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg512Types.types in { + foreach st = Reg512Types.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + def : BitConvert ; + } + } +} + // 1024-bit bitcast -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +foreach vt = Reg1024Types.types in { + foreach st = Reg1024Types.types in { + if !not(!eq (vt, st)) then { + def : BitConvert ; + def : BitConvert ; + } + } +} /********** =================== **********/ diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 8eb1d7253cd48..bd8baaaa3df20 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1105,6 +1105,7 @@ void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI, .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)) .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1)) .setMIFlags(MI.getFlags()); + TII->fixImplicitOperands(*Converted); LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted); (void)Converted; MI.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 182128cb174bd..d595163f820cb 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -547,10 +547,24 @@ class RegisterTypes reg_types> { } def Reg16Types : RegisterTypes<[i16, f16, bf16]>; -def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>; -def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, p1, p4, v4i16, v4f16, v4bf16]>; +def Reg32DataTypes: RegisterTypes<[i32, f32, v2i16, 
v2f16, v2bf16]>; +def Reg32PtrTypes: RegisterTypes<[p2, p3, p5, p6]>; +def Reg32Types : RegisterTypes; +def Reg64DataTypes: RegisterTypes<[i64, f64, v2i32, v2f32, v4i16, v4f16, v4bf16]>; +def Reg64PtrTypes: RegisterTypes<[p0, p1, p4]>; +def Reg64Types : RegisterTypes; def Reg96Types : RegisterTypes<[v3i32, v3f32]>; def Reg128Types : RegisterTypes<[v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16]>; +def Reg160Types : RegisterTypes<[v5i32, v5f32]>; +def Reg192Types : RegisterTypes<[v6i32, v6f32, v3i64, v3f64]>; +def Reg224Types : RegisterTypes<[v7i32, v7f32]>; +def Reg256Types : RegisterTypes<[v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16]>; +def Reg288Types : RegisterTypes<[v9i32, v9f32]>; +def Reg320Types : RegisterTypes<[v10i32, v10f32]>; +def Reg352Types : RegisterTypes<[v11i32, v11f32]>; +def Reg384Types : RegisterTypes<[v12i32, v12f32]>; +def Reg512Types : RegisterTypes<[v16i32, v16f32, v8i64, v8f64, v32i16, v32f16, v32bf16]>; +def Reg1024Types : RegisterTypes<[v32i32, v32f32, v16i64, v16f64]>; let HasVGPR = 1 in { // VOP3 and VINTERP can access 256 lo and 256 hi registers. 
@@ -894,18 +908,18 @@ multiclass SRegClass; defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs, /*hasNull*/ true>; -defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; -defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; -defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; -defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs, /*hasNull*/ true>; -defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>; -defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>; -defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>; -defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>; +defm "" : SRegClass<5, Reg160Types.types, SGPR_160Regs, TTMP_160Regs>; +defm "" : SRegClass<6, Reg192Types.types, SGPR_192Regs, TTMP_192Regs>; +defm "" : SRegClass<7, Reg224Types.types, SGPR_224Regs, TTMP_224Regs>; +defm "" : SRegClass<8, Reg256Types.types, SGPR_256Regs, TTMP_256Regs, /*hasNull*/ true>; +defm "" : SRegClass<9, Reg288Types.types, SGPR_288Regs, TTMP_288Regs>; +defm "" : SRegClass<10, Reg320Types.types, SGPR_320Regs, TTMP_320Regs>; +defm "" : SRegClass<11, Reg352Types.types, SGPR_352Regs, TTMP_352Regs>; +defm "" : SRegClass<12, Reg384Types.types, SGPR_384Regs, TTMP_384Regs>; let GlobalPriority = true in { -defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16, v32bf16], SGPR_512Regs, TTMP_512Regs>; -defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; +defm "" : SRegClass<16, Reg512Types.types, SGPR_512Regs, TTMP_512Regs>; +defm "" : SRegClass<32, Reg1024Types.types, SGPR_1024Regs>; } def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32, @@ -944,23 +958,22 @@ multiclass VRegClass regTypes, dag regList> { } } -defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4], - (add VGPR_64)>; +defm 
VReg_64 : VRegClass<2, Reg64Types.types, (add VGPR_64)>; defm VReg_96 : VRegClass<3, Reg96Types.types, (add VGPR_96)>; defm VReg_128 : VRegClass<4, Reg128Types.types, (add VGPR_128)>; -defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; +defm VReg_160 : VRegClass<5, Reg160Types.types, (add VGPR_160)>; -defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; -defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>; -defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], (add VGPR_256)>; -defm VReg_288 : VRegClass<9, [v9i32, v9f32], (add VGPR_288)>; -defm VReg_320 : VRegClass<10, [v10i32, v10f32], (add VGPR_320)>; -defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>; -defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>; +defm VReg_192 : VRegClass<6, Reg192Types.types, (add VGPR_192)>; +defm VReg_224 : VRegClass<7, Reg224Types.types, (add VGPR_224)>; +defm VReg_256 : VRegClass<8, Reg256Types.types, (add VGPR_256)>; +defm VReg_288 : VRegClass<9, Reg288Types.types, (add VGPR_288)>; +defm VReg_320 : VRegClass<10, Reg320Types.types, (add VGPR_320)>; +defm VReg_352 : VRegClass<11, Reg352Types.types, (add VGPR_352)>; +defm VReg_384 : VRegClass<12, Reg384Types.types, (add VGPR_384)>; let GlobalPriority = true in { -defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16, v32bf16], (add VGPR_512)>; -defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; +defm VReg_512 : VRegClass<16, Reg512Types.types, (add VGPR_512)>; +defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>; } multiclass ARegClass regTypes, dag regList> { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index aebff60047e0f..1233973da140d 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1362,7 +1362,7 @@ getIntegerPairAttribute(const 
Function &F, StringRef Name, std::pair Default, bool OnlyFirstRequired) { if (auto Attr = getIntegerPairAttribute(F, Name, OnlyFirstRequired)) - return {Attr->first, Attr->second ? *(Attr->second) : Default.second}; + return {Attr->first, Attr->second.value_or(Default.second)}; return Default; } diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 073c45ff6fa99..7fdd951ecbd3c 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -666,7 +666,7 @@ let HasClamp = 0, HasOMod = 0, HasExtDPP = 0, HasExtVOP3DPP = 0, } class VOPProfile_Base_CVT_F_F8_ByteSel : VOPProfile<[DstVT, i32, untyped, untyped]> { - let IsFP8SrcByteSel = 1; + let HasFP8SrcByteSel = 1; let HasOpSel = 0; let HasExtDPP = 1; let HasExtVOP3DPP = 1; @@ -674,17 +674,6 @@ class VOPProfile_Base_CVT_F_F8_ByteSel : VOPProfile<[DstVT, i32 let HasClamp = 0; let HasOMod = 0; let HasModifiers = 0; - - defvar bytesel = (ins ByteSel:$byte_sel); - let Ins64 = !con(getIns64.ret, - bytesel); - let InsVOP3Base = !con(getInsVOP3Base.ret, - bytesel); } let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts], diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 30cef69aa29c4..0c7e20fc1ebf3 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -439,7 +439,7 @@ class VOP_MAC : VOPProfile <[vt0, vt1, vt1, v // Src2 must accept the same operand types as vdst, namely VGPRs only let Src2RC64 = getVOP3VRegForVT.ret; let Ins64 = getIns64.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, @@ -448,7 +448,7 @@ class VOP_MAC : VOPProfile <[vt0, vt1, vt1, v DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsVOP3Base = getInsVOP3Base, 3, - 0, HasModifiers, HasModifiers, HasOMod, + HasClamp, 
HasModifiers, HasModifiers, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP, Src2Mod, HasOpSel>.ret; // We need a dummy src2 tied to dst to track the use of that register for s_delay_alu let InsVOPDX = (ins Src0RC32:$src0X, Src1RC32:$vsrc1X, VGPRSrc_32:$src2X); diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 73f7a5cccaa07..0252c4f1b0929 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -593,6 +593,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, let HasExtVOP3DPP = 1; let HasOpSel = 1; let HasFP8DstByteSel = 1; + let HasFP8ByteSel = 0; // It works as a dst-bytesel, but does not have byte_sel operand. let AsmVOP3OpSel = !subst(", $src2_modifiers", "", getAsmVOP3OpSel<3, HasClamp, HasOMod, HasSrc0FloatMods, HasSrc1FloatMods, @@ -605,19 +606,8 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, class VOP3_CVT_SR_F8_ByteSel_Profile : VOP3_Profile> { - let IsFP8DstByteSel = 1; let HasFP8DstByteSel = 1; let HasClamp = 0; - defvar bytesel = (ins VGPR_32:$vdst_in, ByteSel:$byte_sel); - let Ins64 = !con(getIns64.ret, - bytesel); - let InsVOP3Base = !con( - getInsVOP3Base.ret, - bytesel); } def IsPow2Plus1: PatLeaf<(i32 imm), [{ diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index f9fa83c3f5ae7..24032353a00e9 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -869,13 +869,13 @@ class VOP3_DPPe_Common_Base op, VOPProfile P> : Enc96 { let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // 16-bit select fields which can be interpreted as OpSel or hi/lo suffix let Inst{11} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0), - !if(P.IsFP8SrcByteSel, byte_sel{1}, ?)); + !if(P.HasFP8SrcByteSel, byte_sel{1}, ?)); let Inst{12} = !if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0), - !if(P.IsFP8SrcByteSel, byte_sel{0}, ?)); + !if(P.HasFP8SrcByteSel, 
byte_sel{0}, ?)); let Inst{13} = !if(P.HasOpSel, !if(P.HasSrc2Mods, src2_modifiers{2}, 0), - !if(P.IsFP8DstByteSel, byte_sel{0}, ?)); + !if(P.HasFP8DstByteSel, byte_sel{0}, ?)); let Inst{14} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{3}, 0), - !if(P.IsFP8DstByteSel, byte_sel{1}, ?)); + !if(P.HasFP8DstByteSel, byte_sel{1}, ?)); let Inst{15} = !if(P.HasClamp, clamp, 0); let Inst{25-16} = op; let Inst{31-26} = 0x35; @@ -1695,11 +1695,11 @@ multiclass VOP3_Real_Base op, string opName = NAME, bit isSingle = 0> { defvar ps = !cast(opName#"_e64"); let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { - if ps.Pfl.IsFP8SrcByteSel then { + if ps.Pfl.HasFP8SrcByteSel then { def _e64#Gen.Suffix : VOP3_Real_Gen, VOP3FP8OpSel_src_bytesel_gfx11_gfx12; - } else if ps.Pfl.IsFP8DstByteSel then { + } else if ps.Pfl.HasFP8DstByteSel then { def _e64#Gen.Suffix : VOP3_Real_Gen, VOP3FP8OpSel_dst_bytesel_gfx11_gfx12; @@ -1733,11 +1733,11 @@ multiclass VOP3_Real_with_name op, string opName, let AsmString = asmName # ps.AsmOperands, IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { // FIXME-TRUE16 support FP8 instructions properly - if ps.Pfl.IsFP8SrcByteSel then { + if ps.Pfl.HasFP8SrcByteSel then { def _e64#Gen.Suffix : VOP3_Real_Gen, VOP3FP8OpSel_src_bytesel_gfx11_gfx12; - } else if ps.Pfl.IsFP8DstByteSel then { + } else if ps.Pfl.HasFP8DstByteSel then { def _e64#Gen.Suffix : VOP3_Real_Gen, VOP3FP8OpSel_dst_bytesel_gfx11_gfx12; diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index 89eb49ed416ae..2972316fcee00 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -193,7 +193,8 @@ namespace { struct ImmBranch { MachineInstr *MI; unsigned MaxDisp : 31; - bool isCond : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned isCond : 1; unsigned UncondBr; ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, unsigned ubr) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp 
b/llvm/lib/Target/ARM/ARMISelLowering.cpp index fde7f04cc1747..afbf1b4c55e70 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -16317,10 +16317,10 @@ static SDValue CombineBaseUpdate(SDNode *N, // Try to fold with other users. Non-constant updates are considered // first, and constant updates are sorted to not break a sequence of // strided accesses (if there is any). - std::stable_sort(BaseUpdates.begin(), BaseUpdates.end(), - [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) { - return LHS.ConstInc < RHS.ConstInc; - }); + llvm::stable_sort(BaseUpdates, + [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) { + return LHS.ConstInc < RHS.ConstInc; + }); for (BaseUpdateUser &User : BaseUpdates) { if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI)) return SDValue(); @@ -16772,7 +16772,7 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, // Details about the old store SDValue Ch = St->getChain(); SDValue BasePtr = St->getBasePtr(); - Align Alignment = St->getOriginalAlign(); + Align Alignment = St->getBaseAlign(); MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); AAMDNodes AAInfo = St->getAAInfo(); @@ -16823,7 +16823,7 @@ static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, // Details about the old store SDValue Ch = St->getChain(); SDValue BasePtr = St->getBasePtr(); - Align Alignment = St->getOriginalAlign(); + Align Alignment = St->getBaseAlign(); MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); AAMDNodes AAInfo = St->getAAInfo(); @@ -16871,7 +16871,7 @@ static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) { // Create a new integer store to replace the existing floating point version. 
SDValue Ch = St->getChain(); SDValue BasePtr = St->getBasePtr(); - Align Alignment = St->getOriginalAlign(); + Align Alignment = St->getBaseAlign(); MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); AAMDNodes AAInfo = St->getAAInfo(); EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits()); @@ -16922,7 +16922,7 @@ static SDValue PerformSTORECombine(SDNode *N, SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore( St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), - BasePtr, St->getPointerInfo(), St->getOriginalAlign(), + BasePtr, St->getPointerInfo(), St->getBaseAlign(), St->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, @@ -16930,8 +16930,7 @@ static SDValue PerformSTORECombine(SDNode *N, return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(isBigEndian ? 0 : 1), OffsetPtr, St->getPointerInfo().getWithOffset(4), - St->getOriginalAlign(), - St->getMemOperand()->getFlags()); + St->getBaseAlign(), St->getMemOperand()->getFlags()); } if (StVal.getValueType() == MVT::i64 && @@ -17913,7 +17912,7 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { // Details about the old load SDValue Ch = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); - Align Alignment = LD->getOriginalAlign(); + Align Alignment = LD->getBaseAlign(); MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); AAMDNodes AAInfo = LD->getAAInfo(); @@ -18819,7 +18818,7 @@ static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, // Details about the old load SDValue Ch = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); - Align Alignment = LD->getOriginalAlign(); + Align Alignment = LD->getBaseAlign(); MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); AAMDNodes AAInfo = LD->getAAInfo(); diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp index 
d7f4d4b93f957..e21f4ea45b595 100644 --- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp +++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp @@ -184,7 +184,8 @@ class CSKYConstantIslands : public MachineFunctionPass { struct ImmBranch { MachineInstr *MI; unsigned MaxDisp : 31; - bool IsCond : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IsCond : 1; int UncondBr; ImmBranch(MachineInstr *Mi, unsigned Maxdisp, bool Cond, int Ubr) diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index ef299c17baf76..43e06ee278b49 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -75,31 +75,34 @@ static bool parseRootConstants(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, if (RootConstantNode->getNumOperands() != 5) return reportError(Ctx, "Invalid format for RootConstants Element"); - mcdxbc::RootParameter NewParameter; - NewParameter.Header.ParameterType = + dxbc::RootParameterHeader Header; + // The parameter offset doesn't matter here - we recalculate it during + // serialization Header.ParameterOffset = 0; + Header.ParameterType = llvm::to_underlying(dxbc::RootParameterType::Constants32Bit); if (std::optional Val = extractMdIntValue(RootConstantNode, 1)) - NewParameter.Header.ShaderVisibility = *Val; + Header.ShaderVisibility = *Val; else return reportError(Ctx, "Invalid value for ShaderVisibility"); + dxbc::RootConstants Constants; if (std::optional Val = extractMdIntValue(RootConstantNode, 2)) - NewParameter.Constants.ShaderRegister = *Val; + Constants.ShaderRegister = *Val; else return reportError(Ctx, "Invalid value for ShaderRegister"); if (std::optional Val = extractMdIntValue(RootConstantNode, 3)) - NewParameter.Constants.RegisterSpace = *Val; + Constants.RegisterSpace = *Val; else return reportError(Ctx, "Invalid value for RegisterSpace"); if (std::optional Val = extractMdIntValue(RootConstantNode, 4)) - NewParameter.Constants.Num32BitValues = *Val; 
+ Constants.Num32BitValues = *Val; else return reportError(Ctx, "Invalid value for Num32BitValues"); - RSD.Parameters.push_back(NewParameter); + RSD.ParametersContainer.addParameter(Header, Constants); return false; } @@ -164,12 +167,12 @@ static bool validate(LLVMContext *Ctx, const mcdxbc::RootSignatureDesc &RSD) { return reportValueError(Ctx, "RootFlags", RSD.Flags); } - for (const mcdxbc::RootParameter &P : RSD.Parameters) { - if (!dxbc::isValidShaderVisibility(P.Header.ShaderVisibility)) + for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) { + if (!dxbc::isValidShaderVisibility(Info.Header.ShaderVisibility)) return reportValueError(Ctx, "ShaderVisibility", - P.Header.ShaderVisibility); + Info.Header.ShaderVisibility); - assert(dxbc::isValidParameterType(P.Header.ParameterType) && + assert(dxbc::isValidParameterType(Info.Header.ParameterType) && "Invalid value for ParameterType"); } @@ -287,25 +290,33 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, OS << indent(Space) << "Version: " << RS.Version << "\n"; OS << indent(Space) << "RootParametersOffset: " << RS.RootParameterOffset << "\n"; - OS << indent(Space) << "NumParameters: " << RS.Parameters.size() << "\n"; + OS << indent(Space) << "NumParameters: " << RS.ParametersContainer.size() + << "\n"; Space++; - for (auto const &P : RS.Parameters) { - OS << indent(Space) << "- Parameter Type: " << P.Header.ParameterType - << "\n"; + for (size_t I = 0; I < RS.ParametersContainer.size(); I++) { + const auto &[Type, Loc] = + RS.ParametersContainer.getTypeAndLocForParameter(I); + const dxbc::RootParameterHeader Header = + RS.ParametersContainer.getHeader(I); + + OS << indent(Space) << "- Parameter Type: " << Type << "\n"; OS << indent(Space + 2) - << "Shader Visibility: " << P.Header.ShaderVisibility << "\n"; - switch (P.Header.ParameterType) { - case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): - OS << indent(Space + 2) - << "Register Space: " << 
P.Constants.RegisterSpace << "\n"; + << "Shader Visibility: " << Header.ShaderVisibility << "\n"; + + switch (Type) { + case llvm::to_underlying(dxbc::RootParameterType::Constants32Bit): { + const dxbc::RootConstants &Constants = + RS.ParametersContainer.getConstant(Loc); + OS << indent(Space + 2) << "Register Space: " << Constants.RegisterSpace + << "\n"; OS << indent(Space + 2) - << "Shader Register: " << P.Constants.ShaderRegister << "\n"; + << "Shader Register: " << Constants.ShaderRegister << "\n"; OS << indent(Space + 2) - << "Num 32 Bit Values: " << P.Constants.Num32BitValues << "\n"; - break; + << "Num 32 Bit Values: " << Constants.Num32BitValues << "\n"; } + } + Space--; } - Space--; OS << indent(Space) << "NumStaticSamplers: " << 0 << "\n"; OS << indent(Space) << "StaticSamplersOffset: " << RS.StaticSamplersOffset << "\n"; @@ -313,7 +324,6 @@ PreservedAnalyses RootSignatureAnalysisPrinter::run(Module &M, Space--; // end root signature header } - return PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index b50a9b5d6051c..bd3349d2e18c5 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -32,7 +32,7 @@ using namespace llvm; using namespace llvm::dxil; -static bool hasUAVsAtEveryStage(DXILResourceMap &DRM, +static bool hasUAVsAtEveryStage(const DXILResourceMap &DRM, const ModuleMetadataInfo &MMDI) { if (DRM.uavs().empty()) return false; @@ -142,6 +142,13 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF, } } + if (CSF.LowPrecisionPresent) { + if (CSF.NativeLowPrecisionMode) + CSF.NativeLowPrecision = true; + else + CSF.MinimumPrecision = true; + } + if (!CSF.Int64Ops) CSF.Int64Ops = I.getType()->isIntegerTy(64); @@ -200,20 +207,74 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF, } } +/// Set shader flags that apply to all functions within the module +ComputedShaderFlags 
+ModuleShaderFlags::gatherGlobalModuleFlags(const Module &M, + const DXILResourceMap &DRM, + const ModuleMetadataInfo &MMDI) { + + ComputedShaderFlags CSF; + + // Set DisableOptimizations flag based on the presence of OptimizeNone + // attribute of entry functions. + if (MMDI.EntryPropertyVec.size() > 0) { + CSF.DisableOptimizations = MMDI.EntryPropertyVec[0].Entry->hasFnAttribute( + llvm::Attribute::OptimizeNone); + // Ensure all entry functions have the same optimization attribute + for (const auto &EntryFunProps : MMDI.EntryPropertyVec) + if (CSF.DisableOptimizations != + EntryFunProps.Entry->hasFnAttribute(llvm::Attribute::OptimizeNone)) + EntryFunProps.Entry->getContext().diagnose(DiagnosticInfoUnsupported( + *(EntryFunProps.Entry), "Inconsistent optnone attribute ")); + } + + CSF.UAVsAtEveryStage = hasUAVsAtEveryStage(DRM, MMDI); + + // Set the Max64UAVs flag if the number of UAVs is > 8 + uint32_t NumUAVs = 0; + for (auto &UAV : DRM.uavs()) + if (MMDI.ValidatorVersion < VersionTuple(1, 6)) + NumUAVs++; + else // MMDI.ValidatorVersion >= VersionTuple(1, 6) + NumUAVs += UAV.getBinding().Size; + if (NumUAVs > 8) + CSF.Max64UAVs = true; + + // Set the module flag that enables native low-precision execution mode. + // NativeLowPrecisionMode can only be set when the command line option + // -enable-16bit-types is provided. This is indicated by the dx.nativelowprec + // module flag being set + // This flag is needed even if the module does not use 16-bit types because a + // corresponding debug module may include 16-bit types, and tools that use the + // debug module may expect it to have the same flags as the original + if (auto *NativeLowPrec = mdconst::extract_or_null( + M.getModuleFlag("dx.nativelowprec"))) + if (MMDI.ShaderModelVersion >= VersionTuple(6, 2)) + CSF.NativeLowPrecisionMode = NativeLowPrec->getValue().getBoolValue(); + + // Set ResMayNotAlias to true if DXIL validator version < 1.8 and there + // are UAVs present globally. 
+ if (CanSetResMayNotAlias && MMDI.ValidatorVersion < VersionTuple(1, 8)) + CSF.ResMayNotAlias = !DRM.uavs().empty(); + + return CSF; +} + /// Construct ModuleShaderFlags for module Module M void ModuleShaderFlags::initialize(Module &M, DXILResourceTypeMap &DRTM, - DXILResourceMap &DRM, + const DXILResourceMap &DRM, const ModuleMetadataInfo &MMDI) { CanSetResMayNotAlias = MMDI.DXILVersion >= VersionTuple(1, 7); - - // Check if -res-may-alias was provided on the command line. - // The command line option will set the dx.resmayalias module flag to 1. - if (auto *RMA = mdconst::extract_or_null( + // The command line option -res-may-alias will set the dx.resmayalias module + // flag to 1, thereby disabling the ability to set the ResMayNotAlias flag + if (auto *ResMayAlias = mdconst::extract_or_null( M.getModuleFlag("dx.resmayalias"))) - if (RMA->getValue() != 0) + if (ResMayAlias->getValue().getBoolValue()) CanSetResMayNotAlias = false; + ComputedShaderFlags GlobalSFMask = gatherGlobalModuleFlags(M, DRM, MMDI); + CallGraph CG(M); // Compute Shader Flags Mask for all functions using post-order visit of SCC @@ -238,19 +299,7 @@ void ModuleShaderFlags::initialize(Module &M, DXILResourceTypeMap &DRTM, continue; } - // Set ResMayNotAlias to true if DXIL validator version < 1.8 and there - // are UAVs present globally. 
- if (CanSetResMayNotAlias && MMDI.ValidatorVersion < VersionTuple(1, 8)) - SCCSF.ResMayNotAlias = !DRM.uavs().empty(); - - // Set UseNativeLowPrecision using dx.nativelowprec module metadata - if (auto *NativeLowPrec = mdconst::extract_or_null( - M.getModuleFlag("dx.nativelowprec"))) - if (MMDI.ShaderModelVersion >= VersionTuple(6, 2) && - NativeLowPrec->getValue() != 0) - SCCSF.UseNativeLowPrecision = true; - - ComputedShaderFlags CSF; + ComputedShaderFlags CSF = GlobalSFMask; for (const auto &BB : *F) for (const auto &I : BB) updateFunctionFlags(CSF, I, DRTM, MMDI); @@ -271,32 +320,6 @@ void ModuleShaderFlags::initialize(Module &M, DXILResourceTypeMap &DRTM, // Merge SCCSF with that of F FunctionFlags[F].merge(SCCSF); } - - // Set DisableOptimizations flag based on the presence of OptimizeNone - // attribute of entry functions. - if (MMDI.EntryPropertyVec.size() > 0) { - CombinedSFMask.DisableOptimizations = - MMDI.EntryPropertyVec[0].Entry->hasFnAttribute( - llvm::Attribute::OptimizeNone); - // Ensure all entry functions have the same optimization attribute - for (const auto &EntryFunProps : MMDI.EntryPropertyVec) - if (CombinedSFMask.DisableOptimizations != - EntryFunProps.Entry->hasFnAttribute(llvm::Attribute::OptimizeNone)) - EntryFunProps.Entry->getContext().diagnose(DiagnosticInfoUnsupported( - *(EntryFunProps.Entry), "Inconsistent optnone attribute ")); - } - - // Set the Max64UAVs flag if the number of UAVs is > 8 - uint32_t NumUAVs = 0; - for (auto &UAV : DRM.uavs()) - if (MMDI.ValidatorVersion < VersionTuple(1, 6)) - NumUAVs++; - else // MMDI.ValidatorVersion >= VersionTuple(1, 6) - NumUAVs += UAV.getBinding().Size; - if (NumUAVs > 8) - CombinedSFMask.Max64UAVs = true; - - CombinedSFMask.UAVsAtEveryStage = hasUAVsAtEveryStage(DRM, MMDI); } void ComputedShaderFlags::print(raw_ostream &OS) const { diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.h b/llvm/lib/Target/DirectX/DXILShaderFlags.h index 0e0bd0036349e..f94f7997436ac 100644 --- 
a/llvm/lib/Target/DirectX/DXILShaderFlags.h +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.h @@ -85,19 +85,25 @@ struct ComputedShaderFlags { }; struct ModuleShaderFlags { - void initialize(Module &, DXILResourceTypeMap &DRTM, DXILResourceMap &DRM, - const ModuleMetadataInfo &MMDI); + void initialize(Module &, DXILResourceTypeMap &DRTM, + const DXILResourceMap &DRM, const ModuleMetadataInfo &MMDI); const ComputedShaderFlags &getFunctionFlags(const Function *) const; const ComputedShaderFlags &getCombinedFlags() const { return CombinedSFMask; } private: + // This boolean is inversely set by the LLVM module flag dx.resmayalias to + // determine whether or not the ResMayNotAlias DXIL module flag can be set bool CanSetResMayNotAlias; + /// Map of Function-Shader Flag Mask pairs representing properties of each of /// the functions in the module. Shader Flags of each function represent both /// module-level and function-level flags DenseMap FunctionFlags; /// Combined Shader Flag Mask of all functions of the module ComputedShaderFlags CombinedSFMask{}; + ComputedShaderFlags gatherGlobalModuleFlags(const Module &M, + const DXILResourceMap &, + const ModuleMetadataInfo &); void updateFunctionFlags(ComputedShaderFlags &, const Instruction &, DXILResourceTypeMap &, const ModuleMetadataInfo &); }; diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index 0d2730f820748..22142484cef3c 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -104,11 +104,11 @@ class DirectXPassConfig : public TargetPassConfig { void addCodeGenPrepare() override { addPass(createDXILFinalizeLinkageLegacyPass()); addPass(createDXILResourceImplicitBindingLegacyPass()); + addPass(createDXILResourceAccessLegacyPass()); addPass(createDXILIntrinsicExpansionLegacyPass()); addPass(createDXILCBufferAccessLegacyPass()); addPass(createDXILDataScalarizationLegacyPass()); 
addPass(createDXILFlattenArraysLegacyPass()); - addPass(createDXILResourceAccessLegacyPass()); ScalarizerPassOptions DxilScalarOptions; DxilScalarOptions.ScalarizeLoadStore = true; addPass(createScalarizerPass(DxilScalarOptions)); diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 686e1609c376d..1c9fb8f0a42ae 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -110,6 +110,7 @@ class HexagonAsmParser : public MCTargetAsmParser { bool equalIsAsmAssignment() override { return false; } bool isLabel(AsmToken &Token) override; + bool tokenIsStartOfStatement(AsmToken::TokenKind Token) override; void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } @@ -1007,6 +1008,10 @@ bool HexagonAsmParser::isLabel(AsmToken &Token) { return false; } +bool HexagonAsmParser::tokenIsStartOfStatement(AsmToken::TokenKind Token) { + return Token == AsmToken::LCurly || Token == AsmToken::RCurly; +} + bool HexagonAsmParser::handleNoncontigiousRegister(bool Contigious, SMLoc &Loc) { if (!Contigious && ErrorNoncontigiousRegister) { diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index fe12f99b91cd3..01efcedebc808 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1499,8 +1499,9 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, // - indexed loads and stores (pre-/post-incremented), // - ANY_EXTEND_VECTOR_INREG, ATOMIC_CMP_SWAP_WITH_SUCCESS, CONCAT_VECTORS, // ConstantFP, FCEIL, FCOPYSIGN, FEXP, FEXP2, FFLOOR, FGETSIGN, - // FLOG, FLOG2, FLOG10, FMAXNUM, FMINNUM, FNEARBYINT, FRINT, FROUND, TRAP, - // FTRUNC, PREFETCH, SIGN_EXTEND_VECTOR_INREG, ZERO_EXTEND_VECTOR_INREG, + // FLOG, FLOG2, FLOG10, FMAXIMUMNUM, 
FMINIMUMNUM, FNEARBYINT, FRINT, FROUND, + // TRAP, FTRUNC, PREFETCH, SIGN_EXTEND_VECTOR_INREG, + // ZERO_EXTEND_VECTOR_INREG, // which default to "expand" for at least one type. // Misc operations. @@ -1638,6 +1639,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, // Set the action for vector operations to "expand", then override it with // either "custom" or "legal" for specific cases. + // clang-format off static const unsigned VectExpOps[] = { // Integer arithmetic: ISD::ADD, ISD::SUB, ISD::MUL, ISD::SDIV, ISD::UDIV, @@ -1652,7 +1654,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, ISD::FCOS, ISD::FPOW, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT, ISD::FROUND, ISD::FFLOOR, - ISD::FMINNUM, ISD::FMAXNUM, ISD::FSINCOS, ISD::FLDEXP, + ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM, + ISD::FSINCOS, ISD::FLDEXP, // Misc: ISD::BR_CC, ISD::SELECT_CC, ISD::ConstantPool, // Vector: @@ -1662,6 +1665,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, ISD::CONCAT_VECTORS, ISD::VECTOR_SHUFFLE, ISD::SPLAT_VECTOR, }; + // clang-format on for (MVT VT : MVT::fixedlen_vector_valuetypes()) { for (unsigned VectExpOp : VectExpOps) @@ -1784,8 +1788,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMUL, MVT::f64, Expand); setOperationAction(ISD::FDIV, MVT::f32, Custom); - setOperationAction(ISD::FMINNUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); @@ -1833,8 +1837,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::f64, Legal); } if (Subtarget.hasV67Ops()) { - setOperationAction(ISD::FMINNUM, MVT::f64, 
Legal); - setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + setOperationAction(ISD::FMINIMUMNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXIMUMNUM, MVT::f64, Legal); setOperationAction(ISD::FMUL, MVT::f64, Legal); } diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index de2bac6905530..fbbcacf0d713e 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -127,8 +127,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::FADD, T, Legal); setOperationAction(ISD::FSUB, T, Legal); setOperationAction(ISD::FMUL, T, Legal); - setOperationAction(ISD::FMINNUM, T, Legal); - setOperationAction(ISD::FMAXNUM, T, Legal); + setOperationAction(ISD::FMINIMUMNUM, T, Legal); + setOperationAction(ISD::FMAXIMUMNUM, T, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom); @@ -164,8 +164,8 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::FADD, P, Custom); setOperationAction(ISD::FSUB, P, Custom); setOperationAction(ISD::FMUL, P, Custom); - setOperationAction(ISD::FMINNUM, P, Custom); - setOperationAction(ISD::FMAXNUM, P, Custom); + setOperationAction(ISD::FMINIMUMNUM, P, Custom); + setOperationAction(ISD::FMAXIMUMNUM, P, Custom); setOperationAction(ISD::SETCC, P, Custom); setOperationAction(ISD::VSELECT, P, Custom); @@ -3172,8 +3172,8 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FADD: case ISD::FSUB: case ISD::FMUL: - case ISD::FMINNUM: - case ISD::FMAXNUM: + case ISD::FMINIMUMNUM: + case ISD::FMAXIMUMNUM: case ISD::MULHS: case ISD::MULHU: case ISD::AND: diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 0d872b556d801..2a991bafbf148 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ 
b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -1579,8 +1579,8 @@ def: OpR_RR_pat; def: OpR_RR_pat, f32, F32>; def: OpR_RR_pat, f32, F32>; def: OpR_RR_pat, f32, F32>; -def: OpR_RR_pat, f32, F32>; -def: OpR_RR_pat, f32, F32>; +def: OpR_RR_pat, f32, F32>; +def: OpR_RR_pat, f32, F32>; let Predicates = [HasV66] in { def: OpR_RR_pat, f64, F64>; @@ -1600,8 +1600,8 @@ let Predicates = [HasV67,UseUnsafeMath], AddedComplexity = 50 in { def: Pat<(fmul F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>; } let Predicates = [HasV67] in { - def: OpR_RR_pat, f64, F64>; - def: OpR_RR_pat, f64, F64>; + def: OpR_RR_pat, f64, F64>; + def: OpR_RR_pat, f64, F64>; def: Pat<(fmul F64:$Rs, F64:$Rt), (DfMpy (F2_dfmpyfix $Rs, $Rt), (F2_dfmpyfix $Rt, $Rs))>; diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index 5b8386416a5f0..ba449eaeed34c 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -508,10 +508,10 @@ let Predicates = [UseHVXV68, UseHVX128B, UseHVXQFloat] in { defm: MinMax_pats; defm: MinMax_pats; } - def: OpR_RR_pat, VecF16, HVF16>; - def: OpR_RR_pat, VecF16, HVF16>; - def: OpR_RR_pat, VecF32, HVF32>; - def: OpR_RR_pat, VecF32, HVF32>; + def: OpR_RR_pat, VecF16, HVF16>; + def: OpR_RR_pat, VecF16, HVF16>; + def: OpR_RR_pat, VecF32, HVF32>; + def: OpR_RR_pat, VecF32, HVF32>; } let Predicates = [UseHVXV68, UseHVX128B, UseHVXIEEEFP] in { @@ -521,10 +521,10 @@ let Predicates = [UseHVXV68, UseHVX128B, UseHVXIEEEFP] in { defm: MinMax_pats; defm: MinMax_pats; } - def: OpR_RR_pat, VecF16, HVF16>; - def: OpR_RR_pat, VecF16, HVF16>; - def: OpR_RR_pat, VecF32, HVF32>; - def: OpR_RR_pat, VecF32, HVF32>; + def: OpR_RR_pat, VecF16, HVF16>; + def: OpR_RR_pat, VecF16, HVF16>; + def: OpR_RR_pat, VecF32, HVF32>; + def: OpR_RR_pat, VecF32, HVF32>; } let Predicates = [UseHVX] in { diff --git a/llvm/lib/Target/M68k/CMakeLists.txt b/llvm/lib/Target/M68k/CMakeLists.txt index 
1661dccece3dd..7005df4fb8a82 100644 --- a/llvm/lib/Target/M68k/CMakeLists.txt +++ b/llvm/lib/Target/M68k/CMakeLists.txt @@ -6,6 +6,7 @@ tablegen(LLVM M68kGenGlobalISel.inc -gen-global-isel) tablegen(LLVM M68kGenRegisterInfo.inc -gen-register-info) tablegen(LLVM M68kGenRegisterBank.inc -gen-register-bank) tablegen(LLVM M68kGenInstrInfo.inc -gen-instr-info) +tablegen(LLVM M68kGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM M68kGenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM M68kGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM M68kGenMCPseudoLowering.inc -gen-pseudo-lowering) @@ -32,6 +33,7 @@ add_llvm_target(M68kCodeGen M68kMachineFunction.cpp M68kMCInstLower.cpp M68kRegisterInfo.cpp + M68kSelectionDAGInfo.cpp M68kSubtarget.cpp M68kTargetMachine.cpp M68kTargetObjectFile.cpp diff --git a/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp b/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp index 53c144c8fa79a..9c3d61ec60e00 100644 --- a/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp +++ b/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp @@ -15,8 +15,8 @@ #include "M68kMachineFunction.h" #include "M68kRegisterInfo.h" +#include "M68kSelectionDAGInfo.h" #include "M68kTargetMachine.h" - #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp index 616b1f622619c..9d3ab606ab8cd 100644 --- a/llvm/lib/Target/M68k/M68kISelLowering.cpp +++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp @@ -15,6 +15,7 @@ #include "M68kISelLowering.h" #include "M68kCallingConv.h" #include "M68kMachineFunction.h" +#include "M68kSelectionDAGInfo.h" #include "M68kSubtarget.h" #include "M68kTargetMachine.h" #include "M68kTargetObjectFile.h" @@ -3641,64 +3642,6 @@ SDValue M68kTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } -//===----------------------------------------------------------------------===// -// M68kISD Node Names 
-//===----------------------------------------------------------------------===// -const char *M68kTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - case M68kISD::CALL: - return "M68kISD::CALL"; - case M68kISD::TAIL_CALL: - return "M68kISD::TAIL_CALL"; - case M68kISD::RET: - return "M68kISD::RET"; - case M68kISD::TC_RETURN: - return "M68kISD::TC_RETURN"; - case M68kISD::ADD: - return "M68kISD::ADD"; - case M68kISD::SUB: - return "M68kISD::SUB"; - case M68kISD::ADDX: - return "M68kISD::ADDX"; - case M68kISD::SUBX: - return "M68kISD::SUBX"; - case M68kISD::SMUL: - return "M68kISD::SMUL"; - case M68kISD::UMUL: - return "M68kISD::UMUL"; - case M68kISD::OR: - return "M68kISD::OR"; - case M68kISD::XOR: - return "M68kISD::XOR"; - case M68kISD::AND: - return "M68kISD::AND"; - case M68kISD::CMP: - return "M68kISD::CMP"; - case M68kISD::BTST: - return "M68kISD::BTST"; - case M68kISD::SELECT: - return "M68kISD::SELECT"; - case M68kISD::CMOV: - return "M68kISD::CMOV"; - case M68kISD::BRCOND: - return "M68kISD::BRCOND"; - case M68kISD::SETCC: - return "M68kISD::SETCC"; - case M68kISD::SETCC_CARRY: - return "M68kISD::SETCC_CARRY"; - case M68kISD::GLOBAL_BASE_REG: - return "M68kISD::GLOBAL_BASE_REG"; - case M68kISD::Wrapper: - return "M68kISD::Wrapper"; - case M68kISD::WrapperPC: - return "M68kISD::WrapperPC"; - case M68kISD::SEG_ALLOCA: - return "M68kISD::SEG_ALLOCA"; - default: - return NULL; - } -} - CCAssignFn *M68kTargetLowering::getCCAssignFn(CallingConv::ID CC, bool Return, bool IsVarArg) const { if (Return) diff --git a/llvm/lib/Target/M68k/M68kISelLowering.h b/llvm/lib/Target/M68k/M68kISelLowering.h index b646f7d7fb2ba..3774f93c851e8 100644 --- a/llvm/lib/Target/M68k/M68kISelLowering.h +++ b/llvm/lib/Target/M68k/M68kISelLowering.h @@ -25,77 +25,6 @@ #include namespace llvm { -namespace M68kISD { - -/// M68k Specific DAG nodes -enum NodeType { - /// Start the numbering from where ISD NodeType finishes. 
- FIRST_NUMBER = ISD::BUILTIN_OP_END, - - CALL, - RET, - TAIL_CALL, - TC_RETURN, - - /// M68k compare and logical compare instructions. Subtracts the source - /// operand from the destination data register and sets the condition - /// codes according to the result. Immediate always goes first. - CMP, - - /// M68k bit-test instructions. - BTST, - - /// M68k Select - SELECT, - - /// M68k SetCC. Operand 0 is condition code, and operand 1 is the CCR - /// operand, usually produced by a CMP instruction. - SETCC, - - // Same as SETCC except it's materialized with a subx and the value is all - // one's or all zero's. - SETCC_CARRY, // R = carry_bit ? ~0 : 0 - - /// M68k conditional moves. Operand 0 and operand 1 are the two values - /// to select from. Operand 2 is the condition code, and operand 3 is the - /// flag operand produced by a CMP or TEST instruction. It also writes a - /// flag result. - CMOV, - - /// M68k conditional branches. Operand 0 is the chain operand, operand 1 - /// is the block to branch if condition is true, operand 2 is the - /// condition code, and operand 3 is the flag operand produced by a CMP - /// or TEST instruction. - BRCOND, - - // Arithmetic operations with CCR results. - ADD, - SUB, - ADDX, - SUBX, - SMUL, - UMUL, - OR, - XOR, - AND, - - // GlobalBaseReg, - GLOBAL_BASE_REG, - - /// A wrapper node for TargetConstantPool, - /// TargetExternalSymbol, and TargetGlobalAddress. - Wrapper, - - /// Special wrapper used under M68k PIC mode for PC - /// relative displacements. - WrapperPC, - - // For allocating variable amounts of stack space when using - // segmented stacks. Check if the current stacklet has enough space, and - // falls back to heap allocation if not. - SEG_ALLOCA, -}; -} // namespace M68kISD /// Define some predicates that are used for node matching. 
namespace M68k { @@ -124,8 +53,6 @@ class M68kTargetLowering : public TargetLowering { static const M68kTargetLowering *create(const M68kTargetMachine &TM, const M68kSubtarget &STI); - const char *getTargetNodeName(unsigned Opcode) const override; - /// Return the value type to use for ISD::SETCC. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; diff --git a/llvm/lib/Target/M68k/M68kInstrData.td b/llvm/lib/Target/M68k/M68kInstrData.td index 398c55fa6da4a..f4ed62720ff99 100644 --- a/llvm/lib/Target/M68k/M68kInstrData.td +++ b/llvm/lib/Target/M68k/M68kInstrData.td @@ -296,8 +296,9 @@ class MxMOVEMEncoding - : MxInst<(outs), (ins MEMOp:$dst, MxMoveMask:$mask), + MxOperand MEMOp, MxEncMemOp MEM_ENC, + MxOp MASKOp> + : MxInst<(outs), (ins MEMOp:$dst, MASKOp:$mask), "movem."#TYPE.Prefix#"\t$mask, $dst", []> { let Inst = MxMOVEMEncoding.Value; } @@ -307,13 +308,15 @@ foreach AM = MxMoveSupportedAMs in { def MOVM # TYPE.Size # AM # m # TYPE.Postfix : MxMOVEM_MR("MxOp"#TYPE.Size#"AddrMode_"#AM).Op, - !cast("MxMoveDstOpEnc_"#AM)>; + !cast("MxMoveDstOpEnc_"#AM), + !if(!eq(AM, "e"), MxInverseMoveMask, MxMoveMask)>; } // foreach AM let mayLoad = 1 in class MxMOVEM_RM - : MxInst<(outs), (ins MxMoveMask:$mask, MEMOp:$src), + MxOperand MEMOp, MxEncMemOp MEM_ENC, + MxOp MASKOp> + : MxInst<(outs), (ins MASKOp:$mask, MEMOp:$src), "movem."#TYPE.Prefix#"\t$src, $mask", []> { let Inst = MxMOVEMEncoding.Value; } @@ -323,7 +326,8 @@ foreach AM = MxMoveSupportedAMs in { def MOVM # TYPE.Size # m # AM # TYPE.Postfix : MxMOVEM_RM("MxOp"#TYPE.Size#"AddrMode_"#AM).Op, - !cast("MxMoveSrcOpEnc_"#AM)>; + !cast("MxMoveSrcOpEnc_"#AM), + !if(!eq(AM, "e"), MxInverseMoveMask, MxMoveMask)>; } // foreach AM // Pseudo versions. 
These a required by virtual register spill/restore since diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.td b/llvm/lib/Target/M68k/M68kInstrInfo.td index dca774e94b9b5..1200c493f9fca 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.td +++ b/llvm/lib/Target/M68k/M68kInstrInfo.td @@ -112,9 +112,18 @@ def MxRet : SDNode<"M68kISD::RET", MxSDT_Ret, def MxTCRet : SDNode<"M68kISD::TC_RETURN", MxSDT_TCRet, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def MxGlobalBaseReg : SDNode<"M68kISD::GLOBAL_BASE_REG", + SDTypeProfile<1, 0, [SDTCisVT<0, iPTR>]>>; + +// A wrapper node for TargetConstantPool, +// TargetExternalSymbol, and TargetGlobalAddress. def MxWrapper : SDNode<"M68kISD::Wrapper", MxSDT_Wrapper>; + +// Special wrapper used under M68k PIC mode for PC +// relative displacements. def MxWrapperPC : SDNode<"M68kISD::WrapperPC", MxSDT_Wrapper>; +// Arithmetic operations with CCR results. def MxAdd : SDNode<"M68kISD::ADD", MxSDT_BiArithCCROut, [SDNPCommutative]>; def MxSub : SDNode<"M68kISD::SUB", MxSDT_BiArithCCROut>; def MxOr : SDNode<"M68kISD::OR", MxSDT_BiArithCCROut, [SDNPCommutative]>; @@ -127,15 +136,37 @@ def MxSubX : SDNode<"M68kISD::SUBX", MxSDT_BiArithCCRInOut>; def MxSMul : SDNode<"M68kISD::SMUL", MxSDT_BiArithCCROut, [SDNPCommutative]>; def MxUMul : SDNode<"M68kISD::UMUL", MxSDT_BiArithCCROut, [SDNPCommutative]>; +// M68k compare and logical compare instructions. Subtracts the source +// operand from the destination data register and sets the condition +// codes according to the result. Immediate always goes first. def MxCmp : SDNode<"M68kISD::CMP", MxSDT_CmpTest>; + +// M68k bit-test instructions. def MxBtst : SDNode<"M68kISD::BTST", MxSDT_CmpTest>; +// M68k conditional moves. Operand 0 and operand 1 are the two values +// to select from. Operand 2 is the condition code, and operand 3 is the +// flag operand produced by a CMP or TEST instruction. It also writes a +// flag result. 
def MxCmov : SDNode<"M68kISD::CMOV", MxSDT_Cmov>; + +// M68k conditional branches. Operand 0 is the chain operand, operand 1 +// is the block to branch if condition is true, operand 2 is the +// condition code, and operand 3 is the flag operand produced by a CMP +// or TEST instruction. def MxBrCond : SDNode<"M68kISD::BRCOND", MxSDT_BrCond, [SDNPHasChain]>; + +// M68k SetCC. Operand 0 is condition code, and operand 1 is the CCR +// operand, usually produced by a CMP instruction. def MxSetCC : SDNode<"M68kISD::SETCC", MxSDT_SetCC>; -def MxSetCC_C : SDNode<"M68kISD::SETCC_CARRY", MxSDT_SetCC_C>; +// Same as SETCC except it's materialized with a subx and the value is all +// one's or all zero's. +def MxSetCC_C : SDNode<"M68kISD::SETCC_CARRY", MxSDT_SetCC_C>; +// For allocating variable amounts of stack space when using +// segmented stacks. Check if the current stacklet has enough space, and +// falls back to heap allocation if not. def MxSegAlloca : SDNode<"M68kISD::SEG_ALLOCA", MxSDT_SEG_ALLOCA, [SDNPHasChain]>; @@ -407,12 +438,19 @@ def MxBrTarget32 : MxBrTargetOperand<32>; // Used with MOVEM def MxMoveMaskClass : MxOpClass<"MoveMask">; -def MxMoveMask : MxOp { +class MxMoveMaskOp : MxOp { let OperandType = "OPERAND_IMMEDIATE"; let PrintMethod = "printMoveMask"; let ParserMatchClass = MxMoveMaskClass; } +def MxMoveMask : MxMoveMaskOp; +// The encoding of mask is reversed when the memory operand has an addressing +// mode of 'e', that is, pre-decrement. 
+def MxInverseMoveMask : MxMoveMaskOp { + let EncoderMethod = "encodeInverseMoveMask"; +} + //===----------------------------------------------------------------------===// // Predicates //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/M68k/M68kSelectionDAGInfo.cpp b/llvm/lib/Target/M68k/M68kSelectionDAGInfo.cpp new file mode 100644 index 0000000000000..dd1bfdf00af8c --- /dev/null +++ b/llvm/lib/Target/M68k/M68kSelectionDAGInfo.cpp @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "M68kSelectionDAGInfo.h" + +#define GET_SDNODE_DESC +#include "M68kGenSDNodeInfo.inc" + +using namespace llvm; + +M68kSelectionDAGInfo::M68kSelectionDAGInfo() + : SelectionDAGGenTargetInfo(M68kGenSDNodeInfo) {} + +M68kSelectionDAGInfo::~M68kSelectionDAGInfo() = default; diff --git a/llvm/lib/Target/M68k/M68kSelectionDAGInfo.h b/llvm/lib/Target/M68k/M68kSelectionDAGInfo.h new file mode 100644 index 0000000000000..87a8c08d2591e --- /dev/null +++ b/llvm/lib/Target/M68k/M68kSelectionDAGInfo.h @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_M68K_M68KSELECTIONDAGINFO_H +#define LLVM_LIB_TARGET_M68K_M68KSELECTIONDAGINFO_H + +#include "llvm/CodeGen/SelectionDAGTargetInfo.h" + +#define GET_SDNODE_ENUM +#include "M68kGenSDNodeInfo.inc" + +namespace llvm { + +class M68kSelectionDAGInfo : public SelectionDAGGenTargetInfo { +public: + M68kSelectionDAGInfo(); + + ~M68kSelectionDAGInfo() override; +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_M68K_M68KSELECTIONDAGINFO_H diff --git a/llvm/lib/Target/M68k/M68kSubtarget.cpp b/llvm/lib/Target/M68k/M68kSubtarget.cpp index 53ec574ae5596..59d865ff1f4a9 100644 --- a/llvm/lib/Target/M68k/M68kSubtarget.cpp +++ b/llvm/lib/Target/M68k/M68kSubtarget.cpp @@ -15,12 +15,11 @@ #include "GISel/M68kCallLowering.h" #include "GISel/M68kLegalizerInfo.h" #include "GISel/M68kRegisterBankInfo.h" - #include "M68k.h" #include "M68kMachineFunction.h" #include "M68kRegisterInfo.h" +#include "M68kSelectionDAGInfo.h" #include "M68kTargetMachine.h" - #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -50,10 +49,12 @@ void M68kSubtarget::anchor() {} M68kSubtarget::M68kSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const M68kTargetMachine &TM) - : M68kGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), TM(TM), TSInfo(), + : M68kGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), TM(TM), InstrInfo(initializeSubtargetDependencies(CPU, TT, FS, TM)), FrameLowering(*this, this->getStackAlignment()), TLInfo(TM, *this), TargetTriple(TT) { + TSInfo = std::make_unique(); + CallLoweringInfo.reset(new M68kCallLowering(*getTargetLowering())); Legalizer.reset(new M68kLegalizerInfo(*this)); @@ -62,6 +63,12 @@ M68kSubtarget::M68kSubtarget(const Triple &TT, StringRef CPU, StringRef FS, InstSelector.reset(createM68kInstructionSelector(TM, *this, *RBI)); } 
+M68kSubtarget::~M68kSubtarget() = default; + +const SelectionDAGTargetInfo *M68kSubtarget::getSelectionDAGInfo() const { + return TSInfo.get(); +} + const CallLowering *M68kSubtarget::getCallLowering() const { return CallLoweringInfo.get(); } diff --git a/llvm/lib/Target/M68k/M68kSubtarget.h b/llvm/lib/Target/M68k/M68kSubtarget.h index c08a9786fb27b..16ca7d2e6d0fd 100644 --- a/llvm/lib/Target/M68k/M68kSubtarget.h +++ b/llvm/lib/Target/M68k/M68kSubtarget.h @@ -22,7 +22,6 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/RegisterBankInfo.h" -#include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" @@ -63,7 +62,6 @@ class M68kSubtarget : public M68kGenSubtargetInfo { const M68kTargetMachine &TM; - SelectionDAGTargetInfo TSInfo; M68kInstrInfo InstrInfo; M68kFrameLowering FrameLowering; M68kTargetLowering TLInfo; @@ -80,6 +78,8 @@ class M68kSubtarget : public M68kGenSubtargetInfo { M68kSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const M68kTargetMachine &_TM); + ~M68kSubtarget() override; + /// Parses features string setting specified subtarget options. Definition /// of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); @@ -148,10 +148,6 @@ class M68kSubtarget : public M68kGenSubtargetInfo { StringRef FS, const M68kTargetMachine &TM); - const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { - return &TSInfo; - } - const M68kInstrInfo *getInstrInfo() const override { return &InstrInfo; } const M68kFrameLowering *getFrameLowering() const override { @@ -171,6 +167,9 @@ class M68kSubtarget : public M68kGenSubtargetInfo { } protected: + // SelectionDAGISel related APIs. + std::unique_ptr TSInfo; + // GlobalISel related APIs. 
std::unique_ptr CallLoweringInfo; std::unique_ptr InstSelector; @@ -178,6 +177,7 @@ class M68kSubtarget : public M68kGenSubtargetInfo { std::unique_ptr RegBankInfo; public: + const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; const CallLowering *getCallLowering() const override; InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp index 78783084ee59a..3bd1d0d7dcaeb 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp @@ -63,6 +63,11 @@ class M68kMCCodeEmitter : public MCCodeEmitter { APInt &Value, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + void encodeInverseMoveMask(const MCInst &MI, unsigned OpIdx, + unsigned InsertPos, APInt &Value, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + public: M68kMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) : MCII(mcii), Ctx(ctx) {} @@ -196,6 +201,13 @@ void M68kMCCodeEmitter::encodeFPSYSSelect(const MCInst &MI, unsigned OpIdx, } } +void M68kMCCodeEmitter::encodeInverseMoveMask( + const MCInst &MI, unsigned OpIdx, unsigned InsertPos, APInt &Value, + SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { + const MCOperand &Op = MI.getOperand(OpIdx); + Value = llvm::reverseBits((uint16_t)Op.getImm()); +} + void M68kMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &Op, unsigned InsertPos, APInt &Value, SmallVectorImpl &Fixups, diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp index 59c5bcb89bede..11df6fecaf37b 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp @@ -53,5 +53,7 @@ MipsCOFFMCAsmInfo::MipsCOFFMCAsmInfo() { ExceptionsType = 
ExceptionHandling::WinEH; + PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; AllowAtInName = true; } diff --git a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index c53a73db1dd92..760be36b7667d 100644 --- a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -321,7 +321,8 @@ namespace { struct ImmBranch { MachineInstr *MI; unsigned MaxDisp : 31; - bool isCond : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned isCond : 1; int UncondBr; ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr) diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 66cbf79a453a6..e933e97ea3706 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -520,6 +520,9 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Custom); + setOperationAction(ISD::ConstantFP, MVT::f64, Custom); + setTargetDAGCombine({ISD::SDIVREM, ISD::UDIVREM, ISD::SELECT, ISD::AND, ISD::OR, ISD::ADD, ISD::SUB, ISD::AssertZext, ISD::SHL, ISD::SIGN_EXTEND}); @@ -1355,6 +1358,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::FP_TO_SINT: return lowerFP_TO_SINT(Op, DAG); case ISD::READCYCLECOUNTER: return lowerREADCYCLECOUNTER(Op, DAG); + case ISD::ConstantFP: + return lowerConstantFP(Op, DAG); } return SDValue(); } @@ -3015,6 +3020,30 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op, return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc); } +SDValue MipsTargetLowering::lowerConstantFP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getSimpleValueType(); + SDNode *N = Op.getNode(); + ConstantFPSDNode *CFP = cast(N); + + if (!CFP->isNaN() || Subtarget.isNaN2008()) { + return SDValue(); + } + + APFloat NaNValue = CFP->getValueAPF(); + 
auto &Sem = NaNValue.getSemantics(); + + // The MSB of the mantissa should be zero for QNaNs in the MIPS legacy NaN + // encodings, and one for sNaNs. Check every NaN constants and make sure + // they are correctly encoded for legacy encodings. + if (!NaNValue.isSignaling()) { + APFloat RealQNaN = NaNValue.getSNaN(Sem); + return DAG.getConstantFP(RealQNaN, DL, VT); + } + return SDValue(); +} + //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index 9885ab894d6f2..241e9343ae384 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -592,6 +592,7 @@ class TargetRegisterClass; SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const; /// isEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. 
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp index b81bb1186de72..8a59532ba5786 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Target/TargetMachine.h" #include @@ -53,6 +54,13 @@ bool MipsInstrInfo::isZeroImm(const MachineOperand &op) const { return op.isImm() && op.getImm() == 0; } +MCInst MipsInstrInfo::getNop() const { + return MCInstBuilder(Mips::SLL) + .addReg(Mips::ZERO) + .addReg(Mips::ZERO) + .addImm(0); +} + /// insertNoop - If data hazard condition is found insert the target nop /// instruction. void MipsInstrInfo:: diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index 06964c0161b4b..2337ae7c079e7 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -57,6 +57,8 @@ class MipsInstrInfo : public MipsGenInstrInfo { explicit MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBrOpc); + MCInst getNop() const override; + static const MipsInstrInfo *create(MipsSubtarget &STI); /// Branch Analysis diff --git a/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp index bb7482c5555ef..166a2501e3f09 100644 --- a/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp @@ -106,8 +106,8 @@ void MipsPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); if (!IsOptNone) { AU.addRequired(); AU.addPreserved(); @@ -134,7 +134,8 @@ bool MipsPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { 
const MipsLegalizerInfo *LI = static_cast(ST.getLegalizerInfo()); - GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); MachineDominatorTree *MDT = IsOptNone ? nullptr : &getAnalysis().getDomTree(); @@ -150,7 +151,7 @@ INITIALIZE_PASS_BEGIN(MipsPostLegalizerCombiner, DEBUG_TYPE, "Combine Mips machine instrs after legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_END(MipsPostLegalizerCombiner, DEBUG_TYPE, "Combine Mips machine instrs after legalization", false, false) diff --git a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp index a1ae66ef09770..278dcb143d336 100644 --- a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp @@ -102,8 +102,8 @@ class MipsPreLegalizerCombiner : public MachineFunctionPass { void MipsPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); @@ -122,7 +122,8 @@ bool MipsPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const MipsLegalizerInfo *LI = static_cast(ST.getLegalizerInfo()); - GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); MipsPreLegalizerCombinerInfo PCInfo; MipsPreLegalizerCombinerImpl Impl(MF, PCInfo, TPC, *VT, /*CSEInfo*/ nullptr, ST, /*MDT*/ nullptr, LI); @@ -134,7 +135,7 @@ INITIALIZE_PASS_BEGIN(MipsPreLegalizerCombiner, DEBUG_TYPE, "Combine Mips machine instrs before legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) 
+INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_END(MipsPreLegalizerCombiner, DEBUG_TYPE, "Combine Mips machine instrs before legalization", false, false) diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h index 430fcd741c1b6..caef8fe790adb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h +++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h @@ -90,14 +90,14 @@ class NVPTXExternalAAWrapper : public ExternalAAWrapperPass { public: static char ID; - bool runEarly() override { return true; } - NVPTXExternalAAWrapper() - : ExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) { - if (auto *WrapperPass = - P.getAnalysisIfAvailable()) - AAR.addAAResult(WrapperPass->getResult()); - }) {} + : ExternalAAWrapperPass( + [](Pass &P, Function &, AAResults &AAR) { + if (auto *WrapperPass = + P.getAnalysisIfAvailable()) + AAR.addAAResult(WrapperPass->getResult()); + }, + /*RunEarly=*/true) {} StringRef getPassName() const override { return "NVPTX Address space based Alias Analysis Wrapper"; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 7d171cff7bcb4..2247ae3cf8f46 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2685,31 +2685,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N, ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); } -void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2G(SDNode *N) { - // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: - // dst, src, size, cache_hint, cache_hint_flag - // NumOperands = {Chain, IID} + {Actual intrinsic args} - // = {2} + {5} - size_t NumOps = N->getNumOperands(); - bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; - size_t NumArgs = IsCacheHint ? 
4 : 3; // src, dst, size, cache_hint - - SDLoc DL(N); - SmallVector Ops(N->ops().slice(2, NumArgs)); - Ops.push_back(N->getOperand(0)); // Chain operand - - bool IsShared32 = - CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32; - unsigned Opcode; - if (IsCacheHint) - Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32_CH - : NVPTX::CP_ASYNC_BULK_S2G_CH; - else - Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32 - : NVPTX::CP_ASYNC_BULK_S2G; - ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); -} - void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) { // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: // {dst, mbar, src, size, multicast, cache_hint, @@ -2892,9 +2867,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster: SelectCpAsyncBulkG2S(N); return true; - case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global: - SelectCpAsyncBulkS2G(N); - return true; case Intrinsic::nvvm_cp_async_bulk_prefetch_L2: SelectCpAsyncBulkPrefetchL2(N); return true; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 23cbd458571a0..92efabc7e2068 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -93,7 +93,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { void SelectV2I64toI128(SDNode *N); void SelectI128toV2I64(SDNode *N); void SelectCpAsyncBulkG2S(SDNode *N); - void SelectCpAsyncBulkS2G(SDNode *N); void SelectCpAsyncBulkPrefetchL2(SDNode *N); void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b6104a5aed0d1..2c65ee6d484d5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ 
b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -714,6 +714,23 @@ let hasSideEffects = false in { # type # " \t$dst, $src;", []>; } + // FP4 conversions. + def CVT_e2m1x2_f32_sf : NVPTXInst<(outs Int16Regs:$dst), + (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), + !strconcat("{{ \n\t", + ".reg .b8 \t%e2m1x2_out; \n\t", + "cvt${mode:base}.satfinite${mode:relu}.e2m1x2.f32 \t%e2m1x2_out, $src1, $src2; \n\t", + "cvt.u16.u8 \t$dst, %e2m1x2_out; \n\t", + "}}"), []>; + + def CVT_f16x2_e2m1x2 : NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("{{ \n\t", + ".reg .b8 \t%e2m1x2_in; \n\t", + "cvt.u8.u16 \t%e2m1x2_in, $src; \n\t", + "cvt${mode:base}${mode:relu}.f16x2.e2m1x2 \t$dst, %e2m1x2_in; \n\t", + "}}"), []>; + // UE8M0x2 conversions. class CVT_f32_to_ue8m0x2 : NVPTXInst<(outs Int16Regs:$dst), diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 8110ba1b2b37b..4f8a798295b42 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -511,10 +511,11 @@ def CP_ASYNC_BULK_WAIT_GROUP_READ : // TMA Async Bulk Copy Functions //------------------------------ -class CpAsyncBulkStr { +class CpAsyncBulkStr { // Shared to Global memory string S2G = "cp.async.bulk.global.shared::cta.bulk_group" - # !if(ch, ".L2::cache_hint", ""); + # !if(ch, ".L2::cache_hint", "") + # !if(mask, ".cp_mask", ""); // Global to Shared cluster memory string G2S = "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes" @@ -525,18 +526,23 @@ class CpAsyncBulkStr { string C2C = "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes"; } -multiclass CP_ASYNC_BULK_S2G { - def NAME: NVPTXInst<(outs), - (ins Int64Regs:$dst, rc:$src, Int32Regs:$size), - !strconcat(CpAsyncBulkStr<0, 0>.S2G, " [$dst], [$src], $size;"), []>, - Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _CH: NVPTXInst<(outs), - (ins Int64Regs:$dst, rc:$src, Int32Regs:$size, 
Int64Regs:$ch), - !strconcat(CpAsyncBulkStr<0, 1>.S2G, " [$dst], [$src], $size, $ch;"), []>, - Requires<[hasPTX<80>, hasSM<90>]>; +multiclass CP_ASYNC_BULK_S2G_INTR { + def NAME : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch), + !if(has_ch, + CpAsyncBulkStr<0, 1>.S2G # " [$dst], [$src], $size, $ch;", + CpAsyncBulkStr<0, 0>.S2G # " [$dst], [$src], $size;"), + [(int_nvvm_cp_async_bulk_shared_cta_to_global addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>, + Requires<[hasPTX<80>, hasSM<90>]>; + + def NAME # _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask), + !if(has_ch, + CpAsyncBulkStr<0, 1, 1>.S2G # " [$dst], [$src], $size, $ch, $mask;", + CpAsyncBulkStr<0, 0, 1>.S2G # " [$dst], [$src], $size, $mask;"), + [(int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0), i16:$mask)]>, + Requires<[hasPTX<86>, hasSM<100>]>; } -defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G; -defm CP_ASYNC_BULK_S2G_SHARED32 : CP_ASYNC_BULK_S2G; +defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<0>; +defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<1>; multiclass CP_ASYNC_BULK_G2S { def NAME: NVPTXInst<(outs), @@ -2003,6 +2009,20 @@ def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn i16:$a), def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn_relu i16:$a), (CVT_f16x2_e3m2x2 $a, CvtRN_RELU)>, Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + +def : Pat<(int_nvvm_ff_to_e2m1x2_rn_satfinite f32:$a, f32:$b), + (CVT_e2m1x2_f32_sf $a, $b, CvtRN)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_e2m1x2_rn_relu_satfinite f32:$a, f32:$b), + (CVT_e2m1x2_f32_sf $a, $b, CvtRN_RELU)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + +def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn Int16Regs:$a), + (CVT_f16x2_e2m1x2 $a, CvtRN)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn_relu 
Int16Regs:$a), + (CVT_f16x2_e2m1x2 $a, CvtRN_RELU)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; def : Pat<(int_nvvm_ff_to_ue8m0x2_rz f32:$a, f32:$b), (CVT_ue8m0x2_f32 $a, $b, CvtRZ)>, diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 9bc4734815364..a1c9091c95b48 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -1272,6 +1272,11 @@ static MCRegister convertFPR64ToFPR32(MCRegister Reg) { return Reg - RISCV::F0_D + RISCV::F0_F; } +static MCRegister convertFPR64ToFPR128(MCRegister Reg) { + assert(Reg >= RISCV::F0_D && Reg <= RISCV::F31_D && "Invalid register"); + return Reg - RISCV::F0_D + RISCV::F0_Q; +} + static MCRegister convertVRToVRMx(const MCRegisterInfo &RI, MCRegister Reg, unsigned Kind) { unsigned RegClassID; @@ -1300,6 +1305,10 @@ unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, RISCVMCRegisterClasses[RISCV::FPR64CRegClassID].contains(Reg); bool IsRegVR = RISCVMCRegisterClasses[RISCV::VRRegClassID].contains(Reg); + if (IsRegFPR64 && Kind == MCK_FPR128) { + Op.Reg.RegNum = convertFPR64ToFPR128(Reg); + return Match_Success; + } // As the parser couldn't differentiate an FPR32 from an FPR64, coerce the // register from FPR64 to FPR32 or FPR64C to FPR32C if necessary. if ((IsRegFPR64 && Kind == MCK_FPR32) || @@ -1663,13 +1672,16 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // rejected. MCRegister RISCVAsmParser::matchRegisterNameHelper(StringRef Name) const { MCRegister Reg = MatchRegisterName(Name); - // The 16-/32- and 64-bit FPRs have the same asm name. Check that the initial - // match always matches the 64-bit variant, and not the 16/32-bit one. + // The 16-/32-/128- and 64-bit FPRs have the same asm name. Check + // that the initial match always matches the 64-bit variant, and + // not the 16/32/128-bit one. 
assert(!(Reg >= RISCV::F0_H && Reg <= RISCV::F31_H)); assert(!(Reg >= RISCV::F0_F && Reg <= RISCV::F31_F)); + assert(!(Reg >= RISCV::F0_Q && Reg <= RISCV::F31_Q)); // The default FPR register class is based on the tablegen enum ordering. static_assert(RISCV::F0_D < RISCV::F0_H, "FPR matching must be updated"); static_assert(RISCV::F0_D < RISCV::F0_F, "FPR matching must be updated"); + static_assert(RISCV::F0_D < RISCV::F0_Q, "FPR matching must be updated"); if (!Reg) Reg = MatchRegisterAltName(Name); if (isRVE() && Reg >= RISCV::X16 && Reg <= RISCV::X31) @@ -3848,6 +3860,9 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, case RISCV::PseudoFLD: emitLoadStoreSymbol(Inst, RISCV::FLD, IDLoc, Out, /*HasTmpReg=*/true); return false; + case RISCV::PseudoFLQ: + emitLoadStoreSymbol(Inst, RISCV::FLQ, IDLoc, Out, /*HasTmpReg=*/true); + return false; case RISCV::PseudoSB: case RISCV::PseudoQC_E_SB: emitLoadStoreSymbol(Inst, RISCV::SB, IDLoc, Out, /*HasTmpReg=*/true); @@ -3875,6 +3890,9 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, case RISCV::PseudoFSD: emitLoadStoreSymbol(Inst, RISCV::FSD, IDLoc, Out, /*HasTmpReg=*/true); return false; + case RISCV::PseudoFSQ: + emitLoadStoreSymbol(Inst, RISCV::FSQ, IDLoc, Out, /*HasTmpReg=*/true); + return false; case RISCV::PseudoAddTPRel: if (checkPseudoAddTPRel(Inst, Operands)) return true; diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index ee8aa376f467d..a5e76668db6b9 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -177,6 +177,17 @@ static DecodeStatus DecodeFPR64CRegisterClass(MCInst &Inst, uint32_t RegNo, return MCDisassembler::Success; } +static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, uint32_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo >= 32) + return MCDisassembler::Fail; + + 
MCRegister Reg = RISCV::F0_Q + RegNo; + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeGPRNoX0RegisterClass(MCInst &Inst, uint32_t RegNo, uint64_t Address, const MCDisassembler *Decoder) { @@ -728,7 +739,8 @@ static constexpr FeatureBitset XTHeadGroup = { RISCV::FeatureVendorXTHeadVdot}; static constexpr FeatureBitset XAndesGroup = { - RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesVPackFPH}; + RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesVPackFPH, + RISCV::FeatureVendorXAndesVDot}; static constexpr DecoderListEntry DecoderList32[]{ // Vendor Extensions diff --git a/llvm/lib/Target/RISCV/GISel/RISCVO0PreLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVO0PreLegalizerCombiner.cpp index 1450d5f092f9c..d57479c80297d 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVO0PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVO0PreLegalizerCombiner.cpp @@ -103,8 +103,8 @@ void RISCVO0PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -121,7 +121,8 @@ bool RISCVO0PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { auto &TPC = getAnalysis(); const Function &F = MF.getFunction(); - GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); const RISCVSubtarget &ST = MF.getSubtarget(); @@ -142,7 +143,7 @@ INITIALIZE_PASS_BEGIN(RISCVO0PreLegalizerCombiner, DEBUG_TYPE, "Combine RISC-V machine instrs before legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) INITIALIZE_PASS_END(RISCVO0PreLegalizerCombiner, 
DEBUG_TYPE, "Combine RISC-V machine instrs before legalization", false, diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp index eaccf6d67dcc4..1e4c598d3adf9 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp @@ -107,8 +107,8 @@ void RISCVPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); @@ -137,7 +137,8 @@ bool RISCVPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const RISCVSubtarget &ST = MF.getSubtarget(); const auto *LI = ST.getLegalizerInfo(); - GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); MachineDominatorTree *MDT = &getAnalysis().getDomTree(); GISelCSEAnalysisWrapper &Wrapper = @@ -157,7 +158,7 @@ INITIALIZE_PASS_BEGIN(RISCVPostLegalizerCombiner, DEBUG_TYPE, "Combine RISC-V MachineInstrs after legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_END(RISCVPostLegalizerCombiner, DEBUG_TYPE, "Combine RISC-V MachineInstrs after legalization", false, false) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp index afd25676a89eb..e6e8147f3118b 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp @@ -105,8 +105,8 @@ void RISCVPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); - 
AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); @@ -137,7 +137,8 @@ bool RISCVPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const Function &F = MF.getFunction(); bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); - GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); MachineDominatorTree *MDT = &getAnalysis().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, @@ -159,7 +160,7 @@ INITIALIZE_PASS_BEGIN(RISCVPreLegalizerCombiner, DEBUG_TYPE, "Combine RISC-V machine instrs before legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) INITIALIZE_PASS_END(RISCVPreLegalizerCombiner, DEBUG_TYPE, "Combine RISC-V machine instrs before legalization", false, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp index 5785f7da88898..6df8b182885b8 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp @@ -13,8 +13,15 @@ #include "RISCVTargetStreamer.h" #include "RISCVBaseInfo.h" #include "RISCVMCTargetDesc.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/RISCVAttributes.h" #include "llvm/TargetParser/RISCVISAInfo.h" @@ -53,6 +60,55 @@ void 
RISCVTargetStreamer::emitTextAttribute(unsigned Attribute, void RISCVTargetStreamer::emitIntTextAttribute(unsigned Attribute, unsigned IntValue, StringRef StringValue) {} + +void RISCVTargetStreamer::emitNoteGnuPropertySection( + const uint32_t Feature1And) { + MCStreamer &OutStreamer = getStreamer(); + MCContext &Ctx = OutStreamer.getContext(); + + const Triple &Triple = Ctx.getTargetTriple(); + Align NoteAlign; + if (Triple.isArch64Bit()) { + NoteAlign = Align(8); + } else { + assert(Triple.isArch32Bit()); + NoteAlign = Align(4); + } + + assert(Ctx.getObjectFileType() == MCContext::Environment::IsELF); + MCSection *const NoteSection = + Ctx.getELFSection(".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC); + NoteSection->setAlignment(NoteAlign); + OutStreamer.pushSection(); + OutStreamer.switchSection(NoteSection); + + // Emit the note header + OutStreamer.emitIntValue(4, 4); // n_namsz + + MCSymbol *const NDescBeginSym = Ctx.createTempSymbol(); + MCSymbol *const NDescEndSym = Ctx.createTempSymbol(); + const MCExpr *const NDescSzExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(NDescEndSym, Ctx), + MCSymbolRefExpr::create(NDescBeginSym, Ctx), Ctx); + + OutStreamer.emitValue(NDescSzExpr, 4); // n_descsz + OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4); // n_type + OutStreamer.emitBytes(StringRef("GNU", 4)); // n_name + + // Emit n_desc field + OutStreamer.emitLabel(NDescBeginSym); + OutStreamer.emitValueToAlignment(NoteAlign); + + // Emit the feature_1_and property + OutStreamer.emitIntValue(ELF::GNU_PROPERTY_RISCV_FEATURE_1_AND, 4); // pr_type + OutStreamer.emitIntValue(4, 4); // pr_datasz + OutStreamer.emitIntValue(Feature1And, 4); // pr_data + OutStreamer.emitValueToAlignment(NoteAlign); // pr_padding + + OutStreamer.emitLabel(NDescEndSym); + OutStreamer.popSection(); +} + void RISCVTargetStreamer::setTargetABI(RISCVABI::ABI ABI) { assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialized target ABI"); TargetABI = ABI; diff --git 
a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h index 169cb0f79ba78..d7b3b1ed92068 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h @@ -58,6 +58,7 @@ class RISCVTargetStreamer : public MCTargetStreamer { virtual void emitTextAttribute(unsigned Attribute, StringRef String); virtual void emitIntTextAttribute(unsigned Attribute, unsigned IntValue, StringRef StringValue); + void emitNoteGnuPropertySection(const uint32_t Feature1And); void emitTargetAttributes(const MCSubtargetInfo &STI, bool EmitStackAlign); void setTargetABI(RISCVABI::ABI ABI); diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index e40bd184ddc38..5aa323abf068f 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -112,6 +112,8 @@ class RISCVAsmPrinter : public AsmPrinter { void emitFunctionEntryLabel() override; bool emitDirectiveOptionArch(); + void emitNoteGnuProperty(const Module &M); + private: void emitAttributes(const MCSubtargetInfo &SubtargetInfo); @@ -581,8 +583,10 @@ void RISCVAsmPrinter::emitEndOfAsmFile(Module &M) { RISCVTargetStreamer &RTS = static_cast(*OutStreamer->getTargetStreamer()); - if (TM.getTargetTriple().isOSBinFormatELF()) + if (TM.getTargetTriple().isOSBinFormatELF()) { RTS.finishAttributeSection(); + emitNoteGnuProperty(M); + } EmitHwasanMemaccessSymbols(M); } @@ -941,6 +945,15 @@ void RISCVAsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { } } +void RISCVAsmPrinter::emitNoteGnuProperty(const Module &M) { + if (const Metadata *const Flag = M.getModuleFlag("cf-protection-return"); + Flag && !mdconst::extract(Flag)->isZero()) { + RISCVTargetStreamer &RTS = + static_cast(*OutStreamer->getTargetStreamer()); + RTS.emitNoteGnuPropertySection(ELF::GNU_PROPERTY_RISCV_FEATURE_1_CFI_SS); + } +} + static MCOperand 
lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, const AsmPrinter &AP) { MCContext &Ctx = AP.OutContext; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index daae4e88a38e2..b36d496137400 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -187,6 +187,7 @@ def FeatureStdExtZilsd def HasStdExtZilsd : Predicate<"Subtarget->hasStdExtZilsd()">, AssemblerPredicate<(all_of FeatureStdExtZilsd), "'Zilsd' (Load/Store pair instructions)">; +def NoHasStdExtZilsd : Predicate<"!Subtarget->hasStdExtZilsd()">; // Multiply Extensions @@ -291,6 +292,13 @@ def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">, AssemblerPredicate<(all_of FeatureStdExtD), "'D' (Double-Precision Floating-Point)">; +def FeatureStdExtQ + : RISCVExtension<2, 2, "Quad-Precision Floating-Point", [FeatureStdExtD]>, + RISCVExtensionBitmask<0, 16>; +def HasStdExtQ : Predicate<"Subtarget->hasStdExtQ()">, + AssemblerPredicate<(all_of FeatureStdExtQ), + "'Q' (Quad-Precision Floating-Point)">; + def FeatureStdExtZfhmin : RISCVExtension<1, 0, "Half-Precision Floating-Point Minimal", [FeatureStdExtF]>, @@ -1525,6 +1533,14 @@ def HasVendorXAndesVPackFPH AssemblerPredicate<(all_of FeatureVendorXAndesVPackFPH), "'XAndesVPackFPH' (Andes Vector Packed FP16 Extension)">; +def FeatureVendorXAndesVDot + : RISCVExtension<5, 0, "Andes Vector Dot Product Extension", + [FeatureStdExtZve32x]>; +def HasVendorXAndesVDot + : Predicate<"Subtarget->hasVendorXAndesVDot()">, + AssemblerPredicate<(all_of FeatureVendorXAndesVDot), + "'XAndesVDot' (Andes Vector Dot Product Extension)">; + //===----------------------------------------------------------------------===// // LLVM specific features and extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 72bec74584059..b80608c05ad57 100644 --- 
a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1813,9 +1813,22 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr( if (MI->getOpcode() == RISCV::ADJCALLSTACKDOWN) Amount = -Amount; - const RISCVRegisterInfo &RI = *STI.getRegisterInfo(); - RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount), - MachineInstr::NoFlags, getStackAlign()); + const RISCVTargetLowering *TLI = + MF.getSubtarget().getTargetLowering(); + int64_t ProbeSize = TLI->getStackProbeSize(MF, getStackAlign()); + if (TLI->hasInlineStackProbe(MF) && -Amount >= ProbeSize) { + // When stack probing is enabled, the decrement of SP may need to be + // probed. We can handle both the decrement and the probing in + // allocateStack. + bool DynAllocation = + MF.getInfo()->hasDynamicAllocation(); + allocateStack(MBB, MI, MF, -Amount, -Amount, !hasFP(MF), + /*NeedProbe=*/true, ProbeSize, DynAllocation); + } else { + const RISCVRegisterInfo &RI = *STI.getRegisterInfo(); + RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount), + MachineInstr::NoFlags, getStackAlign()); + } } } diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 9db15ff25f979..18af1545d5a34 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -1626,6 +1626,51 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { } break; } + case RISCVISD::LD_RV32: { + assert(Subtarget->hasStdExtZilsd() && "LD_RV32 is only used with Zilsd"); + + SDValue Base, Offset; + SDValue Chain = Node->getOperand(0); + SDValue Addr = Node->getOperand(1); + SelectAddrRegImm(Addr, Base, Offset); + + SDValue Ops[] = {Base, Offset, Chain}; + MachineSDNode *New = CurDAG->getMachineNode( + RISCV::LD_RV32, DL, {MVT::Untyped, MVT::Other}, Ops); + SDValue Lo = CurDAG->getTargetExtractSubreg(RISCV::sub_gpr_even, DL, + MVT::i32, SDValue(New, 0)); + SDValue Hi = 
CurDAG->getTargetExtractSubreg(RISCV::sub_gpr_odd, DL, + MVT::i32, SDValue(New, 0)); + CurDAG->setNodeMemRefs(New, {cast(Node)->getMemOperand()}); + ReplaceUses(SDValue(Node, 0), Lo); + ReplaceUses(SDValue(Node, 1), Hi); + ReplaceUses(SDValue(Node, 2), SDValue(New, 1)); + CurDAG->RemoveDeadNode(Node); + return; + } + case RISCVISD::SD_RV32: { + SDValue Base, Offset; + SDValue Chain = Node->getOperand(0); + SDValue Addr = Node->getOperand(3); + SelectAddrRegImm(Addr, Base, Offset); + + SDValue Ops[] = { + CurDAG->getTargetConstant(RISCV::GPRPairRegClassID, DL, MVT::i32), + Node->getOperand(1), + CurDAG->getTargetConstant(RISCV::sub_gpr_even, DL, MVT::i32), + Node->getOperand(2), + CurDAG->getTargetConstant(RISCV::sub_gpr_odd, DL, MVT::i32)}; + + SDNode *RegPair = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, + MVT::Untyped, Ops); + MachineSDNode *New = + CurDAG->getMachineNode(RISCV::SD_RV32, DL, MVT::Other, + {SDValue(RegPair, 0), Base, Offset, Chain}); + CurDAG->setNodeMemRefs(New, {cast(Node)->getMemOperand()}); + ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); + CurDAG->RemoveDeadNode(Node); + return; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IntNo = Node->getConstantOperandVal(0); switch (IntNo) { @@ -2531,8 +2576,7 @@ bool RISCVDAGToDAGISel::SelectAddrFrameIndex(SDValue Addr, SDValue &Base, static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT, const RISCVSubtarget *Subtarget, SDValue Addr, SDValue &Base, SDValue &Offset, - bool IsPrefetch = false, - bool IsRV32Zdinx = false) { + bool IsPrefetch = false) { if (!isa(Addr)) return false; @@ -2546,9 +2590,6 @@ static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL, if (!Subtarget->is64Bit() || isInt<32>(Hi)) { if (IsPrefetch && (Lo12 & 0b11111) != 0) return false; - if (IsRV32Zdinx && !isInt<12>(Lo12 + 4)) - return false; - if (Hi) { int64_t Hi20 = (Hi >> 12) & 0xfffff; Base = SDValue( @@ -2572,8 +2613,6 @@ static bool selectConstantAddr(SelectionDAG *CurDAG, 
const SDLoc &DL, Lo12 = Seq.back().getImm(); if (IsPrefetch && (Lo12 & 0b11111) != 0) return false; - if (IsRV32Zdinx && !isInt<12>(Lo12 + 4)) - return false; // Drop the last instruction. Seq.pop_back(); @@ -2665,7 +2704,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, } bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, - SDValue &Offset, bool IsRV32Zdinx) { + SDValue &Offset) { if (SelectAddrFrameIndex(Addr, Base, Offset)) return true; @@ -2673,39 +2712,14 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, MVT VT = Addr.getSimpleValueType(); if (Addr.getOpcode() == RISCVISD::ADD_LO) { - // If this is non RV32Zdinx we can always fold. - if (!IsRV32Zdinx) { - Base = Addr.getOperand(0); - Offset = Addr.getOperand(1); - return true; - } - - // For RV32Zdinx we need to have more than 4 byte alignment so we can add 4 - // to the offset when we expand in RISCVExpandPseudoInsts. - if (auto *GA = dyn_cast(Addr.getOperand(1))) { - const DataLayout &DL = CurDAG->getDataLayout(); - Align Alignment = commonAlignment( - GA->getGlobal()->getPointerAlignment(DL), GA->getOffset()); - if (Alignment > 4) { - Base = Addr.getOperand(0); - Offset = Addr.getOperand(1); - return true; - } - } - if (auto *CP = dyn_cast(Addr.getOperand(1))) { - Align Alignment = commonAlignment(CP->getAlign(), CP->getOffset()); - if (Alignment > 4) { - Base = Addr.getOperand(0); - Offset = Addr.getOperand(1); - return true; - } - } + Base = Addr.getOperand(0); + Offset = Addr.getOperand(1); + return true; } - int64_t RV32ZdinxRange = IsRV32Zdinx ? 
4 : 0; if (CurDAG->isBaseWithConstantOffset(Addr)) { int64_t CVal = cast(Addr.getOperand(1))->getSExtValue(); - if (isInt<12>(CVal) && isInt<12>(CVal + RV32ZdinxRange)) { + if (isInt<12>(CVal) && isInt<12>(CVal)) { Base = Addr.getOperand(0); if (Base.getOpcode() == RISCVISD::ADD_LO) { SDValue LoOperand = Base.getOperand(1); @@ -2718,8 +2732,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, const DataLayout &DL = CurDAG->getDataLayout(); Align Alignment = commonAlignment( GA->getGlobal()->getPointerAlignment(DL), GA->getOffset()); - if ((CVal == 0 || Alignment > CVal) && - (!IsRV32Zdinx || commonAlignment(Alignment, CVal) > 4)) { + if ((CVal == 0 || Alignment > CVal)) { int64_t CombinedOffset = CVal + GA->getOffset(); Base = Base.getOperand(0); Offset = CurDAG->getTargetGlobalAddress( @@ -2740,13 +2753,13 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, // Handle ADD with large immediates. if (Addr.getOpcode() == ISD::ADD && isa(Addr.getOperand(1))) { int64_t CVal = cast(Addr.getOperand(1))->getSExtValue(); - assert(!(isInt<12>(CVal) && isInt<12>(CVal + RV32ZdinxRange)) && + assert(!(isInt<12>(CVal) && isInt<12>(CVal)) && "simm12 not already handled?"); // Handle immediates in the range [-4096,-2049] or [2048, 4094]. We can use // an ADDI for part of the offset and fold the rest into the load/store. // This mirrors the AddiPair PatFrag in RISCVInstrInfo.td. - if (CVal >= -4096 && CVal <= (4094 - RV32ZdinxRange)) { + if (CVal >= -4096 && CVal <= 4094) { int64_t Adj = CVal < 0 ? -2048 : 2047; Base = SDValue( CurDAG->getMachineNode(RISCV::ADDI, DL, VT, Addr.getOperand(0), @@ -2764,7 +2777,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, // instructions. 
if (isWorthFoldingAdd(Addr) && selectConstantAddr(CurDAG, DL, VT, Subtarget, Addr.getOperand(1), Base, - Offset, /*IsPrefetch=*/false, RV32ZdinxRange)) { + Offset, /*IsPrefetch=*/false)) { // Insert an ADD instruction with the materialized Hi52 bits. Base = SDValue( CurDAG->getMachineNode(RISCV::ADD, DL, VT, Addr.getOperand(0), Base), @@ -2774,7 +2787,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, } if (selectConstantAddr(CurDAG, DL, VT, Subtarget, Addr, Base, Offset, - /*IsPrefetch=*/false, RV32ZdinxRange)) + /*IsPrefetch=*/false)) return true; Base = Addr; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index cd211d41f30fb..11d62e5edad3f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -46,11 +46,7 @@ class RISCVDAGToDAGISel : public SelectionDAGISel { std::vector &OutOps) override; bool SelectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset); - bool SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset, - bool IsRV32Zdinx = false); - bool SelectAddrRegImmRV32Zdinx(SDValue Addr, SDValue &Base, SDValue &Offset) { - return SelectAddrRegImm(Addr, Base, Offset, true); - } + bool SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectAddrRegRegScale(SDValue Addr, unsigned MaxShiftAmount, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c01496c9a7f3a..5e761fccc815a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -20,6 +20,7 @@ #include "RISCVSelectionDAGInfo.h" #include "RISCVSubtarget.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ValueTracking.h" @@ -309,15 +310,21 @@ 
RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction({ISD::VAARG, ISD::VACOPY, ISD::VAEND}, MVT::Other, Expand); - if (!Subtarget.hasVendorXTHeadBb()) + if (!Subtarget.hasVendorXTHeadBb() && !Subtarget.hasVendorXqcibm()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); if (!Subtarget.hasStdExtZbb() && !Subtarget.hasVendorXTHeadBb() && + !Subtarget.hasVendorXqcibm() && !(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::i8, MVT::i16}, Expand); + if (Subtarget.hasStdExtZilsd() && !Subtarget.is64Bit()) { + setOperationAction(ISD::LOAD, MVT::i64, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); + } + if (Subtarget.is64Bit()) { setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom); @@ -581,6 +588,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (!Subtarget.is64Bit()) setOperationAction(ISD::BITCAST, MVT::i64, Custom); + if (Subtarget.hasStdExtZdinx() && !Subtarget.hasStdExtZilsd() && + !Subtarget.is64Bit()) { + setOperationAction(ISD::LOAD, MVT::f64, Custom); + setOperationAction(ISD::STORE, MVT::f64, Custom); + } + if (Subtarget.hasStdExtZfa()) { setOperationAction(ISD::ConstantFP, MVT::f64, Custom); setOperationAction(FPRndMode, MVT::f64, Legal); @@ -2389,6 +2402,8 @@ static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS, if (auto *RHSC = dyn_cast(RHS)) { int64_t C = RHSC->getSExtValue(); + const RISCVSubtarget &Subtarget = + DAG.getMachineFunction().getSubtarget(); switch (CC) { default: break; case ISD::SETGT: @@ -2398,6 +2413,13 @@ static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS, CC = ISD::SETGE; return; } + if (Subtarget.hasVendorXqcibi() && C != INT64_MAX && isInt<16>(C + 1)) { + // We have a branch immediate instruction for SETGE but not SETGT. 
+ // Convert X > C to X >= C + 1, if (C + 1) is a 16-bit signed immediate. + RHS = DAG.getSignedConstant(C + 1, DL, RHS.getValueType()); + CC = ISD::SETGE; + return; + } break; case ISD::SETLT: // Convert X < 1 to 0 >= X. @@ -2408,6 +2430,16 @@ static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS, return; } break; + case ISD::SETUGT: + if (Subtarget.hasVendorXqcibi() && C != INT64_MAX && isInt<16>(C + 1) && + C != -1) { + // We have a branch immediate instruction for SETUGE but not SETUGT. + // Convert X > C to X >= C + 1, if (C + 1) is a 16-bit signed immediate. + RHS = DAG.getSignedConstant(C + 1, DL, RHS.getValueType()); + CC = ISD::SETUGE; + return; + } + break; } } @@ -5321,15 +5353,13 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, Mask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, [&]() { Operands.emplace_back(); }, [&](ArrayRef SrcSubMask, unsigned SrcVecIdx, unsigned DstVecIdx) { - Operands.emplace_back().emplace_back( - SrcVecIdx, UINT_MAX, - SmallVector(SrcSubMask.begin(), SrcSubMask.end())); + Operands.emplace_back().emplace_back(SrcVecIdx, UINT_MAX, + SmallVector(SrcSubMask)); }, [&](ArrayRef SrcSubMask, unsigned Idx1, unsigned Idx2, bool NewReg) { if (NewReg) Operands.emplace_back(); - Operands.back().emplace_back( - Idx1, Idx2, SmallVector(SrcSubMask.begin(), SrcSubMask.end())); + Operands.back().emplace_back(Idx1, Idx2, SmallVector(SrcSubMask)); }); assert(Operands.size() == NumOfDestRegs && "Whole vector must be processed"); // Note: check that we do not emit too many shuffles here to prevent code @@ -5654,12 +5684,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, if (SVT.isFloatingPoint()) V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, Ld->getPointerInfo().getWithOffset(Offset), - Ld->getOriginalAlign(), - Ld->getMemOperand()->getFlags()); + Ld->getBaseAlign(), Ld->getMemOperand()->getFlags()); else V = DAG.getExtLoad(ISD::EXTLOAD, DL, XLenVT, Ld->getChain(), NewAddr, 
Ld->getPointerInfo().getWithOffset(Offset), SVT, - Ld->getOriginalAlign(), + Ld->getBaseAlign(), Ld->getMemOperand()->getFlags()); DAG.makeEquivalentMemoryOrdering(Ld, V); @@ -6368,7 +6397,7 @@ SDValue RISCVTargetLowering::expandUnalignedRVVLoad(SDValue Op, assert(NewVT.isValid() && "Expecting equally-sized RVV vector types to be legal"); SDValue L = DAG.getLoad(NewVT, DL, Load->getChain(), Load->getBasePtr(), - Load->getPointerInfo(), Load->getOriginalAlign(), + Load->getPointerInfo(), Load->getBaseAlign(), Load->getMemOperand()->getFlags()); return DAG.getMergeValues({DAG.getBitcast(VT, L), L.getValue(1)}, DL); } @@ -6400,7 +6429,7 @@ SDValue RISCVTargetLowering::expandUnalignedRVVStore(SDValue Op, "Expecting equally-sized RVV vector types to be legal"); StoredVal = DAG.getBitcast(NewVT, StoredVal); return DAG.getStore(Store->getChain(), DL, StoredVal, Store->getBasePtr(), - Store->getPointerInfo(), Store->getOriginalAlign(), + Store->getPointerInfo(), Store->getBaseAlign(), Store->getMemOperand()->getFlags()); } @@ -7705,19 +7734,42 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, } case ISD::LOAD: { auto *Load = cast(Op); - EVT VecTy = Load->getMemoryVT(); + EVT VT = Load->getValueType(0); + if (VT == MVT::f64) { + assert(Subtarget.hasStdExtZdinx() && !Subtarget.hasStdExtZilsd() && + !Subtarget.is64Bit() && "Unexpected custom legalisation"); + + // Replace a double precision load with two i32 loads and a BuildPairF64. 
+ SDLoc DL(Op); + SDValue BasePtr = Load->getBasePtr(); + SDValue Chain = Load->getChain(); + + SDValue Lo = + DAG.getLoad(MVT::i32, DL, Chain, BasePtr, Load->getPointerInfo(), + Load->getBaseAlign(), Load->getMemOperand()->getFlags()); + BasePtr = DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(4)); + SDValue Hi = DAG.getLoad( + MVT::i32, DL, Chain, BasePtr, Load->getPointerInfo().getWithOffset(4), + Load->getBaseAlign(), Load->getMemOperand()->getFlags()); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + SDValue Pair = DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi); + return DAG.getMergeValues({Pair, Chain}, DL); + } + // Handle normal vector tuple load. - if (VecTy.isRISCVVectorTuple()) { + if (VT.isRISCVVectorTuple()) { SDLoc DL(Op); MVT XLenVT = Subtarget.getXLenVT(); - unsigned NF = VecTy.getRISCVVectorTupleNumFields(); - unsigned Sz = VecTy.getSizeInBits().getKnownMinValue(); + unsigned NF = VT.getRISCVVectorTupleNumFields(); + unsigned Sz = VT.getSizeInBits().getKnownMinValue(); unsigned NumElts = Sz / (NF * 8); int Log2LMUL = Log2_64(NumElts) - 3; auto Flag = SDNodeFlags(); Flag.setNoUnsignedWrap(true); - SDValue Ret = DAG.getUNDEF(VecTy); + SDValue Ret = DAG.getUNDEF(VT); SDValue BasePtr = Load->getBasePtr(); SDValue VROffset = DAG.getNode(RISCVISD::READ_VLENB, DL, XLenVT); VROffset = @@ -7731,7 +7783,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, MVT::getScalableVectorVT(MVT::i8, NumElts), DL, Load->getChain(), BasePtr, MachinePointerInfo(Load->getAddressSpace()), Align(8)); OutChains.push_back(LoadVal.getValue(1)); - Ret = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VecTy, Ret, LoadVal, + Ret = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VT, Ret, LoadVal, DAG.getVectorIdxConstant(i, DL)); BasePtr = DAG.getNode(ISD::ADD, DL, XLenVT, BasePtr, VROffset, Flag); } @@ -7748,13 +7800,54 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::STORE: { auto *Store = cast(Op); 
SDValue StoredVal = Store->getValue(); - EVT VecTy = StoredVal.getValueType(); + EVT VT = StoredVal.getValueType(); + if (VT == MVT::f64) { + assert(Subtarget.hasStdExtZdinx() && !Subtarget.hasStdExtZilsd() && + !Subtarget.is64Bit() && "Unexpected custom legalisation"); + + // Replace a double precision store with a SplitF64 and i32 stores. + SDValue DL(Op); + SDValue BasePtr = Store->getBasePtr(); + SDValue Chain = Store->getChain(); + SDValue Split = DAG.getNode(RISCVISD::SplitF64, DL, + DAG.getVTList(MVT::i32, MVT::i32), StoredVal); + + SDValue Lo = DAG.getStore(Chain, DL, Split.getValue(0), BasePtr, + Store->getPointerInfo(), Store->getBaseAlign(), + Store->getMemOperand()->getFlags()); + BasePtr = DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(4)); + SDValue Hi = DAG.getStore(Chain, DL, Split.getValue(1), BasePtr, + Store->getPointerInfo().getWithOffset(4), + Store->getBaseAlign(), + Store->getMemOperand()->getFlags()); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); + } + if (VT == MVT::i64) { + assert(Subtarget.hasStdExtZilsd() && !Subtarget.is64Bit() && + "Unexpected custom legalisation"); + if (Store->isTruncatingStore()) + return SDValue(); + + if (!Subtarget.enableUnalignedScalarMem() && Store->getAlign() < 8) + return SDValue(); + + SDLoc DL(Op); + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, StoredVal, + DAG.getTargetConstant(0, DL, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, StoredVal, + DAG.getTargetConstant(1, DL, MVT::i32)); + + return DAG.getMemIntrinsicNode( + RISCVISD::SD_RV32, DL, DAG.getVTList(MVT::Other), + {Store->getChain(), Lo, Hi, Store->getBasePtr()}, MVT::i64, + Store->getMemOperand()); + } // Handle normal vector tuple store. 
- if (VecTy.isRISCVVectorTuple()) { + if (VT.isRISCVVectorTuple()) { SDLoc DL(Op); MVT XLenVT = Subtarget.getXLenVT(); - unsigned NF = VecTy.getRISCVVectorTupleNumFields(); - unsigned Sz = VecTy.getSizeInBits().getKnownMinValue(); + unsigned NF = VT.getRISCVVectorTupleNumFields(); + unsigned Sz = VT.getSizeInBits().getKnownMinValue(); unsigned NumElts = Sz / (NF * 8); int Log2LMUL = Log2_64(NumElts) - 3; @@ -7775,7 +7868,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, StoredVal, DAG.getVectorIdxConstant(i, DL)); Ret = DAG.getStore(Chain, DL, Extract, BasePtr, MachinePointerInfo(Store->getAddressSpace()), - Store->getOriginalAlign(), + Store->getBaseAlign(), Store->getMemOperand()->getFlags()); Chain = Ret.getValue(0); BasePtr = DAG.getNode(ISD::ADD, DL, XLenVT, BasePtr, VROffset, Flag); @@ -13714,6 +13807,28 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, // sext_inreg we emit for ADD/SUB/MUL/SLLI. LoadSDNode *Ld = cast(N); + if (N->getValueType(0) == MVT::i64) { + assert(Subtarget.hasStdExtZilsd() && !Subtarget.is64Bit() && + "Unexpected custom legalisation"); + + if (!Subtarget.enableUnalignedScalarMem() && Ld->getAlign() < 8) + return; + + SDLoc DL(N); + SDValue Result = DAG.getMemIntrinsicNode( + RISCVISD::LD_RV32, DL, + DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}), + {Ld->getChain(), Ld->getBasePtr()}, MVT::i64, Ld->getMemOperand()); + SDValue Lo = Result.getValue(0); + SDValue Hi = Result.getValue(1); + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); + Results.append({Pair, Result.getValue(2)}); + return; + } + + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && + "Unexpected custom legalisation"); + SDLoc dl(N); SDValue Res = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Ld->getChain(), Ld->getBasePtr(), Ld->getMemoryVT(), @@ -15407,6 +15522,32 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG, return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget); } +// Try to 
expand a multiply to a sequence of shifts and add/subs, +// for a machine without native mul instruction. +static SDValue expandMulToNAFSequence(SDNode *N, SelectionDAG &DAG, + uint64_t MulAmt) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + const uint64_t BitWidth = VT.getFixedSizeInBits(); + + SDValue Result = DAG.getConstant(0, DL, N->getValueType(0)); + SDValue N0 = N->getOperand(0); + + // Find the Non-adjacent form of the multiplier. + for (uint64_t E = MulAmt, I = 0; E && I < BitWidth; ++I, E >>= 1) { + if (E & 1) { + bool IsAdd = (E & 3) == 1; + E -= IsAdd ? 1 : -1; + SDValue ShiftVal = DAG.getNode(ISD::SHL, DL, VT, N0, + DAG.getShiftAmountConstant(I, VT, DL)); + ISD::NodeType AddSubOp = IsAdd ? ISD::ADD : ISD::SUB; + Result = DAG.getNode(AddSubOp, DL, VT, Result, ShiftVal); + } + } + + return Result; +} + // X * (2^N +/- 2^M) -> (add/sub (shl X, C1), (shl X, C2)) static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, uint64_t MulAmt) { @@ -15442,21 +15583,24 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) - return SDValue(); - if (VT != Subtarget.getXLenVT()) return SDValue(); - const bool HasShlAdd = Subtarget.hasStdExtZba() || - Subtarget.hasVendorXTHeadBa() || - Subtarget.hasVendorXAndesPerf(); + bool ShouldExpandMul = + (!DCI.isBeforeLegalize() && !DCI.isCalledByLegalizer()) || + !Subtarget.hasStdExtZmmul(); + if (!ShouldExpandMul) + return SDValue(); ConstantSDNode *CNode = dyn_cast(N->getOperand(1)); if (!CNode) return SDValue(); uint64_t MulAmt = CNode->getZExtValue(); + const bool HasShlAdd = Subtarget.hasStdExtZba() || + Subtarget.hasVendorXTHeadBa() || + Subtarget.hasVendorXAndesPerf(); + // WARNING: The code below is knowingly incorrect with regards to undef semantics. // We're adding additional uses of X here, and in principle, we should be freezing // X before doing so. 
However, adding freeze here causes real regressions, and no @@ -15594,6 +15738,9 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (SDValue V = expandMulToAddOrSubOfShl(N, DAG, MulAmt)) return V; + if (!Subtarget.hasStdExtZmmul()) + return expandMulToNAFSequence(N, DAG, MulAmt); + return SDValue(); } @@ -18884,6 +19031,10 @@ static SDValue combineToVCPOP(SDNode *N, SelectionDAG &DAG, if (!SrcMVT.isVector() || SrcMVT.getVectorElementType() != MVT::i1) return SDValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(SrcMVT)) + return SDValue(); + // Check that destination type is large enough to hold result without // overflow. if (Opc == ISD::VECREDUCE_ADD) { @@ -18900,9 +19051,6 @@ static SDValue combineToVCPOP(SDNode *N, SelectionDAG &DAG, MVT ContainerVT = SrcMVT; if (SrcMVT.isFixedLengthVector()) { - if (!useRVVForFixedLengthVectorVT(SrcMVT, Subtarget)) - return SDValue(); - ContainerVT = getContainerForFixedLengthVector(DAG, SrcMVT, Subtarget); Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget); } @@ -19747,7 +19895,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, NewVT, *Store->getMemOperand())) { SDValue NewV = DAG.getConstant(NewC, DL, NewVT); return DAG.getStore(Chain, DL, NewV, Store->getBasePtr(), - Store->getPointerInfo(), Store->getOriginalAlign(), + Store->getPointerInfo(), Store->getBaseAlign(), Store->getMemOperand()->getFlags()); } } @@ -19767,10 +19915,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), NewVT, *L->getMemOperand())) { SDValue NewL = DAG.getLoad(NewVT, DL, L->getChain(), L->getBasePtr(), - L->getPointerInfo(), L->getOriginalAlign(), + L->getPointerInfo(), L->getBaseAlign(), L->getMemOperand()->getFlags()); return DAG.getStore(Chain, DL, NewL, Store->getBasePtr(), - Store->getPointerInfo(), Store->getOriginalAlign(), + Store->getPointerInfo(), Store->getBaseAlign(), 
Store->getMemOperand()->getFlags()); } } @@ -21203,6 +21351,10 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return emitReadCounterWidePseudo(MI, BB); case RISCV::Select_GPR_Using_CC_GPR: case RISCV::Select_GPR_Using_CC_Imm: + case RISCV::Select_GPR_Using_CC_Simm5NonZero: + case RISCV::Select_GPR_Using_CC_Uimm5NonZero: + case RISCV::Select_GPR_Using_CC_Simm16NonZero: + case RISCV::Select_GPR_Using_CC_Uimm16NonZero: case RISCV::Select_FPR16_Using_CC_GPR: case RISCV::Select_FPR16INX_Using_CC_GPR: case RISCV::Select_FPR32_Using_CC_GPR: @@ -23384,8 +23536,8 @@ bool RISCVTargetLowering::preferScalarizeSplat(SDNode *N) const { static Value *useTpOffset(IRBuilderBase &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getModule(); - Function *ThreadPointerFunc = - Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer); + Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::thread_pointer, IRB.getPtrTy()); return IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), Offset); } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index f181c1e137545..923990c1927eb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -651,29 +651,21 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFrameInfo &MFI = MF->getFrameInfo(); unsigned Opcode; - bool IsScalableVector = true; if (RISCV::GPRRegClass.hasSubClassEq(RC)) { Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? 
RISCV::SW : RISCV::SD; - IsScalableVector = false; } else if (RISCV::GPRF16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::SH_INX; - IsScalableVector = false; } else if (RISCV::GPRF32RegClass.hasSubClassEq(RC)) { Opcode = RISCV::SW_INX; - IsScalableVector = false; } else if (RISCV::GPRPairRegClass.hasSubClassEq(RC)) { Opcode = RISCV::PseudoRV32ZdinxSD; - IsScalableVector = false; } else if (RISCV::FPR16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::FSH; - IsScalableVector = false; } else if (RISCV::FPR32RegClass.hasSubClassEq(RC)) { Opcode = RISCV::FSW; - IsScalableVector = false; } else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) { Opcode = RISCV::FSD; - IsScalableVector = false; } else if (RISCV::VRRegClass.hasSubClassEq(RC)) { Opcode = RISCV::VS1R_V; } else if (RISCV::VRM2RegClass.hasSubClassEq(RC)) { @@ -707,7 +699,7 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, else llvm_unreachable("Can't store this register to stack slot"); - if (IsScalableVector) { + if (RISCVRegisterInfo::isRVVRegClass(RC)) { MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, TypeSize::getScalable(MFI.getObjectSize(FI)), MFI.getObjectAlign(FI)); @@ -743,29 +735,21 @@ void RISCVInstrInfo::loadRegFromStackSlot( Flags & MachineInstr::FrameDestroy ? MBB.findDebugLoc(I) : DebugLoc(); unsigned Opcode; - bool IsScalableVector = true; if (RISCV::GPRRegClass.hasSubClassEq(RC)) { Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? 
RISCV::LW : RISCV::LD; - IsScalableVector = false; } else if (RISCV::GPRF16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::LH_INX; - IsScalableVector = false; } else if (RISCV::GPRF32RegClass.hasSubClassEq(RC)) { Opcode = RISCV::LW_INX; - IsScalableVector = false; } else if (RISCV::GPRPairRegClass.hasSubClassEq(RC)) { Opcode = RISCV::PseudoRV32ZdinxLD; - IsScalableVector = false; } else if (RISCV::FPR16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::FLH; - IsScalableVector = false; } else if (RISCV::FPR32RegClass.hasSubClassEq(RC)) { Opcode = RISCV::FLW; - IsScalableVector = false; } else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) { Opcode = RISCV::FLD; - IsScalableVector = false; } else if (RISCV::VRRegClass.hasSubClassEq(RC)) { Opcode = RISCV::VL1RE8_V; } else if (RISCV::VRM2RegClass.hasSubClassEq(RC)) { @@ -799,7 +783,7 @@ void RISCVInstrInfo::loadRegFromStackSlot( else llvm_unreachable("Can't load this register from stack slot"); - if (IsScalableVector) { + if (RISCVRegisterInfo::isRVVRegClass(RC)) { MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, TypeSize::getScalable(MFI.getObjectSize(FI)), MFI.getObjectAlign(FI)); @@ -993,6 +977,30 @@ static RISCVCC::CondCode getCondFromBranchOpc(unsigned Opc) { return RISCVCC::COND_CV_BEQIMM; case RISCV::CV_BNEIMM: return RISCVCC::COND_CV_BNEIMM; + case RISCV::QC_BEQI: + return RISCVCC::COND_QC_BEQI; + case RISCV::QC_E_BEQI: + return RISCVCC::COND_QC_E_BEQI; + case RISCV::QC_BNEI: + return RISCVCC::COND_QC_BNEI; + case RISCV::QC_E_BNEI: + return RISCVCC::COND_QC_E_BNEI; + case RISCV::QC_BLTI: + return RISCVCC::COND_QC_BLTI; + case RISCV::QC_E_BLTI: + return RISCVCC::COND_QC_E_BLTI; + case RISCV::QC_BGEI: + return RISCVCC::COND_QC_BGEI; + case RISCV::QC_E_BGEI: + return RISCVCC::COND_QC_E_BGEI; + case RISCV::QC_BLTUI: + return RISCVCC::COND_QC_BLTUI; + case RISCV::QC_E_BLTUI: + return RISCVCC::COND_QC_E_BLTUI; + case RISCV::QC_BGEUI: + return 
RISCVCC::COND_QC_BGEUI; + case RISCV::QC_E_BGEUI: + return RISCVCC::COND_QC_E_BGEUI; } } @@ -1050,6 +1058,30 @@ unsigned RISCVCC::getBrCond(RISCVCC::CondCode CC) { return RISCV::CV_BEQIMM; case RISCVCC::COND_CV_BNEIMM: return RISCV::CV_BNEIMM; + case RISCVCC::COND_QC_BEQI: + return RISCV::QC_BEQI; + case RISCVCC::COND_QC_E_BEQI: + return RISCV::QC_E_BEQI; + case RISCVCC::COND_QC_BNEI: + return RISCV::QC_BNEI; + case RISCVCC::COND_QC_E_BNEI: + return RISCV::QC_E_BNEI; + case RISCVCC::COND_QC_BLTI: + return RISCV::QC_BLTI; + case RISCVCC::COND_QC_E_BLTI: + return RISCV::QC_E_BLTI; + case RISCVCC::COND_QC_BGEI: + return RISCV::QC_BGEI; + case RISCVCC::COND_QC_E_BGEI: + return RISCV::QC_E_BGEI; + case RISCVCC::COND_QC_BLTUI: + return RISCV::QC_BLTUI; + case RISCVCC::COND_QC_E_BLTUI: + return RISCV::QC_E_BLTUI; + case RISCVCC::COND_QC_BGEUI: + return RISCV::QC_BGEUI; + case RISCVCC::COND_QC_E_BGEUI: + return RISCV::QC_E_BGEUI; } } @@ -1077,6 +1109,30 @@ RISCVCC::CondCode RISCVCC::getOppositeBranchCondition(RISCVCC::CondCode CC) { return RISCVCC::COND_CV_BNEIMM; case RISCVCC::COND_CV_BNEIMM: return RISCVCC::COND_CV_BEQIMM; + case RISCVCC::COND_QC_BEQI: + return RISCVCC::COND_QC_BNEI; + case RISCVCC::COND_QC_E_BEQI: + return RISCVCC::COND_QC_E_BNEI; + case RISCVCC::COND_QC_BNEI: + return RISCVCC::COND_QC_BEQI; + case RISCVCC::COND_QC_E_BNEI: + return RISCVCC::COND_QC_E_BEQI; + case RISCVCC::COND_QC_BLTI: + return RISCVCC::COND_QC_BGEI; + case RISCVCC::COND_QC_E_BLTI: + return RISCVCC::COND_QC_E_BGEI; + case RISCVCC::COND_QC_BGEI: + return RISCVCC::COND_QC_BLTI; + case RISCVCC::COND_QC_E_BGEI: + return RISCVCC::COND_QC_E_BLTI; + case RISCVCC::COND_QC_BLTUI: + return RISCVCC::COND_QC_BGEUI; + case RISCVCC::COND_QC_E_BLTUI: + return RISCVCC::COND_QC_E_BGEUI; + case RISCVCC::COND_QC_BGEUI: + return RISCVCC::COND_QC_BLTUI; + case RISCVCC::COND_QC_E_BGEUI: + return RISCVCC::COND_QC_E_BLTUI; } } @@ -1452,6 +1508,18 @@ bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp, 
case RISCV::BGEU: case RISCV::CV_BEQIMM: case RISCV::CV_BNEIMM: + case RISCV::QC_BEQI: + case RISCV::QC_BNEI: + case RISCV::QC_BGEI: + case RISCV::QC_BLTI: + case RISCV::QC_BLTUI: + case RISCV::QC_BGEUI: + case RISCV::QC_E_BEQI: + case RISCV::QC_E_BNEI: + case RISCV::QC_E_BGEI: + case RISCV::QC_E_BLTI: + case RISCV::QC_E_BLTUI: + case RISCV::QC_E_BGEUI: return isIntN(13, BrOffset); case RISCV::JAL: case RISCV::PseudoBR: @@ -2633,6 +2701,12 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_UIMM5_LSB0: Ok = isShiftedUInt<4, 1>(Imm); break; + case RISCVOp::OPERAND_UIMM5_NONZERO: + Ok = isUInt<5>(Imm) && (Imm != 0); + break; + case RISCVOp::OPERAND_UIMM5_PLUS1: + Ok = (isUInt<5>(Imm) && (Imm != 0)) || (Imm == 32); + break; case RISCVOp::OPERAND_UIMM6_LSB0: Ok = isShiftedUInt<5, 1>(Imm); break; @@ -2660,6 +2734,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_UIMM10_LSB00_NONZERO: Ok = isShiftedUInt<8, 2>(Imm) && (Imm != 0); break; + case RISCVOp::OPERAND_UIMM16_NONZERO: + Ok = isUInt<16>(Imm) && (Imm != 0); + break; case RISCVOp::OPERAND_ZERO: Ok = Imm == 0; break; @@ -2679,6 +2756,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_SIMM5_PLUS1: Ok = (isInt<5>(Imm) && Imm != -16) || Imm == 16; break; + case RISCVOp::OPERAND_SIMM5_NONZERO: + Ok = isInt<5>(Imm) && (Imm != 0); + break; case RISCVOp::OPERAND_SIMM6_NONZERO: Ok = Imm != 0 && isInt<6>(Imm); break; @@ -2691,6 +2771,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_SIMM12_LSB00000: Ok = isShiftedInt<7, 5>(Imm); break; + case RISCVOp::OPERAND_SIMM16_NONZERO: + Ok = isInt<16>(Imm) && (Imm != 0); + break; case RISCVOp::OPERAND_SIMM20_LI: Ok = isInt<20>(Imm); break; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 1ec6eed82469e..b099acd81e995 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ 
b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -43,6 +43,18 @@ enum CondCode { COND_GEU, COND_CV_BEQIMM, COND_CV_BNEIMM, + COND_QC_BEQI, + COND_QC_BNEI, + COND_QC_BLTI, + COND_QC_BGEI, + COND_QC_BLTUI, + COND_QC_BGEUI, + COND_QC_E_BEQI, + COND_QC_E_BNEI, + COND_QC_E_BLTI, + COND_QC_E_BGEI, + COND_QC_E_BLTUI, + COND_QC_E_BGEUI, COND_INVALID }; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index e9bdeb88e4ca8..ef054120d5443 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -2198,6 +2198,14 @@ def : Pat<(binop_allwusers GPR:$rs1, immop_oneuse:$rs2), def : Pat<(i64 (add GPR:$rs1, negImm:$rs2)), (SUB GPR:$rs1, negImm:$rs2)>; } +//===----------------------------------------------------------------------===// +// Zihintpause +//===----------------------------------------------------------------------===// + +// Zihintpause +let Predicates = [HasStdExtZihintpause] in +def : Pat<(int_riscv_pause), (FENCE 0x1, 0x0)>; + //===----------------------------------------------------------------------===// // Standard extensions //===----------------------------------------------------------------------===// @@ -2210,9 +2218,17 @@ include "RISCVInstrInfoA.td" include "RISCVInstrInfoZa.td" include "RISCVInstrInfoZalasr.td" +// Integer +include "RISCVInstrInfoZimop.td" +include "RISCVInstrInfoZicbo.td" +include "RISCVInstrInfoZicond.td" +include "RISCVInstrInfoZicfiss.td" +include "RISCVInstrInfoZilsd.td" + // Scalar FP include "RISCVInstrInfoF.td" include "RISCVInstrInfoD.td" +include "RISCVInstrInfoQ.td" include "RISCVInstrInfoZfh.td" include "RISCVInstrInfoZfbfmin.td" include "RISCVInstrInfoZfa.td" @@ -2226,13 +2242,6 @@ include "RISCVInstrInfoV.td" include "RISCVInstrInfoZvk.td" include "RISCVInstrInfoZvqdotq.td" -// Integer -include "RISCVInstrInfoZimop.td" -include "RISCVInstrInfoZicbo.td" -include "RISCVInstrInfoZicond.td" -include "RISCVInstrInfoZicfiss.td" -include 
"RISCVInstrInfoZilsd.td" - // Compressed include "RISCVInstrInfoC.td" include "RISCVInstrInfoZc.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 0c584daf45b14..414e093510607 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -27,8 +27,6 @@ def : GINodeEquiv; def RISCVSplitF64 : RVSDNode<"SplitF64", SDT_RISCVSplitF64>; def : GINodeEquiv; -def AddrRegImmINX : ComplexPattern; - //===----------------------------------------------------------------------===// // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// @@ -529,18 +527,19 @@ defm Select_FPR64IN32X : SelectCC_GPR_rrirr; def PseudoFROUND_D_IN32X : PseudoFROUND; /// Loads -let isCall = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 1 in +let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 1 in def PseudoRV32ZdinxLD : Pseudo<(outs GPRPair:$dst), (ins GPR:$rs1, simm12:$imm12), []>; -def : Pat<(f64 (load (AddrRegImmINX (XLenVT GPR:$rs1), simm12:$imm12))), - (PseudoRV32ZdinxLD GPR:$rs1, simm12:$imm12)>; /// Stores -let isCall = 0, mayLoad = 0, mayStore = 1, Size = 8, isCodeGenOnly = 1 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 1, Size = 8, isCodeGenOnly = 1 in def PseudoRV32ZdinxSD : Pseudo<(outs), (ins GPRPair:$rs2, GPRNoX0:$rs1, simm12:$imm12), []>; -def : Pat<(store (f64 GPRPair:$rs2), (AddrRegImmINX (XLenVT GPR:$rs1), simm12:$imm12)), - (PseudoRV32ZdinxSD GPRPair:$rs2, GPR:$rs1, simm12:$imm12)>; } // Predicates = [HasStdExtZdinx, IsRV32] +let Predicates = [HasStdExtZdinx, HasStdExtZilsd, IsRV32] in { +def : LdPat; +def : StPat; +} + let Predicates = [HasStdExtD, IsRV32] in { // double->[u]int. Round-to-zero must be used. 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 360191f03ddf7..84a75666e5f36 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -121,7 +121,7 @@ def FPR32INX : RegisterOperand { let ParserMatchClass = GPRAsFPR32; } -// Describes a combination of predicates from F/D/Zfh/Zfhmin or +// Describes a combination of predicates from F/D/Q/Zfh/Zfhmin or // Zfinx/Zdinx/Zhinx/Zhinxmin that are applied to scalar FP instruction. // Contains the DAGOperand for the primary type for the predicates. The primary // type may be unset for combinations of predicates like Zfh+D. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoQ.td b/llvm/lib/Target/RISCV/RISCVInstrInfoQ.td new file mode 100644 index 0000000000000..da78c13c0edcc --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoQ.td @@ -0,0 +1,167 @@ +//===-- RISCVInstrInfoQ.td - RISC-V 'Q' instructions -------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the RISC-V instructions from the standard 'Q', +// Quad-Precision Floating-Point instruction set extension. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Operand and SDNode transformation definitions. 
+//===----------------------------------------------------------------------===// + +def QExt : ExtInfo<"", "", [HasStdExtQ], f128, FPR128, FPR32, FPR64, ?>; + +defvar QExts = [QExt]; +defvar QExtsRV64 = [QExt]; + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [HasStdExtQ] in { + def FLQ : FPLoad_r<0b100, "flq", FPR128, WriteFLD128>; + + // Operands for stores are in the order srcreg, base, offset rather than + // reflecting the order these fields are specified in the instruction + // encoding. + def FSQ : FPStore_r<0b100, "fsq", FPR128, WriteFST128>; +} // Predicates = [HasStdExtQ] + +foreach Ext = QExts in { + let SchedRW = [WriteFMA128, ReadFMA128, ReadFMA128, ReadFMA128Addend] in { + defm FMADD_Q : FPFMA_rrr_frm_m; + defm FMSUB_Q : FPFMA_rrr_frm_m; + defm FNMSUB_Q : FPFMA_rrr_frm_m; + defm FNMADD_Q : FPFMA_rrr_frm_m; + } + + let SchedRW = [WriteFAdd128, ReadFAdd128, ReadFAdd128] in { + defm FADD_Q : FPALU_rr_frm_m<0b0000011, "fadd.q", Ext>; + defm FSUB_Q : FPALU_rr_frm_m<0b0000111, "fsub.q", Ext>; + } + + let SchedRW = [WriteFMul128, ReadFMul128, ReadFMul128] in + defm FMUL_Q : FPALU_rr_frm_m<0b0001011, "fmul.q", Ext>; + + let SchedRW = [WriteFDiv128, ReadFDiv128, ReadFDiv128] in + defm FDIV_Q : FPALU_rr_frm_m<0b0001111, "fdiv.q", Ext>; + + defm FSQRT_Q : FPUnaryOp_r_frm_m<0b0101111, 0b00000, Ext, Ext.PrimaryTy, + Ext.PrimaryTy, "fsqrt.q">, + Sched<[WriteFSqrt128, ReadFSqrt128]>; + + let SchedRW = [WriteFSGNJ128, ReadFSGNJ128, ReadFSGNJ128], + mayRaiseFPException = 0 in { + defm FSGNJ_Q : FPALU_rr_m<0b0010011, 0b000, "fsgnj.q", Ext>; + defm FSGNJN_Q : FPALU_rr_m<0b0010011, 0b001, "fsgnjn.q", Ext>; + defm FSGNJX_Q : FPALU_rr_m<0b0010011, 0b010, "fsgnjx.q", Ext>; + } + + let SchedRW = [WriteFMinMax128, ReadFMinMax128, ReadFMinMax128] in { + defm FMIN_Q : FPALU_rr_m<0b0010111, 0b000, "fmin.q", Ext, Commutable = 
1>; + defm FMAX_Q : FPALU_rr_m<0b0010111, 0b001, "fmax.q", Ext, Commutable = 1>; + } + + defm FCVT_S_Q : FPUnaryOp_r_frm_m<0b0100000, 0b00011, Ext, Ext.F32Ty, + Ext.PrimaryTy, "fcvt.s.q">, + Sched<[WriteFCvtF128ToF32, ReadFCvtF128ToF32]>; + + defm FCVT_Q_S : FPUnaryOp_r_frmlegacy_m<0b0100011, 0b00000, Ext, + Ext.PrimaryTy, Ext.F32Ty, + "fcvt.q.s">, + Sched<[WriteFCvtF32ToF128, ReadFCvtF32ToF128]>; + + defm FCVT_D_Q : FPUnaryOp_r_frm_m<0b0100001, 0b00011, Ext, Ext.F64Ty, + Ext.PrimaryTy, "fcvt.d.q">, + Sched<[WriteFCvtF128ToF64, ReadFCvtF128ToF64]>; + + defm FCVT_Q_D : FPUnaryOp_r_frmlegacy_m<0b0100011, 0b00001, Ext, + Ext.PrimaryTy, Ext.F64Ty, + "fcvt.q.d">, + Sched<[WriteFCvtF64ToF128, ReadFCvtF64ToF128]>; + + let SchedRW = [WriteFCmp128, ReadFCmp128, ReadFCmp128] in { + defm FEQ_Q : FPCmp_rr_m<0b1010011, 0b010, "feq.q", Ext, Commutable = 1>; + defm FLT_Q : FPCmp_rr_m<0b1010011, 0b001, "flt.q", Ext>; + defm FLE_Q : FPCmp_rr_m<0b1010011, 0b000, "fle.q", Ext>; + } + + let mayRaiseFPException = 0 in + defm FCLASS_Q : FPUnaryOp_r_m<0b1110011, 0b00000, 0b001, Ext, GPR, + Ext.PrimaryTy, "fclass.q">, + Sched<[WriteFClass128, ReadFClass128]>; + + let IsSignExtendingOpW = 1 in + defm FCVT_W_Q : FPUnaryOp_r_frm_m<0b1100011, 0b00000, Ext, GPR, + Ext.PrimaryTy, "fcvt.w.q">, + Sched<[WriteFCvtF128ToI32, ReadFCvtF128ToI32]>; + + let IsSignExtendingOpW = 1 in + defm FCVT_WU_Q : FPUnaryOp_r_frm_m<0b1100011, 0b00001, Ext, GPR, + Ext.PrimaryTy, "fcvt.wu.q">, + Sched<[WriteFCvtF128ToI32, ReadFCvtF128ToI32]>; + + let mayRaiseFPException = 0 in + defm FCVT_Q_W : FPUnaryOp_r_frmlegacy_m<0b1101011, 0b00000, Ext, + Ext.PrimaryTy, GPR, "fcvt.q.w">, + Sched<[WriteFCvtI32ToF128, ReadFCvtI32ToF128]>; + + let mayRaiseFPException = 0 in + defm FCVT_Q_WU : FPUnaryOp_r_frmlegacy_m<0b1101011, 0b00001, Ext, + Ext.PrimaryTy, GPR, "fcvt.q.wu">, + Sched<[WriteFCvtI32ToF128, ReadFCvtI32ToF128]>; +} // foreach Ext = QExts + +foreach Ext = QExtsRV64 in { + defm FCVT_L_Q : FPUnaryOp_r_frm_m<0b1100011, 
0b00010, Ext, GPR, + Ext.PrimaryTy, "fcvt.l.q", [IsRV64]>, + Sched<[WriteFCvtF128ToI64, ReadFCvtF128ToI64]>; + + defm FCVT_LU_Q : FPUnaryOp_r_frm_m<0b1100011, 0b00011, Ext, GPR, + Ext.PrimaryTy, "fcvt.lu.q", [IsRV64]>, + Sched<[WriteFCvtF128ToI64, ReadFCvtF128ToI64]>; + + let mayRaiseFPException = 0 in + defm FCVT_Q_L : FPUnaryOp_r_frmlegacy_m<0b1101011, 0b00010, Ext, + Ext.PrimaryTy, GPR, "fcvt.q.l", + [IsRV64]>, + Sched<[WriteFCvtI64ToF128, ReadFCvtI64ToF128]>; + + let mayRaiseFPException = 0 in + defm FCVT_Q_LU : FPUnaryOp_r_frmlegacy_m<0b1101011, 0b00011, Ext, + Ext.PrimaryTy, GPR, "fcvt.q.lu", + [IsRV64]>, + Sched<[WriteFCvtI64ToF128, ReadFCvtI64ToF128]>; +} // foreach Ext = QExtsRV64 + +//===----------------------------------------------------------------------===// +// Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) +//===----------------------------------------------------------------------===// + +let Predicates = [HasStdExtQ] in { + def : InstAlias<"flq $rd, (${rs1})", (FLQ FPR128:$rd, GPR:$rs1, 0), 0>; + def : InstAlias<"fsq $rs2, (${rs1})", (FSQ FPR128:$rs2, GPR:$rs1, 0), 0>; + + def : InstAlias<"fmv.q $rd, $rs", (FSGNJ_Q FPR128:$rd, FPR128:$rs, + FPR128:$rs)>; + def : InstAlias<"fabs.q $rd, $rs", (FSGNJX_Q FPR128:$rd, FPR128:$rs, + FPR128:$rs)>; + def : InstAlias<"fneg.q $rd, $rs", (FSGNJN_Q FPR128:$rd, FPR128:$rs, + FPR128:$rs)>; + + // fgt.q/fge.q are recognised by the GNU assembler but the canonical + // flt.q/fle.q forms will always be printed. Therefore, set a zero weight. 
+ def : InstAlias<"fgt.q $rd, $rs, $rt", + (FLT_Q GPR:$rd, FPR128:$rt, FPR128:$rs), 0>; + def : InstAlias<"fge.q $rd, $rs, $rt", + (FLE_Q GPR:$rd, FPR128:$rt, FPR128:$rs), 0>; + + def PseudoFLQ : PseudoFloatLoad<"flq", FPR128>; + def PseudoFSQ : PseudoStore<"fsq", FPR128>; +} // Predicates = [HasStdExtQ] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index aa70a9d03cc1f..6afe88b805d35 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -338,6 +338,56 @@ class NDSRVInstVFPMAD funct6, string opcodestr> let RVVConstraint = VMConstraint; } +class NDSRVInstVD4DOT funct6, string opcodestr> + : RVInst<(outs VR:$vd), (ins VR:$vs1, VR:$vs2, VMaskOp:$vm), + opcodestr # "." # "vv", "$vd, $vs1, $vs2$vm", [], InstFormatR>, + SchedBinaryMC<"WriteVIMulAddV", "ReadVIMulAddV", "ReadVIMulAddV"> { + bits<5> vs2; + bits<5> vs1; + bits<5> vd; + bit vm; + + let Inst{31-26} = funct6; + let Inst{25} = vm; + let Inst{24-20} = vs2; + let Inst{19-15} = vs1; + let Inst{14-12} = 0b100; + let Inst{11-7} = vd; + let Inst{6-0} = OPC_CUSTOM_2.Value; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + + let RVVConstraint = VMConstraint; +} + +//===----------------------------------------------------------------------===// +// Multiclass +//===----------------------------------------------------------------------===// + +let fprclass = !cast("FPR32") in +def SCALAR_F16_FPR32 : FPR_Info<16>; + +let hasSideEffects = 0 in +multiclass VPseudoVFPMAD_VF_RM { + foreach m = SCALAR_F16_FPR32.MxList in { + defm "" : VPseudoBinaryV_VF_RM, + SchedBinary<"WriteVFMulAddF", "ReadVFMulAddV", "ReadVFMulAddF", + m.MX, SCALAR_F16_FPR32.SEW, forcePassthruRead=true>; + } +} + +multiclass VPatVFPMADBinaryV_VX_RM vtilist> { + foreach vti = vtilist in { + defvar kind = "V"#vti.ScalarSuffix; + defm : VPatBinaryRoundingMode; + } +} + 
//===----------------------------------------------------------------------===// // XAndesPerf //===----------------------------------------------------------------------===// @@ -398,9 +448,21 @@ let Predicates = [HasVendorXAndesVPackFPH], def NDS_VFPMADT_VF : NDSRVInstVFPMAD<0b000010, "nds.vfpmadt">; def NDS_VFPMADB_VF : NDSRVInstVFPMAD<0b000011, "nds.vfpmadb">; } + +//===----------------------------------------------------------------------===// +// XAndesVDot +//===----------------------------------------------------------------------===// + +let Predicates = [HasVendorXAndesVDot], Uses = [VL, VTYPE] in { +def NDS_VD4DOTS_VV : NDSRVInstVD4DOT<0b000100, "nds.vd4dots">; +def NDS_VD4DOTU_VV : NDSRVInstVD4DOT<0b000111, "nds.vd4dotu">; +def NDS_VD4DOTSU_VV : NDSRVInstVD4DOT<0b000101, "nds.vd4dotsu">; +} } // DecoderNamespace = "XAndes" -// Patterns +//===----------------------------------------------------------------------===// +// Pseudo-instructions and codegen patterns +//===----------------------------------------------------------------------===// let Predicates = [HasVendorXAndesPerf] in { @@ -428,3 +490,12 @@ def : Sh1AddPat; def : Sh2AddPat; def : Sh3AddPat; } // Predicates = [HasVendorXAndesPerf, IsRV64] + +let Predicates = [HasVendorXAndesVPackFPH], + mayRaiseFPException = true in { +defm PseudoNDS_VFPMADT : VPseudoVFPMAD_VF_RM; +defm PseudoNDS_VFPMADB : VPseudoVFPMAD_VF_RM; +} // Predicates = [HasVendorXAndesVPackFPH] + +defm : VPatVFPMADBinaryV_VX_RM<"int_riscv_nds_vfpmadt", "PseudoNDS_VFPMADT", AllFP16Vectors>; +defm : VPatVFPMADBinaryV_VX_RM<"int_riscv_nds_vfpmadb", "PseudoNDS_VFPMADB", AllFP16Vectors>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 49c2922fdbcff..5649e9b985366 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -188,6 +188,36 @@ def AddLike: PatFrags<(ops node:$A, node:$B), def AddShl : PatFrag<(ops node:$Ra, 
node:$Rb, node:$SH3), (add node:$Ra, (shl node:$Rb, node:$SH3))>; +def IntCCtoQCRISCVCC : SDNodeXForm(N->getOperand(2))->get(); + int64_t Imm = cast(N->getOperand(1))->getSExtValue(); + RISCVCC::CondCode BrCC; + switch (CC) { + default: + report_fatal_error("Unexpected CondCode for Xqcibi branch instructions"); + case ISD::SETEQ: + BrCC = isInt<5>(Imm) ? RISCVCC::COND_QC_BEQI : RISCVCC::COND_QC_E_BEQI; + break; + case ISD::SETNE: + BrCC = isInt<5>(Imm) ? RISCVCC::COND_QC_BNEI : RISCVCC::COND_QC_E_BNEI; + break; + case ISD::SETLT: + BrCC = isInt<5>(Imm) ? RISCVCC::COND_QC_BLTI : RISCVCC::COND_QC_E_BLTI; + break; + case ISD::SETGE: + BrCC = isInt<5>(Imm) ? RISCVCC::COND_QC_BGEI : RISCVCC::COND_QC_E_BGEI; + break; + case ISD::SETULT: + BrCC = isUInt<5>(Imm) ? RISCVCC::COND_QC_BLTUI : RISCVCC::COND_QC_E_BLTUI; + break; + case ISD::SETUGE: + BrCC = isUInt<5>(Imm) ? RISCVCC::COND_QC_BGEUI : RISCVCC::COND_QC_E_BGEUI; + break; + } + return CurDAG->getTargetConstant(BrCC, SDLoc(N), Subtarget->getXLenVT()); +}]>; + + //===----------------------------------------------------------------------===// // Instruction Formats //===----------------------------------------------------------------------===// @@ -1288,6 +1318,36 @@ class QCScaledStPat : Pat<(StoreOp (i32 GPR:$rd), (AddShl (i32 GPRMem:$rs1), (i32 GPRNoX0:$rs2), uimm3:$shamt)), (Inst GPR:$rd, GPRMem:$rs1, GPRNoX0:$rs2, uimm3:$shamt)>; +// Match `riscv_brcc` and lower to the appropriate XQCIBI branch instruction. 
+class BcciPat + : Pat<(riscv_brcc (XLenVT GPRNoX0:$rs1), InTyImm:$rs2, Cond, bb:$imm12), + (Inst GPRNoX0:$rs1, InTyImm:$rs2, bare_simm13_lsb0:$imm12)>; + +class Bcci48Pat + : Pat<(riscv_brcc (XLenVT GPRNoX0:$rs1), InTyImm:$rs2, Cond, bb:$imm12), + (Inst GPRNoX0:$rs1, InTyImm:$rs2, bare_simm13_lsb0:$imm12)>; + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, usesCustomInserter = 1 in { + def Select_GPR_Using_CC_Simm5NonZero : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, simm5nonzero:$imm5, + cond_code:$cc, GPR:$truev, GPR:$falsev), []>; + def Select_GPR_Using_CC_Uimm5NonZero : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, uimm5nonzero:$imm5, + cond_code:$cc, GPR:$truev, GPR:$falsev), []>; + def Select_GPR_Using_CC_Simm16NonZero : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, simm16nonzero:$imm16, + cond_code:$cc, GPR:$truev, GPR:$falsev), []>; + def Select_GPR_Using_CC_Uimm16NonZero : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, uimm16nonzero:$imm16, + cond_code:$cc, GPR:$truev, GPR:$falsev), []>; +} + +class SelectQCbi + : Pat<(riscv_selectcc_frag:$cc (i32 GPR:$lhs), InTyImm:$Constant, Cond, + (i32 GPR:$truev), GPR:$falsev), + (OpNode GPR:$lhs, InTyImm:$Constant, + (IntCCtoQCRISCVCC $cc), GPR:$truev, GPR:$falsev)>; + /// Simple arithmetic operations let Predicates = [HasVendorXqcilia, IsRV32] in { @@ -1342,6 +1402,44 @@ def : PatGprNoX0GprNoX0; def : PatGprNoX0GprNoX0; } // Predicates = [HasVendorXqcia, IsRV32] +/// Branches + +let Predicates = [HasVendorXqcibi, IsRV32], AddedComplexity = 2 in { +def : BcciPat; +def : BcciPat; +def : BcciPat; +def : BcciPat; +def : BcciPat; +def : BcciPat; + +def : Bcci48Pat; +def : Bcci48Pat; +def : Bcci48Pat; +def : Bcci48Pat; +def : Bcci48Pat; +def : Bcci48Pat; + +def : SelectQCbi; +def : SelectQCbi; +def : SelectQCbi; +def : SelectQCbi; +def : SelectQCbi; +def : SelectQCbi; + +def : SelectQCbi; +def : SelectQCbi; +def : SelectQCbi; +def : SelectQCbi; +def : SelectQCbi; +def : SelectQCbi; +} // let Predicates = [HasVendorXqcibi, IsRV32], 
AddedComplexity = 2 + +let Predicates = [HasVendorXqcibm, IsRV32] in { +def : Pat<(sext_inreg (i32 GPR:$rs1), i16), (QC_EXT GPR:$rs1, 16, 0)>; +def : Pat<(sext_inreg (i32 GPR:$rs1), i8), (QC_EXT GPR:$rs1, 8, 0)>; +def : Pat<(sext_inreg (i32 GPR:$rs1), i1), (QC_EXT GPR:$rs1, 1, 0)>; +} // Predicates = [HasVendorXqcibm, IsRV32] + let Predicates = [HasVendorXqciint, IsRV32] in def : Pat<(riscv_mileaveret_glue), (QC_C_MILEAVERET)>; @@ -1436,4 +1534,8 @@ def : CompressPat<(QC_E_ADDI GPRNoX0:$rs1, GPRNoX0:$rs1, simm6nonzero:$imm), (C_ADDI GPRNoX0:$rs1, simm6nonzero:$imm)>; def : CompressPat<(QC_E_ANDI GPRC:$rs1, GPRC:$rs1, simm6:$imm), (C_ANDI GPRC:$rs1, simm6:$imm)>; +def : CompressPat<(QC_E_ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, 0), + (C_MV GPRNoX0:$rs1, GPRNoX0:$rs2)>; +def : CompressPat<(QC_E_ADDI X2, X2, simm10_lsb0000nonzero:$imm), + (C_ADDI16SP X2, simm10_lsb0000nonzero:$imm)>; } // let isCompressOnly = true, Predicates = [HasVendorXqcilia, IsRV32] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td index 8a449d32e0104..184473821dfdb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td @@ -175,6 +175,35 @@ def FLEQ_H : FPCmp_rr<0b1010010, 0b100, "fleq.h", FPR16>; } } // Predicates = [HasStdExtZfa, HasStdExtZfh] +let Predicates = [HasStdExtZfa, HasStdExtQ] in { +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def FLI_Q : FPFLI_r<0b1111011, 0b00001, 0b000, FPR128, "fli.q">, + Sched<[WriteFLI128]>; + +let SchedRW = [WriteFMinMax128, ReadFMinMax128, ReadFMinMax128] in { +def FMINM_Q: FPALU_rr<0b0010111, 0b010, "fminm.q", FPR128, Commutable=1>; +def FMAXM_Q: FPALU_rr<0b0010111, 0b011, "fmaxm.q", FPR128, Commutable=1>; +} + +def FROUND_Q : FPUnaryOp_r_frm<0b0100011, 0b00100, FPR128, FPR128, "fround.q">, + Sched<[WriteFRoundF128, ReadFRoundF128]>; +def FROUNDNX_Q : FPUnaryOp_r_frm<0b0100011, 0b00101, FPR128, FPR128, + "froundnx.q">, + Sched<[WriteFRoundF128, 
ReadFRoundF128]>; + +let SchedRW = [WriteFCmp128, ReadFCmp128, ReadFCmp128] in { +def FLTQ_Q : FPCmp_rr<0b1010011, 0b101, "fltq.q", FPR128>; +def FLEQ_Q : FPCmp_rr<0b1010011, 0b100, "fleq.q", FPR128>; +} +} // Predicates = [HasStdExtZfa, HasStdExtQ] + +let Predicates = [HasStdExtZfa, HasStdExtQ, IsRV64] in { + let mayRaiseFPException = 0 in { + def FMVH_X_Q : FPUnaryOp_r<0b1110011, 0b00001, 0b000, GPR, FPR128, "fmvh.x.q">; + def FMVP_Q_X : FPBinaryOp_rr<0b1011011, 0b000, FPR128, GPR, "fmvp.q.x">; + } +} // Predicates = [HasStdExtZfa, HasStdExtQ, IsRV64] + //===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns //===----------------------------------------------------------------------===// @@ -200,6 +229,13 @@ def : InstAlias<"fgeq.h $rd, $rs, $rt", (FLEQ_H GPR:$rd, FPR16:$rt, FPR16:$rs), 0>; } +let Predicates = [HasStdExtZfa, HasStdExtQ] in { +def : InstAlias<"fgtq.q $rd, $rs, $rt", + (FLTQ_Q GPR:$rd, FPR128:$rt, FPR128:$rs), 0>; +def : InstAlias<"fgeq.q $rd, $rs, $rt", + (FLEQ_Q GPR:$rd, FPR128:$rt, FPR128:$rs), 0>; +} + //===----------------------------------------------------------------------===// // Codegen patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td index 56c870414596b..e44bdcb4e2f0f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td @@ -37,7 +37,7 @@ class CBO_r optype, string opcodestr> let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in class Prefetch_ri optype, string opcodestr> - : RVInstS<0b110, OPC_OP_IMM, (outs), (ins GPR:$rs1, simm12_lsb00000:$imm12), + : RVInstS<0b110, OPC_OP_IMM, (outs), (ins GPRMem:$rs1, simm12_lsb00000:$imm12), opcodestr, "${imm12}(${rs1})"> { let Inst{11-7} = 0b00000; let rs2 = optype; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td 
b/llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td index 3e526273c0768..a3203f288b545 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td @@ -11,6 +11,20 @@ // //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// RISC-V specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDT_RISCV_LD_RV32 + : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisPtrTy<2>]>; +def SDT_RISCV_SD_RV32 + : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisPtrTy<2>]>; + +def riscv_ld_rv32 : RVSDNode<"LD_RV32", SDT_RISCV_LD_RV32, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def riscv_st_rv32 : RVSDNode<"SD_RV32", SDT_RISCV_SD_RV32, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + //===----------------------------------------------------------------------===// // Instruction Class Templates //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp index 5453753fa4579..1e2bdb10aa810 100644 --- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp +++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp @@ -116,7 +116,9 @@ static unsigned log2LdstWidth(unsigned Opcode) { case RISCV::FSW: return 2; case RISCV::LD: + case RISCV::LD_RV32: case RISCV::SD: + case RISCV::SD_RV32: case RISCV::FLD: case RISCV::FSD: return 3; @@ -144,7 +146,9 @@ static unsigned offsetMask(unsigned Opcode) { case RISCV::FLW: case RISCV::FSW: case RISCV::LD: + case RISCV::LD_RV32: case RISCV::SD: + case RISCV::SD_RV32: case RISCV::FLD: case RISCV::FSD: return maskTrailingOnes(5U); @@ -184,7 +188,8 @@ static bool isCompressedReg(Register Reg) { RISCV::GPRF16CRegClass.contains(Reg) || RISCV::GPRF32CRegClass.contains(Reg) || 
RISCV::FPR32CRegClass.contains(Reg) || - RISCV::FPR64CRegClass.contains(Reg); + RISCV::FPR64CRegClass.contains(Reg) || + RISCV::GPRPairCRegClass.contains(Reg); } // Return true if MI is a load for which there exists a compressed version. @@ -203,6 +208,8 @@ static bool isCompressibleLoad(const MachineInstr &MI) { case RISCV::LW_INX: case RISCV::LD: return STI.hasStdExtCOrZca(); + case RISCV::LD_RV32: + return STI.hasStdExtZclsd(); case RISCV::FLW: return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce(); case RISCV::FLD: @@ -225,6 +232,8 @@ static bool isCompressibleStore(const MachineInstr &MI) { case RISCV::SW_INX: case RISCV::SD: return STI.hasStdExtCOrZca(); + case RISCV::SD_RV32: + return STI.hasStdExtZclsd(); case RISCV::FSW: return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce(); case RISCV::FSD: @@ -342,8 +351,10 @@ static Register analyzeCompressibleUses(MachineInstr &FirstMI, RCToScavenge = &RISCV::FPR32CRegClass; else if (RISCV::FPR64RegClass.contains(RegImm.Reg)) RCToScavenge = &RISCV::FPR64CRegClass; + else if (RISCV::GPRPairRegClass.contains(RegImm.Reg)) + RCToScavenge = &RISCV::GPRPairCRegClass; else - return RISCV::NoRegister; + return Register(); RegScavenger RS; RS.enterBasicBlockEnd(MBB); @@ -400,6 +411,7 @@ bool RISCVMakeCompressibleOpt::runOnMachineFunction(MachineFunction &Fn) { const RISCVSubtarget &STI = Fn.getSubtarget(); const RISCVInstrInfo &TII = *STI.getInstrInfo(); + const RISCVRegisterInfo &TRI = *STI.getRegisterInfo(); // This optimization only makes sense if compressed instructions are emitted. 
if (!STI.hasStdExtCOrZca()) @@ -438,7 +450,20 @@ bool RISCVMakeCompressibleOpt::runOnMachineFunction(MachineFunction &Fn) { BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(RISCV::PseudoMV_FPR32INX), NewReg) .addReg(RegImm.Reg); + } else if (RISCV::GPRPairRegClass.contains(RegImm.Reg)) { + assert(RegImm.Imm == 0); + BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(RISCV::ADDI), + TRI.getSubReg(NewReg, RISCV::sub_gpr_even)) + .addReg(TRI.getSubReg(RegImm.Reg, RISCV::sub_gpr_even)) + .addImm(0); + BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(RISCV::ADDI), + TRI.getSubReg(NewReg, RISCV::sub_gpr_odd)) + .addReg(TRI.getSubReg(RegImm.Reg, RISCV::sub_gpr_odd)) + .addImm(0); } else { + assert((RISCV::FPR32RegClass.contains(RegImm.Reg) || + RISCV::FPR64RegClass.contains(RegImm.Reg)) && + "Expected FP register class"); // If we are looking at replacing an FPR register we don't expect to // have any offset. The only compressible FP instructions with an offset // are loads and stores, for which the offset applies to the GPR operand diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index eb3d43c9af7c2..60ebd0fdff2a8 100644 --- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -409,6 +409,7 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi, case RISCV::LHU: case RISCV::LWU: case RISCV::LD: + case RISCV::LD_RV32: case RISCV::FLH: case RISCV::FLW: case RISCV::FLD: @@ -418,6 +419,7 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi, case RISCV::SW: case RISCV::SW_INX: case RISCV::SD: + case RISCV::SD_RV32: case RISCV::FSH: case RISCV::FSW: case RISCV::FSD: { diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index c6f6c9007b2b1..112142e1ef2f2 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -48,6 +48,9 @@ static_assert(RISCV::F31_F == 
RISCV::F0_F + 31, static_assert(RISCV::F1_D == RISCV::F0_D + 1, "Register list not consecutive"); static_assert(RISCV::F31_D == RISCV::F0_D + 31, "Register list not consecutive"); +static_assert(RISCV::F1_Q == RISCV::F0_Q + 1, "Register list not consecutive"); +static_assert(RISCV::F31_Q == RISCV::F0_Q + 31, + "Register list not consecutive"); static_assert(RISCV::V1 == RISCV::V0 + 1, "Register list not consecutive"); static_assert(RISCV::V31 == RISCV::V0 + 31, "Register list not consecutive"); @@ -288,6 +291,30 @@ void RISCVRegisterInfo::adjustReg(MachineBasicBlock &MBB, return; } + // Use the QC_E_ADDI instruction from the Xqcilia extension that can take a + // signed 26-bit immediate. + if (ST.hasVendorXqcilia() && isInt<26>(Val)) { + // The one case where using this instruction is sub-optimal is if Val can be + // materialized with a single compressible LUI and following add/sub is also + // compressible. Avoid doing this if that is the case. + int Hi20 = (Val & 0xFFFFF000) >> 12; + bool IsCompressLUI = + ((Val & 0xFFF) == 0) && (Hi20 != 0) && + (isUInt<5>(Hi20) || (Hi20 >= 0xfffe0 && Hi20 <= 0xfffff)); + bool IsCompressAddSub = + (SrcReg == DestReg) && + ((Val > 0 && RISCV::GPRNoX0RegClass.contains(SrcReg)) || + (Val < 0 && RISCV::GPRCRegClass.contains(SrcReg))); + + if (!(IsCompressLUI && IsCompressAddSub)) { + BuildMI(MBB, II, DL, TII->get(RISCV::QC_E_ADDI), DestReg) + .addReg(SrcReg, getKillRegState(KillSrcReg)) + .addImm(Val) + .setMIFlag(Flag); + return; + } + } + // Try to split the offset across two ADDIs. We need to keep the intermediate // result aligned after each ADDI. We need to determine the maximum value we // can put in each ADDI. 
In the negative direction, we can use -2048 which is diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index eb3d5e553f1ef..cd725ca6166e2 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -45,6 +45,13 @@ class RISCVReg64 let SubRegIndices = [sub_32]; } +def sub_64 : SubRegIndex<64>; +class RISCVReg128 + : RISCVRegWithSubRegs { + let SubRegIndices = [sub_64]; +} + let FallbackRegAltNameIndex = NoRegAltName in def ABIRegAltName : RegAltNameIndex; @@ -412,6 +419,11 @@ let RegAltNameIndices = [ABIRegAltName] in { def F#Index#_D : RISCVReg64("F"#Index#"_F")>, DwarfRegAlias("F"#Index#"_H")>; } + + foreach Index = 0-31 in { + def F#Index#_Q : RISCVReg128("F"#Index#"_D")>, + DwarfRegAlias("F"#Index#"_H")>; + } } // The order of registers represents the preferred allocation sequence, @@ -462,6 +474,15 @@ def FPR64C : RISCVRegisterClass<[f64], 64, (add (sequence "F%u_D", 8, 9) )>; +def FPR128 : RISCVRegisterClass<[f128], 128, (add + (sequence "F%u_Q", 15, 10), + (sequence "F%u_Q", 0, 7), + (sequence "F%u_Q", 16, 17), + (sequence "F%u_Q", 28, 31), + (sequence "F%u_Q", 8, 9), + (sequence "F%u_Q", 18, 27) +)>; + //===----------------------------------------------------------------------===// // GPR Classes for "H/F/D in X" //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVSchedGenericOOO.td b/llvm/lib/Target/RISCV/RISCVSchedGenericOOO.td index be9c4ddf7cf48..6dd973bc1e83f 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedGenericOOO.td +++ b/llvm/lib/Target/RISCV/RISCVSchedGenericOOO.td @@ -492,7 +492,9 @@ def : ReadAdvance; //===----------------------------------------------------------------------===// // Unsupported extensions //===----------------------------------------------------------------------===// +defm : UnsupportedSchedQ; defm : UnsupportedSchedV; +defm : UnsupportedSchedZfaWithQ; defm : 
UnsupportedSchedZvk; defm : UnsupportedSchedSFB; defm : UnsupportedSchedXsfvcp; diff --git a/llvm/lib/Target/RISCV/RISCVSchedMIPSP8700.td b/llvm/lib/Target/RISCV/RISCVSchedMIPSP8700.td index a1127966e8417..8ba4cd0acdd6c 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedMIPSP8700.td +++ b/llvm/lib/Target/RISCV/RISCVSchedMIPSP8700.td @@ -263,6 +263,7 @@ def : ReadAdvance; def : ReadAdvance; // Unsupported extensions. +defm : UnsupportedSchedQ; defm : UnsupportedSchedV; defm : UnsupportedSchedZbc; defm : UnsupportedSchedZbs; diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/llvm/lib/Target/RISCV/RISCVSchedRocket.td index 1148581415380..4c4654ba2fc0f 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td @@ -250,6 +250,7 @@ def : ReadAdvance; //===----------------------------------------------------------------------===// // Unsupported extensions +defm : UnsupportedSchedQ; defm : UnsupportedSchedV; defm : UnsupportedSchedZabha; defm : UnsupportedSchedZba; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index f4d2073d3b52d..af64a871a9292 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -1300,6 +1300,7 @@ foreach mx = SchedMxList in { //===----------------------------------------------------------------------===// // Unsupported extensions +defm : UnsupportedSchedQ; defm : UnsupportedSchedZabha; defm : UnsupportedSchedZbc; defm : UnsupportedSchedZbkb; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td index 1ac05c9444725..370ea64699383 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td @@ -1231,6 +1231,7 @@ defm "" : LMULReadAdvance<"ReadVSM3MEV", 0>; //===----------------------------------------------------------------------===// // Unsupported extensions +defm : UnsupportedSchedQ; defm : 
UnsupportedSchedZabha; defm : UnsupportedSchedZbc; defm : UnsupportedSchedZbkb; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP500.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP500.td index ca116e0c54f3f..5933d73174f79 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP500.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP500.td @@ -348,6 +348,7 @@ def : ReadAdvance; //===----------------------------------------------------------------------===// // Unsupported extensions +defm : UnsupportedSchedQ; defm : UnsupportedSchedV; defm : UnsupportedSchedZabha; defm : UnsupportedSchedZbc; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td index 2bfd5ef811c7b..7c04d1c54473d 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td @@ -1487,6 +1487,7 @@ defm "" : LMULReadAdvance<"ReadVSM3MEV", 0>; //===----------------------------------------------------------------------===// // Unsupported extensions +defm : UnsupportedSchedQ; defm : UnsupportedSchedZabha; defm : UnsupportedSchedZbc; defm : UnsupportedSchedZbkb; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index c21ab969d12ac..8948694c420a0 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -342,6 +342,7 @@ def : ReadAdvance; //===----------------------------------------------------------------------===// // Unsupported extensions +defm : UnsupportedSchedQ; defm : UnsupportedSchedV; defm : UnsupportedSchedXsfvcp; defm : UnsupportedSchedZabha; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR345.td b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR345.td index e509abc9f922e..815c2da992a11 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR345.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR345.td @@ -199,6 +199,7 @@ multiclass SCR3_Unsupported : multiclass 
SCR4_SCR5_Unsupported : SCR_Unsupported, + UnsupportedSchedQ, UnsupportedSchedZfhmin; // Bypasses (none) diff --git a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR7.td b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR7.td index 4631474a945cb..decd578360753 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR7.td @@ -241,6 +241,7 @@ multiclass SCR7_Other { // Unsupported scheduling classes for SCR7. multiclass SCR7_Unsupported { + defm : UnsupportedSchedQ; defm : UnsupportedSchedSFB; defm : UnsupportedSchedV; defm : UnsupportedSchedXsfvcp; diff --git a/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td b/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td index 2afe02552974e..5322de100d0ad 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td +++ b/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td @@ -318,6 +318,7 @@ def : ReadAdvance; //===----------------------------------------------------------------------===// // Unsupported extensions +defm : UnsupportedSchedQ; defm : UnsupportedSchedV; defm : UnsupportedSchedXsfvcp; defm : UnsupportedSchedZabha; diff --git a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td index 16d192feafd29..3076a2ebb813d 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td +++ b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td @@ -306,6 +306,7 @@ def : ReadAdvance; //===----------------------------------------------------------------------===// // Unsupported extensions +defm : UnsupportedSchedQ; defm : UnsupportedSchedV; defm : UnsupportedSchedZfa; defm : UnsupportedSchedZfhmin; diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td index f5c17d85f629d..4d49ad4d6b317 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedule.td +++ b/llvm/lib/Target/RISCV/RISCVSchedule.td @@ -43,26 +43,33 @@ def WriteAtomicSTD : SchedWrite; // Atomic store double word def WriteFAdd16 : SchedWrite; // 16-bit floating 
point addition/subtraction def WriteFAdd32 : SchedWrite; // 32-bit floating point addition/subtraction def WriteFAdd64 : SchedWrite; // 64-bit floating point addition/subtraction +def WriteFAdd128 : SchedWrite; // 128-bit floating point addition/subtraction def WriteFMul16 : SchedWrite; // 16-bit floating point multiply def WriteFMul32 : SchedWrite; // 32-bit floating point multiply def WriteFMul64 : SchedWrite; // 64-bit floating point multiply +def WriteFMul128 : SchedWrite; // 128-bit floating point multiply def WriteFMA16 : SchedWrite; // 16-bit floating point fused multiply-add def WriteFMA32 : SchedWrite; // 32-bit floating point fused multiply-add def WriteFMA64 : SchedWrite; // 64-bit floating point fused multiply-add +def WriteFMA128 : SchedWrite; // 128-bit floating point fused multiply-add def WriteFDiv16 : SchedWrite; // 16-bit floating point divide def WriteFDiv32 : SchedWrite; // 32-bit floating point divide def WriteFDiv64 : SchedWrite; // 64-bit floating point divide +def WriteFDiv128 : SchedWrite; // 128-bit floating point divide def WriteFSqrt16 : SchedWrite; // 16-bit floating point sqrt def WriteFSqrt32 : SchedWrite; // 32-bit floating point sqrt def WriteFSqrt64 : SchedWrite; // 64-bit floating point sqrt +def WriteFSqrt128 : SchedWrite; // 128-bit floating point sqrt // Integer to float conversions def WriteFCvtI32ToF16 : SchedWrite; def WriteFCvtI32ToF32 : SchedWrite; def WriteFCvtI32ToF64 : SchedWrite; +def WriteFCvtI32ToF128 : SchedWrite; def WriteFCvtI64ToF16 : SchedWrite; // RV64I only def WriteFCvtI64ToF32 : SchedWrite; // RV64I only def WriteFCvtI64ToF64 : SchedWrite; // RV64I only +def WriteFCvtI64ToF128 : SchedWrite; // RV64I only // Float to integer conversions def WriteFCvtF16ToI32 : SchedWrite; @@ -71,6 +78,8 @@ def WriteFCvtF32ToI32 : SchedWrite; def WriteFCvtF32ToI64 : SchedWrite; // RV64I only def WriteFCvtF64ToI32 : SchedWrite; def WriteFCvtF64ToI64 : SchedWrite; // RV64I only +def WriteFCvtF128ToI32 : SchedWrite; +def 
WriteFCvtF128ToI64 : SchedWrite; // RV64I only // Float to float conversions def WriteFCvtF32ToF64 : SchedWrite; @@ -79,24 +88,33 @@ def WriteFCvtF16ToF32 : SchedWrite; def WriteFCvtF32ToF16 : SchedWrite; def WriteFCvtF16ToF64 : SchedWrite; def WriteFCvtF64ToF16 : SchedWrite; +def WriteFCvtF128ToF32 : SchedWrite; +def WriteFCvtF128ToF64 : SchedWrite; +def WriteFCvtF32ToF128 : SchedWrite; +def WriteFCvtF64ToF128 : SchedWrite; // Zfa fround instructions. def WriteFRoundF32 : SchedWrite; def WriteFRoundF64 : SchedWrite; def WriteFRoundF16 : SchedWrite; +def WriteFRoundF128 : SchedWrite; def WriteFClass16 : SchedWrite; // 16-bit floating point classify def WriteFClass32 : SchedWrite; // 32-bit floating point classify def WriteFClass64 : SchedWrite; // 64-bit floating point classify +def WriteFClass128 : SchedWrite; // 128-bit floating point classify def WriteFCmp16 : SchedWrite; // 16-bit floating point compare def WriteFCmp32 : SchedWrite; // 32-bit floating point compare def WriteFCmp64 : SchedWrite; // 64-bit floating point compare +def WriteFCmp128 : SchedWrite; // 128-bit floating point compare def WriteFSGNJ16 : SchedWrite; // 16-bit floating point sign-injection def WriteFSGNJ32 : SchedWrite; // 32-bit floating point sign-injection def WriteFSGNJ64 : SchedWrite; // 64-bit floating point sign-injection +def WriteFSGNJ128 : SchedWrite; // 128-bit floating point sign-injection def WriteFMinMax16 : SchedWrite; // 16-bit floating point min or max def WriteFMinMax32 : SchedWrite; // 32-bit floating point min or max def WriteFMinMax64 : SchedWrite; // 64-bit floating point min or max +def WriteFMinMax128 : SchedWrite; // 128-bit floating point min or max def WriteFMovF16ToI16 : SchedWrite; def WriteFMovI16ToF16 : SchedWrite; @@ -108,13 +126,16 @@ def WriteFMovI64ToF64 : SchedWrite; // RV64I only def WriteFLI16 : SchedWrite; // Floating point constant load def WriteFLI32 : SchedWrite; // Floating point constant load def WriteFLI64 : SchedWrite; // Floating point 
constant load +def WriteFLI128 : SchedWrite; // Floating point constant load def WriteFLD16 : SchedWrite; // Floating point sp load def WriteFLD32 : SchedWrite; // Floating point sp load def WriteFLD64 : SchedWrite; // Floating point dp load +def WriteFLD128 : SchedWrite; // Floating point qp load def WriteFST16 : SchedWrite; // Floating point sp store def WriteFST32 : SchedWrite; // Floating point sp store def WriteFST64 : SchedWrite; // Floating point dp store +def WriteFST128 : SchedWrite; // Floating point qp store // short forward branch for Bullet def WriteSFB : SchedWrite; @@ -156,42 +177,55 @@ def ReadAtomicSTD : SchedRead; // Atomic store double word def ReadFAdd16 : SchedRead; // 16-bit floating point addition/subtraction def ReadFAdd32 : SchedRead; // 32-bit floating point addition/subtraction def ReadFAdd64 : SchedRead; // 64-bit floating point addition/subtraction +def ReadFAdd128 : SchedRead; // 128-bit floating point addition/subtraction def ReadFMul16 : SchedRead; // 16-bit floating point multiply def ReadFMul32 : SchedRead; // 32-bit floating point multiply def ReadFMul64 : SchedRead; // 64-bit floating point multiply +def ReadFMul128 : SchedRead; // 128-bit floating point multiply def ReadFMA16 : SchedRead; // 16-bit floating point fused multiply-add def ReadFMA16Addend : SchedRead; // 16-bit floating point fused multiply-add (addend) def ReadFMA32 : SchedRead; // 32-bit floating point fused multiply-add def ReadFMA32Addend : SchedRead; // 32-bit floating point fused multiply-add (addend) def ReadFMA64 : SchedRead; // 64-bit floating point fused multiply-add def ReadFMA64Addend : SchedRead; // 64-bit floating point fused multiply-add (addend) +def ReadFMA128 : SchedRead; // 128-bit floating point fused multiply-add +def ReadFMA128Addend: SchedRead; // 128-bit floating point fused multiply-add (addend) def ReadFDiv16 : SchedRead; // 16-bit floating point divide def ReadFDiv32 : SchedRead; // 32-bit floating point divide def ReadFDiv64 : SchedRead; 
// 64-bit floating point divide +def ReadFDiv128 : SchedRead; // 128-bit floating point divide def ReadFSqrt16 : SchedRead; // 16-bit floating point sqrt def ReadFSqrt32 : SchedRead; // 32-bit floating point sqrt def ReadFSqrt64 : SchedRead; // 64-bit floating point sqrt +def ReadFSqrt128 : SchedRead; // 128-bit floating point sqrt def ReadFCmp16 : SchedRead; def ReadFCmp32 : SchedRead; def ReadFCmp64 : SchedRead; +def ReadFCmp128 : SchedRead; def ReadFSGNJ16 : SchedRead; def ReadFSGNJ32 : SchedRead; def ReadFSGNJ64 : SchedRead; +def ReadFSGNJ128 : SchedRead; def ReadFMinMax16 : SchedRead; def ReadFMinMax32 : SchedRead; def ReadFMinMax64 : SchedRead; +def ReadFMinMax128 : SchedRead; def ReadFCvtF16ToI32 : SchedRead; def ReadFCvtF16ToI64 : SchedRead; def ReadFCvtF32ToI32 : SchedRead; def ReadFCvtF32ToI64 : SchedRead; def ReadFCvtF64ToI32 : SchedRead; def ReadFCvtF64ToI64 : SchedRead; +def ReadFCvtF128ToI32 : SchedRead; +def ReadFCvtF128ToI64 : SchedRead; def ReadFCvtI32ToF16 : SchedRead; def ReadFCvtI32ToF32 : SchedRead; def ReadFCvtI32ToF64 : SchedRead; +def ReadFCvtI32ToF128 : SchedRead; def ReadFCvtI64ToF16 : SchedRead; def ReadFCvtI64ToF32 : SchedRead; def ReadFCvtI64ToF64 : SchedRead; +def ReadFCvtI64ToF128 : SchedRead; def ReadFMovF16ToI16 : SchedRead; def ReadFMovI16ToF16 : SchedRead; def ReadFMovF32ToI32 : SchedRead; @@ -204,12 +238,20 @@ def ReadFCvtF16ToF32 : SchedRead; def ReadFCvtF32ToF16 : SchedRead; def ReadFCvtF16ToF64 : SchedRead; def ReadFCvtF64ToF16 : SchedRead; +def ReadFCvtF128ToF32 : SchedRead; +def ReadFCvtF128ToF64 : SchedRead; +def ReadFCvtF32ToF128 : SchedRead; +def ReadFCvtF64ToF128 : SchedRead; + def ReadFRoundF16 : SchedRead; def ReadFRoundF32 : SchedRead; def ReadFRoundF64 : SchedRead; +def ReadFRoundF128 : SchedRead; + def ReadFClass16 : SchedRead; def ReadFClass32 : SchedRead; def ReadFClass64 : SchedRead; +def ReadFClass128 : SchedRead; // For CPUs that support Zfhmin, but not Zfh. 
multiclass UnsupportedSchedZfh { @@ -266,7 +308,50 @@ def : ReadAdvance; } // Unsupported = true } -multiclass UnsupportedSchedD { +multiclass UnsupportedSchedQ { +let Unsupported = true in { +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; + +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +} // Unsupported = true +} + +multiclass UnsupportedSchedD : UnsupportedSchedQ { let Unsupported = true in { def : WriteRes; def : WriteRes; @@ -360,18 +445,40 @@ def : ReadAdvance; } // Unsupported = true } -multiclass UnsupportedSchedZfa { +multiclass UnsupportedSchedZfaWithQ { +let Unsupported = true in { +def : WriteRes; +def : WriteRes; + +def : ReadAdvance; +} +} + +multiclass UnsupportedSchedZfaWithD : UnsupportedSchedZfaWithQ { let Unsupported = true in { -def : WriteRes; -def : WriteRes; def : WriteRes; -def : WriteRes; -def : WriteRes; def : WriteRes; -def : ReadAdvance; def : ReadAdvance; +} +} + +multiclass UnsupportedSchedZfaWithZfh { +let Unsupported = true in { +def : WriteRes; +def : WriteRes; + def : ReadAdvance; +} +} + +multiclass UnsupportedSchedZfa : UnsupportedSchedZfaWithD, + UnsupportedSchedZfaWithZfh { +let Unsupported = true in { +def : WriteRes; +def : WriteRes; + +def : ReadAdvance; } // Unsupported = true } diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index d11ce46bf78b5..15dd4d57727dd 100644 --- 
a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -37,6 +37,7 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include using namespace llvm; @@ -645,6 +646,12 @@ void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { OptimizationLevel Level) { LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated)); }); + + PB.registerVectorizerEndEPCallback( + [](FunctionPassManager &FPM, OptimizationLevel Level) { + if (Level.isOptimizingForSpeed()) + FPM.addPass(createFunctionToLoopPassAdaptor(EVLIndVarSimplifyPass())); + }); } yaml::MachineFunctionInfo * diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 9ed2ba274bc53..f7cbfa1546de6 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -83,8 +83,7 @@ struct OperandInfo { OperandInfo() = delete; static bool EMULAndEEWAreEqual(const OperandInfo &A, const OperandInfo &B) { - return A.Log2EEW == B.Log2EEW && A.EMUL->first == B.EMUL->first && - A.EMUL->second == B.EMUL->second; + return A.Log2EEW == B.Log2EEW && A.EMUL == B.EMUL; } static bool EEWAreEqual(const OperandInfo &A, const OperandInfo &B) { diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index b824b9aeda660..5991a9af6364d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -319,7 +319,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, buildOpDecorate(VRegs[i][0], MIRBuilder, SPIRV::Decoration::MaxByteOffset, {DerefBytes}); } - if (Arg.hasAttribute(Attribute::Alignment)) { + if (Arg.hasAttribute(Attribute::Alignment) && !ST->isVulkanEnv()) { auto Alignment = 
static_cast( Arg.getAttribute(Attribute::Alignment).getValueAsInt()); buildOpDecorate(VRegs[i][0], MIRBuilder, SPIRV::Decoration::Alignment, diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 22fc1ca2c4c2d..b336732ec4b64 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -1081,15 +1081,19 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( return; Value *Op0 = Ref->getOperand(0); Value *Op1 = Ref->getOperand(1); - Type *ElemTy0 = GR->findDeducedElementType(Op0); + bool Incomplete0 = isTodoType(Op0); + bool Incomplete1 = isTodoType(Op1); Type *ElemTy1 = GR->findDeducedElementType(Op1); + Type *ElemTy0 = (Incomplete0 && !Incomplete1 && ElemTy1) + ? nullptr + : GR->findDeducedElementType(Op0); if (ElemTy0) { KnownElemTy = ElemTy0; - Incomplete = isTodoType(Op0); + Incomplete = Incomplete0; Ops.push_back(std::make_pair(Op1, 1)); } else if (ElemTy1) { KnownElemTy = ElemTy1; - Incomplete = isTodoType(Op1); + Incomplete = Incomplete1; Ops.push_back(std::make_pair(Op0, 0)); } } else if (CallInst *CI = dyn_cast(I)) { @@ -1108,8 +1112,6 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( IRBuilder<> B(Ctx); for (auto &OpIt : Ops) { Value *Op = OpIt.first; - if (Op->use_empty()) - continue; if (AskOps && !AskOps->contains(Op)) continue; Type *AskTy = nullptr; diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 251828b6bc35b..ac397fc486e19 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -770,7 +770,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( if (IsConst && ST.isOpenCLEnv()) buildOpDecorate(Reg, MIRBuilder, SPIRV::Decoration::Constant, {}); - if (GVar && GVar->getAlign().valueOrOne().value() != 1) { + if (GVar && GVar->getAlign().valueOrOne().value() != 1 && !ST.isVulkanEnv()) { unsigned Alignment = 
(unsigned)GVar->getAlign().valueOrOne().value(); buildOpDecorate(Reg, MIRBuilder, SPIRV::Decoration::Alignment, {Alignment}); } @@ -799,6 +799,9 @@ static std::string GetSpirvImageTypeName(const SPIRVType *Type, const std::string &Prefix, SPIRVGlobalRegistry &GR); +// Returns a name based on the Type. Notes that this does not look at +// decorations, and will return the same string for two types that are the same +// except for decorations. static std::string buildSpirvTypeName(const SPIRVType *Type, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry &GR) { @@ -885,9 +888,9 @@ Register SPIRVGlobalRegistry::getOrCreateGlobalVariableWithBinding( Register VarReg = MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::iIDRegClass); - // TODO: The name should come from the llvm-ir, but how that name will be - // passed from the HLSL to the backend has not been decided. Using this place - // holder for now. + // TODO(138533): The name should come from the llvm-ir, but how that name will + // be passed from the HLSL to the backend has not been decided. Using this + // place holder for now. 
std::string Name = ("__resource_" + buildSpirvTypeName(VarType, MIRBuilder, *this) + "_" + Twine(Set) + "_" + Twine(Binding)) @@ -955,6 +958,8 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct( const StructType *Ty, MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier::AccessQualifier AccQual, bool ExplicitLayoutRequired, bool EmitIR) { + const SPIRVSubtarget &ST = + cast(MIRBuilder.getMF().getSubtarget()); SmallVector FieldTypes; constexpr unsigned MaxWordCount = UINT16_MAX; const size_t NumElements = Ty->getNumElements(); @@ -977,7 +982,7 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct( Register ResVReg = createTypeVReg(MIRBuilder); if (Ty->hasName()) buildOpName(ResVReg, Ty->getName(), MIRBuilder); - if (Ty->isPacked()) + if (Ty->isPacked() && !ST.isVulkanEnv()) buildOpDecorate(ResVReg, MIRBuilder, SPIRV::Decoration::CPacked, {}); SPIRVType *SPVType = @@ -1629,7 +1634,8 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVTypeByName( // Unable to recognize SPIRV type name return nullptr; - auto SpirvTy = getOrCreateSPIRVType(Ty, MIRBuilder, AQ, false, true); + const SPIRVType *SpirvTy = + getOrCreateSPIRVType(Ty, MIRBuilder, AQ, false, true); // Handle "type*" or "type* vector[N]". 
if (TypeStr.consume_front("*")) diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 3fcff3dd8f553..b6a2da6e2045d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -39,7 +39,7 @@ class SPIRVPreLegalizer : public MachineFunctionPass { } // namespace void SPIRVPreLegalizer::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addPreserved(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp index c96ee6b02491a..d378f2b0d1fff 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp @@ -187,8 +187,8 @@ void SPIRVPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); @@ -212,7 +212,8 @@ bool SPIRVPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const Function &F = MF.getFunction(); bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); - GISelValueTracking *VT = &getAnalysis().get(MF); + GISelValueTracking *VT = + &getAnalysis().get(MF); MachineDominatorTree *MDT = &getAnalysis().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, @@ -234,7 +235,7 @@ INITIALIZE_PASS_BEGIN(SPIRVPreLegalizerCombiner, DEBUG_TYPE, "Combine SPIRV machine instrs before legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysis) +INITIALIZE_PASS_DEPENDENCY(GISelValueTrackingAnalysisLegacy) INITIALIZE_PASS_END(SPIRVPreLegalizerCombiner, DEBUG_TYPE, "Combine SPIRV machine 
instrs before legalization", false, false) diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index b83aecffe779c..f8141c8daf6e7 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -2952,7 +2952,7 @@ static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG) LoadSDNode *LdNode = cast(Op.getNode()); assert(LdNode->getOffset().isUndef() && "Unexpected node type"); - Align Alignment = commonAlignment(LdNode->getOriginalAlign(), 8); + Align Alignment = commonAlignment(LdNode->getBaseAlign(), 8); SDValue Hi64 = DAG.getLoad(MVT::f64, dl, LdNode->getChain(), LdNode->getBasePtr(), @@ -3018,7 +3018,7 @@ static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) { StNode->getValue(), SubRegOdd); - Align Alignment = commonAlignment(StNode->getOriginalAlign(), 8); + Align Alignment = commonAlignment(StNode->getBaseAlign(), 8); SDValue OutChains[2]; OutChains[0] = @@ -3050,8 +3050,7 @@ static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) SDValue Val = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, St->getValue()); SDValue Chain = DAG.getStore( St->getChain(), dl, Val, St->getBasePtr(), St->getPointerInfo(), - St->getOriginalAlign(), St->getMemOperand()->getFlags(), - St->getAAInfo()); + St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo()); return Chain; } @@ -3537,9 +3536,8 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N, SDLoc dl(N); SDValue LoadRes = DAG.getExtLoad( Ld->getExtensionType(), dl, MVT::v2i32, Ld->getChain(), - Ld->getBasePtr(), Ld->getPointerInfo(), MVT::v2i32, - Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags(), - Ld->getAAInfo()); + Ld->getBasePtr(), Ld->getPointerInfo(), MVT::v2i32, Ld->getBaseAlign(), + Ld->getMemOperand()->getFlags(), Ld->getAAInfo()); SDValue Res = DAG.getNode(ISD::BITCAST, dl, MVT::i64, LoadRes); Results.push_back(Res); diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp 
b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index d24a94ab25ff3..4bef8ff9bbac1 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -139,7 +139,7 @@ void SystemZELFFrameLowering::orderFrameObjects( return ADensityCmp < BDensityCmp; return A.DPairCount * B.ObjectSize < B.DPairCount * A.ObjectSize; }; - std::stable_sort(SortingObjects.begin(), SortingObjects.end(), CmpD12); + llvm::stable_sort(SortingObjects, CmpD12); // Now modify the original list to represent the final order that // we want. diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index db8b2bdb62077..8eb2c84be9b60 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -6930,10 +6930,9 @@ SDValue SystemZTargetLowering::lowerLoadF16(SDValue Op, } else { LoadSDNode *Ld = cast(Op.getNode()); assert(EVT(RegVT) == Ld->getMemoryVT() && "Unhandled f16 load"); - NewLd = - DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i64, Ld->getChain(), - Ld->getBasePtr(), Ld->getPointerInfo(), MVT::i16, - Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); + NewLd = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i64, Ld->getChain(), + Ld->getBasePtr(), Ld->getPointerInfo(), MVT::i16, + Ld->getBaseAlign(), Ld->getMemOperand()->getFlags()); } SDValue F16Val = convertToF16(NewLd, DAG); return DAG.getMergeValues({F16Val, NewLd.getValue(1)}, DL); @@ -7949,7 +7948,7 @@ SDValue SystemZTargetLowering::combineLOAD( if (HiPart) { SDValue EltLoad = DAG.getLoad( HiPart->getValueType(0), DL, LD->getChain(), LD->getBasePtr(), - LD->getPointerInfo(), LD->getOriginalAlign(), + LD->getPointerInfo(), LD->getBaseAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo()); DCI.CombineTo(HiPart, EltLoad, true); @@ -7959,7 +7958,7 @@ SDValue SystemZTargetLowering::combineLOAD( SDValue EltLoad = DAG.getLoad( LoPart->getValueType(0), DL, LD->getChain(), 
DAG.getObjectPtrOffset(DL, LD->getBasePtr(), TypeSize::getFixed(8)), - LD->getPointerInfo().getWithOffset(8), LD->getOriginalAlign(), + LD->getPointerInfo().getWithOffset(8), LD->getBaseAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo()); DCI.CombineTo(LoPart, EltLoad, true); @@ -8109,7 +8108,7 @@ SDValue SystemZTargetLowering::combineSTORE( SDValue AddrSpaceCast = DAG.getAddrSpaceCast(DL, PtrVT, SN->getBasePtr(), SYSTEMZAS::PTR32, 0); return DAG.getStore(SN->getChain(), DL, SN->getValue(), AddrSpaceCast, - SN->getPointerInfo(), SN->getOriginalAlign(), + SN->getPointerInfo(), SN->getBaseAlign(), SN->getMemOperand()->getFlags(), SN->getAAInfo()); } } @@ -8183,17 +8182,14 @@ SDValue SystemZTargetLowering::combineSTORE( if ((MemVT == MVT::i128 && isI128MovedFromParts(Op1, LoPart, HiPart)) || (MemVT == MVT::f128 && isF128MovedFromParts(Op1, LoPart, HiPart))) { SDLoc DL(SN); - SDValue Chain0 = - DAG.getStore(SN->getChain(), DL, HiPart, SN->getBasePtr(), - SN->getPointerInfo(), SN->getOriginalAlign(), - SN->getMemOperand()->getFlags(), SN->getAAInfo()); - SDValue Chain1 = - DAG.getStore(SN->getChain(), DL, LoPart, - DAG.getObjectPtrOffset(DL, SN->getBasePtr(), - TypeSize::getFixed(8)), - SN->getPointerInfo().getWithOffset(8), - SN->getOriginalAlign(), - SN->getMemOperand()->getFlags(), SN->getAAInfo()); + SDValue Chain0 = DAG.getStore( + SN->getChain(), DL, HiPart, SN->getBasePtr(), SN->getPointerInfo(), + SN->getBaseAlign(), SN->getMemOperand()->getFlags(), SN->getAAInfo()); + SDValue Chain1 = DAG.getStore( + SN->getChain(), DL, LoPart, + DAG.getObjectPtrOffset(DL, SN->getBasePtr(), TypeSize::getFixed(8)), + SN->getPointerInfo().getWithOffset(8), SN->getBaseAlign(), + SN->getMemOperand()->getFlags(), SN->getAAInfo()); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain0, Chain1); } diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 642a9cff4853c..11193304a785d 100644 --- 
a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -124,6 +124,10 @@ class X86AsmParser : public MCTargetAsmParser { return Result; } + bool tokenIsStartOfStatement(AsmToken::TokenKind Token) override { + return Token == AsmToken::LCurly; + } + X86TargetStreamer &getTargetStreamer() { assert(getParser().getStreamer().getTargetStreamer() && "do not have a target streamer"); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 26da58a140331..6b4b4beb97ca5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7349,8 +7349,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, "Cannot merge volatile or atomic loads."); SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), LDBase->getOriginalAlign(), - MMOFlags); + LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, NewLd); @@ -7438,7 +7437,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode( X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(), - LDBase->getOriginalAlign(), MachineMemOperand::MOLoad); + LDBase->getBaseAlign(), MachineMemOperand::MOLoad); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, ResNode); @@ -13084,8 +13083,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // uops even if the original vector load is not eliminated. // Reduce the vector load and shuffle to a broadcasted scalar load. 
- LoadSDNode *Ld = cast(V); - SDValue BaseAddr = Ld->getOperand(1); + auto *Ld = cast(V); + SDValue BaseAddr = Ld->getBasePtr(); MVT SVT = VT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset"); @@ -25372,12 +25371,11 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL); SDValue Ch0 = DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(), - Store->getOriginalAlign(), - Store->getMemOperand()->getFlags()); - SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1, - Store->getPointerInfo().getWithOffset(HalfOffset), - Store->getOriginalAlign(), - Store->getMemOperand()->getFlags()); + Store->getBaseAlign(), Store->getMemOperand()->getFlags()); + SDValue Ch1 = + DAG.getStore(Store->getChain(), DL, Value1, Ptr1, + Store->getPointerInfo().getWithOffset(HalfOffset), + Store->getBaseAlign(), Store->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1); } @@ -25408,10 +25406,10 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, TypeSize::getFixed(Offset), DL); SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal, DAG.getVectorIdxConstant(i, DL)); - SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr, - Store->getPointerInfo().getWithOffset(Offset), - Store->getOriginalAlign(), - Store->getMemOperand()->getFlags()); + SDValue Ch = + DAG.getStore(Store->getChain(), DL, Scl, Ptr, + Store->getPointerInfo().getWithOffset(Offset), + Store->getBaseAlign(), Store->getMemOperand()->getFlags()); Stores.push_back(Ch); } return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); @@ -25444,7 +25442,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts)); return DAG.getStore(St->getChain(), dl, StoredVal, 
St->getBasePtr(), - St->getPointerInfo(), St->getOriginalAlign(), + St->getPointerInfo(), St->getBaseAlign(), St->getMemOperand()->getFlags()); } @@ -25487,7 +25485,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, DAG.getVectorIdxConstant(0, dl)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getOriginalAlign(), + St->getPointerInfo(), St->getBaseAlign(), St->getMemOperand()->getFlags()); } assert(Subtarget.hasSSE1() && "Expected SSE"); @@ -25522,7 +25520,7 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, "Expected AVX512F without AVX512DQI"); SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getOriginalAlign(), + Ld->getPointerInfo(), Ld->getBaseAlign(), Ld->getMemOperand()->getFlags()); // Replace chain users with the new chain. @@ -34740,7 +34738,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (Subtarget.hasSSE2()) { MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? 
MVT::i64 : MVT::f64; SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getOriginalAlign(), + Ld->getPointerInfo(), Ld->getBaseAlign(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); MVT VecVT = MVT::getVectorVT(LdVT, 2); @@ -39065,7 +39063,7 @@ static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT, - LN->getPointerInfo(), LN->getOriginalAlign(), + LN->getPointerInfo(), LN->getBaseAlign(), LN->getMemOperand()->getFlags()); } @@ -42178,7 +42176,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; SDValue BcastLd = DAG.getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16, - LN->getPointerInfo(), LN->getOriginalAlign(), + LN->getPointerInfo(), LN->getBaseAlign(), LN->getMemOperand()->getFlags()); DCI.CombineTo(N.getNode(), BcastLd); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); @@ -42223,8 +42221,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SDValue Ops[] = { LN->getChain(), Ptr }; SDValue BcastLd = DAG.getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16, - LN->getPointerInfo().getWithOffset(Offset), - LN->getOriginalAlign(), + LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(), LN->getMemOperand()->getFlags()); DCI.CombineTo(N.getNode(), BcastLd); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); @@ -42259,7 +42256,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; SDValue BcastLd = DAG.getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(), - LN->getPointerInfo(), LN->getOriginalAlign(), + LN->getPointerInfo(), LN->getBaseAlign(), 
LN->getMemOperand()->getFlags()); DCI.CombineTo(N.getNode(), BcastLd); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); @@ -44013,6 +44010,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::VZEXT_MOVL: // Variable blend. case X86ISD::BLENDV: + // Target unary shuffles: + case X86ISD::MOVDDUP: // Target unary shuffles by immediate: case X86ISD::PSHUFD: case X86ISD::PSHUFLW: @@ -46371,11 +46370,10 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits(); if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth && VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) { - SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(), - MemIntr->getBasePtr(), - MemIntr->getPointerInfo(), - MemIntr->getOriginalAlign(), - MemIntr->getMemOperand()->getFlags()); + SDValue Load = + DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(), + MemIntr->getPointerInfo(), MemIntr->getBaseAlign(), + MemIntr->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); return Load; } @@ -52949,12 +52947,11 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, NumElems / 2); SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(), - Ld->getOriginalAlign(), - Ld->getMemOperand()->getFlags()); - SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2, - Ld->getPointerInfo().getWithOffset(HalfOffset), - Ld->getOriginalAlign(), - Ld->getMemOperand()->getFlags()); + Ld->getBaseAlign(), Ld->getMemOperand()->getFlags()); + SDValue Load2 = + DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2, + Ld->getPointerInfo().getWithOffset(HalfOffset), + Ld->getBaseAlign(), Ld->getMemOperand()->getFlags()); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), Load2.getValue(1)); @@ -52970,8 +52967,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, EVT IntVT = 
EVT::getIntegerVT(*DAG.getContext(), NumElts); if (TLI.isTypeLegal(IntVT)) { SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), - Ld->getOriginalAlign(), + Ld->getPointerInfo(), Ld->getBaseAlign(), Ld->getMemOperand()->getFlags()); SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad); return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true); @@ -53013,7 +53009,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, SDValue Cast = DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0); return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast, - Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(), + Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags()); } } @@ -53079,8 +53075,7 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, } Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp)); - Alignment = commonAlignment(MaskedOp->getOriginalAlign(), - EltVT.getStoreSize()); + Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize()); return true; } @@ -53323,7 +53318,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, StoredVal = DAG.getBitcast(NewVT, StoredVal); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getOriginalAlign(), + St->getPointerInfo(), St->getBaseAlign(), St->getMemOperand()->getFlags()); } @@ -53335,9 +53330,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, SDValue Val = StoredVal.getOperand(0); // We must store zeros to the unused bits. 
Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1); - return DAG.getStore(St->getChain(), dl, Val, - St->getBasePtr(), St->getPointerInfo(), - St->getOriginalAlign(), + return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(), + St->getPointerInfo(), St->getBaseAlign(), St->getMemOperand()->getFlags()); } @@ -53350,7 +53344,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, Ops[0] = StoredVal; StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getOriginalAlign(), + St->getPointerInfo(), St->getBaseAlign(), St->getMemOperand()->getFlags()); } @@ -53372,19 +53366,16 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, SDValue Ch0 = DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(), - St->getOriginalAlign(), - St->getMemOperand()->getFlags()); - SDValue Ch1 = - DAG.getStore(St->getChain(), dl, Hi, Ptr1, - St->getPointerInfo().getWithOffset(4), - St->getOriginalAlign(), - St->getMemOperand()->getFlags()); + St->getBaseAlign(), St->getMemOperand()->getFlags()); + SDValue Ch1 = DAG.getStore( + St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4), + St->getBaseAlign(), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getOriginalAlign(), + St->getPointerInfo(), St->getBaseAlign(), St->getMemOperand()->getFlags()); } @@ -53406,7 +53397,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)), DAG.getConstant(SignMask, dl, IntVT)); return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(), - St->getPointerInfo(), St->getOriginalAlign(), + St->getPointerInfo(), St->getBaseAlign(), St->getMemOperand()->getFlags()); } } @@ -53531,8 
+53522,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0); return DAG.getTruncStore( St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT, - St->getOriginalAlign(), St->getMemOperand()->getFlags(), - St->getAAInfo()); + St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo()); } } @@ -53631,7 +53621,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, BitCast, OldExtract.getOperand(1)); return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), - St->getPointerInfo(), St->getOriginalAlign(), + St->getPointerInfo(), St->getBaseAlign(), St->getMemOperand()->getFlags()); } @@ -56708,13 +56698,15 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, SDValue Scale = GorS->getScale(); EVT IndexVT = Index.getValueType(); EVT IndexSVT = IndexVT.getVectorElementType(); + unsigned IndexWidth = Index.getScalarValueSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); if (DCI.isBeforeLegalize()) { - unsigned IndexWidth = Index.getScalarValueSizeInBits(); // Attempt to move shifted index into the address scale, allows further // index truncation below. - if (Index.getOpcode() == ISD::SHL && isa(Scale)) { + if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT && + isa(Scale)) { unsigned ScaleAmt = Scale->getAsZExtVal(); assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2"); unsigned Log2ScaleAmt = Log2_32(ScaleAmt); @@ -56774,8 +56766,6 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, } } - EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); - // Try to move splat adders from the index operand to the base // pointer operand. Taking care to multiply by the scale. We can only do // this when index element type is the same as the pointer type. 
@@ -56823,8 +56813,6 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, } if (DCI.isBeforeLegalizeOps()) { - unsigned IndexWidth = Index.getScalarValueSizeInBits(); - // Make sure the index is either i32 or i64 if (IndexWidth != 32 && IndexWidth != 64) { MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32; @@ -57142,7 +57130,7 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, std::pair Tmp = Subtarget.getTargetLowering()->BuildFILD( VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG); + Ld->getPointerInfo(), Ld->getBaseAlign(), DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second); return Tmp.first; } @@ -58406,9 +58394,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ((VT.is256BitVector() && (EltSizeInBits >= 32 || Subtarget.hasInt256())) || (VT.is512BitVector() && Subtarget.useAVX512Regs() && - (EltSizeInBits >= 32 || Subtarget.hasVBMI2())))) { - // TODO: Relax VBMI requirement for repeated shuffle ops - currently - // limited to targets that should always have good cross lane shuffles. 
+ (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) { SDValue Concat0 = CombineSubOperand(VT, Ops, 0); SDValue Concat1 = CombineSubOperand(VT, Ops, 1); if (Concat0 || Concat1 || @@ -59985,7 +59971,7 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { SDValue Load = DAG.getExtLoad( Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), - MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); + MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); return Load; } @@ -60317,11 +60303,10 @@ static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) { LoadSDNode *LN = cast(Src.getNode()); if (LN->isSimple()) { - SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), - LN->getBasePtr(), - LN->getPointerInfo(), - LN->getOriginalAlign(), - LN->getMemOperand()->getFlags()); + SDValue NewLd = + DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(), + LN->getPointerInfo(), LN->getBaseAlign(), + LN->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1)); return NewLd; } diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp index ad7bdcd112459..e76ddd4b648dc 100644 --- a/llvm/lib/TargetParser/RISCVISAInfo.cpp +++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp @@ -925,8 +925,9 @@ void RISCVISAInfo::updateImpliedLengths() { assert(FLen == 0 && MaxELenFp == 0 && MaxELen == 0 && MinVLen == 0 && "Expected lengths to be initialied to zero"); - // TODO: Handle q extension. 
- if (Exts.count("d")) + if (Exts.count("q")) + FLen = 128; + else if (Exts.count("d")) FLen = 64; else if (Exts.count("f")) FLen = 32; diff --git a/llvm/lib/TextAPI/InterfaceFile.cpp b/llvm/lib/TextAPI/InterfaceFile.cpp index e2253723659e2..49e6fa673fd88 100644 --- a/llvm/lib/TextAPI/InterfaceFile.cpp +++ b/llvm/lib/TextAPI/InterfaceFile.cpp @@ -102,8 +102,7 @@ void InterfaceFile::inlineLibrary(std::shared_ptr Library, if (Overwrite && It != Documents.end() && Reexport->getInstallName() == (*It)->getInstallName()) { - std::replace(Documents.begin(), Documents.end(), *It, - std::move(Reexport)); + llvm::replace(Documents, *It, std::move(Reexport)); return; } diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index 5375448d2d2e2..eea6dfba14e37 100644 --- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -30,6 +30,7 @@ class Lowerer : public coro::LowererBase { void lowerCoroPromise(CoroPromiseInst *Intrin); void lowerCoroDone(IntrinsicInst *II); void lowerCoroNoop(IntrinsicInst *II); + void hidePromiseAlloca(CoroIdInst *CoroId, CoroBeginInst *CoroBegin); public: Lowerer(Module &M) @@ -153,6 +154,28 @@ void Lowerer::lowerCoroNoop(IntrinsicInst *II) { II->eraseFromParent(); } +// Later middle-end passes will assume promise alloca dead after coroutine +// suspend, leading to misoptimizations. We hide promise alloca using +// coro.promise and will lower it back to alloca at CoroSplit. 
+void Lowerer::hidePromiseAlloca(CoroIdInst *CoroId, CoroBeginInst *CoroBegin) { + auto *PA = CoroId->getPromise(); + if (!PA || !CoroBegin) + return; + Builder.SetInsertPoint(*CoroBegin->getInsertionPointAfterDef()); + + auto *Alignment = Builder.getInt32(PA->getAlign().value()); + auto *FromPromise = Builder.getInt1(false); + SmallVector Arg{CoroBegin, Alignment, FromPromise}; + auto *PI = Builder.CreateIntrinsic( + Builder.getPtrTy(), Intrinsic::coro_promise, Arg, {}, "promise.addr"); + PI->setCannotDuplicate(); + PA->replaceUsesWithIf(PI, [CoroId](Use &U) { + bool IsBitcast = U == U.getUser()->stripPointerCasts(); + bool IsCoroId = U.getUser() == CoroId; + return !IsBitcast && !IsCoroId; + }); +} + // Prior to CoroSplit, calls to coro.begin needs to be marked as NoDuplicate, // as CoroSplit assumes there is exactly one coro.begin. After CoroSplit, // NoDuplicate attribute will be removed from coro.begin otherwise, it will @@ -165,6 +188,7 @@ static void setCannotDuplicate(CoroIdInst *CoroId) { void Lowerer::lowerEarlyIntrinsics(Function &F) { CoroIdInst *CoroId = nullptr; + CoroBeginInst *CoroBegin = nullptr; SmallVector CoroFrees; bool HasCoroSuspend = false; for (Instruction &I : llvm::make_early_inc_range(instructions(F))) { @@ -175,6 +199,13 @@ void Lowerer::lowerEarlyIntrinsics(Function &F) { switch (CB->getIntrinsicID()) { default: continue; + case Intrinsic::coro_begin: + case Intrinsic::coro_begin_custom_abi: + if (CoroBegin) + report_fatal_error( + "coroutine should have exactly one defining @llvm.coro.begin"); + CoroBegin = cast(&I); + break; case Intrinsic::coro_free: CoroFrees.push_back(cast(&I)); break; @@ -227,13 +258,16 @@ void Lowerer::lowerEarlyIntrinsics(Function &F) { } } - // Make sure that all CoroFree reference the coro.id intrinsic. - // Token type is not exposed through coroutine C/C++ builtins to plain C, so - // we allow specifying none and fixing it up here. 
- if (CoroId) + if (CoroId) { + // Make sure that all CoroFree reference the coro.id intrinsic. + // Token type is not exposed through coroutine C/C++ builtins to plain C, so + // we allow specifying none and fixing it up here. for (CoroFreeInst *CF : CoroFrees) CF->setArgOperand(0, CoroId); + hidePromiseAlloca(CoroId, CoroBegin); + } + // Coroutine suspention could potentially lead to any argument modified // outside of the function, hence arguments should not have noalias // attributes. diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 7b59c39283ded..02500ff778b80 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -192,7 +192,8 @@ static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin, // Collect "interesting" coroutine intrinsics. void coro::Shape::analyze(Function &F, SmallVectorImpl &CoroFrames, - SmallVectorImpl &UnusedCoroSaves) { + SmallVectorImpl &UnusedCoroSaves, + CoroPromiseInst *&CoroPromise) { clear(); bool HasFinalSuspend = false; @@ -286,6 +287,11 @@ void coro::Shape::analyze(Function &F, } } break; + case Intrinsic::coro_promise: + assert(CoroPromise == nullptr && + "CoroEarly must ensure coro.promise unique"); + CoroPromise = cast(II); + break; } } } @@ -477,7 +483,7 @@ void coro::AnyRetconABI::init() { void coro::Shape::cleanCoroutine( SmallVectorImpl &CoroFrames, - SmallVectorImpl &UnusedCoroSaves) { + SmallVectorImpl &UnusedCoroSaves, CoroPromiseInst *PI) { // The coro.frame intrinsic is always lowered to the result of coro.begin. for (CoroFrameInst *CF : CoroFrames) { CF->replaceAllUsesWith(CoroBegin); @@ -489,6 +495,13 @@ void coro::Shape::cleanCoroutine( for (CoroSaveInst *CoroSave : UnusedCoroSaves) CoroSave->eraseFromParent(); UnusedCoroSaves.clear(); + + if (PI) { + PI->replaceAllUsesWith(PI->isFromPromise() + ? 
cast(CoroBegin) + : cast(getPromiseAlloca())); + PI->eraseFromParent(); + } } static void propagateCallAttrsFromCallee(CallInst *Call, Function *Callee) { diff --git a/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp index 9cf4e448c9b6f..7ea7937d8b827 100644 --- a/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp @@ -91,8 +91,12 @@ PreservedAnalyses ForceFunctionAttrsPass::run(Module &M, bool Changed = false; if (!CSVFilePath.empty()) { auto BufferOrError = MemoryBuffer::getFileOrSTDIN(CSVFilePath); - if (!BufferOrError) - report_fatal_error("Cannot open CSV file."); + if (!BufferOrError) { + std::error_code EC = BufferOrError.getError(); + M.getContext().emitError("cannot open CSV file: " + EC.message()); + return PreservedAnalyses::all(); + } + StringRef Buffer = BufferOrError.get()->getBuffer(); auto MemoryBuffer = MemoryBuffer::getMemBuffer(Buffer); line_iterator It(*MemoryBuffer); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 4b2683dc6c2a7..5b4350845b726 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -1832,8 +1832,8 @@ void CallsiteContextGraph::updateStackNodes() { DenseMap FuncToIndex; for (const auto &[Idx, CallCtxInfo] : enumerate(Calls)) FuncToIndex.insert({CallCtxInfo.Func, Idx}); - std::stable_sort( - Calls.begin(), Calls.end(), + llvm::stable_sort( + Calls, [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) { return A.StackIds.size() > B.StackIds.size() || (A.StackIds.size() == B.StackIds.size() && @@ -3688,27 +3688,27 @@ void CallsiteContextGraph::identifyClones( const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4, /*Cold*/ 1, /*NotColdCold*/ 2}; - std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(), - [&](const std::shared_ptr &A, 
- const std::shared_ptr &B) { - // Nodes with non-empty context ids should be sorted before - // those with empty context ids. - if (A->ContextIds.empty()) - // Either B ContextIds are non-empty (in which case we - // should return false because B < A), or B ContextIds - // are empty, in which case they are equal, and we should - // maintain the original relative ordering. - return false; - if (B->ContextIds.empty()) - return true; - - if (A->AllocTypes == B->AllocTypes) - // Use the first context id for each edge as a - // tie-breaker. - return *A->ContextIds.begin() < *B->ContextIds.begin(); - return AllocTypeCloningPriority[A->AllocTypes] < - AllocTypeCloningPriority[B->AllocTypes]; - }); + llvm::stable_sort(Node->CallerEdges, + [&](const std::shared_ptr &A, + const std::shared_ptr &B) { + // Nodes with non-empty context ids should be sorted + // before those with empty context ids. + if (A->ContextIds.empty()) + // Either B ContextIds are non-empty (in which case we + // should return false because B < A), or B ContextIds + // are empty, in which case they are equal, and we + // should maintain the original relative ordering. + return false; + if (B->ContextIds.empty()) + return true; + + if (A->AllocTypes == B->AllocTypes) + // Use the first context id for each edge as a + // tie-breaker. + return *A->ContextIds.begin() < *B->ContextIds.begin(); + return AllocTypeCloningPriority[A->AllocTypes] < + AllocTypeCloningPriority[B->AllocTypes]; + }); assert(Node->AllocTypes != (uint8_t)AllocationType::None); @@ -4180,8 +4180,7 @@ void CallsiteContextGraph::mergeNodeCalleeClones( // their caller edge counts, putting the original non-clone node first in // cases of a tie. This simplifies finding an existing node to use as the // merge node. 
- std::stable_sort(CalleeEdges.begin(), CalleeEdges.end(), - CalleeCallerEdgeLessThan); + llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan); /// Find other callers of the given set of callee edges that can /// share the same callee merge node. See the comments at this method diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 4947a0da3bdb0..616eeae3b1fec 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -2151,8 +2151,8 @@ void SampleProfileLoader::removePseudoProbeInstsDiscriminator(Module &M) { std::optional DwarfDiscriminator = PseudoProbeDwarfDiscriminator::extractDwarfBaseDiscriminator( Discriminator); - I.setDebugLoc(DIL->cloneWithDiscriminator( - DwarfDiscriminator ? *DwarfDiscriminator : 0)); + I.setDebugLoc( + DIL->cloneWithDiscriminator(DwarfDiscriminator.value_or(0))); } } } diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 49c9515fa6a0b..aa527aec622bf 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -298,7 +298,9 @@ wholeprogramdevirt::findLowestOffset(ArrayRef Targets, ++Byte; } } - return (MinByte + I) * 8; + // Rounding up ensures the constant is always stored at address we + // can directly load from without misalignment. + return alignTo((MinByte + I) * 8, Size); NextI:; } } @@ -1834,9 +1836,19 @@ bool DevirtModule::tryVirtualConstProp( if (!RetType) return false; unsigned BitWidth = RetType->getBitWidth(); + + // TODO: Since we can evaluated these constants at compile-time, we can save + // some space by calculating the smallest range of values that all these + // constants can fit in, then only allocate enough space to fit those values. + // At each callsite, we can get the original type by doing a sign/zero + // extension. 
For example, if we would store an i64, but we can see that all + // the values fit into an i16, then we can store an i16 before/after the + // vtable and at each callsite do a s/zext. if (BitWidth > 64) return false; + Align TypeAlignment = M.getDataLayout().getPrefTypeAlign(RetType); + // Make sure that each function is defined, does not access memory, takes at // least one argument, does not use its first argument (which we assume is // 'this'), and has the same return type. @@ -1861,6 +1873,18 @@ bool DevirtModule::tryVirtualConstProp( Fn->arg_empty() || !Fn->arg_begin()->use_empty() || Fn->getReturnType() != RetType) return false; + + // This only works if the integer size is at most the alignment of the + // vtable. If the table is underaligned, then we can't guarantee that the + // constant will always be aligned to the integer type alignment. For + // example, if the table is `align 1`, we can never guarantee that an i32 + // stored before/after the vtable is 32-bit aligned without changing the + // alignment of the new global. + GlobalVariable *GV = Target.TM->Bits->GV; + Align TableAlignment = M.getDataLayout().getValueOrABITypeAlignment( + GV->getAlign(), GV->getValueType()); + if (TypeAlignment > TableAlignment) + return false; } for (auto &&CSByConstantArg : SlotInfo.ConstCSInfo) { @@ -1880,6 +1904,9 @@ bool DevirtModule::tryVirtualConstProp( // Find an allocation offset in bits in all vtables associated with the // type. + // TODO: If there would be "holes" in the vtable that were added by + // padding, we could place i1s there to reduce any extra padding that + // would be introduced by the i1s. 
uint64_t AllocBefore = findLowestOffset(TargetsForSlot, /*IsAfter=*/false, BitWidth); uint64_t AllocAfter = @@ -1911,6 +1938,14 @@ bool DevirtModule::tryVirtualConstProp( setAfterReturnValues(TargetsForSlot, AllocAfter, BitWidth, OffsetByte, OffsetBit); + // In an earlier check we forbade constant propagation from operating on + // tables whose alignment is less than the alignment needed for loading + // the constant. Thus, the address we take the offset from will always be + // aligned to at least this integer alignment. Now, we need to ensure that + // the offset is also aligned to this integer alignment to ensure we always + // have an aligned load. + assert(OffsetByte % TypeAlignment.value() == 0); + if (RemarksEnabled || AreStatisticsEnabled()) for (auto &&Target : TargetsForSlot) Target.WasDevirt = true; diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index a031d2e79c7f9..24026e310ad11 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -5663,15 +5663,14 @@ static bool combineInstructionsOverFunction( // Iterate while there is work to do. 
unsigned Iteration = 0; while (true) { - ++Iteration; - - if (Iteration > Opts.MaxIterations && !VerifyFixpoint) { + if (Iteration >= Opts.MaxIterations && !VerifyFixpoint) { LLVM_DEBUG(dbgs() << "\n\n[IC] Iteration limit #" << Opts.MaxIterations << " on " << F.getName() << " reached; stopping without verifying fixpoint\n"); break; } + ++Iteration; ++NumWorklistIterations; LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); diff --git a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h index 0dedd0207571b..3fa844eda21cf 100644 --- a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h +++ b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h @@ -42,6 +42,7 @@ enum class ARCRuntimeEntryPointKind { Autorelease, StoreStrong, RetainRV, + ClaimRV, UnsafeClaimRV, RetainAutorelease, RetainAutoreleaseRV, @@ -62,6 +63,7 @@ class ARCRuntimeEntryPoints { Autorelease = nullptr; StoreStrong = nullptr; RetainRV = nullptr; + ClaimRV = nullptr; UnsafeClaimRV = nullptr; RetainAutorelease = nullptr; RetainAutoreleaseRV = nullptr; @@ -87,6 +89,9 @@ class ARCRuntimeEntryPoints { case ARCRuntimeEntryPointKind::RetainRV: return getIntrinsicEntryPoint(RetainRV, Intrinsic::objc_retainAutoreleasedReturnValue); + case ARCRuntimeEntryPointKind::ClaimRV: + return getIntrinsicEntryPoint( + ClaimRV, Intrinsic::objc_claimAutoreleasedReturnValue); case ARCRuntimeEntryPointKind::UnsafeClaimRV: return getIntrinsicEntryPoint( UnsafeClaimRV, Intrinsic::objc_unsafeClaimAutoreleasedReturnValue); @@ -126,6 +131,9 @@ class ARCRuntimeEntryPoints { /// Declaration for objc_retainAutoreleasedReturnValue(). Function *RetainRV = nullptr; + /// Declaration for objc_claimAutoreleasedReturnValue(). + Function *ClaimRV = nullptr; + /// Declaration for objc_unsafeClaimAutoreleasedReturnValue(). 
Function *UnsafeClaimRV = nullptr; diff --git a/llvm/lib/Transforms/ObjCARC/CMakeLists.txt b/llvm/lib/Transforms/ObjCARC/CMakeLists.txt index 9d234cce5f880..80867dbc270d7 100644 --- a/llvm/lib/Transforms/ObjCARC/CMakeLists.txt +++ b/llvm/lib/Transforms/ObjCARC/CMakeLists.txt @@ -22,5 +22,6 @@ add_llvm_component_library(LLVMObjCARCOpts Analysis Core Support + TargetParser TransformUtils ) diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp index b6ade1c29a2b5..32e7092e80117 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp @@ -101,8 +101,37 @@ BundledRetainClaimRVs::~BundledRetainClaimRVs() { // can't be tail calls. if (auto *CI = dyn_cast(CB)) CI->setTailCallKind(CallInst::TCK_NoTail); + + // We can also do one final optimization: modify the bundle in the + // annotated call, to change the bundle operand from + // objc_retainAutoreleasedReturnValue + // to: + // objc_claimAutoreleasedReturnValue + // allowing the marker to be omitted from the bundle expansion later. + // + // Note that, confusingly, ClaimRV is semantically equivalent to RetainRV, + // and only differs in that it doesn't require the marker. + // The bundle provides the guarantee that we're emitting the ClaimRV call + // adjacent to the original call, and providing that guarantee is the + // only difference between ClaimRV and RetainRV. + // + // UnsafeClaimRV has a different RC contract entirely. + + // Find the clang.arc.attachedcall bundle, and rewrite its operand. + if (UseClaimRV) { + for (auto OBI : CB->bundle_op_infos()) { + auto OBU = CB->operandBundleFromBundleOpInfo(OBI); + if (OBU.getTagID() == LLVMContext::OB_clang_arc_attachedcall && + OBU.Inputs[0] == EP.get(ARCRuntimeEntryPointKind::RetainRV)) { + CB->setOperand(OBI.Begin, + EP.get(ARCRuntimeEntryPointKind::ClaimRV)); + break; + } + } + } } + // Erase the RV call we emitted earlier: it's already in the bundle. 
EraseInstruction(P.first); } diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/llvm/lib/Transforms/ObjCARC/ObjCARC.h index f4d7c92d499c1..d0bff00446aa0 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARC.h +++ b/llvm/lib/Transforms/ObjCARC/ObjCARC.h @@ -22,6 +22,7 @@ #ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H #define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H +#include "ARCRuntimeEntryPoints.h" #include "llvm/Analysis/ObjCARCAnalysisUtils.h" #include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/IR/EHPersonalities.h" @@ -104,7 +105,9 @@ CallInst *createCallInstWithColors( class BundledRetainClaimRVs { public: - BundledRetainClaimRVs(bool ContractPass) : ContractPass(ContractPass) {} + BundledRetainClaimRVs(ARCRuntimeEntryPoints &EP, bool ContractPass, + bool UseClaimRV) + : EP(EP), ContractPass(ContractPass), UseClaimRV(UseClaimRV) {} ~BundledRetainClaimRVs(); /// Insert a retainRV/claimRV call to the normal destination blocks of invokes @@ -155,7 +158,9 @@ class BundledRetainClaimRVs { /// A map of inserted retainRV/claimRV calls to annotated calls/invokes. 
DenseMap RVCalls; + ARCRuntimeEntryPoints &EP; bool ContractPass; + bool UseClaimRV; }; } // end namespace objcarc diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index e11748b2c9dbb..86d7e2f07c1d9 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -42,6 +42,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/ObjCARC.h" using namespace llvm; @@ -52,6 +53,11 @@ using namespace llvm::objcarc; STATISTIC(NumPeeps, "Number of calls peephole-optimized"); STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed"); +static cl::opt UseObjCClaimRV( + "arc-contract-use-objc-claim-rv", + cl::desc( + "Enable generation of calls to objc_claimAutoreleasedReturnValue")); + //===----------------------------------------------------------------------===// // Declarations //===----------------------------------------------------------------------===// @@ -74,6 +80,9 @@ class ObjCARCContract { /// A flag indicating whether this optimization pass should run. bool Run; + /// Whether objc_claimAutoreleasedReturnValue is available. + bool HasClaimRV = false; + /// The inline asm string to insert between calls and RetainRV calls to make /// the optimization work on targets which need it. const MDString *RVInstMarker; @@ -517,6 +526,39 @@ bool ObjCARCContract::tryToPeepholeInstruction( } } +/// Should we use objc_claimAutoreleasedReturnValue? +static bool useClaimRuntimeCall(Module &M) { + // Let the flag override our OS-based default. + if (UseObjCClaimRV != cl::BOU_UNSET) + return UseObjCClaimRV == cl::BOU_TRUE; + + Triple TT(M.getTargetTriple()); + + // On x86_64, claimARV doesn't make sense, as the marker isn't actually a nop + // there (it's needed by the calling convention). 
+ if (!TT.isAArch64()) + return false; + + unsigned Major = TT.getOSMajorVersion(); + switch (TT.getOS()) { + default: + return false; + case Triple::IOS: + case Triple::TvOS: + return Major >= 16; + case Triple::WatchOS: + return Major >= 9; + case Triple::BridgeOS: + return Major >= 7; + case Triple::MacOSX: + return Major >= 13; + case Triple::Darwin: + return Major >= 21; + } + + return false; +} + //===----------------------------------------------------------------------===// // Top Level Driver //===----------------------------------------------------------------------===// @@ -528,6 +570,8 @@ bool ObjCARCContract::init(Module &M) { EP.init(&M); + HasClaimRV = useClaimRuntimeCall(M); + // Initialize RVInstMarker. RVInstMarker = getRVInstMarker(M); @@ -545,7 +589,7 @@ bool ObjCARCContract::run(Function &F, AAResults *A, DominatorTree *D) { AA = A; DT = D; PA.setAA(A); - BundledRetainClaimRVs BRV(/*ContractPass=*/true); + BundledRetainClaimRVs BRV(EP, /*ContractPass=*/true, HasClaimRV); BundledInsts = &BRV; std::pair R = BundledInsts->insertAfterInvokes(F, DT); diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 2ef87f531dfae..5eb3f51d38945 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -2423,7 +2423,7 @@ bool ObjCARCOpt::run(Function &F, AAResults &AA) { return false; Changed = CFGChanged = false; - BundledRetainClaimRVs BRV(/*ContractPass=*/false); + BundledRetainClaimRVs BRV(EP, /*ContractPass=*/false, /*UseClaimRV=*/false); BundledInsts = &BRV; LLVM_DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName() diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index f2d6e268743eb..da5be383df15c 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -1563,8 +1563,12 @@ 
removeEntryFromStack(const StackEntry &E, ConstraintInfo &Info, static bool checkOrAndOpImpliedByOther( FactOrCheck &CB, ConstraintInfo &Info, Module *ReproducerModule, SmallVectorImpl &ReproducerCondStack, - SmallVectorImpl &DFSInStack) { + SmallVectorImpl &DFSInStack, + SmallVectorImpl &ToRemove) { Instruction *JoinOp = CB.getContextInst(); + if (JoinOp->use_empty()) + return false; + CmpInst *CmpToCheck = cast(CB.getInstructionToSimplify()); unsigned OtherOpIdx = JoinOp->getOperand(0) == CmpToCheck ? 1 : 0; @@ -1611,15 +1615,12 @@ static bool checkOrAndOpImpliedByOther( if (auto ImpliedCondition = checkCondition(CmpToCheck->getPredicate(), CmpToCheck->getOperand(0), CmpToCheck->getOperand(1), CmpToCheck, Info)) { - if (IsOr && isa(JoinOp)) { - JoinOp->setOperand( - OtherOpIdx == 0 ? 2 : 0, + if (IsOr == *ImpliedCondition) + JoinOp->replaceAllUsesWith( ConstantInt::getBool(JoinOp->getType(), *ImpliedCondition)); - } else - JoinOp->setOperand( - 1 - OtherOpIdx, - ConstantInt::getBool(JoinOp->getType(), *ImpliedCondition)); - + else + JoinOp->replaceAllUsesWith(JoinOp->getOperand(OtherOpIdx)); + ToRemove.push_back(JoinOp); return true; } @@ -1852,9 +1853,9 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, ReproducerModule.get(), ReproducerCondStack, S.DT, ToRemove); if (!Simplified && match(CB.getContextInst(), m_LogicalOp(m_Value(), m_Value()))) { - Simplified = - checkOrAndOpImpliedByOther(CB, Info, ReproducerModule.get(), - ReproducerCondStack, DFSInStack); + Simplified = checkOrAndOpImpliedByOther( + CB, Info, ReproducerModule.get(), ReproducerCondStack, DFSInStack, + ToRemove); } Changed |= Simplified; } else if (auto *MinMax = dyn_cast(Inst)) { diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 0087d037f8cf2..d3771c0903456 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -1123,14 
+1123,10 @@ static bool replaceIfSimplePointerUse(const TargetTransformInfo &TTI, static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, Value *NewV) { IRBuilder<> B(MI); - MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa); - MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope); - MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias); - if (auto *MSI = dyn_cast(MI)) { B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(), MSI->getDestAlign(), false, // isVolatile - TBAA, ScopeMD, NoAliasMD); + MI->getAAMetadata()); } else if (auto *MTI = dyn_cast(MI)) { Value *Src = MTI->getRawSource(); Value *Dest = MTI->getRawDest(); @@ -1143,23 +1139,22 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, Dest = NewV; if (auto *MCI = dyn_cast(MTI)) { - MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct); if (MCI->isForceInlined()) B.CreateMemCpyInline(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(), MTI->getLength(), false, // isVolatile - TBAA, TBAAStruct, ScopeMD, NoAliasMD); + MI->getAAMetadata()); else B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(), MTI->getLength(), false, // isVolatile - TBAA, TBAAStruct, ScopeMD, NoAliasMD); + MI->getAAMetadata()); } else { assert(isa(MTI)); B.CreateMemMove(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(), MTI->getLength(), false, // isVolatile - TBAA, ScopeMD, NoAliasMD); + MI->getAAMetadata()); } } else llvm_unreachable("unhandled MemIntrinsic"); diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 8f5d1ecba982d..f33c84c307ab8 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1026,14 +1026,6 @@ bool LoopIdiomRecognize::processLoopStridedStore( SmallPtrSetImpl &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) { Module *M = TheStore->getModule(); - Value 
*SplatValue = isBytewiseValue(StoredVal, *DL); - Constant *PatternValue = nullptr; - - if (!SplatValue) - PatternValue = getMemSetPatternValue(StoredVal, DL); - - assert((SplatValue || PatternValue) && - "Expected either splat value or pattern value."); // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the @@ -1095,9 +1087,6 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); - if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) - return Changed; - AAMDNodes AATags = TheStore->getAAMetadata(); for (Instruction *Store : Stores) AATags = AATags.merge(Store->getAAMetadata()); @@ -1107,12 +1096,11 @@ bool LoopIdiomRecognize::processLoopStridedStore( AATags = AATags.extendTo(-1); CallInst *NewCall; - if (SplatValue) { - NewCall = Builder.CreateMemSet( - BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment), - /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias); - } else { - assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)); + if (Value *SplatValue = isBytewiseValue(StoredVal, *DL)) { + NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, + MaybeAlign(StoreAlignment), + /*isVolatile=*/false, AATags); + } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; @@ -1123,23 +1111,18 @@ bool LoopIdiomRecognize::processLoopStridedStore( // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. 
+ Constant *PatternValue = getMemSetPatternValue(StoredVal, DL); + assert(PatternValue && "Expected pattern value."); GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern"); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. GV->setAlignment(Align(16)); - Value *PatternPtr = GV; - NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); - - // Set the TBAA info if present. - if (AATags.TBAA) - NewCall->setMetadata(LLVMContext::MD_tbaa, AATags.TBAA); - - if (AATags.Scope) - NewCall->setMetadata(LLVMContext::MD_alias_scope, AATags.Scope); - - if (AATags.NoAlias) - NewCall->setMetadata(LLVMContext::MD_noalias, AATags.NoAlias); + NewCall = Builder.CreateCall(MSP, {BasePtr, GV, NumBytes}); + NewCall->setAAMetadata(AATags); + } else { + // Neither a memset, nor memset_pattern16 + return Changed; } NewCall->setDebugLoc(TheStore->getDebugLoc()); @@ -1430,21 +1413,20 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( // by previous checks. if (!IsAtomic) { if (UseMemMove) - NewCall = Builder.CreateMemMove( - StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes, - /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias); + NewCall = Builder.CreateMemMove(StoreBasePtr, StoreAlign, LoadBasePtr, + LoadAlign, NumBytes, + /*isVolatile=*/false, AATags); else NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, - NumBytes, /*isVolatile=*/false, AATags.TBAA, - AATags.TBAAStruct, AATags.Scope, AATags.NoAlias); + NumBytes, /*isVolatile=*/false, AATags); } else { // Create the call. // Note that unordered atomic loads/stores are *required* by the spec to // have an alignment but non-atomic loads/stores may not. 
NewCall = Builder.CreateElementUnorderedAtomicMemCpy( StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize, - AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias); + AATags); } NewCall->setDebugLoc(TheStore->getDebugLoc()); diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index 0e0c012a9d676..5487dbef8a434 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -81,17 +81,28 @@ MetadataPredicate createIdentityMDPredicate(const Function &F, return [](const Metadata *MD) { return false; }; DISubprogram *SPClonedWithinModule = F.getSubprogram(); + + // Don't clone inlined subprograms. + auto ShouldKeep = [SPClonedWithinModule](const DISubprogram *SP) -> bool { + return SP != SPClonedWithinModule; + }; + return [=](const Metadata *MD) { // Avoid cloning types, compile units, and (other) subprograms. if (isa(MD) || isa(MD)) return true; if (auto *SP = dyn_cast(MD)) - return SP != SPClonedWithinModule; + return ShouldKeep(SP); // If a subprogram isn't going to be cloned skip its lexical blocks as well. if (auto *LScope = dyn_cast(MD)) - return LScope->getSubprogram() != SPClonedWithinModule; + return ShouldKeep(LScope->getSubprogram()); + + // Avoid cloning local variables of subprograms that won't be cloned. + if (auto *DV = dyn_cast(MD)) + if (auto *S = dyn_cast_or_null(DV->getScope())) + return ShouldKeep(S->getSubprogram()); return false; }; diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index de84a76ede7ff..511c15555fa83 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -290,8 +290,9 @@ Value *getAndroidSlotPtr(IRBuilder<> &IRB, int Slot) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); // Android provides a fixed TLS slot for sanitizers. 
See TLS_SLOT_SANITIZER // in Bionic's libc/private/bionic_tls.h. - Function *ThreadPointerFunc = - Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer); + Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::thread_pointer, + IRB.getPtrTy(M->getDataLayout().getDefaultGlobalsAddressSpace())); return IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), 8 * Slot); } diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 0dc6a7d2f594f..2b5488b2e8126 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_component_library(LLVMVectorize + EVLIndVarSimplify.cpp LoadStoreVectorizer.cpp LoopIdiomVectorize.cpp LoopVectorizationLegality.cpp diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp new file mode 100644 index 0000000000000..4a1fb095bae35 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp @@ -0,0 +1,301 @@ +//===---- EVLIndVarSimplify.cpp - Optimize vectorized loops w/ EVL IV------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass optimizes a vectorized loop with canonical IV to using EVL-based +// IV if it was tail-folded by predicated EVL. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" + +#define DEBUG_TYPE "evl-iv-simplify" + +using namespace llvm; + +STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated"); + +static cl::opt EnableEVLIndVarSimplify( + "enable-evl-indvar-simplify", + cl::desc("Enable EVL-based induction variable simplify Pass"), cl::Hidden, + cl::init(true)); + +namespace { +struct EVLIndVarSimplifyImpl { + ScalarEvolution &SE; + OptimizationRemarkEmitter *ORE = nullptr; + + EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR, + OptimizationRemarkEmitter *ORE) + : SE(LAR.SE), ORE(ORE) {} + + /// Returns true if modify the loop. + bool run(Loop &L); +}; +} // anonymous namespace + +/// Returns the constant part of vectorization factor from the induction +/// variable's step value SCEV expression. +static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) { + if (!Step) + return 0U; + + // Looking for loops with IV step value in the form of `( x + // vscale)`. 
+ if (const auto *Mul = dyn_cast(Step)) { + if (Mul->getNumOperands() == 2) { + const SCEV *LHS = Mul->getOperand(0); + const SCEV *RHS = Mul->getOperand(1); + if (const auto *Const = dyn_cast(LHS); + Const && isa(RHS)) { + uint64_t V = Const->getAPInt().getLimitedValue(); + if (llvm::isUInt<32>(V)) + return V; + } + } + } + + // If not, see if the vscale_range of the parent function is a fixed value, + // which makes the step value to be replaced by a constant. + if (F.hasFnAttribute(Attribute::VScaleRange)) + if (const auto *ConstStep = dyn_cast(Step)) { + APInt V = ConstStep->getAPInt().abs(); + ConstantRange CR = llvm::getVScaleRange(&F, 64); + if (const APInt *Fixed = CR.getSingleElement()) { + V = V.zextOrTrunc(Fixed->getBitWidth()); + uint64_t VF = V.udiv(*Fixed).getLimitedValue(); + if (VF && llvm::isUInt<32>(VF) && + // Make sure step is divisible by vscale. + V.urem(*Fixed).isZero()) + return VF; + } + } + + return 0U; +} + +bool EVLIndVarSimplifyImpl::run(Loop &L) { + if (!EnableEVLIndVarSimplify) + return false; + + if (!getBooleanLoopAttribute(&L, "llvm.loop.isvectorized")) + return false; + const MDOperand *EVLMD = + findStringMetadataForLoop(&L, "llvm.loop.isvectorized.tailfoldingstyle") + .value_or(nullptr); + if (!EVLMD || !EVLMD->equalsStr("evl")) + return false; + + BasicBlock *LatchBlock = L.getLoopLatch(); + ICmpInst *OrigLatchCmp = L.getLatchCmpInst(); + if (!LatchBlock || !OrigLatchCmp) + return false; + + InductionDescriptor IVD; + PHINode *IndVar = L.getInductionVariable(SE); + if (!IndVar || !L.getInductionDescriptor(SE, IVD)) { + const char *Reason = (IndVar ? 
"induction descriptor is not available" + : "cannot recognize induction variable"); + LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName() + << " because" << Reason << "\n"); + if (ORE) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar", + L.getStartLoc(), L.getHeader()) + << "Cannot retrieve IV because " << ore::NV("Reason", Reason); + }); + } + return false; + } + + BasicBlock *InitBlock, *BackEdgeBlock; + if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) { + LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in " + << L.getName() << "\n"); + if (ORE) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure", + L.getStartLoc(), L.getHeader()) + << "Does not have a unique incoming and backedge"; + }); + } + return false; + } + + // Retrieve the loop bounds. + std::optional Bounds = L.getBounds(SE); + if (!Bounds) { + LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName() + << "\n"); + if (ORE) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure", + L.getStartLoc(), L.getHeader()) + << "Could not obtain the loop bounds"; + }); + } + return false; + } + Value *CanonicalIVInit = &Bounds->getInitialIVValue(); + Value *CanonicalIVFinal = &Bounds->getFinalIVValue(); + + const SCEV *StepV = IVD.getStep(); + uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent()); + if (!VF) { + LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV + << "'\n"); + if (ORE) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar", + L.getStartLoc(), L.getHeader()) + << "Could not infer VF from IndVar step " + << ore::NV("Step", StepV); + }); + } + return false; + } + LLVM_DEBUG(dbgs() << "Using VF=" << VF << " for loop " << L.getName() + << "\n"); + + // Try to find the EVL-based induction variable. 
+ using namespace PatternMatch; + BasicBlock *BB = IndVar->getParent(); + + Value *EVLIndVar = nullptr; + Value *RemTC = nullptr; + Value *TC = nullptr; + auto IntrinsicMatch = m_Intrinsic( + m_Value(RemTC), m_SpecificInt(VF), + /*Scalable=*/m_SpecificInt(1)); + for (PHINode &PN : BB->phis()) { + if (&PN == IndVar) + continue; + + // Check 1: it has to contain both incoming (init) & backedge blocks + // from IndVar. + if (PN.getBasicBlockIndex(InitBlock) < 0 || + PN.getBasicBlockIndex(BackEdgeBlock) < 0) + continue; + // Check 2: EVL index is always increasing, thus its inital value has to be + // equal to either the initial IV value (when the canonical IV is also + // increasing) or the last IV value (when canonical IV is decreasing). + Value *Init = PN.getIncomingValueForBlock(InitBlock); + using Direction = Loop::LoopBounds::Direction; + switch (Bounds->getDirection()) { + case Direction::Increasing: + if (Init != CanonicalIVInit) + continue; + break; + case Direction::Decreasing: + if (Init != CanonicalIVFinal) + continue; + break; + case Direction::Unknown: + // To be more permissive and see if either the initial or final IV value + // matches PN's init value. + if (Init != CanonicalIVInit && Init != CanonicalIVFinal) + continue; + break; + } + Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock); + assert(RecValue && "expect recurrent IndVar value"); + + LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN + << "\n"); + + // Check 3: Pattern match to find the EVL-based index and total trip count + // (TC). 
+ if (match(RecValue, + m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) && + match(RemTC, m_Sub(m_Value(TC), m_Specific(&PN)))) { + EVLIndVar = RecValue; + break; + } + } + + if (!EVLIndVar || !TC) + return false; + + LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n"); + if (ORE) { + ORE->emit([&]() { + DebugLoc DL; + BasicBlock *Region = nullptr; + if (auto *I = dyn_cast(EVLIndVar)) { + DL = I->getDebugLoc(); + Region = I->getParent(); + } else { + DL = L.getStartLoc(); + Region = L.getHeader(); + } + return OptimizationRemark(DEBUG_TYPE, "UseEVLIndVar", DL, Region) + << "Using " << ore::NV("EVLIndVar", EVLIndVar) + << " for EVL-based IndVar"; + }); + } + + // Create an EVL-based comparison and replace the branch to use it as + // predicate. + + // Loop::getLatchCmpInst check at the beginning of this function has ensured + // that latch block ends in a conditional branch. + auto *LatchBranch = cast(LatchBlock->getTerminator()); + assert(LatchBranch->isConditional() && + "expect the loop latch to be ended with a conditional branch"); + ICmpInst::Predicate Pred; + if (LatchBranch->getSuccessor(0) == L.getHeader()) + Pred = ICmpInst::ICMP_NE; + else + Pred = ICmpInst::ICMP_EQ; + + IRBuilder<> Builder(OrigLatchCmp); + auto *NewLatchCmp = Builder.CreateICmp(Pred, EVLIndVar, TC); + OrigLatchCmp->replaceAllUsesWith(NewLatchCmp); + + // llvm::RecursivelyDeleteDeadPHINode only deletes cycles whose values are + // not used outside the cycles. However, in this case the now-RAUW-ed + // OrigLatchCmp will be considered a use outside the cycle while in reality + // it's practically dead. Thus we need to remove it before calling + // RecursivelyDeleteDeadPHINode. 
+ (void)RecursivelyDeleteTriviallyDeadInstructions(OrigLatchCmp); + if (llvm::RecursivelyDeleteDeadPHINode(IndVar)) + LLVM_DEBUG(dbgs() << "Removed original IndVar\n"); + + ++NumEliminatedCanonicalIV; + + return true; +} + +PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM, + LoopStandardAnalysisResults &AR, + LPMUpdater &U) { + Function &F = *L.getHeader()->getParent(); + auto &FAMProxy = LAM.getResult(L, AR); + OptimizationRemarkEmitter *ORE = + FAMProxy.getCachedResult(F); + + if (EVLIndVarSimplifyImpl(AR, ORE).run(L)) + return PreservedAnalyses::allInSet(); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d04fea5d9b0ac..d6af8a1435d07 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -603,13 +603,13 @@ class InnerLoopVectorizer { // --- Vectorization state --- /// The vector-loop preheader. - BasicBlock *LoopVectorPreHeader; + BasicBlock *LoopVectorPreHeader = nullptr; /// The scalar-loop preheader. - BasicBlock *LoopScalarPreHeader; + BasicBlock *LoopScalarPreHeader = nullptr; /// Middle Block between the vector and the scalar. - BasicBlock *LoopMiddleBlock; + BasicBlock *LoopMiddleBlock = nullptr; /// A list of all bypass blocks. The first block is the entry of the loop. SmallVector LoopBypassBlocks; @@ -8539,36 +8539,29 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( return nullptr; } -VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, - ArrayRef Operands) { - unsigned NumIncoming = Phi->getNumIncomingValues(); - +VPBlendRecipe *VPRecipeBuilder::tryToBlend(VPWidenPHIRecipe *PhiR) { // We know that all PHIs in non-header blocks are converted into selects, so // we don't have to worry about the insertion order and we can just use the // builder. At this point we generate the predication tree. 
There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. - // Map incoming IR BasicBlocks to incoming VPValues, for lookup below. - // TODO: Add operands and masks in order from the VPlan predecessors. - DenseMap VPIncomingValues; - for (const auto &[Idx, Pred] : enumerate(predecessors(Phi->getParent()))) - VPIncomingValues[Pred] = Operands[Idx]; - + unsigned NumIncoming = PhiR->getNumIncoming(); SmallVector OperandsWithMask; for (unsigned In = 0; In < NumIncoming; In++) { - BasicBlock *Pred = Phi->getIncomingBlock(In); - OperandsWithMask.push_back(VPIncomingValues.lookup(Pred)); - VPValue *EdgeMask = getEdgeMask(Pred, Phi->getParent()); + OperandsWithMask.push_back(PhiR->getIncomingValue(In)); + const VPBasicBlock *Pred = PhiR->getIncomingBlock(In); + VPValue *EdgeMask = getEdgeMask(Pred, PhiR->getParent()); if (!EdgeMask) { assert(In == 0 && "Both null and non-null edge masks found"); - assert(all_equal(Operands) && + assert(all_equal(PhiR->operands()) && "Distinct incoming values with one having a full mask"); break; } OperandsWithMask.push_back(EdgeMask); } - return new VPBlendRecipe(Phi, OperandsWithMask); + return new VPBlendRecipe(cast(PhiR->getUnderlyingInstr()), + OperandsWithMask); } VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, @@ -8955,15 +8948,21 @@ bool VPRecipeBuilder::getScaledReductions( return false; } -VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe( - Instruction *Instr, ArrayRef Operands, VFRange &Range) { +VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, + VFRange &Range) { // First, check for specific widening recipes that deal with inductions, Phi // nodes, calls and memory operations. 
VPRecipeBase *Recipe; - if (auto *Phi = dyn_cast(Instr)) { - if (Phi->getParent() != OrigLoop->getHeader()) - return tryToBlend(Phi, Operands); - + Instruction *Instr = R->getUnderlyingInstr(); + SmallVector Operands(R->operands()); + if (auto *PhiR = dyn_cast(R)) { + VPBasicBlock *Parent = PhiR->getParent(); + VPRegionBlock *LoopRegionOf = Parent->getEnclosingLoopRegion(); + // Handle phis in non-header blocks. + if (!LoopRegionOf || LoopRegionOf->getEntry() != Parent) + return tryToBlend(PhiR); + + auto *Phi = cast(R->getUnderlyingInstr()); assert(Operands.size() == 2 && "Must have 2 operands for header phis"); if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) return Recipe; @@ -9528,11 +9527,12 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, continue; } - SmallVector Operands(R.operands()); VPRecipeBase *Recipe = - RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range); - if (!Recipe) + RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range); + if (!Recipe) { + SmallVector Operands(R.operands()); Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range); + } RecipeBuilder.setRecipe(Instr, Recipe); if (isa(Recipe) && isa(Instr)) { @@ -9568,10 +9568,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); - for (ElementCount VF : Range) - Plan->addVF(VF); - Plan->setName("Initial VPlan"); - // Update wide induction increments to use the same step as the corresponding // wide induction. This enables detecting induction increments directly in // VPlan and removes redundant splats. @@ -9601,6 +9597,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range, // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + // Transform recipes to abstract recipes if it is legal and beneficial and + // clamp the range for better cost estimation. 
+ // TODO: Enable following transform when the EVL-version of extended-reduction + // and mulacc-reduction are implemented. + if (!CM.foldTailWithEVL()) { + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, + CM.CostKind); + VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, + CostCtx, Range); + } + + for (ElementCount VF : Range) + Plan->addVF(VF); + Plan->setName("Initial VPlan"); + // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a // single VPInterleaveRecipe at its insertion point. diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 45cf4e1eac092..19d19946ea23c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1584,7 +1584,7 @@ static void addMask(SmallVectorImpl &Mask, ArrayRef SubMask, /// before: 6 9 5 4 9 2 1 0 /// after: 6 3 5 4 7 2 1 0 static void fixupOrderingIndices(MutableArrayRef Order) { - const unsigned Sz = Order.size(); + const size_t Sz = Order.size(); SmallBitVector UnusedIndices(Sz, /*t=*/true); SmallBitVector MaskedIndices(Sz); for (unsigned I = 0; I < Sz; ++I) { @@ -2216,7 +2216,7 @@ class BoUpSLP { !LI2->isSimple()) return CheckSameEntryOrFail(); - std::optional Dist = getPointersDiff( + std::optional Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); if (!Dist || *Dist == 0) { @@ -3619,9 +3619,10 @@ class BoUpSLP { /// vector loads/masked gathers instead of regular gathers. Later these loads /// are reshufled to build final gathered nodes. void tryToVectorizeGatheredLoads( - const SmallMapVector, - SmallVector>>, - 8> &GatheredLoads); + const SmallMapVector< + std::tuple, + SmallVector>>, 8> + &GatheredLoads); /// Helper for `findExternalStoreUsersReorderIndices()`. 
It iterates over the /// users of \p TE and collects the stores. It returns the map from the store @@ -4315,7 +4316,7 @@ class BoUpSLP { /// bundle being the last instruction in the program order during /// vectorization process since the basic blocks are affected, need to /// pre-gather them before. - DenseMap EntryToLastInstruction; + SmallDenseMap EntryToLastInstruction; /// List of gather nodes, depending on other gather/vector nodes, which should /// be emitted after the vector instruction emission process to correctly @@ -5368,7 +5369,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE, // patterns. SmallVector GatheredScalars(TE.Scalars.begin(), TE.Scalars.end()); Type *ScalarTy = GatheredScalars.front()->getType(); - int NumScalars = GatheredScalars.size(); + size_t NumScalars = GatheredScalars.size(); if (!isValidElementType(ScalarTy)) return std::nullopt; auto *VecTy = getWidenedType(ScalarTy, NumScalars); @@ -5442,7 +5443,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE, unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I); MutableArrayRef Slice = CurrentOrder.slice(I * PartSz, Limit); // Shuffle of at least 2 vectors - ignore. 
- if (any_of(Slice, [&](int I) { return I != NumScalars; })) { + if (any_of(Slice, [&](unsigned I) { return I != NumScalars; })) { std::fill(Slice.begin(), Slice.end(), NumScalars); ShuffledSubMasks.set(I); continue; @@ -5540,8 +5541,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE, return std::max(Entries[I].front()->getVectorFactor(), Entries[I].back()->getVectorFactor()); }); - int NumUndefs = - count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; }); + unsigned NumUndefs = + count_if(CurrentOrder, [&](unsigned Idx) { return Idx == NumScalars; }); if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2)) return std::nullopt; return std::move(CurrentOrder); @@ -5868,7 +5869,11 @@ static bool buildCompressMask(ArrayRef PointerOps, Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()]; for (unsigned I : seq(1, Sz)) { Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]]; - unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE); + std::optional OptPos = + getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE); + if (!OptPos || OptPos > std::numeric_limits::max()) + return false; + unsigned Pos = static_cast(*OptPos); CompressMask[I] = Pos; if (!Stride) continue; @@ -5894,7 +5899,7 @@ static bool isMaskedLoadCompress( VectorType *&LoadVecTy) { InterleaveFactor = 0; Type *ScalarTy = VL.front()->getType(); - const unsigned Sz = VL.size(); + const size_t Sz = VL.size(); auto *VecTy = getWidenedType(ScalarTy, Sz); constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; SmallVector Mask; @@ -5921,11 +5926,11 @@ static bool isMaskedLoadCompress( Ptr0 = PointerOps[Order.front()]; PtrN = PointerOps[Order.back()]; } - std::optional Diff = + std::optional Diff = getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); if (!Diff) return false; - const unsigned MaxRegSize = + const size_t MaxRegSize = TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 
.getFixedValue(); // Check for very large distances between elements. @@ -6051,9 +6056,10 @@ static bool isStridedLoad(ArrayRef VL, ArrayRef PointerOps, ArrayRef Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, - const bool IsAnyPointerUsedOutGraph, const int Diff) { - const unsigned Sz = VL.size(); - const unsigned AbsoluteDiff = std::abs(Diff); + const bool IsAnyPointerUsedOutGraph, + const int64_t Diff) { + const size_t Sz = VL.size(); + const uint64_t AbsoluteDiff = std::abs(Diff); Type *ScalarTy = VL.front()->getType(); auto *VecTy = getWidenedType(ScalarTy, Sz); if (IsAnyPointerUsedOutGraph || @@ -6061,9 +6067,9 @@ static bool isStridedLoad(ArrayRef VL, ArrayRef PointerOps, (Sz > MinProfitableStridedLoads || (AbsoluteDiff <= MaxProfitableLoadStride * Sz && AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) || - Diff == -(static_cast(Sz) - 1)) { - int Stride = Diff / static_cast(Sz - 1); - if (Diff != Stride * static_cast(Sz - 1)) + Diff == -(static_cast(Sz) - 1)) { + int64_t Stride = Diff / static_cast(Sz - 1); + if (Diff != Stride * static_cast(Sz - 1)) return false; Align Alignment = cast(Order.empty() ? VL.front() : VL[Order.front()]) @@ -6081,9 +6087,9 @@ static bool isStridedLoad(ArrayRef VL, ArrayRef PointerOps, } // Iterate through all pointers and check if all distances are // unique multiple of Dist. - SmallSet Dists; + SmallSet Dists; for (Value *Ptr : PointerOps) { - int Dist = 0; + int64_t Dist = 0; if (Ptr == PtrN) Dist = Diff; else if (Ptr != Ptr0) @@ -6122,7 +6128,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. 
PointerOps.clear(); - const unsigned Sz = VL.size(); + const size_t Sz = VL.size(); PointerOps.resize(Sz); auto *POIter = PointerOps.begin(); for (Value *V : VL) { @@ -6165,10 +6171,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, Ptr0 = PointerOps[Order.front()]; PtrN = PointerOps[Order.back()]; } - std::optional Diff = + std::optional Diff = getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); // Check that the sorted loads are consecutive. - if (static_cast(*Diff) == Sz - 1) + if (static_cast(*Diff) == Sz - 1) return LoadsState::Vectorize; if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI, [&](Value *V) { @@ -6427,8 +6433,9 @@ static bool clusterSortPtrAccesses(ArrayRef VL, // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each // Ptr into, sort and return the sorted indices with values next to one // another. - SmallMapVector, - SmallVector>>, 8> + SmallMapVector< + std::pair, + SmallVector>>, 8> Bases; Bases .try_emplace(std::make_pair( @@ -6441,10 +6448,10 @@ static bool clusterSortPtrAccesses(ArrayRef VL, getUnderlyingObject(Ptr, RecursionMaxDepth)); bool Found = any_of(Bases.try_emplace(Key).first->second, [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) { - std::optional Diff = getPointersDiff( - ElemTy, std::get<0>(Base.front()), ElemTy, - Ptr, DL, SE, - /*StrictCheck=*/true); + std::optional Diff = + getPointersDiff(ElemTy, std::get<0>(Base.front()), + ElemTy, Ptr, DL, SE, + /*StrictCheck=*/true); if (!Diff) return false; @@ -6494,10 +6501,11 @@ static bool clusterSortPtrAccesses(ArrayRef VL, for (auto &Vec : Base.second) { if (Vec.size() > 1) { stable_sort(Vec, llvm::less_second()); - int InitialOffset = std::get<1>(Vec[0]); + int64_t InitialOffset = std::get<1>(Vec[0]); bool AnyConsecutive = all_of(enumerate(Vec), [InitialOffset](const auto &P) { - return std::get<1>(P.value()) == int(P.index()) + InitialOffset; + return std::get<1>(P.value()) == + int64_t(P.index()) + InitialOffset; }); 
// Fill SortedIndices array only if it looks worth-while to sort the // ptrs. @@ -7007,7 +7015,7 @@ static void combineOrders(MutableArrayRef Order, ArrayRef SecondaryOrder) { assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) && "Expected same size of orders"); - unsigned Sz = Order.size(); + size_t Sz = Order.size(); SmallBitVector UsedIndices(Sz); for (unsigned Idx : seq(0, Sz)) { if (Order[Idx] != Sz) @@ -7999,7 +8007,7 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { if (StoresVec.size() > Lane) continue; if (!StoresVec.empty()) { - std::optional Diff = getPointersDiff( + std::optional Diff = getPointersDiff( SI->getValueOperand()->getType(), SI->getPointerOperand(), SI->getValueOperand()->getType(), StoresVec.front()->getPointerOperand(), *DL, *SE, @@ -8027,14 +8035,14 @@ bool BoUpSLP::canFormVector(ArrayRef StoresVec, // To avoid calling getPointersDiff() while sorting we create a vector of // pairs {store, offset from first} and sort this instead. 
- SmallVector> StoreOffsetVec; + SmallVector> StoreOffsetVec; StoreInst *S0 = StoresVec[0]; StoreOffsetVec.emplace_back(0, 0); Type *S0Ty = S0->getValueOperand()->getType(); Value *S0Ptr = S0->getPointerOperand(); for (unsigned Idx : seq(1, StoresVec.size())) { StoreInst *SI = StoresVec[Idx]; - std::optional Diff = + std::optional Diff = getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); @@ -8046,7 +8054,7 @@ bool BoUpSLP::canFormVector(ArrayRef StoresVec, return false; sort(StoreOffsetVec, llvm::less_first()); unsigned Idx = 0; - int PrevDist = 0; + int64_t PrevDist = 0; for (const auto &P : StoreOffsetVec) { if (Idx > 0 && P.first != PrevDist + 1) return false; @@ -8130,15 +8138,15 @@ void BoUpSLP::buildTree(ArrayRef Roots) { static void gatherPossiblyVectorizableLoads( const BoUpSLP &R, ArrayRef VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, - SmallVectorImpl>> &GatheredLoads, + SmallVectorImpl>> &GatheredLoads, bool AddNew = true) { if (VL.empty()) return; Type *ScalarTy = getValueType(VL.front()); if (!isValidElementType(ScalarTy)) return; - SmallVector>> ClusteredLoads; - SmallVector> ClusteredDistToLoad; + SmallVector>> ClusteredLoads; + SmallVector> ClusteredDistToLoad; for (Value *V : VL) { auto *LI = dyn_cast(V); if (!LI) @@ -8154,7 +8162,7 @@ static void gatherPossiblyVectorizableLoads( RecursionMaxDepth) && "Expected loads with the same type, same parent and same " "underlying pointer."); - std::optional Dist = getPointersDiff( + std::optional Dist = getPointersDiff( LI->getType(), LI->getPointerOperand(), Data.front().first->getType(), Data.front().first->getPointerOperand(), DL, SE, /*StrictCheck=*/true); @@ -8176,11 +8184,11 @@ static void gatherPossiblyVectorizableLoads( } } auto FindMatchingLoads = - [&](ArrayRef> Loads, - SmallVectorImpl>> + [&](ArrayRef> Loads, + SmallVectorImpl>> &GatheredLoads, SetVector &ToAdd, SetVector &Repeated, - int 
&Offset, unsigned &Start) { + int64_t &Offset, unsigned &Start) { if (Loads.empty()) return GatheredLoads.end(); LoadInst *LI = Loads.front().first; @@ -8191,16 +8199,16 @@ static void gatherPossiblyVectorizableLoads( if (LI->getParent() != Data.front().first->getParent() || LI->getType() != Data.front().first->getType()) continue; - std::optional Dist = + std::optional Dist = getPointersDiff(LI->getType(), LI->getPointerOperand(), Data.front().first->getType(), Data.front().first->getPointerOperand(), DL, SE, /*StrictCheck=*/true); if (!Dist) continue; - SmallSet DataDists; + SmallSet DataDists; SmallPtrSet DataLoads; - for (std::pair P : Data) { + for (std::pair P : Data) { DataDists.insert(P.second); DataLoads.insert(P.first); } @@ -8231,10 +8239,10 @@ static void gatherPossiblyVectorizableLoads( ToAdd.clear(); return GatheredLoads.end(); }; - for (ArrayRef> Data : ClusteredLoads) { + for (ArrayRef> Data : ClusteredLoads) { unsigned Start = 0; SetVector ToAdd, LocalToAdd, Repeated; - int Offset = 0; + int64_t Offset = 0; auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset, Start); while (It != GatheredLoads.end()) { @@ -8249,7 +8257,7 @@ static void gatherPossiblyVectorizableLoads( return !ToAdd.contains(Idx) && !Repeated.contains(Idx); })) { auto AddNewLoads = - [&](SmallVectorImpl> &Loads) { + [&](SmallVectorImpl> &Loads) { for (unsigned Idx : seq(Data.size())) { if (ToAdd.contains(Idx) || Repeated.contains(Idx)) continue; @@ -8259,7 +8267,7 @@ static void gatherPossiblyVectorizableLoads( if (!AddNew) { LoadInst *LI = Data.front().first; It = find_if( - GatheredLoads, [&](ArrayRef> PD) { + GatheredLoads, [&](ArrayRef> PD) { return PD.front().first->getParent() == LI->getParent() && PD.front().first->getType() == LI->getType(); }); @@ -8267,7 +8275,7 @@ static void gatherPossiblyVectorizableLoads( AddNewLoads(*It); It = std::find_if( std::next(It), GatheredLoads.end(), - [&](ArrayRef> PD) { + [&](ArrayRef> PD) { return 
PD.front().first->getParent() == LI->getParent() && PD.front().first->getType() == LI->getType(); }); @@ -8280,9 +8288,10 @@ static void gatherPossiblyVectorizableLoads( } void BoUpSLP::tryToVectorizeGatheredLoads( - const SmallMapVector, - SmallVector>>, - 8> &GatheredLoads) { + const SmallMapVector< + std::tuple, + SmallVector>>, 8> + &GatheredLoads) { GatheredLoadsEntriesFirst = VectorizableTree.size(); SmallVector> LoadSetsToVectorize( @@ -8291,8 +8300,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads( Set.insert_range(VectorizableTree[Idx]->Scalars); // Sort loads by distance. - auto LoadSorter = [](const std::pair &L1, - const std::pair &L2) { + auto LoadSorter = [](const std::pair &L1, + const std::pair &L2) { return L1.second > L2.second; }; @@ -8454,28 +8463,30 @@ void BoUpSLP::tryToVectorizeGatheredLoads( }; auto ProcessGatheredLoads = [&, &TTI = *TTI]( - ArrayRef>> GatheredLoads, + ArrayRef>> GatheredLoads, bool Final = false) { SmallVector NonVectorized; - for (ArrayRef> LoadsDists : GatheredLoads) { + for (ArrayRef> LoadsDists : + GatheredLoads) { if (LoadsDists.size() <= 1) { NonVectorized.push_back(LoadsDists.back().first); continue; } - SmallVector> LocalLoadsDists(LoadsDists); + SmallVector> LocalLoadsDists( + LoadsDists); SmallVector OriginalLoads(make_first_range(LoadsDists)); stable_sort(LocalLoadsDists, LoadSorter); SmallVector Loads; unsigned MaxConsecutiveDistance = 0; unsigned CurrentConsecutiveDist = 1; - int LastDist = LocalLoadsDists.front().second; + int64_t LastDist = LocalLoadsDists.front().second; bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads); - for (const std::pair &L : LocalLoadsDists) { + for (const std::pair &L : LocalLoadsDists) { if (isVectorized(L.first)) continue; assert(LastDist >= L.second && "Expected first distance always not less than second"); - if (static_cast(LastDist - L.second) == + if (static_cast(LastDist - L.second) == CurrentConsecutiveDist) { ++CurrentConsecutiveDist; MaxConsecutiveDistance = @@ 
-8698,12 +8709,12 @@ void BoUpSLP::tryToVectorizeGatheredLoads( if (!Ref.empty() && !NonVectorized.empty() && std::accumulate( Ref.begin(), Ref.end(), 0u, - [](unsigned S, - ArrayRef> LoadsDists) -> unsigned { - return S + LoadsDists.size(); - }) != NonVectorized.size() && + [](unsigned S, ArrayRef> LoadsDists) + -> unsigned { return S + LoadsDists.size(); }) != + NonVectorized.size() && IsMaskedGatherSupported(NonVectorized)) { - SmallVector>> FinalGatheredLoads; + SmallVector>> + FinalGatheredLoads; for (LoadInst *LI : NonVectorized) { // Reinsert non-vectorized loads to other list of loads with the same // base pointers. @@ -9299,10 +9310,10 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( Ptr0 = PointerOps[CurrentOrder.front()]; PtrN = PointerOps[CurrentOrder.back()]; } - std::optional Dist = + std::optional Dist = getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); // Check that the sorted pointer operands are consecutive. - if (static_cast(*Dist) == VL.size() - 1) + if (static_cast(*Dist) == VL.size() - 1) return TreeEntry::Vectorize; } @@ -10212,7 +10223,7 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!"); SmallVector ReuseShuffleIndices; - SmallVector VL(VLRef.begin(), VLRef.end()); + SmallVector VL(VLRef); // Tries to build split node. auto TrySplitNode = [&](const InstructionsState &LocalState) { @@ -10751,7 +10762,7 @@ unsigned BoUpSLP::canMapToVector(Type *T) const { if (!isValidElementType(EltTy)) return 0; - uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N)); + size_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL->getTypeStoreSizeInBits(T)) return 0; @@ -11950,7 +11961,7 @@ void BoUpSLP::transformNodes() { // A list of loads to be gathered during the vectorization process. We can // try to vectorize them at the end, if profitable. 
SmallMapVector, - SmallVector>>, 8> + SmallVector>>, 8> GatheredLoads; for (std::unique_ptr &TE : VectorizableTree) { @@ -13606,7 +13617,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, unsigned InterleaveFactor; SmallVector CompressMask; VectorType *LoadVecTy; - SmallVector Scalars(VL.begin(), VL.end()); + SmallVector Scalars(VL); if (!E->ReorderIndices.empty()) { SmallVector Mask(E->ReorderIndices.begin(), E->ReorderIndices.end()); @@ -15976,9 +15987,10 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc, } Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { - auto &Res = EntryToLastInstruction.try_emplace(E).first->second; - if (Res) - return *Res; + auto It = EntryToLastInstruction.find(E); + if (It != EntryToLastInstruction.end()) + return *cast(It->second); + Instruction *Res = nullptr; // Get the basic block this bundle is in. All instructions in the bundle // should be in this block (except for extractelement-like instructions with // constant indices or gathered loads). 
@@ -16083,10 +16095,11 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { auto *I = dyn_cast_or_null(E->VectorizedValue); if (!I) I = &getLastInstructionInBundle(E); - if (Res->comesBefore(I)) + if (Res->getParent() == I->getParent() && Res->comesBefore(I)) Res = I; } } + EntryToLastInstruction.try_emplace(E, Res); return *Res; } @@ -16095,6 +16108,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && E->getOpcode() == Instruction::Load) { Res = FindFirstInst(); + EntryToLastInstruction.try_emplace(E, Res); return *Res; } @@ -16128,19 +16142,14 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { [](Value *V) { return !isa(V) && isa(V); })) || - all_of(E->Scalars, - [](Value *V) { - return isa(V) || - (!isVectorLikeInstWithConstOps(V) && - isUsedOutsideBlock(V)); - }) || - (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) { - return isa(V) || - areAllOperandsNonInsts(V); - }))) + all_of(E->Scalars, [](Value *V) { + return isa(V) || + (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V)); + })) Res = FindLastInst(); else Res = FindFirstInst(); + EntryToLastInstruction.try_emplace(E, Res); return *Res; } @@ -16151,6 +16160,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { if (Bundle) { assert(!E->isGather() && "Gathered instructions should not be scheduled"); Res = Bundle->getBundle().back()->getInst(); + EntryToLastInstruction.try_emplace(E, Res); return *Res; } @@ -16175,6 +16185,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { if (!Res) Res = FindLastInst(); assert(Res && "Failed to find last instruction in bundle"); + EntryToLastInstruction.try_emplace(E, Res); return *Res; } @@ -16193,7 +16204,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { } if (IsPHI || (!E->isGather() && E->State != TreeEntry::SplitVectorize && - 
doesNotNeedToSchedule(E->Scalars)) || + all_of(E->Scalars, areAllOperandsNonInsts)) || (GatheredLoadsEntriesFirst.has_value() && E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && E->getOpcode() == Instruction::Load)) { @@ -17782,17 +17793,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *VecOp = NewPhi->getIncomingValueForBlock(IBB); NewPhi->addIncoming(VecOp, IBB); TreeEntry *OpTE = getOperandEntry(E, I); + assert(!OpTE->VectorizedValue && "Expected no vectorized value."); OpTE->VectorizedValue = VecOp; continue; } Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeOperand(E, I); + const TreeEntry *OpE = getOperandEntry(E, I); + Value *Vec; + if (OpE->isGather()) { + assert(OpE->VectorizedValue && "Expected vectorized value."); + Vec = OpE->VectorizedValue; + if (auto *IVec = dyn_cast(Vec)) + Builder.SetInsertPoint(IVec->getNextNonDebugInstruction()); + } else { + Vec = vectorizeOperand(E, I); + } if (VecTy != Vec->getType()) { - assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() || - MinBWs.contains(getOperandEntry(E, I))) && - "Expected item in MinBWs."); + assert( + (It != MinBWs.end() || OpE->isGather() || MinBWs.contains(OpE)) && + "Expected item in MinBWs."); Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I)); } NewPhi->addIncoming(Vec, IBB); @@ -18264,12 +18285,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *Ptr0 = cast(E->Scalars.front())->getPointerOperand(); Value *PtrN = cast(E->Scalars.back())->getPointerOperand(); PO = IsReverseOrder ? 
PtrN : Ptr0; - std::optional Diff = getPointersDiff( + std::optional Diff = getPointersDiff( VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE); Type *StrideTy = DL->getIndexType(PO->getType()); Value *StrideVal; if (Diff) { - int Stride = *Diff / (static_cast(E->Scalars.size()) - 1); + int64_t Stride = + *Diff / (static_cast(E->Scalars.size()) - 1); StrideVal = ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride * DL->getTypeAllocSize(ScalarTy)); @@ -18678,6 +18700,28 @@ Value *BoUpSLP::vectorizeTree( else Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); + // Vectorize gather operands of the PHI nodes. + for (const std::unique_ptr &TE : reverse(VectorizableTree)) { + if (TE->isGather() && TE->UserTreeIndex.UserTE && + TE->UserTreeIndex.UserTE->hasState() && + !TE->UserTreeIndex.UserTE->isAltShuffle() && + TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize && + TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI && + !TE->VectorizedValue) { + auto *PH = cast(TE->UserTreeIndex.UserTE->getMainOp()); + BasicBlock *IBB = PH->getIncomingBlock(TE->UserTreeIndex.EdgeIdx); + // If there is the same incoming block earlier - skip, it will be handled + // in PHI node. + if (TE->UserTreeIndex.EdgeIdx > 0 && + any_of(seq(TE->UserTreeIndex.EdgeIdx), [&](unsigned Idx) { + return PH->getIncomingBlock(Idx) == IBB; + })) + continue; + Builder.SetInsertPoint(IBB->getTerminator()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + (void)vectorizeTree(TE.get()); + } + } // Emit gathered loads first to emit better code for the users of those // gathered loads. for (const std::unique_ptr &TE : VectorizableTree) { @@ -21127,18 +21171,18 @@ class RelatedStoreInsts { /// \p PtrDist. /// Does nothing if there is already a store with that \p PtrDist. 
/// \returns The previously associated Instruction index, or std::nullopt - std::optional insertOrLookup(unsigned InstrIdx, int PtrDist) { + std::optional insertOrLookup(unsigned InstrIdx, int64_t PtrDist) { auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx); - return Inserted ? std::nullopt : std::optional(It->second); + return Inserted ? std::nullopt : std::make_optional(It->second); } - using DistToInstMap = std::map; + using DistToInstMap = std::map; const DistToInstMap &getStores() const { return Instrs; } /// If \p SI is related to this group of stores, return the distance of its /// pointer operand to the one the group's BaseInstr. - std::optional getPointerDiff(StoreInst &SI, const DataLayout &DL, - ScalarEvolution &SE) const { + std::optional getPointerDiff(StoreInst &SI, const DataLayout &DL, + ScalarEvolution &SE) const { StoreInst &BaseStore = *AllStores[BaseInstrIdx]; return getPointersDiff( BaseStore.getValueOperand()->getType(), BaseStore.getPointerOperand(), @@ -21149,7 +21193,7 @@ class RelatedStoreInsts { /// Recompute the pointer distances to be based on \p NewBaseInstIdx. /// Stores whose index is less than \p MinSafeIdx will be dropped. void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx, - int DistFromCurBase) { + int64_t DistFromCurBase) { DistToInstMap PrevSet = std::move(Instrs); reset(NewBaseInstIdx); @@ -21165,7 +21209,7 @@ class RelatedStoreInsts { /// Remove all stores that have been vectorized from this group. 
void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) { DistToInstMap::reverse_iterator LastVectorizedStore = find_if( - reverse(Instrs), [&](const std::pair &DistAndIdx) { + reverse(Instrs), [&](const std::pair &DistAndIdx) { return VectorizedStores.contains(AllStores[DistAndIdx.second]); }); @@ -21198,7 +21242,7 @@ bool SLPVectorizerPass::vectorizeStores( bool Changed = false; auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) { - int PrevDist = -1; + int64_t PrevDist = -1; BoUpSLP::ValueList Operands; // Collect the chain into a list. for (auto [Idx, Data] : enumerate(StoreSeq)) { @@ -21499,7 +21543,7 @@ bool SLPVectorizerPass::vectorizeStores( // dependencies and no need to waste compile time to try to vectorize them. // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}. auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) { - std::optional PtrDist; + std::optional PtrDist; auto *RelatedStores = find_if( SortedStores, [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) { PtrDist = StoreSeq.getPointerDiff(*SI, *DL, *SE); diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 5c7a3aa9f68d7..ae86181487261 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -122,10 +122,10 @@ class VPRecipeBuilder { tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef Operands, VFRange &Range); - /// Handle non-loop phi nodes. Return a new VPBlendRecipe otherwise. Currently + /// Handle non-loop phi nodes, returning a new VPBlendRecipe. Currently /// all such phi nodes are turned into a sequence of select instructions as /// the vectorizer currently performs full if-conversion. - VPBlendRecipe *tryToBlend(PHINode *Phi, ArrayRef Operands); + VPBlendRecipe *tryToBlend(VPWidenPHIRecipe *PhiR); /// Handle call instructions. 
If \p CI can be widened for \p Range.Start, /// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be @@ -179,11 +179,9 @@ class VPRecipeBuilder { /// that are valid so recipes can be formed later. void collectScaledReductions(VFRange &Range); - /// Create and return a widened recipe for \p I if one can be created within + /// Create and return a widened recipe for \p R if one can be created within /// the given VF \p Range. - VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr, - ArrayRef Operands, - VFRange &Range); + VPRecipeBase *tryToCreateWidenRecipe(VPSingleDefRecipe *R, VFRange &Range); /// Create and return a partial reduction recipe for a reduction instruction /// along with binary operation and reduction phi operands. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 167aff737d3fd..c36feb0e2fdef 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -230,7 +230,7 @@ Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) { if (hasScalarValue(Def, Lane)) return Data.VPV2Scalars[Def][Lane.mapToCacheIndex(VF)]; - if (!Lane.isFirstLane() && vputils::isUniformAfterVectorization(Def) && + if (!Lane.isFirstLane() && vputils::isSingleScalar(Def) && hasScalarValue(Def, VPLane::getFirstLane())) { return Data.VPV2Scalars[Def][0]; } @@ -303,17 +303,17 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { return ScalarValue; } - bool IsUniform = vputils::isUniformAfterVectorization(Def); + bool IsSingleScalar = vputils::isSingleScalar(Def); - VPLane LastLane(IsUniform ? 0 : VF.getKnownMinValue() - 1); + VPLane LastLane(IsSingleScalar ? 0 : VF.getKnownMinValue() - 1); // Check if there is a scalar value for the selected lane. if (!hasScalarValue(Def, LastLane)) { // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and - // VPExpandSCEVRecipes can also be uniform. 
+ // VPExpandSCEVRecipes can also be a single scalar. assert((isa(Def->getDefiningRecipe())) && "unexpected recipe found to be invariant"); - IsUniform = true; + IsSingleScalar = true; LastLane = 0; } @@ -334,7 +334,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { // resulting vectors are stored in State, we will only generate the // insertelements once. Value *VectorValue = nullptr; - if (IsUniform) { + if (IsSingleScalar) { VectorValue = GetBroadcastInstrs(ScalarValue); set(Def, VectorValue); } else { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 2c4cac7655ec9..cdfd26350878b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -517,6 +517,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPInstructionSC: case VPRecipeBase::VPReductionEVLSC: case VPRecipeBase::VPReductionSC: + case VPRecipeBase::VPMulAccumulateReductionSC: + case VPRecipeBase::VPExtendedReductionSC: case VPRecipeBase::VPReplicateSC: case VPRecipeBase::VPScalarIVStepsSC: case VPRecipeBase::VPVectorPointerSC: @@ -601,13 +603,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {} }; + struct NonNegFlagsTy { + char NonNeg : 1; + NonNegFlagsTy(bool IsNonNeg) : NonNeg(IsNonNeg) {} + }; + private: struct ExactFlagsTy { char IsExact : 1; }; - struct NonNegFlagsTy { - char NonNeg : 1; - }; struct FastMathFlagsTy { char AllowReassoc : 1; char NoNaNs : 1; @@ -697,6 +701,12 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp), DisjointFlags(DisjointFlags) {} + template + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, + NonNegFlagsTy NonNegFlags, DebugLoc DL = {}) + : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::NonNegOp), + NonNegFlags(NonNegFlags) {} + protected: 
VPRecipeWithIRFlags(const unsigned char SC, ArrayRef Operands, GEPNoWrapFlags GEPFlags, DebugLoc DL = {}) @@ -715,7 +725,9 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || R->getVPDefID() == VPRecipeBase::VPReplicateSC || R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC || - R->getVPDefID() == VPRecipeBase::VPVectorPointerSC; + R->getVPDefID() == VPRecipeBase::VPVectorPointerSC || + R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC || + R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC; } static inline bool classof(const VPUser *U) { @@ -812,6 +824,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { FastMathFlags getFastMathFlags() const; + /// Returns true if the recipe has non-negative flag. + bool hasNonNegFlag() const { return OpType == OperationType::NonNegOp; } + + bool isNonNeg() const { + assert(OpType == OperationType::NonNegOp && + "recipe doesn't have a NNEG flag"); + return NonNegFlags.NonNeg; + } + bool hasNoUnsignedWrap() const { assert(OpType == OperationType::OverflowingBinOp && "recipe doesn't have a NUW flag"); @@ -1135,7 +1156,9 @@ class VPPhiAccessors { const VPBasicBlock *getIncomingBlock(unsigned Idx) const; /// Returns the number of incoming values, also number of incoming blocks. - unsigned getNumIncoming() const { return getAsRecipe()->getNumOperands(); } + virtual unsigned getNumIncoming() const { + return getAsRecipe()->getNumOperands(); + } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. @@ -1234,7 +1257,7 @@ class VPIRInstruction : public VPRecipeBase { /// cast/dyn_cast/isa and execute() implementation. A single VPValue operand is /// allowed, and it is used to add a new incoming value for the single /// predecessor VPBB. 
-struct VPIRPhi : public VPIRInstruction { +struct VPIRPhi : public VPIRInstruction, public VPPhiAccessors { VPIRPhi(PHINode &PN) : VPIRInstruction(PN) {} static inline bool classof(const VPRecipeBase *U) { @@ -1251,6 +1274,9 @@ struct VPIRPhi : public VPIRInstruction { void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; #endif + +protected: + const VPRecipeBase *getAsRecipe() const override { return this; } }; /// Helper to manage IR metadata for recipes. It filters out metadata that @@ -1289,10 +1315,19 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), VPIRMetadata(I), Opcode(I.getOpcode()) {} + VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode, + ArrayRef Operands, bool NUW, bool NSW, DebugLoc DL) + : VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL), + Opcode(Opcode) {} + public: VPWidenRecipe(Instruction &I, ArrayRef Operands) : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {} + VPWidenRecipe(unsigned Opcode, ArrayRef Operands, bool NUW, + bool NSW, DebugLoc DL) + : VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {} + ~VPWidenRecipe() override = default; VPWidenRecipe *clone() override { @@ -1337,8 +1372,15 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { "opcode of underlying cast doesn't match"); } - VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy) - : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), VPIRMetadata(), + VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, + DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, DL), VPIRMetadata(), + Opcode(Opcode), ResultTy(ResultTy) {} + + VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, + bool IsNonNeg, DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, NonNegFlagsTy(IsNonNeg), + DL), Opcode(Opcode), ResultTy(ResultTy) {} 
~VPWidenCastRecipe() override = default; @@ -1785,13 +1827,15 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, /// * VPWidenPointerInductionRecipe: Generate vector and scalar values for a /// pointer induction. Produces either a vector PHI per-part or scalar values /// per-lane based on the canonical induction. -class VPHeaderPHIRecipe : public VPSingleDefRecipe { +class VPHeaderPHIRecipe : public VPSingleDefRecipe, public VPPhiAccessors { protected: VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr, VPValue *Start, DebugLoc DL = {}) : VPSingleDefRecipe(VPDefID, ArrayRef({Start}), UnderlyingInstr, DL) { } + const VPRecipeBase *getAsRecipe() const override { return this; } + public: ~VPHeaderPHIRecipe() override = default; @@ -1980,6 +2024,11 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe { return isUnrolled() ? getOperand(getNumOperands() - 2) : nullptr; } + /// Returns the number of incoming values, also number of incoming blocks. + /// Note that at the moment, VPWidenIntOrFpInductionRecipes only have a single + /// incoming value, its start value. + unsigned getNumIncoming() const override { return 1; } + /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { return Trunc; } @@ -1992,7 +2041,8 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe { /// Returns the scalar type of the induction. Type *getScalarType() const { - return Trunc ? Trunc->getType() : getPHINode()->getType(); + return Trunc ? Trunc->getType() + : getStartValue()->getLiveInIRValue()->getType(); } /// Returns the VPValue representing the value of this induction at @@ -2381,6 +2431,28 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { setUnderlyingValue(I); } + /// For VPExtendedReductionRecipe. + /// Note that the debug location is from the extend. 
+ VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind, + ArrayRef Operands, VPValue *CondOp, + bool IsOrdered, DebugLoc DL) + : VPRecipeWithIRFlags(SC, Operands, DL), RdxKind(RdxKind), + IsOrdered(IsOrdered), IsConditional(CondOp) { + if (CondOp) + addOperand(CondOp); + } + + /// For VPMulAccumulateReductionRecipe. + /// Note that the NUW/NSW flags and the debug location are from the Mul. + VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind, + ArrayRef Operands, VPValue *CondOp, + bool IsOrdered, WrapFlagsTy WrapFlags, DebugLoc DL) + : VPRecipeWithIRFlags(SC, Operands, WrapFlags, DL), RdxKind(RdxKind), + IsOrdered(IsOrdered), IsConditional(CondOp) { + if (CondOp) + addOperand(CondOp); + } + public: VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, @@ -2389,6 +2461,13 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { ArrayRef({ChainOp, VecOp}), CondOp, IsOrdered, DL) {} + VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs, + VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, + bool IsOrdered, DebugLoc DL = {}) + : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr, + ArrayRef({ChainOp, VecOp}), CondOp, + IsOrdered, DL) {} + ~VPReductionRecipe() override = default; VPReductionRecipe *clone() override { @@ -2399,7 +2478,9 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionSC || - R->getVPDefID() == VPRecipeBase::VPReductionEVLSC; + R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || + R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC || + R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC; } static inline bool classof(const VPUser *U) { @@ -2538,23 +2619,200 @@ class VPReductionEVLRecipe : public VPReductionRecipe { } }; +/// A recipe to represent inloop extended reduction operations, performing a +/// reduction 
on a extended vector operand into a scalar value, and adding the +/// result to a chain. This recipe is abstract and needs to be lowered to +/// concrete recipes before codegen. The operands are {ChainOp, VecOp, +/// [Condition]}. +class VPExtendedReductionRecipe : public VPReductionRecipe { + /// Opcode of the extend for VecOp. + Instruction::CastOps ExtOp; + + /// The scalar type after extending. + Type *ResultTy; + + /// For cloning VPExtendedReductionRecipe. + VPExtendedReductionRecipe(VPExtendedReductionRecipe *ExtRed) + : VPReductionRecipe( + VPDef::VPExtendedReductionSC, ExtRed->getRecurrenceKind(), + {ExtRed->getChainOp(), ExtRed->getVecOp()}, ExtRed->getCondOp(), + ExtRed->isOrdered(), ExtRed->getDebugLoc()), + ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) { + transferFlags(*ExtRed); + setUnderlyingValue(ExtRed->getUnderlyingValue()); + } + +public: + VPExtendedReductionRecipe(VPReductionRecipe *R, VPWidenCastRecipe *Ext) + : VPReductionRecipe(VPDef::VPExtendedReductionSC, R->getRecurrenceKind(), + {R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(), + R->isOrdered(), Ext->getDebugLoc()), + ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) { + assert((ExtOp == Instruction::CastOps::ZExt || + ExtOp == Instruction::CastOps::SExt) && + "VPExtendedReductionRecipe only supports zext and sext."); + + transferFlags(*Ext); + setUnderlyingValue(R->getUnderlyingValue()); + } + + ~VPExtendedReductionRecipe() override = default; + + VPExtendedReductionRecipe *clone() override { + return new VPExtendedReductionRecipe(this); + } + + VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC); + + void execute(VPTransformState &State) override { + llvm_unreachable("VPExtendedReductionRecipe should be transform to " + "VPExtendedRecipe + VPReductionRecipe before execution."); + }; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. 
+ void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// The scalar type after extending. + Type *getResultType() const { return ResultTy; } + + /// Is the extend ZExt? + bool isZExt() const { return getExtOpcode() == Instruction::ZExt; } + + /// Get the opcode of the extend for VecOp. + Instruction::CastOps getExtOpcode() const { return ExtOp; } +}; + +/// A recipe to represent inloop MulAccumulateReduction operations, multiplying +/// the vector operands (which may be extended), performing a reduction.add on +/// the result, and adding the scalar result to a chain. This recipe is abstract +/// and needs to be lowered to concrete recipes before codegen. The operands are +/// {ChainOp, VecOp1, VecOp2, [Condition]}. +class VPMulAccumulateReductionRecipe : public VPReductionRecipe { + /// Opcode of the extend for VecOp1 and VecOp2. + Instruction::CastOps ExtOp; + + /// Non-neg flag of the extend recipe. + bool IsNonNeg = false; + + /// The scalar type after extending. + Type *ResultTy = nullptr; + + /// For cloning VPMulAccumulateReductionRecipe. 
+ VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc) + : VPReductionRecipe( + VPDef::VPMulAccumulateReductionSC, MulAcc->getRecurrenceKind(), + {MulAcc->getChainOp(), MulAcc->getVecOp0(), MulAcc->getVecOp1()}, + MulAcc->getCondOp(), MulAcc->isOrdered(), + WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()), + MulAcc->getDebugLoc()), + ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()), + ResultTy(MulAcc->getResultType()) { + transferFlags(*MulAcc); + setUnderlyingValue(MulAcc->getUnderlyingValue()); + } + +public: + VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul, + VPWidenCastRecipe *Ext0, + VPWidenCastRecipe *Ext1, Type *ResultTy) + : VPReductionRecipe( + VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(), + {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)}, + R->getCondOp(), R->isOrdered(), + WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), + R->getDebugLoc()), + ExtOp(Ext0->getOpcode()), ResultTy(ResultTy) { + assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == + Instruction::Add && + "The reduction instruction in MulAccumulateteReductionRecipe must " + "be Add"); + assert((ExtOp == Instruction::CastOps::ZExt || + ExtOp == Instruction::CastOps::SExt) && + "VPMulAccumulateReductionRecipe only supports zext and sext."); + setUnderlyingValue(R->getUnderlyingValue()); + // Only set the non-negative flag if the original recipe contains. 
+ if (Ext0->hasNonNegFlag()) + IsNonNeg = Ext0->isNonNeg(); + } + + VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul, + Type *ResultTy) + : VPReductionRecipe( + VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(), + {R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)}, + R->getCondOp(), R->isOrdered(), + WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), + R->getDebugLoc()), + ExtOp(Instruction::CastOps::CastOpsEnd), ResultTy(ResultTy) { + assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == + Instruction::Add && + "The reduction instruction in MulAccumulateReductionRecipe must be " + "Add"); + setUnderlyingValue(R->getUnderlyingValue()); + } + + ~VPMulAccumulateReductionRecipe() override = default; + + VPMulAccumulateReductionRecipe *clone() override { + return new VPMulAccumulateReductionRecipe(this); + } + + VP_CLASSOF_IMPL(VPDef::VPMulAccumulateReductionSC); + + void execute(VPTransformState &State) override { + llvm_unreachable("VPMulAccumulateReductionRecipe should transform to " + "VPWidenCastRecipe + " + "VPWidenRecipe + VPReductionRecipe before execution"); + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + Type *getResultType() const { return ResultTy; } + + /// The first vector value to be extended and reduced. + VPValue *getVecOp0() const { return getOperand(1); } + + /// The second vector value to be extended and reduced. + VPValue *getVecOp1() const { return getOperand(2); } + + /// Return true if this recipe contains extended operands. + bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; } + + /// Return the opcode of the extends for the operands. + Instruction::CastOps getExtOpcode() const { return ExtOp; } + + /// Return if the operands are zero-extended. 
+ bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; } + + /// Return true if the operand extends have the non-negative flag. + bool isNonNeg() const { return IsNonNeg; } +}; + /// VPReplicateRecipe replicates a given instruction producing multiple scalar /// copies of the original scalar type, one per lane, instead of producing a /// single copy of widened type for all lanes. If the instruction is known to be -/// uniform only one copy, per lane zero, will be generated. +/// a single scalar, only one copy, per lane zero, will be generated. class VPReplicateRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { /// Indicator if only a single replica per lane is needed. - bool IsUniform; + bool IsSingleScalar; /// Indicator if the replicas are also predicated. bool IsPredicated; public: VPReplicateRecipe(Instruction *I, ArrayRef Operands, - bool IsUniform, VPValue *Mask = nullptr, + bool IsSingleScalar, VPValue *Mask = nullptr, VPIRMetadata Metadata = {}) : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I), - VPIRMetadata(Metadata), IsUniform(IsUniform), IsPredicated(Mask) { + VPIRMetadata(Metadata), IsSingleScalar(IsSingleScalar), + IsPredicated(Mask) { if (Mask) addOperand(Mask); } @@ -2563,7 +2821,7 @@ class VPReplicateRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPReplicateRecipe *clone() override { auto *Copy = - new VPReplicateRecipe(getUnderlyingInstr(), operands(), IsUniform, + new VPReplicateRecipe(getUnderlyingInstr(), operands(), IsSingleScalar, isPredicated() ? 
getMask() : nullptr, *this); Copy->transferFlags(*this); return Copy; @@ -2586,7 +2844,7 @@ class VPReplicateRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPSlotTracker &SlotTracker) const override; #endif - bool isUniform() const { return IsUniform; } + bool isSingleScalar() const { return IsSingleScalar; } bool isPredicated() const { return IsPredicated; } @@ -2594,7 +2852,7 @@ class VPReplicateRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { bool onlyFirstLaneUsed(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - return isUniform(); + return isSingleScalar(); } /// Returns true if the recipe uses scalars of operand \p Op. @@ -3283,6 +3541,46 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags, } }; +/// Casting from VPRecipeBase -> VPPhiAccessors is supported for all recipe +/// types implementing VPPhiAccessors. Used by isa<> & co. +template <> struct CastIsPossible { + static inline bool isPossible(const VPRecipeBase *f) { + // TODO: include VPPredInstPHIRecipe too, once it implements VPPhiAccessors. + return isa(f); + } +}; +/// Support casting from VPRecipeBase -> VPPhiAccessors, by down-casting to the +/// recipe types implementing VPPhiAccessors. Used by cast<>, dyn_cast<> & co. +template <> +struct CastInfo + : public CastIsPossible { + + using Self = CastInfo; + + /// doCast is used by cast<>. + static inline VPPhiAccessors *doCast(const VPRecipeBase *R) { + return const_cast([R]() -> const VPPhiAccessors * { + switch (R->getVPDefID()) { + case VPDef::VPInstructionSC: + return cast(R); + case VPDef::VPIRInstructionSC: + return cast(R); + case VPDef::VPWidenPHISC: + return cast(R); + default: + return cast(R); + } + }()); + } + + /// doCastIfPossible is used by dyn_cast<>. 
+ static inline VPPhiAccessors *doCastIfPossible(const VPRecipeBase *f) { + if (!Self::isPossible(f)) + return nullptr; + return doCast(f); + } +}; + /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It /// holds a sequence of zero or more VPRecipe's each representing a sequence of /// output IR instructions. All PHI-like recipes must come before any non-PHI recipes. diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index c86815c84d8d9..ac0f30cb4693c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -20,6 +20,26 @@ using namespace llvm; #define DEBUG_TYPE "vplan" +VPTypeAnalysis::VPTypeAnalysis(const VPlan &Plan) + : Ctx(Plan.getScalarHeader()->getIRBasicBlock()->getContext()) { + if (auto LoopRegion = Plan.getVectorLoopRegion()) { + if (const auto *CanIV = dyn_cast( + &LoopRegion->getEntryBasicBlock()->front())) { + CanonicalIVTy = CanIV->getScalarType(); + return; + } + } + + // If there's no canonical IV, retrieve the type from the trip count + // expression. + auto *TC = Plan.getTripCount(); + if (TC->isLiveIn()) { + CanonicalIVTy = TC->getLiveInIRValue()->getType(); + return; + } + CanonicalIVTy = cast(TC)->getSCEV()->getType(); +} + Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPBlendRecipe *R) { Type *ResTy = inferScalarType(R->getIncomingValue(0)); for (unsigned I = 1, E = R->getNumIncomingValues(); I != E; ++I) { @@ -273,6 +293,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { // TODO: Use info from interleave group. 
return V->getUnderlyingValue()->getType(); }) + .Case( + [](const auto *R) { return R->getResultType(); }) .Case([](const VPExpandSCEVRecipe *R) { return R->getSCEV()->getType(); }) diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index cc21870bee2e3..941e13959c23b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -58,6 +58,8 @@ class VPTypeAnalysis { VPTypeAnalysis(Type *CanonicalIVTy) : CanonicalIVTy(CanonicalIVTy), Ctx(CanonicalIVTy->getContext()) {} + VPTypeAnalysis(const VPlan &Plan); + /// Infer the type of \p V. Returns the scalar type of \p V. Type *inferScalarType(const VPValue *V); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 58865c296ed8a..f2a7f16e19a79 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -461,21 +461,14 @@ m_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) { return m_Select(Op0, m_True(), Op1); } -using VPCanonicalIVPHI_match = - Recipe_match, 0, false, VPCanonicalIVPHIRecipe>; - -inline VPCanonicalIVPHI_match m_CanonicalIV() { - return VPCanonicalIVPHI_match(); -} - -template +template using VPScalarIVSteps_match = - Recipe_match, 0, false, VPScalarIVStepsRecipe>; + TernaryRecipe_match; -template -inline VPScalarIVSteps_match m_ScalarIVSteps(const Op0_t &Op0, - const Op1_t &Op1) { - return VPScalarIVSteps_match(Op0, Op1); +template +inline VPScalarIVSteps_match +m_ScalarIVSteps(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { + return VPScalarIVSteps_match({Op0, Op1, Op2}); } template diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 3c7ab7d24bf6d..3fa6a21b80b17 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -73,6 +73,8 @@ bool 
VPRecipeBase::mayWriteToMemory() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPExtendedReductionSC: + case VPMulAccumulateReductionSC: case VPVectorPointerSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: @@ -120,6 +122,8 @@ bool VPRecipeBase::mayReadFromMemory() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPExtendedReductionSC: + case VPMulAccumulateReductionSC: case VPVectorPointerSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: @@ -157,6 +161,8 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPExtendedReductionSC: + case VPMulAccumulateReductionSC: case VPScalarIVStepsSC: case VPVectorPointerSC: case VPWidenCanonicalIVSC: @@ -1190,7 +1196,7 @@ void VPIRPhi::execute(VPTransformState &State) { PHINode *Phi = &getIRPhi(); for (const auto &[Idx, Op] : enumerate(operands())) { VPValue *ExitValue = Op; - auto Lane = vputils::isUniformAfterVectorization(ExitValue) + auto Lane = vputils::isSingleScalar(ExitValue) ? VPLane::getFirstLane() : VPLane::getLastLaneForVF(State.VF); VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; @@ -2522,15 +2528,11 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF, unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind); FastMathFlags FMFs = getFastMathFlags(); - // TODO: Support any-of and in-loop reductions. + // TODO: Support any-of reductions. 
assert( (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || ForceTargetInstructionCost.getNumOccurrences() > 0) && "Any-of reduction not implemented in VPlan-based cost model currently."); - assert( - (!cast(getOperand(0))->isInLoop() || - ForceTargetInstructionCost.getNumOccurrences() > 0) && - "In-loop reduction not implemented in VPlan-based cost model currently."); // Cost = Reduction cost + BinOp cost InstructionCost Cost = @@ -2587,6 +2589,59 @@ void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, } O << ")"; } + +void VPExtendedReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EXTENDED-REDUCE "; + printAsOperand(O, SlotTracker); + O << " = "; + getChainOp()->printAsOperand(O, SlotTracker); + O << " +"; + O << " reduce." + << Instruction::getOpcodeName( + RecurrenceDescriptor::getOpcode(getRecurrenceKind())) + << " ("; + getVecOp()->printAsOperand(O, SlotTracker); + printFlags(O); + O << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType(); + if (isConditional()) { + O << ", "; + getCondOp()->printAsOperand(O, SlotTracker); + } + O << ")"; +} + +void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "MULACC-REDUCE "; + printAsOperand(O, SlotTracker); + O << " = "; + getChainOp()->printAsOperand(O, SlotTracker); + O << " + "; + O << "reduce." 
+ << Instruction::getOpcodeName( + RecurrenceDescriptor::getOpcode(getRecurrenceKind())) + << " ("; + O << "mul"; + printFlags(O); + if (isExtended()) + O << "("; + getVecOp0()->printAsOperand(O, SlotTracker); + if (isExtended()) + O << " " << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType() + << "), ("; + else + O << ", "; + getVecOp1()->printAsOperand(O, SlotTracker); + if (isExtended()) + O << " " << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType() + << ")"; + if (isConditional()) { + O << ", "; + getCondOp()->printAsOperand(O, SlotTracker); + } + O << ")"; +} #endif /// A helper function to scalarize a single Instruction in the innermost loop. @@ -2624,7 +2679,7 @@ static void scalarizeInstruction(const Instruction *Instr, for (const auto &I : enumerate(RepRecipe->operands())) { auto InputLane = Lane; VPValue *Operand = I.value(); - if (vputils::isUniformAfterVectorization(Operand)) + if (vputils::isSingleScalar(Operand)) InputLane = VPLane::getFirstLane(); Cloned->setOperand(I.index(), State.get(Operand, InputLane)); } @@ -2650,7 +2705,7 @@ static void scalarizeInstruction(const Instruction *Instr, void VPReplicateRecipe::execute(VPTransformState &State) { Instruction *UI = getUnderlyingInstr(); if (State.Lane) { // Generate a single instance. - assert((State.VF.isScalar() || !isUniform()) && + assert((State.VF.isScalar() || !isSingleScalar()) && "uniform recipe shouldn't be predicated"); assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); scalarizeInstruction(UI, this, *State.Lane, State); @@ -2668,7 +2723,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { return; } - if (IsUniform) { + if (IsSingleScalar) { // Uniform within VL means we need to generate lane 0. scalarizeInstruction(UI, this, VPLane(0), State); return; @@ -2676,8 +2731,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // A store of a loop varying value to a uniform address only needs the last // copy of the store. 
- if (isa(UI) && - vputils::isUniformAfterVectorization(getOperand(1))) { + if (isa(UI) && vputils::isSingleScalar(getOperand(1))) { auto Lane = VPLane::getLastLaneForVF(State.VF); scalarizeInstruction(UI, this, VPLane(Lane), State); return; @@ -2738,7 +2792,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, UI->getOpcode(), ResultTy, CostKind, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, Op2Info, Operands, UI, &Ctx.TLI) * - (isUniform() ? 1 : VF.getKnownMinValue()); + (isSingleScalar() ? 1 : VF.getKnownMinValue()); } } @@ -2748,7 +2802,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); + O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE "); if (!getUnderlyingInstr()->getType()->isVoidTy()) { printAsOperand(O, SlotTracker); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 806c20ef8cf73..f1c466e3208be 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -151,7 +151,7 @@ static bool sinkScalarOperands(VPlan &Plan) { SinkCandidate->mayReadOrWriteMemory()) continue; if (auto *RepR = dyn_cast(SinkCandidate)) { - if (!ScalarVFOnly && RepR->isUniform()) + if (!ScalarVFOnly && RepR->isSingleScalar()) continue; } else if (!isa(SinkCandidate)) continue; @@ -347,7 +347,7 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, auto *RecipeWithoutMask = new VPReplicateRecipe( PredRecipe->getUnderlyingInstr(), make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())), - PredRecipe->isUniform(), nullptr /*Mask*/, *PredRecipe); + PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe); auto *Pred = Plan.createVPBasicBlock(Twine(RegionName) 
+ ".if", RecipeWithoutMask); @@ -643,12 +643,11 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { // Skip recipes that shouldn't be narrowed. if (!Def || !isa(Def) || Def->getNumUsers() == 0 || !Def->getUnderlyingValue() || - (RepR && (RepR->isUniform() || RepR->isPredicated()))) + (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))) continue; // Skip recipes that may have other lanes than their first used. - if (!vputils::isUniformAfterVectorization(Def) && - !vputils::onlyFirstLaneUsed(Def)) + if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def)) continue; auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(), @@ -2392,6 +2391,81 @@ void VPlanTransforms::createInterleaveGroups( } } +// Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe. +static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) { + VPWidenCastRecipe *Ext; + // Only ZExt contains non-neg flags. + if (ExtRed->isZExt()) + Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(), + ExtRed->getResultType(), ExtRed->isNonNeg(), + ExtRed->getDebugLoc()); + else + Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(), + ExtRed->getResultType(), ExtRed->getDebugLoc()); + + auto *Red = new VPReductionRecipe( + ExtRed->getRecurrenceKind(), FastMathFlags(), ExtRed->getChainOp(), Ext, + ExtRed->getCondOp(), ExtRed->isOrdered(), ExtRed->getDebugLoc()); + Ext->insertBefore(ExtRed); + Red->insertBefore(ExtRed); + ExtRed->replaceAllUsesWith(Red); + ExtRed->eraseFromParent(); +} + +// Expand VPMulAccumulateReductionRecipe to VPWidenRecipe (mul) + +// VPReductionRecipe (reduce.add) +// + VPWidenCastRecipe (optional). +static void +expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) { + // Generate inner VPWidenCastRecipes if necessary. + // Note that we will drop the extend after mul which transforms + // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)). 
+ VPValue *Op0, *Op1; + if (MulAcc->isExtended()) { + Type *RedTy = MulAcc->getResultType(); + if (MulAcc->isZExt()) + Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(), + RedTy, MulAcc->isNonNeg(), + MulAcc->getDebugLoc()); + else + Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(), + RedTy, MulAcc->getDebugLoc()); + Op0->getDefiningRecipe()->insertBefore(MulAcc); + // Prevent reduce.add(mul(ext(A), ext(A))) generate duplicate + // VPWidenCastRecipe. + if (MulAcc->getVecOp0() == MulAcc->getVecOp1()) { + Op1 = Op0; + } else { + if (MulAcc->isZExt()) + Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(), + RedTy, MulAcc->isNonNeg(), + MulAcc->getDebugLoc()); + else + Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(), + RedTy, MulAcc->getDebugLoc()); + Op1->getDefiningRecipe()->insertBefore(MulAcc); + } + } else { + // No extends in this MulAccRecipe. + Op0 = MulAcc->getVecOp0(); + Op1 = MulAcc->getVecOp1(); + } + + std::array MulOps = {Op0, Op1}; + auto *Mul = new VPWidenRecipe( + Instruction::Mul, ArrayRef(MulOps), MulAcc->hasNoUnsignedWrap(), + MulAcc->hasNoSignedWrap(), MulAcc->getDebugLoc()); + Mul->insertBefore(MulAcc); + + auto *Red = new VPReductionRecipe( + MulAcc->getRecurrenceKind(), FastMathFlags(), MulAcc->getChainOp(), Mul, + MulAcc->getCondOp(), MulAcc->isOrdered(), MulAcc->getDebugLoc()); + Red->insertBefore(MulAcc); + + MulAcc->replaceAllUsesWith(Red); + MulAcc->eraseFromParent(); +} + void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy) { using namespace llvm::VPlanPatternMatch; @@ -2454,6 +2528,14 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, VPI->replaceAllUsesWith(VectorStep); ToRemove.push_back(VPI); } + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + if (auto *ExtRed = dyn_cast(&R)) { + expandVPExtendedReduction(ExtRed); + continue; + } + if (auto *MulAcc = dyn_cast(&R)) + 
expandVPMulAccumulateReduction(MulAcc); + } } for (VPRecipeBase *R : ToRemove) @@ -2551,6 +2633,171 @@ void VPlanTransforms::handleUncountableEarlyExit( LatchExitingBranch->eraseFromParent(); } +/// This function tries convert extended in-loop reductions to +/// VPExtendedReductionRecipe and clamp the \p Range if it is beneficial and +/// valid. The created recipe must be lowered to concrete +/// recipes before execution. +static VPExtendedReductionRecipe * +tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, + VFRange &Range) { + using namespace VPlanPatternMatch; + + Type *RedTy = Ctx.Types.inferScalarType(Red); + VPValue *VecOp = Red->getVecOp(); + + // Clamp the range if using extended-reduction is profitable. + auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt, + Type *SrcTy) -> bool { + return LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + auto *SrcVecTy = cast(toVectorTy(SrcTy, VF)); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost( + Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(), + CostKind); + InstructionCost ExtCost = + cast(VecOp)->computeCost(VF, Ctx); + InstructionCost RedCost = Red->computeCost(VF, Ctx); + return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost; + }, + Range); + }; + + VPValue *A; + // Match reduce(ext)). + if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) && + IsExtendedRedValidAndClampRange( + RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()), + cast(VecOp)->getOpcode() == + Instruction::CastOps::ZExt, + Ctx.Types.inferScalarType(A))) + return new VPExtendedReductionRecipe(Red, cast(VecOp)); + + return nullptr; +} + +/// This function tries convert extended in-loop reductions to +/// VPMulAccumulateReductionRecipe and clamp the \p Range if it is beneficial +/// and valid. 
The created VPExtendedReductionRecipe must be lower to concrete +/// recipes before execution. Patterns of MulAccumulateReduction: +/// reduce.add(mul(...)), +/// reduce.add(mul(ext(A), ext(B))), +/// reduce.add(ext(mul(ext(A), ext(B)))). +static VPMulAccumulateReductionRecipe * +tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, + VPCostContext &Ctx, VFRange &Range) { + using namespace VPlanPatternMatch; + + unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); + if (Opcode != Instruction::Add) + return nullptr; + + Type *RedTy = Ctx.Types.inferScalarType(Red); + + // Clamp the range if using multiply-accumulate-reduction is profitable. + auto IsMulAccValidAndClampRange = + [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, + VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool { + return LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + Type *SrcTy = + Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy; + auto *SrcVecTy = cast(toVectorTy(SrcTy, VF)); + InstructionCost MulAccCost = + Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind); + InstructionCost MulCost = Mul->computeCost(VF, Ctx); + InstructionCost RedCost = Red->computeCost(VF, Ctx); + InstructionCost ExtCost = 0; + if (Ext0) + ExtCost += Ext0->computeCost(VF, Ctx); + if (Ext1) + ExtCost += Ext1->computeCost(VF, Ctx); + if (OuterExt) + ExtCost += OuterExt->computeCost(VF, Ctx); + + return MulAccCost.isValid() && + MulAccCost < ExtCost + MulCost + RedCost; + }, + Range); + }; + + VPValue *VecOp = Red->getVecOp(); + VPValue *A, *B; + // Try to match reduce.add(mul(...)). 
+ if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) { + auto *RecipeA = + dyn_cast_if_present(A->getDefiningRecipe()); + auto *RecipeB = + dyn_cast_if_present(B->getDefiningRecipe()); + auto *Mul = cast(VecOp->getDefiningRecipe()); + + // Match reduce.add(mul(ext, ext)). + if (RecipeA && RecipeB && + (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) && + match(RecipeA, m_ZExtOrSExt(m_VPValue())) && + match(RecipeB, m_ZExtOrSExt(m_VPValue())) && + IsMulAccValidAndClampRange(RecipeA->getOpcode() == + Instruction::CastOps::ZExt, + Mul, RecipeA, RecipeB, nullptr)) + return new VPMulAccumulateReductionRecipe(Red, Mul, RecipeA, RecipeB, + RecipeA->getResultType()); + // Match reduce.add(mul). + if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) + return new VPMulAccumulateReductionRecipe(Red, Mul, RedTy); + } + // Match reduce.add(ext(mul(ext(A), ext(B)))). + // All extend recipes must have same opcode or A == B + // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))). + if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()), + m_ZExtOrSExt(m_VPValue()))))) { + auto *Ext = cast(VecOp->getDefiningRecipe()); + auto *Mul = cast(Ext->getOperand(0)->getDefiningRecipe()); + auto *Ext0 = + cast(Mul->getOperand(0)->getDefiningRecipe()); + auto *Ext1 = + cast(Mul->getOperand(1)->getDefiningRecipe()); + if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && + Ext0->getOpcode() == Ext1->getOpcode() && + IsMulAccValidAndClampRange(Ext0->getOpcode() == + Instruction::CastOps::ZExt, + Mul, Ext0, Ext1, Ext)) + return new VPMulAccumulateReductionRecipe(Red, Mul, Ext0, Ext1, + Ext->getResultType()); + } + return nullptr; +} + +/// This function tries to create abstract recipes from the reduction recipe for +/// following optimizations and cost estimation. 
+static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, + VPCostContext &Ctx, + VFRange &Range) { + VPReductionRecipe *AbstractR = nullptr; + + if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range)) + AbstractR = MulAcc; + else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range)) + AbstractR = ExtRed; + // Cannot create abstract inloop reduction recipes. + if (!AbstractR) + return; + + AbstractR->insertBefore(Red); + Red->replaceAllUsesWith(AbstractR); +} + +void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range) { + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_deep(Plan.getVectorLoopRegion()))) { + for (VPRecipeBase &R : *VPBB) { + if (auto *Red = dyn_cast(&R)) + tryToCreateAbstractReductionRecipe(Red, Ctx, Range); + } + } +} + void VPlanTransforms::materializeStepVectors(VPlan &Plan) { for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { auto *IVR = dyn_cast(&Phi); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index d284d916633c8..3a1ed7406b383 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -190,6 +190,13 @@ struct VPlanTransforms { /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy); + /// This function converts initial recipes to the abstract recipes and clamps + /// \p Range based on cost model for following optimizations and cost + /// estimations. The converted abstract recipes will lower to concrete + /// recipes before codegen. + static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range); + /// Perform instcombine-like simplifications on recipes in \p Plan. Use \p /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. 
static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 2db4957409c8d..6438c5437b7e3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -62,7 +62,9 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B)))) return B == Plan.getTripCount() && - (match(A, m_ScalarIVSteps(m_CanonicalIV(), m_SpecificInt(1))) || + (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()), + m_SpecificInt(1), + m_Specific(&Plan.getVF()))) || IsWideCanonicalIV(A)); return match(V, m_Binary(m_VPValue(A), m_VPValue(B))) && @@ -107,7 +109,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { // VPReplicateRecipe.IsUniform. They are also uniform across UF parts if // all their operands are invariant. // TODO: Further relax the restrictions. - return R->isUniform() && + return R->isSingleScalar() && (isa(R->getUnderlyingValue())) && all_of(R->operands(), isUniformAcrossVFsAndUFs); }) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 67329a6d6953c..28c1a6af2570b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -37,8 +37,9 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, /// SCEV expression could be constructed. const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE); -/// Returns true if \p VPV is uniform after vectorization. -inline bool isUniformAfterVectorization(const VPValue *VPV) { +/// Returns true if \p VPV is a single scalar, either because it produces the +/// same value for all lanes or only has its first lane used. 
+inline bool isSingleScalar(const VPValue *VPV) { auto PreservesUniformity = [](unsigned Opcode) -> bool { if (Instruction::isBinaryOp(Opcode) || Instruction::isCast(Opcode)) return true; @@ -65,21 +66,19 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) { // lanes. if (RegionOfR && RegionOfR->isReplicator()) return false; - return Rep->isUniform() || - (PreservesUniformity(Rep->getOpcode()) && - all_of(Rep->operands(), isUniformAfterVectorization)); + return Rep->isSingleScalar() || (PreservesUniformity(Rep->getOpcode()) && + all_of(Rep->operands(), isSingleScalar)); } if (isa(VPV)) - return all_of(VPV->getDefiningRecipe()->operands(), - isUniformAfterVectorization); + return all_of(VPV->getDefiningRecipe()->operands(), isSingleScalar); if (auto *WidenR = dyn_cast(VPV)) { return PreservesUniformity(WidenR->getOpcode()) && - all_of(WidenR->operands(), isUniformAfterVectorization); + all_of(WidenR->operands(), isSingleScalar); } if (auto *VPI = dyn_cast(VPV)) return VPI->isSingleScalar() || VPI->isVectorToScalar() || (PreservesUniformity(VPI->getOpcode()) && - all_of(VPI->operands(), isUniformAfterVectorization)); + all_of(VPI->operands(), isSingleScalar)); // VPExpandSCEVRecipes must be placed in the entry and are alway uniform. 
return isa(VPV); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 638156eab7a84..64065edd315f9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -339,6 +339,8 @@ class VPDef { VPInterleaveSC, VPReductionEVLSC, VPReductionSC, + VPMulAccumulateReductionSC, + VPExtendedReductionSC, VPPartialReductionSC, VPReplicateSC, VPScalarIVStepsSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index b8205545a4f5e..75fc76321db2d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -192,8 +192,7 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { if (!verifyPhiRecipes(VPBB)) return false; - // Verify that defs in VPBB dominate all their uses. The current - // implementation is still incomplete. + // Verify that defs in VPBB dominate all their uses. DenseMap RecipeNumbering; unsigned Cnt = 0; for (const VPRecipeBase &R : *VPBB) @@ -220,12 +219,31 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast(U); - // TODO: check dominance of incoming values for phis properly. 
- if (!UI || - isa(UI) || - (isa(UI) && - cast(UI)->getOpcode() == Instruction::PHI)) + if (auto *Phi = dyn_cast(UI)) { + for (unsigned Idx = 0; Idx != Phi->getNumIncoming(); ++Idx) { + VPValue *IncomingVPV = Phi->getIncomingValue(Idx); + if (IncomingVPV != V) + continue; + + const VPBasicBlock *IncomingVPBB = Phi->getIncomingBlock(Idx); + if (VPDT.dominates(VPBB, IncomingVPBB)) + continue; + + errs() << "Incoming def at index " << Idx + << " does not dominate incoming block!\n"; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + VPSlotTracker Tracker(VPBB->getPlan()); + IncomingVPV->getDefiningRecipe()->print(errs(), " ", Tracker); + errs() << "\n does not dominate " << IncomingVPBB->getName() + << " for\n"; + UI->print(errs(), " ", Tracker); +#endif + return false; + } + continue; + } + // TODO: Also verify VPPredInstPHIRecipe. + if (isa(UI)) continue; // If the user is in the same block, check it comes after R in the @@ -437,8 +455,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) { bool llvm::verifyVPlanIsValid(const VPlan &Plan) { VPDominatorTree VPDT; VPDT.recalculate(const_cast(Plan)); - VPTypeAnalysis TypeInfo( - const_cast(Plan).getCanonicalIV()->getScalarType()); + VPTypeAnalysis TypeInfo(Plan); VPlanVerifier Verifier(VPDT, TypeInfo); return Verifier.verify(Plan); } diff --git a/llvm/test/Analysis/CostModel/AArch64/div.ll b/llvm/test/Analysis/CostModel/AArch64/div.ll index 43bd2066ce520..5367344ce573f 100644 --- a/llvm/test/Analysis/CostModel/AArch64/div.ll +++ b/llvm/test/Analysis/CostModel/AArch64/div.ll @@ -123,17 +123,17 @@ define void @sdiv_uniform() { ; CHECK-LABEL: 'sdiv_uniform' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i64_s = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = sdiv <2 x i64> undef, %V2i64_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x 
i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, %V4i64_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:88 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, %V8i64_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i32_s = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, %V2i32_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4i32_s = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, %V4i32_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:88 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, %V8i32_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:168 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, %V16i32_s ; 
CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i16_s = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, %V2i16_s @@ -141,9 +141,9 @@ define void @sdiv_uniform() { ; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, %V4i16_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V8i16_s = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:88 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, %V8i16_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:168 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, %V16i16_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:328 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, %V32i16_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i8_s = shufflevector <2 x i8> poison, <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, %V2i8_s @@ -153,9 +153,9 @@ define void @sdiv_uniform() { ; CHECK-NEXT: Cost Model: Found costs of RThru:88 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, %V8i8_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V16i8_s = shufflevector <16 x i8> 
poison, <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:168 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, %V16i8_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:328 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, %V32i8_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:648 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, %V64i8_s ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; @@ -206,17 +206,17 @@ define void @udiv_uniform() { ; CHECK-LABEL: 'udiv_uniform' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i64_s = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = udiv <2 x i64> undef, %V2i64_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = udiv <4 x i64> undef, %V4i64_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V8i64_s = 
shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:88 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = udiv <8 x i64> undef, %V8i64_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i32_s = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = udiv <2 x i32> undef, %V2i32_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4i32_s = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = udiv <4 x i32> undef, %V4i32_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:88 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = udiv <8 x i32> undef, %V8i32_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:168 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = udiv <16 x i32> undef, %V16i32_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i16_s = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = udiv <2 x i16> undef, %V2i16_s @@ -224,9 +224,9 @@ define void @udiv_uniform() { ; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = udiv <4 x i16> undef, %V4i16_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V8i16_s = 
shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:88 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = udiv <8 x i16> undef, %V8i16_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:168 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = udiv <16 x i16> undef, %V16i16_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:328 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = udiv <32 x i16> undef, %V32i16_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i8_s = shufflevector <2 x i8> poison, <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = udiv <2 x i8> undef, %V2i8_s @@ -236,9 +236,9 @@ define void @udiv_uniform() { ; CHECK-NEXT: Cost Model: Found costs of RThru:88 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = udiv <8 x i8> undef, %V8i8_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V16i8_s = shufflevector <16 x i8> poison, <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:168 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = udiv <16 x i8> undef, %V16i8_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found 
costs of RThru:328 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = udiv <32 x i8> undef, %V32i8_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:648 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = udiv <64 x i8> undef, %V64i8_s ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; diff --git a/llvm/test/Analysis/CostModel/AArch64/rem.ll b/llvm/test/Analysis/CostModel/AArch64/rem.ll index 1a56a27422e1f..d684e3af00b83 100644 --- a/llvm/test/Analysis/CostModel/AArch64/rem.ll +++ b/llvm/test/Analysis/CostModel/AArch64/rem.ll @@ -123,17 +123,17 @@ define void @srem_uniform() { ; CHECK-LABEL: 'srem_uniform' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i64_s = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = srem <2 x i64> undef, %V2i64_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, %V4i64_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, %V8i64_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: 
%V2i32_s = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = srem <2 x i32> undef, %V2i32_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4i32_s = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = srem <4 x i32> undef, %V4i32_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, %V8i32_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, %V16i32_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i16_s = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = srem <2 x i16> undef, %V2i16_s @@ -141,9 +141,9 @@ define void @srem_uniform() { ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = srem <4 x i16> undef, %V4i16_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V8i16_s = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = srem <8 x i16> undef, %V8i16_s -; CHECK-NEXT: Cost Model: Found costs of 2 
for: %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, %V16i16_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:224 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, %V32i16_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i8_s = shufflevector <2 x i8> poison, <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = srem <2 x i8> undef, %V2i8_s @@ -153,9 +153,9 @@ define void @srem_uniform() { ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = srem <8 x i8> undef, %V8i8_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V16i8_s = shufflevector <16 x i8> poison, <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = srem <16 x i8> undef, %V16i8_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:224 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, %V32i8_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer +; CHECK-NEXT: 
Cost Model: Found costs of 1 for: %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:448 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, %V64i8_s ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; @@ -206,17 +206,17 @@ define void @urem_uniform() { ; CHECK-LABEL: 'urem_uniform' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i64_s = shufflevector <2 x i64> poison, <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = urem <2 x i64> undef, %V2i64_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4i64_s = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = urem <4 x i64> undef, %V4i64_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V8i64_s = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = urem <8 x i64> undef, %V8i64_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i32_s = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = urem <2 x i32> undef, %V2i32_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4i32_s = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = urem <4 x i32> undef, %V4i32_s -; CHECK-NEXT: Cost 
Model: Found costs of 2 for: %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V8i32_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = urem <8 x i32> undef, %V8i32_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V16i32_s = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = urem <16 x i32> undef, %V16i32_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i16_s = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = urem <2 x i16> undef, %V2i16_s @@ -224,9 +224,9 @@ define void @urem_uniform() { ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = urem <4 x i16> undef, %V4i16_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V8i16_s = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = urem <8 x i16> undef, %V8i16_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V16i16_s = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = urem <16 x i16> undef, %V16i16_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> 
zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V32i16_s = shufflevector <32 x i16> poison, <32 x i16> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:224 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = urem <32 x i16> undef, %V32i16_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2i8_s = shufflevector <2 x i8> poison, <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = urem <2 x i8> undef, %V2i8_s @@ -236,9 +236,9 @@ define void @urem_uniform() { ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = urem <8 x i8> undef, %V8i8_s ; CHECK-NEXT: Cost Model: Found costs of 1 for: %V16i8_s = shufflevector <16 x i8> poison, <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = urem <16 x i8> undef, %V16i8_s -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V32i8_s = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:224 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = urem <32 x i8> undef, %V32i8_s -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %V64i8_s = shufflevector <64 x i8> poison, <64 x i8> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:448 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = urem <64 x i8> undef, %V64i8_s ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll index 
6175ea48c5631..2902c7b989047 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll @@ -10,38 +10,38 @@ define void @broadcast() { ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 0 for: %v1i16 = shufflevector <1 x i16> undef, <1 x i16> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 0 for: %v1i32 = shufflevector <1 x i32> undef, <1 x i32> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i32 = shufflevector <4 x 
i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 0 for: %v1i64 = shufflevector <1 x i64> undef, <1 x i64> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 0 for: %v1i128 = shufflevector <1 x i128> undef, <1 x i128> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 0 for: %v2i128 = shufflevector <2 x i128> undef, <2 x i128> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 0 for: %v1f16 = shufflevector <1 x half> undef, <1 x half> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 0 for: %v1bf16 = shufflevector <1 x 
bfloat> undef, <1 x bfloat> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = shufflevector <2 x bfloat> undef, <2 x bfloat> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = shufflevector <4 x bfloat> undef, <4 x bfloat> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 0 for: %v1f32 = shufflevector <1 x float> undef, <1 x float> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 0 for: %v1f64 = shufflevector <1 x double> undef, <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer ; 
CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %v1i8 = shufflevector <1 x i8> undef, <1 x i8> undef, <1 x i32> zeroinitializer @@ -171,39 +171,39 @@ define void @broadcast_double() { ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <32 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <64 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v1i16 = shufflevector <1 x i16> undef, <1 x i16> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: 
Found costs of 1 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v1i32 = shufflevector <1 x i32> undef, <1 x i32> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v1i64 = shufflevector <1 x i64> undef, <1 x i64> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v1i128 = shufflevector <1 x i128> undef, <1 x i128> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 0 for: %v2i128 = shufflevector <2 x i128> undef, <2 x i128> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 
SizeLat:2 for: %v1f16 = shufflevector <1 x half> undef, <1 x half> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v1bf16 = shufflevector <1 x bfloat> undef, <1 x bfloat> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = shufflevector <2 x bfloat> undef, <2 x bfloat> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = shufflevector <4 x bfloat> undef, <4 x bfloat> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 
SizeLat:2 for: %v1f32 = shufflevector <1 x float> undef, <1 x float> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v1f64 = shufflevector <1 x double> undef, <1 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %v1i8 = shufflevector <1 x i8> undef, <1 x i8> undef, <2 x i32> zeroinitializer diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll index 068fffb68c85e..dc80267360ea6 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll @@ -25,17 +25,17 @@ define void 
@shuffle() { ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %lv8i16 = load <8 x i16>, ptr undef, align 16 ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv8i16 = shufflevector <8 x i16> %lv8i16, <8 x i16> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %lv16i16 = load <16 x i16>, ptr undef, align 32 -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:0 Lat:2 SizeLat:2 for: %sv16i16 = shufflevector <16 x i16> %lv16i16, <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv16i16 = shufflevector <16 x i16> %lv16i16, <16 x i16> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %lv2i32 = load <2 x i32>, ptr undef, align 8 ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv2i32 = shufflevector <2 x i32> %lv2i32, <2 x i32> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %lv4i32 = load <4 x i32>, ptr undef, align 16 ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv4i32 = shufflevector <4 x i32> %lv4i32, <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %lv8i32 = load <8 x i32>, ptr undef, align 32 -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:0 Lat:2 SizeLat:2 for: %sv8i32 = shufflevector <8 x i32> %lv8i32, <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv8i32 = shufflevector <8 x i32> %lv8i32, <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %lv2i64 = load <2 x i64>, ptr undef, align 16 ; CHECK-NEXT: Cost Model: Found costs of 
RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv2i64 = shufflevector <2 x i64> %lv2i64, <2 x i64> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %lv4i64 = load <4 x i64>, ptr undef, align 32 -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:0 Lat:2 SizeLat:2 for: %sv4i64 = shufflevector <4 x i64> %lv4i64, <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv4i64 = shufflevector <4 x i64> %lv4i64, <4 x i64> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %lv2f16 = load <2 x half>, ptr undef, align 4 ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv2f16 = shufflevector <2 x half> %lv2f16, <2 x half> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %lv4f16 = load <4 x half>, ptr undef, align 8 @@ -43,17 +43,17 @@ define void @shuffle() { ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %lv8f16 = load <8 x half>, ptr undef, align 16 ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv8f16 = shufflevector <8 x half> %lv8f16, <8 x half> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %lv16f16 = load <16 x half>, ptr undef, align 32 -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:0 Lat:2 SizeLat:2 for: %sv16f16 = shufflevector <16 x half> %lv16f16, <16 x half> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv16f16 = shufflevector <16 x half> %lv16f16, <16 x half> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %lv2f32 = load <2 x float>, ptr undef, align 8 ; CHECK-NEXT: Cost Model: Found costs 
of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv2f32 = shufflevector <2 x float> %lv2f32, <2 x float> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %lv4f32 = load <4 x float>, ptr undef, align 16 ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv4f32 = shufflevector <4 x float> %lv4f32, <4 x float> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %lv8f32 = load <8 x float>, ptr undef, align 32 -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:0 Lat:2 SizeLat:2 for: %sv8f32 = shufflevector <8 x float> %lv8f32, <8 x float> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv8f32 = shufflevector <8 x float> %lv8f32, <8 x float> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %lv2f64 = load <2 x double>, ptr undef, align 16 ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv2f64 = shufflevector <2 x double> %lv2f64, <2 x double> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %lv4f64 = load <4 x double>, ptr undef, align 32 -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:0 Lat:2 SizeLat:2 for: %sv4f64 = shufflevector <4 x double> %lv4f64, <4 x double> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:0 Lat:1 SizeLat:1 for: %sv4f64 = shufflevector <4 x double> %lv4f64, <4 x double> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %lv2i8 = load <2 x i8>, ptr undef diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll index d152ef1caa672..8d68781d0c1b7 100644 --- 
a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll @@ -12,7 +12,7 @@ define void @shuffle() { ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found costs of 2 for: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 8 for: %v13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of 16 for: %v10b = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 8 for: %v10b = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v14 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found costs of 2 for: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> @@ -365,7 +365,7 @@ define void @multipart() { ; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16c = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16d = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v32a = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 16 for: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 16 
for: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-store.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-store.ll index 00030daaaa97b..96dc57936c65b 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-store.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-store.ll @@ -239,39 +239,39 @@ define void @splatstore(ptr %p) { ; CHECK-NEXT: Cost Model: Found costs of 1 for: store <8 x i8> %v8i8, ptr %p, align 8 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: store <16 x i8> %v16i8, ptr %p, align 16 -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:1 SizeLat:2 for: store <32 x i8> %v32i8, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:1 SizeLat:4 for: store <64 x i8> %v64i8, ptr %p, align 64 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: store <4 x i16> %v4i16, ptr %p, align 8 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: store <8 x i16> %v8i16, ptr %p, align 16 -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16i16 = 
shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:1 SizeLat:2 for: store <16 x i16> %v16i16, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:1 SizeLat:4 for: store <32 x i16> %v32i16, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found costs of 8 for: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:1 SizeLat:8 for: store <64 x i16> %v64i16, ptr %p, align 128 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: store <4 x i32> %v4i32, ptr %p, align 16 -; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:1 SizeLat:2 for: store <8 x i32> %v8i32, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> 
zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:1 SizeLat:4 for: store <16 x i32> %v16i32, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found costs of 8 for: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:1 SizeLat:8 for: store <32 x i32> %v32i32, ptr %p, align 128 -; CHECK-NEXT: Cost Model: Found costs of 16 for: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:16 Lat:1 SizeLat:16 for: store <64 x i32> %v64i32, ptr %p, align 256 ; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:1 SizeLat:2 for: store <4 x i64> %v4i64, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found costs of 4 for: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:1 SizeLat:4 for: store <8 x i64> %v8i64, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found costs of 8 for: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:1 SizeLat:8 for: store <16 x i64> %v16i64, ptr %p, align 128 -; CHECK-NEXT: Cost 
Model: Found costs of 16 for: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v32i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <32 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:16 Lat:1 SizeLat:16 for: store <32 x i64> %v32i64, ptr %p, align 256 -; CHECK-NEXT: Cost Model: Found costs of 32 for: %v64i64 = shufflevector <64 x i64> undef, <64 x i64> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %v64i64 = shufflevector <64 x i64> undef, <64 x i64> undef, <64 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:32 Lat:1 SizeLat:32 for: store <64 x i64> %v64i64, ptr %p, align 512 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; diff --git a/llvm/test/Assembler/amdgcn-unreachable.ll b/llvm/test/Assembler/amdgcn-unreachable.ll new file mode 100644 index 0000000000000..487a716e69e44 --- /dev/null +++ b/llvm/test/Assembler/amdgcn-unreachable.ll @@ -0,0 +1,32 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +declare amdgpu_cs_chain void @callee() nounwind +declare void @llvm.amdgcn.cs.chain.p0.i64.i32.i32(ptr, i64, i32, i32, i32 immarg, ...) +declare void @llvm.amdgcn.unreachable() +declare void @llvm.dbg.value(metadata, metadata, metadata) + +; @llvm.amdgcn.unreachable is legal after @llvm.amdgcn.cs.chain +; CHECK: define amdgpu_cs_chain void @test_cc_chain_unreachable(i32 %val) +define amdgpu_cs_chain void @test_cc_chain_unreachable(i32 %val) { +tail.block: + %.cond = icmp ne i32 %val, 0 + br i1 %.cond, label %chain.block, label %UnifiedReturnBlock + +chain.block: + call void (ptr, i64, i32, i32, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i64.i32.i32(ptr @callee, i64 -1, i32 inreg 1, i32 2, i32 1, i32 inreg 32, i32 inreg -1, ptr @callee) + call void @llvm.amdgcn.unreachable() + br label %UnifiedReturnBlock + +UnifiedReturnBlock: + ret void +} + +; debug instructions should be ignored +; CHECK: define amdgpu_cs_chain void @test_cc_chain_unreachable_debug(i32 %val) +define amdgpu_cs_chain void @test_cc_chain_unreachable_debug(i32 %val) { +init: + call void (ptr, i64, i32, i32, i32, ...) @llvm.amdgcn.cs.chain.p0.i64.i32.i32(ptr @callee, i64 -1, i32 inreg 1, i32 2, i32 1, i32 inreg 32, i32 inreg -1, ptr @callee) + call void @llvm.dbg.value(metadata i32 0, metadata !{}, metadata !DIExpression()) + call void @llvm.amdgcn.unreachable() + ret void +} diff --git a/llvm/test/Assembler/autoupgrade-thread-pointer.ll b/llvm/test/Assembler/autoupgrade-thread-pointer.ll index b1ed15a7e4ef8..178e31f50b1bf 100644 --- a/llvm/test/Assembler/autoupgrade-thread-pointer.ll +++ b/llvm/test/Assembler/autoupgrade-thread-pointer.ll @@ -6,14 +6,14 @@ declare ptr @llvm.arm.thread.pointer() define ptr @test1() { ; CHECK-LABEL: define ptr @test1() -; CHECK: call ptr @llvm.thread.pointer() +; CHECK: call ptr @llvm.thread.pointer.p0() %1 = call ptr @llvm.aarch64.thread.pointer() ret ptr %1 } define ptr @test2() { ; CHECK-LABEL: define ptr @test2() -; CHECK: call ptr @llvm.thread.pointer() +; CHECK: call ptr @llvm.thread.pointer.p0() %1 = call ptr @llvm.arm.thread.pointer() ret ptr %1 } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-const.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-const.mir new file mode 100644 index 0000000000000..5d88bb08ebe72 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-const.mir @@ -0,0 +1,27 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -passes="print" %s -o - 2>&1 | FileCheck %s + +--- +name: Cst +tracksRegLiveness: true +body: | + bb.1: + ; CHECK-LABEL: name: @Cst + ; 
CHECK-NEXT: %0:_ KnownBits:00000001 SignBits:7 + ; CHECK-NEXT: %1:_ KnownBits:00000001 SignBits:7 + %0:_(s8) = G_CONSTANT i8 1 + %1:_(s8) = COPY %0 +... +--- +name: CstWithClass +# We can't analyze %0 due to the lack of an LLT. We will get a default +# constructed KnownBits back. %0 will have the correct size but we will +# not know any further info. +tracksRegLiveness: true +body: | + bb.1: + ; CHECK-LABEL: name: @CstWithClass + ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1 + %0:gpr32 = MOVi32imm 1 + %1:_(s32) = COPY %0 +... diff --git a/llvm/test/CodeGen/AArch64/aarch64-sme-stubs.ll b/llvm/test/CodeGen/AArch64/aarch64-sme-stubs.ll new file mode 100644 index 0000000000000..f7182e2a166a5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-sme-stubs.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s + +; Checks SME ABI routines can be implemented as stubs without +sme. 
+ +define i1 @__aarch64_sme_accessible() { +; CHECK-LABEL: __aarch64_sme_accessible: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: ret +entry: + ret i1 true +} + +define [2 x i64] @__arm_sme_state() { +; CHECK-LABEL: __arm_sme_state: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: mov x1, xzr +; CHECK-NEXT: ret +entry: + ret [2 x i64] zeroinitializer +} + +define void @__arm_tpidr2_restore() { +; CHECK-LABEL: __arm_tpidr2_restore: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ret +entry: + ret void +} + +define void @__arm_tpidr2_save() { +; CHECK-LABEL: __arm_tpidr2_save: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ret +entry: + ret void +} + +define void @__arm_za_disable() { +; CHECK-LABEL: __arm_za_disable: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ret +entry: + ret void +} diff --git a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll index b944194dae8fc..f364429b86c38 100644 --- a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll +++ b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-linux-gnu -o - -global-isel %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI ;; Check that the llvm aarch64 backend can handle arrays of ;; structs and vice versa when passed from IR. 
@@ -49,13 +50,22 @@ define [ 8 x double ] @array_8() { ;; > 8 items goes on the stack define [ 9 x double ] @array_9() { -; CHECK-LABEL: array_9: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: str xzr, [x8, #64] -; CHECK-NEXT: stp q0, q0, [x8] -; CHECK-NEXT: stp q0, q0, [x8, #32] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: array_9: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: str xzr, [x8, #64] +; CHECK-SD-NEXT: stp q0, q0, [x8] +; CHECK-SD-NEXT: stp q0, q0, [x8, #32] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: array_9: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: stp xzr, xzr, [x8] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #16] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #32] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #48] +; CHECK-GI-NEXT: str xzr, [x8, #64] +; CHECK-GI-NEXT: ret ret [ 9 x double ] zeroinitializer } @@ -229,13 +239,22 @@ define [ 4 x %T_STRUCT_SAMEM ] @array_of_struct_8_fields() { ;; 5x2 fields = 10 so it is returned in memory. 
define [ 5 x %T_STRUCT_SAMEM ] @array_of_struct_in_memory() { -; CHECK-LABEL: array_of_struct_in_memory: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: stp q0, q0, [x8, #16] -; CHECK-NEXT: stp q0, q0, [x8, #48] -; CHECK-NEXT: str q0, [x8] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: array_of_struct_in_memory: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: stp q0, q0, [x8, #16] +; CHECK-SD-NEXT: stp q0, q0, [x8, #48] +; CHECK-SD-NEXT: str q0, [x8] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: array_of_struct_in_memory: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: stp xzr, xzr, [x8] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #16] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #32] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #48] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #64] +; CHECK-GI-NEXT: ret ret [ 5 x %T_STRUCT_SAMEM ] zeroinitializer } @@ -347,13 +366,22 @@ define [ 1 x %T_NESTED_STRUCT_SAMEM ] @array_of_struct_nested_same_field_types() ;; 2 x (1 + (2 x 2)) = 10 so this is returned in memory define [ 2 x %T_NESTED_STRUCT_SAMEM ] @array_of_struct_nested_same_field_types_2() { -; CHECK-LABEL: array_of_struct_nested_same_field_types_2: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: stp q0, q0, [x8, #16] -; CHECK-NEXT: stp q0, q0, [x8, #48] -; CHECK-NEXT: str q0, [x8] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: array_of_struct_nested_same_field_types_2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: stp q0, q0, [x8, #16] +; CHECK-SD-NEXT: stp q0, q0, [x8, #48] +; CHECK-SD-NEXT: str q0, [x8] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: array_of_struct_nested_same_field_types_2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: stp xzr, xzr, [x8] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #16] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #32] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #48] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #64] +; CHECK-GI-NEXT: ret ret [ 2 x %T_NESTED_STRUCT_SAMEM ] 
zeroinitializer } @@ -377,51 +405,94 @@ define %T_IN_BLOCK @return_in_block() { @in_block_store = dso_local global %T_IN_BLOCK zeroinitializer, align 8 define void @caller_in_block() { -; CHECK-LABEL: caller_in_block: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl return_in_block -; CHECK-NEXT: adrp x8, in_block_store -; CHECK-NEXT: add x8, x8, :lo12:in_block_store -; CHECK-NEXT: stp d0, d1, [x8] -; CHECK-NEXT: stp d2, d3, [x8, #16] -; CHECK-NEXT: stp d4, d5, [x8, #32] -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: caller_in_block: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: bl return_in_block +; CHECK-SD-NEXT: adrp x8, in_block_store +; CHECK-SD-NEXT: add x8, x8, :lo12:in_block_store +; CHECK-SD-NEXT: stp d0, d1, [x8] +; CHECK-SD-NEXT: stp d2, d3, [x8, #16] +; CHECK-SD-NEXT: stp d4, d5, [x8, #32] +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: caller_in_block: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: bl return_in_block +; CHECK-GI-NEXT: adrp x8, in_block_store +; CHECK-GI-NEXT: str d0, [x8, :lo12:in_block_store] +; CHECK-GI-NEXT: adrp x8, in_block_store +; CHECK-GI-NEXT: add x8, x8, :lo12:in_block_store +; CHECK-GI-NEXT: stp d1, d2, [x8, #8] +; CHECK-GI-NEXT: stp d3, d4, [x8, #24] +; CHECK-GI-NEXT: str d5, [x8, #40] +; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret %1 = call %T_IN_BLOCK @return_in_block() store %T_IN_BLOCK %1, ptr @in_block_store ret void } define void @callee_in_block(%T_IN_BLOCK %a) { -; CHECK-LABEL: callee_in_block: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, in_block_store -; CHECK-NEXT: add x8, x8, :lo12:in_block_store -; CHECK-NEXT: stp d4, d5, [x8, #32] -; CHECK-NEXT: stp d2, d3, [x8, #16] -; CHECK-NEXT: stp d0, d1, [x8] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: callee_in_block: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, in_block_store +; CHECK-SD-NEXT: add x8, x8, :lo12:in_block_store +; CHECK-SD-NEXT: stp d4, d5, [x8, #32] +; CHECK-SD-NEXT: stp d2, d3, [x8, #16] +; CHECK-SD-NEXT: stp d0, d1, [x8] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: callee_in_block: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, in_block_store +; CHECK-GI-NEXT: str d0, [x8, :lo12:in_block_store] +; CHECK-GI-NEXT: adrp x8, in_block_store +; CHECK-GI-NEXT: add x8, x8, :lo12:in_block_store +; CHECK-GI-NEXT: stp d1, d2, [x8, #8] +; CHECK-GI-NEXT: stp d3, d4, [x8, #24] +; CHECK-GI-NEXT: str d5, [x8, #40] +; CHECK-GI-NEXT: ret store %T_IN_BLOCK %a, ptr @in_block_store ret void } define void @argument_in_block() { -; CHECK-LABEL: argument_in_block: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: adrp x8, in_block_store -; CHECK-NEXT: add x8, x8, :lo12:in_block_store -; CHECK-NEXT: ldp d4, d5, [x8, #32] -; CHECK-NEXT: ldp d2, d3, [x8, #16] -; CHECK-NEXT: ldp d0, d1, [x8] -; CHECK-NEXT: bl callee_in_block -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: argument_in_block: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: adrp x8, in_block_store +; CHECK-SD-NEXT: add x8, x8, :lo12:in_block_store +; CHECK-SD-NEXT: ldp d4, d5, [x8, #32] +; CHECK-SD-NEXT: ldp d2, d3, [x8, #16] +; CHECK-SD-NEXT: ldp d0, d1, [x8] +; CHECK-SD-NEXT: bl callee_in_block +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: argument_in_block: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: adrp x9, in_block_store +; CHECK-GI-NEXT: add x9, x9, :lo12:in_block_store +; CHECK-GI-NEXT: adrp x8, in_block_store +; CHECK-GI-NEXT: ldp d1, d2, [x9, #8] +; CHECK-GI-NEXT: ldr d0, [x8, :lo12:in_block_store] +; CHECK-GI-NEXT: ldp d3, d4, [x9, #24] +; CHECK-GI-NEXT: ldr d5, [x9, #40] +; CHECK-GI-NEXT: bl callee_in_block +; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret %1 = load %T_IN_BLOCK, ptr @in_block_store call void @callee_in_block(%T_IN_BLOCK %1) ret void @@ -430,82 +501,157 @@ define void @argument_in_block() { %T_IN_MEMORY = type [ 3 x { double, { double, double } } ] define %T_IN_MEMORY @return_in_memory() { -; CHECK-LABEL: return_in_memory: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: str xzr, [x8, #64] -; CHECK-NEXT: stp q0, q0, [x8] -; CHECK-NEXT: stp q0, q0, [x8, #32] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: return_in_memory: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: str xzr, [x8, #64] +; CHECK-SD-NEXT: stp q0, q0, [x8] +; CHECK-SD-NEXT: stp q0, q0, [x8, #32] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: return_in_memory: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: stp xzr, xzr, [x8] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #16] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #32] +; CHECK-GI-NEXT: stp xzr, xzr, [x8, #48] +; CHECK-GI-NEXT: str xzr, [x8, #64] +; CHECK-GI-NEXT: ret ret %T_IN_MEMORY zeroinitializer } @in_memory_store = dso_local global %T_IN_MEMORY zeroinitializer, align 8 define void @caller_in_memory() { -; CHECK-LABEL: caller_in_memory: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 -; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: add x8, sp, #8 -; CHECK-NEXT: bl return_in_memory -; CHECK-NEXT: ldur q0, [sp, #24] -; 
CHECK-NEXT: ldur q1, [sp, #8] -; CHECK-NEXT: adrp x8, in_memory_store -; CHECK-NEXT: add x8, x8, :lo12:in_memory_store -; CHECK-NEXT: ldr d2, [sp, #72] -; CHECK-NEXT: ldur q3, [sp, #56] -; CHECK-NEXT: ldur q4, [sp, #40] -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-NEXT: stp q1, q0, [x8] -; CHECK-NEXT: str d2, [x8, #64] -; CHECK-NEXT: stp q4, q3, [x8, #32] -; CHECK-NEXT: add sp, sp, #96 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: caller_in_memory: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #96 +; CHECK-SD-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 96 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: add x8, sp, #8 +; CHECK-SD-NEXT: bl return_in_memory +; CHECK-SD-NEXT: ldur q0, [sp, #24] +; CHECK-SD-NEXT: ldur q1, [sp, #8] +; CHECK-SD-NEXT: adrp x8, in_memory_store +; CHECK-SD-NEXT: add x8, x8, :lo12:in_memory_store +; CHECK-SD-NEXT: ldr d2, [sp, #72] +; CHECK-SD-NEXT: ldur q3, [sp, #56] +; CHECK-SD-NEXT: ldur q4, [sp, #40] +; CHECK-SD-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-SD-NEXT: stp q1, q0, [x8] +; CHECK-SD-NEXT: str d2, [x8, #64] +; CHECK-SD-NEXT: stp q4, q3, [x8, #32] +; CHECK-SD-NEXT: add sp, sp, #96 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: caller_in_memory: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #96 +; CHECK-GI-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 96 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: bl return_in_memory +; CHECK-GI-NEXT: ldp x8, x9, [sp, #8] +; CHECK-GI-NEXT: adrp x10, in_memory_store +; CHECK-GI-NEXT: ldp x11, x12, [sp, #24] +; CHECK-GI-NEXT: ldp x13, x14, [sp, #40] +; CHECK-GI-NEXT: ldp x15, x16, [sp, #56] +; CHECK-GI-NEXT: ldp x17, x30, [sp, #72] // 8-byte Folded Reload +; CHECK-GI-NEXT: str x8, [x10, :lo12:in_memory_store] +; CHECK-GI-NEXT: adrp x8, in_memory_store +; CHECK-GI-NEXT: add x8, x8, :lo12:in_memory_store +; CHECK-GI-NEXT: stp x9, 
x11, [x8, #8] +; CHECK-GI-NEXT: stp x12, x13, [x8, #24] +; CHECK-GI-NEXT: stp x14, x15, [x8, #40] +; CHECK-GI-NEXT: stp x16, x17, [x8, #56] +; CHECK-GI-NEXT: add sp, sp, #96 +; CHECK-GI-NEXT: ret %1 = call %T_IN_MEMORY @return_in_memory() store %T_IN_MEMORY %1, ptr @in_memory_store ret void } define void @callee_in_memory(%T_IN_MEMORY %a) { -; CHECK-LABEL: callee_in_memory: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q2, [sp, #32] -; CHECK-NEXT: adrp x8, in_memory_store -; CHECK-NEXT: add x8, x8, :lo12:in_memory_store -; CHECK-NEXT: ldr d0, [sp, #64] -; CHECK-NEXT: str d0, [x8, #64] -; CHECK-NEXT: ldr q0, [sp, #16] -; CHECK-NEXT: str q2, [x8, #48] -; CHECK-NEXT: ldr q2, [sp] -; CHECK-NEXT: stp q0, q1, [x8, #16] -; CHECK-NEXT: str q2, [x8] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: callee_in_memory: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldp q1, q2, [sp, #32] +; CHECK-SD-NEXT: adrp x8, in_memory_store +; CHECK-SD-NEXT: add x8, x8, :lo12:in_memory_store +; CHECK-SD-NEXT: ldr d0, [sp, #64] +; CHECK-SD-NEXT: str d0, [x8, #64] +; CHECK-SD-NEXT: ldr q0, [sp, #16] +; CHECK-SD-NEXT: str q2, [x8, #48] +; CHECK-SD-NEXT: ldr q2, [sp] +; CHECK-SD-NEXT: stp q0, q1, [x8, #16] +; CHECK-SD-NEXT: str q2, [x8] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: callee_in_memory: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldp x8, x9, [sp] +; CHECK-GI-NEXT: adrp x10, in_memory_store +; CHECK-GI-NEXT: ldp x11, x12, [sp, #16] +; CHECK-GI-NEXT: str x8, [x10, :lo12:in_memory_store] +; CHECK-GI-NEXT: adrp x8, in_memory_store +; CHECK-GI-NEXT: add x8, x8, :lo12:in_memory_store +; CHECK-GI-NEXT: stp x9, x11, [x8, #8] +; CHECK-GI-NEXT: ldp x9, x10, [sp, #32] +; CHECK-GI-NEXT: stp x12, x9, [x8, #24] +; CHECK-GI-NEXT: ldp x9, x11, [sp, #48] +; CHECK-GI-NEXT: str x10, [x8, #40] +; CHECK-GI-NEXT: ldr x10, [sp, #64] +; CHECK-GI-NEXT: stp x9, x11, [x8, #48] +; CHECK-GI-NEXT: str x10, [x8, #64] +; CHECK-GI-NEXT: ret store %T_IN_MEMORY %a, ptr @in_memory_store ret void } define void @argument_in_memory() { -; 
CHECK-LABEL: argument_in_memory: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 -; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: adrp x8, in_memory_store -; CHECK-NEXT: add x8, x8, :lo12:in_memory_store -; CHECK-NEXT: ldp q0, q1, [x8] -; CHECK-NEXT: ldr d4, [x8, #64] -; CHECK-NEXT: ldp q2, q3, [x8, #32] -; CHECK-NEXT: str d4, [sp, #64] -; CHECK-NEXT: stp q0, q1, [sp] -; CHECK-NEXT: stp q2, q3, [sp, #32] -; CHECK-NEXT: bl callee_in_memory -; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: argument_in_memory: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #96 +; CHECK-SD-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 96 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: adrp x8, in_memory_store +; CHECK-SD-NEXT: add x8, x8, :lo12:in_memory_store +; CHECK-SD-NEXT: ldp q0, q1, [x8] +; CHECK-SD-NEXT: ldr d4, [x8, #64] +; CHECK-SD-NEXT: ldp q2, q3, [x8, #32] +; CHECK-SD-NEXT: str d4, [sp, #64] +; CHECK-SD-NEXT: stp q0, q1, [sp] +; CHECK-SD-NEXT: stp q2, q3, [sp, #32] +; CHECK-SD-NEXT: bl callee_in_memory +; CHECK-SD-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #96 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: argument_in_memory: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #96 +; CHECK-GI-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 96 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: adrp x9, in_memory_store +; CHECK-GI-NEXT: add x9, x9, :lo12:in_memory_store +; CHECK-GI-NEXT: adrp x8, in_memory_store +; CHECK-GI-NEXT: ldp x10, x11, [x9, #8] +; CHECK-GI-NEXT: ldr x8, [x8, :lo12:in_memory_store] +; CHECK-GI-NEXT: ldp x12, x13, [x9, #24] +; CHECK-GI-NEXT: ldp x14, x15, [x9, #40] +; CHECK-GI-NEXT: ldp x16, x9, [x9, #56] +; CHECK-GI-NEXT: stp x8, x10, [sp] +; 
CHECK-GI-NEXT: stp x11, x12, [sp, #16] +; CHECK-GI-NEXT: stp x13, x14, [sp, #32] +; CHECK-GI-NEXT: stp x15, x16, [sp, #48] +; CHECK-GI-NEXT: str x9, [sp, #64] +; CHECK-GI-NEXT: bl callee_in_memory +; CHECK-GI-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #96 +; CHECK-GI-NEXT: ret %1 = load %T_IN_MEMORY, ptr @in_memory_store call void @callee_in_memory(%T_IN_MEMORY %1) ret void @@ -527,54 +673,97 @@ define %T_NO_BLOCK @return_no_block() { @no_block_store = dso_local global %T_NO_BLOCK zeroinitializer, align 8 define void @caller_no_block() { -; CHECK-LABEL: caller_no_block: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl return_no_block -; CHECK-NEXT: adrp x8, no_block_store -; CHECK-NEXT: add x8, x8, :lo12:no_block_store -; CHECK-NEXT: str d0, [x8] -; CHECK-NEXT: str w0, [x8, #8] -; CHECK-NEXT: str d1, [x8, #16] -; CHECK-NEXT: str w1, [x8, #24] -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: caller_no_block: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: bl return_no_block +; CHECK-SD-NEXT: adrp x8, no_block_store +; CHECK-SD-NEXT: add x8, x8, :lo12:no_block_store +; CHECK-SD-NEXT: str d0, [x8] +; CHECK-SD-NEXT: str w0, [x8, #8] +; CHECK-SD-NEXT: str d1, [x8, #16] +; CHECK-SD-NEXT: str w1, [x8, #24] +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: caller_no_block: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: bl return_no_block +; CHECK-GI-NEXT: adrp x8, no_block_store +; CHECK-GI-NEXT: str d0, [x8, :lo12:no_block_store] +; CHECK-GI-NEXT: adrp x8, no_block_store +; CHECK-GI-NEXT: add x8, x8, :lo12:no_block_store +; CHECK-GI-NEXT: str w0, [x8, #8] +; CHECK-GI-NEXT: str d1, [x8, #16] +; CHECK-GI-NEXT: str w1, [x8, #24] +; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret %1 = call %T_NO_BLOCK @return_no_block() store %T_NO_BLOCK %1, ptr @no_block_store ret void } define void @callee_no_block(%T_NO_BLOCK %a) { -; CHECK-LABEL: callee_no_block: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, no_block_store -; CHECK-NEXT: add x8, x8, :lo12:no_block_store -; CHECK-NEXT: str w1, [x8, #24] -; CHECK-NEXT: str d1, [x8, #16] -; CHECK-NEXT: str w0, [x8, #8] -; CHECK-NEXT: str d0, [x8] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: callee_no_block: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, no_block_store +; CHECK-SD-NEXT: add x8, x8, :lo12:no_block_store +; CHECK-SD-NEXT: str w1, [x8, #24] +; CHECK-SD-NEXT: str d1, [x8, #16] +; CHECK-SD-NEXT: str w0, [x8, #8] +; CHECK-SD-NEXT: str d0, [x8] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: callee_no_block: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, no_block_store +; CHECK-GI-NEXT: str d0, [x8, :lo12:no_block_store] +; CHECK-GI-NEXT: adrp x8, no_block_store +; CHECK-GI-NEXT: add x8, x8, :lo12:no_block_store +; CHECK-GI-NEXT: str w0, [x8, #8] +; CHECK-GI-NEXT: str d1, [x8, #16] +; CHECK-GI-NEXT: str w1, [x8, #24] +; CHECK-GI-NEXT: ret store %T_NO_BLOCK %a, ptr @no_block_store ret void } define void @argument_no_block() { -; CHECK-LABEL: argument_no_block: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: adrp x8, no_block_store -; CHECK-NEXT: add x8, x8, :lo12:no_block_store -; CHECK-NEXT: ldr w1, [x8, #24] -; CHECK-NEXT: ldr d1, [x8, #16] -; CHECK-NEXT: ldr w0, [x8, #8] -; CHECK-NEXT: ldr d0, [x8] -; CHECK-NEXT: bl callee_no_block -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-SD-LABEL: argument_no_block: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: adrp x8, no_block_store +; CHECK-SD-NEXT: add x8, x8, :lo12:no_block_store +; CHECK-SD-NEXT: ldr w1, [x8, #24] +; CHECK-SD-NEXT: ldr d1, [x8, #16] +; CHECK-SD-NEXT: ldr w0, [x8, #8] +; CHECK-SD-NEXT: ldr d0, [x8] +; CHECK-SD-NEXT: bl callee_no_block +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: argument_no_block: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: adrp x8, no_block_store +; CHECK-GI-NEXT: adrp x9, no_block_store +; CHECK-GI-NEXT: add x9, x9, :lo12:no_block_store +; CHECK-GI-NEXT: ldr d0, [x8, :lo12:no_block_store] +; CHECK-GI-NEXT: ldr w0, [x9, #8] +; CHECK-GI-NEXT: ldr d1, [x9, #16] +; CHECK-GI-NEXT: ldr w1, [x9, #24] +; CHECK-GI-NEXT: bl callee_no_block +; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret %1 = load %T_NO_BLOCK, ptr @no_block_store call void @callee_no_block(%T_NO_BLOCK %1) ret void diff --git a/llvm/test/CodeGen/AArch64/arm64-arith-saturating.ll b/llvm/test/CodeGen/AArch64/arm64-arith-saturating.ll index e2d530ab421ef..07c4dbcf41096 100644 --- a/llvm/test/CodeGen/AArch64/arm64-arith-saturating.ll +++ b/llvm/test/CodeGen/AArch64/arm64-arith-saturating.ll @@ -193,7 +193,7 @@ define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone { define i32 @uqxtn_ext(<4 x i32> noundef %a, <4 x i32> noundef %b, i32 %c, float %d, <2 x i64> %e) { ; CHECK-LABEL: uqxtn_ext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov v0.d[0], v3.d[1] +; CHECK-NEXT: mov d0, v3.d[1] ; CHECK-NEXT: uqxtn s0, d0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -219,7 +219,7 @@ entry: define <4 x i32> @sqxtun_insext(<4 x i32> noundef %a, <2 x i64> %e) { ; CHECK-LABEL: sqxtun_insext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov v1.d[0], v1.d[1] +; CHECK-NEXT: mov d1, v1.d[1] ; CHECK-NEXT: sqxtun s1, d1 ; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll index 2f543cc324bc2..a7f9ca8d73c1f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll @@ -95,6 +95,7 @@ ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli4h ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli2s ; CHECK-GI-NEXT: warning: 
Instruction selection used fallback path for sli1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli1d_imm0 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli16b ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli8h ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli4s @@ -4088,6 +4089,16 @@ define <1 x i64> @sli1d(ptr %A, ptr %B) nounwind { ret <1 x i64> %tmp3 } +; Ensure we can select scalar SLI with a zero shift (see issue #139879). +define <1 x i64> @sli1d_imm0(<1 x i64> %a, <1 x i64> %b) { +; CHECK-LABEL: sli1d_imm0: +; CHECK: // %bb.0: +; CHECK-NEXT: sli d0, d1, #0 +; CHECK-NEXT: ret + %r = call <1 x i64> @llvm.aarch64.neon.vsli(<1 x i64> %a, <1 x i64> %b, i32 0) + ret <1 x i64> %r +} + define <16 x i8> @sli16b(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: sli16b: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll index 20ff5fc5bc5e1..f964484c0c2d4 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll @@ -24,11 +24,15 @@ define void @has_varargs(...) hybrid_patchable nounwind { ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: "#has_varargs$hp_target": // @"#has_varargs$hp_target" ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: stp x0, x1, [x4, #-32] -; CHECK-NEXT: stp x2, x3, [x4, #-16] -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: stp x0, x1, [x4, #-32]! 
+; CHECK-NEXT: stp x2, x3, [x4, #16] +; CHECK-NEXT: str x4, [sp, #8] +; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret + %valist = alloca ptr + call void @llvm.va_start(ptr %valist) + call void @llvm.va_end(ptr %valist) ret void } diff --git a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll index 844fc52ddade6..5796b6f3216a7 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll @@ -102,40 +102,58 @@ define void @varargs_many_argscalleer() nounwind { define void @varargs_caller_tail() nounwind { ; CHECK-LABEL: varargs_caller_tail: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: mov x4, sp -; CHECK-NEXT: add x8, sp, #16 -; CHECK-NEXT: mov x9, #4617315517961601024 // =0x4014000000000000 -; CHECK-NEXT: mov x0, #4607182418800017408 // =0x3ff0000000000000 -; CHECK-NEXT: mov w1, #2 // =0x2 -; CHECK-NEXT: mov x2, #4613937818241073152 // =0x4008000000000000 -; CHECK-NEXT: mov w3, #4 // =0x4 -; CHECK-NEXT: mov w5, #16 // =0x10 -; CHECK-NEXT: stp xzr, x30, [sp, #24] // 8-byte Folded Spill -; CHECK-NEXT: stp x9, x8, [sp] -; CHECK-NEXT: str xzr, [sp, #16] -; CHECK-NEXT: .weak_anti_dep varargs_callee -; CHECK-NEXT:.set varargs_callee, "#varargs_callee"@WEAKREF -; CHECK-NEXT: .weak_anti_dep "#varargs_callee" -; CHECK-NEXT:.set "#varargs_callee", varargs_callee@WEAKREF -; CHECK-NEXT: bl "#varargs_callee" -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: add x4, sp, #48 -; CHECK-NEXT: mov x0, #4607182418800017408 // =0x3ff0000000000000 -; CHECK-NEXT: mov w1, #4 // =0x4 -; CHECK-NEXT: mov w2, #3 // =0x3 -; CHECK-NEXT: mov w3, #2 // =0x2 -; CHECK-NEXT: mov x5, xzr -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: .weak_anti_dep varargs_callee -; CHECK-NEXT:.set varargs_callee, "#varargs_callee"@WEAKREF -; CHECK-NEXT: .weak_anti_dep "#varargs_callee" -; CHECK-NEXT:.set "#varargs_callee", varargs_callee@WEAKREF -; CHECK-NEXT: b "#varargs_callee" +; 
CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: mov x4, sp +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: mov x9, #4617315517961601024 // =0x4014000000000000 +; CHECK-NEXT: mov x0, #4607182418800017408 // =0x3ff0000000000000 +; CHECK-NEXT: mov w1, #2 // =0x2 +; CHECK-NEXT: mov x2, #4613937818241073152 // =0x4008000000000000 +; CHECK-NEXT: mov w3, #4 // =0x4 +; CHECK-NEXT: mov w5, #16 // =0x10 +; CHECK-NEXT: stp xzr, x30, [sp, #24] // 8-byte Folded Spill +; CHECK-NEXT: stp x9, x8, [sp] +; CHECK-NEXT: str xzr, [sp, #16] +; CHECK-NEXT: .weak_anti_dep varargs_callee +; CHECK-NEXT: .set varargs_callee, "#varargs_callee"@WEAKREF +; CHECK-NEXT: .weak_anti_dep "#varargs_callee" +; CHECK-NEXT: .set "#varargs_callee", varargs_callee@WEAKREF +; CHECK-NEXT: bl "#varargs_callee" +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: add x4, sp, #48 +; CHECK-NEXT: mov x0, #4607182418800017408 // =0x3ff0000000000000 +; CHECK-NEXT: mov w1, #4 // =0x4 +; CHECK-NEXT: mov w2, #3 // =0x3 +; CHECK-NEXT: mov w3, #2 // =0x2 +; CHECK-NEXT: mov x5, xzr +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: .weak_anti_dep varargs_callee +; CHECK-NEXT: .set varargs_callee, "#varargs_callee"@WEAKREF +; CHECK-NEXT: .weak_anti_dep "#varargs_callee" +; CHECK-NEXT: .set "#varargs_callee", varargs_callee@WEAKREF +; CHECK-NEXT: b "#varargs_callee" call void (double, ...) @varargs_callee(double 1.0, i32 2, double 3.0, i32 4, double 5.0, <2 x double> ) tail call void (double, ...) @varargs_callee(double 1.0, i32 4, i32 3, i32 2) ret void } -declare void @llvm.va_start(ptr) +; Check we spill/restore x4 and x5, and don't dereference x4. +define void @varargs_thunk(ptr noundef %0, ...) 
"thunk" { +; CHECK-LABEL: varargs_thunk: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x11, [x0] +; CHECK-NEXT: mov x9, x5 +; CHECK-NEXT: mov x10, x4 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr x11, [x11] +; CHECK-NEXT: mov x4, x10 +; CHECK-NEXT: mov x5, x9 +; CHECK-NEXT: br x11 + call void asm "","~{x4},~{x5}"() + %vtable = load ptr, ptr %0, align 8 + %vtablefn = load ptr, ptr %vtable, align 8 + musttail call void (ptr, ...) %vtablefn(ptr noundef %0, ...) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/bitcast-extend.ll b/llvm/test/CodeGen/AArch64/bitcast-extend.ll index 33238ccf86a39..3133d0efb4b9b 100644 --- a/llvm/test/CodeGen/AArch64/bitcast-extend.ll +++ b/llvm/test/CodeGen/AArch64/bitcast-extend.ll @@ -70,8 +70,8 @@ define <4 x i64> @z_i32_v4i64(i32 %x) { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: fmov s0, w0 ; CHECK-SD-NEXT: movi v1.2d, #0x000000000000ff -; CHECK-SD-NEXT: mov v2.b[0], v0.b[0] -; CHECK-SD-NEXT: mov v3.b[0], v0.b[2] +; CHECK-SD-NEXT: mov b2, v0.b[0] +; CHECK-SD-NEXT: mov b3, v0.b[2] ; CHECK-SD-NEXT: mov v2.b[4], v0.b[1] ; CHECK-SD-NEXT: mov v3.b[4], v0.b[3] ; CHECK-SD-NEXT: ushll v0.2d, v2.2s, #0 @@ -172,8 +172,8 @@ define <4 x i64> @s_i32_v4i64(i32 %x) { ; CHECK-SD-LABEL: s_i32_v4i64: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: fmov s0, w0 -; CHECK-SD-NEXT: mov v1.b[0], v0.b[0] -; CHECK-SD-NEXT: mov v2.b[0], v0.b[2] +; CHECK-SD-NEXT: mov b1, v0.b[0] +; CHECK-SD-NEXT: mov b2, v0.b[2] ; CHECK-SD-NEXT: mov v1.b[4], v0.b[1] ; CHECK-SD-NEXT: mov v2.b[4], v0.b[3] ; CHECK-SD-NEXT: ushll v0.2d, v1.2s, #0 diff --git a/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll b/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll index da176894c48a9..5d3c755d0d73d 100644 --- a/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll +++ b/llvm/test/CodeGen/AArch64/darwinpcs-tail.ll @@ -8,8 +8,8 @@ ; CHECK-LABEL: _tailTest: ; CHECK: b __ZN1C3addEPKcz ; CHECK-LABEL: __ZThn8_N1C1fEiiiiiiiiiz: -; CHECK: ldr w8, [sp, #4] -; CHECK: str w8, [sp, #4] +; CHECK: ldr w9, [sp, #4] +; 
CHECK: str w9, [sp, #4] ; CHECK: b __ZN1C1fEiiiiiiiiiz %class.C = type { %class.A.base, [4 x i8], %class.B.base, [4 x i8] } diff --git a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll b/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll index e90b6cb7f809b..65da95e0163f4 100644 --- a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll +++ b/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll @@ -5,7 +5,7 @@ define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECKLE-LABEL: test_reconstructshuffle: ; CHECKLE: // %bb.0: -; CHECKLE-NEXT: mov v2.b[0], v0.b[3] +; CHECKLE-NEXT: mov b2, v0.b[3] ; CHECKLE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECKLE-NEXT: mov v2.b[2], v0.b[2] ; CHECKLE-NEXT: mov v2.b[4], v0.b[1] @@ -21,7 +21,7 @@ define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECKBE-NEXT: rev64 v1.16b, v1.16b ; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECKBE-NEXT: mov v2.b[0], v0.b[3] +; CHECKBE-NEXT: mov b2, v0.b[3] ; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECKBE-NEXT: mov v2.b[2], v0.b[2] ; CHECKBE-NEXT: mov v2.b[4], v0.b[1] diff --git a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll index 97c3a4937cda7..05422d3cc6051 100644 --- a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll @@ -347,9 +347,8 @@ define half @get_lane_64(<4 x half> %a) #0 { ; CHECK-LABEL: get_lane_64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0 ; CHECK-NEXT: ret entry: %0 = bitcast <4 x half> %a to <4 x i16> @@ -362,9 +361,8 @@ entry: define half @get_lane_128(<8 x half> %a) #0 { ; CHECK-LABEL: get_lane_128: ; CHECK: 
// %bb.0: // %entry -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0 ; CHECK-NEXT: ret entry: %0 = bitcast <8 x half> %a to <8 x i16> diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index fb2bdb4d63f47..34858940370e9 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -3443,10 +3443,10 @@ define <8 x double> @stofp_v8i8_v8f64(<8 x i8> %a) { ; CHECK-SD-LABEL: stofp_v8i8_v8f64: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov v1.b[0], v0.b[0] -; CHECK-SD-NEXT: mov v2.b[0], v0.b[2] -; CHECK-SD-NEXT: mov v3.b[0], v0.b[4] -; CHECK-SD-NEXT: mov v4.b[0], v0.b[6] +; CHECK-SD-NEXT: mov b1, v0.b[0] +; CHECK-SD-NEXT: mov b2, v0.b[2] +; CHECK-SD-NEXT: mov b3, v0.b[4] +; CHECK-SD-NEXT: mov b4, v0.b[6] ; CHECK-SD-NEXT: mov v1.b[4], v0.b[1] ; CHECK-SD-NEXT: mov v2.b[4], v0.b[3] ; CHECK-SD-NEXT: mov v3.b[4], v0.b[5] @@ -3492,10 +3492,10 @@ define <8 x double> @utofp_v8i8_v8f64(<8 x i8> %a) { ; CHECK-SD-LABEL: utofp_v8i8_v8f64: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov v2.b[0], v0.b[0] -; CHECK-SD-NEXT: mov v3.b[0], v0.b[2] -; CHECK-SD-NEXT: mov v4.b[0], v0.b[4] -; CHECK-SD-NEXT: mov v5.b[0], v0.b[6] +; CHECK-SD-NEXT: mov b2, v0.b[0] +; CHECK-SD-NEXT: mov b3, v0.b[2] +; CHECK-SD-NEXT: mov b4, v0.b[4] +; CHECK-SD-NEXT: mov b5, v0.b[6] ; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff ; CHECK-SD-NEXT: mov v2.b[4], v0.b[1] ; CHECK-SD-NEXT: mov v3.b[4], v0.b[3] @@ -3538,14 +3538,14 @@ define <16 x double> @stofp_v16i8_v16f64(<16 x i8> %a) { ; CHECK-SD-LABEL: stofp_v16i8_v16f64: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: mov v2.b[0], v0.b[0] -; CHECK-SD-NEXT: mov v3.b[0], v0.b[2] -; 
CHECK-SD-NEXT: mov v4.b[0], v0.b[4] -; CHECK-SD-NEXT: mov v5.b[0], v0.b[6] -; CHECK-SD-NEXT: mov v6.b[0], v1.b[0] -; CHECK-SD-NEXT: mov v7.b[0], v1.b[2] -; CHECK-SD-NEXT: mov v16.b[0], v1.b[4] -; CHECK-SD-NEXT: mov v17.b[0], v1.b[6] +; CHECK-SD-NEXT: mov b2, v0.b[0] +; CHECK-SD-NEXT: mov b3, v0.b[2] +; CHECK-SD-NEXT: mov b4, v0.b[4] +; CHECK-SD-NEXT: mov b5, v0.b[6] +; CHECK-SD-NEXT: mov b6, v1.b[0] +; CHECK-SD-NEXT: mov b7, v1.b[2] +; CHECK-SD-NEXT: mov b16, v1.b[4] +; CHECK-SD-NEXT: mov b17, v1.b[6] ; CHECK-SD-NEXT: mov v2.b[4], v0.b[1] ; CHECK-SD-NEXT: mov v3.b[4], v0.b[3] ; CHECK-SD-NEXT: mov v4.b[4], v0.b[5] @@ -3622,15 +3622,15 @@ define <16 x double> @utofp_v16i8_v16f64(<16 x i8> %a) { ; CHECK-SD-LABEL: utofp_v16i8_v16f64: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: mov v3.b[0], v0.b[0] -; CHECK-SD-NEXT: mov v4.b[0], v0.b[2] -; CHECK-SD-NEXT: mov v5.b[0], v0.b[4] -; CHECK-SD-NEXT: mov v6.b[0], v0.b[6] +; CHECK-SD-NEXT: mov b3, v0.b[0] +; CHECK-SD-NEXT: mov b4, v0.b[2] +; CHECK-SD-NEXT: mov b5, v0.b[4] +; CHECK-SD-NEXT: mov b6, v0.b[6] ; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff -; CHECK-SD-NEXT: mov v7.b[0], v2.b[0] -; CHECK-SD-NEXT: mov v16.b[0], v2.b[2] -; CHECK-SD-NEXT: mov v17.b[0], v2.b[4] -; CHECK-SD-NEXT: mov v18.b[0], v2.b[6] +; CHECK-SD-NEXT: mov b7, v2.b[0] +; CHECK-SD-NEXT: mov b16, v2.b[2] +; CHECK-SD-NEXT: mov b17, v2.b[4] +; CHECK-SD-NEXT: mov b18, v2.b[6] ; CHECK-SD-NEXT: mov v3.b[4], v0.b[1] ; CHECK-SD-NEXT: mov v4.b[4], v0.b[3] ; CHECK-SD-NEXT: mov v5.b[4], v0.b[5] @@ -3699,18 +3699,18 @@ define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) { ; CHECK-SD-LABEL: stofp_v32i8_v32f64: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: mov v5.b[0], v1.b[6] -; CHECK-SD-NEXT: mov v17.b[0], v1.b[4] -; CHECK-SD-NEXT: mov v20.b[0], v1.b[2] -; CHECK-SD-NEXT: mov v21.b[0], v1.b[0] -; CHECK-SD-NEXT: mov v18.b[0], v0.b[0] -; CHECK-SD-NEXT: mov v19.b[0], 
v0.b[6] -; CHECK-SD-NEXT: mov v22.b[0], v0.b[4] +; CHECK-SD-NEXT: mov b5, v1.b[6] +; CHECK-SD-NEXT: mov b17, v1.b[4] +; CHECK-SD-NEXT: mov b20, v1.b[2] +; CHECK-SD-NEXT: mov b21, v1.b[0] +; CHECK-SD-NEXT: mov b18, v0.b[0] +; CHECK-SD-NEXT: mov b19, v0.b[6] +; CHECK-SD-NEXT: mov b22, v0.b[4] ; CHECK-SD-NEXT: ext v16.16b, v1.16b, v1.16b, #8 -; CHECK-SD-NEXT: mov v2.b[0], v3.b[0] -; CHECK-SD-NEXT: mov v4.b[0], v3.b[2] -; CHECK-SD-NEXT: mov v6.b[0], v3.b[4] -; CHECK-SD-NEXT: mov v7.b[0], v3.b[6] +; CHECK-SD-NEXT: mov b2, v3.b[0] +; CHECK-SD-NEXT: mov b4, v3.b[2] +; CHECK-SD-NEXT: mov b6, v3.b[4] +; CHECK-SD-NEXT: mov b7, v3.b[6] ; CHECK-SD-NEXT: mov v5.b[4], v1.b[7] ; CHECK-SD-NEXT: mov v17.b[4], v1.b[5] ; CHECK-SD-NEXT: mov v20.b[4], v1.b[3] @@ -3718,16 +3718,16 @@ define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) { ; CHECK-SD-NEXT: mov v19.b[4], v0.b[7] ; CHECK-SD-NEXT: mov v22.b[4], v0.b[5] ; CHECK-SD-NEXT: mov v18.b[4], v0.b[1] -; CHECK-SD-NEXT: mov v23.b[0], v16.b[0] +; CHECK-SD-NEXT: mov b23, v16.b[0] ; CHECK-SD-NEXT: mov v2.b[4], v3.b[1] ; CHECK-SD-NEXT: mov v4.b[4], v3.b[3] ; CHECK-SD-NEXT: mov v6.b[4], v3.b[5] ; CHECK-SD-NEXT: mov v7.b[4], v3.b[7] -; CHECK-SD-NEXT: mov v3.b[0], v0.b[2] +; CHECK-SD-NEXT: mov b3, v0.b[2] ; CHECK-SD-NEXT: shl v5.2s, v5.2s, #24 ; CHECK-SD-NEXT: shl v17.2s, v17.2s, #24 ; CHECK-SD-NEXT: shl v20.2s, v20.2s, #24 -; CHECK-SD-NEXT: mov v24.b[0], v16.b[4] +; CHECK-SD-NEXT: mov b24, v16.b[4] ; CHECK-SD-NEXT: mov v23.b[4], v16.b[1] ; CHECK-SD-NEXT: shl v18.2s, v18.2s, #24 ; CHECK-SD-NEXT: shl v19.2s, v19.2s, #24 @@ -3739,10 +3739,10 @@ define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) { ; CHECK-SD-NEXT: shl v0.2s, v21.2s, #24 ; CHECK-SD-NEXT: shl v4.2s, v6.2s, #24 ; CHECK-SD-NEXT: shl v6.2s, v7.2s, #24 -; CHECK-SD-NEXT: mov v7.b[0], v16.b[2] +; CHECK-SD-NEXT: mov b7, v16.b[2] ; CHECK-SD-NEXT: sshll v5.2d, v5.2s, #0 ; CHECK-SD-NEXT: sshr v20.2s, v20.2s, #24 -; CHECK-SD-NEXT: mov v21.b[0], v16.b[6] +; CHECK-SD-NEXT: mov b21, 
v16.b[6] ; CHECK-SD-NEXT: sshll v17.2d, v17.2s, #0 ; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: shl v22.2s, v22.2s, #24 @@ -3869,25 +3869,25 @@ entry: define <32 x double> @utofp_v32i8_v32f64(<32 x i8> %a) { ; CHECK-SD-LABEL: utofp_v32i8_v32f64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v6.b[0], v1.b[6] -; CHECK-SD-NEXT: mov v7.b[0], v1.b[4] +; CHECK-SD-NEXT: mov b6, v1.b[6] +; CHECK-SD-NEXT: mov b7, v1.b[4] ; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-SD-NEXT: mov v16.b[0], v1.b[2] -; CHECK-SD-NEXT: mov v17.b[0], v1.b[0] -; CHECK-SD-NEXT: mov v19.b[0], v0.b[6] -; CHECK-SD-NEXT: mov v20.b[0], v0.b[4] +; CHECK-SD-NEXT: mov b16, v1.b[2] +; CHECK-SD-NEXT: mov b17, v1.b[0] +; CHECK-SD-NEXT: mov b19, v0.b[6] +; CHECK-SD-NEXT: mov b20, v0.b[4] ; CHECK-SD-NEXT: movi d5, #0x0000ff000000ff -; CHECK-SD-NEXT: mov v24.b[0], v0.b[2] -; CHECK-SD-NEXT: mov v25.b[0], v0.b[0] +; CHECK-SD-NEXT: mov b24, v0.b[2] +; CHECK-SD-NEXT: mov b25, v0.b[0] ; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 ; CHECK-SD-NEXT: mov v6.b[4], v1.b[7] ; CHECK-SD-NEXT: mov v7.b[4], v1.b[5] -; CHECK-SD-NEXT: mov v18.b[0], v3.b[0] -; CHECK-SD-NEXT: mov v21.b[0], v3.b[2] -; CHECK-SD-NEXT: mov v23.b[0], v3.b[4] +; CHECK-SD-NEXT: mov b18, v3.b[0] +; CHECK-SD-NEXT: mov b21, v3.b[2] +; CHECK-SD-NEXT: mov b23, v3.b[4] ; CHECK-SD-NEXT: mov v16.b[4], v1.b[3] ; CHECK-SD-NEXT: mov v17.b[4], v1.b[1] -; CHECK-SD-NEXT: mov v1.b[0], v3.b[6] +; CHECK-SD-NEXT: mov b1, v3.b[6] ; CHECK-SD-NEXT: mov v19.b[4], v0.b[7] ; CHECK-SD-NEXT: mov v20.b[4], v0.b[5] ; CHECK-SD-NEXT: mov v24.b[4], v0.b[3] @@ -3905,15 +3905,15 @@ define <32 x double> @utofp_v32i8_v32f64(<32 x i8> %a) { ; CHECK-SD-NEXT: ushll v7.2d, v7.2s, #0 ; CHECK-SD-NEXT: and v20.8b, v20.8b, v5.8b ; CHECK-SD-NEXT: ushll v16.2d, v16.2s, #0 -; CHECK-SD-NEXT: mov v4.b[0], v2.b[0] -; CHECK-SD-NEXT: mov v22.b[0], v2.b[2] +; CHECK-SD-NEXT: mov b4, v2.b[0] +; CHECK-SD-NEXT: mov b22, v2.b[2] ; CHECK-SD-NEXT: ushll v17.2d, v17.2s, #0 ; 
CHECK-SD-NEXT: ushll v0.2d, v3.2s, #0 -; CHECK-SD-NEXT: mov v19.b[0], v2.b[4] +; CHECK-SD-NEXT: mov b19, v2.b[4] ; CHECK-SD-NEXT: ucvtf v6.2d, v6.2d ; CHECK-SD-NEXT: ucvtf v3.2d, v7.2d ; CHECK-SD-NEXT: ushll v20.2d, v20.2s, #0 -; CHECK-SD-NEXT: mov v7.b[0], v2.b[6] +; CHECK-SD-NEXT: mov b7, v2.b[6] ; CHECK-SD-NEXT: ucvtf v16.2d, v16.2d ; CHECK-SD-NEXT: and v24.8b, v24.8b, v5.8b ; CHECK-SD-NEXT: ucvtf v17.2d, v17.2d diff --git a/llvm/test/CodeGen/AArch64/neon-bitcast.ll b/llvm/test/CodeGen/AArch64/neon-bitcast.ll index c039da26b7c15..c6aa8701e1721 100644 --- a/llvm/test/CodeGen/AArch64/neon-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitcast.ll @@ -555,7 +555,7 @@ define <2 x i8> @bitcast_i16_to_v2i8(i16 %word) { ; CHECK-LE-LABEL: bitcast_i16_to_v2i8: ; CHECK-LE: // %bb.0: ; CHECK-LE-NEXT: fmov s1, w0 -; CHECK-LE-NEXT: mov v0.b[0], v1.b[0] +; CHECK-LE-NEXT: mov b0, v1.b[0] ; CHECK-LE-NEXT: mov v0.b[4], v1.b[1] ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret @@ -564,7 +564,7 @@ define <2 x i8> @bitcast_i16_to_v2i8(i16 %word) { ; CHECK-BE: // %bb.0: ; CHECK-BE-NEXT: fmov s0, w0 ; CHECK-BE-NEXT: rev16 v0.16b, v0.16b -; CHECK-BE-NEXT: mov v1.b[0], v0.b[0] +; CHECK-BE-NEXT: mov b1, v0.b[0] ; CHECK-BE-NEXT: mov v1.b[4], v0.b[1] ; CHECK-BE-NEXT: rev64 v0.2s, v1.2s ; CHECK-BE-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll b/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll index 0f4eec4fdfda1..bfdf794c1c27a 100644 --- a/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll +++ b/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll @@ -360,8 +360,7 @@ define <4 x i32> @test_q_lane4_nxv4i32(<4 x i32> %a, %b) { define <1 x double> @test_lane0_nxv2f64(<1 x double> %a, %b) { ; CHECK-LABEL: test_lane0_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.d[0], v1.d[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v1.d[0] ; CHECK-NEXT: ret %c = extractelement %b, i32 0 %d = insertelement <1 x double> %a, 
double %c, i32 0 @@ -371,8 +370,7 @@ define <1 x double> @test_lane0_nxv2f64(<1 x double> %a, % define <1 x double> @test_lane1_nxv2f64(<1 x double> %a, %b) { ; CHECK-LABEL: test_lane1_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.d[0], v1.d[1] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v1.d[1] ; CHECK-NEXT: ret %c = extractelement %b, i32 1 %d = insertelement <1 x double> %a, double %c, i32 0 @@ -416,8 +414,7 @@ define <2 x double> @test_q_lane2_nxv2f64(<2 x double> %a, define <1 x i64> @test_lane0_nxv2i64(<1 x i64> %a, %b) { ; CHECK-LABEL: test_lane0_nxv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.d[0], v1.d[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v1.d[0] ; CHECK-NEXT: ret %c = extractelement %b, i32 0 %d = insertelement <1 x i64> %a, i64 %c, i32 0 @@ -427,8 +424,7 @@ define <1 x i64> @test_lane0_nxv2i64(<1 x i64> %a, %b) { define <1 x i64> @test_lane1_nxv2i64(<1 x i64> %a, %b) { ; CHECK-LABEL: test_lane1_nxv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.d[0], v1.d[1] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v1.d[1] ; CHECK-NEXT: ret %c = extractelement %b, i32 1 %d = insertelement <1 x i64> %a, i64 %c, i32 0 diff --git a/llvm/test/CodeGen/AArch64/neon-insextbitcast.ll b/llvm/test/CodeGen/AArch64/neon-insextbitcast.ll index ebff3f1370040..d9cdbc2f92aca 100644 --- a/llvm/test/CodeGen/AArch64/neon-insextbitcast.ll +++ b/llvm/test/CodeGen/AArch64/neon-insextbitcast.ll @@ -89,11 +89,58 @@ entry: } +define half @test_vext_v8i16(<8 x i16> %a) { +; CHECK-LABEL: test_vext_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov h0, v0.h[5] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-NEXT: ret +entry: + %b = extractelement <8 x i16> %a, i32 5 + %c = bitcast i16 %b to half + ret half %c +} + +define half @test_vext_v8i16_0(<8 x i16> %a) { +; CHECK-LABEL: test_vext_v8i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h0 killed 
$h0 killed $q0 +; CHECK-NEXT: ret +entry: + %b = extractelement <8 x i16> %a, i32 0 + %c = bitcast i16 %b to half + ret half %c +} + +define half @test_vext_v4i16(<4 x i16> %a) { +; CHECK-LABEL: test_vext_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov h0, v0.h[1] +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-NEXT: ret +entry: + %b = extractelement <4 x i16> %a, i32 1 + %c = bitcast i16 %b to half + ret half %c +} + +define half @test_vext_v4i16_0(<4 x i16> %a) { +; CHECK-LABEL: test_vext_v4i16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-NEXT: ret +entry: + %b = extractelement <4 x i16> %a, i32 0 + %c = bitcast i16 %b to half + ret half %c +} + define float @test_vext_v4i32(<4 x i32> %a) { ; CHECK-LABEL: test_vext_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov v0.s[0], v0.s[3] -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: mov s0, v0.s[3] ; CHECK-NEXT: ret entry: %b = extractelement <4 x i32> %a, i32 3 @@ -116,8 +163,7 @@ define float @test_vext_v2i32(<2 x i32> %a) { ; CHECK-LABEL: test_vext_v2i32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.s[0], v0.s[1] -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: mov s0, v0.s[1] ; CHECK-NEXT: ret entry: %b = extractelement <2 x i32> %a, i32 1 @@ -140,8 +186,7 @@ entry: define double @test_vext_v2i64(<2 x i64> %a) { ; CHECK-LABEL: test_vext_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov v0.d[0], v0.d[1] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: ret entry: %b = extractelement <2 x i64> %a, i32 1 diff --git a/llvm/test/CodeGen/AArch64/nofpclass.ll b/llvm/test/CodeGen/AArch64/nofpclass.ll new file mode 100644 index 0000000000000..3139aa0ef0bf6 --- /dev/null +++ 
b/llvm/test/CodeGen/AArch64/nofpclass.ll @@ -0,0 +1,182 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=aarch64-linux-gnu < %s | FileCheck %s + +define float @f(float nofpclass(nan) %a, float nofpclass(nan) %b) { +; CHECK-LABEL: f: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmaxnm s0, s0, s1 +; CHECK-NEXT: ret +entry: + %cond = tail call float @llvm.maximumnum.f32(float %a, float %b) + ret float %cond +} + +define <4 x float> @fv4f32(<4 x float> nofpclass(nan) %a, <4 x float> nofpclass(nan) %b) { +; CHECK-LABEL: fv4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %c = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %c +} + +define {float, float} @m({float, float} nofpclass(nan) %a0, {float, float} nofpclass(nan) %a1) { +; CHECK-LABEL: m: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmaxnm s1, s1, s3 +; CHECK-NEXT: fmaxnm s0, s0, s2 +; CHECK-NEXT: ret +entry: + %a0f0 = extractvalue {float, float} %a0, 0 + %a0f1 = extractvalue {float, float} %a0, 1 + %a1f0 = extractvalue {float, float} %a1, 0 + %a1f1 = extractvalue {float, float} %a1, 1 + %max0 = tail call float @llvm.maximumnum.f32(float %a0f0, float %a1f0) + %max1 = tail call float @llvm.maximumnum.f32(float %a0f1, float %a1f1) + %ret0 = insertvalue {float, float} poison, float %max0, 0 + %ret1 = insertvalue {float, float} %ret0, float %max1, 1 + ret {float, float} %ret1 +} + +define [2 x float] @mA([2 x float] nofpclass(nan) %a0, [2 x float] nofpclass(nan) %a1) { +; CHECK-LABEL: mA: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmaxnm s1, s1, s3 +; CHECK-NEXT: fmaxnm s0, s0, s2 +; CHECK-NEXT: ret +entry: + %a0f0 = extractvalue [2 x float] %a0, 0 + %a0f1 = extractvalue [2 x float] %a0, 1 + %a1f0 = extractvalue [2 x float] %a1, 0 + %a1f1 = extractvalue [2 x float] %a1, 1 + %max0 = tail call float @llvm.maximumnum.f32(float %a0f0, float 
%a1f0) + %max1 = tail call float @llvm.maximumnum.f32(float %a0f1, float %a1f1) + %ret0 = insertvalue [2 x float] poison, float %max0, 0 + %ret1 = insertvalue [2 x float] %ret0, float %max1, 1 + ret [2 x float] %ret1 +} + +define float @fS(float nofpclass(snan) %a, float nofpclass(snan) %b) { +; CHECK-LABEL: fS: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmaxnm s0, s0, s1 +; CHECK-NEXT: ret +entry: + %cond = tail call float @llvm.maximumnum.f32(float %a, float %b) + ret float %cond +} + +define <4 x float> @fSv4f32(<4 x float> nofpclass(snan) %a, <4 x float> nofpclass(snan) %b) { +; CHECK-LABEL: fSv4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %c = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %c +} + +define {float, float} @mS({float, float} nofpclass(snan) %a0, {float, float} nofpclass(snan) %a1) { +; CHECK-LABEL: mS: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmaxnm s1, s1, s3 +; CHECK-NEXT: fmaxnm s0, s0, s2 +; CHECK-NEXT: ret +entry: + %a0f0 = extractvalue {float, float} %a0, 0 + %a0f1 = extractvalue {float, float} %a0, 1 + %a1f0 = extractvalue {float, float} %a1, 0 + %a1f1 = extractvalue {float, float} %a1, 1 + %max0 = tail call float @llvm.maximumnum.f32(float %a0f0, float %a1f0) + %max1 = tail call float @llvm.maximumnum.f32(float %a0f1, float %a1f1) + %ret0 = insertvalue {float, float} poison, float %max0, 0 + %ret1 = insertvalue {float, float} %ret0, float %max1, 1 + ret {float, float} %ret1 +} + +define [2 x float] @mAS([2 x float] nofpclass(snan) %a0, [2 x float] nofpclass(snan) %a1) { +; CHECK-LABEL: mAS: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmaxnm s1, s1, s3 +; CHECK-NEXT: fmaxnm s0, s0, s2 +; CHECK-NEXT: ret +entry: + %a0f0 = extractvalue [2 x float] %a0, 0 + %a0f1 = extractvalue [2 x float] %a0, 1 + %a1f0 = extractvalue [2 x float] %a1, 0 + %a1f1 = extractvalue [2 x float] %a1, 1 + %max0 = tail call float @llvm.maximumnum.f32(float 
%a0f0, float %a1f0) + %max1 = tail call float @llvm.maximumnum.f32(float %a0f1, float %a1f1) + %ret0 = insertvalue [2 x float] poison, float %max0, 0 + %ret1 = insertvalue [2 x float] %ret0, float %max1, 1 + ret [2 x float] %ret1 +} + +define float @fQ(float nofpclass(qnan) %a, float nofpclass(qnan) %b) { +; CHECK-LABEL: fQ: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fminnm s1, s1, s1 +; CHECK-NEXT: fminnm s0, s0, s0 +; CHECK-NEXT: fmaxnm s0, s0, s1 +; CHECK-NEXT: ret +entry: + %cond = tail call float @llvm.maximumnum.f32(float %a, float %b) + ret float %cond +} + +define <4 x float> @fQv4f32(<4 x float> nofpclass(qnan) %a, <4 x float> nofpclass(qnan) %b) { +; CHECK-LABEL: fQv4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fminnm v1.4s, v1.4s, v1.4s +; CHECK-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %c = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %c +} + +define {float, float} @mQ({float, float} nofpclass(qnan) %a0, {float, float} nofpclass(qnan) %a1) { +; CHECK-LABEL: mQ: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fminnm s2, s2, s2 +; CHECK-NEXT: fminnm s0, s0, s0 +; CHECK-NEXT: fminnm s3, s3, s3 +; CHECK-NEXT: fminnm s1, s1, s1 +; CHECK-NEXT: fmaxnm s0, s0, s2 +; CHECK-NEXT: fmaxnm s1, s1, s3 +; CHECK-NEXT: ret +entry: + %a0f0 = extractvalue {float, float} %a0, 0 + %a0f1 = extractvalue {float, float} %a0, 1 + %a1f0 = extractvalue {float, float} %a1, 0 + %a1f1 = extractvalue {float, float} %a1, 1 + %max0 = tail call float @llvm.maximumnum.f32(float %a0f0, float %a1f0) + %max1 = tail call float @llvm.maximumnum.f32(float %a0f1, float %a1f1) + %ret0 = insertvalue {float, float} poison, float %max0, 0 + %ret1 = insertvalue {float, float} %ret0, float %max1, 1 + ret {float, float} %ret1 +} + +define [2 x float] @mAQ([2 x float] nofpclass(qnan) %a0, [2 x float] nofpclass(qnan) %a1) { +; CHECK-LABEL: mAQ: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fminnm 
s2, s2, s2 +; CHECK-NEXT: fminnm s0, s0, s0 +; CHECK-NEXT: fminnm s3, s3, s3 +; CHECK-NEXT: fminnm s1, s1, s1 +; CHECK-NEXT: fmaxnm s0, s0, s2 +; CHECK-NEXT: fmaxnm s1, s1, s3 +; CHECK-NEXT: ret +entry: + %a0f0 = extractvalue [2 x float] %a0, 0 + %a0f1 = extractvalue [2 x float] %a0, 1 + %a1f0 = extractvalue [2 x float] %a1, 0 + %a1f1 = extractvalue [2 x float] %a1, 1 + %max0 = tail call float @llvm.maximumnum.f32(float %a0f0, float %a1f0) + %max1 = tail call float @llvm.maximumnum.f32(float %a0f1, float %a1f1) + %ret0 = insertvalue [2 x float] poison, float %max0, 0 + %ret1 = insertvalue [2 x float] %ret0, float %max1, 1 + ret [2 x float] %ret1 +} diff --git a/llvm/test/CodeGen/AArch64/reserveXreg.ll b/llvm/test/CodeGen/AArch64/reserveXreg.ll index 037ccab1525d1..4a02675ec04fa 100644 --- a/llvm/test/CodeGen/AArch64/reserveXreg.ll +++ b/llvm/test/CodeGen/AArch64/reserveXreg.ll @@ -1,8 +1,9 @@ ;; Check if manually reserved registers are always excluded from being saved by ;; the function prolog/epilog, even for callee-saved ones, as per GCC behavior. ;; Look at AArch64Features.td for registers excluded from this test. +;; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. 
-; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -verify-machineinstrs=0 | FileCheck %s define preserve_mostcc void @t1() "target-features"="+reserve-x1" { ; CHECK-LABEL: t1: diff --git a/llvm/test/CodeGen/AArch64/shuffle-extend.ll b/llvm/test/CodeGen/AArch64/shuffle-extend.ll index 7658e5ab6936b..1e8d053973eb2 100644 --- a/llvm/test/CodeGen/AArch64/shuffle-extend.ll +++ b/llvm/test/CodeGen/AArch64/shuffle-extend.ll @@ -4,7 +4,7 @@ define <2 x i8> @test_v16i8_v2i32_824(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: test_v16i8_v2i32_824: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.b[0], v0.b[8] +; CHECK-NEXT: mov b0, v0.b[8] ; CHECK-NEXT: mov v0.b[4], v1.b[8] ; CHECK-NEXT: add v0.2s, v0.2s, v0.2s ; CHECK-NEXT: ret @@ -16,7 +16,7 @@ define <2 x i8> @test_v16i8_v2i32_824(<16 x i8> %a, <16 x i8> %b) { define <2 x i8> @test_v16i8_v2i32_016(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: test_v16i8_v2i32_016: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.b[0], v0.b[0] +; CHECK-NEXT: mov b0, v0.b[0] ; CHECK-NEXT: mov v0.b[4], v1.b[0] ; CHECK-NEXT: add v0.2s, v0.2s, v0.2s ; CHECK-NEXT: ret @@ -30,7 +30,7 @@ define <2 x i8> @test_v8i8_v2i32_08(<8 x i8> %a, <8 x i8> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.b[0], v0.b[0] +; CHECK-NEXT: mov b0, v0.b[0] ; CHECK-NEXT: mov v0.b[4], v1.b[0] ; CHECK-NEXT: add v0.2s, v0.2s, v0.2s ; CHECK-NEXT: ret @@ -42,7 +42,7 @@ define <2 x i8> @test_v8i8_v2i32_08(<8 x i8> %a, <8 x i8> %b) { define <2 x i16> @test_v8i16_v2i32_08(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_v8i16_v2i32_08: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.h[0], v0.h[0] +; CHECK-NEXT: mov h0, v0.h[0] ; CHECK-NEXT: mov v0.h[2], v1.h[0] ; CHECK-NEXT: add v0.2s, v0.2s, v0.2s ; CHECK-NEXT: ret @@ -56,7 +56,7 @@ define <2 x i16> @test_v4i16_v2i32_04(<4 x i16> %a, <4 x i16> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: // 
kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.h[0], v0.h[0] +; CHECK-NEXT: mov h0, v0.h[0] ; CHECK-NEXT: mov v0.h[2], v1.h[0] ; CHECK-NEXT: add v0.2s, v0.2s, v0.2s ; CHECK-NEXT: ret @@ -69,7 +69,7 @@ define <2 x i16> @test_v4i16_v2i32_04(<4 x i16> %a, <4 x i16> %b) { define <4 x i8> @test_v16i8_v4i16_824(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: test_v16i8_v4i16_824: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v2.b[0], v0.b[8] +; CHECK-NEXT: mov b2, v0.b[8] ; CHECK-NEXT: mov v2.b[2], v1.b[8] ; CHECK-NEXT: mov v2.b[4], v0.b[0] ; CHECK-NEXT: mov v2.b[6], v1.b[0] @@ -83,7 +83,7 @@ define <4 x i8> @test_v16i8_v4i16_824(<16 x i8> %a, <16 x i8> %b) { define <4 x i8> @test_v16i8_v4i16_016(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: test_v16i8_v4i16_016: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v2.b[0], v0.b[0] +; CHECK-NEXT: mov b2, v0.b[0] ; CHECK-NEXT: mov v2.b[2], v1.b[0] ; CHECK-NEXT: mov v2.b[4], v0.b[4] ; CHECK-NEXT: mov v2.b[6], v1.b[4] @@ -98,7 +98,7 @@ define <4 x i8> @test_v8i8_v4i16_08(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: test_v8i8_v4i16_08: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v2.b[0], v0.b[0] +; CHECK-NEXT: mov b2, v0.b[0] ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: mov v2.b[2], v1.b[0] ; CHECK-NEXT: mov v2.b[4], v0.b[4] @@ -200,8 +200,8 @@ define i1 @test2(ptr %add.ptr, ptr %result, <2 x i64> %hi, <2 x i64> %lo) { ; CHECK-NEXT: dup v2.2d, x9 ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: and v3.16b, v4.16b, v3.16b -; CHECK-NEXT: mov v5.b[0], v0.b[8] -; CHECK-NEXT: mov v0.b[0], v0.b[0] +; CHECK-NEXT: mov b5, v0.b[8] +; CHECK-NEXT: mov b0, v0.b[0] ; CHECK-NEXT: mov v5.b[4], v3.b[8] ; CHECK-NEXT: mov v0.b[4], v3.b[0] ; CHECK-NEXT: add v3.2s, v5.2s, v5.2s diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index 6ea2267cd22e6..130a316bcc2ba 100644 --- 
a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -2,11 +2,12 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s declare void @callee() +declare void @callee_sm() "aarch64_pstate_sm_enabled" declare void @callee_farg(float) declare float @callee_farg_fret(float) ; normal caller -> streaming callees -define void @test0() nounwind { +define void @test0(ptr %callee) nounwind { ; CHECK-LABEL: test0: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill @@ -16,8 +17,8 @@ define void @test0() nounwind { ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: smstart sm -; CHECK-NEXT: bl callee -; CHECK-NEXT: bl callee +; CHECK-NEXT: bl callee_sm +; CHECK-NEXT: bl callee_sm ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload @@ -25,8 +26,8 @@ define void @test0() nounwind { ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret - call void @callee() "aarch64_pstate_sm_enabled" - call void @callee() "aarch64_pstate_sm_enabled" + call void @callee_sm() + call void @callee_sm() ret void } @@ -118,7 +119,7 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" { ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB3_2: -; CHECK-NEXT: bl callee +; CHECK-NEXT: bl callee_sm ; CHECK-NEXT: tbnz w19, #0, .LBB3_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstop sm @@ -140,7 +141,7 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" { ; CHECK-NEXT: // %bb.9: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB3_10: -; CHECK-NEXT: bl callee +; CHECK-NEXT: bl callee_sm ; CHECK-NEXT: tbnz w19, #0, .LBB3_12 ; CHECK-NEXT: // %bb.11: ; CHECK-NEXT: smstop sm @@ -152,9 
+153,9 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" { ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret - call void @callee() "aarch64_pstate_sm_enabled" + call void @callee_sm() call void @callee() - call void @callee() "aarch64_pstate_sm_enabled" + call void @callee_sm() ret void } @@ -342,7 +343,7 @@ define void @test10() "aarch64_pstate_sm_body" { ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .cfi_restore vg -; CHECK-NEXT: bl callee +; CHECK-NEXT: bl callee_sm ; CHECK-NEXT: .cfi_offset vg, -24 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl callee @@ -363,7 +364,7 @@ define void @test10() "aarch64_pstate_sm_body" { ; CHECK-NEXT: .cfi_restore b15 ; CHECK-NEXT: ret call void @callee() - call void @callee() "aarch64_pstate_sm_enabled" + call void @callee_sm() call void @callee() ret void } diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll index 17d689d2c9eb5..0853325e449af 100644 --- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll +++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll @@ -1098,11 +1098,11 @@ define void @test_rdsvl_right_after_prologue(i64 %x0) nounwind { ; NO-SVE-CHECK-NEXT: ret %some_alloc = alloca i64, align 8 %rdsvl = tail call i64 @llvm.aarch64.sme.cntsd() - call void @bar(i64 %rdsvl, i64 %x0) "aarch64_pstate_sm_enabled" + call void @bar(i64 %rdsvl, i64 %x0) ret void } -declare void @bar(i64, i64) +declare void @bar(i64, i64) "aarch64_pstate_sm_enabled" ; Ensure we still emit async unwind information with -fno-asynchronous-unwind-tables ; if the function contains a streaming-mode change. 
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 7361e850d713e..63577e4d217a8 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -1,15 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s -declare void @callee(); - ; ; Private-ZA Callee ; ; Expect spill & fill of ZT0 around call ; Expect smstop/smstart za around call -define void @zt0_in_caller_no_state_callee() "aarch64_in_zt0" nounwind { +define void @zt0_in_caller_no_state_callee(ptr %callee) "aarch64_in_zt0" nounwind { ; CHECK-LABEL: zt0_in_caller_no_state_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #80 @@ -17,20 +15,20 @@ define void @zt0_in_caller_no_state_callee() "aarch64_in_zt0" nounwind { ; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za -; CHECK-NEXT: bl callee +; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret - call void @callee(); + call void %callee(); ret void; } ; Expect spill & fill of ZT0 around call ; Expect setup and restore lazy-save around call ; Expect smstart za after call -define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind { +define void @za_zt0_shared_caller_no_state_callee(ptr %callee) "aarch64_inout_za" "aarch64_in_zt0" nounwind { ; CHECK-LABEL: za_zt0_shared_caller_no_state_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill @@ -49,7 +47,7 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_ ; CHECK-NEXT: sturh w8, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: str zt0, [x19] -; CHECK-NEXT: bl callee +; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -63,7 +61,7 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_ ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret - call void @callee(); + call void %callee(); ret void; } @@ -72,43 +70,43 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_ ; ; Caller and callee have shared ZT0 state, no spill/fill of ZT0 required -define void @zt0_shared_caller_zt0_shared_callee() "aarch64_in_zt0" nounwind { +define void @zt0_shared_caller_zt0_shared_callee(ptr %callee) "aarch64_in_zt0" nounwind { ; CHECK-LABEL: zt0_shared_caller_zt0_shared_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: bl callee +; CHECK-NEXT: blr x0 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - call void @callee() "aarch64_in_zt0"; + call void %callee() "aarch64_in_zt0"; ret void; } ; Expect spill & fill of ZT0 around call -define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind { +define void @za_zt0_shared_caller_za_shared_callee(ptr %callee) "aarch64_inout_za" "aarch64_in_zt0" nounwind { ; CHECK-LABEL: za_zt0_shared_caller_za_shared_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #80 ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: str zt0, [x19] -; CHECK-NEXT: bl callee +; CHECK-NEXT: blr x0 ; CHECK-NEXT: ldr zt0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret - call void @callee() "aarch64_inout_za"; + call void %callee() "aarch64_inout_za"; ret void; } ; Caller and callee have shared ZA & ZT0 -define void @za_zt0_shared_caller_za_zt0_shared_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind { +define void @za_zt0_shared_caller_za_zt0_shared_callee(ptr %callee) "aarch64_inout_za" "aarch64_in_zt0" nounwind { ; CHECK-LABEL: za_zt0_shared_caller_za_zt0_shared_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: bl callee +; CHECK-NEXT: blr x0 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; + call void %callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; } @@ -116,7 +114,7 @@ define void @za_zt0_shared_caller_za_zt0_shared_callee() "aarch64_inout_za" "aar ; Expect spill & fill of ZT0 around call ; Expect smstop/smstart za around call -define void @zt0_in_caller_zt0_new_callee() "aarch64_in_zt0" nounwind { +define void @zt0_in_caller_zt0_new_callee(ptr %callee) "aarch64_in_zt0" nounwind { ; CHECK-LABEL: zt0_in_caller_zt0_new_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #80 @@ -124,13 +122,13 @@ define void @zt0_in_caller_zt0_new_callee() "aarch64_in_zt0" nounwind { ; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za -; CHECK-NEXT: bl callee +; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret - call void @callee() "aarch64_new_zt0"; + call void %callee() "aarch64_new_zt0"; ret void; } @@ -140,7 +138,7 @@ define void @zt0_in_caller_zt0_new_callee() "aarch64_in_zt0" nounwind { ; Expect smstart ZA & clear ZT0 ; Expect spill & fill of ZT0 around call ; Before return, expect smstop ZA -define void @zt0_new_caller_zt0_new_callee() "aarch64_new_zt0" nounwind { +define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwind { ; CHECK-LABEL: zt0_new_caller_zt0_new_callee: ; CHECK: // %bb.0: // %prelude ; CHECK-NEXT: sub sp, sp, #80 @@ -156,14 +154,14 @@ define void @zt0_new_caller_zt0_new_callee() "aarch64_new_zt0" nounwind { ; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za -; CHECK-NEXT: bl callee +; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] ; CHECK-NEXT: smstop za ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 
16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret - call void @callee() "aarch64_new_zt0"; + call void %callee() "aarch64_new_zt0"; ret void; } @@ -207,7 +205,7 @@ declare {i64, i64} @__arm_sme_state() ; Expect commit of lazy-save if ZA is dormant ; Expect smstart ZA & clear ZT0 ; Before return, expect smstop ZA -define void @zt0_new_caller() "aarch64_new_zt0" nounwind { +define void @zt0_new_caller(ptr %callee) "aarch64_new_zt0" nounwind { ; CHECK-LABEL: zt0_new_caller: ; CHECK: // %bb.0: // %prelude ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill @@ -219,18 +217,18 @@ define void @zt0_new_caller() "aarch64_new_zt0" nounwind { ; CHECK-NEXT: .LBB8_2: ; CHECK-NEXT: smstart za ; CHECK-NEXT: zero { zt0 } -; CHECK-NEXT: bl callee +; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstop za ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - call void @callee() "aarch64_in_zt0"; + call void %callee() "aarch64_in_zt0"; ret void; } ; Expect commit of lazy-save if ZA is dormant ; Expect smstart ZA, clear ZA & clear ZT0 ; Before return, expect smstop ZA -define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { +define void @new_za_zt0_caller(ptr %callee) "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-LABEL: new_za_zt0_caller: ; CHECK: // %bb.0: // %prelude ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill @@ -243,36 +241,36 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-NEXT: smstart za ; CHECK-NEXT: zero {za} ; CHECK-NEXT: zero { zt0 } -; CHECK-NEXT: bl callee +; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstop za ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; + call void %callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; } ; Expect clear ZA on entry -define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwind { +define void @new_za_shared_zt0_caller(ptr %callee) "aarch64_new_za" "aarch64_in_zt0" nounwind { ; CHECK-LABEL: new_za_shared_zt0_caller: ; CHECK: // %bb.0: ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: zero {za} -; CHECK-NEXT: bl callee +; CHECK-NEXT: blr x0 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; + call void %callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; } ; Expect clear ZT0 on entry -define void @shared_za_new_zt0() "aarch64_inout_za" "aarch64_new_zt0" nounwind { +define void @shared_za_new_zt0(ptr %callee) "aarch64_inout_za" "aarch64_new_zt0" nounwind { ; CHECK-LABEL: shared_za_new_zt0: ; CHECK: // %bb.0: ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: zero { zt0 } -; CHECK-NEXT: bl callee +; CHECK-NEXT: blr x0 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; + call void %callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; } diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll index 3eed6d45c7710..132caef3763ec 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-prologue.ll @@ -23,7 +23,7 @@ entry: ; INSTR-LABEL: define void @OneVar( ; INSTR: [[BASE:%.*]] = call ptr @llvm.aarch64.irg.sp(i64 0) -; INSTR: [[TLS:%.*]] = call ptr @llvm.thread.pointer() +; INSTR: [[TLS:%.*]] = call ptr @llvm.thread.pointer.p0() ; INSTR: [[TLS_SLOT:%.*]] = getelementptr i8, ptr [[TLS]], i32 -24 ; INSTR: [[TLS_VALUE:%.*]] = load i64, ptr [[TLS_SLOT]], align 8 ; INSTR: [[FP:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll index ab4554428be45..a82998473fe68 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll @@ -128,12 +128,12 @@ define @whilele_b_ii_dont_fold_to_ptrue_overflow() { ret %out } -define @whilele_b_ii_known_always_true() { +define @whilele_b_ii_known_always_true(i32 %a) { ; CHECK-LABEL: whilele_b_ii_known_always_true: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ret - %out = call @llvm.aarch64.sve.whilele.nxv16i1.i32(i32 2147483646, i32 2147483647) + %out = call @llvm.aarch64.sve.whilele.nxv16i1.i32(i32 %a, i32 2147483647) ret %out } @@ -387,12 +387,12 @@ define @whilels_b_ii_dont_fold_to_ptrue_overflow() { ret %out } -define @whilels_b_ii_known_always_true() { +define @whilels_b_ii_known_always_true(i32 %a) { ; CHECK-LABEL: whilels_b_ii_known_always_true: ; CHECK: // %bb.0: ; CHECK-NEXT: 
ptrue p0.b ; CHECK-NEXT: ret - %out = call @llvm.aarch64.sve.whilels.nxv16i1.i32(i32 4294967294, i32 4294967295) + %out = call @llvm.aarch64.sve.whilels.nxv16i1.i32(i32 %a, i32 4294967295) ret %out } diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll index e524c5d6b453e..8aedeac18f64a 100644 --- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll +++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll @@ -299,3 +299,16 @@ define @codegen_bsl2n_i64( %0, %4, %6 ret %7 } + +; (A ^ B) & C) ^ B -> (A & C) | (B & !C) when BIC instructions are available. +define @bsl_combine_when_bic_available( %a, %b, %c) { +; CHECK-LABEL: bsl_combine_when_bic_available: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d +; CHECK-NEXT: ret +entry: + %t1 = xor %a, %b + %t2 = and %t1, %c + %t3 = xor %t2, %b + ret %t3 +} diff --git a/llvm/test/CodeGen/AArch64/vararg-tallcall.ll b/llvm/test/CodeGen/AArch64/vararg-tallcall.ll index 7ee76c8ad50a2..2d8f312c9694e 100644 --- a/llvm/test/CodeGen/AArch64/vararg-tallcall.ll +++ b/llvm/test/CodeGen/AArch64/vararg-tallcall.ll @@ -37,12 +37,16 @@ attributes #1 = { noinline optnone "thunk" } ; CHECK: ldr x9, [x9] ; CHECK: mov v0.16b, v16.16b ; CHECK: br x9 -; CHECK-EC: mov v7.16b, v0.16b -; CHECK-EC: ldr x9, [x0] -; CHECK-EC: ldr x11, [x9] -; CHECH-EC: add x4, sp, #96 -; CHECK-EC: mov v0.16b, v7.16b -; CHECK-EC: add x4, sp, #96 -; CHECK-EC: ldr x30, [sp, #48] -; CHECK-EC: add sp, sp, #96 -; CHECK-EC: br x11 +; CHECK-EC: mov v7.16b, v0.16b +; CHECK-EC: ldr x9, [x0] +; CHECK-EC: ldr x11, [x9] +; CHECK-EC: blr x9 +; CHECK-EC-NEXT: mov v0.16b, v7.16b +; CHECK-EC-NEXT: ldr q7, [sp] +; CHECK-EC-NEXT: .seh_startepilogue +; CHECK-EC-NEXT: ldr x30, [sp, #48] +; CHECK-EC-NEXT: .seh_save_reg x30, 48 +; CHECK-EC-NEXT: add sp, sp, #96 +; CHECK-EC-NEXT: .seh_stackalloc 96 +; CHECK-EC-NEXT: .seh_endepilogue +; CHECK-EC-NEXT: br x11 diff --git a/llvm/test/CodeGen/AArch64/variant-pcs.ll b/llvm/test/CodeGen/AArch64/variant-pcs.ll 
index 49c504177358e..0c995b5b0e8ef 100644 --- a/llvm/test/CodeGen/AArch64/variant-pcs.ll +++ b/llvm/test/CodeGen/AArch64/variant-pcs.ll @@ -2,6 +2,9 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -filetype=obj -o - %s \ ; RUN: | llvm-readobj --symbols - | FileCheck %s --check-prefix=CHECK-OBJ +; Check we don't crash when using a Mach-O object format. +; RUN: llc -mtriple=arm64-apple-macosx15.0.0 -mattr=+sve -filetype=obj -o /dev/null %s + define i32 @base_pcs() { ; CHECK-ASM-LABEL: base_pcs: ; CHECK-ASM-NOT: .variant_pcs diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll index d31659c30f21d..c3b7161feefb5 100644 --- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll @@ -243,10 +243,10 @@ define <8 x double> @sitofp_v8i8_double(<8 x i8> %a) { ; CHECK-LABEL: sitofp_v8i8_double: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v1.b[0], v0.b[0] -; CHECK-NEXT: mov v2.b[0], v0.b[2] -; CHECK-NEXT: mov v3.b[0], v0.b[4] -; CHECK-NEXT: mov v4.b[0], v0.b[6] +; CHECK-NEXT: mov b1, v0.b[0] +; CHECK-NEXT: mov b2, v0.b[2] +; CHECK-NEXT: mov b3, v0.b[4] +; CHECK-NEXT: mov b4, v0.b[6] ; CHECK-NEXT: mov v1.b[4], v0.b[1] ; CHECK-NEXT: mov v2.b[4], v0.b[3] ; CHECK-NEXT: mov v3.b[4], v0.b[5] @@ -276,14 +276,14 @@ define <16 x double> @sitofp_v16i8_double(<16 x i8> %a) { ; CHECK-LABEL: sitofp_v16i8_double: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: mov v2.b[0], v0.b[0] -; CHECK-NEXT: mov v3.b[0], v0.b[2] -; CHECK-NEXT: mov v4.b[0], v0.b[4] -; CHECK-NEXT: mov v5.b[0], v0.b[6] -; CHECK-NEXT: mov v6.b[0], v1.b[0] -; CHECK-NEXT: mov v7.b[0], v1.b[2] -; CHECK-NEXT: mov v16.b[0], v1.b[4] -; CHECK-NEXT: mov v17.b[0], v1.b[6] +; CHECK-NEXT: mov b2, v0.b[0] +; CHECK-NEXT: mov b3, v0.b[2] +; CHECK-NEXT: mov b4, v0.b[4] +; CHECK-NEXT: mov b5, v0.b[6] +; CHECK-NEXT: mov b6, v1.b[0] +; CHECK-NEXT: mov b7, v1.b[2] +; CHECK-NEXT: mov b16, 
v1.b[4] +; CHECK-NEXT: mov b17, v1.b[6] ; CHECK-NEXT: mov v2.b[4], v0.b[1] ; CHECK-NEXT: mov v3.b[4], v0.b[3] ; CHECK-NEXT: mov v4.b[4], v0.b[5] @@ -396,10 +396,10 @@ define <8 x double> @uitofp_v8i8_double(<8 x i8> %a) { ; CHECK-LABEL: uitofp_v8i8_double: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v2.b[0], v0.b[0] -; CHECK-NEXT: mov v3.b[0], v0.b[2] -; CHECK-NEXT: mov v4.b[0], v0.b[4] -; CHECK-NEXT: mov v5.b[0], v0.b[6] +; CHECK-NEXT: mov b2, v0.b[0] +; CHECK-NEXT: mov b3, v0.b[2] +; CHECK-NEXT: mov b4, v0.b[4] +; CHECK-NEXT: mov b5, v0.b[6] ; CHECK-NEXT: movi d1, #0x0000ff000000ff ; CHECK-NEXT: mov v2.b[4], v0.b[1] ; CHECK-NEXT: mov v3.b[4], v0.b[3] @@ -426,15 +426,15 @@ define <16 x double> @uitofp_v16i8_double(<16 x i8> %a) { ; CHECK-LABEL: uitofp_v16i8_double: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: mov v3.b[0], v0.b[0] -; CHECK-NEXT: mov v4.b[0], v0.b[2] -; CHECK-NEXT: mov v5.b[0], v0.b[4] -; CHECK-NEXT: mov v6.b[0], v0.b[6] +; CHECK-NEXT: mov b3, v0.b[0] +; CHECK-NEXT: mov b4, v0.b[2] +; CHECK-NEXT: mov b5, v0.b[4] +; CHECK-NEXT: mov b6, v0.b[6] ; CHECK-NEXT: movi d1, #0x0000ff000000ff -; CHECK-NEXT: mov v7.b[0], v2.b[0] -; CHECK-NEXT: mov v16.b[0], v2.b[2] -; CHECK-NEXT: mov v17.b[0], v2.b[4] -; CHECK-NEXT: mov v18.b[0], v2.b[6] +; CHECK-NEXT: mov b7, v2.b[0] +; CHECK-NEXT: mov b16, v2.b[2] +; CHECK-NEXT: mov b17, v2.b[4] +; CHECK-NEXT: mov b18, v2.b[6] ; CHECK-NEXT: mov v3.b[4], v0.b[1] ; CHECK-NEXT: mov v4.b[4], v0.b[3] ; CHECK-NEXT: mov v5.b[4], v0.b[5] diff --git a/llvm/test/CodeGen/AArch64/vselect-constants.ll b/llvm/test/CodeGen/AArch64/vselect-constants.ll index 5e6ff1e0740ce..a7cf5ece5d270 100644 --- a/llvm/test/CodeGen/AArch64/vselect-constants.ll +++ b/llvm/test/CodeGen/AArch64/vselect-constants.ll @@ -369,10 +369,9 @@ define <2 x i64> @not_signbit_mask_v2i64(<2 x i64> %a, <2 x i64> %b) { define @signbit_mask_xor_nxv16i8( %a, %b) #0 { ; CHECK-LABEL: 
signbit_mask_xor_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: cmplt p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: eor z0.d, z0.d, z1.d -; CHECK-NEXT: mov z0.b, p0/m, #0 // =0x0 +; CHECK-NEXT: eor z1.d, z0.d, z1.d +; CHECK-NEXT: asr z0.b, z0.b, #7 +; CHECK-NEXT: bic z0.d, z1.d, z0.d ; CHECK-NEXT: ret %cond = icmp slt %a, zeroinitializer %xor = xor %a, %b diff --git a/llvm/test/CodeGen/AArch64/win64_vararg2.ll b/llvm/test/CodeGen/AArch64/win64_vararg2.ll index dff49148fb772..2d3156a3aadac 100644 --- a/llvm/test/CodeGen/AArch64/win64_vararg2.ll +++ b/llvm/test/CodeGen/AArch64/win64_vararg2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-pc-win32 | FileCheck %s -; RUN: llc < %s -global-isel -mtriple=aarch64-pc-win32 | FileCheck %s --check-prefix=GISEL +; RUN: llc < %s -global-isel -mtriple=aarch64-pc-win32 -global-isel-abort=0 | FileCheck %s --check-prefix=GISEL ; Function Attrs: mustprogress noinline nounwind optnone uwtable define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) { @@ -14,13 +14,15 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) { ; CHECK-NEXT: str x30, [sp, #24] // 8-byte Folded Spill ; CHECK-NEXT: .seh_save_reg x30, 24 ; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: add x8, sp, #40 ; CHECK-NEXT: mov w19, w0 ; CHECK-NEXT: stp x3, x4, [sp, #40] ; CHECK-NEXT: stp x5, x6, [sp, #56] ; CHECK-NEXT: str x7, [sp, #72] -; CHECK-NEXT: str w0, [sp, #12] -; CHECK-NEXT: strb w1, [sp, #11] -; CHECK-NEXT: strb w2, [sp, #10] +; CHECK-NEXT: str x8, [sp, #8] +; CHECK-NEXT: str w0, [sp, #4] +; CHECK-NEXT: strb w1, [sp, #3] +; CHECK-NEXT: strb w2, [sp, #2] ; CHECK-NEXT: bl other ; CHECK-NEXT: cmp w19, w0 ; CHECK-NEXT: cset w0, ls @@ -46,13 +48,15 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) 
{ ; GISEL-NEXT: str x30, [sp, #24] // 8-byte Folded Spill ; GISEL-NEXT: .seh_save_reg x30, 24 ; GISEL-NEXT: .seh_endprologue -; GISEL-NEXT: stp x3, x4, [sp, #40] +; GISEL-NEXT: add x8, sp, #40 ; GISEL-NEXT: mov w19, w0 +; GISEL-NEXT: stp x3, x4, [sp, #40] ; GISEL-NEXT: stp x5, x6, [sp, #56] ; GISEL-NEXT: str x7, [sp, #72] -; GISEL-NEXT: str w0, [sp, #12] -; GISEL-NEXT: strb w1, [sp, #11] -; GISEL-NEXT: strb w2, [sp, #10] +; GISEL-NEXT: str x8, [sp, #8] +; GISEL-NEXT: str w0, [sp, #4] +; GISEL-NEXT: strb w1, [sp, #3] +; GISEL-NEXT: strb w2, [sp, #2] ; GISEL-NEXT: bl other ; GISEL-NEXT: cmp w19, w0 ; GISEL-NEXT: cset w0, ls @@ -67,6 +71,8 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) { ; GISEL-NEXT: ret ; GISEL-NEXT: .seh_endfunclet ; GISEL-NEXT: .seh_endproc + %valist = alloca ptr + call void @llvm.va_start(ptr %valist) %a_alloc = alloca i32, align 4 %b_alloc = alloca i8, align 1 %c_alloc = alloca i8, align 1 @@ -76,6 +82,7 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) { %a_load = load i32, ptr %a_alloc, align 4 %ret = call noundef i32 @other() %cmp = icmp ule i32 %a_load, %ret + call void @llvm.va_end(ptr %valist) ret i1 %cmp } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index c8b82716a9fe1..814acc3be1fc0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -281,12 +281,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 ; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; 
GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_splat: @@ -323,12 +323,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 ; GFX8-NEXT: s_add_i32 s1, s1, 4 +; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_lo: @@ -365,12 +365,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, 4 ; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_add_i32 s0, s0, 4 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_hi: @@ -408,14 +408,13 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX8-LABEL: s_add_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: 
s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16: @@ -461,14 +460,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_fneg_lhs: @@ -517,14 +515,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_fneg_rhs: @@ -580,14 +577,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_add_i32 s0, 
s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 82d87358e1faf..aea32b3fedba7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -70,30 +70,15 @@ define i8 @v_ashr_i8_7(i8 %value) { } define amdgpu_ps i8 @s_ashr_i8(i8 inreg %value, i8 inreg %amount) { -; GFX6-LABEL: s_ashr_i8: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i8 s0, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_ashr_i8: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i8 s0, s0 -; GFX8-NEXT: s_sext_i32_i8 s1, s1 -; GFX8-NEXT: s_ashr_i32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_ashr_i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i8 s0, s0 -; GFX9-NEXT: s_sext_i32_i8 s1, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_ashr_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_sext_i32_i8 s0, s0 +; GCN-NEXT: s_ashr_i32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i8: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 -; GFX10PLUS-NEXT: s_sext_i32_i8 s1, s1 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i8 %value, %amount @@ -642,30 +627,15 @@ define i16 @v_ashr_i16_15(i16 %value) { } define amdgpu_ps i16 @s_ashr_i16(i16 inreg %value, i16 inreg %amount) { -; GFX6-LABEL: s_ashr_i16: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_sext_i32_i16 s0, s0 -; GFX6-NEXT: 
s_ashr_i32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_ashr_i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_ashr_i32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_ashr_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_sext_i32_i16 s1, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_ashr_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_sext_i32_i16 s0, s0 +; GCN-NEXT: s_ashr_i32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i16: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 -; GFX10PLUS-NEXT: s_sext_i32_i16 s1, s1 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i16 %value, %amount @@ -826,14 +796,15 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; ; GFX8-LABEL: s_ashr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 -; GFX8-NEXT: s_ashr_i32 s2, s2, s3 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_ashr_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s1, s2, 0xffff +; GFX8-NEXT: s_sext_i32_i16 s1, s2 +; GFX8-NEXT: s_ashr_i32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1028,23 +999,25 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; ; GFX8-LABEL: s_ashr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_bfe_i32 
s1, s1, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s6, s2 -; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s7, s3 -; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 -; GFX8-NEXT: s_ashr_i32 s4, s4, s6 +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_ashr_i32 s0, s0, s2 -; GFX8-NEXT: s_ashr_i32 s2, s5, s7 +; GFX8-NEXT: s_sext_i32_i16 s2, s4 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_ashr_i32 s2, s2, s6 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_ashr_i32 s1, s1, s3 -; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s3, s4, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s0, s0, s3 +; GFX8-NEXT: s_sext_i32_i16 s3, s5 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_ashr_i32 s3, s3, s7 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1235,41 +1208,45 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; ; GFX8-LABEL: s_ashr_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s8, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s9, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s12, s4 -; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s13, s5 -; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s10, s2 -; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s14, s6 -; GFX8-NEXT: s_bfe_i32 s6, s6, 0x100010 +; GFX8-NEXT: s_lshr_b32 s8, s0, 16 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_ashr_i32 s4, s9, s13 +; 
GFX8-NEXT: s_sext_i32_i16 s4, s8 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_ashr_i32 s4, s4, s12 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_ashr_i32 s1, s1, s5 -; GFX8-NEXT: s_sext_i32_i16 s11, s3 -; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 -; GFX8-NEXT: s_sext_i32_i16 s15, s7 -; GFX8-NEXT: s_bfe_i32 s7, s7, 0x100010 -; GFX8-NEXT: s_ashr_i32 s5, s10, s14 +; GFX8-NEXT: s_sext_i32_i16 s5, s9 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_ashr_i32 s5, s5, s13 +; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_ashr_i32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s4, s4, 0xffff -; GFX8-NEXT: s_ashr_i32 s8, s8, s12 -; GFX8-NEXT: s_ashr_i32 s6, s11, s15 +; GFX8-NEXT: s_sext_i32_i16 s6, s10 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 +; GFX8-NEXT: s_lshr_b32 s11, s3, 16 +; GFX8-NEXT: s_ashr_i32 s6, s6, s14 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_ashr_i32 s3, s3, s7 +; GFX8-NEXT: s_sext_i32_i16 s7, s11 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s4, s5, 0xffff -; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s7, s8, 0xffff +; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX8-NEXT: s_ashr_i32 s7, s7, s15 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s4, s6, 0xffff -; GFX8-NEXT: s_or_b32 s0, s0, s7 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog ; diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir new file mode 100644 index 0000000000000..77d30f6fa5223 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir @@ -0,0 +1,146 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-regbank-combiner %s -o - | FileCheck %s + +--- +name: lshr_zext_i16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + ; CHECK-LABEL: name: lshr_zext_i16 + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]] + ; CHECK-NEXT: %res:sgpr(s32) = G_LSHR %src, [[AND]](s32) + ; CHECK-NEXT: $sgpr0 = COPY %res(s32) + %src:sgpr(s32) = COPY $sgpr0 + %regamt:sgpr(s32) = COPY $sgpr1 + %amt:sgpr(s16) = G_TRUNC %regamt + %zextamt:sgpr(s32) = G_ZEXT %amt + %res:sgpr(s32) = G_LSHR %src, %zextamt + $sgpr0 = COPY %res +... + +--- +name: ashr_zext_i16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + ; CHECK-LABEL: name: ashr_zext_i16 + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]] + ; CHECK-NEXT: %res:sgpr(s32) = G_ASHR %src, [[AND]](s32) + ; CHECK-NEXT: $sgpr0 = COPY %res(s32) + %src:sgpr(s32) = COPY $sgpr0 + %regamt:sgpr(s32) = COPY $sgpr1 + %amt:sgpr(s16) = G_TRUNC %regamt + %zextamt:sgpr(s32) = G_ZEXT %amt + %res:sgpr(s32) = G_ASHR %src, %zextamt + $sgpr0 = COPY %res +... 
+ +--- +name: shl_zext_i16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + ; CHECK-LABEL: name: shl_zext_i16 + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]] + ; CHECK-NEXT: %res:sgpr(s32) = G_SHL %src, [[AND]](s32) + ; CHECK-NEXT: $sgpr0 = COPY %res(s32) + %src:sgpr(s32) = COPY $sgpr0 + %regamt:sgpr(s32) = COPY $sgpr1 + %amt:sgpr(s16) = G_TRUNC %regamt + %zextamt:sgpr(s32) = G_ZEXT %amt + %res:sgpr(s32) = G_SHL %src, %zextamt + $sgpr0 = COPY %res +... + +--- +name: lshr_zext_i8 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + ; CHECK-LABEL: name: lshr_zext_i8 + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]] + ; CHECK-NEXT: %res:sgpr(s32) = G_LSHR %src, [[AND]](s32) + ; CHECK-NEXT: $sgpr0 = COPY %res(s32) + %src:sgpr(s32) = COPY $sgpr0 + %regamt:sgpr(s32) = COPY $sgpr1 + %amt:sgpr(s8) = G_TRUNC %regamt + %zextamt:sgpr(s32) = G_ZEXT %amt + %res:sgpr(s32) = G_LSHR %src, %zextamt + $sgpr0 = COPY %res +... 
+ +--- +name: ashr_zext_i8 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + ; CHECK-LABEL: name: ashr_zext_i8 + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]] + ; CHECK-NEXT: %res:sgpr(s32) = G_ASHR %src, [[AND]](s32) + ; CHECK-NEXT: $sgpr0 = COPY %res(s32) + %src:sgpr(s32) = COPY $sgpr0 + %regamt:sgpr(s32) = COPY $sgpr1 + %amt:sgpr(s8) = G_TRUNC %regamt + %zextamt:sgpr(s32) = G_ZEXT %amt + %res:sgpr(s32) = G_ASHR %src, %zextamt + $sgpr0 = COPY %res +... + +--- +name: shl_zext_i8 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + ; CHECK-LABEL: name: shl_zext_i8 + ; CHECK: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]] + ; CHECK-NEXT: %res:sgpr(s32) = G_SHL %src, [[AND]](s32) + ; CHECK-NEXT: $sgpr0 = COPY %res(s32) + %src:sgpr(s32) = COPY $sgpr0 + %regamt:sgpr(s32) = COPY $sgpr1 + %amt:sgpr(s8) = G_TRUNC %regamt + %zextamt:sgpr(s32) = G_ZEXT %amt + %res:sgpr(s32) = G_SHL %src, %zextamt + $sgpr0 = COPY %res +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll index aae999ec0a99a..75913d5219af2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dropped_debug_info_assert.ll @@ -9,46 +9,46 @@ define amdgpu_kernel void @call_debug_loc() { ; CHECK: bb.1.entry: ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2, debug-location !7 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1, debug-location !7 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0, debug-location !7 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16, debug-location !7 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15, debug-location !7 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14, debug-location !7 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11, debug-location !7 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7, debug-location !7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5, debug-location !7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2, debug-location !8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1, debug-location !8 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0, debug-location !8 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16, debug-location !8 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15, debug-location !8 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14, debug-location !8 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11, debug-location !8 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7, debug-location !8 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5, debug-location !8 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 
= COPY $sgpr8_sgpr9 - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, debug-location !7 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[COPY8]], debug-location !7 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[COPY7]], debug-location !7 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[COPY6]], debug-location !7 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY5]], debug-location !7 - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]], debug-location !7 - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY3]], debug-location !7 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF debug-location !7 - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10, debug-location !7 - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], debug-location !7 - ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[COPY1]], implicit $exec, debug-location !7 - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 20, debug-location !7 - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]], debug-location !7 - ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY17]], [[COPY]], implicit $exec, debug-location !7 - ; CHECK-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 [[COPY2]], [[V_LSHLREV_B32_e64_]], [[V_LSHLREV_B32_e64_1]], implicit $exec, debug-location !7 - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3, debug-location !7 - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]], debug-location !7 - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]], debug-location !7 - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]], debug-location !7 - ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY9]], debug-location !7 - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]], debug-location !7 - ; CHECK-NEXT: $sgpr12 = COPY [[COPY13]], debug-location !7 - ; CHECK-NEXT: $sgpr13 = COPY [[COPY14]], debug-location !7 - ; CHECK-NEXT: $sgpr14 = COPY 
[[COPY15]], debug-location !7 - ; CHECK-NEXT: $sgpr15 = COPY [[DEF]], debug-location !7 - ; CHECK-NEXT: $vgpr31 = COPY [[V_OR3_B32_e64_]], debug-location !7 - ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee, target-flags(amdgpu-gotprel32-hi) @callee, implicit-def $scc, debug-location !7 - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0, debug-location !7 :: (dereferenceable invariant load (p0) from got, addrspace 4) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_LOAD_DWORDX2_IMM]], @callee, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, debug-location !7 - ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, debug-location !7 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, debug-location !8 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[COPY8]], debug-location !8 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[COPY7]], debug-location !8 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[COPY6]], debug-location !8 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY5]], debug-location !8 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY4]], debug-location !8 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY3]], debug-location !8 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF debug-location !8 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10, debug-location !8 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], debug-location !8 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[COPY1]], implicit $exec, debug-location !8 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 20, debug-location !8 + ; CHECK-NEXT: 
[[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]], debug-location !8 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY17]], [[COPY]], implicit $exec, debug-location !8 + ; CHECK-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 [[COPY2]], [[V_LSHLREV_B32_e64_]], [[V_LSHLREV_B32_e64_1]], implicit $exec, debug-location !8 + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3, debug-location !8 + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY18]], debug-location !8 + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]], debug-location !8 + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]], debug-location !8 + ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY9]], debug-location !8 + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY12]], debug-location !8 + ; CHECK-NEXT: $sgpr12 = COPY [[COPY13]], debug-location !8 + ; CHECK-NEXT: $sgpr13 = COPY [[COPY14]], debug-location !8 + ; CHECK-NEXT: $sgpr14 = COPY [[COPY15]], debug-location !8 + ; CHECK-NEXT: $sgpr15 = COPY [[DEF]], debug-location !8 + ; CHECK-NEXT: $vgpr31 = COPY [[V_OR3_B32_e64_]], debug-location !8 + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee, target-flags(amdgpu-gotprel32-hi) @callee, implicit-def $scc, debug-location !8 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0, debug-location !8 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_LOAD_DWORDX2_IMM]], @callee, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, debug-location !8 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, debug-location !8 ; CHECK-NEXT: S_ENDPGM 0 entry: call void @callee(), !dbg !6 @@ -60,11 +60,11 
@@ define void @returnaddress_debug_loc(ptr addrspace(1) %ptr) { ; CHECK: bb.1.entry: ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31, debug-location !7 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31, debug-location !8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]], debug-location !7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]], debug-location !8 ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE]], [[COPY3]], 0, 0, implicit $exec :: (store (p0) into %ir.ptr, addrspace 1) ; CHECK-NEXT: SI_RETURN entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 2a186f527ab70..768a4d039aef9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -3329,9 +3329,7 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) { ; GFX6-NEXT: v_and_b32_e32 v3, 15, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3486,10 +3484,8 @@ define amdgpu_ps half @v_fshl_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt) ; GFX6-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; 
GFX6-NEXT: ; return to shader part epilog @@ -3793,20 +3789,16 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX6-NEXT: v_and_b32_e32 v6, 15, v4 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3942,18 +3934,14 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 @@ -4450,28 +4438,22 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX6-NEXT: v_and_b32_e32 v9, 15, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: 
v_and_b32_e32 v9, 0xffff, v9 ; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 15, v7 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 15, v8 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -4790,37 +4772,29 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX6-NEXT: v_and_b32_e32 v12, 15, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v12, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v9 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v9 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10 ; GFX6-NEXT: v_and_b32_e32 
v5, 15, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v11 ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11 ; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index fd89a46ecbf62..e0f12256b4d9e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -3077,11 +3077,9 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) { ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3235,9 +3233,7 @@ define amdgpu_ps half @v_fshr_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt) ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3570,26 +3566,22 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, 
<2 x i16> %amt) { ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 14, v5 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v4 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3735,32 +3727,28 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_bfe_u32 s4, s2, 0xf0001 -; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshr_b32 s4, s4, 14 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: s_or_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: 
v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_bfe_u32 s4, s3, 0xf0001 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 -; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 1 ; GFX6-NEXT: s_lshr_b32 s4, s4, 14 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 -; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: s_or_b32 s1, s1, s4 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 @@ -4358,26 +4346,22 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 14, v8 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX6-NEXT: v_and_b32_e32 v9, 15, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_and_b32_e32 v3, 15, v8 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: 
v_bfe_u32 v3, v5, 1, 15 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 @@ -4388,9 +4372,7 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX6-NEXT: v_and_b32_e32 v5, 15, v4 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -4782,26 +4764,22 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX6-NEXT: v_lshrrev_b32_e32 v10, 14, v10 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8 ; GFX6-NEXT: v_and_b32_e32 v11, 15, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v11, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 @@ -4818,20 +4796,16 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX6-NEXT: v_and_b32_e32 v8, 15, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: 
v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v7 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll index 96c9f40e317ea..3da3355e51cf9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @asm_convergent() convergent{ ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: INLINEASM &s_barrier, 33 /* sideeffect isconvergent attdialect */, !1 + ; CHECK-NEXT: INLINEASM &s_barrier, 33 /* sideeffect isconvergent attdialect */, !2 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "s_barrier", ""() convergent, !srcloc !0 ret void @@ -19,8 +19,8 @@ define amdgpu_kernel void @asm_simple_memory_clobber() { ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: INLINEASM &"", 25 /* sideeffect mayload maystore attdialect */, !1 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, !1 + ; CHECK-NEXT: INLINEASM &"", 25 /* sideeffect mayload maystore attdialect */, !2 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, !2 ; CHECK-NEXT: S_ENDPGM 0 
call void asm sideeffect "", "~{memory}"(), !srcloc !0 call void asm sideeffect "", ""(), !srcloc !0 @@ -33,7 +33,7 @@ define amdgpu_kernel void @asm_simple_vgpr_clobber() { ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, 7", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0, !1 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, 7", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0, !2 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "v_mov_b32 v0, 7", "~{v0}"(), !srcloc !0 ret void @@ -45,7 +45,7 @@ define amdgpu_kernel void @asm_simple_sgpr_clobber() { ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, 7", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $sgpr0, !1 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, 7", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $sgpr0, !2 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "s_mov_b32 s0, 7", "~{s0}"(), !srcloc !0 ret void @@ -57,7 +57,7 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() { ; CHECK-NEXT: liveins: $sgpr8_sgpr9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: INLINEASM &"; def a0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $agpr0, !1 + ; CHECK-NEXT: INLINEASM &"; def a0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $agpr0, !2 ; CHECK-NEXT: S_ENDPGM 0 call void asm sideeffect "; def a0", "~{a0}"(), !srcloc !0 ret void @@ -66,7 +66,7 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() { define i32 @asm_vgpr_early_clobber() { ; CHECK-LABEL: name: asm_vgpr_early_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* 
sideeffect attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %8, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %9, !1 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %8, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %9, !2 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll index 101bb6c0ed123..296eeaed0a287 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll @@ -5,8 +5,7 @@ define i32 @reloc_constant() { ; CHECK-LABEL: name: reloc_constant ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), !0 - ; We cannot have any specific metadata check here as ConstantAsMetadata is printed as + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), !1 ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), <0x{{[0-9a-f]+}}> ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[INT]], [[INT1]] ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll index b6b4301dadc7a..f1800dc6afcb9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-legalize-range-metadata.ll @@ -77,7 +77,7 @@ define <2 x i64> @global_load_v2i64_align16__rangemd(ptr addrspace(1) %ptr) { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: 
[[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[MV]](p1) :: (load (<2 x s64>) from %ir.ptr, !range !2, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[MV]](p1) :: (load (<2 x s64>) from %ir.ptr, !range !3, addrspace 1) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -119,7 +119,7 @@ define i32 @global_sextload_i8_align1__rangemd(ptr addrspace(1) %ptr) { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[MV]](p1) :: (load (s8) from %ir.ptr, !range !0, addrspace 1) + ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[MV]](p1) :: (load (s8) from %ir.ptr, !range !1, addrspace 1) ; CHECK-NEXT: $vgpr0 = COPY [[SEXTLOAD]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %load = load i8, ptr addrspace(1) %ptr, align 1, !range !0, !noundef !1 @@ -135,7 +135,7 @@ define i32 @global_zextload_i8_align1__rangemd(ptr addrspace(1) %ptr) { ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[MV]](p1) :: (load (s8) from %ir.ptr, !range !4, addrspace 1) + ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[MV]](p1) :: (load (s8) from %ir.ptr, !range !5, addrspace 1) ; CHECK-NEXT: $vgpr0 = COPY [[SEXTLOAD]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %load = load i8, ptr addrspace(1) %ptr, align 1, !range !4, !noundef !1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 7fa0d23e55938..be1dc7f0c67f9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -85,14 +85,27 @@ define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) { } define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) { -; GCN-LABEL: s_lshr_i8_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_bfe_u32 s0, s0, 0x10007 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_lshr_i8_7: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x10007 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_lshr_i8_7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_lshr_b32 s0, s0, 7 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_lshr_i8_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_lshr_b32 s0, s0, 7 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i8_7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x10007 +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 7 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i8 %value, 7 ret i8 %result @@ -619,15 +632,27 @@ define i16 @v_lshr_i16_15(i16 %value) { } define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) { -; GCN-LABEL: s_lshr_i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_lshr_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_lshr_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_lshr_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog ; ; 
GFX10PLUS-LABEL: s_lshr_i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i16 %value, %amount @@ -635,14 +660,27 @@ define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) { } define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) { -; GCN-LABEL: s_lshr_i16_15: -; GCN: ; %bb.0: -; GCN-NEXT: s_bfe_u32 s0, s0, 0x1000f -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_lshr_i16_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x1000f +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_lshr_i16_15: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, 15 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_lshr_i16_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX9-NEXT: s_lshr_b32 s0, s0, 15 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i16_15: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x1000f +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 15 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i16 %value, 15 ret i16 %result @@ -783,13 +821,12 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; GFX8-LABEL: s_lshr_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: s_lshr_b32 s1, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v2i16: @@ -970,21 +1007,19 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; 
GFX8-LABEL: s_lshr_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s2, s4, s6 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 ; GFX8-NEXT: s_lshr_b32 s3, s5, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v4i16: @@ -1155,37 +1190,33 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; GFX8-LABEL: s_lshr_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 ; GFX8-NEXT: s_lshr_b32 s4, s8, s12 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_lshr_b32 s1, s1, s5 ; GFX8-NEXT: s_lshr_b32 s5, s9, s13 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshr_b32 s2, s2, s6 ; GFX8-NEXT: s_lshr_b32 s6, s10, s14 -; GFX8-NEXT: s_or_b32 s0, s4, s0 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 
s1, s1, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s3, s7 ; GFX8-NEXT: s_lshr_b32 s7, s11, s15 -; GFX8-NEXT: s_or_b32 s1, s4, s1 +; GFX8-NEXT: s_or_b32 s1, s1, s4 ; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s2, s4, s2 +; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff -; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll index 44b12a9f6fe81..80243d658ae00 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll @@ -12,11 +12,11 @@ define void @fence_loads(ptr %ptr) { ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: ATOMIC_FENCE 5, 1, mmra !0 - ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr, align 4) + ; CHECK-NEXT: ATOMIC_FENCE 5, 1, mmra !1 + ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (load acquire (s8) from %ir.ptr, align 4) ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: FLAT_STORE_BYTE [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (store release (s8) into %ir.ptr, align 4) + ; CHECK-NEXT: FLAT_STORE_BYTE [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !3 :: (store release (s8) into %ir.ptr, align 4) ; CHECK-NEXT: SI_RETURN fence release, !mmra !0 %ld = load atomic i8, ptr %ptr 
acquire, align 4, !mmra !2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 455446aa38c60..f9cb584d27ecc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -8,37 +8,18 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { -; GFX7-LABEL: s_mul_i16: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_mul_i32 s0, s0, s1 -; GFX7-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_mul_i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_mul_i32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_mul_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_mul_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_mul_i32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: s_mul_i16: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_and_b32 s0, s0, 0xffff -; GFX12-NEXT: s_and_b32 s1, s1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s0, s0, s1 ; GFX12-NEXT: ; return to shader part epilog %result = mul i16 %num, %den @@ -106,35 +87,27 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre ; ; GFX8-LABEL: s_mul_i16_zeroext: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; 
return to shader part epilog ; ; GFX9-LABEL: s_mul_i16_zeroext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i16_zeroext: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: s_mul_i16_zeroext: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_and_b32 s0, s0, 0xffff -; GFX12-NEXT: s_and_b32 s1, s1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s0, s0, s1 -; GFX12-NEXT: s_and_b32 s0, s0, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX12-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result @@ -197,42 +170,22 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) { } define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) { -; GFX7-LABEL: s_mul_i16_signext: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_mul_i32 s0, s0, s1 -; GFX7-NEXT: s_sext_i32_i16 s0, s0 -; GFX7-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_mul_i16_signext: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_mul_i32 s0, s0, s1 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_mul_i16_signext: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: 
s_mul_i16_signext: +; GCN: ; %bb.0: +; GCN-NEXT: s_mul_i32 s0, s0, s1 +; GCN-NEXT: s_sext_i32_i16 s0, s0 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_mul_i16_signext: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: s_mul_i16_signext: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_and_b32 s0, s0, 0xffff -; GFX12-NEXT: s_and_b32 s1, s1, 0xffff -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_mul_i32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_sext_i32_i16 s0, s0 ; GFX12-NEXT: ; return to shader part epilog %result = mul i16 %num, %den diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll index 131970148ed05..46b75eb55cb52 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -40,30 +40,14 @@ define i8 @v_sext_inreg_i8_7(i8 %value) { } define amdgpu_ps i8 @s_sext_inreg_i8(i8 inreg %value) { -; GFX6-LABEL: s_sext_inreg_i8: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x50000 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_sext_inreg_i8: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 3 -; GFX8-NEXT: s_sext_i32_i8 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 3 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_sext_inreg_i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 3 -; GFX9-NEXT: s_sext_i32_i8 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 3 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_sext_inreg_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x50000 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, 
s0, 3 -; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 3 +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x50000 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i8 %value, 3 %ashr = ashr i8 %shl, 3 @@ -71,30 +55,14 @@ define amdgpu_ps i8 @s_sext_inreg_i8(i8 inreg %value) { } define amdgpu_ps i8 @s_sext_inreg_i8_6(i8 inreg %value) { -; GFX6-LABEL: s_sext_inreg_i8_6: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20000 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_sext_inreg_i8_6: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 6 -; GFX8-NEXT: s_sext_i32_i8 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 6 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_sext_inreg_i8_6: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 6 -; GFX9-NEXT: s_sext_i32_i8 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 6 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_sext_inreg_i8_6: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x20000 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i8_6: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 6 -; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 6 +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x20000 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i8 %value, 6 %ashr = ashr i8 %shl, 6 @@ -545,30 +513,14 @@ define i16 @v_sext_inreg_i16_15(i16 %value) { } define amdgpu_ps i16 @s_sext_inreg_i16_9(i16 inreg %value) { -; GFX6-LABEL: s_sext_inreg_i16_9: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x70000 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_sext_inreg_i16_9: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 9 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 9 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_sext_inreg_i16_9: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 9 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, 
s0, 9 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_sext_inreg_i16_9: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x70000 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i16_9: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 -; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 9 +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x70000 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i16 %value, 9 %ashr = ashr i16 %shl, 9 @@ -576,30 +528,14 @@ define amdgpu_ps i16 @s_sext_inreg_i16_9(i16 inreg %value) { } define amdgpu_ps i16 @s_sext_inreg_i16_15(i16 inreg %value) { -; GFX6-LABEL: s_sext_inreg_i16_15: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10000 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_sext_inreg_i16_15: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 15 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 15 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_sext_inreg_i16_15: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 15 -; GFX9-NEXT: s_sext_i32_i16 s0, s0 -; GFX9-NEXT: s_ashr_i32 s0, s0, 15 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_sext_inreg_i16_15: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x10000 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i16_15: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 15 -; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 15 +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x10000 ; GFX10PLUS-NEXT: ; return to shader part epilog %shl = shl i16 %value, 15 %ashr = ashr i16 %shl, 15 @@ -690,15 +626,11 @@ define amdgpu_ps i32 @s_sext_inreg_v2i16_11(<2 x i16> inreg %value) { ; ; GFX8-LABEL: s_sext_inreg_v2i16_11: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 11 -; GFX8-NEXT: s_lshl_b32 s1, s1, 11 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: 
s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_ashr_i32 s0, s0, 11 -; GFX8-NEXT: s_ashr_i32 s1, s1, 11 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_bfe_i32 s1, s0, 0x50000 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x50010 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -823,25 +755,17 @@ define amdgpu_ps <2 x i32> @s_sext_inreg_v4i16_14(<4 x i16> inreg %value) { ; ; GFX8-LABEL: s_sext_inreg_v4i16_14: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 14 -; GFX8-NEXT: s_lshl_b32 s2, s2, 14 -; GFX8-NEXT: s_lshl_b32 s1, s1, 14 -; GFX8-NEXT: s_lshl_b32 s3, s3, 14 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_ashr_i32 s0, s0, 14 -; GFX8-NEXT: s_ashr_i32 s2, s2, 14 -; GFX8-NEXT: s_ashr_i32 s1, s1, 14 -; GFX8-NEXT: s_ashr_i32 s3, s3, 14 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_bfe_i32 s2, s0, 0x20000 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x20010 +; GFX8-NEXT: s_bfe_i32 s3, s1, 0x20000 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x20010 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s2, s0 -; GFX8-NEXT: s_lshl_b32 s2, s3, 16 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_and_b32 s2, s3, 0xffff +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1036,45 +960,29 @@ define amdgpu_ps <4 x i32> @s_sext_inreg_v8i16_5(<8 x i16> inreg %value) { ; ; GFX8-LABEL: s_sext_inreg_v8i16_5: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 5 -; GFX8-NEXT: s_lshl_b32 s4, s4, 5 -; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; 
GFX8-NEXT: s_lshl_b32 s1, s1, 5 -; GFX8-NEXT: s_lshl_b32 s5, s5, 5 -; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_lshl_b32 s2, s2, 5 -; GFX8-NEXT: s_lshl_b32 s6, s6, 5 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_ashr_i32 s0, s0, 5 -; GFX8-NEXT: s_ashr_i32 s4, s4, 5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 5 -; GFX8-NEXT: s_lshl_b32 s7, s7, 5 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_ashr_i32 s1, s1, 5 -; GFX8-NEXT: s_ashr_i32 s5, s5, 5 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0xb0000 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0xb0010 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0xb0000 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0xb0010 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sext_i32_i16 s7, s7 -; GFX8-NEXT: s_ashr_i32 s2, s2, 5 -; GFX8-NEXT: s_ashr_i32 s6, s6, 5 -; GFX8-NEXT: s_or_b32 s0, s4, s0 -; GFX8-NEXT: s_lshl_b32 s4, s5, 16 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0xb0000 +; GFX8-NEXT: s_bfe_i32 s2, s2, 0xb0010 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_ashr_i32 s3, s3, 5 -; GFX8-NEXT: s_ashr_i32 s7, s7, 5 -; GFX8-NEXT: s_or_b32 s1, s4, s1 -; GFX8-NEXT: s_lshl_b32 s4, s6, 16 +; GFX8-NEXT: s_bfe_i32 s7, s3, 0xb0000 +; GFX8-NEXT: s_bfe_i32 s3, s3, 0xb0010 +; GFX8-NEXT: s_or_b32 s0, s4, s0 +; GFX8-NEXT: s_and_b32 s4, s5, 0xffff +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_lshl_b32 s4, s7, 16 +; GFX8-NEXT: s_or_b32 s1, s4, s1 +; GFX8-NEXT: s_and_b32 s4, s6, 0xffff +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_and_b32 s3, s3, 0xffff +; GFX8-NEXT: s_or_b32 s2, s4, s2 +; GFX8-NEXT: s_and_b32 s4, s7, 0xffff +; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: ; return to shader 
part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 993d0f76ea10e..0806eecbcc1dd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -617,13 +617,12 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) { ; ; GFX8-LABEL: s_shl_v2i32_zext_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s2, 0x3fff -; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff3fff ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_lshl_b32 s0, s0, 2 ; GFX8-NEXT: s_lshl_b32 s1, s1, 2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_v2i32_zext_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 77917377f1cd6..139652eb55e3d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -64,26 +64,13 @@ define i8 @v_shl_i8_7(i8 %value) { } define amdgpu_ps i8 @s_shl_i8(i8 inreg %value, i8 inreg %amount) { -; GFX6-LABEL: s_shl_i8: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_shl_i8: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_shl_i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_shl_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; 
GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i8 %value, %amount @@ -620,26 +607,13 @@ define i16 @v_shl_i16_15(i16 %value) { } define amdgpu_ps i16 @s_shl_i16(i16 inreg %value, i16 inreg %amount) { -; GFX6-LABEL: s_shl_i16: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, s1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_shl_i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_shl_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 -; GFX9-NEXT: ; return to shader part epilog +; GCN-LABEL: s_shl_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i16: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i16 %value, %amount @@ -791,13 +765,13 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun ; GFX8-LABEL: s_shl_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_v2i16: @@ -976,21 +950,21 @@ define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; GFX8-LABEL: s_shl_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 
; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s2, s4, s6 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, s3 ; GFX8-NEXT: s_lshl_b32 s3, s5, s7 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s2, s0 -; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_v4i16: @@ -1157,37 +1131,37 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; GFX8-LABEL: s_shl_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: s_lshl_b32 s4, s8, s12 -; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 ; GFX8-NEXT: s_lshl_b32 s5, s9, s13 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff -; GFX8-NEXT: s_lshr_b32 s15, s7, 16 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s6, s10, s14 -; GFX8-NEXT: s_or_b32 s0, s4, s0 -; GFX8-NEXT: 
s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_lshr_b32 s11, s3, 16 +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 ; GFX8-NEXT: s_lshl_b32 s3, s3, s7 ; GFX8-NEXT: s_lshl_b32 s7, s11, s15 -; GFX8-NEXT: s_or_b32 s1, s4, s1 -; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, 0xffff -; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll index 6c104709f5ee3..9aa393ee137d6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll @@ -244,12 +244,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, 0xffff0040 -; GFX8-NEXT: s_add_i32 s1, s1, 0xffff0040 +; GFX8-NEXT: s_add_i32 s1, s1, 64 +; GFX8-NEXT: s_add_i32 s0, s0, 64 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_splat: @@ -284,12 +284,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_lo(<2 x i16> 
inreg %a) { ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, 0xffff0040 -; GFX8-NEXT: s_add_i32 s1, s1, -4 +; GFX8-NEXT: s_add_i32 s1, s1, 0xfffc +; GFX8-NEXT: s_add_i32 s0, s0, 64 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_lo: @@ -324,12 +324,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_add_i32 s0, s0, -4 -; GFX8-NEXT: s_add_i32 s1, s1, 0xffff0040 +; GFX8-NEXT: s_add_i32 s1, s1, 64 +; GFX8-NEXT: s_add_i32 s0, s0, 0xfffc +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_hi: @@ -365,14 +365,13 @@ define amdgpu_ps i32 @s_sub_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX8-LABEL: s_sub_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16: @@ -412,14 +411,13 @@ define amdgpu_ps 
i32 @s_sub_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_fneg_lhs: @@ -463,14 +461,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_sub_v2i16_fneg_rhs: @@ -516,14 +513,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sub_i32 s1, s2, s3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; 
GFX10-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index 17b6f5072116d..7d7452485fdf3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -35,15 +35,8 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in ; ; GFX8-LABEL: scalar_xnor_v2i16_one_use: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_xor_b32 s0, s0, s1 -; GFX8-NEXT: s_mov_b32 s3, s2 -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_xor_b32 s0, s0, -1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX900-LABEL: scalar_xnor_v2i16_one_use: @@ -127,21 +120,8 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; ; GFX8-LABEL: scalar_xnor_v4i16_one_use: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_and_b32 s2, s0, 0xffff -; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: s_and_b32 s6, s1, 0xffff -; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; GFX8-NEXT: s_xor_b64 s[2:3], s[6:7], s[4:5] -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_or_b32 s0, s1, s0 -; GFX8-NEXT: s_lshl_b32 s1, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], -1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX900-LABEL: scalar_xnor_v4i16_one_use: diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 0deddfb8d7310..50d20e9b0e4d7 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -102,13 +102,13 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_add_i32 s0, s2, s3 ; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_add_i32 s2, s2, s3 -; VI-NEXT: s_add_i32 s0, s0, s1 -; VI-NEXT: s_and_b32 s1, s2, 0xffff -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_lshr_b32 s2, s2, 16 +; VI-NEXT: s_add_i32 s2, s2, s1 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_lshl_b32 s1, s2, 16 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -167,16 +167,15 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_and_b32 s1, s2, 0xffff +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_add_i32 s0, s2, s2 ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -225,12 +224,12 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: s_lshr_b32 s4, s3, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 ; VI-NEXT: s_add_i32 s2, s2, s3 -; VI-NEXT: 
s_add_i32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s5, s4 ; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_lshl_b32 s3, s4, 16 +; VI-NEXT: s_lshl_b32 s3, s5, 16 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 8788dc2c059d6..fa73ef0b0ec4c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -1,61 +1,63 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; FIXME: Currently block machineinstr verifier due to SI BUNDLE pass break physical register liveness. 
Should remove when the issue is fixed up + +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs=0 < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs=0 < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs=0 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <32 x float> @bitcast_v32i32_to_v32f32(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; 
GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i32_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: 
v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v32f32: ; VI: ; %bb.0: @@ -225,56 +227,376 @@ end: ret <32 x float> %phi } +define inreg <32 x float> @bitcast_v32i32_to_v32f32_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: 
v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v32i32_to_v32f32_scalar: +; VI: ; 
%bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: 
v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v32f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, 
s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v32i32_to_v32f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: 
v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 
v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 
v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f32_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 
1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v32i32: ; VI: ; %bb.0: @@ -286,7 +608,7 @@ define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 @@ -321,7 +643,7 @@ define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -336,7 +658,7 @@ define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 @@ -371,7 +693,7 @@ define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -388,7 +710,7 @@ define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 @@ -407,7 +729,7 @@ define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -428,56 +750,360 @@ end: ret <32 x i32> %phi } +define inreg <32 x i32> @bitcast_v32f32_to_v32i32_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: 
v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v32f32_to_v32i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: 
v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v32f32_to_v32i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; 
GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 
v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v32f32_to_v32i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: 
s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v16i64: -; 
GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i32_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v16i64: ; VI: ; %bb.0: @@ -489,7 +1115,7 @@ define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 @@ -524,7 +1150,7 @@ define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -539,7 +1165,7 @@ define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 @@ -574,7 +1200,7 @@ define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -591,7 +1217,7 @@ define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 @@ -626,7 +1252,7 @@ define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -647,56 +1273,376 @@ end: ret <16 x i64> %phi } +define inreg <16 x i64> @bitcast_v32i32_to_v16i64_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: 
.LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v32i32_to_v16i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; 
VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; 
VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; 
GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v32i32_to_v16i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: 
v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; 
GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: 
v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i64_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: 
v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v32i32: ; VI: ; %bb.0: @@ -708,7 +1654,7 @@ define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -743,7 +1689,7 @@ define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt 
vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -758,7 +1704,7 @@ define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -793,7 +1739,7 @@ define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -810,7 +1756,7 @@ define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -853,7 +1799,7 @@ define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -874,182 +1820,510 @@ end: ret <32 x i32> %phi } -define <16 x double> @bitcast_v32i32_to_v16f64(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x i32> @bitcast_v16i64_to_v32i32_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v16i64_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 
v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 ; -; VI-LABEL: bitcast_v32i32_to_v16f64: +; VI-LABEL: bitcast_v16i64_to_v32i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; 
VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: 
v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 ; -; GFX9-LABEL: bitcast_v32i32_to_v16f64: +; GFX9-LABEL: bitcast_v16i64_to_v32i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v30, 3, 
v30 -; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 -; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 -; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 -; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 -; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 -; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v32i32_to_v16f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 -; GFX11-NEXT: 
v_add_nc_u32_e32 v30, 3, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: 
v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v16i64_to_v32i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 
v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 
0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, 
%cmp.false ] + ret <32 x i32> %phi +} + +define <16 x double> @bitcast_v32i32_to_v16f64(<32 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v32i32_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; 
SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v16f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 
v0, vcc, 3, v0 +; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i32_to_v16f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB8_2: ; %end +; 
GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v16f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 ; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 ; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 @@ -1072,7 +2346,7 @@ define <16 x double> @bitcast_v32i32_to_v16f64(<32 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1093,40 +2367,360 @@ end: ret <16 x double> %phi } +define inreg <16 x double> @bitcast_v32i32_to_v16f64_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: 
v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 
+; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v32i32_to_v16f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 
+; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v16f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; 
GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 
+; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v32i32_to_v16f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: 
s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <16 x double> + br 
label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f64_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v32i32: ; VI: ; %bb.0: @@ -1138,7 +2732,7 @@ define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 @@ -1157,7 +2751,7 @@ define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1172,7 +2766,7 @@ define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 @@ -1191,7 +2785,7 @@ define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1208,7 +2802,7 @@ define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 @@ -1227,7 +2821,7 @@ define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1248,1228 +2842,1478 @@ end: ret <32 x i32> %phi } +define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: 
v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v33, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: v_mov_b32_e32 v19, v33 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: 
bitcast_v16f64_to_v32i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 
v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v16f64_to_v32i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; 
GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v33 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v16f64_to_v32i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: 
v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v128i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; 
GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; 
implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; 
GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 8 -; 
GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 -; GCN-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], 
s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 -; 
GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshrrev_b32_e32 v31, 24, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: 
v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 
v31, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_alignbit_b32 v31, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshrrev_b32_e32 v31, 16, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: 
v_lshrrev_b32_e32 v42, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; GCN-NEXT: buffer_store_dword v31, 
off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v52 -; GCN-NEXT: v_or_b32_e32 v1, v1, v52 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_or_b32_e32 v2, v2, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v50, v31 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v62 -; GCN-NEXT: 
v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v31 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v50 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; GCN-NEXT: v_or_b32_e32 v31, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v61 -; GCN-NEXT: v_or_b32_e32 v49, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v48 -; GCN-NEXT: v_or_b32_e32 v2, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v60 -; GCN-NEXT: v_or_b32_e32 v61, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v39 -; GCN-NEXT: v_or_b32_e32 v62, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v57 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v38 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v37 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v47 -; GCN-NEXT: v_or_b32_e32 v8, 
v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v36 -; GCN-NEXT: v_or_b32_e32 v9, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v46 -; GCN-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v35 -; GCN-NEXT: v_or_b32_e32 v11, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v45 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v34 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v44 -; GCN-NEXT: v_or_b32_e32 v14, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v33 -; GCN-NEXT: v_or_b32_e32 v15, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v43 -; GCN-NEXT: v_or_b32_e32 v16, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v21 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v17, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v42 -; GCN-NEXT: v_or_b32_e32 v18, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v23 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v41 -; GCN-NEXT: v_or_b32_e32 v20, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v21, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 -; GCN-NEXT: 
v_lshlrev_b32_e32 v3, 8, v40 -; GCN-NEXT: v_or_b32_e32 v22, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v23, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v55 -; GCN-NEXT: v_or_b32_e32 v24, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v29 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v25, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_or_b32_e32 v26, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v27, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v53 -; GCN-NEXT: v_or_b32_e32 v28, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v29, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v32, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v30, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v34, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v35, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v36, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v37, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v38, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v39, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v48, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v50, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v51, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v52, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v54, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v55, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v42, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v43, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v60, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v1, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v5, v1, v32 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v63, v2, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v35 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill 
-; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v36 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v37 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v39 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v50 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v52 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v13, v13, v54 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; GCN-NEXT: 
v_or_b32_e32 v14, v14, v55 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v40 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v41 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v42 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v43 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v19, v44 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v45 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v21, v21, v46 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v47 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v23, v23, v56 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v57 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v25, v25, v58 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v59 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v27, v27, v60 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i32_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: 
killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed 
$vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; 
SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 
v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 
v53, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 8 +; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, 
v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v53, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: 
v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v41 +; SI-NEXT: v_or_b32_e32 v41, v41, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 
+; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v61 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 
v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v47 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v39 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword 
v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, 
v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 
0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 
offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v128i8: ; VI: ; %bb.0: @@ -2670,7 +4514,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -2846,9 +4690,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v32, vcc, 
3, v32 ; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 @@ -3055,7 +4899,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v48 ; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -3647,7 +5491,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 @@ -3841,9 +5685,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 @@ -4069,7 +5913,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -4517,7 +6361,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -4584,9 +6428,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 @@ -4686,7 +6530,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB6_4: ; %end +; GFX11-TRUE16-NEXT: .LBB12_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5135,7 +6979,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -5234,9 +7078,9 @@ define <128 x i8> 
@bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 @@ -5368,7 +7212,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 @@ -5710,1621 +7554,6408 @@ end: ret <128 x i8> %phi } +define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s30, 0 +; SI-NEXT: v_writelane_b32 v41, s31, 1 +; SI-NEXT: v_writelane_b32 v41, s34, 2 +; SI-NEXT: v_writelane_b32 v41, s35, 3 +; SI-NEXT: v_writelane_b32 v41, s36, 4 +; SI-NEXT: v_writelane_b32 v41, s37, 5 +; SI-NEXT: v_writelane_b32 v41, s38, 6 +; SI-NEXT: v_writelane_b32 v41, s39, 7 +; SI-NEXT: v_writelane_b32 v41, s48, 8 +; SI-NEXT: v_writelane_b32 v41, s49, 9 +; SI-NEXT: v_writelane_b32 v41, s50, 10 +; SI-NEXT: v_writelane_b32 v41, s51, 11 +; 
SI-NEXT: v_writelane_b32 v41, s52, 12 +; SI-NEXT: v_writelane_b32 v41, s53, 13 +; SI-NEXT: v_writelane_b32 v41, s54, 14 +; SI-NEXT: v_writelane_b32 v41, s55, 15 +; SI-NEXT: v_writelane_b32 v41, s64, 16 +; SI-NEXT: v_writelane_b32 v41, s65, 17 +; SI-NEXT: v_writelane_b32 v41, s66, 18 +; SI-NEXT: v_writelane_b32 v41, s67, 19 +; SI-NEXT: v_writelane_b32 v41, s68, 20 +; SI-NEXT: v_writelane_b32 v41, s69, 21 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v41, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s47, v1 +; SI-NEXT: v_readfirstlane_b32 s46, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_readfirstlane_b32 s44, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s42, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v41, s71, 23 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v22, s45 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v12, s13 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v18, s41 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 +; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 +; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 +; SI-NEXT: v_mov_b32_e32 v22, s47 +; SI-NEXT: v_mov_b32_e32 v23, s28 +; SI-NEXT: v_mov_b32_e32 v29, s26 +; SI-NEXT: v_mov_b32_e32 v35, s24 +; 
SI-NEXT: v_mov_b32_e32 v39, s22 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v53, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 +; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 +; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 +; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 +; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 +; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 +; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 +; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 +; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 +; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 +; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 +; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 +; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 +; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 +; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 +; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 +; SI-NEXT: v_alignbit_b32 v48, s21, v50, 24 +; SI-NEXT: v_alignbit_b32 v49, s21, v50, 16 +; SI-NEXT: v_alignbit_b32 v50, s21, v50, 8 +; SI-NEXT: v_alignbit_b32 v51, s19, v53, 24 +; SI-NEXT: v_alignbit_b32 v52, s19, v53, 16 +; SI-NEXT: 
v_alignbit_b32 v53, s19, v53, 8 +; SI-NEXT: v_alignbit_b32 v54, s17, v40, 24 +; SI-NEXT: v_alignbit_b32 v55, s17, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, s17, v40, 8 +; SI-NEXT: s_lshr_b32 s56, s6, 24 +; SI-NEXT: s_lshr_b32 s57, s6, 16 +; SI-NEXT: s_lshr_b32 s58, s6, 8 +; SI-NEXT: s_lshr_b32 s59, s8, 24 +; SI-NEXT: s_lshr_b32 s60, s8, 16 +; SI-NEXT: s_lshr_b32 s61, s8, 8 +; SI-NEXT: s_lshr_b32 s62, s10, 24 +; SI-NEXT: s_lshr_b32 s63, s10, 16 +; SI-NEXT: s_lshr_b32 s72, s10, 8 +; SI-NEXT: s_lshr_b32 s73, s12, 24 +; SI-NEXT: s_lshr_b32 s74, s12, 16 +; SI-NEXT: s_lshr_b32 s75, s12, 8 +; SI-NEXT: s_lshr_b32 s76, s14, 24 +; SI-NEXT: s_lshr_b32 s77, s14, 16 +; SI-NEXT: s_lshr_b32 s78, s14, 8 +; SI-NEXT: s_lshr_b32 s79, s40, 24 +; SI-NEXT: s_lshr_b32 s88, s40, 16 +; SI-NEXT: s_lshr_b32 s89, s40, 8 +; SI-NEXT: s_lshr_b32 s90, s42, 24 +; SI-NEXT: s_lshr_b32 s91, s42, 16 +; SI-NEXT: s_lshr_b32 s92, s42, 8 +; SI-NEXT: s_lshr_b32 s93, s44, 24 +; SI-NEXT: s_lshr_b32 s94, s44, 16 +; SI-NEXT: s_lshr_b32 s95, s44, 8 +; SI-NEXT: s_lshr_b32 s30, s46, 24 +; SI-NEXT: s_lshr_b32 s31, s46, 16 +; SI-NEXT: s_lshr_b32 s34, s46, 8 +; SI-NEXT: s_lshr_b32 s35, s29, 24 +; SI-NEXT: s_lshr_b32 s36, s29, 16 +; SI-NEXT: s_lshr_b32 s37, s29, 8 +; SI-NEXT: s_lshr_b32 s38, s27, 24 +; SI-NEXT: s_lshr_b32 s39, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s27, 8 +; SI-NEXT: s_lshr_b32 s49, s25, 24 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s25, 8 +; SI-NEXT: s_lshr_b32 s52, s23, 24 +; SI-NEXT: s_lshr_b32 s53, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: s_lshr_b32 s55, s21, 24 +; SI-NEXT: s_lshr_b32 s64, s21, 16 +; SI-NEXT: s_lshr_b32 s65, s21, 8 +; SI-NEXT: s_lshr_b32 s66, s19, 24 +; SI-NEXT: s_lshr_b32 s67, s19, 16 +; SI-NEXT: s_lshr_b32 s68, s19, 8 +; SI-NEXT: s_lshr_b32 s69, s17, 24 +; SI-NEXT: s_lshr_b32 s70, s17, 16 +; SI-NEXT: s_lshr_b32 s71, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 
s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_mov_b32_e32 v22, s45 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v12, s13 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v18, s41 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 +; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 +; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 +; SI-NEXT: v_mov_b32_e32 v22, s47 +; SI-NEXT: v_mov_b32_e32 v23, s28 +; SI-NEXT: v_mov_b32_e32 v29, s26 +; SI-NEXT: v_mov_b32_e32 v35, s24 +; SI-NEXT: v_mov_b32_e32 v39, s22 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v53, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 +; 
SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 +; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 +; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 +; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 +; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 +; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 +; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 +; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 +; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 +; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 +; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 +; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 +; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 +; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 +; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 +; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 +; SI-NEXT: v_alignbit_b32 v48, s21, v50, 24 +; SI-NEXT: v_alignbit_b32 v49, s21, v50, 16 +; SI-NEXT: v_alignbit_b32 v50, s21, v50, 8 +; SI-NEXT: v_alignbit_b32 v51, s19, v53, 24 +; SI-NEXT: v_alignbit_b32 v52, s19, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, s19, v53, 8 +; SI-NEXT: v_alignbit_b32 v54, s17, v40, 24 +; SI-NEXT: v_alignbit_b32 v55, s17, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, s17, v40, 8 +; SI-NEXT: s_lshr_b32 s56, s6, 24 +; SI-NEXT: s_lshr_b32 s57, s6, 16 +; SI-NEXT: s_lshr_b32 s58, s6, 8 +; SI-NEXT: s_lshr_b32 s59, s8, 24 +; SI-NEXT: s_lshr_b32 s60, s8, 16 +; SI-NEXT: s_lshr_b32 s61, s8, 8 +; SI-NEXT: s_lshr_b32 s62, s10, 24 +; SI-NEXT: s_lshr_b32 
s63, s10, 16 +; SI-NEXT: s_lshr_b32 s72, s10, 8 +; SI-NEXT: s_lshr_b32 s73, s12, 24 +; SI-NEXT: s_lshr_b32 s74, s12, 16 +; SI-NEXT: s_lshr_b32 s75, s12, 8 +; SI-NEXT: s_lshr_b32 s76, s14, 24 +; SI-NEXT: s_lshr_b32 s77, s14, 16 +; SI-NEXT: s_lshr_b32 s78, s14, 8 +; SI-NEXT: s_lshr_b32 s79, s40, 24 +; SI-NEXT: s_lshr_b32 s88, s40, 16 +; SI-NEXT: s_lshr_b32 s89, s40, 8 +; SI-NEXT: s_lshr_b32 s90, s42, 24 +; SI-NEXT: s_lshr_b32 s91, s42, 16 +; SI-NEXT: s_lshr_b32 s92, s42, 8 +; SI-NEXT: s_lshr_b32 s93, s44, 24 +; SI-NEXT: s_lshr_b32 s94, s44, 16 +; SI-NEXT: s_lshr_b32 s95, s44, 8 +; SI-NEXT: s_lshr_b32 s30, s46, 24 +; SI-NEXT: s_lshr_b32 s31, s46, 16 +; SI-NEXT: s_lshr_b32 s34, s46, 8 +; SI-NEXT: s_lshr_b32 s35, s29, 24 +; SI-NEXT: s_lshr_b32 s36, s29, 16 +; SI-NEXT: s_lshr_b32 s37, s29, 8 +; SI-NEXT: s_lshr_b32 s38, s27, 24 +; SI-NEXT: s_lshr_b32 s39, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s27, 8 +; SI-NEXT: s_lshr_b32 s49, s25, 24 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s25, 8 +; SI-NEXT: s_lshr_b32 s52, s23, 24 +; SI-NEXT: s_lshr_b32 s53, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: s_lshr_b32 s55, s21, 24 +; SI-NEXT: s_lshr_b32 s64, s21, 16 +; SI-NEXT: s_lshr_b32 s65, s21, 8 +; SI-NEXT: s_lshr_b32 s66, s19, 24 +; SI-NEXT: s_lshr_b32 s67, s19, 16 +; SI-NEXT: s_lshr_b32 s68, s19, 8 +; SI-NEXT: s_lshr_b32 s69, s17, 24 +; SI-NEXT: s_lshr_b32 s70, s17, 16 +; SI-NEXT: s_lshr_b32 s71, s17, 8 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: v_or_b32_e32 v40, s4, v40 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s71, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s70, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s69, 24 +; SI-NEXT: v_and_b32_e32 v55, 0xff, v55 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: 
v_lshlrev_b32_e32 v54, 24, v54 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v54, v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v53 +; SI-NEXT: v_or_b32_e32 v53, s4, v53 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s68, 8 +; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s67, 0xff +; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v51 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s66, 24 +; SI-NEXT: v_or_b32_e32 v54, v40, v54 +; SI-NEXT: v_and_b32_e32 v53, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v54, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v51, v53, v51 +; SI-NEXT: v_add_i32_e32 v52, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v55, v54, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v51, v52, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v52, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v50 +; SI-NEXT: v_or_b32_e32 v50, s4, v50 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s64, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v48 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s55, 24 +; SI-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v48, v48, v49 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v48, v50, v48 +; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, 
s4, s5 +; SI-NEXT: buffer_store_dword v52, v51, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v48, v49, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v49, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; SI-NEXT: v_or_b32_e32 v39, s4, v39 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: v_and_b32_e32 v38, 0xff, v38 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s53, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v37 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s52, 24 +; SI-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v48, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v37, v39, v37 +; SI-NEXT: v_add_i32_e32 v38, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v49, v48, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v37, v38, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v38, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_or_b32_e32 v35, s4, v35 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s51, 8 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s50, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s49, 24 +; SI-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v33, v35, v33 +; SI-NEXT: v_add_i32_e32 v34, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v38, v37, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 
offen +; SI-NEXT: v_mov_b32_e32 v34, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; SI-NEXT: v_or_b32_e32 v29, s4, v29 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s39, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s38, 24 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_or_b32_e32 v23, s4, v23 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s36, 0xff +; SI-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v36 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s35, 24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v22, v27, v22 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; 
SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v32 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v31 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s31, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v30 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s30, 24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v23, v27, v23 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 60, v0 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v26 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: s_and_b32 s4, s44, 0xff +; SI-NEXT: s_lshl_b32 s5, s95, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v25 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s94, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s93, 24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v21, s4, v21 +; SI-NEXT: 
s_and_b32 s4, s42, 0xff +; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s91, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s90, 24 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v18, s4, v18 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s79, 24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: 
s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s76, 24 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s74, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s73, 24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s63, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: 
v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s10, s62, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s10, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s59, 24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s56, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 
v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s71, v41, 23 +; SI-NEXT: v_readlane_b32 s70, v41, 22 +; SI-NEXT: v_readlane_b32 s69, v41, 21 +; SI-NEXT: v_readlane_b32 s68, v41, 20 +; SI-NEXT: v_readlane_b32 s67, v41, 19 +; SI-NEXT: v_readlane_b32 s66, v41, 18 +; SI-NEXT: v_readlane_b32 s65, v41, 17 +; SI-NEXT: v_readlane_b32 s64, v41, 16 +; SI-NEXT: v_readlane_b32 s55, v41, 15 +; SI-NEXT: v_readlane_b32 s54, v41, 14 +; SI-NEXT: v_readlane_b32 s53, v41, 13 +; SI-NEXT: v_readlane_b32 s52, v41, 12 +; SI-NEXT: v_readlane_b32 s51, v41, 11 +; SI-NEXT: v_readlane_b32 s50, v41, 10 +; SI-NEXT: v_readlane_b32 s49, v41, 9 +; SI-NEXT: v_readlane_b32 s48, v41, 8 +; SI-NEXT: v_readlane_b32 s39, v41, 7 +; SI-NEXT: v_readlane_b32 s38, v41, 6 +; SI-NEXT: v_readlane_b32 s37, v41, 5 +; SI-NEXT: v_readlane_b32 s36, v41, 4 +; SI-NEXT: v_readlane_b32 s35, v41, 3 +; SI-NEXT: v_readlane_b32 s34, v41, 2 +; SI-NEXT: v_readlane_b32 s31, v41, 1 +; SI-NEXT: v_readlane_b32 s30, v41, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: 
$sgpr70 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: 
; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v32i32_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s34, 2 +; VI-NEXT: v_writelane_b32 v20, s35, 3 +; VI-NEXT: v_writelane_b32 v20, s36, 4 +; VI-NEXT: v_writelane_b32 v20, s37, 5 +; VI-NEXT: v_writelane_b32 v20, s38, 6 +; VI-NEXT: v_writelane_b32 v20, s39, 7 +; VI-NEXT: 
v_writelane_b32 v20, s48, 8 +; VI-NEXT: v_writelane_b32 v20, s49, 9 +; VI-NEXT: v_writelane_b32 v20, s50, 10 +; VI-NEXT: v_writelane_b32 v20, s51, 11 +; VI-NEXT: v_writelane_b32 v20, s52, 12 +; VI-NEXT: v_writelane_b32 v20, s53, 13 +; VI-NEXT: v_writelane_b32 v20, s54, 14 +; VI-NEXT: v_writelane_b32 v20, s55, 15 +; VI-NEXT: v_writelane_b32 v20, s64, 16 +; VI-NEXT: v_writelane_b32 v20, s65, 17 +; VI-NEXT: v_writelane_b32 v20, s66, 18 +; VI-NEXT: v_writelane_b32 v20, s67, 19 +; VI-NEXT: v_writelane_b32 v20, s68, 20 +; VI-NEXT: v_writelane_b32 v20, s69, 21 +; VI-NEXT: v_writelane_b32 v20, s70, 22 +; VI-NEXT: v_writelane_b32 v20, s71, 23 +; VI-NEXT: v_writelane_b32 v20, s80, 24 +; VI-NEXT: v_writelane_b32 v20, s81, 25 +; VI-NEXT: v_writelane_b32 v20, s82, 26 +; VI-NEXT: v_writelane_b32 v20, s83, 27 +; VI-NEXT: v_writelane_b32 v20, s84, 28 +; VI-NEXT: v_writelane_b32 v20, s85, 29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v20, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s45, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s43, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 +; VI-NEXT: v_readfirstlane_b32 s4, v17 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: v_writelane_b32 v20, s87, 31 +; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: 
s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s15, 8 
+; VI-NEXT: v_writelane_b32 v21, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s40, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s43, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s43, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s42, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 58 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 59 +; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 6 +; VI-NEXT: v_writelane_b32 v21, 
s61, 7 +; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 4 +; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: s_lshr_b32 s66, s27, 8 +; VI-NEXT: s_lshr_b32 s67, s26, 16 +; VI-NEXT: s_lshr_b32 s68, s26, 8 +; VI-NEXT: s_lshr_b32 s69, s25, 24 +; VI-NEXT: s_lshr_b32 s70, s25, 16 +; VI-NEXT: s_lshr_b32 s71, s25, 8 +; VI-NEXT: s_lshr_b32 s80, s24, 16 +; VI-NEXT: s_lshr_b32 s81, s24, 8 +; VI-NEXT: s_lshr_b32 s82, s23, 24 +; VI-NEXT: s_lshr_b32 s83, s23, 16 +; VI-NEXT: s_lshr_b32 s84, s23, 8 +; VI-NEXT: s_lshr_b32 s85, s22, 16 +; VI-NEXT: s_lshr_b32 s86, s22, 8 +; VI-NEXT: s_lshr_b32 s87, s21, 24 +; VI-NEXT: s_lshr_b32 s50, s21, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 8 +; VI-NEXT: s_lshr_b32 s57, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s58, s17, 24 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s55, s17, 8 +; VI-NEXT: s_lshr_b32 s64, s16, 16 +; VI-NEXT: s_lshr_b32 s65, s16, 8 +; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; 
%cmp.true +; VI-NEXT: s_add_i32 s5, s5, 3 +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: s_add_i32 s4, s4, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: v_writelane_b32 v21, 
s46, 29 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s40, 16 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s43, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s43, 16 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s42, 16 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 51 
+; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_writelane_b32 v21, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 58 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 59 +; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 6 +; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 4 +; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b32 s66, s27, 8 +; VI-NEXT: s_lshr_b32 s67, s26, 16 +; VI-NEXT: s_lshr_b32 s68, s26, 8 +; VI-NEXT: s_lshr_b32 s69, s25, 24 +; VI-NEXT: s_lshr_b32 s70, s25, 16 +; VI-NEXT: s_lshr_b32 s71, s25, 8 +; VI-NEXT: s_lshr_b32 s80, s24, 16 +; VI-NEXT: s_lshr_b32 s81, s24, 8 +; VI-NEXT: s_lshr_b32 s82, s23, 24 +; VI-NEXT: s_lshr_b32 s83, s23, 16 +; VI-NEXT: s_lshr_b32 s84, s23, 8 +; VI-NEXT: s_lshr_b32 s85, s22, 16 +; VI-NEXT: s_lshr_b32 s86, s22, 8 +; VI-NEXT: s_lshr_b32 s87, s21, 24 +; VI-NEXT: s_lshr_b32 s50, 
s21, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 8 +; VI-NEXT: s_lshr_b32 s57, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s58, s17, 24 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s55, s17, 8 +; VI-NEXT: s_lshr_b32 s64, s16, 16 +; VI-NEXT: s_lshr_b32 s65, s16, 8 +; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_lshl_b32 s61, s65, 8 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_or_b32 s16, s16, s61 +; VI-NEXT: s_lshl_b32 s61, s48, 8 +; VI-NEXT: s_and_b32 s63, s64, 0xff +; VI-NEXT: s_or_b32 s61, s63, s61 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s61, s61, 16 +; VI-NEXT: s_or_b32 s16, s16, s61 +; VI-NEXT: v_mov_b32_e32 v1, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: s_lshl_b32 s17, s55, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s59, 0xff +; VI-NEXT: s_lshl_b32 s58, s58, 8 +; VI-NEXT: s_or_b32 s17, s17, s58 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_lshl_b32 s16, s54, 8 +; VI-NEXT: s_and_b32 s17, s18, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s38, 8 +; 
VI-NEXT: s_and_b32 s18, s53, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xff +; VI-NEXT: s_lshl_b32 s17, s52, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s51, 0xff +; VI-NEXT: s_lshl_b32 s18, s57, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: s_lshl_b32 s16, s56, 8 +; VI-NEXT: s_and_b32 s17, s20, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s36, 8 +; VI-NEXT: s_and_b32 s18, s47, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xff +; VI-NEXT: s_lshl_b32 s17, s46, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s50, 0xff +; VI-NEXT: s_lshl_b32 s18, s87, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v6, s16 +; VI-NEXT: s_lshl_b32 s16, s86, 8 +; VI-NEXT: s_and_b32 s17, s22, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s34, 8 +; VI-NEXT: s_and_b32 s18, s85, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xff +; VI-NEXT: s_lshl_b32 s17, s84, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s83, 0xff +; VI-NEXT: s_lshl_b32 s18, s82, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v8, 
s16 +; VI-NEXT: s_lshl_b32 s16, s81, 8 +; VI-NEXT: s_and_b32 s17, s24, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s30, 8 +; VI-NEXT: s_and_b32 s18, s80, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xff +; VI-NEXT: s_lshl_b32 s17, s71, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s70, 0xff +; VI-NEXT: s_lshl_b32 s18, s69, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: s_lshl_b32 s16, s68, 8 +; VI-NEXT: s_and_b32 s17, s26, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s90, 8 +; VI-NEXT: s_and_b32 s18, s67, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: s_and_b32 s16, s27, 0xff +; VI-NEXT: s_lshl_b32 s17, s66, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 59 +; VI-NEXT: v_readlane_b32 s18, v21, 58 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v12, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 57 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s28, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 56 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s88, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 55 +; VI-NEXT: v_mov_b32_e32 v13, s16 
+; VI-NEXT: s_and_b32 s16, s29, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 54 +; VI-NEXT: v_readlane_b32 s18, v21, 53 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v14, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 52 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s44, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 51 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s78, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 50 +; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: s_and_b32 s16, s45, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: v_readlane_b32 s17, v21, 49 +; VI-NEXT: v_readlane_b32 s18, v21, 48 +; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: 
s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 47 +; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s42, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 46 +; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s76, 8 +; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 45 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s43, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 44 +; VI-NEXT: v_readlane_b32 s18, v21, 43 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 42 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s40, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; 
VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 41 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s74, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 40 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s41, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 39 +; VI-NEXT: v_readlane_b32 s18, v21, 38 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x48, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 37 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 36 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s17, s72, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x4c, v0 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s14, s15, 0xff +; VI-NEXT: v_readlane_b32 s15, v21, 35 +; VI-NEXT: s_lshl_b32 s15, s15, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: v_readlane_b32 s15, v21, 34 +; VI-NEXT: v_readlane_b32 s16, v21, 33 +; VI-NEXT: s_and_b32 s15, s15, 0xff +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v0 +; 
VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_readlane_b32 s14, v21, 32 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: v_readlane_b32 s14, v21, 31 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s15, s62, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s12, s13, 0xff +; VI-NEXT: v_readlane_b32 s13, v21, 30 +; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: v_readlane_b32 s13, v21, 29 +; VI-NEXT: v_readlane_b32 s14, v21, 28 +; VI-NEXT: s_and_b32 s13, s13, 0xff +; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_or_b32 s13, s13, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v0 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: v_readlane_b32 s12, v21, 27 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: v_readlane_b32 s12, v21, 26 +; VI-NEXT: v_readlane_b32 s14, v21, 0 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s13, s14, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v0 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s10, s11, 0xff +; VI-NEXT: v_readlane_b32 s11, v21, 25 +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: v_readlane_b32 s11, v21, 24 
+; VI-NEXT: v_readlane_b32 s12, v21, 23 +; VI-NEXT: s_and_b32 s11, s11, 0xff +; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_or_b32 s11, s11, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_readlane_b32 s10, v21, 22 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: v_readlane_b32 s10, v21, 21 +; VI-NEXT: v_readlane_b32 s12, v21, 2 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v0 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s8, s9, 0xff +; VI-NEXT: v_readlane_b32 s9, v21, 20 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: v_readlane_b32 s9, v21, 19 +; VI-NEXT: v_readlane_b32 s10, v21, 18 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_readlane_b32 s8, v21, 17 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_readlane_b32 s8, v21, 16 +; VI-NEXT: v_readlane_b32 s10, v21, 4 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: 
buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_and_b32 s6, s7, 0xff +; VI-NEXT: v_readlane_b32 s7, v21, 15 +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 14 +; VI-NEXT: v_readlane_b32 s8, v21, 13 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x70, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_readlane_b32 s6, v21, 12 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_readlane_b32 s6, v21, 11 +; VI-NEXT: v_readlane_b32 s8, v21, 6 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s7, s8, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: v_readlane_b32 s5, v21, 10 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_readlane_b32 s5, v21, 9 +; VI-NEXT: v_readlane_b32 s6, v21, 8 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s15, v21, 1 +; VI-NEXT: v_readlane_b32 s13, v21, 3 +; VI-NEXT: v_readlane_b32 s11, v21, 5 +; VI-NEXT: v_readlane_b32 s9, v21, 7 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: 
v_readlane_b32 s87, v20, 31 +; VI-NEXT: v_readlane_b32 s86, v20, 30 +; VI-NEXT: v_readlane_b32 s85, v20, 29 +; VI-NEXT: v_readlane_b32 s84, v20, 28 +; VI-NEXT: v_readlane_b32 s83, v20, 27 +; VI-NEXT: v_readlane_b32 s82, v20, 26 +; VI-NEXT: v_readlane_b32 s81, v20, 25 +; VI-NEXT: v_readlane_b32 s80, v20, 24 +; VI-NEXT: v_readlane_b32 s71, v20, 23 +; VI-NEXT: v_readlane_b32 s70, v20, 22 +; VI-NEXT: v_readlane_b32 s69, v20, 21 +; VI-NEXT: v_readlane_b32 s68, v20, 20 +; VI-NEXT: v_readlane_b32 s67, v20, 19 +; VI-NEXT: v_readlane_b32 s66, v20, 18 +; VI-NEXT: v_readlane_b32 s65, v20, 17 +; VI-NEXT: v_readlane_b32 s64, v20, 16 +; VI-NEXT: v_readlane_b32 s55, v20, 15 +; VI-NEXT: v_readlane_b32 s54, v20, 14 +; VI-NEXT: v_readlane_b32 s53, v20, 13 +; VI-NEXT: v_readlane_b32 s52, v20, 12 +; VI-NEXT: v_readlane_b32 s51, v20, 11 +; VI-NEXT: v_readlane_b32 s50, v20, 10 +; VI-NEXT: v_readlane_b32 s49, v20, 9 +; VI-NEXT: v_readlane_b32 s48, v20, 8 +; VI-NEXT: v_readlane_b32 s39, v20, 7 +; VI-NEXT: v_readlane_b32 s38, v20, 6 +; VI-NEXT: v_readlane_b32 s37, v20, 5 +; VI-NEXT: v_readlane_b32 s36, v20, 4 +; VI-NEXT: v_readlane_b32 s35, v20, 3 +; VI-NEXT: v_readlane_b32 s34, v20, 2 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; 
implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; 
VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; 
kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 4 +; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 6 +; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_writelane_b32 v20, s34, 2 +; GFX9-NEXT: v_writelane_b32 v20, s35, 3 +; GFX9-NEXT: v_writelane_b32 v20, s36, 4 +; GFX9-NEXT: v_writelane_b32 v20, s37, 5 +; GFX9-NEXT: v_writelane_b32 v20, s38, 6 +; GFX9-NEXT: v_writelane_b32 v20, s39, 7 +; GFX9-NEXT: v_writelane_b32 v20, s48, 8 +; GFX9-NEXT: v_writelane_b32 
v20, s49, 9 +; GFX9-NEXT: v_writelane_b32 v20, s50, 10 +; GFX9-NEXT: v_writelane_b32 v20, s51, 11 +; GFX9-NEXT: v_writelane_b32 v20, s52, 12 +; GFX9-NEXT: v_writelane_b32 v20, s53, 13 +; GFX9-NEXT: v_writelane_b32 v20, s54, 14 +; GFX9-NEXT: v_writelane_b32 v20, s55, 15 +; GFX9-NEXT: v_writelane_b32 v20, s64, 16 +; GFX9-NEXT: v_writelane_b32 v20, s65, 17 +; GFX9-NEXT: v_writelane_b32 v20, s66, 18 +; GFX9-NEXT: v_writelane_b32 v20, s67, 19 +; GFX9-NEXT: v_writelane_b32 v20, s68, 20 +; GFX9-NEXT: v_writelane_b32 v20, s69, 21 +; GFX9-NEXT: v_writelane_b32 v20, s70, 22 +; GFX9-NEXT: v_writelane_b32 v20, s71, 23 +; GFX9-NEXT: v_writelane_b32 v20, s80, 24 +; GFX9-NEXT: v_writelane_b32 v20, s81, 25 +; GFX9-NEXT: v_writelane_b32 v20, s82, 26 +; GFX9-NEXT: v_writelane_b32 v20, s83, 27 +; GFX9-NEXT: v_writelane_b32 v20, s84, 28 +; GFX9-NEXT: v_writelane_b32 v20, s85, 29 +; GFX9-NEXT: v_writelane_b32 v20, s86, 30 +; GFX9-NEXT: v_writelane_b32 v20, s87, 31 +; GFX9-NEXT: v_writelane_b32 v20, s96, 32 +; GFX9-NEXT: v_writelane_b32 v20, s97, 33 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v20, s98, 34 +; GFX9-NEXT: v_readfirstlane_b32 s44, v1 +; GFX9-NEXT: v_readfirstlane_b32 s45, v2 +; GFX9-NEXT: v_readfirstlane_b32 s42, v3 +; GFX9-NEXT: v_readfirstlane_b32 s43, v4 +; GFX9-NEXT: v_readfirstlane_b32 s40, v5 +; GFX9-NEXT: v_readfirstlane_b32 s41, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s12, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 +; GFX9-NEXT: v_readfirstlane_b32 s4, v17 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v18 +; GFX9-NEXT: v_writelane_b32 v20, s99, 35 +; GFX9-NEXT: ; 
implicit-def: $vgpr21 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s10, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s13, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s12, 16 +; 
GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s12, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s15, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s14, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s14, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s41, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s41, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s40, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s40, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s43, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s43, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s42, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s42, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s45, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s45, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s44, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s44, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: s_lshr_b32 
s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 50 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v21, s56, 0 +; GFX9-NEXT: s_lshr_b32 s82, s28, 8 +; GFX9-NEXT: s_lshr_b32 s83, s27, 24 +; GFX9-NEXT: s_lshr_b32 s81, s27, 16 +; GFX9-NEXT: s_lshr_b32 s84, s27, 8 +; GFX9-NEXT: s_lshr_b32 s85, s26, 16 +; GFX9-NEXT: s_lshr_b32 s86, s26, 8 +; GFX9-NEXT: s_lshr_b32 s87, s25, 24 +; GFX9-NEXT: s_lshr_b32 s96, s25, 16 +; GFX9-NEXT: s_lshr_b32 s97, s25, 8 +; GFX9-NEXT: s_lshr_b32 s98, s24, 16 +; GFX9-NEXT: s_lshr_b32 s99, s24, 8 +; GFX9-NEXT: s_lshr_b32 s38, s23, 24 +; GFX9-NEXT: s_lshr_b32 s39, s23, 16 +; GFX9-NEXT: s_lshr_b32 s48, s23, 8 +; GFX9-NEXT: s_lshr_b32 s49, s22, 16 +; GFX9-NEXT: s_lshr_b32 s50, s22, 8 +; GFX9-NEXT: s_lshr_b32 s51, s21, 24 +; GFX9-NEXT: s_lshr_b32 s52, s21, 16 +; GFX9-NEXT: s_lshr_b32 s53, s21, 8 +; GFX9-NEXT: s_lshr_b32 s54, s20, 16 +; GFX9-NEXT: s_lshr_b32 s55, s20, 8 +; GFX9-NEXT: s_lshr_b32 s64, s19, 24 +; GFX9-NEXT: s_lshr_b32 s65, s19, 16 +; GFX9-NEXT: s_lshr_b32 s66, s19, 8 +; GFX9-NEXT: s_lshr_b32 s67, s18, 16 +; GFX9-NEXT: s_lshr_b32 s68, s18, 8 +; GFX9-NEXT: s_lshr_b32 s69, s17, 24 +; GFX9-NEXT: s_lshr_b32 s70, s17, 16 +; GFX9-NEXT: s_lshr_b32 s71, s17, 8 +; GFX9-NEXT: s_lshr_b32 s80, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s16, 8 +; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 
s[34:35], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s5, s5, 3 +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: s_add_i32 s4, s4, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, 
s10, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s13, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s12, 16 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s12, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s15, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s14, 16 +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s14, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s41, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s41, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s40, 16 +; GFX9-NEXT: s_add_i32 s43, s43, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s40, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s43, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: s_add_i32 s42, s42, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s43, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s42, 16 +; GFX9-NEXT: s_add_i32 s45, s45, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s42, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s45, 24 +; 
GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_add_i32 s44, s44, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s45, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s44, 16 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s44, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 50 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: v_writelane_b32 v21, s56, 0 +; GFX9-NEXT: s_lshr_b32 s82, s28, 8 +; GFX9-NEXT: s_lshr_b32 s83, s27, 24 +; GFX9-NEXT: s_lshr_b32 s81, s27, 16 +; GFX9-NEXT: s_lshr_b32 s84, s27, 8 +; GFX9-NEXT: s_lshr_b32 s85, s26, 16 +; GFX9-NEXT: s_lshr_b32 s86, s26, 8 +; GFX9-NEXT: s_lshr_b32 s87, s25, 24 +; GFX9-NEXT: s_lshr_b32 s96, s25, 16 +; GFX9-NEXT: s_lshr_b32 s97, s25, 8 +; GFX9-NEXT: s_lshr_b32 s98, s24, 16 +; GFX9-NEXT: s_lshr_b32 s99, s24, 8 +; GFX9-NEXT: s_lshr_b32 s38, s23, 24 +; GFX9-NEXT: s_lshr_b32 s39, s23, 16 +; GFX9-NEXT: s_lshr_b32 s48, s23, 8 +; GFX9-NEXT: s_lshr_b32 s49, s22, 16 +; GFX9-NEXT: s_lshr_b32 s50, s22, 8 +; GFX9-NEXT: s_lshr_b32 s51, s21, 24 +; GFX9-NEXT: s_lshr_b32 s52, s21, 16 +; 
GFX9-NEXT: s_lshr_b32 s53, s21, 8 +; GFX9-NEXT: s_lshr_b32 s54, s20, 16 +; GFX9-NEXT: s_lshr_b32 s55, s20, 8 +; GFX9-NEXT: s_lshr_b32 s64, s19, 24 +; GFX9-NEXT: s_lshr_b32 s65, s19, 16 +; GFX9-NEXT: s_lshr_b32 s66, s19, 8 +; GFX9-NEXT: s_lshr_b32 s67, s18, 16 +; GFX9-NEXT: s_lshr_b32 s68, s18, 8 +; GFX9-NEXT: s_lshr_b32 s69, s17, 24 +; GFX9-NEXT: s_lshr_b32 s70, s17, 16 +; GFX9-NEXT: s_lshr_b32 s71, s17, 8 +; GFX9-NEXT: s_lshr_b32 s80, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s16, 8 +; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_lshl_b32 s46, s46, 8 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_or_b32 s16, s16, s46 +; GFX9-NEXT: s_lshl_b32 s46, s36, 8 +; GFX9-NEXT: s_and_b32 s47, s80, 0xff +; GFX9-NEXT: s_or_b32 s46, s47, s46 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s46, s46, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s46 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s71, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s70, 0xff +; GFX9-NEXT: s_lshl_b32 s46, s69, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s46 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; 
GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: s_lshl_b32 s16, s68, 8 +; GFX9-NEXT: s_and_b32 s17, s18, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s34, 8 +; GFX9-NEXT: s_and_b32 s18, s67, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: s_and_b32 s16, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s66, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s65, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s64, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_lshl_b32 s16, s55, 8 +; GFX9-NEXT: s_and_b32 s17, s20, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s30, 8 +; GFX9-NEXT: s_and_b32 s18, s54, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-NEXT: s_and_b32 s16, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s53, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s51, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s16 +; GFX9-NEXT: s_lshl_b32 s16, s50, 8 +; GFX9-NEXT: s_and_b32 s17, s22, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s94, 8 +; GFX9-NEXT: s_and_b32 s18, s49, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s16 +; GFX9-NEXT: s_and_b32 s16, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s48, 8 
+; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s38, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: s_lshl_b32 s16, s99, 8 +; GFX9-NEXT: s_and_b32 s17, s24, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s92, 8 +; GFX9-NEXT: s_and_b32 s18, s98, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: s_and_b32 s16, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s97, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s96, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s87, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: s_lshl_b32 s16, s86, 8 +; GFX9-NEXT: s_and_b32 s17, s26, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s90, 8 +; GFX9-NEXT: s_and_b32 s18, s85, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s16 +; GFX9-NEXT: s_and_b32 s16, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s84, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s81, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s83, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: s_lshl_b32 s16, s82, 8 +; GFX9-NEXT: s_and_b32 s17, s28, 0xff +; GFX9-NEXT: v_readlane_b32 s18, v21, 50 +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 
s17, s88, 8 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 49 +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: s_and_b32 s16, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 48 +; GFX9-NEXT: v_readlane_b32 s18, v21, 47 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 46 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s44, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 45 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s78, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; 
GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 44 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s45, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 43 +; GFX9-NEXT: v_readlane_b32 s18, v21, 42 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 41 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s42, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 40 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s76, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 39 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s43, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 38 +; GFX9-NEXT: v_readlane_b32 s18, v21, 37 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 36 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s40, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, 
v21, 35 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s74, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 34 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s41, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 33 +; GFX9-NEXT: v_readlane_b32 s18, v21, 32 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_readlane_b32 s16, v21, 31 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s16, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: v_readlane_b32 s16, v21, 30 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s72, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s14, s15, 0xff +; GFX9-NEXT: v_readlane_b32 s15, v21, 29 +; GFX9-NEXT: s_lshl_b32 s15, s15, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: v_readlane_b32 s15, v21, 28 +; GFX9-NEXT: v_readlane_b32 s16, v21, 27 +; GFX9-NEXT: s_and_b32 s15, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s16, 8 +; GFX9-NEXT: s_or_b32 s15, s15, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_readlane_b32 s14, v21, 26 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s14, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: v_readlane_b32 s14, v21, 25 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s62, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s12, s13, 0xff +; GFX9-NEXT: v_readlane_b32 s13, v21, 24 +; GFX9-NEXT: s_lshl_b32 s13, s13, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: v_readlane_b32 s13, v21, 23 +; GFX9-NEXT: v_readlane_b32 s14, v21, 22 +; GFX9-NEXT: s_and_b32 s13, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s14, 8 +; GFX9-NEXT: s_or_b32 s13, s13, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_readlane_b32 s12, v21, 21 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: v_readlane_b32 s12, v21, 20 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s60, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff +; GFX9-NEXT: v_readlane_b32 s11, v21, 19 +; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: v_readlane_b32 s11, v21, 18 +; GFX9-NEXT: v_readlane_b32 s12, v21, 17 +; GFX9-NEXT: s_and_b32 s11, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_or_b32 s11, s11, 
s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_readlane_b32 s10, v21, 16 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: v_readlane_b32 s10, v21, 15 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s58, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s8, s9, 0xff +; GFX9-NEXT: v_readlane_b32 s9, v21, 14 +; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: v_readlane_b32 s9, v21, 13 +; GFX9-NEXT: v_readlane_b32 s10, v21, 12 +; GFX9-NEXT: s_and_b32 s9, s9, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 11 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s56, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: v_readlane_b32 s7, v21, 9 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_readlane_b32 s7, v21, 8 +; 
GFX9-NEXT: v_readlane_b32 s8, v21, 7 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_readlane_b32 s6, v21, 6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: v_readlane_b32 s6, v21, 5 +; GFX9-NEXT: v_readlane_b32 s8, v21, 0 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s8, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: v_readlane_b32 s5, v21, 4 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_readlane_b32 s5, v21, 3 +; GFX9-NEXT: v_readlane_b32 s6, v21, 2 +; GFX9-NEXT: s_and_b32 s5, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_readlane_b32 s9, v21, 1 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: v_readlane_b32 s99, v20, 35 +; GFX9-NEXT: v_readlane_b32 s98, v20, 34 +; GFX9-NEXT: v_readlane_b32 s97, v20, 33 +; GFX9-NEXT: v_readlane_b32 s96, v20, 32 +; GFX9-NEXT: v_readlane_b32 s87, v20, 31 +; GFX9-NEXT: v_readlane_b32 s86, v20, 30 +; GFX9-NEXT: v_readlane_b32 s85, v20, 29 +; GFX9-NEXT: v_readlane_b32 s84, v20, 28 +; GFX9-NEXT: v_readlane_b32 s83, v20, 27 +; GFX9-NEXT: v_readlane_b32 s82, v20, 26 +; GFX9-NEXT: v_readlane_b32 
s81, v20, 25 +; GFX9-NEXT: v_readlane_b32 s80, v20, 24 +; GFX9-NEXT: v_readlane_b32 s71, v20, 23 +; GFX9-NEXT: v_readlane_b32 s70, v20, 22 +; GFX9-NEXT: v_readlane_b32 s69, v20, 21 +; GFX9-NEXT: v_readlane_b32 s68, v20, 20 +; GFX9-NEXT: v_readlane_b32 s67, v20, 19 +; GFX9-NEXT: v_readlane_b32 s66, v20, 18 +; GFX9-NEXT: v_readlane_b32 s65, v20, 17 +; GFX9-NEXT: v_readlane_b32 s64, v20, 16 +; GFX9-NEXT: v_readlane_b32 s55, v20, 15 +; GFX9-NEXT: v_readlane_b32 s54, v20, 14 +; GFX9-NEXT: v_readlane_b32 s53, v20, 13 +; GFX9-NEXT: v_readlane_b32 s52, v20, 12 +; GFX9-NEXT: v_readlane_b32 s51, v20, 11 +; GFX9-NEXT: v_readlane_b32 s50, v20, 10 +; GFX9-NEXT: v_readlane_b32 s49, v20, 9 +; GFX9-NEXT: v_readlane_b32 s48, v20, 8 +; GFX9-NEXT: v_readlane_b32 s39, v20, 7 +; GFX9-NEXT: v_readlane_b32 s38, v20, 6 +; GFX9-NEXT: v_readlane_b32 s37, v20, 5 +; GFX9-NEXT: v_readlane_b32 s36, v20, 4 +; GFX9-NEXT: v_readlane_b32 s35, v20, 3 +; GFX9-NEXT: v_readlane_b32 s34, v20, 2 +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: v_writelane_b32 v21, s82, 0 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr67 
+; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: v_writelane_b32 v21, s83, 1 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: 
killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: 
$sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i32_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v16, s32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v17, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v18, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v19, s32 offset:12 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5 
+; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s102, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s68, 
20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[4:5], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 9 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s27, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s27, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 7 +; 
GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s26, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s25, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 5 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s25, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s24, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s22, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s21, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s19, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s18, 16 +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s1, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s0, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[26:27], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 11 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s57 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[4:5], 24 +; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s40, s40, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 9 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 6 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 7 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, 
s58, 4 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 5 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 2 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s27, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 
v18, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s25, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s24, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s22, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s21, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s19, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s18, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s1, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s0, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[26:27], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; 
GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s99 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s30 +; GFX11-TRUE16-NEXT: s_and_b32 s57, s57, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s56, s57, s56 +; GFX11-TRUE16-NEXT: s_lshl_b32 s47, s47, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s47 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s94 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s98 +; GFX11-TRUE16-NEXT: s_and_b32 s46, s46, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s47, s47, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s46, s46, s47 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s97 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s96 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s46, s46, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s56, s56, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s57, s57, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s87 +; GFX11-TRUE16-NEXT: s_or_b32 s56, s56, s57 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s46, s46, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s86 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_and_b32 s46, s46, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s45, s45, 8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s45, s46, s45 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, 
s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s45, s45, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s45 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s92 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s85 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s84 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s43 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s83 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s82 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s18 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s42 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s71 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s90 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s81 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s3, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v8, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s70 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s80 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s88 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s69 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s68 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s66 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s67 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s64 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s22 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s65 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s55 +; 
GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s76 +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s53 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s54 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s17, s17, s18 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s16, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s17, 16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[3:6], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s52 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s51 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s78 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s50 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s49 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s48 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: 
s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s39 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s38 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s74 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s37 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s36 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s35 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s104 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s72 +; GFX11-TRUE16-NEXT: v_readlane_b32 s17, v18, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s2, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s103 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s102 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s16, v18, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s101 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s62 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s17, v18, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s18, v18, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s19, v18, 4 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: 
v_readlane_b32 s1, v18, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s2, v18, 6 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s60 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s14, v18, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s100 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s14, s15 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s12, v18, 10 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s58 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s12, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v18, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s14, v18, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s15, v18, 13 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s15, 8 +; 
GFX11-TRUE16-NEXT: s_or_b32 s12, s12, s13 +; GFX11-TRUE16-NEXT: s_or_b32 s13, s14, s15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s12, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s12, s13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s12, v19, 2 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v18, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s2, v18, 15 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v18, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v18, 18 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s10, s11 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v18, 20 +; GFX11-TRUE16-NEXT: 
v_readlane_b32 s10, v19, 4 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v19, 5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s8, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v18, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v18, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v18, 23 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s8, s9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v19, 6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v18, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s2, v18, 25 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s6, v18, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s7, v18, 28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 
8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s6, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s4, v18, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s6, v19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_readlane_b32 s7, v19, 9 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s5, v18, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s6, v19, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s7, v19, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v19, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v19, 7 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off 
offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:112 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v17, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v17, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v17, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v17, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v17, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v17, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v17, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v17, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v17, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v16, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v16, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v16, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v16, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v16, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v16, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v16, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v16, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v16, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v16, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v16, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v16, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v16, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v16, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v16, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v16, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v16, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v16, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v16, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v16, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v16, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v16, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v16, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v16, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v16, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v16, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v16, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v16, 4 +; 
GFX11-TRUE16-NEXT: v_readlane_b32 s35, v16, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v16, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v16, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v16, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v17, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v18, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v19, off, s32 offset:12 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 
3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 11 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 18 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 25 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 
0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v16, s32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v17, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v18, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v19, s32 offset:12 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 s101, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s102, 6 +; 
GFX11-FAKE16-NEXT: v_writelane_b32 v16, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s5, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 8 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s46, s7, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s6, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s9, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s8, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s11, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s13, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s13, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s12, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s15, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s20, 16 
+; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s41, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s41, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s40, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s28, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s27, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s26, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s25, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s2, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 15 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 3 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 13 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 11 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 9 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 6 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 7 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 5 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 2 +; GFX11-FAKE16-NEXT: 
v_writelane_b32 v19, s63, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_branch .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: s_mov_b32 s101, -1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 0 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 1 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 2 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 3 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 5 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 7 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 9 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 15 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101 +; GFX11-FAKE16-NEXT: s_mov_b32 s101, s104 +; GFX11-FAKE16-NEXT: s_mov_b32 s104, s57 +; GFX11-FAKE16-NEXT: s_mov_b32 s57, s69 +; GFX11-FAKE16-NEXT: s_mov_b32 s69, s42 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_5 +; GFX11-FAKE16-NEXT: ; %bb.4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 
s[62:63], s[4:5], 24 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s41, s41, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-FAKE16-NEXT: s_add_i32 s40, s40, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s5, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 27 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s42, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s7, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s9, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s8, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s11, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s11, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 15 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s10, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s13, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 13 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s13, 16 +; GFX11-FAKE16-NEXT: 
v_writelane_b32 v18, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s13, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 11 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s12, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 9 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s15, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s14, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s41, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s41, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 7 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s40, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s28, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 5 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s27, 8 +; 
GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s25, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-FAKE16-NEXT: .LBB13_5: ; %end +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s42, s74, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s43 +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s94, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43 +; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s45, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s42 +; GFX11-FAKE16-NEXT: v_readlane_b32 s42, v18, 9 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s30, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s44, 
s44, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s44, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42 +; GFX11-FAKE16-NEXT: v_readlane_b32 s42, v18, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v18, 7 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s100, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s98, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s99, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s42, s42, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s44, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v18, 6 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v18, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s92, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xff +; GFX11-FAKE16-NEXT: 
s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v18, 0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 29 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v17, 4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v17, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v19, 28 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v19, 19 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s90, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 31 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 30 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s78, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s86, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 21 +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-FAKE16-NEXT: 
v_readlane_b32 s98, v17, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 22 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 26 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v16, 30 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 25 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v16, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v16, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v19, 18 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s62, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff 
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 20 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s16, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s17, s17, s18 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s17, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s97, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s88, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s69, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s18, s72, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v17, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v16, 21 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s73, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s96, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s76, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; 
GFX11-FAKE16-NEXT: s_and_b32 s16, s27, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s87, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s85, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s84, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 1 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s29, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s83, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s82, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s81, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v19, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s40, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s61, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s80, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s18, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s19, v19, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s41, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s60, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s18, s71, 0xff +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s19, s70, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s58, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s59, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s15, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s68, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s14, s67, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s66, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s14, s15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s14, v19, 6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s12, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s65, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s64, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v19, 7 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s12, s14 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s13, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s55, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s14, s54, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s53, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s12, s12, s13 +; 
GFX11-FAKE16-NEXT: s_or_b32 s13, s14, s15 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s12, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s12, s13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s12, v19, 8 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s10, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s52, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s51, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s11, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s50, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s49, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s48, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s10, s11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v19, 10 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s8, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s39, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s38, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v19, 11 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s37, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s36, 0xff +; GFX11-FAKE16-NEXT: 
s_lshl_b32 s11, s35, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v19, 12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s56, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s7, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s34, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_hi, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s46, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s6, s7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s6, v19, 14 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s4, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s47, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s104, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v19, 15 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s5, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s103, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s102, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s101, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; 
GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v19, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v19, 13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v17, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v17, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v17, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v17, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v17, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v16, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v16, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v16, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v16, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v16, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v16, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v16, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v16, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v16, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v16, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v16, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v16, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v16, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v16, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v16, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v16, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v16, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v16, 
12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v16, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v16, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v16, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v16, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v16, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v16, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v16, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v16, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v16, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v16, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v17, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v18, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v19, off, s32 offset:12 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 
v1, 24, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword 
v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, 
off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v1 -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:264 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v61, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:372 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v39 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v47 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v54 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, 
s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v37 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: buffer_load_dword 
v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v53 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v41 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v63 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v50 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v19, v49 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v20, v60 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v21, v58 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v22, v22, v62 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v23, v59 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v61 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: 
buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v52 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_or_b32_e32 v28, v28, v35 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v33 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v44 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v57 -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v33, v32 -; GCN-NEXT: buffer_load_dword 
v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v36, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; 
GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v38, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v48, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v55, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v54, v49 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; 
GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v50 -; GCN-NEXT: v_or_b32_e32 v19, v19, v51 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_or_b32_e32 v21, v21, v53 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_or_b32_e32 v25, v25, v35 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v37 -; GCN-NEXT: v_or_b32_e32 v28, v28, v38 -; GCN-NEXT: v_or_b32_e32 v29, v29, v39 -; GCN-NEXT: v_or_b32_e32 v30, v30, v48 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; 
GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed 
$vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; 
implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 
-; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v39, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: 
v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v47, v3 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v54, v4 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v37, v6 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v45, v8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v53, v9 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 
v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v42, v10 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v41, v11 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v40, v12 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v63, v13 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v50, v14 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v0, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v0, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v0, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v51, v18 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v49, v19 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v60, v20 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v58, v21 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v25, v62, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v29, v59, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v37, v32, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 
v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v50, v61, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v41, v34, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v45, v52, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v56, v35, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v58, v33, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v59, v44, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v57 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v57, v36, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v60, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 
0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, 
v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword 
v34, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v48, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v53, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_mov_b32_e32 v0, v55 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v55, v53 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v40, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v44, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v0, v46 -; GCN-NEXT: buffer_load_dword 
v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v0, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v60, v0 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v61, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v63, v3 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v21 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v37 -; 
GCN-NEXT: v_add_i32_e32 v50, vcc, s7, v50 -; GCN-NEXT: v_add_i32_e32 v41, vcc, s7, v41 -; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v45 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s7, v56 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v59 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x300, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 -; GCN-NEXT: v_or_b32_e32 v4, v36, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v39, v6 -; GCN-NEXT: v_or_b32_e32 v7, v49, v7 -; GCN-NEXT: v_or_b32_e32 v8, v51, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v54, v10 -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: v_or_b32_e32 v12, v23, v12 -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: v_or_b32_e32 v15, v27, 
v15 -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v31, v18 -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: v_or_b32_e32 v20, v33, v20 -; GCN-NEXT: v_or_b32_e32 v21, v34, v21 -; GCN-NEXT: v_or_b32_e32 v22, v35, v25 -; GCN-NEXT: v_or_b32_e32 v23, v48, v29 -; GCN-NEXT: v_or_b32_e32 v24, v53, v37 -; GCN-NEXT: v_or_b32_e32 v25, v55, v50 -; GCN-NEXT: v_or_b32_e32 v26, v40, v41 -; GCN-NEXT: v_or_b32_e32 v27, v42, v45 -; GCN-NEXT: v_or_b32_e32 v28, v43, v56 -; GCN-NEXT: v_or_b32_e32 v29, v44, v58 -; GCN-NEXT: v_or_b32_e32 v30, v46, v59 -; GCN-NEXT: v_or_b32_e32 v31, v47, v57 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; GCN-NEXT: 
v_add_i32_e32 v30, vcc, 0x3000000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], 
s32 offset:176 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 +; 
SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; 
%cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xff, 
v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v46 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 
0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 
4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, 
v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; 
SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; 
SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; 
kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: 
killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, 
off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, 
v38, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v46, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, 
v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; 
SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 
offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, 
v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 
+; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: 
v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 
s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v32i32: ; VI: ; %bb.0: @@ -7656,7 +14287,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -8129,9 +14760,9 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -8520,7 +15151,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -8892,7 +15523,7 @@ define <32 x i32> 
@bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -9366,9 +15997,9 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -9763,7 +16394,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -10005,15 +16636,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB7_4 -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_4 +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB7_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h @@ -10371,8 +17002,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 -; GFX11-TRUE16-NEXT: .LBB7_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 @@ -10994,7 +17625,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 @@ -11349,9 +17980,9 @@ define <32 x i32> 
@bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: .LBB7_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 @@ -11706,7 +18337,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: .LBB7_4: ; %end +; GFX11-FAKE16-NEXT: .LBB14_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 @@ -11780,1589 +18411,7616 @@ end: ret <32 x i32> %phi } -define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 
offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; 
GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed 
$vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v62 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: 
buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], 
s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; GCN-NEXT: 
v_lshlrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; 
implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 
-; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, 
off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; GCN-NEXT: v_and_b32_e32 v59, 
0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v56, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v45, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v44, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 20, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: 
v_mul_f32_e32 v4, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_alignbit_b32 v32, v32, v33, 16 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_alignbit_b32 v34, v34, v35, 16 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_alignbit_b32 v36, v36, v37, 16 -; 
GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v38, v38, v39, 16 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_alignbit_b32 v48, v48, v49, 16 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_alignbit_b32 v50, v50, v51, 16 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_alignbit_b32 v52, v52, v53, 16 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_alignbit_b32 v54, v54, v55, 16 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_alignbit_b32 v40, v40, v41, 16 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v63 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_alignbit_b32 v42, v42, v43, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v56, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, 
v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, 
s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 
24, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], 
s32 offset:92 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 +; SI-NEXT: 
buffer_load_dword v34, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
vmcnt(14) +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB15_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: 
v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 
v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: v_mov_b32_e32 v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_mov_b32_e32 v55, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_mov_b32_e32 v44, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; 
SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 +; SI-NEXT: v_mov_b32_e32 v59, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 
0xff, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 
v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB15_3 +; SI-NEXT: .LBB15_2: +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v60, off, 
s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: .LBB15_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB15_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; 
SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload 
+; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 
v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; 
SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB15_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, 
off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v32i32_to_v64bf16: +; VI-LABEL: bitcast_v128i8_to_v32i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, 
off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort 
v43, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; 
VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 
offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte 
Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 
offset:156 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, 
off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB15_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v3, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v29, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v59, v0 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded 
Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 
4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v48, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; 
VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB15_3 +; VI-NEXT: .LBB15_2: +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: buffer_load_dword v45, 
off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: v_mov_b32_e32 v51, v7 +; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB15_3: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v44, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v46, v49 +; VI-NEXT: s_cbranch_vccnz .LBB15_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 
0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 
v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: .LBB8_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa 
v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB15_5: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v32i32_to_v64bf16: +; GFX9-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 +; GFX9-NEXT: 
buffer_load_ushort v48, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; 
GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: buffer_load_ushort v1, off, 
s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 
8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; 
GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(42) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; 
GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 
; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; 
GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) 
+; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v38 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: 
v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v60 +; GFX9-NEXT: v_mov_b32_e32 v52, v56 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_mov_b32_e32 v34, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v53, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 
s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB15_3 +; GFX9-NEXT: .LBB15_2: +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v57, v38 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB15_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] 
+; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: 
v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 -; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 -; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 -; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 -; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 -; GFX9-NEXT: v_add_u32_e32 v9, 3, 
v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 -; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: .LBB8_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 
s[30:31] -; -; GFX11-LABEL: bitcast_v32i32_to_v64bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, 
v16 -; GFX11-NEXT: .LBB8_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded 
Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 
0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, 
v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: 
v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB15_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v131, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 
0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 
0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: 
s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s26, 0xff 
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 
0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0 
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, 
v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v27, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 
offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336 
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 
offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 
v56, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96 +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110 +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, 
v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, 
s0, 0xff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v2, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s8 
:: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: 
s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v55 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v54 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v52 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v90, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v92, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v93, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v88, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v74, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v75, v12 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v3, v79, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v63, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v72, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v73, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, 
v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v117, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v113, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v34, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, 
s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + cmp.true: - %a1 = add <32 x i32> %a, splat (i32 3) - %a2 = bitcast <32 x i32> %a1 to <64 x bfloat> + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <32 x i32> br label %end cmp.false: - %a3 = bitcast <32 x i32> %a to <64 x bfloat> + %a3 = bitcast <128 x i8> %a to <32 x i32> br label %end end: - %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x bfloat> %phi + %phi = phi <32 x i32> [ 
%a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi } -define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v41, off, 
s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 
v0, 1.0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte 
Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v33 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, 
v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v42 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v63 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 
1.0, v7 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; GCN-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; GCN-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; GCN-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v46 -; GCN-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v45, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v33, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v35 -; GCN-NEXT: 
v_lshrrev_b32_e32 v25, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v43 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v19, v19, v42, 16 -; GCN-NEXT: v_alignbit_b32 v20, v20, v44, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; GCN-NEXT: v_alignbit_b32 v22, v22, v48, 16 -; GCN-NEXT: v_alignbit_b32 v23, v23, v38, 16 -; GCN-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; GCN-NEXT: v_alignbit_b32 v25, v25, v51, 16 -; GCN-NEXT: v_alignbit_b32 v26, v26, v53, 16 -; GCN-NEXT: v_alignbit_b32 v27, v27, v55, 16 -; GCN-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; GCN-NEXT: v_alignbit_b32 v29, v29, v63, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: 
$vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; 
kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; kill: killed $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; GCN-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 
offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; GCN-NEXT: buffer_load_dword v17, off, 
s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: 
buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v54 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v41 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v43 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v31 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v32 -; GCN-NEXT: 
v_add_f32_e32 v32, 0x40c00000, v33 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v19, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v20, v39, v54, 16 -; GCN-NEXT: v_alignbit_b32 v21, v48, v40, 16 -; GCN-NEXT: v_alignbit_b32 v22, v49, v22, 16 -; GCN-NEXT: v_alignbit_b32 v23, v50, v23, 16 -; GCN-NEXT: v_alignbit_b32 v24, v51, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v32, v25, 16 -; GCN-NEXT: v_alignbit_b32 v26, v33, v26, 16 -; GCN-NEXT: v_alignbit_b32 v27, v34, v27, 16 -; GCN-NEXT: v_alignbit_b32 v28, v35, v28, 16 -; GCN-NEXT: v_alignbit_b32 v29, v36, v29, 16 -; GCN-NEXT: v_alignbit_b32 v30, v37, v30, 16 -; GCN-NEXT: v_alignbit_b32 v31, v38, v31, 16 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 
offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v32i32_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: 
; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; 
implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: 
v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; 
SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; 
implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: 
buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; 
SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, 
v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, 
s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 
0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 
v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v64bf16_to_v32i32: +; VI-LABEL: bitcast_v32i32_to_v64bf16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; 
VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i32_to_v64bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v64bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: 
v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: 
v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s65, 17 +; SI-NEXT: v_writelane_b32 v20, s66, 18 +; SI-NEXT: v_writelane_b32 v20, s67, 19 +; SI-NEXT: v_writelane_b32 v20, s68, 20 +; SI-NEXT: v_writelane_b32 v20, s69, 21 +; SI-NEXT: v_writelane_b32 v20, s70, 22 +; SI-NEXT: v_writelane_b32 v20, s71, 23 +; SI-NEXT: v_writelane_b32 v20, s80, 24 +; SI-NEXT: v_writelane_b32 v20, s81, 25 +; SI-NEXT: v_writelane_b32 v20, s82, 26 +; SI-NEXT: v_writelane_b32 v20, s83, 27 +; SI-NEXT: v_writelane_b32 v20, s84, 28 +; SI-NEXT: v_writelane_b32 v20, s85, 29 +; SI-NEXT: v_writelane_b32 v20, s86, 30 +; SI-NEXT: v_writelane_b32 v20, s87, 31 +; SI-NEXT: v_writelane_b32 v20, s96, 32 +; SI-NEXT: v_writelane_b32 v20, s97, 33 +; SI-NEXT: v_writelane_b32 v20, s98, 34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v20, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s70, v1 +; SI-NEXT: v_readfirstlane_b32 s71, v2 +; SI-NEXT: v_readfirstlane_b32 s80, v3 +; SI-NEXT: v_readfirstlane_b32 s81, v4 +; SI-NEXT: v_readfirstlane_b32 s82, v5 +; SI-NEXT: v_readfirstlane_b32 s83, v6 +; SI-NEXT: v_readfirstlane_b32 s84, v7 +; SI-NEXT: v_readfirstlane_b32 s85, v8 +; SI-NEXT: v_readfirstlane_b32 s86, v9 +; SI-NEXT: v_readfirstlane_b32 s87, v10 +; SI-NEXT: v_readfirstlane_b32 s96, v11 +; SI-NEXT: v_readfirstlane_b32 s97, v12 +; SI-NEXT: v_readfirstlane_b32 s98, v13 +; SI-NEXT: v_readfirstlane_b32 s99, v14 +; SI-NEXT: 
v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB17_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v21, s4, 1 +; SI-NEXT: s_lshl_b32 s4, s17, 16 +; SI-NEXT: v_writelane_b32 v21, s4, 0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; SI-NEXT: v_writelane_b32 v21, s4, 2 +; SI-NEXT: s_lshl_b32 s4, s16, 16 +; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s9, 16 +; SI-NEXT: s_and_b32 s13, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s8, 16 +; SI-NEXT: s_and_b32 s15, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s7, 16 +; SI-NEXT: s_and_b32 s41, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s6, 16 +; SI-NEXT: s_and_b32 s43, s99, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s99, 16 +; SI-NEXT: s_and_b32 s45, s98, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s98, 16 +; SI-NEXT: s_and_b32 s47, s97, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s97, 16 +; SI-NEXT: s_and_b32 s57, s96, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s96, 16 +; SI-NEXT: s_and_b32 s59, s87, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s87, 16 +; SI-NEXT: s_and_b32 s61, s86, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s86, 16 +; SI-NEXT: s_and_b32 s63, s85, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s85, 16 +; SI-NEXT: s_and_b32 s73, s84, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s84, 16 +; SI-NEXT: s_and_b32 s75, s83, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s83, 16 +; SI-NEXT: s_and_b32 s77, s82, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s82, 16 +; SI-NEXT: s_and_b32 s79, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s81, 16 +; SI-NEXT: s_and_b32 s89, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s80, 16 +; SI-NEXT: s_and_b32 s91, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s71, 16 +; SI-NEXT: s_and_b32 s93, s70, 
0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s70, 16 +; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s29, 16 +; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s28, 16 +; SI-NEXT: s_and_b32 s35, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s27, 16 +; SI-NEXT: s_and_b32 s37, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s26, 16 +; SI-NEXT: s_and_b32 s39, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s25, 16 +; SI-NEXT: s_and_b32 s49, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s48, s24, 16 +; SI-NEXT: s_and_b32 s51, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s23, 16 +; SI-NEXT: s_and_b32 s53, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s22, 16 +; SI-NEXT: s_and_b32 s55, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s21, 16 +; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s20, 16 +; SI-NEXT: s_and_b32 s67, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s19, 16 +; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s18, 16 +; SI-NEXT: v_writelane_b32 v21, s4, 3 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB17_3 +; SI-NEXT: .LBB17_2: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; 
implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: .LBB17_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s10, s12 +; SI-NEXT: s_mov_b32 s11, s13 +; SI-NEXT: s_mov_b32 s12, s14 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: s_mov_b32 s14, s40 +; SI-NEXT: s_mov_b32 s15, s41 +; SI-NEXT: s_mov_b32 s40, s42 +; SI-NEXT: s_mov_b32 s41, s43 +; SI-NEXT: s_mov_b32 
s42, s44 +; SI-NEXT: s_mov_b32 s43, s45 +; SI-NEXT: s_mov_b32 s44, s46 +; SI-NEXT: s_mov_b32 s45, s47 +; SI-NEXT: s_mov_b32 s46, s56 +; SI-NEXT: s_mov_b32 s47, s57 +; SI-NEXT: s_mov_b32 s56, s58 +; SI-NEXT: s_mov_b32 s57, s59 +; SI-NEXT: s_mov_b32 s58, s60 +; SI-NEXT: s_mov_b32 s59, s61 +; SI-NEXT: s_mov_b32 s60, s62 +; SI-NEXT: s_mov_b32 s61, s63 +; SI-NEXT: s_mov_b32 s62, s72 +; SI-NEXT: s_mov_b32 s63, s73 +; SI-NEXT: s_mov_b32 s72, s74 +; SI-NEXT: s_mov_b32 s73, s75 +; SI-NEXT: s_mov_b32 s74, s76 +; SI-NEXT: v_readlane_b32 s75, v21, 0 +; SI-NEXT: v_readlane_b32 s76, v21, 1 +; SI-NEXT: s_cbranch_vccnz .LBB17_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s70, s70, 3 +; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_add_i32 s80, s80, 3 +; SI-NEXT: s_add_i32 s81, s81, 3 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_add_i32 s83, s83, 3 +; SI-NEXT: s_add_i32 s84, s84, 3 +; SI-NEXT: s_add_i32 s85, s85, 3 +; SI-NEXT: s_add_i32 s86, s86, 3 +; SI-NEXT: s_add_i32 s87, s87, 3 +; SI-NEXT: s_add_i32 s96, s96, 3 +; SI-NEXT: s_add_i32 s97, s97, 3 +; SI-NEXT: s_add_i32 s98, s98, 3 +; SI-NEXT: s_add_i32 s99, s99, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_and_b32 s15, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s6, 16 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_and_b32 s5, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s4, s9, 16 +; SI-NEXT: s_and_b32 s11, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, 
s8, 16 +; SI-NEXT: s_and_b32 s13, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s7, 16 +; SI-NEXT: s_and_b32 s41, s99, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s99, 16 +; SI-NEXT: s_and_b32 s43, s98, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s98, 16 +; SI-NEXT: s_and_b32 s45, s97, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s97, 16 +; SI-NEXT: s_and_b32 s47, s96, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s96, 16 +; SI-NEXT: s_and_b32 s57, s87, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s87, 16 +; SI-NEXT: s_and_b32 s59, s86, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s86, 16 +; SI-NEXT: s_and_b32 s61, s85, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s85, 16 +; SI-NEXT: s_and_b32 s63, s84, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s84, 16 +; SI-NEXT: s_and_b32 s73, s83, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s83, 16 +; SI-NEXT: s_and_b32 s77, s82, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s82, 16 +; SI-NEXT: s_and_b32 s79, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s81, 16 +; SI-NEXT: s_and_b32 s89, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s80, 16 +; SI-NEXT: s_and_b32 s91, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s71, 16 +; SI-NEXT: s_and_b32 s93, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s70, 16 +; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s29, 16 +; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s28, 16 +; SI-NEXT: s_and_b32 s35, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s27, 16 +; SI-NEXT: s_and_b32 s37, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s26, 16 +; SI-NEXT: s_and_b32 s39, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s25, 16 +; SI-NEXT: s_and_b32 s49, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s48, s24, 16 +; SI-NEXT: s_and_b32 s51, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s23, 16 +; SI-NEXT: s_and_b32 s53, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s22, 16 +; SI-NEXT: s_and_b32 s55, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s21, 16 +; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s20, 16 +; SI-NEXT: 
s_and_b32 s67, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s19, 16 +; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s18, 16 +; SI-NEXT: s_and_b32 s76, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v21, s6, 2 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: v_writelane_b32 v21, s6, 3 +; SI-NEXT: .LBB17_5: ; %end +; SI-NEXT: v_readlane_b32 s6, v21, 2 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v21, 3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 
+; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s99, v20, 35 +; SI-NEXT: v_readlane_b32 s98, v20, 34 +; SI-NEXT: v_readlane_b32 s97, v20, 33 +; SI-NEXT: v_readlane_b32 s96, v20, 32 +; SI-NEXT: v_readlane_b32 s87, v20, 31 +; SI-NEXT: v_readlane_b32 s86, v20, 30 +; SI-NEXT: v_readlane_b32 s85, 
v20, 29 +; SI-NEXT: v_readlane_b32 s84, v20, 28 +; SI-NEXT: v_readlane_b32 s83, v20, 27 +; SI-NEXT: v_readlane_b32 s82, v20, 26 +; SI-NEXT: v_readlane_b32 s81, v20, 25 +; SI-NEXT: v_readlane_b32 s80, v20, 24 +; SI-NEXT: v_readlane_b32 s71, v20, 23 +; SI-NEXT: v_readlane_b32 s70, v20, 22 +; SI-NEXT: v_readlane_b32 s69, v20, 21 +; SI-NEXT: v_readlane_b32 s68, v20, 20 +; SI-NEXT: v_readlane_b32 s67, v20, 19 +; SI-NEXT: v_readlane_b32 s66, v20, 18 +; SI-NEXT: v_readlane_b32 s65, v20, 17 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: 
v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: 
v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, 
v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v32i32_to_v64bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; 
GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB17_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: +; GFX11-NEXT: .LBB17_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; 
GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { +; SI-LABEL: bitcast_v64bf16_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 
1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; 
SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed 
$vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 +; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 +; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 +; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 +; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: 
$vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; 
SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 
0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: 
v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 
offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, 
v17, v16, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 
0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -13942,7 +26600,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -13957,7 +26615,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 ; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -14442,7 +27100,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc ; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 -; GFX9-NEXT: .LBB9_2: ; %end +; 
GFX9-NEXT: .LBB18_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -14459,7 +27117,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -15005,7 +27663,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -15022,7 +27680,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 @@ -15550,7 +28208,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -15571,2422 +28229,8745 @@ end: ret <32 x i32> %phi } -define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt 
vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr48 -; 
GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v62 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v63 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 
4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; 
GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill 
-; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v48 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v49 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v50 -; GCN-NEXT: v_mov_b32_e32 v50, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: 
$vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB10_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v31, v33 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; 
GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: 
v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, 
off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: 
v_cvt_f32_f16_e32 v50, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v61 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: .LBB10_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; 
GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v59 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v58 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v2, v1 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v2, v1 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v47 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v2, v1 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v46 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v45 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, 
v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v44 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v43 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v42 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v41 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v40 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v54 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 
v15, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v53 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v51 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v49 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v48 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v34 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v31, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v36 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 
offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v32i32_to_v64f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz 
.LBB10_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: .LBB10_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v32i32_to_v64f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 -; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 -; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 -; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 -; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 -; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: .LBB10_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v32i32_to_v64f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 -; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 -; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: .LBB10_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <32 x i32> %a, splat (i32 3) - %a2 = bitcast <32 x i32> %a1 to <64 x half> - br label %end - -cmp.false: - %a3 = bitcast <32 x i32> %a to <64 x half> - br label %end - -end: - %phi = phi <64 x half> [ %a2, %cmp.true ], [ 
%a3, %cmp.false ] - ret <64 x half> %phi -} - -define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 
-; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v37 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 -; GCN-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 
offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_or_b32_e32 v19, v43, v19 -; GCN-NEXT: v_or_b32_e32 v20, v41, v20 -; GCN-NEXT: v_or_b32_e32 v21, v55, v21 -; GCN-NEXT: v_or_b32_e32 v22, v49, v22 -; GCN-NEXT: v_or_b32_e32 v23, v50, v23 -; GCN-NEXT: v_or_b32_e32 v24, v39, v24 -; GCN-NEXT: v_or_b32_e32 v25, v36, v25 -; GCN-NEXT: v_or_b32_e32 v26, v48, v26 -; GCN-NEXT: v_or_b32_e32 v27, v52, v27 -; GCN-NEXT: v_or_b32_e32 v28, v53, v28 -; GCN-NEXT: v_or_b32_e32 v29, v54, v29 -; GCN-NEXT: v_or_b32_e32 v30, v40, v30 -; GCN-NEXT: v_or_b32_e32 v31, v42, v31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: 
killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; 
GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: .LBB11_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 
-; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 
v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v41 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v55 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 
v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 
v28, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_or_b32_e32 v19, v21, v20 -; GCN-NEXT: v_or_b32_e32 v20, v55, v39 -; GCN-NEXT: v_or_b32_e32 v21, v41, v48 -; GCN-NEXT: v_or_b32_e32 v22, v22, v49 -; GCN-NEXT: v_or_b32_e32 v23, v23, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v51 -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v38 -; GCN-NEXT: v_or_b32_e32 v28, v28, v33 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_or_b32_e32 v30, v30, v35 -; GCN-NEXT: v_or_b32_e32 v31, v31, v37 -; GCN-NEXT: .LBB11_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v64f16_to_v32i32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v32, 0x200 -; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 -; VI-NEXT: v_or_b32_e32 v15, v15, v33 -; VI-NEXT: v_add_f16_sdwa v33, v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 -; VI-NEXT: v_or_b32_e32 v14, v14, v33 -; VI-NEXT: v_add_f16_sdwa v33, v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: 
v_add_f16_e32 v13, 0x200, v13 -; VI-NEXT: v_or_b32_e32 v13, v13, v33 -; VI-NEXT: v_add_f16_sdwa v33, v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 -; VI-NEXT: v_or_b32_e32 v12, v12, v33 -; VI-NEXT: v_add_f16_sdwa v33, v11, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 -; VI-NEXT: v_or_b32_e32 v11, v11, v33 -; VI-NEXT: v_add_f16_sdwa v33, v10, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 -; VI-NEXT: v_or_b32_e32 v10, v10, v33 -; VI-NEXT: v_add_f16_sdwa v33, v9, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v33 -; VI-NEXT: v_add_f16_sdwa v33, v8, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v33 -; VI-NEXT: v_add_f16_sdwa v33, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v33 -; VI-NEXT: v_add_f16_sdwa v33, v6, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v33 -; VI-NEXT: v_add_f16_sdwa v33, v5, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v33 -; VI-NEXT: v_add_f16_sdwa v33, v4, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v33 -; VI-NEXT: v_add_f16_sdwa v33, v3, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v33 -; VI-NEXT: v_add_f16_sdwa v33, v2, v32 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v33 -; VI-NEXT: v_add_f16_sdwa v33, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v33 -; VI-NEXT: v_add_f16_sdwa v33, v0, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v33 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f16_sdwa v33, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 -; VI-NEXT: v_or_b32_e32 v31, v31, v33 -; VI-NEXT: v_add_f16_sdwa v33, v30, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 -; VI-NEXT: v_or_b32_e32 v30, v30, v33 -; VI-NEXT: v_add_f16_sdwa v33, v29, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 -; VI-NEXT: v_or_b32_e32 v29, v29, v33 -; VI-NEXT: v_add_f16_sdwa v33, v28, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 -; VI-NEXT: v_or_b32_e32 v28, v28, v33 -; VI-NEXT: v_add_f16_sdwa v33, v27, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 -; VI-NEXT: v_or_b32_e32 v27, v27, v33 -; VI-NEXT: v_add_f16_sdwa v33, v26, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 -; VI-NEXT: v_or_b32_e32 v26, v26, v33 -; VI-NEXT: v_add_f16_sdwa v33, v25, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v33 -; VI-NEXT: v_add_f16_sdwa v33, v24, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 
v24, 0x200, v24 -; VI-NEXT: v_or_b32_e32 v24, v24, v33 -; VI-NEXT: v_add_f16_sdwa v33, v23, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 -; VI-NEXT: v_or_b32_e32 v23, v23, v33 -; VI-NEXT: v_add_f16_sdwa v33, v22, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 -; VI-NEXT: v_or_b32_e32 v22, v22, v33 -; VI-NEXT: v_add_f16_sdwa v33, v21, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 -; VI-NEXT: v_or_b32_e32 v21, v21, v33 -; VI-NEXT: v_add_f16_sdwa v33, v20, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 -; VI-NEXT: v_or_b32_e32 v20, v20, v33 -; VI-NEXT: v_add_f16_sdwa v33, v19, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 -; VI-NEXT: v_or_b32_e32 v19, v19, v33 -; VI-NEXT: v_add_f16_sdwa v33, v18, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 -; VI-NEXT: v_or_b32_e32 v18, v18, v33 -; VI-NEXT: v_add_f16_sdwa v33, v17, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 -; VI-NEXT: v_add_f16_sdwa v32, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 -; VI-NEXT: v_or_b32_e32 v17, v17, v33 -; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB11_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v64f16_to_v32i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, 
off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_movk_i32 s6, 0x200 -; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v28, v28, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v27, v27, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v26, v26, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v25, v25, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v24, v24, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v23, v23, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, v22, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, v21, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 
op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB11_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v64f16_to_v32i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] 
-; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB11_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <64 x half> %a, splat (half 0xH0200) - %a2 = bitcast <64 x half> %a1 to <32 x i32> - br label %end - -cmp.false: - %a3 = bitcast <64 x half> %a to <32 x i32> - br label %end - -end: - %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <32 x i32> %phi -} - -define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i32_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 
offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; 
implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; kill: killed $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; kill: killed $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v50, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v52, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v55, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v41, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 
v61, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB12_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; 
GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v50, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v52, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v55, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v41, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB12_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 
-; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; GCN-NEXT: v_or_b32_e32 v59, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v57, v1, v2 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; GCN-NEXT: v_or_b32_e32 v46, v1, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v43 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v41 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v60 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v55 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v58 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v56 -; 
GCN-NEXT: v_or_b32_e32 v14, v14, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v15, v15, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v47 -; GCN-NEXT: v_or_b32_e32 v16, v16, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v17, v17, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v45 -; GCN-NEXT: v_or_b32_e32 v18, v18, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v19, v19, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v20, v20, v34 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v38 -; GCN-NEXT: v_or_b32_e32 v21, v21, v34 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v42 -; GCN-NEXT: v_or_b32_e32 v22, v22, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v37 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v40 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v36 -; GCN-NEXT: v_or_b32_e32 v25, v25, v34 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x5c, v0 -; GCN-NEXT: 
v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v54 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v35 -; GCN-NEXT: v_or_b32_e32 v27, v27, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v53 -; GCN-NEXT: v_or_b32_e32 v28, v28, v34 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v32, v32, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 
offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 
offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, 
s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v14, v11 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: 
v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: 
v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 +; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 +; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 +; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: 
v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v11 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v12 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v14 +; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload 
+; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v53, v38 +; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: 
v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; 
SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 
0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: 
v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: 
v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; 
SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB19_2 ; -; VI-LABEL: bitcast_v32i32_to_v64i16: +; VI-LABEL: bitcast_v64bf16_to_v32i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 -; VI-NEXT: 
v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: .LBB12_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v32i32_to_v64i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 -; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 -; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 -; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 -; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 -; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; 
GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: .LBB12_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v32i32_to_v64i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: 
v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: 
v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, 
v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; 
VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: 
v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, 
v33 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 
v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; 
VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 
0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 
v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; 
VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: 
bitcast_v64bf16_to_v32i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v15, 
16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v15, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v14, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v33, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; 
GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v34, v35, vcc +; 
GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v33, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v10, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; 
GFX9-NEXT: v_lshl_or_b32 v9, v33, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v8, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v33, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 
16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v5, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v33, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 
0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v33, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v33, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: 
v_bfe_u32 v34, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v33, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v33, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v31, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v31, v33, 16, v31 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: 
v_and_b32_sdwa v30, v18, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v29, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v29, v33, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v28, v18, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v28, v33, 16, v28 
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v27, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v26, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v26, v33, 16, v26 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; 
GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v25, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v25, v33, 16, v25 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v24, v18, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v24, v33, 16, v24 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; 
GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v23, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v22, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v22, v33, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; 
GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v21, v18, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v21, v33, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v20, v33, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v19 +; 
GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v19, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v19, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v32, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v32, v33, 16, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; 
GFX9-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v17, v33, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v16 +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v46, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v108, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v170, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v183, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v182, v9 :: v_dual_mov_b32 v169, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v176, v3 :: v_dual_mov_b32 v175, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v173, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: 
s_lshl_b32 s4, s27, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 
0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, 
s19, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: 
v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v25, v27, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v32, 16, 1 
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v183 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v183 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v182 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v183, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v182, v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v177 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v177 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; 
GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v177, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v176 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 
v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v176, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 
v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v181 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 
:: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 
v172, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v181, v39, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v181 :: v_dual_mov_b32 v19, v175 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v21, v176 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 
v159, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; 
GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, 
off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v177 :: v_dual_mov_b32 v27, v182 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v183 :: v_dual_mov_b32 v31, v178 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v179 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 
offset:256 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 
offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 
offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 
s6, s26, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: 
v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: 
s_lshl_b32 s4, s23, 16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, 
s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, 
v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 
v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 
0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 
1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; 
GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 
v37, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 
+; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v94, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v32i32_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: 
; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; 
implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: 
v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: 
v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB20_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 
+; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, 
v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; 
SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v56, v28 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: .LBB20_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 
0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 
v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v64f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; 
VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i32_to_v64f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v64f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: 
v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 
v63, s35, 3 +; SI-NEXT: v_readfirstlane_b32 s47, v1 +; SI-NEXT: v_readfirstlane_b32 s46, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_readfirstlane_b32 s44, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s42, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s46, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s47, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, 
s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: 
s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s35, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s35 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_lshr_b32 s34, s6, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, s34 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s56, s18, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s20, 16 +; SI-NEXT: s_lshr_b32 s59, s21, 16 +; SI-NEXT: s_lshr_b32 s60, s22, 16 +; SI-NEXT: s_lshr_b32 s61, s23, 16 +; SI-NEXT: s_lshr_b32 s62, s24, 16 +; SI-NEXT: s_lshr_b32 s63, s25, 16 +; SI-NEXT: s_lshr_b32 s72, s26, 16 +; SI-NEXT: s_lshr_b32 s73, s27, 16 +; SI-NEXT: s_lshr_b32 s74, s28, 16 +; SI-NEXT: s_lshr_b32 s75, s29, 16 +; SI-NEXT: s_lshr_b32 s76, s47, 16 +; SI-NEXT: s_lshr_b32 s77, s46, 16 +; SI-NEXT: s_lshr_b32 s78, s45, 16 +; SI-NEXT: s_lshr_b32 s79, s44, 16 +; SI-NEXT: s_lshr_b32 
s88, s43, 16 +; SI-NEXT: s_lshr_b32 s89, s42, 16 +; SI-NEXT: s_lshr_b32 s90, s41, 16 +; SI-NEXT: s_lshr_b32 s91, s40, 16 +; SI-NEXT: s_lshr_b32 s92, s15, 16 +; SI-NEXT: s_lshr_b32 s93, s14, 16 +; SI-NEXT: s_lshr_b32 s94, s13, 16 +; SI-NEXT: s_lshr_b32 s95, s12, 16 +; SI-NEXT: s_lshr_b32 vcc_lo, s11, 16 +; SI-NEXT: s_lshr_b32 vcc_hi, s10, 16 +; SI-NEXT: s_lshr_b32 s30, s8, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, s31 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s30 +; SI-NEXT: v_cvt_f32_f16_e32 v6, vcc_hi +; SI-NEXT: v_cvt_f32_f16_e32 v9, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v11, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v13, 
s94 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v61 +; SI-NEXT: v_or_b32_e32 v2, v2, v61 +; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v57, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v47, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v45, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v45, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v43, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v41, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v55, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v53, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v51, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v48, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v38, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v38, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v36, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v36, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v34, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v32, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v30, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: 
; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 
+; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v32i32_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, 
v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 
v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; 
GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v32i32_to_v64f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: 
v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v64f16_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, 
s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, 
off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 
v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 +; SI-NEXT: s_waitcnt vmcnt(6) 
+; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 
+; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v25, v51, v25 +; SI-NEXT: v_or_b32_e32 v26, v48, v26 +; SI-NEXT: v_or_b32_e32 v27, v52, v27 +; SI-NEXT: v_or_b32_e32 v28, v39, v28 +; SI-NEXT: v_or_b32_e32 v29, v37, v29 +; SI-NEXT: v_or_b32_e32 v30, v35, v30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: 
; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded 
Reload +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 
16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v55, v24 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB22_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; 
SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 
v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], 
s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: 
v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB22_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v32, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v32 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v33, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v18, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_or_b32_e32 v18, v18, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v32, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB22_2: ; %end 
+; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64f16_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, 
s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64f16_to_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 
op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 
offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v26 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt 
expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v41, v11 +; SI-NEXT: v_mov_b32_e32 v40, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, 
v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v63 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, 
s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 
offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB23_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_or_b32_e32 v22, v51, v22 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v50, v23 +; SI-NEXT: 
v_mov_b32_e32 v50, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v49, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v26, v39, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v27, v38, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v28, v37, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_or_b32_e32 v19, v54, v19 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: 
v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v60, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_or_b32_e32 v13, v57, v13 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v40, v17 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v18, v55, v18 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB23_3 +; SI-NEXT: .LBB23_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB23_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v35, v40 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v40, v46 +; SI-NEXT: v_mov_b32_e32 v41, v56 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v43, v60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB23_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 
0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded 
Reload +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v4, 
v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 
v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: 
v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; 
SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB23_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v32i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: 
v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v18, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: v_add_f16_sdwa v33, v31, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v18 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 
0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v32, v32, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v18, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v18 +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v64f16_to_v32i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, 
s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, v31, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v64f16_to_v32i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; 
GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, 
v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 
v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v67, 
0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; 
GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: 
scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: 
$vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: 
$vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB23_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v32i32_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v41, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v43, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB24_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 
v31, vcc, 3, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v41, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v43, v10, v9, 16 +; SI-NEXT: 
v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB24_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v64i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: 
v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i32_to_v64i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: 
v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v64i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: 
v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i32_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_readfirstlane_b32 s47, v1 +; SI-NEXT: v_readfirstlane_b32 s46, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_readfirstlane_b32 s44, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s42, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 
s8, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s46, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s29, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s27, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s25, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s23, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s21, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s19, v15, 16 +; SI-NEXT: v_alignbit_b32 v16, s17, v16, 16 +; SI-NEXT: s_lshr_b32 s56, s6, 16 +; SI-NEXT: s_lshr_b32 s57, s8, 16 +; SI-NEXT: s_lshr_b32 s58, s10, 16 +; SI-NEXT: s_lshr_b32 s59, s12, 16 +; SI-NEXT: s_lshr_b32 s60, s14, 16 +; SI-NEXT: s_lshr_b32 s61, s40, 16 +; SI-NEXT: s_lshr_b32 s62, s42, 16 +; SI-NEXT: s_lshr_b32 s63, s44, 16 +; SI-NEXT: s_lshr_b32 s72, s46, 16 +; SI-NEXT: s_lshr_b32 s73, s29, 16 +; SI-NEXT: s_lshr_b32 s74, s27, 16 +; SI-NEXT: s_lshr_b32 s75, s25, 16 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s21, 16 +; SI-NEXT: s_lshr_b32 s78, s19, 16 +; SI-NEXT: 
s_lshr_b32 s79, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: 
v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s46, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s29, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s27, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s25, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s23, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s21, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s19, v15, 16 +; SI-NEXT: v_alignbit_b32 v16, s17, v16, 16 +; SI-NEXT: s_lshr_b32 s56, s6, 16 +; SI-NEXT: s_lshr_b32 s57, s8, 16 +; SI-NEXT: s_lshr_b32 s58, s10, 16 +; SI-NEXT: s_lshr_b32 s59, s12, 16 +; SI-NEXT: s_lshr_b32 s60, s14, 16 +; SI-NEXT: s_lshr_b32 s61, s40, 16 +; SI-NEXT: s_lshr_b32 s62, s42, 16 +; SI-NEXT: s_lshr_b32 s63, s44, 16 +; SI-NEXT: s_lshr_b32 s72, s46, 16 +; SI-NEXT: s_lshr_b32 s73, s29, 16 +; SI-NEXT: s_lshr_b32 s74, s27, 16 +; SI-NEXT: s_lshr_b32 s75, s25, 16 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s21, 16 +; SI-NEXT: s_lshr_b32 s78, s19, 16 +; SI-NEXT: s_lshr_b32 s79, s17, 16 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, s4, v16 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v16, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 12, v0 +; 
SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s75, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; 
SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s47, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s46, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s44, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, 
s61, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; 
SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; 
implicit-def: $sgpr56 +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v32i32_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v32i32_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 
v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v32i32_to_v64i16_scalar: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 ; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 @@ -18003,7 +36984,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
v_add_nc_u32_e32 v31, 3, v31 ; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 ; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 @@ -18020,7 +37000,2037 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 ; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: .LBB12_2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v64i16_to_v32i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword 
v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v36, 
16, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(3) +; 
SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v18, off, s[0:3], 
s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v19, 
0xffff, v43 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v53 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v19, v19, v48 +; SI-NEXT: v_or_b32_e32 v21, v21, v36 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v49 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt 
vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v62 +; SI-NEXT: v_or_b32_e32 v2, v2, v61 +; SI-NEXT: v_or_b32_e32 v3, v3, v60 +; SI-NEXT: v_or_b32_e32 v4, v4, v59 +; SI-NEXT: v_or_b32_e32 v5, v5, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v47 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v45 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_or_b32_e32 v12, v12, v42 +; SI-NEXT: v_or_b32_e32 v13, v13, v41 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded 
Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: .LBB26_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], 
s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; 
SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_or_b32_e32 v21, v36, v21 +; SI-NEXT: v_or_b32_e32 v22, v34, v22 +; SI-NEXT: v_or_b32_e32 v23, v32, v23 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt 
vmcnt(4) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v59, v4 +; SI-NEXT: v_or_b32_e32 v5, v58, v5 +; SI-NEXT: v_or_b32_e32 v6, v57, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_or_b32_e32 v18, v49, v18 +; SI-NEXT: v_or_b32_e32 v20, v38, v20 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 
0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 
+; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v37, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 +; SI-NEXT: .LBB26_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v33, 3 +; VI-NEXT: v_add_u16_e32 v32, 3, v15 +; VI-NEXT: v_add_u16_sdwa v15, v15, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v32, v15 +; VI-NEXT: v_add_u16_e32 v32, 3, v14 +; VI-NEXT: v_add_u16_sdwa v14, v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v32, v14 +; VI-NEXT: v_add_u16_e32 v32, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v32, v13 +; VI-NEXT: v_add_u16_e32 v32, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v32, v12 +; VI-NEXT: v_add_u16_e32 v32, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v32, v11 +; VI-NEXT: v_add_u16_e32 v32, 3, v10 +; VI-NEXT: v_add_u16_sdwa v10, v10, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v32, v10 +; VI-NEXT: v_add_u16_e32 v32, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v32, v9 +; VI-NEXT: v_add_u16_e32 v32, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v32, v8 +; VI-NEXT: v_add_u16_e32 v32, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v32, v7 +; VI-NEXT: v_add_u16_e32 v32, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 
v6, v32, v6 +; VI-NEXT: v_add_u16_e32 v32, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v32, v5 +; VI-NEXT: v_add_u16_e32 v32, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v32, v4 +; VI-NEXT: v_add_u16_e32 v32, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v32, v3 +; VI-NEXT: v_add_u16_e32 v32, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v32, v2 +; VI-NEXT: v_add_u16_e32 v32, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v32, v1 +; VI-NEXT: v_add_u16_e32 v32, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v32, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 3, v31 +; VI-NEXT: v_add_u16_sdwa v31, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v32, v31 +; VI-NEXT: v_add_u16_e32 v32, 3, v30 +; VI-NEXT: v_add_u16_sdwa v30, v30, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v32, v30 +; VI-NEXT: v_add_u16_e32 v32, 3, v29 +; VI-NEXT: v_add_u16_sdwa v29, v29, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v32, v29 +; VI-NEXT: v_add_u16_e32 v32, 3, v28 +; VI-NEXT: v_add_u16_sdwa v28, v28, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v32, v28 +; VI-NEXT: v_add_u16_e32 v32, 3, v27 +; VI-NEXT: v_add_u16_sdwa v27, v27, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: v_add_u16_e32 v32, 3, v26 +; VI-NEXT: v_add_u16_sdwa v26, v26, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v32, v26 +; VI-NEXT: v_add_u16_e32 v32, 3, v25 +; VI-NEXT: v_add_u16_sdwa v25, v25, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v32, v25 +; VI-NEXT: v_add_u16_e32 v32, 3, v24 +; VI-NEXT: v_add_u16_sdwa v24, v24, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v32, v24 +; VI-NEXT: v_add_u16_e32 v32, 3, v23 +; VI-NEXT: v_add_u16_sdwa v23, v23, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v32, v23 +; VI-NEXT: v_add_u16_e32 v32, 3, v22 +; VI-NEXT: v_add_u16_sdwa v22, v22, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v32, v22 +; VI-NEXT: v_add_u16_e32 v32, 3, v21 +; VI-NEXT: v_add_u16_sdwa v21, v21, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v32, v21 +; VI-NEXT: v_add_u16_e32 v32, 3, v20 +; VI-NEXT: v_add_u16_sdwa v20, v20, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v32, v20 +; VI-NEXT: v_add_u16_e32 v32, 3, v19 +; VI-NEXT: v_add_u16_sdwa v19, v19, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v32, v19 +; VI-NEXT: v_add_u16_e32 v32, 3, v18 +; VI-NEXT: v_add_u16_sdwa v18, v18, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v32, v18 +; VI-NEXT: v_add_u16_e32 v32, 3, v17 +; VI-NEXT: v_add_u16_sdwa v17, v17, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v32, v17 +; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i16_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i16_to_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; 
GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v32i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v56, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, 
off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_lshlrev_b32_e32 v34, 16, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v10, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v17, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v19, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v21, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; 
SI-NEXT: v_or_b32_e32 v22, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_or_b32_e32 v28, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_or_b32_e32 v29, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v30, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_or_b32_e32 v8, v1, v55 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: 
v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: 
v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: 
v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload 
+; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v64i16_to_v32i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: v_readfirstlane_b32 s8, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v5 +; VI-NEXT: v_readfirstlane_b32 s10, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v7 +; VI-NEXT: v_readfirstlane_b32 s12, v8 +; VI-NEXT: 
v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v11 +; VI-NEXT: v_readfirstlane_b32 s40, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v13 +; VI-NEXT: v_readfirstlane_b32 s42, v14 +; VI-NEXT: v_readfirstlane_b32 s43, v15 +; VI-NEXT: v_readfirstlane_b32 s44, v16 +; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_readfirstlane_b32 s46, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s47, v1 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s47, 3 +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s46, 3 +; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s46, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s46, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, 
s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s45, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s44, 3 +; VI-NEXT: s_add_i32 s45, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s43, 3 +; VI-NEXT: s_add_i32 s44, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 +; VI-NEXT: 
s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s42, 3 +; VI-NEXT: s_add_i32 s43, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s41, 3 +; VI-NEXT: s_add_i32 s42, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s40, 3 +; VI-NEXT: s_add_i32 s41, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s40, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 +; 
VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s46 +; VI-NEXT: v_mov_b32_e32 v15, s47 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: v_mov_b32_e32 v18, s8 +; VI-NEXT: v_mov_b32_e32 v19, s9 +; VI-NEXT: v_mov_b32_e32 v20, s10 +; VI-NEXT: v_mov_b32_e32 v21, s11 +; VI-NEXT: v_mov_b32_e32 v22, s12 +; VI-NEXT: v_mov_b32_e32 v23, s13 +; VI-NEXT: v_mov_b32_e32 v24, s14 +; VI-NEXT: v_mov_b32_e32 v25, s15 +; VI-NEXT: v_mov_b32_e32 v26, s40 +; VI-NEXT: v_mov_b32_e32 v27, s41 +; VI-NEXT: v_mov_b32_e32 v28, s42 +; VI-NEXT: v_mov_b32_e32 v29, s43 +; VI-NEXT: v_mov_b32_e32 v30, s44 +; VI-NEXT: v_mov_b32_e32 v31, s45 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v32i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 
+; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v64i16_to_v32i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 
offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 
off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, 
v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] +; 
GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB27_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 
+; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: 
scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 
:: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: 
$vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: 
$vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB27_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v32f32_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: 
v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB28_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f32_to_v16i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; 
VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f32_to_v16i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 
+; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f32_to_v16i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, 
v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB28_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -18028,743 +39038,425 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <32 x i32> %a, splat (i32 3) - %a2 = bitcast <32 x i32> %a1 to <64 x i16> + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define inreg <16 x i64> @bitcast_v32f32_to_v16i64_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 
v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: 
v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v32f32_to_v16i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz 
.LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v32f32_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: 
v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: 
v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-LABEL: bitcast_v32f32_to_v16i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB29_4 +; 
GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: .LBB29_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <16 x i64> br label %end cmp.false: - %a3 = bitcast <32 x i32> %a to <64 x i16> + %a3 = bitcast <32 x float> %a to <16 x i64> br label %end end: - %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x i16> %phi + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi } -define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i16_to_v32i32: -; GCN: ; %bb.0: -; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: 
v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 -; 
GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v24 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v36 -; GCN-NEXT: v_or_b32_e32 v1, v1, v58 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v57 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v60 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v43 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v47 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v44 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, 
v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v30, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v59 -; GCN-NEXT: ; implicit-def: $vgpr55 -; 
GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; 
implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: .LBB13_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v36, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v58, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 
-; GCN-NEXT: v_or_b32_e32 v2, v57, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v35, v3 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, 
v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v60, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v32, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v32, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v32, v16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v32, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v32, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v32, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v32, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v32, v30 -; GCN-NEXT: v_or_b32_e32 v31, v59, v31 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 
-; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 -; GCN-NEXT: .LBB13_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <32 x float> @bitcast_v16i64_to_v32f32(<16 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v16i64_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; 
SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v64i16_to_v32i32: +; VI-LABEL: bitcast_v16i64_to_v32f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v33, 3 -; VI-NEXT: v_add_u16_e32 v32, 3, v15 -; VI-NEXT: v_add_u16_sdwa v15, v15, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v32, v15 -; VI-NEXT: v_add_u16_e32 v32, 3, v14 -; VI-NEXT: v_add_u16_sdwa v14, v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v32, v14 -; VI-NEXT: v_add_u16_e32 v32, 3, v13 -; VI-NEXT: v_add_u16_sdwa v13, v13, v33 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v32, v13 -; VI-NEXT: v_add_u16_e32 v32, 3, v12 -; VI-NEXT: v_add_u16_sdwa v12, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v32, v12 -; VI-NEXT: v_add_u16_e32 v32, 3, v11 -; VI-NEXT: v_add_u16_sdwa v11, v11, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v32, v11 -; VI-NEXT: v_add_u16_e32 v32, 3, v10 -; VI-NEXT: v_add_u16_sdwa v10, v10, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v32, v10 -; VI-NEXT: v_add_u16_e32 v32, 3, v9 -; VI-NEXT: v_add_u16_sdwa v9, v9, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v32, v9 -; VI-NEXT: v_add_u16_e32 v32, 3, v8 -; VI-NEXT: v_add_u16_sdwa v8, v8, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v32, v8 -; VI-NEXT: v_add_u16_e32 v32, 3, v7 -; VI-NEXT: v_add_u16_sdwa v7, v7, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v32, v7 -; VI-NEXT: v_add_u16_e32 v32, 3, v6 -; VI-NEXT: v_add_u16_sdwa v6, v6, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v32, v6 -; VI-NEXT: v_add_u16_e32 v32, 3, v5 -; VI-NEXT: v_add_u16_sdwa v5, v5, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v32, v5 -; VI-NEXT: v_add_u16_e32 v32, 3, v4 -; VI-NEXT: v_add_u16_sdwa v4, v4, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v32, v4 -; VI-NEXT: v_add_u16_e32 v32, 3, v3 -; VI-NEXT: v_add_u16_sdwa v3, v3, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v32, v3 -; VI-NEXT: v_add_u16_e32 v32, 3, v2 -; VI-NEXT: v_add_u16_sdwa v2, 
v2, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v32, v2 -; VI-NEXT: v_add_u16_e32 v32, 3, v1 -; VI-NEXT: v_add_u16_sdwa v1, v1, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v32, v1 -; VI-NEXT: v_add_u16_e32 v32, 3, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v32, v0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v32, 3, v31 -; VI-NEXT: v_add_u16_sdwa v31, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v31, v32, v31 -; VI-NEXT: v_add_u16_e32 v32, 3, v30 -; VI-NEXT: v_add_u16_sdwa v30, v30, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v30, v32, v30 -; VI-NEXT: v_add_u16_e32 v32, 3, v29 -; VI-NEXT: v_add_u16_sdwa v29, v29, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v32, v29 -; VI-NEXT: v_add_u16_e32 v32, 3, v28 -; VI-NEXT: v_add_u16_sdwa v28, v28, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v28, v32, v28 -; VI-NEXT: v_add_u16_e32 v32, 3, v27 -; VI-NEXT: v_add_u16_sdwa v27, v27, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v27, v32, v27 -; VI-NEXT: v_add_u16_e32 v32, 3, v26 -; VI-NEXT: v_add_u16_sdwa v26, v26, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v32, v26 -; VI-NEXT: v_add_u16_e32 v32, 3, v25 -; VI-NEXT: v_add_u16_sdwa v25, v25, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v32, v25 -; VI-NEXT: v_add_u16_e32 v32, 3, v24 -; VI-NEXT: v_add_u16_sdwa v24, v24, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; 
VI-NEXT: v_or_b32_e32 v24, v32, v24 -; VI-NEXT: v_add_u16_e32 v32, 3, v23 -; VI-NEXT: v_add_u16_sdwa v23, v23, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v32, v23 -; VI-NEXT: v_add_u16_e32 v32, 3, v22 -; VI-NEXT: v_add_u16_sdwa v22, v22, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v32, v22 -; VI-NEXT: v_add_u16_e32 v32, 3, v21 -; VI-NEXT: v_add_u16_sdwa v21, v21, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v32, v21 -; VI-NEXT: v_add_u16_e32 v32, 3, v20 -; VI-NEXT: v_add_u16_sdwa v20, v20, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v32, v20 -; VI-NEXT: v_add_u16_e32 v32, 3, v19 -; VI-NEXT: v_add_u16_sdwa v19, v19, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v19, v32, v19 -; VI-NEXT: v_add_u16_e32 v32, 3, v18 -; VI-NEXT: v_add_u16_sdwa v18, v18, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v32, v18 -; VI-NEXT: v_add_u16_e32 v32, 3, v17 -; VI-NEXT: v_add_u16_sdwa v17, v17, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v32, v17 -; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v16, 3, v16 -; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 
v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB30_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v64i16_to_v32i32: +; GFX9-LABEL: bitcast_v16i64_to_v32f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 @@ -18774,47 +39466,47 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: 
v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 
v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB30_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v64i16_to_v32i32: +; GFX11-LABEL: bitcast_v16i64_to_v32f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -18826,42 +39518,50 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 
3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; 
GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB30_2: ; 
%end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -18869,71 +39569,399 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <64 x i16> %a, splat (i16 3) - %a2 = bitcast <64 x i16> %a1 to <32 x i32> + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <32 x float> br label %end cmp.false: - %a3 = bitcast <64 x i16> %a to <32 x i32> + %a3 = bitcast <16 x i64> %a to <32 x float> br label %end end: - %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <32 x i32> %phi + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi } -define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: 
v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x float> @bitcast_v16i64_to_v32f32_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: 
v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: s_branch .LBB31_2 ; -; VI-LABEL: bitcast_v32f32_to_v16i64: +; VI-LABEL: 
bitcast_v16i64_to_v32f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: 
v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v16i64_to_v32f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: 
v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; 
GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-LABEL: bitcast_v16i64_to_v32f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB31_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: .LBB31_4: ; 
%cmp.true +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v32f32_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: 
v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f32_to_v16f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 @@ -18943,7 +39971,7 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 @@ -18978,12 +40006,12 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: .LBB32_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v32f32_to_v16i64: +; GFX9-LABEL: bitcast_v32f32_to_v16f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], 
s32 offset:4 @@ -18993,7 +40021,7 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 @@ -19028,12 +40056,12 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: .LBB32_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v32f32_to_v16i64: +; GFX11-LABEL: bitcast_v32f32_to_v16f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 @@ -19045,7 +40073,7 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 @@ -19064,7 +40092,7 @@ define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB14_2: ; %end +; GFX11-NEXT: .LBB32_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -19073,309 +40101,139 @@ define <16 x i64> 
@bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { cmp.true: %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <32 x float> %a1 to <16 x i64> - br label %end - -cmp.false: - %a3 = bitcast <32 x float> %a to <16 x i64> - br label %end - -end: - %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <16 x i64> %phi -} - -define <32 x float> @bitcast_v16i64_to_v32f32(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, 
vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v16i64_to_v32f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; 
VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB15_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v16i64_to_v32f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc -; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: 
v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB15_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v16i64_to_v32f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo -; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: 
v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, 
null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB15_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <16 x i64> %a, splat (i64 3) - %a2 = bitcast <16 x i64> %a1 to <32 x float> + %a2 = bitcast <32 x float> %a1 to <16 x double> br label %end cmp.false: - %a3 = bitcast <16 x i64> %a to <32 x float> + %a3 = bitcast <32 x float> %a to <16 x double> br label %end end: - %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <32 x float> %phi + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi } -define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: 
v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x double> @bitcast_v32f32_to_v16f64_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: 
v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: s_branch .LBB33_2 ; -; VI-LABEL: bitcast_v32f32_to_v16f64: +; VI-LABEL: bitcast_v32f32_to_v16f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 ; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 ; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 @@ -19389,7 +40247,7 @@ define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: 
v_add_f32_e32 v32, 1.0, v32 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -19408,24 +40266,53 @@ define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB16_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: s_branch .LBB33_2 ; -; GFX9-LABEL: bitcast_v32f32_to_v16f64: +; GFX9-LABEL: bitcast_v32f32_to_v16f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 
s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 ; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 ; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 @@ -19439,7 +40326,7 @@ define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -19458,26 +40345,44 @@ define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB16_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: s_branch .LBB33_2 ; -; GFX11-LABEL: bitcast_v32f32_to_v16f64: +; GFX11-LABEL: bitcast_v32f32_to_v16f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt 
vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB33_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: .LBB33_4: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 ; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 ; GFX11-NEXT: 
v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 @@ -19494,9 +40399,6 @@ define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB16_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -19516,39 +40418,39 @@ end: } define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB17_2: ; %end 
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f64_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v32f32: ; VI: ; %bb.0: @@ -19560,7 +40462,7 @@ define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 @@ -19579,7 +40481,7 @@ define <32 x float> 
@bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -19594,7 +40496,7 @@ define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 @@ -19613,7 +40515,7 @@ define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -19630,7 +40532,7 @@ define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 @@ -19649,7 +40551,7 @@ define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: .LBB34_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ 
-19670,1228 +40572,1478 @@ end: ret <32 x float> %phi } +define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v33, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], 
v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: v_mov_b32_e32 v19, v33 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v16f64_to_v32f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: 
v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v16f64_to_v32f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; 
GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v33 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-LABEL: bitcast_v16f64_to_v32f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; 
GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB35_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: .LBB35_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <32 x float> + br label %end + 
+cmp.false: + %a3 = bitcast <16 x double> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v128i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 -; GCN-NEXT: 
s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: 
$vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; 
implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], 
vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 
v31, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 -; GCN-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_f32_e32 v58, 1.0, v58 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v59, 1.0, v59 -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 16 -; GCN-NEXT: buffer_store_dword v31, off, 
s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v58 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 
4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; 
GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v52 -; GCN-NEXT: v_or_b32_e32 v1, v1, v52 
-; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_or_b32_e32 v2, v2, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v50, v31 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v31 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v50 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; GCN-NEXT: v_or_b32_e32 v31, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v61 -; GCN-NEXT: v_or_b32_e32 v49, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v48 -; GCN-NEXT: v_or_b32_e32 v2, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v60 -; GCN-NEXT: v_or_b32_e32 v61, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v39 -; GCN-NEXT: v_or_b32_e32 v62, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v57 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v38 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v56 
-; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v37 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v47 -; GCN-NEXT: v_or_b32_e32 v8, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v36 -; GCN-NEXT: v_or_b32_e32 v9, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v46 -; GCN-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v35 -; GCN-NEXT: v_or_b32_e32 v11, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v45 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v34 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v44 -; GCN-NEXT: v_or_b32_e32 v14, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v33 -; GCN-NEXT: v_or_b32_e32 v15, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v43 -; GCN-NEXT: v_or_b32_e32 v16, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v21 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v17, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v42 -; GCN-NEXT: v_or_b32_e32 v18, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v23 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, 
v24 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v41 -; GCN-NEXT: v_or_b32_e32 v20, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v21, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v40 -; GCN-NEXT: v_or_b32_e32 v22, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v23, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v55 -; GCN-NEXT: v_or_b32_e32 v24, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v29 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v25, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_or_b32_e32 v26, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v27, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v53 -; GCN-NEXT: v_or_b32_e32 v28, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v29, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v32, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v30, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v34, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v35, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 
s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v36, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v37, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v38, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v39, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v48, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v50, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v51, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v52, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v54, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v55, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v42, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v43, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v60, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v1, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v5, v1, v32 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v63, v2, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: 
v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v35 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v36 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v37 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v39 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v50 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v52 -; GCN-NEXT: 
v_add_i32_e32 v36, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v13, v13, v54 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v55 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v40 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v41 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v42 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v43 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v19, v44 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v45 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v21, v21, v46 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v47 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v23, v23, v56 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v57 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v25, v25, v58 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v59 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 -; GCN-NEXT: 
v_and_b32_e32 v27, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v27, v27, v60 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword 
v17, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f32_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: 
; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; 
implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; 
implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; 
SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v53, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB36_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; 
SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; 
SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v53, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; 
SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB36_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v41 +; SI-NEXT: v_or_b32_e32 v41, v41, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: 
v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v61 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v47 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v39 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 
0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; 
SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 
offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v128i8: ; VI: ; %bb.0: @@ -21092,7 +42244,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: s_cbranch_execz .LBB36_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -21268,9 +42420,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 ; VI-NEXT: 
v_lshrrev_b32_e32 v42, 16, v32 -; VI-NEXT: .LBB18_2: ; %Flow +; VI-NEXT: .LBB36_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_4 +; VI-NEXT: s_cbranch_execz .LBB36_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 @@ -21477,7 +42629,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 -; VI-NEXT: .LBB18_4: ; %end +; VI-NEXT: .LBB36_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v48 ; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -22069,7 +43221,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 @@ -22263,9 +43415,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; GFX9-NEXT: .LBB18_2: ; %Flow +; GFX9-NEXT: .LBB36_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_4 +; GFX9-NEXT: s_cbranch_execz .LBB36_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 @@ -22491,7 +43643,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 -; GFX9-NEXT: .LBB18_4: ; %end +; GFX9-NEXT: .LBB36_4: ; 
%end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -22939,7 +44091,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -23006,9 +44158,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB18_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB36_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 1.0, v20 :: v_dual_add_f32 v19, 1.0, v19 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 1.0, v18 :: v_dual_add_f32 v17, 1.0, v17 @@ -23091,7 +44243,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB18_4: ; %end +; GFX11-TRUE16-NEXT: .LBB36_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -23540,7 +44692,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -23639,9 +44791,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] -; GFX11-FAKE16-NEXT: .LBB18_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB36_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v20, 1.0, v20 :: v_dual_add_f32 v19, 1.0, v19 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v18, 1.0, v18 :: v_dual_add_f32 v17, 1.0, v17 @@ -23756,7 +44908,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: .LBB18_4: ; %end +; GFX11-FAKE16-NEXT: .LBB36_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 @@ -24098,1621 +45250,7429 @@ end: ret <128 x i8> %phi } +define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v56, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v47, s17 +; SI-NEXT: v_mov_b32_e32 v44, s18 +; SI-NEXT: v_mov_b32_e32 v42, s19 +; SI-NEXT: v_mov_b32_e32 v40, s20 +; SI-NEXT: v_mov_b32_e32 v53, s21 +; SI-NEXT: v_mov_b32_e32 v51, s22 +; SI-NEXT: v_mov_b32_e32 v48, s23 +; SI-NEXT: v_mov_b32_e32 v38, s24 +; SI-NEXT: v_mov_b32_e32 v35, s25 +; SI-NEXT: v_mov_b32_e32 v33, s26 +; SI-NEXT: v_mov_b32_e32 v30, s27 +; SI-NEXT: v_mov_b32_e32 v28, s28 +; SI-NEXT: v_mov_b32_e32 v25, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_alignbit_b32 v19, v2, v1, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 8 +; SI-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v2 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; SI-NEXT: v_alignbit_b32 v23, v53, v40, 24 +; SI-NEXT: v_alignbit_b32 v26, v53, v40, 16 +; SI-NEXT: v_alignbit_b32 v29, v53, v40, 8 +; SI-NEXT: v_alignbit_b32 v32, v42, v44, 24 +; SI-NEXT: v_alignbit_b32 v36, v42, v44, 16 +; SI-NEXT: v_alignbit_b32 v39, v42, v44, 8 +; SI-NEXT: v_alignbit_b32 v50, v47, v56, 24 +; SI-NEXT: v_alignbit_b32 v54, v47, v56, 16 +; SI-NEXT: v_alignbit_b32 v41, v47, v56, 8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v48 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v48 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v53 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v53 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v47 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v47 +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: 
v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v14, v13, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 16 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v12, v11, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 16 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v10, v9, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v8, v7, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, 
v1 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v4, v3, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 16 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v25, v28, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 16 +; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v38, 1.0, v38 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v30, v33, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_alignbit_b32 v19, v35, v38, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 16 +; SI-NEXT: v_add_f32_e32 v48, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v51, 1.0, v51 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v35, v38, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v48, v51, 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v14 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v12 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v4 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v2 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_add_f32_e32 v47, 1.0, v47 +; SI-NEXT: v_add_f32_e32 v56, 1.0, v56 +; SI-NEXT: v_add_f32_e32 v42, 1.0, v42 +; SI-NEXT: v_add_f32_e32 v44, 1.0, v44 +; SI-NEXT: v_add_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v40, 1.0, v40 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; SI-NEXT: v_alignbit_b32 v23, v53, v40, 24 +; SI-NEXT: v_alignbit_b32 v26, v53, v40, 16 +; SI-NEXT: v_alignbit_b32 v29, v53, v40, 8 +; SI-NEXT: v_alignbit_b32 v32, v42, v44, 24 +; SI-NEXT: v_alignbit_b32 v36, v42, v44, 16 +; SI-NEXT: v_alignbit_b32 v39, v42, v44, 8 +; SI-NEXT: v_alignbit_b32 v50, v47, v56, 24 +; SI-NEXT: v_alignbit_b32 v54, v47, v56, 16 +; SI-NEXT: v_alignbit_b32 v41, v47, v56, 8 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, 
v25 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v48 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v48 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v53 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v53 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v47 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v47 +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; SI-NEXT: v_and_b32_e32 v54, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v41, v56, v41 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v50 +; SI-NEXT: v_or_b32_e32 v50, v50, v54 +; SI-NEXT: v_and_b32_e32 v54, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v50, v54, v50 +; SI-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v50, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v21, v50, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 
+; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v32 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v63 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v61 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v29 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v23 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v60 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v58 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword 
v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v57 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v45 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v38 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v19, 
v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v43 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v52 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v33 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v34 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_and_b32_e32 v19, 0xff, v28 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v31 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v24 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: 
buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 
v1, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 
0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; 
SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 
+; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 
+; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: 
; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v32f32_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: 
v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v63, s68, 20 +; VI-NEXT: v_writelane_b32 v63, s69, 21 +; VI-NEXT: v_writelane_b32 v63, s70, 22 +; VI-NEXT: v_writelane_b32 v63, s71, 23 +; VI-NEXT: v_writelane_b32 v63, s80, 24 +; VI-NEXT: v_writelane_b32 v63, s81, 25 +; VI-NEXT: v_writelane_b32 v63, s82, 26 +; VI-NEXT: v_writelane_b32 v63, s83, 27 +; VI-NEXT: v_writelane_b32 v63, s84, 28 +; VI-NEXT: v_writelane_b32 v63, s85, 29 +; VI-NEXT: v_writelane_b32 v63, s86, 30 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v63, s87, 31 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s45, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s43, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 +; VI-NEXT: v_readfirstlane_b32 s4, v17 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; 
VI-NEXT: v_writelane_b32 v62, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s40, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s43, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s43, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s42, 16 +; VI-NEXT: 
v_writelane_b32 v62, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s24, 16 +; VI-NEXT: s_lshr_b32 s80, s25, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 0 +; VI-NEXT: s_lshr_b32 s81, s24, 8 +; VI-NEXT: s_lshr_b32 s82, s23, 24 +; VI-NEXT: s_lshr_b32 s83, s23, 16 +; VI-NEXT: s_lshr_b32 s85, s23, 8 +; VI-NEXT: s_lshr_b32 s84, s22, 16 +; VI-NEXT: s_lshr_b32 s86, s22, 8 +; VI-NEXT: s_lshr_b32 s87, s21, 24 +; VI-NEXT: s_lshr_b32 s50, s21, 16 +; VI-NEXT: s_lshr_b32 s52, s21, 8 +; VI-NEXT: s_lshr_b32 s51, s20, 16 +; VI-NEXT: s_lshr_b32 s53, s20, 8 +; VI-NEXT: s_lshr_b32 s54, s19, 24 +; VI-NEXT: s_lshr_b32 s55, s19, 16 +; VI-NEXT: s_lshr_b32 
s65, s19, 8 +; VI-NEXT: s_lshr_b32 s64, s18, 16 +; VI-NEXT: s_lshr_b32 s66, s18, 8 +; VI-NEXT: s_lshr_b32 s67, s17, 24 +; VI-NEXT: s_lshr_b32 s68, s17, 16 +; VI-NEXT: s_lshr_b32 s70, s17, 8 +; VI-NEXT: s_lshr_b32 s69, s16, 16 +; VI-NEXT: s_lshr_b32 s71, s16, 8 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v2, s5, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s4, 1.0 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] +; VI-NEXT: v_add_f32_e64 v4, s7, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s6, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] +; VI-NEXT: v_add_f32_e64 v6, s9, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s8, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; VI-NEXT: v_add_f32_e64 v8, s11, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s10, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 
4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; VI-NEXT: v_add_f32_e64 v10, s13, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s12, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: v_add_f32_e64 v12, s15, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s14, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; VI-NEXT: v_add_f32_e64 v14, s41, 1.0 +; VI-NEXT: v_add_f32_e64 v13, s40, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: v_add_f32_e64 v16, s43, 1.0 +; VI-NEXT: v_add_f32_e64 v15, s42, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; VI-NEXT: v_add_f32_e64 v18, s45, 1.0 +; VI-NEXT: v_add_f32_e64 v17, s44, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; VI-NEXT: v_add_f32_e64 v20, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v19, s28, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; VI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v21, s26, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill 
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: 
v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_add_f32_e64 v28, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v27, s20, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: v_add_f32_e64 v30, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v29, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v24, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v23, s24, 1.0 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_add_f32_e64 v32, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v31, s16, 1.0 +; VI-NEXT: v_add_f32_e64 v26, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v25, s22, 1.0 +; VI-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v26 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v30 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v31 +; VI-NEXT: s_branch .LBB37_5 +; VI-NEXT: .LBB37_3: +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: 
$sgpr84 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed 
$sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; 
VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 0 +; VI-NEXT: v_mov_b32_e32 v48, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 1 +; VI-NEXT: v_mov_b32_e32 v36, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 2 +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v36, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 3 +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 4 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 5 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 6 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 7 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 8 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; 
VI-NEXT: v_readlane_b32 s4, v62, 9 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 10 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 11 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 12 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 13 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 14 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 15 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 16 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 17 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 18 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 19 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 20 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 21 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 
offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 22 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 23 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 24 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 25 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 27 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 28 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 29 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 30 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 31 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 32 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 33 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: 
v_readlane_b32 s4, v62, 34 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 35 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 36 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 37 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 38 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 40 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 41 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 42 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 43 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 44 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 45 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 46 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 
; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 47 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 48 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 49 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 50 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 51 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 52 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 53 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 54 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 55 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 56 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 57 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_mov_b32_e32 v53, s46 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s56 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s58 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s60 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s62 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s72 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s74 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s76 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s78 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s88 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s90 +; VI-NEXT: 
buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v29, s18 +; VI-NEXT: v_mov_b32_e32 v30, s19 +; VI-NEXT: v_mov_b32_e32 v27, s20 +; VI-NEXT: v_mov_b32_e32 v28, s21 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v26, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v20, s29 +; VI-NEXT: v_mov_b32_e32 v17, s44 +; VI-NEXT: v_mov_b32_e32 v18, s45 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v16, s43 +; VI-NEXT: v_mov_b32_e32 v13, s40 +; VI-NEXT: v_mov_b32_e32 v14, s41 +; VI-NEXT: v_mov_b32_e32 v11, s14 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v7, s10 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v5, s8 +; VI-NEXT: v_mov_b32_e32 v6, s9 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v35, s71 +; VI-NEXT: v_mov_b32_e32 v61, s69 +; VI-NEXT: v_mov_b32_e32 v34, s70 +; VI-NEXT: v_mov_b32_e32 v33, s68 +; VI-NEXT: v_mov_b32_e32 v60, s67 +; VI-NEXT: v_mov_b32_e32 v52, s66 +; VI-NEXT: v_mov_b32_e32 v59, s64 +; VI-NEXT: v_mov_b32_e32 v58, s65 +; VI-NEXT: v_mov_b32_e32 v57, s55 +; VI-NEXT: v_mov_b32_e32 v49, s54 +; VI-NEXT: v_mov_b32_e32 v47, s53 +; VI-NEXT: v_mov_b32_e32 v56, s51 +; VI-NEXT: v_mov_b32_e32 v38, s52 +; VI-NEXT: v_mov_b32_e32 v51, s50 +; VI-NEXT: v_mov_b32_e32 v46, s87 +; VI-NEXT: v_mov_b32_e32 v44, s86 +; VI-NEXT: v_mov_b32_e32 v45, s84 +; VI-NEXT: v_mov_b32_e32 v43, s85 +; VI-NEXT: v_mov_b32_e32 v55, s83 +; VI-NEXT: v_mov_b32_e32 v42, s82 +; VI-NEXT: v_mov_b32_e32 v37, s81 +; VI-NEXT: v_mov_b32_e32 v50, 
s80 +; VI-NEXT: v_mov_b32_e32 v53, s30 +; VI-NEXT: v_mov_b32_e32 v54, s34 +; VI-NEXT: v_mov_b32_e32 v39, s36 +; VI-NEXT: v_mov_b32_e32 v40, s38 +; VI-NEXT: v_mov_b32_e32 v41, s48 +; VI-NEXT: .LBB37_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; VI-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v41 +; VI-NEXT: v_or_b32_sdwa v31, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v61, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v60 +; VI-NEXT: v_or_b32_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v32, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v40 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v52 +; VI-NEXT: v_or_b32_sdwa v31, v59, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v31, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v58 +; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v49 +; VI-NEXT: v_or_b32_sdwa v30, v57, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v30, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v47 +; VI-NEXT: v_or_b32_sdwa v29, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v29, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v38 +; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v46 +; VI-NEXT: v_or_b32_sdwa v28, v51, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v28, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v54 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v44 +; VI-NEXT: v_or_b32_sdwa v27, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v27, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v43 +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v42 +; VI-NEXT: v_or_b32_sdwa v26, v55, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v26, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v53 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v37 +; VI-NEXT: v_or_b32_sdwa v25, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v25, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v50 +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v36 +; VI-NEXT: v_readlane_b32 s87, v63, 31 +; VI-NEXT: v_readlane_b32 s86, v63, 30 +; VI-NEXT: v_readlane_b32 s85, v63, 29 +; VI-NEXT: v_readlane_b32 s84, v63, 28 +; VI-NEXT: v_readlane_b32 s83, v63, 27 +; VI-NEXT: v_readlane_b32 s82, v63, 26 +; VI-NEXT: v_readlane_b32 s81, v63, 25 +; VI-NEXT: v_readlane_b32 s80, v63, 24 +; VI-NEXT: v_readlane_b32 s71, v63, 23 +; VI-NEXT: v_readlane_b32 s70, v63, 22 +; VI-NEXT: v_readlane_b32 s69, v63, 21 +; VI-NEXT: v_readlane_b32 s68, v63, 20 +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 
7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; VI-NEXT: v_or_b32_sdwa v21, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v23, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded 
Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v22, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v21, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v20, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_or_b32_sdwa v17, v17, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v19, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 
; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; 
VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded 
Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 
offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v61, 
off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f32_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 
3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_writelane_b32 v63, s64, 16 +; GFX9-NEXT: v_writelane_b32 v63, s65, 17 +; GFX9-NEXT: v_writelane_b32 v63, s66, 18 +; GFX9-NEXT: v_writelane_b32 v63, s67, 19 +; GFX9-NEXT: v_writelane_b32 v63, s68, 20 +; GFX9-NEXT: v_writelane_b32 v63, s69, 21 +; GFX9-NEXT: v_writelane_b32 v63, s70, 22 +; GFX9-NEXT: v_writelane_b32 v63, s71, 23 +; GFX9-NEXT: v_writelane_b32 v63, s80, 24 +; GFX9-NEXT: v_writelane_b32 v63, s81, 25 +; GFX9-NEXT: v_writelane_b32 v63, s82, 26 +; GFX9-NEXT: v_writelane_b32 v63, s83, 27 +; GFX9-NEXT: v_writelane_b32 v63, s84, 28 +; GFX9-NEXT: v_writelane_b32 v63, s85, 29 +; GFX9-NEXT: v_writelane_b32 v63, s86, 30 +; GFX9-NEXT: v_writelane_b32 v63, s87, 31 +; GFX9-NEXT: v_writelane_b32 v63, s96, 32 +; GFX9-NEXT: v_writelane_b32 v63, s97, 33 +; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v63, s99, 35 +; GFX9-NEXT: v_readfirstlane_b32 s44, v1 +; GFX9-NEXT: v_readfirstlane_b32 s45, v2 +; GFX9-NEXT: v_readfirstlane_b32 s42, v3 +; GFX9-NEXT: v_readfirstlane_b32 s43, v4 +; GFX9-NEXT: v_readfirstlane_b32 s40, v5 +; GFX9-NEXT: v_readfirstlane_b32 s41, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s12, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_readfirstlane_b32 
s9, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 +; GFX9-NEXT: v_readfirstlane_b32 s4, v17 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 
+; GFX9-NEXT: v_writelane_b32 v62, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s11, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s10, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s13, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s12, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s12, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s15, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s14, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s14, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, 
s41, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s41, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s40, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s40, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s43, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s43, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s42, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s42, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s45, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s45, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s44, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s44, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 1 +; GFX9-NEXT: s_lshr_b32 s46, s28, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 0 +; GFX9-NEXT: s_lshr_b32 s82, s27, 24 +; GFX9-NEXT: s_lshr_b32 s83, s27, 16 +; GFX9-NEXT: s_lshr_b32 s85, s27, 8 +; GFX9-NEXT: s_lshr_b32 s84, s26, 16 +; GFX9-NEXT: s_lshr_b32 s86, s26, 8 +; GFX9-NEXT: s_lshr_b32 s87, s25, 24 +; GFX9-NEXT: s_lshr_b32 s96, s25, 16 +; GFX9-NEXT: s_lshr_b32 s98, s25, 8 +; GFX9-NEXT: s_lshr_b32 s97, s24, 16 +; GFX9-NEXT: s_lshr_b32 s99, s24, 8 +; GFX9-NEXT: s_lshr_b32 s38, s23, 24 +; GFX9-NEXT: 
s_lshr_b32 s39, s23, 16 +; GFX9-NEXT: s_lshr_b32 s49, s23, 8 +; GFX9-NEXT: s_lshr_b32 s48, s22, 16 +; GFX9-NEXT: s_lshr_b32 s50, s22, 8 +; GFX9-NEXT: s_lshr_b32 s51, s21, 24 +; GFX9-NEXT: s_lshr_b32 s52, s21, 16 +; GFX9-NEXT: s_lshr_b32 s54, s21, 8 +; GFX9-NEXT: s_lshr_b32 s53, s20, 16 +; GFX9-NEXT: s_lshr_b32 s55, s20, 8 +; GFX9-NEXT: s_lshr_b32 s64, s19, 24 +; GFX9-NEXT: s_lshr_b32 s65, s19, 16 +; GFX9-NEXT: s_lshr_b32 s67, s19, 8 +; GFX9-NEXT: s_lshr_b32 s66, s18, 16 +; GFX9-NEXT: s_lshr_b32 s68, s18, 8 +; GFX9-NEXT: s_lshr_b32 s69, s17, 24 +; GFX9-NEXT: s_lshr_b32 s70, s17, 16 +; GFX9-NEXT: s_lshr_b32 s80, s17, 8 +; GFX9-NEXT: s_lshr_b32 s71, s16, 16 +; GFX9-NEXT: s_lshr_b32 s81, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v2, s5, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s4, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_add_f32_e64 v4, s7, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s6, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; 
GFX9-NEXT: v_add_f32_e64 v6, s9, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s8, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: v_add_f32_e64 v8, s11, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s10, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX9-NEXT: v_add_f32_e64 v10, s13, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s12, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_add_f32_e64 v12, s15, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s14, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] +; GFX9-NEXT: v_add_f32_e64 v14, s41, 1.0 +; GFX9-NEXT: v_add_f32_e64 v13, s40, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] +; GFX9-NEXT: v_add_f32_e64 v23, s43, 1.0 +; GFX9-NEXT: v_add_f32_e64 v22, s42, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[22:23] +; GFX9-NEXT: 
v_add_f32_e64 v25, s45, 1.0 +; GFX9-NEXT: v_add_f32_e64 v24, s44, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[24:25] +; GFX9-NEXT: v_add_f32_e64 v27, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v26, s28, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[26:27] +; GFX9-NEXT: v_add_f32_e64 v29, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v28, s26, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[28:29] +; GFX9-NEXT: v_add_f32_e64 v31, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v30, s24, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[30:31] +; GFX9-NEXT: v_add_f32_e64 v33, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v32, s22, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[32:33] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b32_e32 v15, 16, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 
16, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; 
GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v23 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v25 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v27 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v29 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; 
GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v31 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; GFX9-NEXT: v_add_f32_e64 v35, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v34, s20, 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v33 +; GFX9-NEXT: v_add_f32_e64 v37, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v36, s18, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[34:35] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v33 +; GFX9-NEXT: v_add_f32_e64 v39, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v38, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[36:37] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[38:39] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; 
GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v38 +; GFX9-NEXT: s_branch .LBB37_5 +; GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 
+; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; 
GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: 
; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v52, s48 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s39 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s38 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s97 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s96 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s87 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s84 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s83 +; GFX9-NEXT: v_readlane_b32 s4, v62, 0 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s82 +; GFX9-NEXT: v_mov_b32_e32 v46, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 1 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 2 +; GFX9-NEXT: v_mov_b32_e32 v45, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 3 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 4 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 5 +; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: 
v_readlane_b32 s4, v62, 6 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 7 +; GFX9-NEXT: v_mov_b32_e32 v43, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 8 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 10 +; GFX9-NEXT: v_mov_b32_e32 v55, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 11 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 12 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 13 +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 15 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 17 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; 
GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 23 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 31 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 32 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 33 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 34 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 35 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 36 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 37 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 38 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 39 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 41 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 42 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 43 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 
s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 44 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 45 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 46 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 47 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 48 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 49 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_mov_b32_e32 v49, s52 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s46 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s58 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 
offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s62 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s72 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s74 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s76 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s78 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s88 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s90 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s92 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 
4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v38, s16 +; GFX9-NEXT: v_mov_b32_e32 v39, s17 +; GFX9-NEXT: v_mov_b32_e32 v36, s18 +; GFX9-NEXT: v_mov_b32_e32 v37, s19 +; GFX9-NEXT: v_mov_b32_e32 v34, s20 +; GFX9-NEXT: v_mov_b32_e32 v35, s21 +; GFX9-NEXT: v_mov_b32_e32 v32, s22 +; GFX9-NEXT: v_mov_b32_e32 v33, s23 +; GFX9-NEXT: v_mov_b32_e32 v30, s24 +; GFX9-NEXT: v_mov_b32_e32 v31, s25 +; GFX9-NEXT: v_mov_b32_e32 v28, s26 +; GFX9-NEXT: v_mov_b32_e32 v29, s27 +; GFX9-NEXT: v_mov_b32_e32 v26, s28 +; GFX9-NEXT: v_mov_b32_e32 v27, s29 +; GFX9-NEXT: v_mov_b32_e32 v24, s44 +; GFX9-NEXT: v_mov_b32_e32 v25, s45 +; GFX9-NEXT: v_mov_b32_e32 v22, s42 +; GFX9-NEXT: v_mov_b32_e32 v23, s43 +; GFX9-NEXT: v_mov_b32_e32 v13, s40 +; GFX9-NEXT: v_mov_b32_e32 v14, s41 +; GFX9-NEXT: v_mov_b32_e32 v11, s14 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v48, s81 +; GFX9-NEXT: v_mov_b32_e32 v21, s71 +; GFX9-NEXT: v_mov_b32_e32 v16, s80 +; GFX9-NEXT: v_mov_b32_e32 v19, s70 +; GFX9-NEXT: v_mov_b32_e32 v20, s69 +; GFX9-NEXT: v_mov_b32_e32 v15, s68 +; GFX9-NEXT: v_mov_b32_e32 v18, s66 +; GFX9-NEXT: v_mov_b32_e32 v61, s67 +; GFX9-NEXT: v_mov_b32_e32 v51, s65 +; GFX9-NEXT: v_mov_b32_e32 v17, s64 +; GFX9-NEXT: v_mov_b32_e32 v54, s55 +; GFX9-NEXT: v_mov_b32_e32 v50, s53 +; GFX9-NEXT: v_mov_b32_e32 v60, s54 +; GFX9-NEXT: v_mov_b32_e32 v49, s51 +; GFX9-NEXT: v_mov_b32_e32 v59, s50 +; GFX9-NEXT: v_mov_b32_e32 v58, s49 +; GFX9-NEXT: 
v_mov_b32_e32 v57, s99 +; GFX9-NEXT: v_mov_b32_e32 v53, s98 +; GFX9-NEXT: v_mov_b32_e32 v56, s86 +; GFX9-NEXT: v_mov_b32_e32 v47, s85 +; GFX9-NEXT: v_mov_b32_e32 v40, s30 +; GFX9-NEXT: v_mov_b32_e32 v41, s34 +; GFX9-NEXT: v_mov_b32_e32 v42, s36 +; GFX9-NEXT: .LBB37_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v34, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v35, v35, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v33, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v57 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v53 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v16, v39, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, 
v47 +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v46 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v43 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v15, v51, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v15, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v42 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v40 
+; GFX9-NEXT: v_or_b32_sdwa v38, v38, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v50, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v34, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v49 +; GFX9-NEXT: v_readlane_b32 s99, v63, 35 +; GFX9-NEXT: v_readlane_b32 s98, v63, 34 +; GFX9-NEXT: v_readlane_b32 s97, v63, 33 +; GFX9-NEXT: v_readlane_b32 s96, v63, 32 +; GFX9-NEXT: v_readlane_b32 s87, v63, 31 +; GFX9-NEXT: v_readlane_b32 s86, v63, 30 +; GFX9-NEXT: v_readlane_b32 s85, v63, 29 +; GFX9-NEXT: v_readlane_b32 s84, v63, 28 +; GFX9-NEXT: v_readlane_b32 s83, v63, 27 +; GFX9-NEXT: v_readlane_b32 s82, v63, 26 +; GFX9-NEXT: v_readlane_b32 s81, v63, 25 +; GFX9-NEXT: v_readlane_b32 s80, v63, 24 +; GFX9-NEXT: v_readlane_b32 s71, v63, 23 +; GFX9-NEXT: v_readlane_b32 s70, v63, 22 +; GFX9-NEXT: v_readlane_b32 s69, v63, 21 +; GFX9-NEXT: v_readlane_b32 s68, v63, 20 +; GFX9-NEXT: v_readlane_b32 s67, v63, 19 +; GFX9-NEXT: v_readlane_b32 s66, v63, 18 +; GFX9-NEXT: v_readlane_b32 s65, v63, 17 +; GFX9-NEXT: v_readlane_b32 s64, v63, 16 +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: 
v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload 
+; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v30, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v28, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_load_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], 
s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 
+; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: 
buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v32f32_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s37, 5 +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v75, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_clause 0x11 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s102, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s52, 12 +; GFX11-TRUE16-NEXT: 
v_writelane_b32 v74, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s25, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 
s55, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s22, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s19, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s17, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s17, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s16, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s3, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s1, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 
s104, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s0, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[14:15], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-TRUE16-NEXT: .LBB37_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, s25, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v30, s24, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s23, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s22, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s21, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s20, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v37, s19, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v36, s18, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v39, s17, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v38, s16, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, s5, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, s4, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v51, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v50, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, s15, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, s14, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, s9, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, s8, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[30:31] +; GFX11-TRUE16-NEXT: v_add_f32_e64 v55, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v54, s0, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, s27, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, s26, 1.0 +; 
GFX11-TRUE16-NEXT: v_add_f32_e64 v24, s29, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v23, s28, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v22, s41, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v21, s40, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s7, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s11, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, s13, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, s12, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s10, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s6, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[36:37] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[38:39] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[28:29] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[54:55] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v10 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 24, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 16, v30 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v163, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 24, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 8, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 24, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 16, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 16, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 16, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v54 +; GFX11-TRUE16-NEXT: s_branch .LBB37_5 +; GFX11-TRUE16-NEXT: .LBB37_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 11 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: 
v_writelane_b32 v76, s43, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 18 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 25 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 1 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 8 +; GFX11-TRUE16-NEXT: s_branch .LBB37_2 +; GFX11-TRUE16-NEXT: .LBB37_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 4 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v24.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, s104 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, s103 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, s102 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, s101 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, s100 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, s99 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, s98 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, s97 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, s96 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, s87 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, s86 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, 
s85 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, s84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, s83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, s82 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, s81 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, s80 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.l, s71 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.l, s70 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 13 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.l, s69 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.l, s68 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.l, s67 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.l, s66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 14 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.l, s65 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.l, s64 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.l, s55 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.l, s54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 15 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.l, s53 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.l, s52 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.l, s51 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 16 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 17 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 18 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v25.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s42 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 28 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v86.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s0 +; GFX11-TRUE16-NEXT: .LBB37_5: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v55, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v61, 8, v61 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v54, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xff, v72 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v60, 8, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v59, 0xff, v59 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, v71, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v50, v60 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v59, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v55, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xff, v62 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v57, 0xff, v57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v56, 8, v56 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v54, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, v71, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v46, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v55, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v51, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, v57, v56 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v56, v50, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v46, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v70 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v38, v38, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v43 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v57, v50, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, v39, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v70, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v41 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v38, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v68, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v70, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v183 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v182 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v67, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v69, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xff, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v68, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v38, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, v50, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v178 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v66, 8, v177 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v51, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xff, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[54:57], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[36:39], off offset:16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v150 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, v68, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v162 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xff, v160 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v151 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v50, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v28, 0xff, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v148 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xff, v144 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v29, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, v51, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v55, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v30, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v51, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v23, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v132 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v130 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v23, v23, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, v48, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v22, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v129 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v128 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xff, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, v48, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v51, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v21, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v23, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, v22, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v17, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v18, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v114 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v18, 0xff, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v102 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v32, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v98 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v48, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 8, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v32, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v10, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v165 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v164 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 8, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v25, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v70, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v25, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v67, v65 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v68, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v19, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v20, v16 +; GFX11-TRUE16-NEXT: s_clause 0x5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[28:31], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[36:39], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-TRUE16-NEXT: s_clause 0x11 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:48 +; 
GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:68 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v75, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v75, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v75, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v75, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v75, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v75, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v75, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v75, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v75, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v74, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v74, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v74, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v74, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v74, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v74, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v74, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v74, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v74, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v74, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v74, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v74, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v74, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v74, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v74, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v74, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v74, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v74, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v74, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v74, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v74, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v74, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v74, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v74, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, 
v74, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v74, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v74, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v74, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v74, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v74, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v74, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v74, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:88 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v6 +; 
GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:4 +; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s27, 8 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s37, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s25, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s22, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s19, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s17, 8 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s86, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s16, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s3, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s1, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s0, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[14:15], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[40:41], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-FAKE16-NEXT: 
s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 15 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 13 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 16 +; 
GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s40, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-FAKE16-NEXT: .LBB37_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, s27, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v21, s26, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, s25, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, s24, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, s23, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, s22, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, s21, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, s20, 1.0 +; GFX11-FAKE16-NEXT: 
v_add_f32_e64 v35, s19, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, s18, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v37, s17, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v36, s16, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s9, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s8, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[21:22] +; GFX11-FAKE16-NEXT: v_add_f32_e64 v53, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v52, s0, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v49, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v48, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, s29, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, s28, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s41, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s40, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, s15, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, s14, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s5, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s7, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s11, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s13, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s12, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s10, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, s6, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s4, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[28:29] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[30:31] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[34:35] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX11-FAKE16-NEXT: 
v_lshrrev_b64 v[70:71], 24, v[48:49] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v18 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 24, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 24, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 24, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 8, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 8, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 24, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 16, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v49 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v60, 16, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 24, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 16, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v52 +; GFX11-FAKE16-NEXT: s_branch .LBB37_5 +; GFX11-FAKE16-NEXT: .LBB37_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: 
; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: s_branch .LBB37_2 +; GFX11-FAKE16-NEXT: .LBB37_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v147, s36 :: v_dual_mov_b32 v48, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, s3 :: v_dual_mov_b32 v36, s16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s17 :: v_dual_mov_b32 v148, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s20 :: v_dual_mov_b32 v31, s21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v146, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_mov_b32 v29, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s24 :: v_dual_mov_b32 v24, s25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v145, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s26 :: v_dual_mov_b32 v22, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v18, s29 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v144, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s14 :: v_dual_mov_b32 v12, s15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v134, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s4 :: v_dual_mov_b32 v10, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s6 :: v_dual_mov_b32 v8, s7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v135, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v6, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v133, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s12 :: v_dual_mov_b32 v2, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v74, s35 :: v_dual_mov_b32 v73, s104 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v132, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v72, s34 :: v_dual_mov_b32 v63, s103 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v62, s102 :: v_dual_mov_b32 v61, s101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v131, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v60, s99 :: v_dual_mov_b32 v59, s100 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v58, s98 :: v_dual_mov_b32 v57, s97 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v129, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v56, s96 :: v_dual_mov_b32 v47, s86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s87 :: v_dual_mov_b32 v45, s85 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) 
| instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v130, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v44, s84 :: v_dual_mov_b32 v43, s83 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v42, s81 :: v_dual_mov_b32 v41, s82 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v128, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v40, s80 :: v_dual_mov_b32 v183, s71 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, s70 :: v_dual_mov_b32 v181, s68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v119, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, s69 :: v_dual_mov_b32 v179, s67 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, s66 :: v_dual_mov_b32 v177, s65 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v118, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, s55 :: v_dual_mov_b32 v167, s64 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v166, s54 :: v_dual_mov_b32 v165, s53 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v116, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v164, s52 :: v_dual_mov_b32 v163, s50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v162, s51 :: v_dual_mov_b32 v161, s49 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v117, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v160, s48 :: v_dual_mov_b32 v151, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v150, s37 :: v_dual_mov_b32 v149, s38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v115, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s62 :: v_dual_mov_b32 v38, s88 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s72 :: v_dual_mov_b32 
v50, s90 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v114, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s74 :: v_dual_mov_b32 v54, s92 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s94 :: v_dual_mov_b32 v65, s30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v113, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s60 :: v_dual_mov_b32 v67, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s56 :: v_dual_mov_b32 v69, s46 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v103, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v70, s44 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v80, s42 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v26, s76 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v32, s78 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v112, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 21 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v102, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v101, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v100, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v98, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v99, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v97, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 27 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v96, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 28 +; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v87, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 29 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v85, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v86, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v84, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v83, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v82, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v51, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v55, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v39, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v33, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v27, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, s0 +; GFX11-FAKE16-NEXT: .LBB37_5: ; %end +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v52 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v53, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v52, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v60, 0xff, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, v80, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v70, v60, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v60, v52, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, v80, v81 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v61 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v48, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v49, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v61, v52, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v80, v81 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xff, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v62, v48, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v71, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v45 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v63, v48, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v37, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v69, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v36, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v52, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v69, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v36, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v52, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v68, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v53, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v176 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v37, v48, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xff, v179 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v49, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v67, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v69, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v30, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v66, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[60:63], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[34:37], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v163 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xff, v161 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v160 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v67, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v35, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v48, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xff, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v64, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v49, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v23, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v36, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v17, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v135 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v52, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v35, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v48, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v49, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v52, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v13, v36 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v14, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 8, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xff, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v32, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 8, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v49, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v32, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v5, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v85 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v25, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v25, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v16 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v16 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[28:31], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[34:37], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:48 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v45, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:72 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v76, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v76, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v76, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v76, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v76, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v76, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v76, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v76, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v76, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v75, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v75, 30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v75, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v75, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v75, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v75, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v75, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v75, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v75, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v75, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v75, 21 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v75, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v75, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v75, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v75, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v75, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v75, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v75, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v75, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v75, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v75, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v75, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v75, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v75, 8 
+; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v75, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v75, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v75, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v75, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v75, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v75, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v75, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 
offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:816 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, 
s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 
; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 -; GCN-NEXT: buffer_load_dword 
v2, off, s[0:3], s32 offset:260 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v61, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:312 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded 
Spill -; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:372 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v39 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; 
GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v47 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v54 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: buffer_load_dword 
v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v37 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v53 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v41 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v63 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v50 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, 
s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v19, v49 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v20, v60 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v21, v58 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v22, v22, v62 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v23, v59 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v61 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v52 -; GCN-NEXT: buffer_load_dword v28, off, 
s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_or_b32_e32 v28, v28, v35 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v33 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v44 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v57 -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v36, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v38, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v48, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v55, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v54, v49 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: 
v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v50 -; GCN-NEXT: v_or_b32_e32 v19, v19, v51 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_or_b32_e32 v21, v21, v53 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_or_b32_e32 v25, v25, v35 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v37 -; GCN-NEXT: v_or_b32_e32 v28, v28, v38 -; GCN-NEXT: v_or_b32_e32 v29, v29, v39 -; GCN-NEXT: v_or_b32_e32 v30, v30, v48 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; 
implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; 
GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: 
$vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: 
; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB19_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v39, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; 
GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v47, v3 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v54, v4 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v37, v6 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v45, v8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v53, v9 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v42, v10 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v0 -; 
GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v41, v11 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v40, v12 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v63, v13 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v50, v14 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v0, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v0, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v0, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v51, v18 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v49, v19 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v60, v20 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v58, v21 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v25, v62, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v29, v59, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v37, v32, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v50, v61, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; 
GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v41, v34, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v45, v52, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v56, v35, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v58, v33, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v59, v44, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v57 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v57, v36, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v60, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v0, v22 -; 
GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_or_b32_e32 v39, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_load_dword v30, 
off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, 
v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v48, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v53, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_mov_b32_e32 v0, v55 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v55, v53 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v40, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; 
GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v44, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v0, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, 
v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v0, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v60, v0 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v61, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v63, v3 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v21 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v37 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s7, v50 -; GCN-NEXT: v_add_i32_e32 v41, vcc, s7, v41 -; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v45 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s7, v56 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, 
v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v59 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x300, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 -; GCN-NEXT: v_or_b32_e32 v4, v36, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v39, v6 -; GCN-NEXT: v_or_b32_e32 v7, v49, v7 -; GCN-NEXT: v_or_b32_e32 v8, v51, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v54, v10 -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: v_or_b32_e32 v12, v23, v12 -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: v_or_b32_e32 v15, v27, v15 -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v31, v18 -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: v_or_b32_e32 v20, v33, v20 -; GCN-NEXT: 
v_or_b32_e32 v21, v34, v21 -; GCN-NEXT: v_or_b32_e32 v22, v35, v25 -; GCN-NEXT: v_or_b32_e32 v23, v48, v29 -; GCN-NEXT: v_or_b32_e32 v24, v53, v37 -; GCN-NEXT: v_or_b32_e32 v25, v55, v50 -; GCN-NEXT: v_or_b32_e32 v26, v40, v41 -; GCN-NEXT: v_or_b32_e32 v27, v42, v45 -; GCN-NEXT: v_or_b32_e32 v28, v43, v56 -; GCN-NEXT: v_or_b32_e32 v29, v44, v58 -; GCN-NEXT: v_or_b32_e32 v30, v46, v59 -; GCN-NEXT: v_or_b32_e32 v31, v47, v57 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 -; GCN-NEXT: .LBB19_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:648 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 +; 
SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 
v9, v54, v9 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, 
s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; 
SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v46 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 
+; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 
0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 
+; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 
offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 
offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 
v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; 
SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: 
killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed 
$vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; 
SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB38_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 
offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, 
v41, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: 
v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v46, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], 
s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: 
v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 
0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 +; SI-NEXT: s_waitcnt vmcnt(1) 
+; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: 
v_add_i32_e32 v23, vcc, s7, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_and_b32_e32 v26, 
0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, 
s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: 
v_add_i32_e32 v31, vcc, 0x300, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 +; SI-NEXT: .LBB38_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v32f32: ; VI: ; %bb.0: @@ -26044,7 +53004,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -26517,9 +53477,9 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB19_2: ; %Flow +; VI-NEXT: .LBB38_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_4 +; VI-NEXT: s_cbranch_execz .LBB38_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -26908,7 +53868,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 -; VI-NEXT: .LBB19_4: ; %end +; VI-NEXT: .LBB38_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -27280,7 +54240,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -27754,9 +54714,9 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: .LBB19_2: ; %Flow +; GFX9-NEXT: .LBB38_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: s_cbranch_execz .LBB38_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -28151,7 +55111,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 -; GFX9-NEXT: .LBB19_4: ; %end +; GFX9-NEXT: .LBB38_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -28393,15 +55353,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_4 -; GFX11-TRUE16-NEXT: .LBB19_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_4 +; GFX11-TRUE16-NEXT: .LBB38_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB19_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB38_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h @@ -28564,62 +55524,548 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l ; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v20, v20, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 
v114.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v13.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 
16, v25 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 
v28.l, v54.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 @@ -28630,1537 +56076,10915 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; 
GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-TRUE16-NEXT: .LBB19_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, 
v46, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 
offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, 
off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:20 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, 
v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v124 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v125 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v126 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v127 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v34 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v4, v4, v111 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v121 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v120 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v122 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v123 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v107 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v108 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v109 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v110 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v106 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v6, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v92 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v78 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v94 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v95 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v104 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v105 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v79 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v91 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v20, v20, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 
v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 
v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 
v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: .LBB38_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v53, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v52, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v51, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v50, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v124, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v125, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v126, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v127, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v49, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v48, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v36, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v35, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v39, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v34, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v111, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v120, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v121, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v122, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v107, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v108, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v109, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v38, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v110, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v106, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v7, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v33, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v32, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v92, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v78, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v77, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v76, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v75, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v74, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v60, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v59, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v93, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v94, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v95, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v104, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v105, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v79, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v88, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v89, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v90, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v91, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v58, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v62, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v63, v14 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v72, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v73, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v45, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v46, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v47, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v56, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v57, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 
v24, v133, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 +; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 
0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-FAKE16-NEXT: .LBB38_4: ; %end +; 
GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 
v77, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v128i8_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 +; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 
24, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) 
expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 +; SI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, 
off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB39_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: 
v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: v_mov_b32_e32 v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_mov_b32_e32 v55, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v18, 
v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_mov_b32_e32 v44, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 +; SI-NEXT: v_mov_b32_e32 v59, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 
+; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB39_3 +; SI-NEXT: .LBB39_2: +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], 
s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: .LBB39_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB39_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; 
SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: 
v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, 
v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: 
v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; 
SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB39_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v32f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 
; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; 
VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: 
buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; VI-NEXT: 
buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, 
s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB39_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(3) +; 
VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v3, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa 
v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v29, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v59, v0 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v48, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; 
VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; 
VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB39_3 +; VI-NEXT: .LBB39_2: +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: v_mov_b32_e32 v51, v7 +; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB39_3: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v44, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v46, v49 +; VI-NEXT: s_cbranch_vccnz .LBB39_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: 
s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; 
VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte 
Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: 
v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) 
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload 
+; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 +; VI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB39_5: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, 
off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v32f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: 
buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded 
Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte 
Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; 
GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 +; GFX9-NEXT: 
buffer_load_ushort v36, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(42) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword 
v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB39_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v38 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword 
v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v60 +; GFX9-NEXT: v_mov_b32_e32 v52, v56 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_mov_b32_e32 v34, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v53, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB39_3 +; GFX9-NEXT: .LBB39_2: +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v57, v38 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB39_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB39_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded 
Reload +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, 
v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded 
Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 
0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, 
v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: 
v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB39_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v131, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 
v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 
0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, 
v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v29.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v1, 0xff, v46 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 
8, v177 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, 
v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, 
v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, 
s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB39_3 +; GFX11-TRUE16-NEXT: .LBB39_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; 
GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v4, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, 
v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v19, 3, v135 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 
0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v22, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v27, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, 
v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v35 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB39_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 
offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB39_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB39_2 +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v75, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 +; GFX11-FAKE16-NEXT: s_clause 
0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, 
s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29 +; GFX11-FAKE16-NEXT: 
s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, 
v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, 
s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v2, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB39_3 +; GFX11-FAKE16-NEXT: .LBB39_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: 
s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v55 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v54 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v52 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, 
v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v90, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v92, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v93, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v88, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v74, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v75, v12 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v48 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v3, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v79, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v63, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v72, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v73, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 
v40, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 
v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v117, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v113, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v34, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: .LBB39_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 
v105, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB39_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB39_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v32f32_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; 
implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; 
implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 
16, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, 
v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz 
.LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v32, 1.0, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v63 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_f32_e32 v23, 1.0, 
v23 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 
1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: 
v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; 
SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f32_to_v64bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: 
s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f32_to_v64bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: .LBB40_2: ; 
%end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f32_to_v64bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 
v63, s81, 25 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_readfirstlane_b32 s14, v9 +; SI-NEXT: v_readfirstlane_b32 s15, v10 +; SI-NEXT: v_readfirstlane_b32 s40, v11 +; SI-NEXT: v_readfirstlane_b32 s41, v12 +; SI-NEXT: v_readfirstlane_b32 s42, v13 +; SI-NEXT: v_readfirstlane_b32 s43, v14 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_readfirstlane_b32 s45, v16 +; SI-NEXT: v_readfirstlane_b32 s46, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s47, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB41_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; SI-NEXT: v_writelane_b32 v62, s4, 3 +; SI-NEXT: s_lshl_b32 s4, s47, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: s_and_b32 s4, s46, 0xffff0000 +; SI-NEXT: v_writelane_b32 v62, s4, 1 +; SI-NEXT: s_lshl_b32 s4, s46, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 0 +; SI-NEXT: s_and_b32 s60, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s45, 16 +; SI-NEXT: s_and_b32 s62, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s44, 16 +; SI-NEXT: s_and_b32 s72, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s43, 16 +; SI-NEXT: s_and_b32 s74, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s42, 16 +; SI-NEXT: s_and_b32 s76, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s41, 16 +; SI-NEXT: s_and_b32 s78, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s40, 16 +; SI-NEXT: s_and_b32 s88, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s89, s15, 16 +; SI-NEXT: s_and_b32 s90, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s91, s14, 16 +; SI-NEXT: s_and_b32 s92, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s93, s13, 16 +; SI-NEXT: s_and_b32 s94, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s12, 16 +; SI-NEXT: s_and_b32 s30, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s11, 16 +; SI-NEXT: s_and_b32 s34, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s35, s10, 16 +; SI-NEXT: s_and_b32 s36, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s37, s9, 16 +; SI-NEXT: s_and_b32 s38, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s39, s8, 16 +; SI-NEXT: s_and_b32 s48, s7, 0xffff0000 +; 
SI-NEXT: s_lshl_b32 s49, s7, 16 +; SI-NEXT: s_and_b32 s50, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s51, s6, 16 +; SI-NEXT: s_and_b32 s52, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s53, s29, 16 +; SI-NEXT: s_and_b32 s54, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s28, 16 +; SI-NEXT: s_and_b32 s64, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s65, s27, 16 +; SI-NEXT: s_and_b32 s66, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s67, s26, 16 +; SI-NEXT: s_and_b32 s68, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s25, 16 +; SI-NEXT: s_and_b32 s70, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s71, s24, 16 +; SI-NEXT: s_and_b32 s80, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s23, 16 +; SI-NEXT: s_and_b32 s82, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s83, s22, 16 +; SI-NEXT: s_and_b32 s84, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s21, 16 +; SI-NEXT: s_and_b32 s86, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s87, s20, 16 +; SI-NEXT: s_and_b32 s96, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s19, 16 +; SI-NEXT: s_and_b32 s98, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s99, s18, 16 +; SI-NEXT: s_and_b32 s56, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s17, 16 +; SI-NEXT: s_and_b32 s58, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_4 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s47, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s46, 1.0 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v45, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v43, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v41, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v55, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v53, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v51, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v49, 
s27, 1.0 +; SI-NEXT: v_add_f32_e64 v39, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v37, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v35, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v33, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v31, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s9, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s12, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s40, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s41, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s42, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s43, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s44, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s45, 1.0 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_add_f32_e64 v2, s16, 1.0 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; 
SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v43 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v45 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_branch .LBB41_5 +; SI-NEXT: .LBB41_3: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; 
implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: 
$sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 2 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 3 +; SI-NEXT: v_mov_b32_e32 v2, s59 +; SI-NEXT: v_mov_b32_e32 v3, s58 +; SI-NEXT: v_mov_b32_e32 v61, s57 +; SI-NEXT: v_mov_b32_e32 v1, s56 +; SI-NEXT: v_mov_b32_e32 v59, s99 +; SI-NEXT: v_mov_b32_e32 v60, s98 +; SI-NEXT: v_mov_b32_e32 v57, s97 +; SI-NEXT: v_mov_b32_e32 v58, s96 +; SI-NEXT: v_mov_b32_e32 v47, s87 +; SI-NEXT: v_mov_b32_e32 v56, s86 +; SI-NEXT: v_mov_b32_e32 v45, s85 +; SI-NEXT: v_mov_b32_e32 v46, s84 +; SI-NEXT: v_mov_b32_e32 v43, s83 +; SI-NEXT: v_mov_b32_e32 v44, s82 +; SI-NEXT: v_mov_b32_e32 v41, s81 +; SI-NEXT: v_mov_b32_e32 v42, s80 +; SI-NEXT: v_mov_b32_e32 v55, s71 +; SI-NEXT: v_mov_b32_e32 v40, s70 +; SI-NEXT: v_mov_b32_e32 v53, s69 +; SI-NEXT: v_mov_b32_e32 v54, s68 +; SI-NEXT: v_mov_b32_e32 v51, s67 +; SI-NEXT: v_mov_b32_e32 v52, s66 +; SI-NEXT: v_mov_b32_e32 v49, s65 +; SI-NEXT: v_mov_b32_e32 v50, s64 +; SI-NEXT: v_mov_b32_e32 v39, s55 +; SI-NEXT: v_mov_b32_e32 v48, s54 +; SI-NEXT: v_mov_b32_e32 v37, s53 +; SI-NEXT: v_mov_b32_e32 v38, s52 +; SI-NEXT: v_mov_b32_e32 v35, s51 +; SI-NEXT: v_mov_b32_e32 v36, s50 +; SI-NEXT: v_mov_b32_e32 v33, s49 +; SI-NEXT: v_mov_b32_e32 v34, s48 +; SI-NEXT: v_mov_b32_e32 v31, s39 +; SI-NEXT: v_mov_b32_e32 
v32, s38 +; SI-NEXT: v_mov_b32_e32 v29, s37 +; SI-NEXT: v_mov_b32_e32 v30, s36 +; SI-NEXT: v_mov_b32_e32 v27, s35 +; SI-NEXT: v_mov_b32_e32 v28, s34 +; SI-NEXT: v_mov_b32_e32 v25, s31 +; SI-NEXT: v_mov_b32_e32 v26, s30 +; SI-NEXT: v_mov_b32_e32 v23, s95 +; SI-NEXT: v_mov_b32_e32 v24, s94 +; SI-NEXT: v_mov_b32_e32 v21, s93 +; SI-NEXT: v_mov_b32_e32 v22, s92 +; SI-NEXT: v_mov_b32_e32 v19, s91 +; SI-NEXT: v_mov_b32_e32 v20, s90 +; SI-NEXT: v_mov_b32_e32 v17, s89 +; SI-NEXT: v_mov_b32_e32 v18, s88 +; SI-NEXT: v_mov_b32_e32 v15, s79 +; SI-NEXT: v_mov_b32_e32 v16, s78 +; SI-NEXT: v_mov_b32_e32 v13, s77 +; SI-NEXT: v_mov_b32_e32 v14, s76 +; SI-NEXT: v_mov_b32_e32 v11, s75 +; SI-NEXT: v_mov_b32_e32 v12, s74 +; SI-NEXT: v_mov_b32_e32 v9, s73 +; SI-NEXT: v_mov_b32_e32 v10, s72 +; SI-NEXT: v_mov_b32_e32 v7, s63 +; SI-NEXT: v_mov_b32_e32 v8, s62 +; SI-NEXT: v_mov_b32_e32 v5, s61 +; SI-NEXT: v_mov_b32_e32 v6, s60 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: .LBB41_5: ; %end +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: 
v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 
1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f32_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 
v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: 
v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v32f32_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: 
v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-LABEL: bitcast_v32f32_to_v64bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: 
v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB41_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: .LBB41_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: 
v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { +; SI-LABEL: bitcast_v64bf16_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword 
v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; 
implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 +; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 +; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 +; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 +; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: 
; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, 
v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 
16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; 
SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; 
SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; 
SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, 
off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v32f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: 
v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_add_f32_e32 
v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v9, 16, 
1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 
+; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; 
VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: 
v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: 
v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: 
v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: .LBB42_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v32f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; 
GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; 
GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: 
v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; 
GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v32, v32 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 
0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 
v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: 
v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; 
GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; 
GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v17, v17, v32, 
s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 +; GFX9-NEXT: .LBB42_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v63, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v125, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 
offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v117, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, 
off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:20 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v23 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v109, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 -; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: 
v_dual_lshlrev_b32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 
0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; 
GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 
0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo +; 
GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: 
v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 
0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: 
v_add3_u32 v35, v35, v25, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v124 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v125 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v126 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v127 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v111 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v121 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v120 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v122 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v123 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v107 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v108 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v109 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v110 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v106 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v6, v12 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v92 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v78 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v77 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v76 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v94 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v95 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v104 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v105 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v90 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v91 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v47 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v56 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: .LBB19_2: ; %Flow +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB19_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v53, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v52, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v51, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v50, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v124, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v125, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v126, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v127, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v49, 3 -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v48, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v36, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v35, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v39, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v34, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v111, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v120, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v121, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v122, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v107, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v108, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v109, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v38, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v110, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v106, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v7, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v33, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v32, 3 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_bfe_u32 v39, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, 
vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 
0x400000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: 
v_bfe_u32 v33, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, 
v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v92, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v78, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v77, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v76, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v75, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v74, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v60, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v59, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v93, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v94, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v95, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v104, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v105, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v79, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v88, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v89, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v90, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v91, v16 -; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v7, 0x300, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v58, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v17, 0xff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v62, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v63, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v72, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v73, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v45, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v46, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v47, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v56, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v57, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 
v20, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 
0x300, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v26, v148, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: .LBB19_4: ; %end +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, 
v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: 
v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; 
GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, 
v21, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v36, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 -; GFX11-FAKE16-NEXT: 
scratch_load_b32 v58, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <128 x i8> %a, splat (i8 3) - %a2 = bitcast <128 x i8> %a1 to <32 x float> + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <32 x float> br label %end cmp.false: - %a3 = bitcast <128 x i8> %a to <32 x float> + %a3 = bitcast <64 x bfloat> %a to <32 x float> br label %end end: @@ -30168,2144 +66992,1217 @@ end: ret <32 x float> %phi } -define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 
offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; 
implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; 
GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v62 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 
v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 
offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 
v34, 16, v14 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; 
GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v32, 1.0, v62 -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 
offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 
-; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v56, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v45, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v44, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 20, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 
v2, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: 
buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_alignbit_b32 v32, v32, v33, 16 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_alignbit_b32 v34, v34, v35, 16 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: 
v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_alignbit_b32 v36, v36, v37, 16 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v38, v38, v39, 16 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_alignbit_b32 v48, v48, v49, 16 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_alignbit_b32 v50, v50, v51, 16 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_alignbit_b32 v52, v52, v53, 16 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 
offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_alignbit_b32 v54, v54, v55, 16 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_alignbit_b32 v40, v40, v41, 16 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v63 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_alignbit_b32 v42, v42, v43, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v56, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v32f32_to_v64bf16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v22, 1.0, 
v22 -; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: .LBB20_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v32f32_to_v64bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 -; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v22, 
1.0, v22 -; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: .LBB20_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v32f32_to_v64bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 -; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 -; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: 
v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: .LBB20_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <32 x float> %a1 to <64 x bfloat> - br label %end - -cmp.false: - %a3 = bitcast <32 x float> %a to <64 x bfloat> - br label %end - -end: - %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x bfloat> %phi -} - -define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; GCN-NEXT: 
v_mul_f32_e32 v56, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v33 -; 
GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v42 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v63 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; 
GCN-NEXT: v_mul_f32_e32 v53, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; GCN-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; GCN-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; GCN-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v46 -; GCN-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v45, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v33, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], 
s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v43 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v19, v19, v42, 16 -; GCN-NEXT: v_alignbit_b32 v20, v20, v44, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; GCN-NEXT: v_alignbit_b32 v22, v22, v48, 16 -; GCN-NEXT: v_alignbit_b32 v23, v23, v38, 16 -; GCN-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; GCN-NEXT: v_alignbit_b32 v25, v25, v51, 16 -; GCN-NEXT: v_alignbit_b32 v26, v26, v53, 16 -; GCN-NEXT: v_alignbit_b32 v27, v27, v55, 16 -; GCN-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; GCN-NEXT: v_alignbit_b32 v29, v29, v63, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; GCN-NEXT: buffer_load_dword v32, off, 
s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; 
GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; kill: killed $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; GCN-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 
0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 
0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 
offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v54 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v41 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v43 -; GCN-NEXT: 
v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v31 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v32 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; 
GCN-NEXT: v_alignbit_b32 v19, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v20, v39, v54, 16 -; GCN-NEXT: v_alignbit_b32 v21, v48, v40, 16 -; GCN-NEXT: v_alignbit_b32 v22, v49, v22, 16 -; GCN-NEXT: v_alignbit_b32 v23, v50, v23, 16 -; GCN-NEXT: v_alignbit_b32 v24, v51, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v32, v25, 16 -; GCN-NEXT: v_alignbit_b32 v26, v33, v26, 16 -; GCN-NEXT: v_alignbit_b32 v27, v34, v27, 16 -; GCN-NEXT: v_alignbit_b32 v28, v35, v28, 16 -; GCN-NEXT: v_alignbit_b32 v29, v36, v29, 16 -; GCN-NEXT: v_alignbit_b32 v30, v37, v30, 16 -; GCN-NEXT: v_alignbit_b32 v31, v38, v31, 16 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v14, v11 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mov_b32_e32 v61, 
v53 +; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 +; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 +; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 +; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, 
s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v11 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v12 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v14 +; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v53, v38 +; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: 
buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 
v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: 
v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], 
s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; 
SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; 
SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 
offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB43_2 ; -; VI-LABEL: bitcast_v64bf16_to_v32f32: +; VI-LABEL: bitcast_v64bf16_to_v32f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; VI-NEXT: s_and_saveexec_b64 
s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; VI-NEXT: v_add_f32_e32 v32, 
0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, 
v34, vcc ; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; 
VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: 
v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 
+; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 
0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 
0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 
v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: 
v_bfe_u32 v33, v31, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, 
v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, 
v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: 
v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: 
v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 
v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, 
v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, 
v32 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; 
VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -32313,1619 +68210,2398 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB21_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: s_branch .LBB43_2 ; -; GFX9-LABEL: bitcast_v64bf16_to_v32f32: +; GFX9-LABEL: bitcast_v64bf16_to_v32f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: 
v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: 
v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v15, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 -; GFX9-NEXT: 
v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v14, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v33, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, 
v11 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v33, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 
v10, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v10, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa 
v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v33, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v8, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v33, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 
v7, 0x40c00000, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, 
vcc -; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v5, v18, v5 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v33, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v33, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v3, 
0x40c00000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v33, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; 
GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v33, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v1, v18, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v33, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: 
v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v31, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v31, v33, 16, v31 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 
-; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v30, v18, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc +; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v29, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v29, v33, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v28, v18, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v28, v33, 16, v28 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: 
v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v27, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 
+; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v26, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v26, v33, 16, v26 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v25, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v25, v33, 16, v25 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v24, v18, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v24, v33, 16, v24 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v23, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v22, 16, 
1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v22, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v22, v33, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_add3_u32 
v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v21, v18, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v21, v33, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v20, v33, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 
v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v19, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v19, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v32, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v32, v33, 16, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 
16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v17, v33, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 -; GFX9-NEXT: .LBB21_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v16 +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: s_branch .LBB43_2 ; -; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32: +; GFX11-TRUE16-LABEL: 
bitcast_v64bf16_to_v32f32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 -; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 
0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v183, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v182, v9 :: v_dual_mov_b32 v169, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v176, v3 :: v_dual_mov_b32 v175, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v173, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; 
GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: 
v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 
v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; 
GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff -; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, 
v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: 
v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 
v36, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v183 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 
0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v182 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v183, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v182, v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v177 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v177 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 
0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v177, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; 
GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v32, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v176 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32f32: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-FAKE16-NEXT: s_xor_b32 
s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, 
v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; 
GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; 
GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: 
v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v176, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v181 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 
v38, v39, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v181, v39, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v181 :: v_dual_mov_b32 v19, v175 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v21, v176 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 
v171, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; 
GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_b32 
v46, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v177 :: v_dual_mov_b32 v27, v182 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v183 :: v_dual_mov_b32 v31, v178 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v179 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_store_b32 
off, v44, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, 
s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, 
s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; 
GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 
v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-FAKE16-NEXT: 
v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; 
GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; 
GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; 
GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, 
v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, 
v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 
0x400000, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 
v37, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, 
v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 -; 
GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v21, v32, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v48, 16, v173 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 
v16, v16, v36, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: 
v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: 
v_dual_mov_b32 v8, v76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v76, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -33944,777 +70620,809 @@ end: } define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill 
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: 
$vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: 
; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v62 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v63 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 
offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 
v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill 
-; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v48 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v49 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v50 -; GCN-NEXT: v_mov_b32_e32 v50, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; 
implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v31, v33 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: 
v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 1.0, v63 -; GCN-NEXT: v_add_f32_e32 v32, 1.0, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 
offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 
v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 
v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v61 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v1, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v59 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v58 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v2, v1 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v2, v1 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v47 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v2, v1 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v46 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v45 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v44 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v43 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v42 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v41 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v40 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: 
v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v54 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v53 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v51 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v49 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v48 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v37 
-; GCN-NEXT: v_cvt_f16_f32_e32 v27, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v36 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v32f32_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; 
implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; 
implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: 
v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 
v44, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: 
$vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: 
v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_f32_e32 v42, 1.0, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_f32_e32 v44, 1.0, v62 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; 
SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v56, v28 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], 
s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v64f16: ; VI: ; %bb.0: @@ -34726,7 +71434,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -34761,7 +71469,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: .LBB44_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; 
VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -34776,7 +71484,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -34811,7 +71519,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: .LBB44_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -34828,7 +71536,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -34847,7 +71555,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: .LBB44_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -34868,768 +71576,1618 @@ end: ret <64 x half> %phi } +define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> 
inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_readfirstlane_b32 s47, v1 +; SI-NEXT: v_readfirstlane_b32 s46, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_readfirstlane_b32 s44, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s42, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s46, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s47, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 
16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: 
v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e64 v22, s40, 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v21, s28, 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e64 v41, s6, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 +; SI-NEXT: v_add_f32_e64 v6, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s46, 1.0 +; 
SI-NEXT: v_add_f32_e64 v26, s42, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; SI-NEXT: v_add_f32_e64 v25, s47, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_add_f32_e64 v53, s7, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v57 +; SI-NEXT: v_add_f32_e64 v49, s8, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v53 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s43, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v41 +; SI-NEXT: v_add_f32_e64 v45, s9, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: v_add_f32_e64 v34, s11, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 +; SI-NEXT: v_cvt_f32_f16_e32 
v27, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s45, 1.0 +; SI-NEXT: v_add_f32_e64 v30, s44, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s41, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v37, s10, 1.0 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v37 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; 
SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, 
v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, 
vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; 
SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 
+; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v32f32_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: 
v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: 
v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v32f32_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; 
GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-LABEL: bitcast_v32f32_to_v64f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB45_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: .LBB45_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; 
GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded 
Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v62, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; 
GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v37 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; GCN-NEXT: v_or_b32_e32 v4, 
v46, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, 
s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_or_b32_e32 v19, v43, v19 -; GCN-NEXT: v_or_b32_e32 v20, v41, v20 -; GCN-NEXT: v_or_b32_e32 v21, v55, v21 -; GCN-NEXT: v_or_b32_e32 v22, v49, v22 -; GCN-NEXT: v_or_b32_e32 v23, v50, v23 -; GCN-NEXT: v_or_b32_e32 v24, v39, v24 -; GCN-NEXT: v_or_b32_e32 v25, v36, v25 -; GCN-NEXT: v_or_b32_e32 v26, v48, v26 -; GCN-NEXT: v_or_b32_e32 v27, v52, v27 -; GCN-NEXT: v_or_b32_e32 v28, v53, v28 -; GCN-NEXT: v_or_b32_e32 v29, v54, v29 -; GCN-NEXT: 
v_or_b32_e32 v30, v40, v30 -; GCN-NEXT: v_or_b32_e32 v31, v42, v31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: 
$vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: 
v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, 
off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: 
v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, 
off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v41 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v55 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v39 -; GCN-NEXT: v_mov_b32_e32 
v39, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: 
v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_or_b32_e32 v19, v21, v20 -; GCN-NEXT: v_or_b32_e32 v20, v55, v39 -; GCN-NEXT: v_or_b32_e32 v21, v41, v48 -; GCN-NEXT: v_or_b32_e32 v22, v22, v49 -; GCN-NEXT: v_or_b32_e32 
v23, v23, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v51 -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v38 -; GCN-NEXT: v_or_b32_e32 v28, v28, v33 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_or_b32_e32 v30, v30, v35 -; GCN-NEXT: v_or_b32_e32 v31, v31, v37 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64f16_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, 
s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v46 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: 
$vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v25, v51, v25 +; SI-NEXT: v_or_b32_e32 v26, v48, v26 +; SI-NEXT: v_or_b32_e32 v27, v52, v27 +; SI-NEXT: v_or_b32_e32 v28, v39, v28 +; SI-NEXT: v_or_b32_e32 v29, v37, v29 +; SI-NEXT: v_or_b32_e32 v30, v35, v30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; 
SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: 
v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: 
v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; 
SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v55, v24 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 
0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 
+; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; 
SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: 
v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: 
v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, 
v33, v31 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v32f32: ; VI: ; %bb.0: @@ -35641,7 +73199,7 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 0x200 ; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -35741,7 +73299,7 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 ; VI-NEXT: v_or_b32_e32 v17, v17, v33 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -35756,7 +73314,7 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -35792,7 +73350,7 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -35809,7 +73367,7 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -35844,7 +73402,7 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 
0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end +; GFX11-NEXT: .LBB46_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -35865,369 +73423,1543 @@ end: ret <32 x float> %phi } +define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v26 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 +; SI-NEXT: 
buffer_load_dword v52, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v41, v11 +; SI-NEXT: v_mov_b32_e32 v40, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 +; 
SI-NEXT: v_cvt_f16_f32_e32 v47, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v63 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 +; SI-NEXT: s_waitcnt vmcnt(2) 
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, 
s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB47_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_or_b32_e32 v22, v51, v22 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v50, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v49, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v26, v39, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v27, v38, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v28, v37, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 +; SI-NEXT: 
v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_or_b32_e32 v19, v54, v19 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v60, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_or_b32_e32 v13, v57, v13 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v40, v17 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v18, v55, v18 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB47_3 +; SI-NEXT: .LBB47_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v35, v54 
+; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB47_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v35, v40 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v40, v46 +; SI-NEXT: v_mov_b32_e32 v41, v56 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v43, v60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB47_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, 
v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, 
v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; 
SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; 
SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; 
SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB47_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v32f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: 
v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v18, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, 
v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, 
v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: v_add_f16_sdwa v33, v31, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v32, v32, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v18, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v18 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v64f16_to_v32f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; 
GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, v31, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-LABEL: bitcast_v64f16_to_v32f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, 
s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: 
scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: 
s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v137, 
0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB47_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, 
off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 
offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 
v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: 
$vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: 
$vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB47_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f32_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 
offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; 
implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; kill: killed $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; kill: killed $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v50, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v52, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v55, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v41, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 
v61, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: 
v_alignbit_b32 v35, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v50, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v52, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v55, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v41, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; GCN-NEXT: 
v_or_b32_e32 v59, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v57, v1, v2 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; GCN-NEXT: v_or_b32_e32 v46, v1, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v43 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v41 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v60 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v55 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v58 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v56 -; GCN-NEXT: v_or_b32_e32 v14, v14, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 48, v0 -; GCN-NEXT: 
v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v15, v15, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v47 -; GCN-NEXT: v_or_b32_e32 v16, v16, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v17, v17, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v45 -; GCN-NEXT: v_or_b32_e32 v18, v18, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v19, v19, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v20, v20, v34 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v38 -; GCN-NEXT: v_or_b32_e32 v21, v21, v34 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v42 -; GCN-NEXT: v_or_b32_e32 v22, v22, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v37 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v40 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v36 -; GCN-NEXT: v_or_b32_e32 v25, v25, v34 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v54 -; GCN-NEXT: 
v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v35 -; GCN-NEXT: v_or_b32_e32 v27, v27, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v53 -; GCN-NEXT: v_or_b32_e32 v28, v28, v34 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v32, v32, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], 
s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f32_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: 
v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v41, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v43, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, 
v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v41, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v43, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; 
SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v64i16: ; VI: ; %bb.0: @@ -36239,7 +74971,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -36274,7 +75006,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: .LBB48_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -36289,7 +75021,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -36324,7 +75056,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: .LBB48_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -36341,7 +75073,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; GFX11-NEXT: 
s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -36360,7 +75092,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: .LBB48_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -36381,613 +75113,1194 @@ end: ret <64 x i16> %phi } +define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f32_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_mov_b32_e32 v36, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v35, s17 +; SI-NEXT: v_mov_b32_e32 v33, s18 +; SI-NEXT: v_mov_b32_e32 v32, s19 +; SI-NEXT: v_mov_b32_e32 v31, s20 +; SI-NEXT: v_mov_b32_e32 v29, s21 +; SI-NEXT: v_mov_b32_e32 v28, s22 +; SI-NEXT: v_mov_b32_e32 v26, s23 +; SI-NEXT: v_mov_b32_e32 v25, s24 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v22, s26 +; SI-NEXT: v_mov_b32_e32 v21, s27 +; SI-NEXT: v_mov_b32_e32 v20, s28 +; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v18, v17, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v52, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v41, v21, v22, 16 +; SI-NEXT: v_alignbit_b32 v43, v24, v25, 16 +; SI-NEXT: v_alignbit_b32 v45, v26, v28, 16 +; SI-NEXT: v_alignbit_b32 v47, v29, v31, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v32, v33, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, 
v35, v36, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v35 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v36, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 
+; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v23, v18, v17, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v52, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v41, v21, v22, 16 +; SI-NEXT: v_alignbit_b32 v43, v24, v25, 16 +; SI-NEXT: v_alignbit_b32 v45, v26, v28, 16 +; SI-NEXT: v_alignbit_b32 v47, v29, v31, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v32, v33, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v35, v36, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v35 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: 
v_and_b32_e32 v35, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v36, v36, v60 +; SI-NEXT: v_or_b32_e32 v23, v35, v23 +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v23, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v58 +; SI-NEXT: v_or_b32_e32 v23, v23, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v63 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v32, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v47 +; SI-NEXT: v_or_b32_e32 v23, v23, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v23, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v62 +; SI-NEXT: v_or_b32_e32 v23, v23, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v23, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v45 +; SI-NEXT: v_or_b32_e32 v23, v23, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v61 +; SI-NEXT: v_or_b32_e32 v23, v23, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_or_b32_e32 v23, 
v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v57 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 
v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 
v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, 
v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: 
$vgpr56 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v32f32_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: 
s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v32f32_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, 
v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 
v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-LABEL: bitcast_v32f32_to_v64i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; 
GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB49_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: +; GFX11-NEXT: .LBB49_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i16_to_v32f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: 
v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v32, 
off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v24 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v36 -; GCN-NEXT: v_or_b32_e32 v1, v1, v58 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v57 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v60 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v43 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v47 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v44 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v32 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v30, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v59 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; 
implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: 
$vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v36, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v58, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: 
v_or_b32_e32 v2, v57, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v35, v3 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; 
GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v60, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v32, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v32, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v32, v16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v32, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v32, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v32, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v32, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v32, v30 -; GCN-NEXT: v_or_b32_e32 v31, v59, v31 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 
-; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i16_to_v32f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: 
buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: 
v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v53 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v19, v19, v48 +; SI-NEXT: 
v_or_b32_e32 v21, v21, v36 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v49 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt 
vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v62 +; SI-NEXT: v_or_b32_e32 v2, v2, v61 +; SI-NEXT: v_or_b32_e32 v3, v3, v60 +; SI-NEXT: v_or_b32_e32 v4, v4, v59 +; SI-NEXT: v_or_b32_e32 v5, v5, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v47 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v45 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_or_b32_e32 v12, v12, v42 +; SI-NEXT: v_or_b32_e32 v13, v13, v41 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 
v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_or_b32_e32 v21, v36, v21 +; SI-NEXT: v_or_b32_e32 v22, v34, v22 +; SI-NEXT: v_or_b32_e32 v23, 
v32, v23 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: 
v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v59, v4 +; SI-NEXT: v_or_b32_e32 v5, v58, v5 +; SI-NEXT: v_or_b32_e32 v6, v57, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_or_b32_e32 v18, v49, v18 +; SI-NEXT: v_or_b32_e32 v20, v38, v20 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte 
Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v37, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; 
SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v32f32: ; VI: ; %bb.0: @@ -36999,7 +76312,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v33, 3 ; VI-NEXT: v_add_u16_e32 v32, 3, v15 @@ -37099,7 +76412,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; VI-NEXT: 
v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -37114,7 +76427,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -37149,7 +76462,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -37166,7 +76479,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -37201,7 +76514,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end +; GFX11-NEXT: .LBB50_2: ; %end ; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -37222,56 +76535,1085 @@ end: ret <32 x float> %phi } +define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v32f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v56, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: 
buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: s_and_b64 s[4:5], vcc, 
exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v10, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; 
SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v17, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v19, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v21, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v22, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_or_b32_e32 v28, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_or_b32_e32 v29, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; 
SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v30, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_or_b32_e32 v8, v1, v55 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: 
s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 
0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v64i16_to_v32f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: v_readfirstlane_b32 s8, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v5 +; VI-NEXT: v_readfirstlane_b32 s10, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v7 +; VI-NEXT: v_readfirstlane_b32 s12, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v11 +; VI-NEXT: v_readfirstlane_b32 s40, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v13 +; VI-NEXT: v_readfirstlane_b32 s42, v14 +; VI-NEXT: v_readfirstlane_b32 s43, v15 +; VI-NEXT: v_readfirstlane_b32 s44, v16 +; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_readfirstlane_b32 s46, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s47, v1 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s47, 3 +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s46, 3 +; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s46, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s46, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, 
s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: 
s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s45, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s44, 3 +; VI-NEXT: s_add_i32 s45, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s43, 3 +; VI-NEXT: s_add_i32 s44, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s42, 3 +; VI-NEXT: s_add_i32 s43, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s41, 3 +; VI-NEXT: s_add_i32 s42, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s40, 3 +; VI-NEXT: s_add_i32 s41, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s40, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 
0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s46 +; VI-NEXT: v_mov_b32_e32 v15, s47 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: v_mov_b32_e32 v18, s8 +; VI-NEXT: v_mov_b32_e32 v19, s9 +; VI-NEXT: v_mov_b32_e32 v20, s10 +; VI-NEXT: v_mov_b32_e32 v21, s11 +; VI-NEXT: v_mov_b32_e32 v22, s12 +; VI-NEXT: 
v_mov_b32_e32 v23, s13 +; VI-NEXT: v_mov_b32_e32 v24, s14 +; VI-NEXT: v_mov_b32_e32 v25, s15 +; VI-NEXT: v_mov_b32_e32 v26, s40 +; VI-NEXT: v_mov_b32_e32 v27, s41 +; VI-NEXT: v_mov_b32_e32 v28, s42 +; VI-NEXT: v_mov_b32_e32 v29, s43 +; VI-NEXT: v_mov_b32_e32 v30, s44 +; VI-NEXT: v_mov_b32_e32 v31, s45 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v32f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, 
v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-LABEL: bitcast_v64i16_to_v32f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: 
s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, 
v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: 
scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 
op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB51_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 
:: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: 
scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 
v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: 
$vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: 
$vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x 
float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; 
GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i64_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: 
v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: .LBB52_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v16f64: ; VI: ; %bb.0: @@ -37283,7 +77625,7 @@ define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -37318,7 +77660,7 @@ define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -37333,7 +77675,7 @@ define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -37368,7 +77710,7 @@ define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc -; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: 
.LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -37385,7 +77727,7 @@ define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -37428,7 +77770,7 @@ define <16 x double> @bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo -; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -37449,40 +77791,368 @@ end: ret <16 x double> %phi } +define inreg <16 x double> @bitcast_v16i64_to_v16f64_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: 
v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: 
v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v16i64_to_v16f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: 
v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v16i64_to_v16f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 
+; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 
0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-LABEL: bitcast_v16i64_to_v16f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-NEXT: 
; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB53_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: +; GFX11-NEXT: .LBB53_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; 
GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 
v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f64_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v16i64: ; VI: ; %bb.0: 
@@ -37494,7 +78164,7 @@ define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -37513,7 +78183,7 @@ define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -37528,7 +78198,7 @@ define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -37547,7 +78217,7 @@ define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -37564,7 +78234,7 @@ define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 
; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -37583,7 +78253,7 @@ define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -37604,1227 +78274,1477 @@ end: ret <16 x i64> %phi } +define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v33, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz 
.LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: v_mov_b32_e32 v19, v33 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v16f64_to_v16i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; 
VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v16f64_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, 
v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v33 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-LABEL: bitcast_v16f64_to_v16i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: 
v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB55_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: .LBB55_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; 
GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v128i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill 
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; 
GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: 
killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; 
GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v31, off, 
s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 -; 
GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v57 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v57 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshrrev_b32_e32 v31, 16, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: 
v_lshrrev_b32_e32 v44, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NEXT: buffer_store_dword v31, off, 
s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, 
vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: v_addc_u32_e32 v57, vcc, 0, v57, vcc -; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_alignbit_b32 v31, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; 
GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 -; 
GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v57 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v57 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; GCN-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 
16, v16 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; GCN-NEXT: 
v_lshrrev_b32_e32 v32, 24, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v52 -; GCN-NEXT: v_or_b32_e32 v1, v1, v52 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_or_b32_e32 v2, v2, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v50, v31 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v31 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v50 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; GCN-NEXT: v_or_b32_e32 v31, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v61 -; GCN-NEXT: v_or_b32_e32 v49, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v48 -; GCN-NEXT: v_or_b32_e32 v2, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v60 -; GCN-NEXT: v_or_b32_e32 v61, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v39 -; 
GCN-NEXT: v_or_b32_e32 v62, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v58 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v38 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v37 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v47 -; GCN-NEXT: v_or_b32_e32 v8, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v36 -; GCN-NEXT: v_or_b32_e32 v9, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v46 -; GCN-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v35 -; GCN-NEXT: v_or_b32_e32 v11, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v45 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v34 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v44 -; GCN-NEXT: v_or_b32_e32 v14, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v33 -; GCN-NEXT: v_or_b32_e32 v15, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v43 -; GCN-NEXT: v_or_b32_e32 v16, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v21 
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v17, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v42 -; GCN-NEXT: v_or_b32_e32 v18, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v23 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v41 -; GCN-NEXT: v_or_b32_e32 v20, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v21, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v40 -; GCN-NEXT: v_or_b32_e32 v22, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v23, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v55 -; GCN-NEXT: v_or_b32_e32 v24, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v29 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v25, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_or_b32_e32 v26, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v27, v1, v3 -; 
GCN-NEXT: v_and_b32_e32 v1, 0xff, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v53 -; GCN-NEXT: v_or_b32_e32 v28, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v29, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v32, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v30, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v34, v3, v1 -; 
GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v35, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v36, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v37, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v38, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v39, v3, v1 -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v48, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v50, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v51, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v52, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v3, v1 -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v54, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v55, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v42, v3, v1 -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v43, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v3, v1 -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v60, v3, v1 -; GCN-NEXT: v_and_b32_e32 
v1, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v1, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v5, v1, v32 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v63, v2, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v35 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v36 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v37 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v39 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v50 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v52 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v13, v13, v54 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v55 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v40 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v41 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v42 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v43 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v19, v44 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v45 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v21, v21, v46 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GCN-NEXT: 
v_or_b32_e32 v22, v22, v47 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v23, v23, v56 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v57 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v25, v25, v58 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v59 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v27, v27, v60 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i64_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, 
off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: 
killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed 
$vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; 
implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; 
SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v52, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, 
v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 
+; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, 
v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v52, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v41 +; SI-NEXT: v_or_b32_e32 v41, v41, v43 +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v61 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v47 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; 
SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v53 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v39 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, 
vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) 
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], 
s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: 
s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v128i8: ; VI: ; %bb.0: @@ -39025,7 +79945,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -39201,9 +80121,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 -; VI-NEXT: .LBB28_2: ; %Flow +; VI-NEXT: .LBB56_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_4 +; VI-NEXT: s_cbranch_execz .LBB56_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc @@ -39410,7 +80330,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 -; VI-NEXT: .LBB28_4: ; %end +; VI-NEXT: .LBB56_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v48 ; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -40002,7 +80922,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 @@ -40196,9 +81116,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; 
GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; GFX9-NEXT: .LBB28_2: ; %Flow +; GFX9-NEXT: .LBB56_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_4 +; GFX9-NEXT: s_cbranch_execz .LBB56_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 3, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc @@ -40423,7 +81343,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 -; GFX9-NEXT: .LBB28_4: ; %end +; GFX9-NEXT: .LBB56_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -40871,7 +81791,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -40938,9 +81858,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB28_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB56_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v1, vcc_lo, v1, 3 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -41048,7 +81968,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB28_4: ; %end +; GFX11-TRUE16-NEXT: .LBB56_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -41497,7 +82417,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -41596,9 +82516,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] -; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB56_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v1, vcc_lo, v1, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -41738,7 +82658,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: 
.LBB28_4: ; %end +; GFX11-FAKE16-NEXT: .LBB56_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 @@ -42080,1621 +83000,6415 @@ end: ret <128 x i8> %phi } +define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s30, 0 +; SI-NEXT: v_writelane_b32 v41, s31, 1 +; SI-NEXT: v_writelane_b32 v41, s34, 2 +; SI-NEXT: v_writelane_b32 v41, s35, 3 +; SI-NEXT: v_writelane_b32 v41, s36, 4 +; SI-NEXT: v_writelane_b32 v41, s37, 5 +; SI-NEXT: v_writelane_b32 v41, s38, 6 +; SI-NEXT: v_writelane_b32 v41, s39, 7 +; SI-NEXT: v_writelane_b32 v41, s48, 8 +; SI-NEXT: v_writelane_b32 v41, s49, 9 +; SI-NEXT: v_writelane_b32 v41, s50, 10 +; SI-NEXT: v_writelane_b32 v41, s51, 11 +; SI-NEXT: v_writelane_b32 v41, s52, 12 +; SI-NEXT: v_writelane_b32 v41, s53, 13 +; SI-NEXT: v_writelane_b32 v41, s54, 14 +; SI-NEXT: v_writelane_b32 v41, s55, 15 +; SI-NEXT: v_writelane_b32 v41, s64, 16 +; SI-NEXT: v_writelane_b32 v41, s65, 17 +; SI-NEXT: v_writelane_b32 v41, s66, 18 +; SI-NEXT: v_writelane_b32 v41, s67, 19 +; SI-NEXT: v_writelane_b32 v41, s68, 20 +; SI-NEXT: v_writelane_b32 v41, s69, 21 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v41, s70, 22 +; SI-NEXT: v_readfirstlane_b32 s47, v1 +; SI-NEXT: v_readfirstlane_b32 s46, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_readfirstlane_b32 s44, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s42, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s15, v9 
+; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v41, s71, 23 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v22, s45 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v12, s13 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v18, s41 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 +; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 +; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 +; SI-NEXT: v_mov_b32_e32 v22, s47 +; SI-NEXT: v_mov_b32_e32 v23, s28 +; SI-NEXT: v_mov_b32_e32 v29, s26 +; SI-NEXT: v_mov_b32_e32 v35, s24 +; SI-NEXT: v_mov_b32_e32 v39, s22 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v53, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 +; SI-NEXT: v_alignbit_b32 v16, s40, v18, 
24 +; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 +; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 +; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 +; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 +; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 +; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 +; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 +; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 +; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 +; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 +; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 +; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 +; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 +; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 +; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 +; SI-NEXT: v_alignbit_b32 v48, s21, v50, 24 +; SI-NEXT: v_alignbit_b32 v49, s21, v50, 16 +; SI-NEXT: v_alignbit_b32 v50, s21, v50, 8 +; SI-NEXT: v_alignbit_b32 v51, s19, v53, 24 +; SI-NEXT: v_alignbit_b32 v52, s19, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, s19, v53, 8 +; SI-NEXT: v_alignbit_b32 v54, s17, v40, 24 +; SI-NEXT: v_alignbit_b32 v55, s17, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, s17, v40, 8 +; SI-NEXT: s_lshr_b32 s56, s6, 24 +; SI-NEXT: s_lshr_b32 s57, s6, 16 +; SI-NEXT: s_lshr_b32 s58, s6, 8 +; SI-NEXT: s_lshr_b32 s59, s8, 24 +; SI-NEXT: s_lshr_b32 s60, s8, 16 +; SI-NEXT: s_lshr_b32 s61, s8, 8 +; SI-NEXT: s_lshr_b32 s62, s10, 24 +; SI-NEXT: s_lshr_b32 s63, s10, 16 +; SI-NEXT: s_lshr_b32 s72, s10, 8 +; SI-NEXT: s_lshr_b32 s73, s12, 24 +; SI-NEXT: s_lshr_b32 s74, s12, 16 +; SI-NEXT: s_lshr_b32 s75, s12, 8 +; SI-NEXT: s_lshr_b32 s76, s14, 24 +; SI-NEXT: s_lshr_b32 s77, s14, 16 +; SI-NEXT: s_lshr_b32 s78, s14, 8 +; SI-NEXT: s_lshr_b32 s79, s40, 24 +; SI-NEXT: s_lshr_b32 s88, s40, 16 +; SI-NEXT: s_lshr_b32 s89, s40, 8 +; SI-NEXT: s_lshr_b32 s90, s42, 24 +; SI-NEXT: s_lshr_b32 s91, 
s42, 16 +; SI-NEXT: s_lshr_b32 s92, s42, 8 +; SI-NEXT: s_lshr_b32 s93, s44, 24 +; SI-NEXT: s_lshr_b32 s94, s44, 16 +; SI-NEXT: s_lshr_b32 s95, s44, 8 +; SI-NEXT: s_lshr_b32 s30, s46, 24 +; SI-NEXT: s_lshr_b32 s31, s46, 16 +; SI-NEXT: s_lshr_b32 s34, s46, 8 +; SI-NEXT: s_lshr_b32 s35, s29, 24 +; SI-NEXT: s_lshr_b32 s36, s29, 16 +; SI-NEXT: s_lshr_b32 s37, s29, 8 +; SI-NEXT: s_lshr_b32 s38, s27, 24 +; SI-NEXT: s_lshr_b32 s39, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s27, 8 +; SI-NEXT: s_lshr_b32 s49, s25, 24 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s25, 8 +; SI-NEXT: s_lshr_b32 s52, s23, 24 +; SI-NEXT: s_lshr_b32 s53, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: s_lshr_b32 s55, s21, 24 +; SI-NEXT: s_lshr_b32 s64, s21, 16 +; SI-NEXT: s_lshr_b32 s65, s21, 8 +; SI-NEXT: s_lshr_b32 s66, s19, 24 +; SI-NEXT: s_lshr_b32 s67, s19, 16 +; SI-NEXT: s_lshr_b32 s68, s19, 8 +; SI-NEXT: s_lshr_b32 s69, s17, 24 +; SI-NEXT: s_lshr_b32 s70, s17, 16 +; SI-NEXT: s_lshr_b32 s71, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s47, s47, 3 +; SI-NEXT: s_addc_u32 s46, s46, 0 +; SI-NEXT: s_add_u32 s45, s45, 3 +; SI-NEXT: s_addc_u32 s44, s44, 0 +; SI-NEXT: s_add_u32 s43, s43, 3 +; SI-NEXT: s_addc_u32 s42, s42, 0 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s40, s40, 0 +; SI-NEXT: s_add_u32 s15, s15, 3 +; SI-NEXT: s_addc_u32 s14, s14, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 
s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: v_mov_b32_e32 v22, s45 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v12, s13 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v18, s41 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 +; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 +; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 +; SI-NEXT: v_mov_b32_e32 v22, s47 +; SI-NEXT: v_mov_b32_e32 v23, s28 +; SI-NEXT: v_mov_b32_e32 v29, s26 +; SI-NEXT: v_mov_b32_e32 v35, s24 +; SI-NEXT: v_mov_b32_e32 v39, s22 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v53, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 +; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 +; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 +; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 +; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 +; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 +; 
SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 +; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 +; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 +; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 +; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 +; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 +; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 +; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 +; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 +; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 +; SI-NEXT: v_alignbit_b32 v48, s21, v50, 24 +; SI-NEXT: v_alignbit_b32 v49, s21, v50, 16 +; SI-NEXT: v_alignbit_b32 v50, s21, v50, 8 +; SI-NEXT: v_alignbit_b32 v51, s19, v53, 24 +; SI-NEXT: v_alignbit_b32 v52, s19, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, s19, v53, 8 +; SI-NEXT: v_alignbit_b32 v54, s17, v40, 24 +; SI-NEXT: v_alignbit_b32 v55, s17, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, s17, v40, 8 +; SI-NEXT: s_lshr_b32 s56, s6, 24 +; SI-NEXT: s_lshr_b32 s57, s6, 16 +; SI-NEXT: s_lshr_b32 s58, s6, 8 +; SI-NEXT: s_lshr_b32 s59, s8, 24 +; SI-NEXT: s_lshr_b32 s60, s8, 16 +; SI-NEXT: s_lshr_b32 s61, s8, 8 +; SI-NEXT: s_lshr_b32 s62, s10, 24 +; SI-NEXT: s_lshr_b32 s63, s10, 16 +; SI-NEXT: s_lshr_b32 s72, s10, 8 +; SI-NEXT: s_lshr_b32 s73, s12, 24 +; SI-NEXT: s_lshr_b32 s74, s12, 16 +; SI-NEXT: s_lshr_b32 s75, s12, 8 +; SI-NEXT: s_lshr_b32 s76, s14, 24 +; SI-NEXT: s_lshr_b32 s77, s14, 16 +; SI-NEXT: s_lshr_b32 s78, s14, 8 +; SI-NEXT: s_lshr_b32 s79, s40, 24 +; SI-NEXT: s_lshr_b32 s88, s40, 16 +; SI-NEXT: s_lshr_b32 s89, s40, 8 +; SI-NEXT: s_lshr_b32 s90, s42, 24 +; SI-NEXT: s_lshr_b32 s91, s42, 16 +; SI-NEXT: s_lshr_b32 s92, s42, 8 +; SI-NEXT: s_lshr_b32 s93, s44, 24 +; SI-NEXT: s_lshr_b32 s94, s44, 16 +; SI-NEXT: s_lshr_b32 s95, s44, 8 +; SI-NEXT: s_lshr_b32 s30, s46, 24 +; SI-NEXT: s_lshr_b32 s31, s46, 16 +; SI-NEXT: s_lshr_b32 s34, s46, 8 +; SI-NEXT: s_lshr_b32 s35, s29, 24 +; SI-NEXT: s_lshr_b32 s36, s29, 16 +; SI-NEXT: s_lshr_b32 s37, s29, 8 +; SI-NEXT: s_lshr_b32 s38, s27, 24 +; 
SI-NEXT: s_lshr_b32 s39, s27, 16 +; SI-NEXT: s_lshr_b32 s48, s27, 8 +; SI-NEXT: s_lshr_b32 s49, s25, 24 +; SI-NEXT: s_lshr_b32 s50, s25, 16 +; SI-NEXT: s_lshr_b32 s51, s25, 8 +; SI-NEXT: s_lshr_b32 s52, s23, 24 +; SI-NEXT: s_lshr_b32 s53, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: s_lshr_b32 s55, s21, 24 +; SI-NEXT: s_lshr_b32 s64, s21, 16 +; SI-NEXT: s_lshr_b32 s65, s21, 8 +; SI-NEXT: s_lshr_b32 s66, s19, 24 +; SI-NEXT: s_lshr_b32 s67, s19, 16 +; SI-NEXT: s_lshr_b32 s68, s19, 8 +; SI-NEXT: s_lshr_b32 s69, s17, 24 +; SI-NEXT: s_lshr_b32 s70, s17, 16 +; SI-NEXT: s_lshr_b32 s71, s17, 8 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: v_or_b32_e32 v40, s4, v40 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s71, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s70, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s69, 24 +; SI-NEXT: v_and_b32_e32 v55, 0xff, v55 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v54 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v54, v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v53 +; SI-NEXT: v_or_b32_e32 v53, s4, v53 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s68, 8 +; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s67, 0xff +; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v51 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s66, 24 +; SI-NEXT: v_or_b32_e32 v54, v40, v54 +; SI-NEXT: v_and_b32_e32 v53, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: 
buffer_store_dword v54, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v54, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v51, v53, v51 +; SI-NEXT: v_add_i32_e32 v52, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v55, v54, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v51, v52, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v52, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v50 +; SI-NEXT: v_or_b32_e32 v50, s4, v50 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s64, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v48 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s55, 24 +; SI-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v48, v48, v49 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v48, v50, v48 +; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v52, v51, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v48, v49, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v49, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; SI-NEXT: v_or_b32_e32 v39, s4, v39 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: v_and_b32_e32 v38, 0xff, v38 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s53, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v37 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s52, 24 +; SI-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v48, vcc, 20, v0 +; 
SI-NEXT: v_or_b32_e32 v37, v39, v37 +; SI-NEXT: v_add_i32_e32 v38, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v49, v48, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v37, v38, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v38, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_or_b32_e32 v35, s4, v35 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s51, 8 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s50, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s49, 24 +; SI-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v33, v35, v33 +; SI-NEXT: v_add_i32_e32 v34, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v38, v37, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v34, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; SI-NEXT: v_or_b32_e32 v29, s4, v29 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s39, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s38, 24 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; 
SI-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_or_b32_e32 v23, s4, v23 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s36, 0xff +; SI-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v36 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s35, 24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v22, v27, v22 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v32 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v31 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s31, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v30 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s30, 24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v23, v27, v23 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword 
v22, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 60, v0 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v26 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: s_and_b32 s4, s44, 0xff +; SI-NEXT: s_lshl_b32 s5, s95, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v25 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s94, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s93, 24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v21, s4, v21 +; SI-NEXT: s_and_b32 s4, s42, 0xff +; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s91, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s90, 24 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: s_and_b32 
s4, s41, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v18, s4, v18 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s79, 24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s76, 24 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: 
s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s74, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s73, 24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s63, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s10, s62, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s10, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; 
SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s59, 24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s56, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s71, v41, 23 +; SI-NEXT: v_readlane_b32 s70, v41, 22 +; SI-NEXT: v_readlane_b32 s69, v41, 21 +; SI-NEXT: v_readlane_b32 s68, v41, 20 +; SI-NEXT: v_readlane_b32 s67, v41, 19 +; SI-NEXT: 
v_readlane_b32 s66, v41, 18 +; SI-NEXT: v_readlane_b32 s65, v41, 17 +; SI-NEXT: v_readlane_b32 s64, v41, 16 +; SI-NEXT: v_readlane_b32 s55, v41, 15 +; SI-NEXT: v_readlane_b32 s54, v41, 14 +; SI-NEXT: v_readlane_b32 s53, v41, 13 +; SI-NEXT: v_readlane_b32 s52, v41, 12 +; SI-NEXT: v_readlane_b32 s51, v41, 11 +; SI-NEXT: v_readlane_b32 s50, v41, 10 +; SI-NEXT: v_readlane_b32 s49, v41, 9 +; SI-NEXT: v_readlane_b32 s48, v41, 8 +; SI-NEXT: v_readlane_b32 s39, v41, 7 +; SI-NEXT: v_readlane_b32 s38, v41, 6 +; SI-NEXT: v_readlane_b32 s37, v41, 5 +; SI-NEXT: v_readlane_b32 s36, v41, 4 +; SI-NEXT: v_readlane_b32 s35, v41, 3 +; SI-NEXT: v_readlane_b32 s34, v41, 2 +; SI-NEXT: v_readlane_b32 s31, v41, 1 +; SI-NEXT: v_readlane_b32 s30, v41, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: 
$sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: 
; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v16i64_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s34, 2 +; VI-NEXT: v_writelane_b32 v20, s35, 3 +; VI-NEXT: v_writelane_b32 v20, s36, 4 +; VI-NEXT: v_writelane_b32 v20, s37, 5 +; VI-NEXT: v_writelane_b32 v20, s38, 6 +; VI-NEXT: v_writelane_b32 v20, s39, 7 +; VI-NEXT: v_writelane_b32 v20, s48, 8 +; VI-NEXT: v_writelane_b32 v20, s49, 9 +; VI-NEXT: v_writelane_b32 v20, s50, 10 +; VI-NEXT: v_writelane_b32 v20, s51, 11 +; VI-NEXT: v_writelane_b32 v20, s52, 12 +; VI-NEXT: v_writelane_b32 v20, s53, 13 +; VI-NEXT: v_writelane_b32 v20, s54, 14 +; VI-NEXT: v_writelane_b32 v20, s55, 15 +; VI-NEXT: v_writelane_b32 v20, s64, 16 +; VI-NEXT: v_writelane_b32 v20, s65, 17 +; VI-NEXT: v_writelane_b32 v20, s66, 18 +; VI-NEXT: v_writelane_b32 v20, s67, 19 +; VI-NEXT: v_writelane_b32 v20, s68, 20 +; VI-NEXT: v_writelane_b32 v20, s69, 21 +; VI-NEXT: v_writelane_b32 v20, s70, 22 +; VI-NEXT: v_writelane_b32 v20, s71, 23 +; VI-NEXT: v_writelane_b32 v20, s80, 24 +; VI-NEXT: v_writelane_b32 v20, s81, 25 +; VI-NEXT: v_writelane_b32 v20, s82, 26 +; VI-NEXT: v_writelane_b32 v20, s83, 27 +; VI-NEXT: v_writelane_b32 
v20, s84, 28 +; VI-NEXT: v_writelane_b32 v20, s85, 29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v20, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s45, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s43, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 +; VI-NEXT: v_readfirstlane_b32 s4, v17 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: v_writelane_b32 v20, s87, 31 +; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 19 +; VI-NEXT: 
s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s40, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s43, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s43, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 45 +; VI-NEXT: s_lshr_b32 
s46, s42, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 58 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 59 +; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 6 +; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 4 +; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: s_lshr_b32 s66, s27, 8 +; VI-NEXT: s_lshr_b32 s67, s26, 16 +; VI-NEXT: s_lshr_b32 s68, s26, 8 +; VI-NEXT: s_lshr_b32 s69, s25, 24 +; VI-NEXT: s_lshr_b32 s70, s25, 16 +; VI-NEXT: s_lshr_b32 s71, s25, 8 +; VI-NEXT: s_lshr_b32 s80, s24, 16 +; VI-NEXT: s_lshr_b32 s81, s24, 8 +; VI-NEXT: s_lshr_b32 s82, s23, 24 +; VI-NEXT: s_lshr_b32 s83, s23, 16 +; VI-NEXT: s_lshr_b32 s84, s23, 8 +; VI-NEXT: s_lshr_b32 s85, s22, 16 +; VI-NEXT: s_lshr_b32 s86, s22, 8 +; VI-NEXT: s_lshr_b32 s87, 
s21, 24 +; VI-NEXT: s_lshr_b32 s50, s21, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 8 +; VI-NEXT: s_lshr_b32 s57, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s58, s17, 24 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s55, s17, 8 +; VI-NEXT: s_lshr_b32 s64, s16, 16 +; VI-NEXT: s_lshr_b32 s65, s16, 8 +; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s44, s44, 3 +; VI-NEXT: s_addc_u32 s45, s45, 0 +; VI-NEXT: s_add_u32 s42, s42, 3 +; VI-NEXT: s_addc_u32 s43, s43, 0 +; VI-NEXT: s_add_u32 s40, s40, 3 +; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_add_u32 s14, s14, 3 +; VI-NEXT: s_addc_u32 s15, s15, 0 +; VI-NEXT: s_add_u32 s12, s12, 3 +; VI-NEXT: s_addc_u32 s13, s13, 0 
+; VI-NEXT: s_add_u32 s10, s10, 3 +; VI-NEXT: s_addc_u32 s11, s11, 0 +; VI-NEXT: s_add_u32 s8, s8, 3 +; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s4, s4, 3 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 
v21, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s40, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s43, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s43, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s42, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v21, 
s46, 56 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 58 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 59 +; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 6 +; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 4 +; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b32 s66, s27, 8 +; VI-NEXT: s_lshr_b32 s67, s26, 16 +; VI-NEXT: s_lshr_b32 s68, s26, 8 +; VI-NEXT: s_lshr_b32 s69, s25, 24 +; VI-NEXT: s_lshr_b32 s70, s25, 16 +; VI-NEXT: s_lshr_b32 s71, s25, 8 +; VI-NEXT: s_lshr_b32 s80, s24, 16 +; VI-NEXT: s_lshr_b32 s81, s24, 8 +; VI-NEXT: s_lshr_b32 s82, s23, 24 +; VI-NEXT: s_lshr_b32 s83, s23, 16 +; VI-NEXT: s_lshr_b32 s84, s23, 8 +; VI-NEXT: s_lshr_b32 s85, s22, 16 +; VI-NEXT: s_lshr_b32 s86, s22, 8 +; VI-NEXT: s_lshr_b32 s87, s21, 24 +; VI-NEXT: s_lshr_b32 s50, s21, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s20, 8 +; VI-NEXT: s_lshr_b32 s57, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s58, s17, 24 +; VI-NEXT: s_lshr_b32 s59, s17, 16 +; VI-NEXT: s_lshr_b32 s55, s17, 8 +; VI-NEXT: s_lshr_b32 s64, s16, 16 +; VI-NEXT: s_lshr_b32 s65, s16, 8 +; VI-NEXT: v_writelane_b32 v21, s60, 0 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 
s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_lshl_b32 s61, s65, 8 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_or_b32 s16, s16, s61 +; VI-NEXT: s_lshl_b32 s61, s48, 8 +; VI-NEXT: s_and_b32 s63, s64, 0xff +; VI-NEXT: s_or_b32 s61, s63, s61 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s61, s61, 16 +; VI-NEXT: s_or_b32 s16, s16, s61 +; VI-NEXT: v_mov_b32_e32 v1, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: s_lshl_b32 s17, s55, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s59, 0xff +; VI-NEXT: s_lshl_b32 s58, s58, 8 +; VI-NEXT: s_or_b32 s17, s17, s58 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_lshl_b32 s16, s54, 8 +; VI-NEXT: s_and_b32 s17, s18, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s38, 8 +; VI-NEXT: s_and_b32 s18, s53, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xff +; VI-NEXT: s_lshl_b32 s17, s52, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s51, 0xff +; VI-NEXT: s_lshl_b32 s18, s57, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: s_lshl_b32 s16, s56, 8 +; VI-NEXT: s_and_b32 s17, s20, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s36, 8 +; VI-NEXT: s_and_b32 s18, s47, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 
0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xff +; VI-NEXT: s_lshl_b32 s17, s46, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s50, 0xff +; VI-NEXT: s_lshl_b32 s18, s87, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v6, s16 +; VI-NEXT: s_lshl_b32 s16, s86, 8 +; VI-NEXT: s_and_b32 s17, s22, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s34, 8 +; VI-NEXT: s_and_b32 s18, s85, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xff +; VI-NEXT: s_lshl_b32 s17, s84, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s83, 0xff +; VI-NEXT: s_lshl_b32 s18, s82, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v8, s16 +; VI-NEXT: s_lshl_b32 s16, s81, 8 +; VI-NEXT: s_and_b32 s17, s24, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s30, 8 +; VI-NEXT: s_and_b32 s18, s80, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xff +; VI-NEXT: s_lshl_b32 s17, s71, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s70, 0xff +; VI-NEXT: s_lshl_b32 s18, s69, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: s_lshl_b32 s16, s68, 8 +; VI-NEXT: s_and_b32 s17, s26, 0xff +; VI-NEXT: s_or_b32 
s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s90, 8 +; VI-NEXT: s_and_b32 s18, s67, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: s_and_b32 s16, s27, 0xff +; VI-NEXT: s_lshl_b32 s17, s66, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 59 +; VI-NEXT: v_readlane_b32 s18, v21, 58 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v12, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 57 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s28, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 56 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s88, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 55 +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: s_and_b32 s16, s29, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 54 +; VI-NEXT: v_readlane_b32 s18, v21, 53 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v14, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 52 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s44, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 51 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s78, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 
v1, vcc, 4, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 50 +; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: s_and_b32 s16, s45, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: v_readlane_b32 s17, v21, 49 +; VI-NEXT: v_readlane_b32 s18, v21, 48 +; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 47 +; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s42, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 46 +; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s76, 8 +; VI-NEXT: buffer_store_dword 
v13, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 45 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s43, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 44 +; VI-NEXT: v_readlane_b32 s18, v21, 43 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 42 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s40, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 41 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s74, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 40 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s41, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 39 +; VI-NEXT: v_readlane_b32 s18, v21, 38 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: 
s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x48, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 37 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 36 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s17, s72, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x4c, v0 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s14, s15, 0xff +; VI-NEXT: v_readlane_b32 s15, v21, 35 +; VI-NEXT: s_lshl_b32 s15, s15, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: v_readlane_b32 s15, v21, 34 +; VI-NEXT: v_readlane_b32 s16, v21, 33 +; VI-NEXT: s_and_b32 s15, s15, 0xff +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v0 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_readlane_b32 s14, v21, 32 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: v_readlane_b32 s14, v21, 31 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s15, s62, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s12, s13, 0xff +; VI-NEXT: v_readlane_b32 s13, v21, 30 +; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; 
VI-NEXT: v_readlane_b32 s13, v21, 29 +; VI-NEXT: v_readlane_b32 s14, v21, 28 +; VI-NEXT: s_and_b32 s13, s13, 0xff +; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_or_b32 s13, s13, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v0 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: v_readlane_b32 s12, v21, 27 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: v_readlane_b32 s12, v21, 26 +; VI-NEXT: v_readlane_b32 s14, v21, 0 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s13, s14, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v0 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s10, s11, 0xff +; VI-NEXT: v_readlane_b32 s11, v21, 25 +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: v_readlane_b32 s11, v21, 24 +; VI-NEXT: v_readlane_b32 s12, v21, 23 +; VI-NEXT: s_and_b32 s11, s11, 0xff +; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_or_b32 s11, s11, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_readlane_b32 s10, v21, 22 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: v_readlane_b32 s10, v21, 21 +; VI-NEXT: v_readlane_b32 s12, v21, 2 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: 
v_add_u32_e32 v1, vcc, 0x64, v0 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s8, s9, 0xff +; VI-NEXT: v_readlane_b32 s9, v21, 20 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: v_readlane_b32 s9, v21, 19 +; VI-NEXT: v_readlane_b32 s10, v21, 18 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_readlane_b32 s8, v21, 17 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_readlane_b32 s8, v21, 16 +; VI-NEXT: v_readlane_b32 s10, v21, 4 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_and_b32 s6, s7, 0xff +; VI-NEXT: v_readlane_b32 s7, v21, 15 +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 14 +; VI-NEXT: v_readlane_b32 s8, v21, 13 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x70, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_readlane_b32 s6, v21, 12 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_readlane_b32 s6, v21, 11 +; 
VI-NEXT: v_readlane_b32 s8, v21, 6 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s7, s8, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: v_readlane_b32 s5, v21, 10 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_readlane_b32 s5, v21, 9 +; VI-NEXT: v_readlane_b32 s6, v21, 8 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s15, v21, 1 +; VI-NEXT: v_readlane_b32 s13, v21, 3 +; VI-NEXT: v_readlane_b32 s11, v21, 5 +; VI-NEXT: v_readlane_b32 s9, v21, 7 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s87, v20, 31 +; VI-NEXT: v_readlane_b32 s86, v20, 30 +; VI-NEXT: v_readlane_b32 s85, v20, 29 +; VI-NEXT: v_readlane_b32 s84, v20, 28 +; VI-NEXT: v_readlane_b32 s83, v20, 27 +; VI-NEXT: v_readlane_b32 s82, v20, 26 +; VI-NEXT: v_readlane_b32 s81, v20, 25 +; VI-NEXT: v_readlane_b32 s80, v20, 24 +; VI-NEXT: v_readlane_b32 s71, v20, 23 +; VI-NEXT: v_readlane_b32 s70, v20, 22 +; VI-NEXT: v_readlane_b32 s69, v20, 21 +; VI-NEXT: v_readlane_b32 s68, v20, 20 +; VI-NEXT: v_readlane_b32 s67, v20, 19 +; VI-NEXT: v_readlane_b32 s66, v20, 18 +; VI-NEXT: v_readlane_b32 s65, v20, 17 +; VI-NEXT: v_readlane_b32 s64, v20, 16 +; VI-NEXT: v_readlane_b32 s55, v20, 15 +; VI-NEXT: v_readlane_b32 s54, v20, 14 +; VI-NEXT: v_readlane_b32 s53, v20, 13 +; VI-NEXT: v_readlane_b32 s52, v20, 12 +; VI-NEXT: 
v_readlane_b32 s51, v20, 11 +; VI-NEXT: v_readlane_b32 s50, v20, 10 +; VI-NEXT: v_readlane_b32 s49, v20, 9 +; VI-NEXT: v_readlane_b32 s48, v20, 8 +; VI-NEXT: v_readlane_b32 s39, v20, 7 +; VI-NEXT: v_readlane_b32 s38, v20, 6 +; VI-NEXT: v_readlane_b32 s37, v20, 5 +; VI-NEXT: v_readlane_b32 s36, v20, 4 +; VI-NEXT: v_readlane_b32 s35, v20, 3 +; VI-NEXT: v_readlane_b32 s34, v20, 2 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: 
$sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed 
$sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 
0 +; VI-NEXT: v_writelane_b32 v21, s61, 1 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 2 +; VI-NEXT: v_writelane_b32 v21, s61, 3 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 4 +; VI-NEXT: v_writelane_b32 v21, s61, 5 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: v_writelane_b32 v21, s60, 6 +; VI-NEXT: v_writelane_b32 v21, s61, 7 +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v16i64_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_writelane_b32 v20, s34, 2 +; GFX9-NEXT: v_writelane_b32 v20, s35, 3 +; GFX9-NEXT: v_writelane_b32 v20, s36, 4 +; GFX9-NEXT: v_writelane_b32 v20, s37, 5 +; GFX9-NEXT: v_writelane_b32 v20, s38, 6 +; GFX9-NEXT: v_writelane_b32 v20, s39, 7 +; GFX9-NEXT: v_writelane_b32 v20, s48, 8 +; GFX9-NEXT: v_writelane_b32 v20, s49, 9 +; GFX9-NEXT: v_writelane_b32 v20, s50, 10 +; GFX9-NEXT: v_writelane_b32 v20, s51, 11 +; GFX9-NEXT: v_writelane_b32 v20, s52, 12 +; GFX9-NEXT: v_writelane_b32 v20, s53, 13 +; GFX9-NEXT: v_writelane_b32 v20, s54, 14 +; GFX9-NEXT: v_writelane_b32 v20, s55, 15 +; GFX9-NEXT: v_writelane_b32 v20, s64, 16 +; GFX9-NEXT: v_writelane_b32 v20, s65, 17 +; GFX9-NEXT: v_writelane_b32 v20, s66, 18 +; GFX9-NEXT: v_writelane_b32 v20, s67, 19 +; GFX9-NEXT: v_writelane_b32 v20, s68, 20 +; GFX9-NEXT: v_writelane_b32 v20, s69, 21 +; GFX9-NEXT: v_writelane_b32 v20, s70, 22 +; GFX9-NEXT: v_writelane_b32 v20, s71, 23 +; GFX9-NEXT: v_writelane_b32 v20, s80, 24 +; GFX9-NEXT: v_writelane_b32 v20, s81, 25 +; GFX9-NEXT: v_writelane_b32 v20, s82, 26 +; GFX9-NEXT: v_writelane_b32 v20, s83, 27 +; GFX9-NEXT: 
v_writelane_b32 v20, s84, 28 +; GFX9-NEXT: v_writelane_b32 v20, s85, 29 +; GFX9-NEXT: v_writelane_b32 v20, s86, 30 +; GFX9-NEXT: v_writelane_b32 v20, s87, 31 +; GFX9-NEXT: v_writelane_b32 v20, s96, 32 +; GFX9-NEXT: v_writelane_b32 v20, s97, 33 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v20, s98, 34 +; GFX9-NEXT: v_readfirstlane_b32 s44, v1 +; GFX9-NEXT: v_readfirstlane_b32 s45, v2 +; GFX9-NEXT: v_readfirstlane_b32 s42, v3 +; GFX9-NEXT: v_readfirstlane_b32 s43, v4 +; GFX9-NEXT: v_readfirstlane_b32 s40, v5 +; GFX9-NEXT: v_readfirstlane_b32 s41, v6 +; GFX9-NEXT: v_readfirstlane_b32 s14, v7 +; GFX9-NEXT: v_readfirstlane_b32 s15, v8 +; GFX9-NEXT: v_readfirstlane_b32 s12, v9 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_readfirstlane_b32 s10, v11 +; GFX9-NEXT: v_readfirstlane_b32 s11, v12 +; GFX9-NEXT: v_readfirstlane_b32 s8, v13 +; GFX9-NEXT: v_readfirstlane_b32 s9, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 +; GFX9-NEXT: v_readfirstlane_b32 s4, v17 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v18 +; GFX9-NEXT: v_writelane_b32 v20, s99, 35 +; GFX9-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: 
v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s10, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s13, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s12, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s12, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s15, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s14, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s14, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s41, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s41, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s40, 16 +; 
GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s40, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s43, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s43, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s42, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s42, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s45, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s45, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s44, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s44, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 50 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v21, s56, 0 +; GFX9-NEXT: s_lshr_b32 s82, s28, 8 +; GFX9-NEXT: s_lshr_b32 s83, s27, 24 +; GFX9-NEXT: s_lshr_b32 s81, s27, 16 +; GFX9-NEXT: s_lshr_b32 s84, s27, 8 +; GFX9-NEXT: s_lshr_b32 s85, s26, 16 +; GFX9-NEXT: s_lshr_b32 s86, s26, 8 +; GFX9-NEXT: s_lshr_b32 s87, s25, 24 +; GFX9-NEXT: s_lshr_b32 s96, s25, 16 +; GFX9-NEXT: s_lshr_b32 s97, s25, 8 +; GFX9-NEXT: s_lshr_b32 s98, s24, 16 +; GFX9-NEXT: s_lshr_b32 s99, s24, 8 +; GFX9-NEXT: s_lshr_b32 s38, s23, 24 +; GFX9-NEXT: s_lshr_b32 s39, s23, 16 +; GFX9-NEXT: s_lshr_b32 s48, s23, 8 +; GFX9-NEXT: s_lshr_b32 s49, s22, 16 +; GFX9-NEXT: s_lshr_b32 s50, s22, 8 +; GFX9-NEXT: s_lshr_b32 s51, s21, 24 +; GFX9-NEXT: 
s_lshr_b32 s52, s21, 16 +; GFX9-NEXT: s_lshr_b32 s53, s21, 8 +; GFX9-NEXT: s_lshr_b32 s54, s20, 16 +; GFX9-NEXT: s_lshr_b32 s55, s20, 8 +; GFX9-NEXT: s_lshr_b32 s64, s19, 24 +; GFX9-NEXT: s_lshr_b32 s65, s19, 16 +; GFX9-NEXT: s_lshr_b32 s66, s19, 8 +; GFX9-NEXT: s_lshr_b32 s67, s18, 16 +; GFX9-NEXT: s_lshr_b32 s68, s18, 8 +; GFX9-NEXT: s_lshr_b32 s69, s17, 24 +; GFX9-NEXT: s_lshr_b32 s70, s17, 16 +; GFX9-NEXT: s_lshr_b32 s71, s17, 8 +; GFX9-NEXT: s_lshr_b32 s80, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s16, 8 +; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB57_3 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s44, s44, 3 +; GFX9-NEXT: s_addc_u32 s45, s45, 0 +; GFX9-NEXT: s_add_u32 s42, s42, 3 +; GFX9-NEXT: s_addc_u32 s43, 
s43, 0 +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s4, s4, 3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s11, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: 
v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s10, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s13, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s12, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s12, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s15, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s14, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s14, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s41, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s41, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s40, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s40, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s43, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s43, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s42, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s42, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s45, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s45, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s44, 
16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s44, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 50 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v21, s56, 0 +; GFX9-NEXT: s_lshr_b32 s82, s28, 8 +; GFX9-NEXT: s_lshr_b32 s83, s27, 24 +; GFX9-NEXT: s_lshr_b32 s81, s27, 16 +; GFX9-NEXT: s_lshr_b32 s84, s27, 8 +; GFX9-NEXT: s_lshr_b32 s85, s26, 16 +; GFX9-NEXT: s_lshr_b32 s86, s26, 8 +; GFX9-NEXT: s_lshr_b32 s87, s25, 24 +; GFX9-NEXT: s_lshr_b32 s96, s25, 16 +; GFX9-NEXT: s_lshr_b32 s97, s25, 8 +; GFX9-NEXT: s_lshr_b32 s98, s24, 16 +; GFX9-NEXT: s_lshr_b32 s99, s24, 8 +; GFX9-NEXT: s_lshr_b32 s38, s23, 24 +; GFX9-NEXT: s_lshr_b32 s39, s23, 16 +; GFX9-NEXT: s_lshr_b32 s48, s23, 8 +; GFX9-NEXT: s_lshr_b32 s49, s22, 16 +; GFX9-NEXT: s_lshr_b32 s50, s22, 8 +; GFX9-NEXT: s_lshr_b32 s51, s21, 24 +; GFX9-NEXT: s_lshr_b32 s52, s21, 16 +; GFX9-NEXT: s_lshr_b32 s53, s21, 8 +; GFX9-NEXT: s_lshr_b32 s54, s20, 16 +; GFX9-NEXT: s_lshr_b32 s55, s20, 8 +; GFX9-NEXT: s_lshr_b32 s64, s19, 24 +; GFX9-NEXT: s_lshr_b32 s65, s19, 16 +; GFX9-NEXT: s_lshr_b32 s66, s19, 8 +; GFX9-NEXT: s_lshr_b32 s67, s18, 16 +; GFX9-NEXT: s_lshr_b32 s68, s18, 8 +; GFX9-NEXT: s_lshr_b32 s69, s17, 24 +; GFX9-NEXT: s_lshr_b32 s70, s17, 16 +; GFX9-NEXT: s_lshr_b32 s71, s17, 8 +; GFX9-NEXT: s_lshr_b32 s80, s16, 16 +; GFX9-NEXT: s_lshr_b32 s46, s16, 8 +; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX9-NEXT: 
s_lshr_b64 s[74:75], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: .LBB57_3: ; %end +; GFX9-NEXT: s_lshl_b32 s46, s46, 8 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_or_b32 s16, s16, s46 +; GFX9-NEXT: s_lshl_b32 s46, s36, 8 +; GFX9-NEXT: s_and_b32 s47, s80, 0xff +; GFX9-NEXT: s_or_b32 s46, s47, s46 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s46, s46, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s46 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s71, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s70, 0xff +; GFX9-NEXT: s_lshl_b32 s46, s69, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s46 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: s_lshl_b32 s16, s68, 8 +; GFX9-NEXT: s_and_b32 s17, s18, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s34, 8 +; GFX9-NEXT: s_and_b32 s18, s67, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: s_and_b32 s16, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s66, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s65, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s64, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: 
v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_lshl_b32 s16, s55, 8 +; GFX9-NEXT: s_and_b32 s17, s20, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s30, 8 +; GFX9-NEXT: s_and_b32 s18, s54, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-NEXT: s_and_b32 s16, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s53, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s51, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s16 +; GFX9-NEXT: s_lshl_b32 s16, s50, 8 +; GFX9-NEXT: s_and_b32 s17, s22, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s94, 8 +; GFX9-NEXT: s_and_b32 s18, s49, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s16 +; GFX9-NEXT: s_and_b32 s16, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s48, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s38, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: s_lshl_b32 s16, s99, 8 +; GFX9-NEXT: s_and_b32 s17, s24, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s92, 8 +; GFX9-NEXT: s_and_b32 s18, s98, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: s_and_b32 s16, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s97, 8 +; 
GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s96, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s87, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: s_lshl_b32 s16, s86, 8 +; GFX9-NEXT: s_and_b32 s17, s26, 0xff +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s90, 8 +; GFX9-NEXT: s_and_b32 s18, s85, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s16 +; GFX9-NEXT: s_and_b32 s16, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s84, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s17, s81, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s83, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-NEXT: s_lshl_b32 s16, s82, 8 +; GFX9-NEXT: s_and_b32 s17, s28, 0xff +; GFX9-NEXT: v_readlane_b32 s18, v21, 50 +; GFX9-NEXT: s_or_b32 s16, s17, s16 +; GFX9-NEXT: s_lshl_b32 s17, s88, 8 +; GFX9-NEXT: s_and_b32 s18, s18, 0xff +; GFX9-NEXT: s_or_b32 s17, s18, s17 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 49 +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: s_and_b32 s16, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 48 +; GFX9-NEXT: v_readlane_b32 s18, v21, 47 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 46 +; 
GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s44, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 45 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s78, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 44 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s45, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 43 +; GFX9-NEXT: v_readlane_b32 s18, v21, 42 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 41 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; 
GFX9-NEXT: s_and_b32 s16, s42, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 40 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s76, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 39 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s43, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 38 +; GFX9-NEXT: v_readlane_b32 s18, v21, 37 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 36 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s40, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 35 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s74, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 34 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_and_b32 s16, s41, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s17, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: v_readlane_b32 s17, v21, 33 +; GFX9-NEXT: v_readlane_b32 s18, v21, 32 +; GFX9-NEXT: s_and_b32 s17, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s18, 8 +; GFX9-NEXT: s_or_b32 s17, s17, s18 +; GFX9-NEXT: s_and_b32 s16, s16, 0xffff +; GFX9-NEXT: 
s_lshl_b32 s17, s17, 16 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_readlane_b32 s16, v21, 31 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s16, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: v_readlane_b32 s16, v21, 30 +; GFX9-NEXT: s_and_b32 s16, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s72, 8 +; GFX9-NEXT: s_or_b32 s16, s16, s17 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s16, s16, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s16 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s14, s15, 0xff +; GFX9-NEXT: v_readlane_b32 s15, v21, 29 +; GFX9-NEXT: s_lshl_b32 s15, s15, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: v_readlane_b32 s15, v21, 28 +; GFX9-NEXT: v_readlane_b32 s16, v21, 27 +; GFX9-NEXT: s_and_b32 s15, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s16, 8 +; GFX9-NEXT: s_or_b32 s15, s15, s16 +; GFX9-NEXT: s_and_b32 s14, s14, 0xffff +; GFX9-NEXT: s_lshl_b32 s15, s15, 16 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_readlane_b32 s14, v21, 26 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s14, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: v_readlane_b32 s14, v21, 25 +; GFX9-NEXT: s_and_b32 s14, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s62, 8 +; GFX9-NEXT: s_or_b32 s14, s14, s15 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s14, s14, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s14 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_and_b32 s12, s13, 0xff +; GFX9-NEXT: v_readlane_b32 s13, v21, 24 +; GFX9-NEXT: s_lshl_b32 s13, s13, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: v_readlane_b32 s13, v21, 23 +; 
GFX9-NEXT: v_readlane_b32 s14, v21, 22 +; GFX9-NEXT: s_and_b32 s13, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s14, 8 +; GFX9-NEXT: s_or_b32 s13, s13, s14 +; GFX9-NEXT: s_and_b32 s12, s12, 0xffff +; GFX9-NEXT: s_lshl_b32 s13, s13, 16 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_readlane_b32 s12, v21, 21 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: v_readlane_b32 s12, v21, 20 +; GFX9-NEXT: s_and_b32 s12, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s60, 8 +; GFX9-NEXT: s_or_b32 s12, s12, s13 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s12, s12, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s12 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff +; GFX9-NEXT: v_readlane_b32 s11, v21, 19 +; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: v_readlane_b32 s11, v21, 18 +; GFX9-NEXT: v_readlane_b32 s12, v21, 17 +; GFX9-NEXT: s_and_b32 s11, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_or_b32 s11, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xffff +; GFX9-NEXT: s_lshl_b32 s11, s11, 16 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_readlane_b32 s10, v21, 16 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: v_readlane_b32 s10, v21, 15 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s58, 8 +; GFX9-NEXT: s_or_b32 s10, s10, s11 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s10 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; 
GFX9-NEXT: s_and_b32 s8, s9, 0xff +; GFX9-NEXT: v_readlane_b32 s9, v21, 14 +; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: v_readlane_b32 s9, v21, 13 +; GFX9-NEXT: v_readlane_b32 s10, v21, 12 +; GFX9-NEXT: s_and_b32 s9, s9, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 11 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 10 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s56, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_and_b32 s6, s7, 0xff +; GFX9-NEXT: v_readlane_b32 s7, v21, 9 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_readlane_b32 s7, v21, 8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 7 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_readlane_b32 s6, v21, 6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: v_readlane_b32 s6, v21, 5 +; GFX9-NEXT: v_readlane_b32 s8, v21, 0 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s8, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, 
s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: v_readlane_b32 s5, v21, 4 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_readlane_b32 s5, v21, 3 +; GFX9-NEXT: v_readlane_b32 s6, v21, 2 +; GFX9-NEXT: s_and_b32 s5, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_readlane_b32 s9, v21, 1 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: v_readlane_b32 s99, v20, 35 +; GFX9-NEXT: v_readlane_b32 s98, v20, 34 +; GFX9-NEXT: v_readlane_b32 s97, v20, 33 +; GFX9-NEXT: v_readlane_b32 s96, v20, 32 +; GFX9-NEXT: v_readlane_b32 s87, v20, 31 +; GFX9-NEXT: v_readlane_b32 s86, v20, 30 +; GFX9-NEXT: v_readlane_b32 s85, v20, 29 +; GFX9-NEXT: v_readlane_b32 s84, v20, 28 +; GFX9-NEXT: v_readlane_b32 s83, v20, 27 +; GFX9-NEXT: v_readlane_b32 s82, v20, 26 +; GFX9-NEXT: v_readlane_b32 s81, v20, 25 +; GFX9-NEXT: v_readlane_b32 s80, v20, 24 +; GFX9-NEXT: v_readlane_b32 s71, v20, 23 +; GFX9-NEXT: v_readlane_b32 s70, v20, 22 +; GFX9-NEXT: v_readlane_b32 s69, v20, 21 +; GFX9-NEXT: v_readlane_b32 s68, v20, 20 +; GFX9-NEXT: v_readlane_b32 s67, v20, 19 +; GFX9-NEXT: v_readlane_b32 s66, v20, 18 +; GFX9-NEXT: v_readlane_b32 s65, v20, 17 +; GFX9-NEXT: v_readlane_b32 s64, v20, 16 +; GFX9-NEXT: v_readlane_b32 s55, v20, 15 +; GFX9-NEXT: v_readlane_b32 s54, v20, 14 +; GFX9-NEXT: v_readlane_b32 s53, v20, 13 +; GFX9-NEXT: v_readlane_b32 s52, v20, 12 +; GFX9-NEXT: v_readlane_b32 s51, v20, 11 +; GFX9-NEXT: v_readlane_b32 s50, v20, 10 +; GFX9-NEXT: v_readlane_b32 s49, v20, 9 +; GFX9-NEXT: v_readlane_b32 s48, v20, 8 +; GFX9-NEXT: v_readlane_b32 
s39, v20, 7 +; GFX9-NEXT: v_readlane_b32 s38, v20, 6 +; GFX9-NEXT: v_readlane_b32 s37, v20, 5 +; GFX9-NEXT: v_readlane_b32 s36, v20, 4 +; GFX9-NEXT: v_readlane_b32 s35, v20, 3 +; GFX9-NEXT: v_readlane_b32 s34, v20, 2 +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: v_writelane_b32 v21, s82, 0 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: 
$sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: v_writelane_b32 v21, s83, 1 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: 
killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: 
$sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; kill: killed $sgpr47 +; GFX9-NEXT: s_branch .LBB57_2 +; +; GFX11-TRUE16-LABEL: bitcast_v16i64_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v16, s32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v17, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v18, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v19, s32 offset:12 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14 
+; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s102, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[4:5], 
24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 9 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s27, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s27, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 7 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s26, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s25, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 5 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s25, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 2 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s50, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s24, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s22, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s21, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s19, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s18, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s1, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s0, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[26:27], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 16 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s57 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_3 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s40, 
s40, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s41, s41, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s14, s14, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s12, s12, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[4:5], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s27, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 9 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s26, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 7 +; GFX11-TRUE16-NEXT: s_lshr_b64 
s[58:59], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s26, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s25, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 5 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s25, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s24, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s23, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s21, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s20, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 19 
+; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s16, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s1, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s0, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[24:25], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 
16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-TRUE16-NEXT: .LBB57_3: ; %end +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s99 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s30 +; GFX11-TRUE16-NEXT: s_and_b32 
s57, s57, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s56, s57, s56 +; GFX11-TRUE16-NEXT: s_lshl_b32 s47, s47, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s47 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s94 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s98 +; GFX11-TRUE16-NEXT: s_and_b32 s46, s46, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s47, s47, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s46, s46, s47 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s97 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s96 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s46, s46, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s56, s56, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s57, s57, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s87 +; GFX11-TRUE16-NEXT: s_or_b32 s56, s56, s57 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s46, s46, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s56, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s86 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_and_b32 s46, s46, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s45, s45, 8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s45, s46, s45 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s45, s45, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s45 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s92 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s85 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s84 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s43 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s83 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s82 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s18 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s42 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s71 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s90 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s81 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v8, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s70 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s16, s80 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s88 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s69 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s68 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s66 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s67 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s64 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s22 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s65 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s55 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s76 +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s53 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s3 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s16, s23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s54 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s17, s17, s18 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s16, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s17, 16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[3:6], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s52 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s51 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s78 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s50 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s49 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s48 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s39 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s38 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s74 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s37 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s36 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s35 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s104 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s72 +; GFX11-TRUE16-NEXT: v_readlane_b32 s17, v18, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s103 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s102 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 
0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s16, v18, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s101 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s62 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s17, v18, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s18, v18, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s19, v18, 4 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-TRUE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-TRUE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v18, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s2, v18, 6 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s60 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: 
s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s14, v18, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s100 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s14, s15 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s12, v18, 10 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s58 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s12, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v18, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s14, v18, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s15, v18, 13 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s12, s12, s13 +; GFX11-TRUE16-NEXT: s_or_b32 s13, s14, s15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s12, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s12, s13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s12, v19, 2 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v18, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s2, v18, 15 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v18, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v18, 18 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s10, s11 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v18, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v19, 4 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v19, 5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-TRUE16-NEXT: 
s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s8, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v18, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v18, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s11, v18, 23 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s8, s9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v19, 6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v18, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s2, v18, 25 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s6, v18, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s7, v18, 28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s6, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; 
GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s3, v18, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s4, v18, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s6, v19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_readlane_b32 s7, v19, 9 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s5, v18, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s6, v19, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s7, v19, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v19, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v19, 7 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:112 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v17, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v17, 7 +; GFX11-TRUE16-NEXT: 
v_readlane_b32 s102, v17, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v17, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v17, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v17, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v17, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v17, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v17, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v16, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v16, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v16, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v16, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v16, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v16, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v16, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v16, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v16, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v16, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v16, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v16, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v16, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v16, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v16, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v16, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v16, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v16, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v16, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v16, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v16, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v16, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v16, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v16, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v16, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v16, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v16, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v16, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v16, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v16, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v16, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; 
GFX11-TRUE16-NEXT: scratch_load_b32 v16, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v17, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v18, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v19, off, s32 offset:12 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 11 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 18 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 25 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s58, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s100, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s101, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; +; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v16, s32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v17, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v18, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v19, s32 offset:12 +; 
GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 s101, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s102, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s49, 9 +; 
GFX11-FAKE16-NEXT: v_writelane_b32 v16, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v16, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s5, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s7, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, 
s6, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s9, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s8, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s11, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s13, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s13, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s12, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s15, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s41, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s41, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 
s80, s40, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s28, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s27, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s26, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s43, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s25, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s2, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 15 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s17, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 13 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, 
s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 11 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 9 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 6 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s43, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 7 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 5 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], 
s[22:23], 24 +; GFX11-FAKE16-NEXT: s_branch .LBB57_3 +; GFX11-FAKE16-NEXT: .LBB57_2: +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: s_mov_b32 s101, -1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 0 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 1 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 2 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 3 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: 
; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 5 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 7 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 9 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_lo, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, vcc_hi, 15 +; GFX11-FAKE16-NEXT: .LBB57_3: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101 +; GFX11-FAKE16-NEXT: s_mov_b32 s101, s104 +; GFX11-FAKE16-NEXT: s_mov_b32 s104, s57 +; GFX11-FAKE16-NEXT: s_mov_b32 s57, s69 +; GFX11-FAKE16-NEXT: s_mov_b32 s69, s42 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_5 +; GFX11-FAKE16-NEXT: ; %bb.4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s40, s40, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s41, s41, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s14, s14, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s15, s15, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s13, s13, 0 +; 
GFX11-FAKE16-NEXT: s_add_u32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s5, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s7, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s9, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s8, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s11, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s10, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s52, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s13, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s13, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s12, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s15, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s41, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s41, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s40, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s28, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s27, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s26, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s42, 31 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s25, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s2, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 15 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s1, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 13 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s1, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 11 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 9 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], 
s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 6 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v18, s42, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 7 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 5 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[28:29], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s62, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v19, s63, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s42, s74, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s43 +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s94, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43 +; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s45, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s42 +; GFX11-FAKE16-NEXT: v_readlane_b32 s42, v18, 9 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s30, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s44, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 
s42, s42, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s44, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42 +; GFX11-FAKE16-NEXT: v_readlane_b32 s42, v18, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v18, 7 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s100, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s45, s98, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s99, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s42, s42, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s44, s44, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s42, s42, s43 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s42, s42, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v18, 6 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s42 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v18, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s92, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v18, 0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: 
v_readlane_b32 s2, v18, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 29 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v17, 4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v17, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v18, 1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v19, 28 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v19, 19 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s90, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 31 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 30 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s78, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s18, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s86, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 21 +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v17, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 22 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 26 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v16, 30 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 25 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v16, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v16, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v19, 18 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s62, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 
16 +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 20 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s16, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s17, s17, s18 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s17, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s97, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s2, v19, 17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s88, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s69, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s18, s72, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v17, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v16, 21 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s3, v19, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s73, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s96, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s76, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s27, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s87, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; 
GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s85, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s84, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 1 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s29, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s83, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s82, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s81, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s18, v19, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s40, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s61, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s80, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s18, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s19, v19, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s41, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s60, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s18, s71, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s70, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s16, s16, s17 +; GFX11-FAKE16-NEXT: s_or_b32 s17, s18, s19 +; 
GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s16, s17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s16, v19, 4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s58, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s59, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s16, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s15, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s68, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s14, s67, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s66, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s14, s15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s14, v19, 6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s12, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s65, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s64, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v19, 7 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s12, s14 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s13, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s55, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s14, s54, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s53, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s12, s12, s13 +; GFX11-FAKE16-NEXT: s_or_b32 s13, s14, s15 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; 
GFX11-FAKE16-NEXT: s_and_b32 s12, s12, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s13, s13, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s12, s13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s12, v19, 8 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s10, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s52, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s51, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s11, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s50, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s49, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s48, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s10, s11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v19, 10 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s8, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s39, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s38, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v19, 11 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s37, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s36, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s35, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 
s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s8, s9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v19, 12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s6, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s56, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s7, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s34, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_hi, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s46, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s6, s7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s6, v19, 14 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s4, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s47, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s104, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s7, v19, 15 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s5, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s103, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s102, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s101, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s17, v19, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v19, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v19, 13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v17, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v17, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v17, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v17, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v17, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v16, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v16, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v16, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v16, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v16, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v16, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v16, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v16, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v16, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v16, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v16, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v16, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v16, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v16, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v16, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v16, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v16, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v16, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v16, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v16, 10 +; GFX11-FAKE16-NEXT: 
v_readlane_b32 s49, v16, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v16, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v16, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v16, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v16, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v16, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v16, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v16, 2 +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v16, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v17, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v18, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v19, off, s32 offset:12 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:660 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: 
v_lshlrev_b32_e32 v41, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:144 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:184 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:260 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v61, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 -; GCN-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:308 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: 
v_lshlrev_b32_e32 v33, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:372 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], 
vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v39 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v2, 
0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v47 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v54 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 
offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v37 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v53 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:536 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v41 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v63 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v50 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v19, v49 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v20, v60 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v21, v58 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v22, v22, v62 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v23, v59 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v61 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v52 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:808 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_or_b32_e32 v28, v28, v35 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v33 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v44 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v57 -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v36, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v38, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v48, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v55, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v54, v49 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; 
GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v50 -; GCN-NEXT: v_or_b32_e32 v19, v19, v51 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_or_b32_e32 v21, v21, v53 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_or_b32_e32 v25, v25, v35 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v37 -; GCN-NEXT: v_or_b32_e32 v28, v28, v38 -; GCN-NEXT: v_or_b32_e32 v29, v29, v39 -; GCN-NEXT: v_or_b32_e32 v30, v30, v48 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; 
implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; 
GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: 
$vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: 
; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB29_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v39, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; 
GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v47, v3 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v54, v4 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v37, v6 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v45, v8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v53, v9 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v42, v10 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; 
GCN-NEXT: v_or_b32_e32 v11, v41, v11 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v40, v12 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v63, v13 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v50, v14 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v0, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v0, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v0, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, 
vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v51, v18 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v49, v19 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v60, v20 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v58, v21 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v25, v62, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v29, v59, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v37, v32, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v50, v61, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; 
GCN-NEXT: v_or_b32_e32 v41, v34, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v45, v52, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v56, v35, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v58, v33, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v59, v44, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v57 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v57, v36, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v60, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 
offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v23, v22 -; GCN-NEXT: 
buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], 
s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v48, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v53, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_mov_b32_e32 v0, v55 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v55, v53 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v40, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: 
v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v44, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v0, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v0, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v60, v0 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v61, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v63, v3 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v21 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v37 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s7, v50 -; GCN-NEXT: v_add_i32_e32 v41, vcc, s7, v41 -; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v45 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s7, v56 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v59 -; 
GCN-NEXT: v_add_i32_e32 v57, vcc, 0x300, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 -; GCN-NEXT: v_or_b32_e32 v4, v36, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v39, v6 -; GCN-NEXT: v_or_b32_e32 v7, v49, v7 -; GCN-NEXT: v_or_b32_e32 v8, v51, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v54, v10 -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: v_or_b32_e32 v12, v23, v12 -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: v_or_b32_e32 v15, v27, v15 -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v31, v18 -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: v_or_b32_e32 v20, v33, v20 -; GCN-NEXT: v_or_b32_e32 v21, v34, v21 -; GCN-NEXT: v_or_b32_e32 
v22, v35, v25 -; GCN-NEXT: v_or_b32_e32 v23, v48, v29 -; GCN-NEXT: v_or_b32_e32 v24, v53, v37 -; GCN-NEXT: v_or_b32_e32 v25, v55, v50 -; GCN-NEXT: v_or_b32_e32 v26, v40, v41 -; GCN-NEXT: v_or_b32_e32 v27, v42, v45 -; GCN-NEXT: v_or_b32_e32 v28, v43, v56 -; GCN-NEXT: v_or_b32_e32 v29, v44, v58 -; GCN-NEXT: v_or_b32_e32 v30, v46, v59 -; GCN-NEXT: v_or_b32_e32 v31, v47, v57 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 -; GCN-NEXT: .LBB29_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, 
off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; SI-NEXT: 
v_lshlrev_b32_e32 v53, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; 
SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; 
SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v46 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 +; 
SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, 
v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, 
s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, 
s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, 
v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, 
v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed 
$vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; 
SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; 
implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB58_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], 
s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], 
s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 
0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, 
vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v46, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; 
SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, 
v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 
+; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: 
v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, 
vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, 
vcc, s7, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 
+; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 +; SI-NEXT: .LBB58_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v16i64: ; VI: ; %bb.0: @@ -44026,7 +89740,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -44499,9 +90213,9 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB29_2: ; %Flow +; VI-NEXT: .LBB58_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_4 +; VI-NEXT: s_cbranch_execz .LBB58_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -44890,7 +90604,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 -; VI-NEXT: .LBB29_4: ; %end +; VI-NEXT: .LBB58_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -45262,7 +90976,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -45736,9 +91450,9 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: .LBB29_2: ; %Flow +; GFX9-NEXT: .LBB58_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_4 +; GFX9-NEXT: s_cbranch_execz .LBB58_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -46133,7 +91847,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 -; GFX9-NEXT: .LBB29_4: ; %end +; GFX9-NEXT: .LBB58_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -46375,15 +92089,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_4 -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_4 +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB29_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h @@ -46559,49 +92273,535 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 +; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 
v13.l, v100.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h +; GFX11-TRUE16-NEXT: 
v_or_b16 v23.h, v69.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 
v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v37, 0xffff, v28 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 @@ 
-46612,551 +92812,5289 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-TRUE16-NEXT: .LBB29_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 
off, v88, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, 
s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:20 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 
8, v137 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v124 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v1, v1, v125 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v126 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v127 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v111 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v121 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v120 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v122 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v123 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v107 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v108 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v109 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v110 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v106 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 
v6, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v92 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v78 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v94 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v95 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v104 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v105 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v79 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v91 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, 
v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: .LBB58_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v53, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v52, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v51, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v50, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v124, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v125, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v126, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v127, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v49, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v48, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v36, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v35, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v39, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v34, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v111, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v120, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v121, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v122, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v107, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v108, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v109, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v38, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v110, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v106, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v7, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v33, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v32, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v92, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v78, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v77, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v76, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v75, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v74, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v60, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v59, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v93, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v94, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v95, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v104, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v105, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v79, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v88, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v89, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v90, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v91, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v58, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 +; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v62, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v63, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v72, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v73, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v45, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v46, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v47, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v56, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v57, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 
0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v26, v166, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v25, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u16 
v35, 0x300, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-FAKE16-NEXT: .LBB58_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 
offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v44, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v38, off, 
s[0:3], s32 offset:176 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], 
s32 offset:208 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 +; SI-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, 
off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 +; SI-NEXT: 
buffer_load_dword v38, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 
offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB59_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 
; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: v_mov_b32_e32 v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_mov_b32_e32 v55, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_mov_b32_e32 v44, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: 
v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 +; SI-NEXT: v_mov_b32_e32 v59, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; 
SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 
v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB59_3 +; SI-NEXT: .LBB59_2: +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; 
SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: .LBB59_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB59_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 
s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte 
Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 
v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 
+; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, 
v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; 
SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB59_5: ; 
%end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v16i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded 
Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 +; 
VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte 
Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 +; VI-NEXT: 
buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB59_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v3, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v29, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 
v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v59, v0 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v48, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v57, v0 +; 
VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB59_3 +; VI-NEXT: .LBB59_2: +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v32, 
v54 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: v_mov_b32_e32 v51, v7 +; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB59_3: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v44, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v46, v49 +; VI-NEXT: s_cbranch_vccnz .LBB59_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: s_add_i32 
s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; 
VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded 
Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 
3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt 
vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa 
v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 
+; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill 
+; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt 
vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: 
v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 +; 
GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(42) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 
offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded 
Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB59_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 
s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 
4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, 
v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v38 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 
offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v60 +; GFX9-NEXT: v_mov_b32_e32 v52, v56 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_mov_b32_e32 v34, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v53, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_or_b32 s4, 
s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB59_3 +; GFX9-NEXT: .LBB59_2: +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: 
v_mov_b32_e32 v57, v38 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB59_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB59_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 
s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) 
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 
3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; 
GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: 
v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, 
v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: 
s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: 
v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte 
Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 
offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v114, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v166, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v8, 16, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, 
v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 
0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 
v71.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: 
v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: 
s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_3 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 
+; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 
3, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v6, v0, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 
0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v1, 3, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v27, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v3, v29, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB59_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: 
scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 ; -; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64: +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v57, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v111, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 
offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 @@ -47166,983 +98104,5564 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 
v131, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:44 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:20 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v46, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v150, 8, v87 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, 
v104 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: 
s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v1, 0xff, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 
0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) 
| instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v124 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v125 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v126 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v127 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v2, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s8 :: 
v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_3 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 
s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v55 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v54 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v52 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v90, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v92, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v93, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v88, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v74, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v75, v12 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v3, v79, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v63, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v72, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v73, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, 
v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v111 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v121 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v120 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v122 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v123 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v107 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v108 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v109 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v110 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v106 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v6, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v92 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v78 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v77 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v76 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v13, 0xff, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v94 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v95 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v104 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v105 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v90 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v91 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v47 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v56 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v20, v151, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v27, 3, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v117, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v113, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %Flow +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v34, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: .LBB59_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v76, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a 
to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v16i64_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; 
implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; 
SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; 
SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; 
implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 
+; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 +; SI-NEXT: v_addc_u32_e32 v32, vcc, 0, v62, vcc +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: 
v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: 
v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: .LBB60_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 
16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; 
SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 
; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i64_to_v64bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB60_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; 
VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: .LBB60_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i64_to_v64bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB60_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB60_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i64_to_v64bf16: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB60_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: .LBB60_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: 
v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s65, 17 +; SI-NEXT: v_writelane_b32 v20, s66, 18 +; SI-NEXT: v_writelane_b32 v20, s67, 19 +; SI-NEXT: v_writelane_b32 v20, s68, 20 +; SI-NEXT: v_writelane_b32 v20, s69, 21 +; SI-NEXT: v_writelane_b32 v20, s70, 22 +; SI-NEXT: v_writelane_b32 v20, s71, 23 +; SI-NEXT: v_writelane_b32 v20, s80, 24 +; SI-NEXT: v_writelane_b32 v20, s81, 25 +; SI-NEXT: v_writelane_b32 v20, s82, 26 +; SI-NEXT: v_writelane_b32 v20, s83, 27 +; SI-NEXT: v_writelane_b32 v20, s84, 28 +; SI-NEXT: v_writelane_b32 v20, s85, 29 +; SI-NEXT: v_writelane_b32 v20, s86, 30 +; SI-NEXT: v_writelane_b32 v20, s87, 31 +; SI-NEXT: v_writelane_b32 v20, s96, 32 +; SI-NEXT: v_writelane_b32 v20, s97, 33 +; SI-NEXT: v_writelane_b32 v20, s98, 34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v20, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s70, v1 +; SI-NEXT: v_readfirstlane_b32 s71, v2 +; SI-NEXT: v_readfirstlane_b32 s80, v3 +; SI-NEXT: v_readfirstlane_b32 s81, v4 +; SI-NEXT: v_readfirstlane_b32 s82, v5 +; SI-NEXT: v_readfirstlane_b32 s83, v6 +; SI-NEXT: v_readfirstlane_b32 s84, v7 +; SI-NEXT: v_readfirstlane_b32 s85, v8 +; SI-NEXT: v_readfirstlane_b32 s86, v9 +; SI-NEXT: v_readfirstlane_b32 s87, v10 +; SI-NEXT: v_readfirstlane_b32 s96, v11 +; SI-NEXT: v_readfirstlane_b32 s97, v12 +; SI-NEXT: v_readfirstlane_b32 s98, v13 +; SI-NEXT: 
v_readfirstlane_b32 s99, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s8, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s9, 0xffff0000 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v21, s4, 0 +; SI-NEXT: s_lshl_b32 s4, s9, 16 +; SI-NEXT: v_writelane_b32 v21, s4, 1 +; SI-NEXT: s_and_b32 s4, s8, 0xffff0000 +; SI-NEXT: v_writelane_b32 v21, s4, 2 +; SI-NEXT: s_lshl_b32 s4, s8, 16 +; SI-NEXT: v_writelane_b32 v21, s4, 3 +; SI-NEXT: s_and_b32 s11, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s7, 16 +; SI-NEXT: s_and_b32 s13, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s6, 16 +; SI-NEXT: s_and_b32 s15, s99, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s99, 16 +; SI-NEXT: s_and_b32 s41, s98, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s98, 16 +; SI-NEXT: s_and_b32 s43, s97, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s97, 16 +; SI-NEXT: s_and_b32 s45, s96, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s96, 16 +; SI-NEXT: s_and_b32 s47, s87, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s87, 16 +; SI-NEXT: s_and_b32 s57, s86, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s86, 16 +; SI-NEXT: s_and_b32 s59, s85, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s85, 16 +; SI-NEXT: s_and_b32 s61, s84, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s84, 16 +; SI-NEXT: s_and_b32 s63, s83, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s83, 16 +; SI-NEXT: s_and_b32 s73, s82, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s82, 16 +; SI-NEXT: s_and_b32 s75, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s81, 16 +; SI-NEXT: s_and_b32 s77, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s80, 16 +; SI-NEXT: s_and_b32 s79, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s71, 16 +; SI-NEXT: s_and_b32 s89, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s70, 16 +; SI-NEXT: s_and_b32 s91, s29, 
0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s29, 16 +; SI-NEXT: s_and_b32 s93, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s28, 16 +; SI-NEXT: s_and_b32 s95, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s27, 16 +; SI-NEXT: s_and_b32 s31, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s26, 16 +; SI-NEXT: s_and_b32 s35, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s25, 16 +; SI-NEXT: s_and_b32 s37, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s24, 16 +; SI-NEXT: s_and_b32 s39, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s23, 16 +; SI-NEXT: s_and_b32 s49, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s48, s22, 16 +; SI-NEXT: s_and_b32 s51, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s21, 16 +; SI-NEXT: s_and_b32 s53, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s20, 16 +; SI-NEXT: s_and_b32 s55, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s19, 16 +; SI-NEXT: s_and_b32 s65, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s18, 16 +; SI-NEXT: s_and_b32 s67, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s17, 16 +; SI-NEXT: s_and_b32 s69, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_add_u32 s16, s18, 3 +; SI-NEXT: s_addc_u32 s17, s19, 0 +; SI-NEXT: s_add_u32 s18, s20, 3 +; SI-NEXT: s_addc_u32 s19, s21, 0 +; SI-NEXT: s_add_u32 s20, s22, 3 +; SI-NEXT: s_addc_u32 s21, s23, 0 +; SI-NEXT: s_add_u32 s22, s24, 3 +; SI-NEXT: s_addc_u32 s23, s25, 0 +; SI-NEXT: s_add_u32 s24, s26, 3 +; SI-NEXT: s_addc_u32 s25, s27, 0 +; SI-NEXT: s_add_u32 s26, s28, 3 +; SI-NEXT: s_addc_u32 s27, s29, 0 +; SI-NEXT: s_add_u32 s28, s70, 3 +; SI-NEXT: s_addc_u32 s29, s71, 0 +; SI-NEXT: s_add_u32 s76, s80, 3 +; SI-NEXT: s_addc_u32 s74, s81, 0 +; SI-NEXT: s_add_u32 s72, s82, 3 +; SI-NEXT: s_addc_u32 s62, s83, 0 +; SI-NEXT: s_add_u32 s60, s84, 3 +; SI-NEXT: s_addc_u32 s58, s85, 0 +; SI-NEXT: s_add_u32 s56, s86, 3 +; SI-NEXT: s_addc_u32 s46, s87, 0 +; SI-NEXT: s_add_u32 s44, 
s96, 3 +; SI-NEXT: s_addc_u32 s42, s97, 0 +; SI-NEXT: s_add_u32 s40, s98, 3 +; SI-NEXT: s_addc_u32 s14, s99, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_and_b32 s10, s9, 0xffff0000 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v21, s10, 0 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_writelane_b32 v21, s9, 1 +; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 +; SI-NEXT: v_writelane_b32 v21, s9, 2 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s11, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s7, 16 +; SI-NEXT: s_and_b32 s13, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s6, 16 +; SI-NEXT: s_and_b32 s15, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s41, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s40, 16 +; SI-NEXT: s_and_b32 s43, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_and_b32 s45, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_and_b32 s57, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s56, 16 +; SI-NEXT: s_and_b32 s59, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s58, 16 +; SI-NEXT: s_and_b32 s61, s60, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s60, 16 +; SI-NEXT: s_and_b32 s63, s62, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s62, 16 +; SI-NEXT: s_and_b32 s73, s72, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s72, 16 +; SI-NEXT: s_and_b32 s75, s74, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s74, 16 +; SI-NEXT: s_and_b32 s77, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s76, 16 +; SI-NEXT: s_and_b32 s79, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s29, 16 +; SI-NEXT: s_and_b32 s89, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s28, 16 +; SI-NEXT: s_and_b32 s91, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s27, 16 +; SI-NEXT: s_and_b32 s93, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s26, 16 +; SI-NEXT: s_and_b32 s95, s25, 
0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s25, 16 +; SI-NEXT: s_and_b32 s31, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s24, 16 +; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s23, 16 +; SI-NEXT: s_and_b32 s37, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s22, 16 +; SI-NEXT: s_and_b32 s39, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s21, 16 +; SI-NEXT: s_and_b32 s49, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s48, s20, 16 +; SI-NEXT: s_and_b32 s51, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s19, 16 +; SI-NEXT: s_and_b32 s53, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s18, 16 +; SI-NEXT: s_and_b32 s55, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s17, 16 +; SI-NEXT: s_and_b32 s65, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s16, 16 +; SI-NEXT: s_and_b32 s67, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s5, 16 +; SI-NEXT: s_and_b32 s69, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s4, 16 +; SI-NEXT: v_writelane_b32 v21, s8, 3 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, 
vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 
+; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 +; SI-NEXT: 
v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: v_mul_f32_e64 v2, 1.0, s44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: v_readlane_b32 s4, v21, 2 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v21, 3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: v_readlane_b32 s4, v21, 0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v21, 1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s99, v20, 35 +; SI-NEXT: v_readlane_b32 s98, v20, 34 +; SI-NEXT: v_readlane_b32 s97, v20, 33 +; SI-NEXT: v_readlane_b32 s96, v20, 32 +; SI-NEXT: v_readlane_b32 s87, v20, 31 +; SI-NEXT: v_readlane_b32 s86, v20, 30 +; SI-NEXT: v_readlane_b32 s85, v20, 29 +; SI-NEXT: v_readlane_b32 s84, v20, 28 +; SI-NEXT: v_readlane_b32 s83, v20, 27 +; SI-NEXT: v_readlane_b32 s82, v20, 26 +; SI-NEXT: v_readlane_b32 s81, v20, 25 +; SI-NEXT: v_readlane_b32 s80, v20, 24 +; SI-NEXT: v_readlane_b32 s71, v20, 23 +; SI-NEXT: v_readlane_b32 s70, v20, 22 +; SI-NEXT: v_readlane_b32 s69, v20, 21 +; SI-NEXT: v_readlane_b32 s68, v20, 20 +; SI-NEXT: v_readlane_b32 s67, v20, 19 +; SI-NEXT: v_readlane_b32 s66, v20, 18 +; SI-NEXT: v_readlane_b32 s65, v20, 17 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded 
Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: 
$sgpr45 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v16i64_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB61_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: 
s_cbranch_execnz .LBB61_3 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: .LBB61_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB61_4: +; VI-NEXT: s_branch .LBB61_2 +; +; GFX9-LABEL: bitcast_v16i64_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: 
v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB61_3 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; 
GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB61_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: s_branch .LBB61_2 +; +; GFX11-LABEL: bitcast_v16i64_to_v64bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 
:: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB61_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB61_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB61_3: +; GFX11-NEXT: .LBB61_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { +; SI-LABEL: bitcast_v64bf16_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, 
off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; 
SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 
v0, 1.0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 +; 
SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, 
off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 +; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 
+; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 +; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 +; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: 
v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: 
$vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB62_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 +; SI-NEXT: 
v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; 
SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 
+; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: 
v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB62_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v16i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB62_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 +; VI-NEXT: 
v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 +; VI-NEXT: 
v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; 
VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; 
VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; 
VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v34, 
0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v33, 
v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; 
VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: .LBB62_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v16i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB62_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: 
v_bfe_u32 v33, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: 
v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 
0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: 
v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; 
GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: 
v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; 
GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; 
GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v21, v21, v32, 
s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, 
v34, vcc +; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 +; GFX9-NEXT: .LBB62_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; 
GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 +; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) 
| instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 
0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h +; GFX11-TRUE16-NEXT: 
v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, 
v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, 
v35 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 +; GFX11-TRUE16-NEXT: .LBB62_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v53, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v52, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v51, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v50, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v124, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v125, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v126, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v127, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v49, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v48, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v36, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v35, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v39, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v34, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v111, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v120, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v121, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v122, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v107, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v108, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v109, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v38, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v110, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v106, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v7, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v33, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v32, 3 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB62_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: 
v_dual_lshlrev_b32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v8, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff 
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff +; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v3, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v92, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v78, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v77, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v76, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v75, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v74, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v60, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v59, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v93, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v94, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v95, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v104, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v105, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v79, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v88, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v89, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v90, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v91, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v58, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v13, v62, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v63, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v72, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v73, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v45, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v46, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v47, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v56, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v57, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 -; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v22, 0x300, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: .LBB29_4: ; %end +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; 
GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_add3_u32 
v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, 
v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB62_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 
v108, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <128 x i8> %a, splat (i8 3) - %a2 = bitcast <128 x i8> %a1 to <16 x i64> + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <16 x i64> br label %end cmp.false: - %a3 = bitcast <128 x i8> %a to <16 x i64> + %a3 = bitcast <64 x bfloat> %a to <16 x i64> br label %end end: @@ -48150,3788 +103669,3616 @@ end: ret <16 x i64> %phi } -define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: 
$vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: 
killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v62 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 
0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 
offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; 
GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 
-; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: .LBB30_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_addc_u32_e32 v32, vcc, 0, v62, vcc -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword 
v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 
4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; GCN-NEXT: 
v_lshlrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: .LBB30_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v56, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v45, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v44, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 20, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v53 -; 
GCN-NEXT: v_mul_f32_e32 v5, 1.0, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_alignbit_b32 v32, v32, v33, 16 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_alignbit_b32 v34, v34, v35, 16 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_alignbit_b32 v36, v36, v37, 16 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, 
v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v38, v38, v39, 16 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_alignbit_b32 v48, v48, v49, 16 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_alignbit_b32 v50, v50, v51, 16 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_alignbit_b32 v52, v52, v53, 16 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: 
buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_alignbit_b32 v54, v54, v55, 16 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_alignbit_b32 v40, v40, v41, 16 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v63 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_alignbit_b32 v42, v42, v43, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v56, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v16i64_to_v64bf16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB30_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; 
VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: .LBB30_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v16i64_to_v64bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB30_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc -; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 
0, v29, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: .LBB30_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v16i64_to_v64bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB30_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo -; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: .LBB30_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: 
- %a1 = add <16 x i64> %a, splat (i64 3) - %a2 = bitcast <16 x i64> %a1 to <64 x bfloat> - br label %end - -cmp.false: - %a3 = bitcast <16 x i64> %a to <64 x bfloat> - br label %end - -end: - %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x bfloat> %phi -} - -define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; 
GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte 
Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; 
GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v33 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v42 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v63 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; GCN-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; GCN-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; GCN-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v46 -; GCN-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v45, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: buffer_load_dword v7, off, 
s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v33, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, 
off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v43 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v19, v19, v42, 16 -; GCN-NEXT: v_alignbit_b32 v20, v20, v44, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; GCN-NEXT: v_alignbit_b32 v22, v22, v48, 16 -; GCN-NEXT: v_alignbit_b32 v23, v23, v38, 16 -; GCN-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; GCN-NEXT: v_alignbit_b32 v25, v25, v51, 16 -; GCN-NEXT: v_alignbit_b32 v26, v26, v53, 16 -; GCN-NEXT: v_alignbit_b32 v27, v27, v55, 16 -; GCN-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; GCN-NEXT: v_alignbit_b32 v29, v29, v63, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; 
implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; 
GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; kill: killed $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB31_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; 
GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: 
v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; 
GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 -; GCN-NEXT: v_add_f32_e32 v15, 
0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 
0xffff0000, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v54 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v41 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v43 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v27 -; GCN-NEXT: 
v_add_f32_e32 v23, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v31 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v32 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v19, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v20, v39, v54, 16 -; GCN-NEXT: v_alignbit_b32 v21, v48, v40, 16 -; GCN-NEXT: v_alignbit_b32 v22, v49, v22, 16 -; GCN-NEXT: v_alignbit_b32 v23, v50, v23, 16 -; GCN-NEXT: v_alignbit_b32 v24, v51, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v32, v25, 16 -; GCN-NEXT: v_alignbit_b32 v26, v33, v26, 16 -; GCN-NEXT: v_alignbit_b32 v27, v34, v27, 16 -; GCN-NEXT: v_alignbit_b32 v28, v35, v28, 16 -; GCN-NEXT: 
v_alignbit_b32 v29, v36, v29, 16 -; GCN-NEXT: v_alignbit_b32 v30, v37, v30, 16 -; GCN-NEXT: v_alignbit_b32 v31, v38, v31, 16 -; GCN-NEXT: .LBB31_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, 
off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, 
s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v14, v11 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 +; SI-NEXT: 
v_mul_f32_e32 v52, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; 
SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 +; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 +; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 +; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; 
SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v11 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v12 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v14 +; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v53, v38 +; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 
v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 
v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 
+; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, 
v23, v22, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: 
buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB63_2 ; -; VI-LABEL: bitcast_v64bf16_to_v16i64: +; VI-LABEL: bitcast_v64bf16_to_v16i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB31_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: 
v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB63_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB63_3 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: 
v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: 
v_bfe_u32 v33, v10, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; 
VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 
-; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 
+; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 
0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 
v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, 
v33, v29 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; 
VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; 
VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: 
v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: 
v_alignbit_b32 v23, v23, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: 
v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 
v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v17, 
0xffff0000, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 
v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB31_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v64bf16_to_v16i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB31_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: .LBB63_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_4: +; VI-NEXT: s_branch .LBB63_2 +; +; GFX9-LABEL: bitcast_v64bf16_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; 
GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB63_3 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, 
v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v15, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v14, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v33, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; 
GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 
v34, v34, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v33, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: 
v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v10, v18, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v33, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v8, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v33, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v7, 16, 1 +; GFX9-NEXT: 
v_add_u32_e32 v34, v34, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 
0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v5, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v33, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v33, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 
v34, v34, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v33, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v33, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v33, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, 
v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v31, 16, 1 +; 
GFX9-NEXT: v_add_u32_e32 v34, v34, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v31, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v31, v33, 16, v31 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_add3_u32 v33, 
v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v30, v18, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v29, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v29, v33, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, 
v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v28, v18, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v28, v33, 16, v28 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: 
v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v27, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 
-; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v26, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v26, v33, 16, v26 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v25, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v25, v33, 16, v25 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 
+; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v24, v18, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v24, v33, 16, v24 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, 
v33, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v23, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: 
v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v22, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v22, v33, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, 
v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v21, v18, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v21, v33, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v20, v33, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 
; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v19, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v19, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: 
v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v32, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v32, v33, 16, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc 
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v17, v33, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 -; GFX9-NEXT: .LBB31_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v16 +; GFX9-NEXT: .LBB63_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: s_branch .LBB63_2 ; -; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64: +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 
v31, off, s32 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB31_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: 
v_dual_cndmask_b32 v15, v36, v37 -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v13.l, v13.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 
offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v183, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v182, v9 :: v_dual_mov_b32 v169, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v176, v3 :: v_dual_mov_b32 v175, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v173, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB63_3 +; GFX11-TRUE16-NEXT: .LBB63_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v12, 16, v12 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 
1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, 
v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v17, 
v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v23, 
v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 
16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; 
GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, 
v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v183 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v36, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo -; 
GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v182 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: 
v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v183, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 
0x7fff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v182, v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v177 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v177 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v36, 
16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v177, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; 
GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v32, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v176 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: 
v_dual_add_nc_u32 v34, v34, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v176, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 
0x400000, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v181 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v36, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: 
v_bfe_u32 v39, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 -; GFX11-TRUE16-NEXT: .LBB31_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16i64: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB31_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v34, 
v34, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, 
v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 -; GFX11-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 -; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, 
v5, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_bfe_u32 
v33, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v181, v39, 
16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-TRUE16-NEXT: .LBB63_3: ; %end +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v181 :: v_dual_mov_b32 v19, v175 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v21, v176 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 
offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; 
GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v24, v177 :: v_dual_mov_b32 v27, v182 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v183 :: v_dual_mov_b32 v31, v178 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v179 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB63_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-TRUE16-NEXT: s_branch .LBB63_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v61, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v123, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, 
v13 :: v_dual_mov_b32 v179, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB63_3 +; GFX11-FAKE16-NEXT: .LBB63_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 
v3, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 
0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 
v9, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v27, 
v23, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, 
v28 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: 
v_perm_b32 v27, v27, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: 
v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v33, v35, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33 +; GFX11-FAKE16-NEXT: 
v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: 
v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: 
v_dual_lshlrev_b32 v35, 16, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, 
v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 
0x7fff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 -; GFX11-FAKE16-NEXT: 
v_add3_u32 v37, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB31_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 
0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-FAKE16-NEXT: .LBB63_3: ; %end +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v184, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 
v121, off, s32 offset:124 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB63_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-FAKE16-NEXT: s_branch .LBB63_2 
%cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -51950,775 +107297,808 @@ end: } define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; 
implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed 
$vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v29 -; GCN-NEXT: 
v_lshrrev_b32_e32 v37, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v62 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v63 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; 
GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; 
GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v38 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 
v38, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: 
; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB32_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v32, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v34, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v36, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v38, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v50, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v52, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v63 -; GCN-NEXT: v_addc_u32_e32 v54, vcc, 0, v62, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v31 -; GCN-NEXT: 
v_lshrrev_b32_e32 v6, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword 
v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v52 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v51 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v50 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v49 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v48 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v39 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v38 -; GCN-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v37 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v36 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v35 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v34 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v33 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v32 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v31 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: .LBB32_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, 
v1, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v59 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v2, v1 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v2, v1 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v2, v1 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v47 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v46 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v45 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v44 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v43 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v42 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v41 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v55 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v54 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v53 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v52 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v51 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v50 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v49 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v48 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v38 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 
v34, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v34, v30 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v37 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v35, v38, v35 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v33, v39, v33 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v48, v32 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v49, v31 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i64_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; 
implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; 
implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; 
SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 
offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB64_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 
3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_addc_u32_e32 v44, vcc, 0, v62, vcc +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 
; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: 
v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v6 +; SI-NEXT: v_mov_b32_e32 v50, v29 +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v46, v28 +; SI-NEXT: v_mov_b32_e32 v34, v8 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, 
s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: .LBB64_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; 
SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 
4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 
v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v64f16: ; VI: ; %bb.0: @@ -52730,7 +108110,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB32_2 +; VI-NEXT: s_cbranch_execz .LBB64_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -52765,7 +108145,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: .LBB32_2: ; %end +; VI-NEXT: .LBB64_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -52780,7 +108160,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB32_2 +; GFX9-NEXT: s_cbranch_execz .LBB64_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: 
v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -52815,7 +108195,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: .LBB32_2: ; %end +; GFX9-NEXT: .LBB64_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -52832,7 +108212,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-NEXT: s_cbranch_execz .LBB64_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -52875,7 +108255,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: .LBB32_2: ; %end +; GFX11-NEXT: .LBB64_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -52896,768 +108276,1637 @@ end: ret <64 x half> %phi } +define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: 
v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_readfirstlane_b32 s46, v1 +; SI-NEXT: v_readfirstlane_b32 s47, v2 +; SI-NEXT: v_readfirstlane_b32 s44, v3 +; SI-NEXT: v_readfirstlane_b32 s45, v4 +; SI-NEXT: v_readfirstlane_b32 s42, v5 +; SI-NEXT: v_readfirstlane_b32 s43, v6 +; SI-NEXT: v_readfirstlane_b32 s40, v7 +; SI-NEXT: v_readfirstlane_b32 s41, v8 +; SI-NEXT: v_readfirstlane_b32 s14, v9 +; SI-NEXT: v_readfirstlane_b32 s15, v10 +; SI-NEXT: v_readfirstlane_b32 s12, v11 +; SI-NEXT: v_readfirstlane_b32 s13, v12 +; SI-NEXT: v_readfirstlane_b32 s10, v13 +; SI-NEXT: v_readfirstlane_b32 s11, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s47, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s46, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; 
SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; 
%cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s16, s4, 16 +; SI-NEXT: s_lshr_b32 s17, s5, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s56, s18, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s58, s20, 16 +; SI-NEXT: s_lshr_b32 s59, s21, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s60, s22, 16 +; SI-NEXT: s_lshr_b32 s61, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s62, s24, 16 +; SI-NEXT: s_lshr_b32 s63, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s72, s26, 16 +; SI-NEXT: s_lshr_b32 s73, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s74, s28, 16 +; SI-NEXT: s_lshr_b32 s75, s29, 16 +; SI-NEXT: s_add_u32 s46, s46, 3 +; SI-NEXT: s_addc_u32 s47, s47, 0 +; SI-NEXT: s_lshr_b32 s76, s46, 16 +; SI-NEXT: s_lshr_b32 s77, s47, 16 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_lshr_b32 s78, s44, 16 +; SI-NEXT: s_lshr_b32 s79, s45, 16 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_lshr_b32 s88, s42, 16 +; SI-NEXT: s_lshr_b32 s89, s43, 16 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_lshr_b32 s90, s40, 16 +; SI-NEXT: s_lshr_b32 s91, s41, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s92, s14, 16 +; SI-NEXT: s_lshr_b32 s93, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s94, s12, 16 +; SI-NEXT: s_lshr_b32 s95, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 vcc_lo, s10, 16 +; SI-NEXT: s_lshr_b32 vcc_hi, s11, 16 +; SI-NEXT: 
s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s30, s7, 16 +; SI-NEXT: s_lshr_b32 s31, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s35, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s35 +; SI-NEXT: s_lshr_b32 s34, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, s34 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, s31 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s30 +; SI-NEXT: v_cvt_f32_f16_e32 v6, vcc_hi +; SI-NEXT: v_cvt_f32_f16_e32 v9, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v11, s95 
+; SI-NEXT: v_cvt_f32_f16_e32 v13, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s16 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v61 +; SI-NEXT: v_or_b32_e32 v2, v2, v61 +; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v57, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v47, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v45, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v45, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v43, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v41, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v55, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v53, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v51, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v48, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v38, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v38, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v36, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v36, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v34, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v32, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v30, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; 
SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: 
$vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v16i64_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB65_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_3 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, 
vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: .LBB65_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_4: +; VI-NEXT: s_branch .LBB65_2 +; +; GFX9-LABEL: bitcast_v16i64_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 
v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_3 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, 
vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB65_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: s_branch .LBB65_2 +; +; GFX11-LABEL: bitcast_v16i64_to_v64f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; 
GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB65_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB65_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_3: +; GFX11-NEXT: .LBB65_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], 
s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v37 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; GCN-NEXT: v_or_b32_e32 v0, 
v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload 
-; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_or_b32_e32 v19, v43, v19 -; GCN-NEXT: v_or_b32_e32 v20, v41, v20 -; GCN-NEXT: v_or_b32_e32 v21, v55, v21 -; GCN-NEXT: v_or_b32_e32 v22, v49, v22 -; GCN-NEXT: 
v_or_b32_e32 v23, v50, v23 -; GCN-NEXT: v_or_b32_e32 v24, v39, v24 -; GCN-NEXT: v_or_b32_e32 v25, v36, v25 -; GCN-NEXT: v_or_b32_e32 v26, v48, v26 -; GCN-NEXT: v_or_b32_e32 v27, v52, v27 -; GCN-NEXT: v_or_b32_e32 v28, v53, v28 -; GCN-NEXT: v_or_b32_e32 v29, v54, v29 -; GCN-NEXT: v_or_b32_e32 v30, v40, v30 -; GCN-NEXT: v_or_b32_e32 v31, v42, v31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; 
GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: .LBB33_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_4 -; GCN-NEXT: 
; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 
v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v41 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v55 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; 
GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: 
v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_or_b32_e32 v19, v21, v20 -; GCN-NEXT: v_or_b32_e32 v20, v55, v39 -; GCN-NEXT: v_or_b32_e32 v21, v41, v48 -; GCN-NEXT: v_or_b32_e32 v22, v22, v49 -; GCN-NEXT: v_or_b32_e32 v23, v23, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v51 -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v38 -; GCN-NEXT: v_or_b32_e32 v28, v28, v33 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_or_b32_e32 v30, v30, v35 -; GCN-NEXT: v_or_b32_e32 v31, v31, v37 -; GCN-NEXT: .LBB33_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64f16_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; 
SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; 
SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v25, v51, v25 +; SI-NEXT: v_or_b32_e32 v26, v48, v26 +; SI-NEXT: v_or_b32_e32 v27, v52, v27 +; SI-NEXT: v_or_b32_e32 v28, v39, v28 +; SI-NEXT: v_or_b32_e32 v29, v37, v29 +; SI-NEXT: v_or_b32_e32 v30, v35, v30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; 
implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v55, v24 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB66_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 
v33, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 
0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; 
SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; 
SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: 
v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 
v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; 
SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB66_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v16i64: ; VI: ; %bb.0: @@ -53669,7 +109918,7 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 0x200 ; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -53769,7 +110018,7 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 ; VI-NEXT: v_or_b32_e32 v17, v17, v33 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -53784,7 +110033,7 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: s_cbranch_execz .LBB66_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -53820,7 +110069,7 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: .LBB66_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -53837,7 +110086,7 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 
0x200, v14 op_sel_hi:[0,1] @@ -53872,7 +110121,7 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB33_2: ; %end +; GFX11-NEXT: .LBB66_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -53893,368 +110142,1540 @@ end: ret <16 x i64> %phi } +define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 
offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v26 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v41, v11 +; SI-NEXT: v_mov_b32_e32 v40, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: 
v_cvt_f16_f32_e32 v56, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: 
v_cvt_f16_f32_e32 v37, v62 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v63 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, 
off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB67_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_or_b32_e32 v22, v51, v22 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v50, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v49, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v26, v39, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v27, v38, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v28, v37, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, 
v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_or_b32_e32 v19, v54, v19 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v60, v2 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_or_b32_e32 v13, v57, v13 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v40, v17 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v18, v55, v18 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB67_3 +; SI-NEXT: .LBB67_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 
v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB67_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v35, v40 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v40, v46 +; SI-NEXT: v_mov_b32_e32 v41, v56 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v43, v60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB67_5 +; 
SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; 
SI-NEXT: v_cvt_f32_f16_e32 v24, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: 
v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 
v25, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 
0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: 
v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB67_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
bitcast_v64f16_to_v16i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB67_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_3 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v18, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; 
VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: v_add_f16_sdwa v33, v31, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: 
v_add_f16_sdwa v33, v23, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v32, v32, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v18, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v18 +; VI-NEXT: .LBB67_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_4: +; VI-NEXT: s_branch .LBB67_2 +; +; GFX9-LABEL: bitcast_v64f16_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 
v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_3 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, v31, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB67_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: s_branch .LBB67_2 +; +; GFX11-LABEL: bitcast_v64f16_to_v16i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; 
GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 
offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: 
v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB67_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB67_3 +; GFX11-NEXT: .LBB67_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] +; GFX11-NEXT: 
v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB67_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 
+; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: 
scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, 
s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB67_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: 
$vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: 
$vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB67_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i64_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword 
v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: 
$vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v49, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v54, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v40, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v45, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: 
v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB34_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc 
-; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc -; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v49, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v54, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v40, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v45, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB34_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v44 -; GCN-NEXT: 
v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; GCN-NEXT: v_or_b32_e32 v59, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v57, v1, v2 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; GCN-NEXT: v_or_b32_e32 v45, v1, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v43 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v40 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v60 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v54 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v58 -; GCN-NEXT: 
v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v51 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v56 -; GCN-NEXT: v_or_b32_e32 v14, v14, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v15, v15, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v47 -; GCN-NEXT: v_or_b32_e32 v16, v16, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v17, v17, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v46 -; GCN-NEXT: v_or_b32_e32 v18, v18, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v19, v19, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v20, v20, v34 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v38 -; GCN-NEXT: v_or_b32_e32 v21, v21, v34 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v42 -; GCN-NEXT: v_or_b32_e32 v22, v22, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v37 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 
v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v41 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v36 -; GCN-NEXT: v_or_b32_e32 v25, v25, v34 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v55 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v35 -; GCN-NEXT: v_or_b32_e32 v27, v27, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v53 -; GCN-NEXT: v_or_b32_e32 v28, v28, v34 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v30, v30, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v32, v32, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v31, v31, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v1, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v45, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i64_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 
+; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v51, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v40, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v57, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB68_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_4 +; SI-NEXT: ; %bb.3: 
; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v51, v16, v15, 16 +; SI-NEXT: 
v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v40, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v57, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB68_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v64i16: ; VI: ; %bb.0: @@ -54266,7 +111687,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: s_cbranch_execz .LBB68_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -54301,7 +111722,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: .LBB34_2: ; %end +; VI-NEXT: .LBB68_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -54316,7 +111737,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 
; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -54351,7 +111772,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: .LBB34_2: ; %end +; GFX9-NEXT: .LBB68_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -54368,7 +111789,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -54411,7 +111832,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: .LBB34_2: ; %end +; GFX11-NEXT: .LBB68_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -54432,613 +111853,1206 @@ end: ret <64 x i16> %phi } +define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i64_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_readfirstlane_b32 s47, v1 +; SI-NEXT: v_readfirstlane_b32 s46, v2 +; SI-NEXT: v_readfirstlane_b32 s45, v3 +; SI-NEXT: v_readfirstlane_b32 s44, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v5 +; SI-NEXT: v_readfirstlane_b32 s42, v6 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; 
SI-NEXT: v_readfirstlane_b32 s15, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v12 +; SI-NEXT: v_readfirstlane_b32 s11, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v15 +; SI-NEXT: v_readfirstlane_b32 s8, v16 +; SI-NEXT: v_readfirstlane_b32 s7, v17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s46, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s29, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s27, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s25, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s23, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s21, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s19, v15, 16 +; SI-NEXT: v_alignbit_b32 v16, s17, v16, 16 +; SI-NEXT: s_lshr_b32 s56, s6, 16 +; SI-NEXT: s_lshr_b32 s57, s8, 16 +; SI-NEXT: s_lshr_b32 s58, s10, 16 +; SI-NEXT: s_lshr_b32 s59, s12, 16 +; SI-NEXT: s_lshr_b32 s60, s14, 16 +; SI-NEXT: s_lshr_b32 s61, s40, 16 +; SI-NEXT: s_lshr_b32 
s62, s42, 16 +; SI-NEXT: s_lshr_b32 s63, s44, 16 +; SI-NEXT: s_lshr_b32 s72, s46, 16 +; SI-NEXT: s_lshr_b32 s73, s29, 16 +; SI-NEXT: s_lshr_b32 s74, s27, 16 +; SI-NEXT: s_lshr_b32 s75, s25, 16 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s21, 16 +; SI-NEXT: s_lshr_b32 s78, s19, 16 +; SI-NEXT: s_lshr_b32 s79, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s47, s47, 3 +; SI-NEXT: s_addc_u32 s46, s46, 0 +; SI-NEXT: s_add_u32 s45, s45, 3 +; SI-NEXT: s_addc_u32 s44, s44, 0 +; SI-NEXT: s_add_u32 s43, s43, 3 +; SI-NEXT: s_addc_u32 s42, s42, 0 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s40, s40, 0 +; SI-NEXT: s_add_u32 s15, s15, 3 +; SI-NEXT: s_addc_u32 s14, s14, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: 
v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s46, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s29, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s27, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s25, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s23, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s21, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s19, v15, 16 +; SI-NEXT: v_alignbit_b32 v16, s17, v16, 16 +; SI-NEXT: s_lshr_b32 s56, s6, 16 +; SI-NEXT: s_lshr_b32 s57, s8, 16 +; SI-NEXT: s_lshr_b32 s58, s10, 16 +; SI-NEXT: s_lshr_b32 s59, s12, 16 +; SI-NEXT: s_lshr_b32 s60, s14, 16 +; SI-NEXT: s_lshr_b32 s61, s40, 16 +; SI-NEXT: s_lshr_b32 s62, s42, 16 +; SI-NEXT: s_lshr_b32 s63, s44, 16 +; SI-NEXT: s_lshr_b32 s72, s46, 16 +; SI-NEXT: s_lshr_b32 s73, s29, 16 +; SI-NEXT: s_lshr_b32 s74, s27, 16 +; SI-NEXT: s_lshr_b32 s75, s25, 16 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s21, 16 +; SI-NEXT: s_lshr_b32 s78, s19, 16 +; SI-NEXT: s_lshr_b32 s79, s17, 16 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, s4, v16 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen 
+; SI-NEXT: v_add_i32_e32 v16, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s75, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 40, 
v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s47, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s46, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s44, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: 
buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: 
v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr61 
+; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v16i64_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB69_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB69_3 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, 
v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: .LBB69_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_4: +; VI-NEXT: s_branch .LBB69_2 +; +; GFX9-LABEL: bitcast_v16i64_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; 
GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB69_3 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; 
GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB69_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: s_branch .LBB69_2 +; +; GFX11-LABEL: bitcast_v16i64_to_v64i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: 
v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB69_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB69_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB69_3: +; GFX11-NEXT: .LBB69_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i16_to_v16i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, 
off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:116 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v24 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v36 -; GCN-NEXT: v_or_b32_e32 v1, v1, v58 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v57 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v60 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v43 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v47 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v44 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; 
GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v30, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v59 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed 
$vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; 
GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: .LBB35_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v36, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v58, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v57, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v35, v3 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 
v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 
0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v60, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v32, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v32, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v32, v16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v32, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 
; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v32, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v32, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v32, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v32, v30 -; GCN-NEXT: v_or_b32_e32 v31, v59, v31 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; 
GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 -; GCN-NEXT: .LBB35_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v64i16_to_v16i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: 
killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v53 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v19, v19, v48 +; SI-NEXT: v_or_b32_e32 v21, v21, v36 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v49 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, 
v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v62 +; SI-NEXT: v_or_b32_e32 v2, v2, v61 +; SI-NEXT: v_or_b32_e32 v3, v3, v60 +; SI-NEXT: v_or_b32_e32 v4, v4, v59 +; SI-NEXT: v_or_b32_e32 v5, v5, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v47 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v45 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_or_b32_e32 v12, v12, v42 +; SI-NEXT: v_or_b32_e32 v13, v13, v41 +; SI-NEXT: 
v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 
+; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: .LBB70_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 
offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_or_b32_e32 v21, v36, v21 +; SI-NEXT: v_or_b32_e32 v22, v34, v22 +; SI-NEXT: v_or_b32_e32 v23, v32, v23 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v26, 
0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 
0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v59, v4 +; SI-NEXT: v_or_b32_e32 v5, v58, v5 +; SI-NEXT: v_or_b32_e32 v6, v57, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_or_b32_e32 v18, v49, v18 +; SI-NEXT: v_or_b32_e32 v20, v38, v20 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, 
vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v37, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 +; SI-NEXT: .LBB70_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 
offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v16i64: ; VI: ; %bb.0: @@ -55050,7 +113064,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_2 +; VI-NEXT: s_cbranch_execz .LBB70_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v33, 3 ; VI-NEXT: v_add_u16_e32 v32, 3, v15 @@ -55150,7 +113164,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: .LBB70_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -55165,7 +113179,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_2 +; GFX9-NEXT: s_cbranch_execz .LBB70_2 ; GFX9-NEXT: 
; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -55200,7 +113214,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: .LBB70_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -55217,7 +113231,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB35_2 +; GFX11-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -55252,7 +113266,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB35_2: ; %end +; GFX11-NEXT: .LBB70_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -55273,1249 +113287,2231 @@ end: ret <16 x i64> %phi } +define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v16i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v56, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: 
buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v10, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v17, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v19, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v21, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v22, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_mov_b32_e32 v35, 
v24 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_or_b32_e32 v28, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_or_b32_e32 v29, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v30, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_or_b32_e32 v8, v1, v55 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: 
v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt 
vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v64i16_to_v16i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: v_readfirstlane_b32 s8, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v5 +; VI-NEXT: v_readfirstlane_b32 s10, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v7 +; VI-NEXT: v_readfirstlane_b32 s12, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v11 +; 
VI-NEXT: v_readfirstlane_b32 s40, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v13 +; VI-NEXT: v_readfirstlane_b32 s42, v14 +; VI-NEXT: v_readfirstlane_b32 s43, v15 +; VI-NEXT: v_readfirstlane_b32 s44, v16 +; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_readfirstlane_b32 s46, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s47, v1 +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s47, 3 +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s46, 3 +; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s46, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s46, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 
0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s45, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s44, 3 +; VI-NEXT: s_add_i32 s45, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s43, 3 +; VI-NEXT: s_add_i32 s44, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s42, 3 +; VI-NEXT: 
s_add_i32 s43, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s41, 3 +; VI-NEXT: s_add_i32 s42, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s40, 3 +; VI-NEXT: s_add_i32 s41, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s40, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s7, 3 +; 
VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s46 +; VI-NEXT: v_mov_b32_e32 v15, s47 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: v_mov_b32_e32 v18, s8 +; VI-NEXT: v_mov_b32_e32 v19, s9 +; VI-NEXT: v_mov_b32_e32 v20, s10 +; VI-NEXT: v_mov_b32_e32 v21, s11 +; VI-NEXT: v_mov_b32_e32 v22, s12 +; VI-NEXT: v_mov_b32_e32 v23, s13 +; VI-NEXT: v_mov_b32_e32 v24, s14 +; VI-NEXT: v_mov_b32_e32 v25, s15 +; VI-NEXT: v_mov_b32_e32 v26, s40 +; VI-NEXT: v_mov_b32_e32 v27, s41 +; VI-NEXT: v_mov_b32_e32 v28, s42 +; VI-NEXT: v_mov_b32_e32 v29, s43 +; VI-NEXT: v_mov_b32_e32 v30, s44 +; VI-NEXT: v_mov_b32_e32 v31, s45 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v16i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 
v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, 
v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-LABEL: bitcast_v64i16_to_v16i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 
off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: 
scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 
v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-NEXT: .LBB71_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 
op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB71_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, 
off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 
offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 
v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB71_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: 
$vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: 
$vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v128i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; 
GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: 
$vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; 
implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], 
s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v51, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v53, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v61, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v62, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v63, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 
4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; 
GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v2 -; GCN-NEXT: .LBB36_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v51, v8, v7, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill 
-; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v4, v3, 24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v53, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v61, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v62, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v63, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshrrev_b32_e32 v33, 16, v26 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: 
v_lshrrev_b32_e32 v46, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GCN-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v2 -; GCN-NEXT: .LBB36_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v63 -; GCN-NEXT: v_or_b32_e32 v1, v1, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GCN-NEXT: v_or_b32_e32 v2, v2, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v35, v36, v35 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v33, v33, v34 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v35 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v63, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v62, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v61, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v16 -; 
GCN-NEXT: v_and_b32_e32 v33, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v27 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v29 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v30 -; GCN-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v53 -; GCN-NEXT: v_or_b32_e32 v48, v48, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v60 -; GCN-NEXT: v_or_b32_e32 v54, v54, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v52 -; GCN-NEXT: v_or_b32_e32 v60, v50, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v59 -; GCN-NEXT: v_or_b32_e32 v59, v49, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v51 -; GCN-NEXT: v_or_b32_e32 v32, v39, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v58 -; GCN-NEXT: v_or_b32_e32 v30, v38, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-NEXT: v_or_b32_e32 v28, v37, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v57 -; GCN-NEXT: v_or_b32_e32 v26, v36, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-NEXT: v_or_b32_e32 v24, v63, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v56 -; GCN-NEXT: v_or_b32_e32 v22, v62, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-NEXT: v_or_b32_e32 v20, v61, v16 -; GCN-NEXT: 
v_lshlrev_b32_e32 v16, 8, v47 -; GCN-NEXT: v_or_b32_e32 v16, v35, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: v_or_b32_e32 v15, v15, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v46 -; GCN-NEXT: v_or_b32_e32 v17, v34, v17 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GCN-NEXT: v_or_b32_e32 v19, v33, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v45 -; GCN-NEXT: v_or_b32_e32 v21, v18, v21 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GCN-NEXT: v_or_b32_e32 v23, v14, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v44 -; GCN-NEXT: v_or_b32_e32 v25, v13, v14 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GCN-NEXT: v_or_b32_e32 v27, v12, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v43 -; GCN-NEXT: v_or_b32_e32 v29, v11, v12 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GCN-NEXT: v_or_b32_e32 v31, v10, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v42 -; GCN-NEXT: v_or_b32_e32 v33, v9, v10 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GCN-NEXT: v_or_b32_e32 v35, v8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v41 -; GCN-NEXT: v_or_b32_e32 v52, v7, v8 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GCN-NEXT: v_or_b32_e32 v50, v6, v7 -; GCN-NEXT: v_lshlrev_b32_e32 
v6, 8, v40 -; GCN-NEXT: v_or_b32_e32 v49, v5, v6 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GCN-NEXT: v_or_b32_e32 v51, v4, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v55 -; GCN-NEXT: v_or_b32_e32 v53, v3, v4 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v55, v2, v3 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v41, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v10, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload 
-; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v12, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v5, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v6, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v7, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v8, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v9, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v11, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v13, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v18, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v34, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v36, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v37, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v38, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v39, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v40, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v42, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v43, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v44, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v45, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v46, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v47, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v56, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v57, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v58, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v61, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v62, v4, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v63, v4, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v3, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v54, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v60 -; GCN-NEXT: v_or_b32_e32 v4, v2, v10 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v59 -; GCN-NEXT: v_or_b32_e32 v59, v10, v12 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v32 -; GCN-NEXT: v_or_b32_e32 v5, v10, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v60, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte 
Folded Spill -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v9, v9, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v11, v11, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v13, v13, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v34 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v36 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v19, v37 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v21, v38 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v23, v39 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v40 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v42 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v43 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v31, v31, v44 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GCN-NEXT: v_or_b32_e32 v33, v33, v45 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GCN-NEXT: v_or_b32_e32 v35, v35, v46 -; GCN-NEXT: v_add_i32_e32 
v38, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v37, v37, v47 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v39, v39, v56 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v49, v49, v57 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v51, v51, v58 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v53, v53, v61 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GCN-NEXT: v_or_b32_e32 v55, v55, v62 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: v_or_b32_e32 v41, v41, v63 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, 
s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f64_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, 
s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; 
kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: 
killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; 
implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, 
v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v39, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v52, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v40, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v42, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: 
v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB72_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshrrev_b32_e32 v33, 8, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v39, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v52, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v40, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v42, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; SI-NEXT: .LBB72_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v42 +; SI-NEXT: v_or_b32_e32 v42, v42, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v42 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, 
v40 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v61 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v58 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 
v2, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v47 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v41 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v128i8: ; VI: ; %bb.0: @@ -56717,7 +115713,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill @@ -56893,9 +115889,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: 
v_lshrrev_b32_e32 v47, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v32 -; VI-NEXT: .LBB36_2: ; %Flow +; VI-NEXT: .LBB72_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB36_4 +; VI-NEXT: s_cbranch_execz .LBB72_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; VI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 @@ -57087,7 +116083,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v1 -; VI-NEXT: .LBB36_4: ; %end +; VI-NEXT: .LBB72_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v54 ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v55 @@ -57683,7 +116679,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 @@ -57879,9 +116875,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; GFX9-NEXT: .LBB36_2: ; %Flow +; GFX9-NEXT: .LBB72_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB36_4 +; GFX9-NEXT: s_cbranch_execz .LBB72_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(30) ; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 @@ -58092,7 +117088,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v1 -; GFX9-NEXT: 
.LBB36_4: ; %end +; GFX9-NEXT: .LBB72_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -58542,7 +117538,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -58609,9 +117605,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB36_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB72_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -58694,7 +117690,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 -; GFX11-TRUE16-NEXT: .LBB36_4: ; %end +; GFX11-TRUE16-NEXT: .LBB72_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -59143,7 +118139,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -59242,9 +118238,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] -; GFX11-FAKE16-NEXT: .LBB36_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB72_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB72_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -59359,7 +118355,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: .LBB36_4: ; %end +; GFX11-FAKE16-NEXT: .LBB72_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 @@ -59701,1621 +118697,7397 @@ end: ret <128 x i8> %phi } +define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_mov_b32_e32 v31, s16 +; SI-NEXT: v_mov_b32_e32 v32, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v30, s19 +; SI-NEXT: v_mov_b32_e32 v27, s20 +; SI-NEXT: v_mov_b32_e32 v28, s21 +; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: 
v_mov_b32_e32 v26, s23 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB73_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, 
v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v4, v3, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v4, v3, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v4, v3, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; SI-NEXT: v_alignbit_b32 v38, v28, v27, 24 +; SI-NEXT: v_alignbit_b32 v48, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v50, v28, v27, 8 +; SI-NEXT: v_alignbit_b32 v52, v30, v29, 24 +; SI-NEXT: v_alignbit_b32 v54, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v40, v30, v29, 8 +; SI-NEXT: v_alignbit_b32 v42, v32, v31, 24 +; SI-NEXT: v_alignbit_b32 v44, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v46, v32, v31, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v26 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v32 +; SI-NEXT: s_cbranch_execnz .LBB73_3 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_alignbit_b32 v33, v10, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 24 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v4, v3, 24 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v4, v3, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v4, v3, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 24 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 
16, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; SI-NEXT: v_alignbit_b32 v38, v28, v27, 24 +; SI-NEXT: v_alignbit_b32 v48, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v50, v28, v27, 8 +; SI-NEXT: v_alignbit_b32 v52, v30, v29, 24 +; SI-NEXT: v_alignbit_b32 v54, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v40, v30, v29, 8 +; SI-NEXT: v_alignbit_b32 v42, v32, v31, 24 +; SI-NEXT: v_alignbit_b32 v44, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v46, v32, v31, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v26 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 
v63, 8, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v32 +; SI-NEXT: .LBB73_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v46 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 +; SI-NEXT: v_or_b32_e32 v31, v31, v46 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v42 +; SI-NEXT: v_or_b32_e32 v42, v42, v44 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v42 +; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v35 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v40 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v52 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v63 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v61 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; 
SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v50 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v38 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v60 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v58 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: 
v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v57 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v47 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v45 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v41 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v55 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v51 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 
v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v37 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, 
v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 
; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB73_4: +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 
+; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; 
SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; 
implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_branch .LBB73_2 +; +; VI-LABEL: bitcast_v16f64_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v63, s68, 20 +; VI-NEXT: v_writelane_b32 v63, s69, 21 +; VI-NEXT: v_writelane_b32 v63, s70, 22 +; VI-NEXT: v_writelane_b32 v63, s71, 23 +; VI-NEXT: v_writelane_b32 v63, s80, 24 
+; VI-NEXT: v_writelane_b32 v63, s81, 25 +; VI-NEXT: v_writelane_b32 v63, s82, 26 +; VI-NEXT: v_writelane_b32 v63, s83, 27 +; VI-NEXT: v_writelane_b32 v63, s84, 28 +; VI-NEXT: v_writelane_b32 v63, s85, 29 +; VI-NEXT: v_writelane_b32 v63, s86, 30 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v63, s87, 31 +; VI-NEXT: v_readfirstlane_b32 s6, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: v_readfirstlane_b32 s8, v3 +; VI-NEXT: v_readfirstlane_b32 s9, v4 +; VI-NEXT: v_readfirstlane_b32 s10, v5 +; VI-NEXT: v_readfirstlane_b32 s11, v6 +; VI-NEXT: v_readfirstlane_b32 s12, v7 +; VI-NEXT: v_readfirstlane_b32 s13, v8 +; VI-NEXT: v_readfirstlane_b32 s14, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s40, v11 +; VI-NEXT: v_readfirstlane_b32 s41, v12 +; VI-NEXT: v_readfirstlane_b32 s42, v13 +; VI-NEXT: v_readfirstlane_b32 s43, v14 +; VI-NEXT: v_readfirstlane_b32 s44, v15 +; VI-NEXT: v_readfirstlane_b32 s45, v16 +; VI-NEXT: v_readfirstlane_b32 s4, v17 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 
offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB73_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s43, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s43, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s42, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s40, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 
22 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 0 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s25, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 46 +; VI-NEXT: 
s_lshr_b32 s46, s23, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s19, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s19, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s17, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s17, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s17, 8 +; VI-NEXT: s_lshr_b32 s81, s12, 16 +; VI-NEXT: s_lshr_b32 s80, s12, 8 +; VI-NEXT: s_lshr_b32 s83, s10, 16 +; VI-NEXT: s_lshr_b32 s82, s10, 8 +; VI-NEXT: s_lshr_b32 s85, s8, 16 +; VI-NEXT: s_lshr_b32 s84, s8, 8 +; VI-NEXT: s_lshr_b32 s51, s6, 16 +; VI-NEXT: s_lshr_b32 s50, s6, 8 +; VI-NEXT: s_lshr_b32 s52, s28, 16 +; VI-NEXT: s_lshr_b32 s86, s28, 8 +; VI-NEXT: s_lshr_b32 s87, s26, 16 +; VI-NEXT: s_lshr_b32 s53, s26, 8 +; VI-NEXT: s_lshr_b32 s55, s24, 16 +; VI-NEXT: s_lshr_b32 s54, s24, 8 +; VI-NEXT: s_lshr_b32 s65, s22, 16 +; VI-NEXT: s_lshr_b32 s64, s22, 8 +; VI-NEXT: s_lshr_b32 s67, s20, 16 +; VI-NEXT: s_lshr_b32 s66, s20, 8 +; VI-NEXT: s_lshr_b32 s69, s18, 16 +; VI-NEXT: s_lshr_b32 s68, s18, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 57 +; VI-NEXT: s_lshr_b32 s71, s16, 16 +; VI-NEXT: s_lshr_b32 s70, s16, 8 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[8:9], 24 +; VI-NEXT: s_lshr_b64 
s[78:79], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB73_4 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[11:12], s[4:5], 1.0 +; VI-NEXT: v_add_f64 v[1:2], s[44:45], 1.0 +; VI-NEXT: v_add_f64 v[3:4], s[42:43], 1.0 +; VI-NEXT: v_add_f64 v[5:6], s[40:41], 1.0 +; VI-NEXT: v_add_f64 v[7:8], s[14:15], 1.0 +; VI-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 +; VI-NEXT: v_add_f64 v[13:14], s[10:11], 1.0 +; VI-NEXT: v_add_f64 v[15:16], s[8:9], 1.0 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; VI-NEXT: v_add_f64 v[17:18], s[6:7], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] +; VI-NEXT: v_add_f64 v[19:20], s[28:29], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] +; VI-NEXT: v_add_f64 v[21:22], s[26:27], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; VI-NEXT: v_add_f64 v[23:24], s[24:25], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; VI-NEXT: v_add_f64 v[25:26], s[22:23], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: v_add_f64 v[27:28], s[20:21], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: v_add_f64 v[29:30], s[18:19], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; VI-NEXT: v_add_f64 v[31:32], s[16:17], 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; 
VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v6 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v6 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 
; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v7 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v10 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v10 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v9 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v14 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v16 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v16 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; 
VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v15 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v18 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v18 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v18 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v17 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v20 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v20 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v19 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v22 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v22 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte 
Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v21 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v23 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v26 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v26 +; VI-NEXT: 
v_lshrrev_b32_e32 v48, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v30 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v30 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v31 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: s_branch .LBB73_5 +; VI-NEXT: .LBB73_3: +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; 
implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: 
killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed 
$sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB73_2 +; VI-NEXT: .LBB73_4: +; VI-NEXT: v_mov_b32_e32 v33, s71 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s69 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s68 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s67 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s66 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s65 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s64 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s55 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s54 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s87 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s53 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s52 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s86 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s51 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s50 +; VI-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s85 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s84 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s83 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s82 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s81 +; VI-NEXT: v_readlane_b32 s4, v62, 0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s80 +; VI-NEXT: v_mov_b32_e32 v34, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 2 +; VI-NEXT: v_mov_b32_e32 v38, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 3 +; VI-NEXT: v_mov_b32_e32 v52, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 5 +; VI-NEXT: v_mov_b32_e32 v37, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 6 +; VI-NEXT: v_mov_b32_e32 v43, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 7 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 8 +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 9 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 10 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 11 +; 
VI-NEXT: v_mov_b32_e32 v41, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 12 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 13 +; VI-NEXT: v_mov_b32_e32 v44, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 14 +; VI-NEXT: v_mov_b32_e32 v51, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 15 +; VI-NEXT: v_mov_b32_e32 v45, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 16 +; VI-NEXT: v_mov_b32_e32 v47, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 17 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 18 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 19 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 20 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 21 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 22 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 23 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 24 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 25 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], 
s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 27 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 28 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 29 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 30 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 31 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 32 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 33 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 34 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 35 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 36 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 37 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 38 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: 
v_readlane_b32 s4, v62, 39 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 40 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 41 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 42 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_mov_b32_e32 v40, s48 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s38 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s36 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s34 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s90 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s88 +; VI-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s78 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s76 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s74 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s72 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s62 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s58 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s56 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_readlane_b32 s4, v62, 43 +; VI-NEXT: v_mov_b32_e32 v53, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 44 +; VI-NEXT: v_mov_b32_e32 v56, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 45 +; 
VI-NEXT: v_mov_b32_e32 v57, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 46 +; VI-NEXT: v_mov_b32_e32 v58, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 47 +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 48 +; VI-NEXT: v_mov_b32_e32 v54, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 49 +; VI-NEXT: v_mov_b32_e32 v48, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 50 +; VI-NEXT: v_mov_b32_e32 v59, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 51 +; VI-NEXT: v_mov_b32_e32 v60, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 52 +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 53 +; VI-NEXT: v_mov_b32_e32 v49, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 54 +; VI-NEXT: v_mov_b32_e32 v61, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 55 +; VI-NEXT: v_mov_b32_e32 v36, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 56 +; VI-NEXT: v_mov_b32_e32 v40, s46 +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 57 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v12, s5 +; VI-NEXT: v_mov_b32_e32 v1, s44 +; VI-NEXT: v_mov_b32_e32 v2, s45 +; VI-NEXT: v_mov_b32_e32 v3, s42 +; VI-NEXT: v_mov_b32_e32 v4, s43 +; VI-NEXT: v_mov_b32_e32 v5, s40 +; VI-NEXT: v_mov_b32_e32 v6, s41 +; VI-NEXT: v_mov_b32_e32 v7, s14 +; VI-NEXT: v_mov_b32_e32 v8, s15 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v13, s10 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v17, s6 +; VI-NEXT: v_mov_b32_e32 v18, s7 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v20, s29 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v26, s23 +; VI-NEXT: v_mov_b32_e32 v27, s20 +; VI-NEXT: v_mov_b32_e32 
v28, s21 +; VI-NEXT: v_mov_b32_e32 v29, s18 +; VI-NEXT: v_mov_b32_e32 v30, s19 +; VI-NEXT: v_mov_b32_e32 v31, s16 +; VI-NEXT: v_mov_b32_e32 v32, s17 +; VI-NEXT: v_mov_b32_e32 v42, s70 +; VI-NEXT: v_mov_b32_e32 v50, s4 +; VI-NEXT: v_mov_b32_e32 v40, v43 +; VI-NEXT: v_mov_b32_e32 v46, v38 +; VI-NEXT: v_mov_b32_e32 v38, v34 +; VI-NEXT: .LBB73_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 +; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v50 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_readlane_b32 s87, v63, 31 +; VI-NEXT: v_readlane_b32 s86, v63, 30 +; VI-NEXT: v_readlane_b32 s85, v63, 29 +; VI-NEXT: v_readlane_b32 s84, v63, 28 +; VI-NEXT: v_readlane_b32 s83, v63, 27 +; VI-NEXT: v_readlane_b32 s82, v63, 26 +; VI-NEXT: v_readlane_b32 s81, v63, 25 +; VI-NEXT: v_readlane_b32 s80, v63, 24 +; VI-NEXT: v_readlane_b32 s71, v63, 23 +; VI-NEXT: v_readlane_b32 s70, v63, 22 +; VI-NEXT: v_readlane_b32 s69, v63, 21 +; VI-NEXT: v_readlane_b32 s68, v63, 20 +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: 
v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v50, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v36 +; VI-NEXT: v_or_b32_sdwa v31, v55, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v32, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v31, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v61 +; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v39 +; VI-NEXT: v_or_b32_sdwa v30, v49, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v30, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; VI-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v29, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v60 +; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v48 +; VI-NEXT: v_or_b32_sdwa v28, v59, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v28, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; VI-NEXT: v_or_b32_sdwa v25, v25, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v27, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v54 +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v58 +; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v26, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; VI-NEXT: v_or_b32_sdwa v23, v23, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v25, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v57 +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v53 +; VI-NEXT: v_or_b32_sdwa v24, v56, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; VI-NEXT: v_or_b32_sdwa v21, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v23, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v22, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v21, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_or_b32_sdwa v20, v21, v20 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v20, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_or_b32_sdwa v17, v17, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v19, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 +; VI-NEXT: 
buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 
offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; VI-NEXT: v_or_b32_sdwa v10, v13, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_or_b32_sdwa v9, v38, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, 
v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v52 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v47 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x6c, v0 +; VI-NEXT: 
buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v33 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f64_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_writelane_b32 v63, s64, 16 +; GFX9-NEXT: v_writelane_b32 v63, s65, 17 +; GFX9-NEXT: v_writelane_b32 v63, s66, 18 +; GFX9-NEXT: v_writelane_b32 v63, s67, 19 +; GFX9-NEXT: v_writelane_b32 v63, s68, 20 +; GFX9-NEXT: v_writelane_b32 v63, s69, 21 +; GFX9-NEXT: v_writelane_b32 v63, s70, 22 +; GFX9-NEXT: v_writelane_b32 v63, s71, 23 +; GFX9-NEXT: v_writelane_b32 v63, s80, 24 +; GFX9-NEXT: v_writelane_b32 v63, s81, 25 +; GFX9-NEXT: v_writelane_b32 v63, s82, 26 +; GFX9-NEXT: v_writelane_b32 v63, s83, 27 +; GFX9-NEXT: v_writelane_b32 v63, s84, 
28 +; GFX9-NEXT: v_writelane_b32 v63, s85, 29 +; GFX9-NEXT: v_writelane_b32 v63, s86, 30 +; GFX9-NEXT: v_writelane_b32 v63, s87, 31 +; GFX9-NEXT: v_writelane_b32 v63, s96, 32 +; GFX9-NEXT: v_writelane_b32 v63, s97, 33 +; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v63, s99, 35 +; GFX9-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s7, v2 +; GFX9-NEXT: v_readfirstlane_b32 s8, v3 +; GFX9-NEXT: v_readfirstlane_b32 s9, v4 +; GFX9-NEXT: v_readfirstlane_b32 s10, v5 +; GFX9-NEXT: v_readfirstlane_b32 s11, v6 +; GFX9-NEXT: v_readfirstlane_b32 s12, v7 +; GFX9-NEXT: v_readfirstlane_b32 s13, v8 +; GFX9-NEXT: v_readfirstlane_b32 s14, v9 +; GFX9-NEXT: v_readfirstlane_b32 s15, v10 +; GFX9-NEXT: v_readfirstlane_b32 s40, v11 +; GFX9-NEXT: v_readfirstlane_b32 s41, v12 +; GFX9-NEXT: v_readfirstlane_b32 s42, v13 +; GFX9-NEXT: v_readfirstlane_b32 s43, v14 +; GFX9-NEXT: v_readfirstlane_b32 s44, v15 +; GFX9-NEXT: v_readfirstlane_b32 s45, v16 +; GFX9-NEXT: v_readfirstlane_b32 s4, v17 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 0 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 1 +; GFX9-NEXT: s_lshr_b32 s46, s45, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s45, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s43, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s43, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s43, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s41, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s41, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s15, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s15, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s15, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: v_writelane_b32 
v62, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s11, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s11, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s27, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s27, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s25, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s25, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s23, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s23, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s21, 8 +; GFX9-NEXT: 
v_writelane_b32 v62, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s19, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s19, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s17, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s17, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s17, 8 +; GFX9-NEXT: s_lshr_b32 s83, s44, 16 +; GFX9-NEXT: s_lshr_b32 s82, s44, 8 +; GFX9-NEXT: s_lshr_b32 s85, s42, 16 +; GFX9-NEXT: s_lshr_b32 s84, s42, 8 +; GFX9-NEXT: s_lshr_b32 s87, s40, 16 +; GFX9-NEXT: s_lshr_b32 s86, s40, 8 +; GFX9-NEXT: s_lshr_b32 s97, s14, 16 +; GFX9-NEXT: s_lshr_b32 s96, s14, 8 +; GFX9-NEXT: s_lshr_b32 s99, s12, 16 +; GFX9-NEXT: s_lshr_b32 s98, s12, 8 +; GFX9-NEXT: s_lshr_b32 s39, s10, 16 +; GFX9-NEXT: s_lshr_b32 s38, s10, 8 +; GFX9-NEXT: s_lshr_b32 s49, s8, 16 +; GFX9-NEXT: s_lshr_b32 s48, s8, 8 +; GFX9-NEXT: s_lshr_b32 s51, s6, 16 +; GFX9-NEXT: s_lshr_b32 s50, s6, 8 +; GFX9-NEXT: s_lshr_b32 s53, s28, 16 +; GFX9-NEXT: s_lshr_b32 s52, s28, 8 +; GFX9-NEXT: s_lshr_b32 s55, s26, 16 +; GFX9-NEXT: s_lshr_b32 s54, s26, 8 +; GFX9-NEXT: s_lshr_b32 s65, s24, 16 +; GFX9-NEXT: s_lshr_b32 s64, s24, 8 +; GFX9-NEXT: s_lshr_b32 s67, s22, 16 +; GFX9-NEXT: s_lshr_b32 s66, s22, 8 +; GFX9-NEXT: s_lshr_b32 s69, s20, 16 +; GFX9-NEXT: s_lshr_b32 s68, s20, 8 +; GFX9-NEXT: s_lshr_b32 s71, s18, 16 +; GFX9-NEXT: s_lshr_b32 s70, s18, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 49 +; GFX9-NEXT: s_lshr_b32 s81, s16, 16 +; GFX9-NEXT: s_lshr_b32 s80, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 
s[76:77], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB73_4 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[11:12], s[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[1:2], s[44:45], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], s[42:43], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], s[40:41], 1.0 +; GFX9-NEXT: v_add_f64 v[7:8], s[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], s[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], s[8:9], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] +; GFX9-NEXT: v_add_f64 v[23:24], s[6:7], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_add_f64 v[27:28], s[28:29], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_add_f64 v[31:32], s[26:27], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: v_add_f64 v[33:34], s[24:25], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 
offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX9-NEXT: v_add_f64 v[35:36], s[22:23], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_add_f64 v[37:38], s[20:21], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] +; GFX9-NEXT: v_add_f64 v[48:49], s[18:19], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[19:20] +; GFX9-NEXT: v_add_f64 v[50:51], s[16:17], 1.0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[23:24] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v37 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[27:28] +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v48 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[31:32] +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v20 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; 
GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[33:34] +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[35:36] +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v28 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[37:38] +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v32 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[48:49] +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v32 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[50:51] +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b32_e32 v15, 16, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, 
v5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GFX9-NEXT: 
buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v20 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v19 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v27 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v31 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v23 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v33 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v28 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v35 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b32_e32 v15, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 24, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 24, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v36 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v38 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v49 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v50 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: s_branch .LBB73_5 +; GFX9-NEXT: .LBB73_3: +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: 
; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; 
GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; 
implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB73_2 +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v15, s81 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s71 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s69 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s67 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s65 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s55 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s53 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s52 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_mov_b32_e32 v15, s51 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s50 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s49 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s48 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s39 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s38 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s99 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s98 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s97 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s96 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s87 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s86 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s85 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s84 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s83 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s82 +; GFX9-NEXT: v_readlane_b32 s4, v62, 0 +; GFX9-NEXT: 
buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 12 +; GFX9-NEXT: buffer_store_dword v15, off, 
s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 15 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 16 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 17 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 18 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 19 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 20 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 21 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 23 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 
offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 25 +; GFX9-NEXT: v_mov_b32_e32 v30, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 26 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 27 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 28 +; GFX9-NEXT: v_mov_b32_e32 v29, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 29 +; GFX9-NEXT: v_mov_b32_e32 v41, s66 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_mov_b32_e32 v40, s36 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s34 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s92 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 
4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s90 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s88 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s78 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s76 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s74 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s72 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s62 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s58 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 
offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 31 +; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 32 +; GFX9-NEXT: v_mov_b32_e32 v26, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 33 +; GFX9-NEXT: v_mov_b32_e32 v53, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 34 +; GFX9-NEXT: v_mov_b32_e32 v57, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 35 +; GFX9-NEXT: v_mov_b32_e32 v39, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 36 +; GFX9-NEXT: v_mov_b32_e32 v55, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 37 +; GFX9-NEXT: v_mov_b32_e32 v61, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 38 +; GFX9-NEXT: v_mov_b32_e32 v42, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 39 +; GFX9-NEXT: v_mov_b32_e32 v46, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 40 +; GFX9-NEXT: v_mov_b32_e32 v21, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 41 +; GFX9-NEXT: v_mov_b32_e32 v47, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 42 +; GFX9-NEXT: v_mov_b32_e32 v16, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 43 +; GFX9-NEXT: v_mov_b32_e32 v18, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 44 +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 45 +; GFX9-NEXT: v_mov_b32_e32 v58, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 46 +; GFX9-NEXT: v_mov_b32_e32 v22, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 47 +; GFX9-NEXT: v_mov_b32_e32 v59, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 48 +; GFX9-NEXT: v_mov_b32_e32 v60, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 49 +; GFX9-NEXT: v_mov_b32_e32 v40, s46 +; GFX9-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s44 +; 
GFX9-NEXT: v_mov_b32_e32 v2, s45 +; GFX9-NEXT: v_mov_b32_e32 v3, s42 +; GFX9-NEXT: v_mov_b32_e32 v4, s43 +; GFX9-NEXT: v_mov_b32_e32 v5, s40 +; GFX9-NEXT: v_mov_b32_e32 v6, s41 +; GFX9-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-NEXT: v_mov_b32_e32 v8, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v13, s10 +; GFX9-NEXT: v_mov_b32_e32 v14, s11 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v20, s9 +; GFX9-NEXT: v_mov_b32_e32 v23, s6 +; GFX9-NEXT: v_mov_b32_e32 v24, s7 +; GFX9-NEXT: v_mov_b32_e32 v27, s28 +; GFX9-NEXT: v_mov_b32_e32 v28, s29 +; GFX9-NEXT: v_mov_b32_e32 v31, s26 +; GFX9-NEXT: v_mov_b32_e32 v32, s27 +; GFX9-NEXT: v_mov_b32_e32 v33, s24 +; GFX9-NEXT: v_mov_b32_e32 v34, s25 +; GFX9-NEXT: v_mov_b32_e32 v35, s22 +; GFX9-NEXT: v_mov_b32_e32 v36, s23 +; GFX9-NEXT: v_mov_b32_e32 v37, s20 +; GFX9-NEXT: v_mov_b32_e32 v38, s21 +; GFX9-NEXT: v_mov_b32_e32 v48, s18 +; GFX9-NEXT: v_mov_b32_e32 v49, s19 +; GFX9-NEXT: v_mov_b32_e32 v50, s16 +; GFX9-NEXT: v_mov_b32_e32 v51, s17 +; GFX9-NEXT: v_mov_b32_e32 v56, s80 +; GFX9-NEXT: v_mov_b32_e32 v45, s70 +; GFX9-NEXT: v_mov_b32_e32 v43, s68 +; GFX9-NEXT: v_mov_b32_e32 v54, s64 +; GFX9-NEXT: v_mov_b32_e32 v52, s54 +; GFX9-NEXT: v_mov_b32_e32 v25, s4 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB73_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v36, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v33, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v34, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 
v36, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v25, v51, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v49, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v37, v37, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v58, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v35, v35, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_readlane_b32 s99, v63, 35 +; GFX9-NEXT: v_readlane_b32 s98, v63, 34 +; GFX9-NEXT: v_readlane_b32 s97, v63, 33 +; GFX9-NEXT: 
v_readlane_b32 s96, v63, 32 +; GFX9-NEXT: v_readlane_b32 s87, v63, 31 +; GFX9-NEXT: v_readlane_b32 s86, v63, 30 +; GFX9-NEXT: v_readlane_b32 s85, v63, 29 +; GFX9-NEXT: v_readlane_b32 s84, v63, 28 +; GFX9-NEXT: v_readlane_b32 s83, v63, 27 +; GFX9-NEXT: v_readlane_b32 s82, v63, 26 +; GFX9-NEXT: v_readlane_b32 s81, v63, 25 +; GFX9-NEXT: v_readlane_b32 s80, v63, 24 +; GFX9-NEXT: v_readlane_b32 s71, v63, 23 +; GFX9-NEXT: v_readlane_b32 s70, v63, 22 +; GFX9-NEXT: v_readlane_b32 s69, v63, 21 +; GFX9-NEXT: v_readlane_b32 s68, v63, 20 +; GFX9-NEXT: v_readlane_b32 s67, v63, 19 +; GFX9-NEXT: v_readlane_b32 s66, v63, 18 +; GFX9-NEXT: v_readlane_b32 s65, v63, 17 +; GFX9-NEXT: v_readlane_b32 s64, v63, 16 +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v29, v19, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v30 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v49 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v37, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v35, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v16, v46, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v33, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v16, v55, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v31, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v32, v16 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v27, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v28, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 ; 
4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v20, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v13, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, 
v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: 
v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte 
Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v16f64_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:72 +; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_clause 0x11 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:56 +; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v44, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s102, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s69, 21 +; GFX11-TRUE16-NEXT: 
v_writelane_b32 v74, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v74, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s40, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s14, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s10, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s8, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s6, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s4, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s4, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 
s84, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s24, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s22, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s18, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s18, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s16, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s3, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s1, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s0, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 +; 
GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 15 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s42, s27, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-TRUE16-NEXT: 
v_writelane_b32 v77, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-TRUE16-NEXT: .LBB73_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], s[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[50:51], s[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], s[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], s[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[54:55], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], s[40:41], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], s[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], s[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], s[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[23:24], s[28:29], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[27:28], s[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[31:32], s[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[37:38], s[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[66:67], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[54:55] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 
v[52:53], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[37:38] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[66:67] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 16, v19 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 8, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 24, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 16, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 8, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 24, v55 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v61, 16, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 16, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 16, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 16, v66 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v66 +; GFX11-TRUE16-NEXT: s_branch .LBB73_5 +; GFX11-TRUE16-NEXT: .LBB73_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 11 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 18 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 25 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s43, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s43, 8 +; GFX11-TRUE16-NEXT: s_branch .LBB73_2 +; GFX11-TRUE16-NEXT: .LBB73_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v9.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v76, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, s35 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v43.l, s103 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, s104 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, s101 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, s102 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.l, s99 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.l, s100 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.l, s97 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 30 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.l, s98 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.l, s87 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.l, s96 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, s85 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 29 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, s86 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, s83 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, s84 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, s81 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 28 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, s82 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, s71 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, s80 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, s69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, s70 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, s67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, s68 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, s65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, s66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s54 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s62 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 18 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, s0 +; 
GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 16 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 14 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 12 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 10 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, s0 
+; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, s0 +; GFX11-TRUE16-NEXT: .LBB73_5: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 8, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v66, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v47, 8, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v43, 0xff, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, v83, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v45, v54, v45 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v43, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v43, 0xff, v61 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v67, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xff, v72 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v183, 0xff, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v66, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, v83, v47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xff, v55 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v47, 8, v60 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, v183, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v82, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, v43, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v56, v66, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v50, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v59 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v57, v66, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v51, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v82, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v180 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, v50, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v82, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 8, v42 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v37, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v50, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v66 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v38, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v70, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v167 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v67, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, v37, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, v38, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xff, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v177 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v176 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v66, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v69, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; 
GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[54:57], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[80:83], off offset:16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v66, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v67, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v165 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xff, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v38, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v54, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v27, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xff, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xff, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v50, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v54, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v66, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v32, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v27, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v28, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, v23, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v145 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v19, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v32, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v116 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v128 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v117 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, 
v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v35, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v38, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v23, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, v19, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, v20, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v15, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v103 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v102 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v23, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v100 
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v24, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, v13, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, v14, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v8, v8, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v19, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v20, v17 +; GFX11-TRUE16-NEXT: s_clause 0x5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[66:69], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[80:83], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[31:34], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; 
GFX11-TRUE16-NEXT: s_clause 0x11 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:68 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v75, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v75, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v75, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v75, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v75, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v75, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v75, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v75, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v75, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v74, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v74, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v74, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v74, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v74, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v74, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v74, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v74, 
24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v74, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v74, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v74, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v74, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v74, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v74, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v74, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v74, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v74, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v74, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v74, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v74, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v74, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v74, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v74, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v74, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v74, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v74, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v74, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v74, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v74, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v74, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v74, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v74, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, 
s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:92 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:60 +; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s102, 6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s68, 
20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s41, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s41, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s40, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s15, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s13, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s12, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s8, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 
16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s8, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s6, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s28, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s26, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s22, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s20, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 +; GFX11-FAKE16-NEXT: 
s_lshr_b64 s[76:77], s[10:11], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 
8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 6 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v79, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-FAKE16-NEXT: .LBB73_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[23:24], s[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], s[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[32:33], s[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], s[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[36:37], s[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[52:53], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[1:2], s[40:41], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], s[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], s[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], s[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], s[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], s[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], s[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[48:49], s[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[64:65], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[28:29] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, 
v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[48:49] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[64:65] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v14 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 24, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 24, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 16, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v48 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v182, 8, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 8, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v65 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v65 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v65 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v64 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v64 +; GFX11-FAKE16-NEXT: s_branch .LBB73_5 +; GFX11-FAKE16-NEXT: .LBB73_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr43 +; GFX11-FAKE16-NEXT: s_branch .LBB73_2 +; GFX11-FAKE16-NEXT: .LBB73_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s0 :: v_dual_mov_b32 v65, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s40 :: v_dual_mov_b32 v2, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s14 :: v_dual_mov_b32 v4, s15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v87, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v39, s54 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s12 :: v_dual_mov_b32 v6, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s10 :: v_dual_mov_b32 v8, s11 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v96, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s8 :: v_dual_mov_b32 v10, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s6 :: v_dual_mov_b32 v12, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v99, s0 +; 
GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v55, s53 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s4 :: v_dual_mov_b32 v14, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v100, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s26 :: v_dual_mov_b32 v20, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s24 :: v_dual_mov_b32 v24, s25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v101, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s51 :: v_dual_mov_b32 v28, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, s23 :: v_dual_mov_b32 v32, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s21 :: v_dual_mov_b32 v112, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s49 :: v_dual_mov_b32 v36, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s19 :: v_dual_mov_b32 v48, s16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v114, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s48 :: v_dual_mov_b32 v52, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, s3 :: v_dual_mov_b32 v44, s35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v41, s104 :: v_dual_mov_b32 v116, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s34 :: v_dual_mov_b32 v43, s103 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v181, s102 :: v_dual_mov_b32 v182, s101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v119, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v51, s39 :: v_dual_mov_b32 v176, s100 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v177, s99 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v163, s98 :: v_dual_mov_b32 v160, s96 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v128, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v165, s97 :: v_dual_mov_b32 v148, s86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v161, s87 :: v_dual_mov_b32 v144, s83 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v129, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 11 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v71, s38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v149, s85 :: v_dual_mov_b32 v130, s82 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v135, s84 :: v_dual_mov_b32 v118, s71 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v132, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v131, s81 :: v_dual_mov_b32 v102, s68 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v117, s80 :: v_dual_mov_b32 v98, s65 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v133, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v83, s37 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v113, s70 :: v_dual_mov_b32 v84, s64 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v115, s69 :: v_dual_mov_b32 v86, s55 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v134, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v103, s67 :: v_dual_mov_b32 v18, s52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s66 :: v_dual_mov_b32 v22, s50 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v145, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 15 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v85, s36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s42 :: v_dual_mov_b32 v38, s90 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v69, s56 :: v_dual_mov_b32 v34, s88 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v146, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s60 :: v_dual_mov_b32 v30, s78 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s76 :: v_dual_mov_b32 v25, s74 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v147, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 17 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v21, s72 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s62 :: v_dual_mov_b32 v80, s44 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v70, s46 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v150, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 18 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v68, s58 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v66, s30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v54, s94 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s92 :: v_dual_mov_b32 v151, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v162, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v164, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v166, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v167, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v178, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v179, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v180, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 26 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v183, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v40, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v42, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v45, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v47, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 31 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v56, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v57, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v58, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v59, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v60, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v61, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v62, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v63, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v72, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v79, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v73, s0 +; GFX11-FAKE16-NEXT: .LBB73_5: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v64, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v41, 0xff, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, v82, v81 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v52, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, v41, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v65, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v72, v64, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, v82, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v73, v64, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v80 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v65, v82, v41 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v74, v52, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v48, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v81, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v59 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xff, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v75, v52, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v49, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v70, v80 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v43, v48, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v65, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v44, v48, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v37, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v64, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v163 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v49, 16, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v53, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v45, v36, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v46, v37, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v179 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v178 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v37, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v52, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, v65, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v32, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, v33, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v28, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v70, v29, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v167 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v36, 0xff, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v36, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v151 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v37, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v52, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v148, v23, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v150, v19, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v151, v20, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v130, v15, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v16 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v149, v24, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v113 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v33, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 
0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v131, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v132, v13, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v133, v14, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v19, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v84 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v26, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v5, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, 
v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v18, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v21, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v5, v6 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[72:75], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[43:46], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v17 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[67:70], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[148:151], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[130:133], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:28 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:76 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v77, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v77, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v77, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v77, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v77, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v77, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v77, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v77, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v77, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v76, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v76, 30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v76, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v76, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v76, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v76, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v76, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v76, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v76, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v76, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v76, 21 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v76, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v76, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v76, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v76, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v76, 16 
+; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v76, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v76, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v76, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v76, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v76, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v76, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v76, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v76, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v76, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v76, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v76, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v76, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v76, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v76, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v76, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v76, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:92 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill 
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 -; GCN-NEXT: 
buffer_load_dword v16, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:164 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 -; GCN-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:212 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: 
v_lshlrev_b32_e32 v59, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v61, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:372 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 
v36, 8, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v39 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v47 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v54 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: buffer_load_dword v6, 
off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v37 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v53 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v41 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v63 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v50 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: 
v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:580 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v19, v49 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v20, v60 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v21, v58 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v22, v22, v62 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v23, v59 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v61 -; GCN-NEXT: 
buffer_load_dword v26, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_or_b32_e32 v26, v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v52 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_or_b32_e32 v28, v28, v35 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v33 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v44 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v57 -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 
16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v33, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v36, v35 -; GCN-NEXT: 
buffer_load_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v38, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v48, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v55, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v54, v49 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v50 -; GCN-NEXT: v_or_b32_e32 v19, v19, v51 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_or_b32_e32 v21, v21, v53 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_or_b32_e32 v25, v25, v35 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v37 -; GCN-NEXT: v_or_b32_e32 v28, v28, v38 -; GCN-NEXT: v_or_b32_e32 v29, v29, v39 -; GCN-NEXT: v_or_b32_e32 v30, v30, v48 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: 
$vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; 
implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr56 -; 
GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; 
implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB37_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v39, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v47, v3 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v54, v4 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v37, v6 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v45, v8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: 
v_or_b32_e32 v9, v53, v9 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v42, v10 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v41, v11 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v40, v12 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v63, v13 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v50, v14 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v0, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v0, v16 -; GCN-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v0, v17 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v51, v18 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v49, v19 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v60, v20 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v58, v21 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v25, v62, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v29, v59, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: 
v_or_b32_e32 v37, v32, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v50, v61, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v41, v34, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v45, v52, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v56, v35, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v58, v33, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v59, v44, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v57 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v57, v36, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v60, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 
s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v23, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; 
GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; 
GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v48, v35 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v53, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_mov_b32_e32 v0, v55 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v55, v53 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v40, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v44, v43 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v0, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v0, v47 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v60, v0 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v61, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v63, v3 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 -; 
GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v21 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v37 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s7, v50 -; GCN-NEXT: v_add_i32_e32 v41, vcc, s7, v41 -; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v45 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s7, v56 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v59 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x300, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 -; GCN-NEXT: v_or_b32_e32 v4, v36, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v39, v6 -; GCN-NEXT: v_or_b32_e32 v7, v49, v7 -; GCN-NEXT: v_or_b32_e32 v8, v51, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; 
GCN-NEXT: v_or_b32_e32 v10, v54, v10 -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: v_or_b32_e32 v12, v23, v12 -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: v_or_b32_e32 v15, v27, v15 -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v31, v18 -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: v_or_b32_e32 v20, v33, v20 -; GCN-NEXT: v_or_b32_e32 v21, v34, v21 -; GCN-NEXT: v_or_b32_e32 v22, v35, v25 -; GCN-NEXT: v_or_b32_e32 v23, v48, v29 -; GCN-NEXT: v_or_b32_e32 v24, v53, v37 -; GCN-NEXT: v_or_b32_e32 v25, v55, v50 -; GCN-NEXT: v_or_b32_e32 v26, v40, v41 -; GCN-NEXT: v_or_b32_e32 v27, v42, v45 -; GCN-NEXT: v_or_b32_e32 v28, v43, v56 -; GCN-NEXT: v_or_b32_e32 v29, v44, v58 -; GCN-NEXT: v_or_b32_e32 v30, v46, v59 -; GCN-NEXT: v_or_b32_e32 v31, v47, v57 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: 
v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 -; GCN-NEXT: .LBB37_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, 
off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 
offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:144 +; SI-NEXT: 
buffer_load_dword v52, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v16 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, 
s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, 
s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, 
s[0:3], s32 offset:324 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 
offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; 
SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v61 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 
+; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v46 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword 
v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: 
v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 
offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; 
SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; 
kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: 
killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB74_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; 
SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v54, v9 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 
16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 
s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v46, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; 
SI-NEXT: v_add_i32_e32 v10, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: 
v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:652 ; 4-byte 
Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, 
s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: 
v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 
+; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 +; SI-NEXT: .LBB74_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v16f64: ; VI: ; %bb.0: @@ -61647,7 +126419,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB37_2 +; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload @@ -62120,9 +126892,9 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB37_2: ; %Flow +; VI-NEXT: .LBB74_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB37_4 +; VI-NEXT: s_cbranch_execz .LBB74_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload @@ -62511,7 +127283,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
; VI-NEXT: v_or_b32_e32 v31, v32, v31 -; VI-NEXT: .LBB37_4: ; %end +; VI-NEXT: .LBB74_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -62883,4965 +127655,13858 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB37_2 +; GFX9-NEXT: s_cbranch_execz .LBB74_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt 
vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: s_waitcnt vmcnt(0) 
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload 
+; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, 
s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
+; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 
4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: 
$vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; 
GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: 
; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; 
implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: .LBB74_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB74_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, 
s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(27) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v56, 
v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa 
v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v5, 
3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v39, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v37, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 
v10, v10, v11 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) 
-; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v54 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v15, 
v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v17, v17, 
s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v18, v18, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v17, v17, v18 ; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, 
off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v19, v19, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v18, v18, v19 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; 
GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v21, v21, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v23, v23, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v22, v22, v23 ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 ; GFX9-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v25, 3, 
v25 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v25 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte 
Folded Reload +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, 
off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: 
$vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; 
GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: 
; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; 
implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: .LBB37_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB37_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX9-NEXT: .LBB74_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 
offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v49, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 
offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:36 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v114, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:20 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v50.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l +; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(40) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_4 +; GFX11-TRUE16-NEXT: .LBB74_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB74_3: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v148.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v150.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v151.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v135.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v5.l, v2.h, v146.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v135.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v146.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v147.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v132.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v147.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v133.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v130.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v133.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v134.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v118.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v128.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v134.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v128.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v129.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v130.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v113.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v116.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v129.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v101.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v116.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v117.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v103.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, 
v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, 
v22.l, v55.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, 
v29.l, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2 +; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, 
v151.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 
0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 +; 
GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, 
v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 
v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, 
s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 +; GFX11-FAKE16-NEXT: s_clause 
0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 
offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v179, off, s32 offset:152 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, 
s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:20 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v181 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB74_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v124 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v125 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v126 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v127 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v111 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v121 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v120 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v122 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v123 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v107 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v108 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v109 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v110 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v106 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v6, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v92 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v78 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v75 
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v94 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v95 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v104 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v105 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v79 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v91 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v13, v13, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, 
v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: .LBB74_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB74_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v53, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v52, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v51, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v50, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v124, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v125, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v126, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v127, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v49, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v48, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v36, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, 
v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v35, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v39, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v34, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v111, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v120, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v121, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v122, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v107, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v108, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v109, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v38, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v110, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v106, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 
v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v7, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v33, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v32, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v92, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v78, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v77, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v76, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v75, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v74, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v60, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v59, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v93, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v94, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v95, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v104, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v105, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v79, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v88, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v89, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v90, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v91, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 
0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v58, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 
0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v62, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v63, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v72, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v73, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v45, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v46, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v47, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v56, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v57, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v164, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, 
v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 
v36, v64, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 
0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-FAKE16-NEXT: .LBB74_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> 
%a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 
v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 24, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 +; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], 
s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB75_2 +; SI-NEXT: ; %bb.1: ; 
%cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 
; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v13, v0, 
v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: v_mov_b32_e32 v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v17, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_mov_b32_e32 v55, v22 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_mov_b32_e32 v44, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v19, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v21, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 +; SI-NEXT: v_mov_b32_e32 v59, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v22, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v25, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v29, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v30, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 
+; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB75_3 +; SI-NEXT: .LBB75_2: +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_mov_b32_e32 v54, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: .LBB75_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB75_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 
0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 
0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; 
SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: 
v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 
0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB75_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v16f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; VI-NEXT: 
buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v6, 
8, v13 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 
v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; 
VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: buffer_load_ushort 
v0, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v23, off, 
s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB75_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 
s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v3, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v29, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; 
VI-NEXT: v_mov_b32_e32 v59, v0 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; 
VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: v_mov_b32_e32 v34, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v54, v0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v48, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 
0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB75_3 +; VI-NEXT: .LBB75_2: +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v32, v54 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v49 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v34, v26 +; 
VI-NEXT: v_mov_b32_e32 v58, v44 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v63, v42 +; VI-NEXT: v_mov_b32_e32 v51, v7 +; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB75_3: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v44, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v46, v49 +; VI-NEXT: s_cbranch_vccnz .LBB75_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 
s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded 
Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, 
v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte 
Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: 
v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, 
v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; 
VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa 
v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB75_5: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v16f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 +; 
GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: buffer_load_ushort v1, off, 
s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 
8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 +; 
GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(42) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(28) -; 
GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: 
v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], 
s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 
.LBB75_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; 
GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v39, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 -; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v37, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 -; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 -; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 -; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 -; 
GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 -; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 -; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; 
GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 -; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 0x300, 
v13 -; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v54 -; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 -; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 -; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v18, v18, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v38 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: v_add_u16_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v19, v19, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v18, v18, v19 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 
4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v60 +; GFX9-NEXT: v_mov_b32_e32 v52, 
v56 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_mov_b32_e32 v34, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v21, v21, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword 
v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa 
v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v22 +; GFX9-NEXT: v_mov_b32_e32 v53, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 -; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v23, v23, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB75_3 +; GFX9-NEXT: .LBB75_2: +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_mov_b32_e32 v63, v57 +; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v57, v38 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB75_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB75_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: 
s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v23 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 -; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 -; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v25 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 -; 
GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v27 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 -; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 -; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v29 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; 
GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 
; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte 
Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 -; GFX9-NEXT: .LBB37_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB75_5: 
; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 
offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152 -; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:20 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v133.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, 
v88, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 
offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v131, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v40.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v69.l, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v100.h, 8, v161.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_3 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz 
.LBB37_4 -; GFX11-TRUE16-NEXT: .LBB37_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB37_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v148.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v150.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v151.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB75_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 
0xff, v144.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v135.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v146.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v132.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v135.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v146.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v132.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v147.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v133.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v133.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v128.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v134.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_or_b32 
s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, 
v114.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v114.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v130.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v116.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v116.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v117.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v103.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, 
v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, 
v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 
v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB37_2 -; GFX11-TRUE16-NEXT: .LBB37_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: 
v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v0, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v0, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 
s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB75_3 +; GFX11-TRUE16-NEXT: .LBB75_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; 
GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v4, 8, v88 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v7, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, 
v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 
v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 
v35.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:552 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v59, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:432 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v121, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:392 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, 
off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 
0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v14 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v27, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v27, v30, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v30, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB75_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 
offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB75_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB75_2 +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v115, 
off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 
offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: 
v_dual_mov_b32 v35, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v136, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v137, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v138, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v132, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v161, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v167, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:92 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:20 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 
offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v110, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, 
s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v43, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131 ; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v144 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v162 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v163 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v164 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v165 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v166 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v179, 8, v179 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v180, 8, v180 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v181, 8, v181 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v182 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v183, 8, v183 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v162, 8, v136 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v163, 8, v137 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v164, 8, v138 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v165, 8, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v166, 8, v102 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v144, 8, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v145, 8, v100 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v146, 8, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v147, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v148, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v119, 8, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v129, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v130, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v131, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v114, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v116, 8, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v118, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v99, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v101, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v102, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v103, 8, v0 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: 
s_cbranch_execz .LBB37_2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB75_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 
v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v1, v1, v42 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v124 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v125 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v126 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v127 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v2, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB75_3 +; GFX11-FAKE16-NEXT: .LBB75_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 
0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v55 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v54 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v52 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v90, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v92, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v93, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 
v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v88, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v74, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v75, v12 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, 
v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v79, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v63, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v72, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v73, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v111 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v121 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v120 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v122 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v123 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v107 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v108 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v109 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v110 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v106 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v6, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v92 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v78 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v77 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v76 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v94 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v95 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v104 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v105 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v12, v12, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v90 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v91 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v44 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v41 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v61 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v47 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v56 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v160 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v183 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v162 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v165 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 
v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v145 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v147 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v119 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v167, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v22, 3, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v114 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v99 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v101 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v102 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 
0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v117, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v113, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: .LBB37_2: ; %Flow +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v34, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: .LBB75_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 
offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 +; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB75_4: +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB75_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v16f64_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; 
implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz 
.LBB76_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v11 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v10 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v7 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v5 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 +; SI-NEXT: 
v_lshlrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 
offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, 
v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v11 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v10 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v7 +; 
SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v5 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: 
.LBB76_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v63 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v62 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], 
s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f64_to_v64bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB76_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: .LBB76_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f64_to_v64bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB76_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: .LBB76_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f64_to_v64bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB76_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; 
GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: .LBB76_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: 
v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s45, v2 +; SI-NEXT: v_readfirstlane_b32 s42, v3 +; SI-NEXT: v_readfirstlane_b32 s43, v4 +; SI-NEXT: v_readfirstlane_b32 s40, v5 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v17 +; SI-NEXT: s_and_b64 
s[46:47], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s7, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB77_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s46, s17, 0xffff0000 +; SI-NEXT: v_writelane_b32 v62, s46, 0 +; SI-NEXT: s_lshl_b32 s46, s17, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 1 +; SI-NEXT: s_and_b32 s46, s16, 0xffff0000 +; SI-NEXT: v_writelane_b32 v62, s46, 2 +; SI-NEXT: s_lshl_b32 s46, s16, 16 +; SI-NEXT: s_and_b32 s59, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s58, s7, 16 +; SI-NEXT: s_and_b32 s57, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s6, 16 +; SI-NEXT: s_and_b32 s99, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s98, s5, 16 +; SI-NEXT: s_and_b32 s97, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s96, s4, 16 +; SI-NEXT: s_and_b32 s87, s9, 0xffff0000 
+; SI-NEXT: s_lshl_b32 s86, s9, 16 +; SI-NEXT: s_and_b32 s85, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s84, s8, 16 +; SI-NEXT: s_and_b32 s83, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s82, s11, 16 +; SI-NEXT: s_and_b32 s81, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s80, s10, 16 +; SI-NEXT: s_and_b32 s71, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s13, 16 +; SI-NEXT: s_and_b32 s69, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s12, 16 +; SI-NEXT: s_and_b32 s67, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s15, 16 +; SI-NEXT: s_and_b32 s65, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s14, 16 +; SI-NEXT: s_and_b32 s55, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s41, 16 +; SI-NEXT: s_and_b32 s53, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s40, 16 +; SI-NEXT: s_and_b32 s51, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s43, 16 +; SI-NEXT: s_and_b32 s49, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s48, s42, 16 +; SI-NEXT: s_and_b32 s39, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s38, s45, 16 +; SI-NEXT: s_and_b32 s37, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s44, 16 +; SI-NEXT: s_and_b32 s35, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s29, 16 +; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s28, 16 +; SI-NEXT: s_and_b32 s95, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s27, 16 +; SI-NEXT: s_and_b32 s93, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s26, 16 +; SI-NEXT: s_and_b32 s91, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s25, 16 +; SI-NEXT: s_and_b32 s89, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s24, 16 +; SI-NEXT: s_and_b32 s79, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s23, 16 +; SI-NEXT: s_and_b32 s77, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s22, 16 +; SI-NEXT: s_and_b32 s75, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s21, 16 +; SI-NEXT: s_and_b32 s73, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s20, 16 +; SI-NEXT: s_and_b32 s63, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s19, 16 +; SI-NEXT: s_and_b32 s61, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, 
s18, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 3 +; SI-NEXT: s_cbranch_execnz .LBB77_4 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[2:3], s[20:21], 1.0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_f64 v[51:52], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 +; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 +; SI-NEXT: 
v_add_f64 v[27:28], s[40:41], 1.0 +; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 +; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v50 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v49 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, 
v2 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v60 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v60 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v59 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_branch .LBB77_5 +; SI-NEXT: .LBB77_3: +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; 
implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: s_branch .LBB77_2 +; SI-NEXT: .LBB77_4: +; SI-NEXT: v_mov_b32_e32 v1, s71 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s69 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s68 +; SI-NEXT: v_mov_b32_e32 v61, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 2 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 3 +; SI-NEXT: v_mov_b32_e32 v5, s59 +; SI-NEXT: v_mov_b32_e32 v4, s58 +; SI-NEXT: v_mov_b32_e32 v9, s57 +; SI-NEXT: v_mov_b32_e32 v6, s56 +; SI-NEXT: v_mov_b32_e32 v13, s99 +; SI-NEXT: v_mov_b32_e32 v10, s98 +; SI-NEXT: v_mov_b32_e32 v17, s97 +; SI-NEXT: v_mov_b32_e32 v14, s96 +; SI-NEXT: v_mov_b32_e32 v21, s87 +; SI-NEXT: v_mov_b32_e32 v18, s86 +; SI-NEXT: v_mov_b32_e32 v25, s85 +; SI-NEXT: v_mov_b32_e32 v22, s84 +; SI-NEXT: v_mov_b32_e32 v29, s83 +; SI-NEXT: 
v_mov_b32_e32 v26, s82 +; SI-NEXT: v_mov_b32_e32 v33, s81 +; SI-NEXT: v_mov_b32_e32 v30, s80 +; SI-NEXT: v_mov_b32_e32 v34, s70 +; SI-NEXT: v_mov_b32_e32 v8, s67 +; SI-NEXT: v_mov_b32_e32 v7, s66 +; SI-NEXT: v_mov_b32_e32 v24, s65 +; SI-NEXT: v_mov_b32_e32 v23, s64 +; SI-NEXT: v_mov_b32_e32 v16, s55 +; SI-NEXT: v_mov_b32_e32 v15, s54 +; SI-NEXT: v_mov_b32_e32 v28, s53 +; SI-NEXT: v_mov_b32_e32 v27, s52 +; SI-NEXT: v_mov_b32_e32 v12, s51 +; SI-NEXT: v_mov_b32_e32 v11, s50 +; SI-NEXT: v_mov_b32_e32 v32, s49 +; SI-NEXT: v_mov_b32_e32 v31, s48 +; SI-NEXT: v_mov_b32_e32 v20, s39 +; SI-NEXT: v_mov_b32_e32 v19, s38 +; SI-NEXT: v_mov_b32_e32 v36, s37 +; SI-NEXT: v_mov_b32_e32 v35, s36 +; SI-NEXT: v_mov_b32_e32 v38, s35 +; SI-NEXT: v_mov_b32_e32 v37, s34 +; SI-NEXT: v_mov_b32_e32 v48, s31 +; SI-NEXT: v_mov_b32_e32 v39, s30 +; SI-NEXT: v_mov_b32_e32 v50, s95 +; SI-NEXT: v_mov_b32_e32 v49, s94 +; SI-NEXT: v_mov_b32_e32 v52, s93 +; SI-NEXT: v_mov_b32_e32 v51, s92 +; SI-NEXT: v_mov_b32_e32 v54, s91 +; SI-NEXT: v_mov_b32_e32 v53, s90 +; SI-NEXT: v_mov_b32_e32 v40, s89 +; SI-NEXT: v_mov_b32_e32 v55, s88 +; SI-NEXT: v_mov_b32_e32 v42, s79 +; SI-NEXT: v_mov_b32_e32 v41, s78 +; SI-NEXT: v_mov_b32_e32 v43, s77 +; SI-NEXT: v_mov_b32_e32 v44, s76 +; SI-NEXT: v_mov_b32_e32 v46, s75 +; SI-NEXT: v_mov_b32_e32 v45, s74 +; SI-NEXT: v_mov_b32_e32 v47, s73 +; SI-NEXT: v_mov_b32_e32 v56, s72 +; SI-NEXT: v_mov_b32_e32 v58, s63 +; SI-NEXT: v_mov_b32_e32 v57, s62 +; SI-NEXT: v_mov_b32_e32 v60, s61 +; SI-NEXT: v_mov_b32_e32 v59, s60 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: .LBB77_5: ; %end +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; 
SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 +; SI-NEXT: 
v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: 
v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f64_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB77_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB77_3 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 
1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: .LBB77_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_4: +; VI-NEXT: s_branch .LBB77_2 +; +; GFX9-LABEL: bitcast_v16f64_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB77_3 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; 
GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: .LBB77_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v33 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: s_branch .LBB77_2 +; +; GFX11-LABEL: bitcast_v16f64_to_v64bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; 
GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB77_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB77_3: +; GFX11-NEXT: .LBB77_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { +; SI-LABEL: bitcast_v64bf16_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: 
buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, 
s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; SI-NEXT: ; kill: killed $vgpr33 +; 
SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 +; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 +; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 +; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 +; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, 
s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: 
; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB78_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 
0xffff0000, v56 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, 
v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; 
SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: buffer_load_dword 
v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB78_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v16f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB78_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: 
v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 
0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, 
v33 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: 
v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: 
v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 
+; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: 
v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: 
v_alignbit_b32 v25, v25, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: 
v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: 
v_alignbit_b32 v20, v20, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: 
v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: .LBB78_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v16f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB78_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: 
s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 
v13, v13, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: 
v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 
vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, 
v30, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 
v33, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: 
v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: 
v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 +; GFX9-NEXT: .LBB78_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: 
v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 
0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: 
v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 +; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v1, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 
0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 
0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 
16, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v21.l, v21.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: 
v_bfi_b32 v18, 0xffff, v18, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 +; GFX11-TRUE16-NEXT: .LBB78_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB37_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v55, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v54, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v53, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v52, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v51, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v50, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v124, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v125, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v126, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v127, v3 -; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v6, v49, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v48, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v36, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v35, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v39, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v34, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v111, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v120, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v121, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v122, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v107, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v108, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v109, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v38, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v110, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v106, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v7, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v33, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v32, 3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v92, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v78, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, v77, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v76, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v75, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v74, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v60, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v59, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v93, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v94, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v95, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v104, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v105, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v79, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v88, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v89, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v90, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v91, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v58, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, v44, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v43, 3 -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v15, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v41, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v40, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v178, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v177, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v176, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v167, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v62, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v63, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v72, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v73, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v45, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v46, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v47, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v56, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v57, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v161, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v160, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, v150, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, v149, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v133, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v132, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v113, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v180, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v181, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v182, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v183, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v162, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v163, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 
v164, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v165, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v166, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v23, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v25, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, v112, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, v97, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, v87, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v86, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v84, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v83, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v82, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v24, 0xff, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v145, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v146, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v147, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v148, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v119, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v128, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v129, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v130, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v131, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v23 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v23, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v28, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v30, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, v80, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, v71, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, v70, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, v69, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v68, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, v66, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v65, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v64, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v114, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v115, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v116, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v117, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v118, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v100, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v101, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v102, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v103, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 
0x300, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v33, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 -; GFX11-FAKE16-NEXT: .LBB37_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:444 -; GFX11-FAKE16-NEXT: 
scratch_load_b32 v108, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 
offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:584 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:588 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:592 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB78_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, 
v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 +; 
GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 
v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; 
GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - 
-cmp.true: - %a1 = add <128 x i8> %a, splat (i8 3) - %a2 = bitcast <128 x i8> %a1 to <16 x double> - br label %end - -cmp.false: - %a3 = bitcast <128 x i8> %a to <16 x double> - br label %end - -end: - %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <16 x double> %phi -} - -define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; 
GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded 
Spill -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 
0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; 
GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v11 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v10 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v8 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v7 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v5 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; 
GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v2 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: .LBB38_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; GCN-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v11 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v10 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v8 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v7 -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v5 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v2 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 -; GCN-NEXT: 
v_lshlrev_b32_e32 v63, 16, v1 -; GCN-NEXT: .LBB38_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v60, v1, v2, 16 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v58, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v56, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 12, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 20, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: 
v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: buffer_load_dword 
v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; 
GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_alignbit_b32 v32, v32, v33, 16 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_alignbit_b32 v34, v34, v35, 16 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_alignbit_b32 v36, v36, v37, 16 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: buffer_load_dword v39, off, 
s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v38, v38, v39, 16 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_alignbit_b32 v48, v48, v49, 16 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_alignbit_b32 v50, v50, v51, 16 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_alignbit_b32 v52, v52, v53, 16 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: 
v_alignbit_b32 v54, v54, v55, 16 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_alignbit_b32 v40, v40, v41, 16 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_alignbit_b32 v42, v42, v43, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v60, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, 
s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v16f64_to_v64bf16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB38_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: .LBB38_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v16f64_to_v64bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB38_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: .LBB38_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v16f64_to_v64bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB38_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: 
v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 -; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: .LBB38_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 
v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: 
v_add3_u32 v35, v35, v26, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 
v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-FAKE16-NEXT: 
v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v21, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 
0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB78_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <16 x double> %a1 to <64 x bfloat> + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <16 x double> br label %end cmp.false: - %a3 = bitcast <16 x double> %a to <64 x bfloat> + %a3 = bitcast <64 x bfloat> %a to 
<16 x double> br label %end end: - %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x bfloat> %phi + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi } -define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; 
GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_mul_f32_e32 v0, 1.0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 
; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v33 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: 
v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v42 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v63 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; GCN-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; GCN-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; GCN-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v46 -; GCN-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v45, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v33, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v43 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v19, v19, v42, 16 -; GCN-NEXT: v_alignbit_b32 v20, v20, v44, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; GCN-NEXT: v_alignbit_b32 v22, v22, v48, 16 -; GCN-NEXT: v_alignbit_b32 v23, v23, v38, 16 -; GCN-NEXT: v_alignbit_b32 v24, v24, v50, 16 -; GCN-NEXT: v_alignbit_b32 v25, v25, v51, 16 -; GCN-NEXT: v_alignbit_b32 v26, v26, v53, 16 -; GCN-NEXT: v_alignbit_b32 v27, v27, v55, 16 -; GCN-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; GCN-NEXT: v_alignbit_b32 v29, v29, v63, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; 
implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; 
GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; kill: killed $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB39_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; GCN-NEXT: v_add_f32_e32 v2, 
0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, 
v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 
0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: 
v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: 
buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v54 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v41 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v43 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v28 
-; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v31 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v32 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v19, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v20, v39, v54, 16 -; GCN-NEXT: v_alignbit_b32 v21, v48, v40, 16 -; GCN-NEXT: v_alignbit_b32 v22, v49, v22, 16 -; GCN-NEXT: v_alignbit_b32 v23, v50, v23, 16 -; GCN-NEXT: v_alignbit_b32 v24, v51, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v32, v25, 16 -; GCN-NEXT: v_alignbit_b32 v26, v33, v26, 16 -; GCN-NEXT: v_alignbit_b32 v27, v34, v27, 16 -; GCN-NEXT: v_alignbit_b32 v28, v35, v28, 16 -; GCN-NEXT: v_alignbit_b32 v29, v36, v29, 16 -; GCN-NEXT: 
v_alignbit_b32 v30, v37, v30, 16 -; GCN-NEXT: v_alignbit_b32 v31, v38, v31, 16 -; GCN-NEXT: .LBB39_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt 
expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v14, v11 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 +; 
SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, 
off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 
+; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 +; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 +; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 +; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; 
SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v11 +; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 
v46, v12 +; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v14 +; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v14 +; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v53, v38 +; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, 
off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: 
v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: 
v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v9, 
0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: 
buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, 
s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], 
s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v57, v11 +; SI-NEXT: v_mov_b32_e32 v47, v10 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB79_2 ; -; VI-LABEL: bitcast_v64bf16_to_v16f64: +; VI-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB39_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: 
v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB79_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB79_3 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 -; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, 
v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; 
VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: 
v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 ; VI-NEXT: 
v_add_u32_e32 v33, vcc, v33, v10 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; 
VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, 
v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 
v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_add_f32_e32 
v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: 
v_cndmask_b32_e32 v2, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, 
v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 -; 
VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: 
v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: 
v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 
v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v23, 
v23, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 
0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; 
VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; 
VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 ; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc 
; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 @@ -67849,1619 +141514,2398 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB39_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: .LBB79_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_4: +; VI-NEXT: s_branch .LBB79_2 ; -; GFX9-LABEL: bitcast_v64bf16_to_v16f64: +; GFX9-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB39_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, 
v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB79_3 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc ; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v15 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 ; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 
v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v15, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v15, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v14, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v14 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v14, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v14, v33, 16, 
v14 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v13 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: 
v_bfe_u32 v33, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v12 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v12, v18, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v11 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 -; 
GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v33, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v10, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v10 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v10 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v10, v18, v10 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v9 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v33, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; 
GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v8 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v8 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v8, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v33, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v7 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; GFX9-NEXT: 
v_perm_b32 v7, v7, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v7, v18, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v6 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v6, v18, v6 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v5 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v5, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v33, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; 
GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v4 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v33, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v3 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; GFX9-NEXT: 
v_perm_b32 v3, v3, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v33, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v2 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v33, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v1 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v33, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; 
GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v31 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: 
v_cndmask_b32_e32 v31, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v31, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v31, v33, 16, v31 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v30 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 
v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v30, v18, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v29 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v29 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v29, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v29, v33, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v33, 
0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v28 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v28 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v28, v18, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v28, v33, 16, v28 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v27 +; GFX9-NEXT: 
v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v27 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v27, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v27, v33, 16, v27 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v26 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v26 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, 
v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v26, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v26, v33, 16, v26 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v25 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v25 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v25, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v25, v33, 16, v25 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v24 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v24 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v24, v18, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v24, v33, 16, v24 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 
v34, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v23 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v23 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v23, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v23, v33, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v22 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v22 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: 
v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v22, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v22, v33, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v21 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v21 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v21, v18, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v21, v33, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: 
v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v20 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v20 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v20, v33, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; 
GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v19 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v19 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v19, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v19, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 
v34, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v32 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v32 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v32, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v32, v33, 16, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v17 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v17, v33, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v33, 
0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 -; GFX9-NEXT: .LBB39_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v16 +; GFX9-NEXT: .LBB79_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: s_branch .LBB79_2 ; -; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64: +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB39_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v32, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v32, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v32.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37 -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, 
v33 -; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v32, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v14, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v38, v13, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, 
s32 offset:164 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, 
v157, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v183, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v182, v9 :: v_dual_mov_b32 v169, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v176, v3 :: v_dual_mov_b32 v175, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v173, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 
v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB79_3 +; GFX11-TRUE16-NEXT: .LBB79_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v12, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v12, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v32, v35, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; 
GFX11-TRUE16-NEXT: v_bfe_u32 v35, v11, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v11, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v33 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v10, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 
0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v32 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 
16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; 
GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 
1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v35, v37, vcc_lo -; 
GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, 
v17 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 
:: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: 
v_dual_add_nc_u32 v23, v23, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v30, 16, 
1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 ; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v34 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v32, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v31, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v30, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; 
GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v30, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v30 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v29 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v31, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v29, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v30, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v29, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v32, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v28, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v28, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v28 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v29.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v183 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v36, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v27, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v29, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v28.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v27, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v182 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; 
GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v27.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v26, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v27, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v26, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v25, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v37, vcc_lo -; 
GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v25, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v24, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v26, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v25.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v24, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v37, 0x400000, v24 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v25, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v183, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v182, v32, 16, v33 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v177 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo 
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v177 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v23, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v24, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v23, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v32, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, 
v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v23.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v23, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v22.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v35, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v176 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v177, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 
0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v32, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v21.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v21, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v33, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v176 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 
0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v19, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v39, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v19.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v18, 0x7fff 
-; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v20, v32 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v37, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v16 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v19, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v17, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v16, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v36, 0x7fff -; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v17, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v16, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v17.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v176, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v18, v34 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v48, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v17, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v38, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v36, v16 -; GFX11-TRUE16-NEXT: .LBB39_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16f64: -; GFX11-FAKE16: ; %bb.0: -; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB39_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v39, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v15, 16, 1 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v38, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v13, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v13, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; 
GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; 
GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v11, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v11, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 
0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v8, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v9, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, 
v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v181 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 
v48, 0x40c00000, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v181, v39, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-TRUE16-NEXT: .LBB79_3: ; %end +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 
v17, v172 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v181 :: v_dual_mov_b32 v19, v175 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v21, v176 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 
offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; 
GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-TRUE16-NEXT: s_clause 0x8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v177 :: v_dual_mov_b32 v27, v182 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v183 :: v_dual_mov_b32 v31, v178 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v30, v179 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB79_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-TRUE16-NEXT: s_branch .LBB79_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v78, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 +; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-FAKE16-NEXT: 
s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB79_3 +; GFX11-FAKE16-NEXT: .LBB79_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s5, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 
0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v6, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v6, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v7, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; 
GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 ; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: 
v_bfe_u32 v33, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 
0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; 
GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1 
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 
v136, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v31, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 
0x400000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v29 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v30, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: 
v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v29, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 
v37, 0x400000, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v28, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, 
v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v27, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v26, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; 
GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v23 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v33, v23, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-FAKE16-NEXT: 
v_bfe_u32 v35, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v21, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v32, 
v21, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v38, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v49, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v19, 
0x40c00000, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v37, v19, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v19 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v18, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; 
GFX11-FAKE16-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v17, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v17 -; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v16, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v36, 0x7fff ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v16, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB39_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 +; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 
v76, v11, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-FAKE16-NEXT: .LBB79_3: ; %end +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v105, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 +; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB79_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-FAKE16-NEXT: s_branch .LBB79_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -69480,740 +143924,766 @@ end: } define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: 
$vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; 
GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 
16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: 
v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; 
GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v49 -; GCN-NEXT: v_mov_b32_e32 v49, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v56 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v45, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: .LBB40_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GCN-NEXT: s_waitcnt 
expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v25 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v26 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v27 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 
offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: 
buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v3 -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3 -; GCN-NEXT: .LBB40_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v63, v2, v1 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v1, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v2, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v61 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v61, v3, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v60 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v60, v4, v3 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v59 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v59, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v58 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v58, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v57 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v56 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v47 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v45 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v44 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v42 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v41 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v40 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v53 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v52 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v51 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v50 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v48 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v38 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 
v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v37 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v35 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v39 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v49 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 
offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: 
v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v63, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f64_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: 
$vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: 
$vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v10 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 
v8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, 
v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v35 +; SI-NEXT: v_mov_b32_e32 v35, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 +; SI-NEXT: v_mov_b32_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v40 +; SI-NEXT: v_mov_b32_e32 v40, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; 
implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: .LBB80_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v27 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 
4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_add_f64 v[35:36], v[11:12], 1.0 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; SI-NEXT: 
v_lshrrev_b32_e32 v52, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v32 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 +; SI-NEXT: 
v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: .LBB80_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, 
v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; 
SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v64f16: ; VI: ; %bb.0: @@ -70225,7 +144695,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: s_cbranch_execz .LBB80_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -70244,7 +144714,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: .LBB80_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -70259,7 +144729,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: s_cbranch_execz .LBB80_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -70278,7 +144748,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: .LBB80_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -70295,7 +144765,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: s_cbranch_execz .LBB80_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -70314,7 +144784,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: .LBB80_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -70335,768 +144805,1602 @@ end: ret <64 x half> %phi } +define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s45, v2 +; SI-NEXT: 
v_readfirstlane_b32 s42, v3 +; SI-NEXT: v_readfirstlane_b32 s43, v4 +; SI-NEXT: v_readfirstlane_b32 s40, v5 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 
; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB81_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s46, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s46 +; SI-NEXT: s_lshr_b32 s46, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s46 +; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s46 +; SI-NEXT: s_lshr_b32 s46, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s46 +; SI-NEXT: s_lshr_b32 s46, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 +; SI-NEXT: s_lshr_b32 s46, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s46 +; SI-NEXT: s_lshr_b32 s46, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 +; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s46 +; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 +; SI-NEXT: s_lshr_b32 s46, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 +; SI-NEXT: s_lshr_b32 s46, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s46 +; SI-NEXT: s_lshr_b32 s46, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s46 +; SI-NEXT: s_lshr_b32 s46, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s46 +; SI-NEXT: s_lshr_b32 s46, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 +; SI-NEXT: s_lshr_b32 s46, s43, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, s46 +; SI-NEXT: s_lshr_b32 s46, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s46 +; SI-NEXT: s_lshr_b32 s46, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s46 +; SI-NEXT: s_lshr_b32 s46, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s46 +; SI-NEXT: s_lshr_b32 s46, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s46 +; SI-NEXT: s_lshr_b32 s46, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s46 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s46 +; SI-NEXT: s_lshr_b32 s46, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s46 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 +; SI-NEXT: s_lshr_b32 s46, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s46 +; SI-NEXT: 
s_lshr_b32 s46, s23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s46 +; SI-NEXT: s_lshr_b32 s46, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s46 +; SI-NEXT: s_lshr_b32 s46, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s46 +; SI-NEXT: s_lshr_b32 s46, s19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 +; SI-NEXT: s_lshr_b32 s46, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s46 +; SI-NEXT: s_lshr_b32 s46, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s46 +; SI-NEXT: s_lshr_b32 s46, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 +; SI-NEXT: 
v_cvt_f32_f16_e32 v25, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_cbranch_execnz .LBB81_3 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[52:53], s[24:25], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_add_f64 v[48:49], s[26:27], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_add_f64 v[36:37], s[28:29], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v37 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, 
v28 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_add_f64 v[29:30], s[42:43], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[25:26], s[40:41], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_add_f64 v[21:22], s[14:15], 1.0 +; SI-NEXT: v_add_f64 v[33:34], s[44:45], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_add_f64 v[1:2], s[18:19], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v25 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 
16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 +; SI-NEXT: v_mov_b32_e32 v7, v61 +; SI-NEXT: v_mov_b32_e32 v61, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_add_f64 v[46:47], s[20:21], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v47 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v47 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v47, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v60 +; SI-NEXT: v_cvt_f32_f16_e32 
v46, v15 +; SI-NEXT: v_mov_b32_e32 v15, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 +; SI-NEXT: v_mov_b32_e32 v14, v12 +; SI-NEXT: v_mov_b32_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: .LBB81_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v33 +; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 +; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; 
SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v41 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v51 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v19 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 
v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_4: +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr27 +; 
SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: s_branch .LBB81_2 +; +; VI-LABEL: bitcast_v16f64_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB81_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz 
.LBB81_3 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: .LBB81_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_4: +; VI-NEXT: s_branch .LBB81_2 +; +; GFX9-LABEL: bitcast_v16f64_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: 
v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB81_3 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: .LBB81_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v33 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: s_branch .LBB81_2 +; +; GFX11-LABEL: bitcast_v16f64_to_v64f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: 
v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB81_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB81_3: +; GFX11-NEXT: .LBB81_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat 
(double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword 
v35, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) 
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 
; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v37 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v53, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 
offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_or_b32_e32 v19, v43, v19 -; GCN-NEXT: v_or_b32_e32 v20, v41, v20 -; GCN-NEXT: v_or_b32_e32 v21, v55, v21 -; GCN-NEXT: v_or_b32_e32 v22, v49, v22 -; GCN-NEXT: v_or_b32_e32 v23, v50, v23 -; GCN-NEXT: v_or_b32_e32 v24, v39, v24 -; GCN-NEXT: v_or_b32_e32 v25, v36, v25 -; GCN-NEXT: v_or_b32_e32 v26, v48, v26 -; GCN-NEXT: v_or_b32_e32 v27, v52, v27 -; GCN-NEXT: v_or_b32_e32 v28, v53, v28 -; GCN-NEXT: v_or_b32_e32 v29, v54, v29 -; GCN-NEXT: v_or_b32_e32 v30, v40, v30 -; GCN-NEXT: v_or_b32_e32 v31, v42, v31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed 
$vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; 
implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: .LBB41_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; 
GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v56 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v45 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; 
GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], 
s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, 
v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v17, v18, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v41 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v55 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 -; 
GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v36 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v36, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_or_b32_e32 v19, v21, v20 -; GCN-NEXT: v_or_b32_e32 v20, v55, v39 -; GCN-NEXT: v_or_b32_e32 v21, v41, v48 -; GCN-NEXT: v_or_b32_e32 v22, v22, v49 -; GCN-NEXT: v_or_b32_e32 v23, v23, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v51 -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v38 -; GCN-NEXT: v_or_b32_e32 v28, v28, v33 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_or_b32_e32 v30, v30, v35 -; GCN-NEXT: v_or_b32_e32 v31, v31, v37 -; GCN-NEXT: .LBB41_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64f16_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], 
s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: 
v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v36 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v58, v2 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v25, v51, v25 +; SI-NEXT: v_or_b32_e32 v26, v48, v26 +; SI-NEXT: v_or_b32_e32 v27, v52, v27 +; SI-NEXT: v_or_b32_e32 v28, v39, v28 +; SI-NEXT: v_or_b32_e32 v29, v37, v29 +; SI-NEXT: v_or_b32_e32 v30, v35, v30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, 
s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; 
kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v55, v24 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB82_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], 
s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(13) 
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 
0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 
+; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; 
SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB82_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], 
s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v16f64: ; VI: ; %bb.0: @@ -71108,7 +146412,7 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 0x200 ; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -71208,7 +146512,7 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 ; VI-NEXT: v_or_b32_e32 v17, v17, v33 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -71223,7 +146527,7 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: 
s_cbranch_execz .LBB82_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -71259,7 +146563,7 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -71276,7 +146580,7 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -71311,7 +146615,7 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB41_2: ; %end +; GFX11-NEXT: .LBB82_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -71332,490 +146636,1526 @@ end: ret <16 x double> %phi } +define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 
offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v26 +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: 
buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v41, v11 +; SI-NEXT: v_mov_b32_e32 v40, v10 +; SI-NEXT: v_mov_b32_e32 v44, v9 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v63 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, 
off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 
offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB83_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_or_b32_e32 v22, v51, v22 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v50, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: 
v_or_b32_e32 v24, v49, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v26, v39, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v27, v38, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v28, v37, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_or_b32_e32 v19, v54, v19 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v60, v2 +; SI-NEXT: 
v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_or_b32_e32 v13, v57, v13 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v40, v17 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v18, v55, v18 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB83_3 +; SI-NEXT: .LBB83_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_mov_b32_e32 v33, v52 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v49, v25 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB83_3: ; %Flow 
+; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v35, v40 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v40, v46 +; SI-NEXT: v_mov_b32_e32 v41, v56 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v43, v60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB83_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: 
v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 
offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 
v5, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; 
SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; 
SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, 
v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: .LBB83_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v16f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: 
v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB83_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB83_3 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v18, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: 
v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: v_add_f16_sdwa v33, v31, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: 
v_add_f16_sdwa v33, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v32, v32, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v18, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v18 +; VI-NEXT: .LBB83_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_4: +; VI-NEXT: s_branch .LBB83_2 +; +; GFX9-LABEL: bitcast_v64f16_to_v16f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 
v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB83_3 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, v31, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB83_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: s_branch .LBB83_2 +; +; GFX11-LABEL: bitcast_v64f16_to_v16f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: 
scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 
offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB83_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; 
GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB83_3 +; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v49, 0x200, 
s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB83_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: 
scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: 
scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB83_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: 
$vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: 
$vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB83_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f64_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 
-; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v47, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v45, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v63, v2, v1, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB42_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_alignbit_b32 v33, v28, v27, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v47, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v45, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v63, v2, v1, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: .LBB42_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v63 -; GCN-NEXT: v_or_b32_e32 v1, v1, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, 
v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v30 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v63, 0xffff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v45 -; GCN-NEXT: v_or_b32_e32 v46, v44, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v62 -; GCN-NEXT: v_or_b32_e32 v62, v42, v6 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; GCN-NEXT: v_or_b32_e32 v44, v41, v6 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v61 -; GCN-NEXT: 
v_or_b32_e32 v45, v4, v6 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v4 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v3, v3, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v7, v7, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v58 -; GCN-NEXT: v_or_b32_e32 v9, v9, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v11, v11, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v43 -; GCN-NEXT: v_or_b32_e32 v13, v13, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v15, v15, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v17, v40, v17 -; 
GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v19, v55, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v21, v53, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v23, v51, v23 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v25, v36, v25 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v27, v49, v27 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v29, v38, v29 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v37, v31 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v33, v33, v36 -; GCN-NEXT: 
v_add_i32_e32 v38, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v48, v36 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v39, v39, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v35, v35, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v50, v50, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v34, v34, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v54, v54, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v52, v52, v41 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v41, v63, v41 -; GCN-NEXT: 
v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v46, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f64_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; 
implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v51, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v40, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB84_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v51, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v40, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; 
SI-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: .LBB84_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v64i16: ; VI: ; %bb.0: @@ -71827,7 +148167,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: s_cbranch_execz .LBB84_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -71846,7 +148186,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: .LBB42_2: ; %end +; VI-NEXT: .LBB84_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -71861,7 +148201,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: s_cbranch_execz .LBB84_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -71880,7 +148220,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: .LBB42_2: ; %end +; GFX9-NEXT: .LBB84_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -71897,7 +148237,7 @@ define <64 x 
i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -71916,7 +148256,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: .LBB42_2: ; %end +; GFX11-NEXT: .LBB84_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -71937,613 +148277,1149 @@ end: ret <64 x i16> %phi } +define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f64_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_mov_b32_e32 v31, s16 +; SI-NEXT: v_mov_b32_e32 v32, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v30, s19 +; SI-NEXT: v_mov_b32_e32 v27, s20 +; SI-NEXT: v_mov_b32_e32 v28, s21 +; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v26, s23 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB85_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v36, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v39, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v49, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v51, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v40, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v45, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v47, v28, v27, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v30, v29, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: 
v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: s_cbranch_execnz .LBB85_3 +; SI-NEXT: .LBB85_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v36, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v39, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v49, v4, v3, 16 +; SI-NEXT: 
v_alignbit_b32 v51, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v40, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v45, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v47, v28, v27, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v30, v29, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v60, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: .LBB85_3: ; %end +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v60 +; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v33 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v58 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v63 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v47 +; SI-NEXT: v_or_b32_e32 v27, v27, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v62 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v61 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v42 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 +; SI-NEXT: 
buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v57 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 
0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, 
vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB85_4: +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 
+; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_branch .LBB85_2 +; +; VI-LABEL: bitcast_v16f64_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB85_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB85_3 +; VI-NEXT: .LBB85_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: 
v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: .LBB85_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB85_4: +; VI-NEXT: s_branch .LBB85_2 +; +; GFX9-LABEL: bitcast_v16f64_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB85_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB85_3 +; GFX9-NEXT: .LBB85_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 
v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: .LBB85_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_mov_b32_e32 v19, v33 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB85_4: +; GFX9-NEXT: s_branch .LBB85_2 +; +; GFX11-LABEL: bitcast_v16f64_to_v64i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v14 :: v_dual_mov_b32 v31, v13 +; GFX11-NEXT: v_dual_mov_b32 v30, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: 
v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB85_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB85_3: +; GFX11-NEXT: .LBB85_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i16_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, 
v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword 
v28, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v24 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v36 -; GCN-NEXT: v_or_b32_e32 v1, v1, v58 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v57 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v60 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; 
GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, 
off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v43 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v47 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v44 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v30, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v59 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: 
$vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; 
GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; kill: killed $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: .LBB43_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v36, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v58, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v57, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v35, v3 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v61 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v33 
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v4, v60, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v32, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v32, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v32, v16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v32, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v32, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v32, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v32, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v32, v30 -; GCN-NEXT: v_or_b32_e32 v31, v59, v31 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; 
GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 -; GCN-NEXT: .LBB43_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i16_to_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], 
s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v23, v32 +; SI-NEXT: ; implicit-def: 
$vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v43 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v53 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: v_or_b32_e32 v19, v19, v48 +; SI-NEXT: v_or_b32_e32 v21, v21, v36 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: ; implicit-def: $vgpr43 +; 
SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v49 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: 
v_or_b32_e32 v0, v0, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v62 +; SI-NEXT: v_or_b32_e32 v2, v2, v61 +; SI-NEXT: v_or_b32_e32 v3, v3, v60 +; SI-NEXT: v_or_b32_e32 v4, v4, v59 +; SI-NEXT: v_or_b32_e32 v5, v5, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: v_or_b32_e32 v7, v7, v56 +; SI-NEXT: v_or_b32_e32 v8, v8, v47 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v45 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_or_b32_e32 v12, v12, v42 +; SI-NEXT: v_or_b32_e32 v13, v13, v41 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: .LBB86_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: v_or_b32_e32 v21, v36, v21 +; SI-NEXT: v_or_b32_e32 v22, v34, v22 +; SI-NEXT: v_or_b32_e32 v23, v32, v23 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; 
SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, 
v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v59, v4 +; SI-NEXT: v_or_b32_e32 v5, v58, v5 +; SI-NEXT: v_or_b32_e32 v6, v57, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v47, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v54, v15 +; SI-NEXT: v_or_b32_e32 v18, v49, v18 +; SI-NEXT: v_or_b32_e32 v20, v38, v20 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: 
v_or_b32_e32 v16, v52, v16 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v51, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v37, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 +; SI-NEXT: .LBB86_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, 
off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v16f64: ; VI: ; %bb.0: @@ -72555,7 +149431,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB43_2 +; VI-NEXT: s_cbranch_execz .LBB86_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v33, 3 ; VI-NEXT: v_add_u16_e32 v32, 3, v15 @@ -72655,7 +149531,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: 
v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB43_2: ; %end +; VI-NEXT: .LBB86_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -72670,7 +149546,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB43_2 +; GFX9-NEXT: s_cbranch_execz .LBB86_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -72705,7 +149581,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB43_2: ; %end +; GFX9-NEXT: .LBB86_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -72722,7 +149598,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB43_2 +; GFX11-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -72757,7 +149633,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB43_2: ; %end +; GFX11-NEXT: .LBB86_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] 
@@ -72778,2153 +149654,3171 @@ end: ret <16 x double> %phi } +define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v16f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v56, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, 
s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(13) +; 
SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB87_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v10, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; 
SI-NEXT: v_or_b32_e32 v17, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v19, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v21, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v22, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v27, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_or_b32_e32 v28, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_or_b32_e32 v29, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: 
s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v30, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_or_b32_e32 v8, v1, v55 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, 
s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB87_4: +; SI-NEXT: v_mov_b32_e32 v38, v61 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v36, v41 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v41, v14 +; SI-NEXT: v_mov_b32_e32 v56, v16 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v45, v44 +; SI-NEXT: v_mov_b32_e32 v59, v42 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: 
s_branch .LBB87_2 +; +; VI-LABEL: bitcast_v64i16_to_v16f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: v_readfirstlane_b32 s8, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v5 +; VI-NEXT: v_readfirstlane_b32 s10, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v7 +; VI-NEXT: v_readfirstlane_b32 s12, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v9 +; VI-NEXT: v_readfirstlane_b32 s14, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v11 +; VI-NEXT: v_readfirstlane_b32 s40, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v13 +; VI-NEXT: v_readfirstlane_b32 s42, v14 +; VI-NEXT: v_readfirstlane_b32 s43, v15 +; VI-NEXT: v_readfirstlane_b32 s44, v16 +; VI-NEXT: v_readfirstlane_b32 s45, v17 +; VI-NEXT: v_readfirstlane_b32 s46, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s47, v1 +; VI-NEXT: s_cbranch_scc0 .LBB87_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB87_3 +; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s47, 3 +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s46, 3 +; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s46, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s46, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; 
VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 
s5, s45, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s45, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s44, 3 +; VI-NEXT: s_add_i32 s45, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s44, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s43, 3 +; VI-NEXT: s_add_i32 s44, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s43, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s42, 3 +; VI-NEXT: s_add_i32 s43, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s42, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s41, 3 +; VI-NEXT: s_add_i32 s42, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s41, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s40, 3 +; VI-NEXT: s_add_i32 s41, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s40, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s15, 3 +; VI-NEXT: s_add_i32 s40, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s15, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s14, 3 +; VI-NEXT: s_add_i32 s15, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s14, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s13, 3 +; VI-NEXT: s_add_i32 s14, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s13, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s12, 3 +; VI-NEXT: s_add_i32 s13, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s12, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s11, 3 +; VI-NEXT: s_add_i32 s12, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s11, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; 
VI-NEXT: s_add_i32 s5, s10, 3 +; VI-NEXT: s_add_i32 s11, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s10, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s9, 3 +; VI-NEXT: s_add_i32 s10, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s9, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s8, 3 +; VI-NEXT: s_add_i32 s9, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s8, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_add_i32 s8, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: .LBB87_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s46 +; VI-NEXT: v_mov_b32_e32 v15, s47 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: v_mov_b32_e32 v18, s8 +; VI-NEXT: v_mov_b32_e32 v19, s9 +; VI-NEXT: v_mov_b32_e32 v20, s10 +; VI-NEXT: v_mov_b32_e32 v21, s11 +; VI-NEXT: v_mov_b32_e32 v22, s12 +; VI-NEXT: v_mov_b32_e32 v23, s13 +; VI-NEXT: v_mov_b32_e32 v24, s14 +; VI-NEXT: v_mov_b32_e32 v25, s15 +; VI-NEXT: v_mov_b32_e32 v26, s40 +; VI-NEXT: v_mov_b32_e32 v27, s41 +; VI-NEXT: v_mov_b32_e32 v28, s42 +; VI-NEXT: v_mov_b32_e32 v29, 
s43 +; VI-NEXT: v_mov_b32_e32 v30, s44 +; VI-NEXT: v_mov_b32_e32 v31, s45 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB87_4: +; VI-NEXT: s_branch .LBB87_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v16f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB87_3 +; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB87_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB87_4: +; GFX9-NEXT: s_branch .LBB87_2 +; +; GFX11-LABEL: bitcast_v64i16_to_v16f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 +; GFX11-NEXT: scratch_store_b32 off, 
v43, s32 offset:280 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 +; GFX11-NEXT: 
scratch_store_b32 off, v109, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 +; 
GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v185, s32 +; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 +; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 +; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 +; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 +; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 +; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 +; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 +; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 +; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 +; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 +; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 +; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 +; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 +; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 +; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-NEXT: .LBB87_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v182, 
v182, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB87_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 +; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 +; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 +; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 +; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 +; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 +; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 +; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 +; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 
v22, v171 +; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v185, off, s32 +; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v121, off, 
s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 +; 
GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 +; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 +; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 +; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 +; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 +; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 +; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB87_4: +; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 +; GFX11-NEXT: ; implicit-def: 
$vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 +; GFX11-NEXT: ; implicit-def: 
$vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 +; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: s_branch .LBB87_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill 
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v12 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v14 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v16 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v20 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v24 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v28 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v30 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:392 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v11 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v10 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v5 -; GCN-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:260 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:280 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:332 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:328 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 
offset:344 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:376 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:364 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:360 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr62 -; 
GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; 
implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v55, v1, v15 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v41, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v45, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v46, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GCN-NEXT: v_or_b32_e32 v22, v1, v22 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v1, v52 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v56 -; GCN-NEXT: v_or_b32_e32 v5, v1, 
v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v31 -; GCN-NEXT: v_or_b32_e32 v13, v1, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v21, v1, v21 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v29 -; GCN-NEXT: v_or_b32_e32 v23, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v3, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v6, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, 
v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v10, v9, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v12, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v17, v12, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v12, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v12, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v25, v12, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v12, v8 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v14, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v14, v12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v20, v14 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v24, v20 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v27, v24 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v29, v27 -; 
GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v38, v19, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v19, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v19, v11 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v49, v19, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v19, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v19, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 
v59, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v23 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: 
; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; 
implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; 
GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; 
implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; kill: killed $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; kill: killed $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; kill: killed $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; kill: killed $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; kill: killed $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; kill: killed $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; kill: killed $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: .LBB44_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v19 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; 
GCN-NEXT: v_or_b32_e32 v4, v6, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v56 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v1 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v1 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v35 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v1 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v1, v13 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: v_mov_b32_e32 v2, v15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v1 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v1, v15 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v1, v17 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v1, v19 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v1 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v1, v21 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v1 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v22, 
v23, v22 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v28, v25, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v37, v25, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v40, v2, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v45, v25, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v57, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v58, v25, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v59, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v60, v25, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 
16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v2, v24 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v2, v25 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v2, v26 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v2, v27 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: 
v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v2, v29 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v2, v30 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v2, v31 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v2, v33 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v2, v34 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v35, 
0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v2, v35 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v2, v36 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v2, v38 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v2, v50 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v2, v54 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v2 -; 
GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v41, v2, v41 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v2, v42 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v2, v43 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v2, v44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v46, v2, v46 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_add_i32_e32 v47, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v2, v47 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v56, v2, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v61, v61, v2 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v63, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v1, v4 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v15 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 
s7, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v17 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s7, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s7, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, s7, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, s7, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, s7, v58 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v59 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v60 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: 
v_or_b32_e32 v6, v48, v6 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_or_b32_e32 v7, v51, v7 -; GCN-NEXT: v_or_b32_e32 v8, v52, v8 -; GCN-NEXT: v_or_b32_e32 v9, v53, v9 -; GCN-NEXT: v_or_b32_e32 v10, v55, v10 -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: v_or_b32_e32 v14, v27, v14 -; GCN-NEXT: v_or_b32_e32 v15, v29, v15 -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: v_or_b32_e32 v17, v31, v17 -; GCN-NEXT: v_or_b32_e32 v18, v33, v18 -; GCN-NEXT: v_or_b32_e32 v19, v34, v19 -; GCN-NEXT: v_or_b32_e32 v20, v35, v20 -; GCN-NEXT: v_or_b32_e32 v21, v36, v21 -; GCN-NEXT: v_or_b32_e32 v22, v38, v22 -; GCN-NEXT: v_or_b32_e32 v23, v50, v23 -; GCN-NEXT: v_or_b32_e32 v24, v54, v28 -; GCN-NEXT: v_or_b32_e32 v25, v41, v32 -; GCN-NEXT: v_or_b32_e32 v26, v42, v37 -; GCN-NEXT: v_or_b32_e32 v27, v43, v40 -; GCN-NEXT: v_or_b32_e32 v28, v44, v45 -; GCN-NEXT: v_or_b32_e32 v29, v46, v57 -; GCN-NEXT: v_or_b32_e32 v30, v47, v58 -; GCN-NEXT: v_or_b32_e32 v31, v56, v59 -; GCN-NEXT: v_add_i32_e32 v35, vcc, s6, v61 -; GCN-NEXT: v_add_i32_e32 v49, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v38, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v39, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v20 -; GCN-NEXT: 
v_add_i32_e32 v13, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v30 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v31 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v20 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword 
v18, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v14 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v6 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: 
v_and_b32_e32 v18, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v37 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v52 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v51 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v5 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v48 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v39 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v50 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v35 -; GCN-NEXT: .LBB44_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v11, v11, v13, 16 -; GCN-NEXT: 
buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v11, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v63, v5, v11, 16 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v56, v5, v11, 16 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v62, v5, v11, 16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v61 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v61, v5, v11, 16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v60, v13, v16, 16 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v16, v16, v19, 16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v59 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v59, v22, v23, 16 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_alignbit_b32 v23, v23, v27, 16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; 
GCN-NEXT: v_alignbit_b32 v58, v29, v35, 16 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 40, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v3, v7, v3, 16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 44, v0 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_alignbit_b32 v57, v32, v35, 16 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 48, v0 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_alignbit_b32 v33, v34, v33, 16 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 52, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v35, 16 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 56, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 60, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v52, 16 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 64, v0 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v54, 16 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x48, v0 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_alignbit_b32 v17, v36, v17, 16 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x4c, v0 
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_alignbit_b32 v26, v26, v42, 16 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x50, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x54, v0 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_alignbit_b32 v37, v37, v43, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x58, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v12, v14, v12, 16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x5c, v0 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v20, v20, v41, 16 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_alignbit_b32 v24, v28, v24, 16 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x64, v0 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_alignbit_b32 v30, v30, v40, 16 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x68, v0 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_alignbit_b32 v38, v39, v38, 16 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x6c, v0 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_alignbit_b32 v48, v48, v55, 16 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; 
GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x74, v0 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_alignbit_b32 v51, v51, v53, 16 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v63, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v55, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v48, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v13, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:392 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword 
v31, off, s[0:3], s32 offset:128 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; 
SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 +; 
SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 +; 
SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 +; SI-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:296 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:328 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:384 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:344 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; 
SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 
+; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v56 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 
+; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v5, v6 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 
offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 
+; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:848 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v27, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 
v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v8, v5, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v30, v5, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v9, v5, v26 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v7, v5, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v19, v57, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v16, v13, v61 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v23, v62, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v3, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, 
v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; 
implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v25 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_or_b32_e32 v11, v41, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v47 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_or_b32_e32 v15, v5, v44 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v59 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; 
implicit-def: $vgpr2 +; SI-NEXT: v_or_b32_e32 v5, v45, v5 +; SI-NEXT: v_or_b32_e32 v17, v63, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB88_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v56 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; 
SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_or_b32_e32 v5, v62, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_or_b32_e32 v6, v57, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v6, v44, v6 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v31 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_or_b32_e32 v11, v41, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 
+; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v21 +; SI-NEXT: v_or_b32_e32 v11, v38, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_or_b32_e32 v12, v51, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v16, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_or_b32_e32 v7, v26, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: 
v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_or_b32_e32 v13, v29, v13 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v13, v13, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 +; SI-NEXT: v_or_b32_e32 v14, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_or_b32_e32 v15, v9, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: v_or_b32_e32 v16, v9, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v17, v8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; 
SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v18, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v19, v8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v20, v8, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v17 +; SI-NEXT: 
s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v21, v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v22, v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v23, v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v24, v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v25, v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v7, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v26, v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v27, v7, v1 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:688 ; 
4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v28, v7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v28, v23 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 
+; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s7, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, s7, v31 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v33, 8, v33 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v32 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 
offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v2 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 
offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: .LBB88_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: 
v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, 
v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v64bf16: ; VI: ; %bb.0: @@ -75258,7 +153152,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: s_cbranch_execz .LBB88_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -75740,9 +153634,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: .LBB44_2: ; %Flow +; VI-NEXT: .LBB88_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB44_4 +; VI-NEXT: s_cbranch_execz .LBB88_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload @@ -76129,7 +154023,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v39, 0x300, v48 ; VI-NEXT: v_or_b32_e32 v21, v39, v21 ; VI-NEXT: v_or_b32_e32 v31, v31, v54 -; VI-NEXT: .LBB44_4: ; %end +; VI-NEXT: .LBB88_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -76505,7 +154399,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-NEXT: s_cbranch_execz .LBB88_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload @@ -76988,9 +154882,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: .LBB44_2: ; %Flow +; GFX9-NEXT: .LBB88_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB44_4 +; GFX9-NEXT: s_cbranch_execz .LBB88_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload @@ -77381,7 +155275,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v26, v37, v26, s6 ; GFX9-NEXT: v_perm_b32 v27, v36, v27, s6 ; GFX9-NEXT: v_perm_b32 v28, v35, v28, s6 -; GFX9-NEXT: .LBB44_4: ; %end +; GFX9-NEXT: .LBB88_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -77626,15 +155520,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: 
s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_4 -; GFX11-TRUE16-NEXT: .LBB44_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_4 +; GFX11-TRUE16-NEXT: .LBB88_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB44_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB88_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l @@ -77893,8 +155787,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 -; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB88_2 +; GFX11-TRUE16-NEXT: .LBB88_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3 @@ -78168,923 +156062,6473 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v44, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 -; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v106, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v131, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, 
v43, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 
offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 
v64, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 
v95, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v71, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(31) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB88_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 
v11, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v9, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v11, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v91 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v95 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v105 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v94 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v108 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v107 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v110 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v109 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v23, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v26, v31, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v121 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v123 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v120 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v125 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v124 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v126 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v127 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v32, v31, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v34, v33, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v36, v35, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 +; GFX11-FAKE16-NEXT: .LBB88_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB88_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v35, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v0, v126, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v127, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v125, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v120, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v94, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 
0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 
v74, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 
0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v54, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v52, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v53, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v50, 3 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v49, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v49, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v48, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v32, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v84, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v39, 0x300, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v26, v86, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 +; GFX11-FAKE16-NEXT: .LBB88_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 
offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, 
%cmp.false ] + ret <64 x bfloat> %phi +} + +define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s72, s21 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v43, s19, 0 +; SI-NEXT: v_writelane_b32 v43, s18, 1 +; SI-NEXT: v_writelane_b32 v43, s17, 2 +; SI-NEXT: v_writelane_b32 v43, s16, 3 +; SI-NEXT: s_mov_b32 s60, s24 +; SI-NEXT: v_writelane_b32 v41, s30, 0 +; SI-NEXT: v_writelane_b32 v41, s31, 1 +; SI-NEXT: v_writelane_b32 v41, s34, 2 +; SI-NEXT: v_writelane_b32 v41, s35, 3 +; SI-NEXT: v_writelane_b32 v41, s36, 4 +; SI-NEXT: v_writelane_b32 v41, s37, 5 +; SI-NEXT: v_writelane_b32 v41, s38, 6 +; SI-NEXT: v_writelane_b32 v41, s39, 7 +; SI-NEXT: v_writelane_b32 v41, s48, 8 +; SI-NEXT: v_writelane_b32 v41, s49, 9 +; SI-NEXT: v_writelane_b32 v41, s50, 10 +; SI-NEXT: v_writelane_b32 v41, s51, 11 +; SI-NEXT: 
v_writelane_b32 v41, s52, 12 +; SI-NEXT: v_writelane_b32 v41, s53, 13 +; SI-NEXT: v_writelane_b32 v41, s54, 14 +; SI-NEXT: v_writelane_b32 v41, s55, 15 +; SI-NEXT: v_writelane_b32 v41, s64, 16 +; SI-NEXT: v_writelane_b32 v41, s65, 17 +; SI-NEXT: v_writelane_b32 v41, s66, 18 +; SI-NEXT: v_writelane_b32 v41, s67, 19 +; SI-NEXT: v_writelane_b32 v41, s68, 20 +; SI-NEXT: v_writelane_b32 v41, s69, 21 +; SI-NEXT: v_writelane_b32 v41, s70, 22 +; SI-NEXT: v_writelane_b32 v41, s71, 23 +; SI-NEXT: s_mov_b32 s77, s28 +; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: v_writelane_b32 v41, s80, 24 +; SI-NEXT: v_writelane_b32 v41, s81, 25 +; SI-NEXT: v_writelane_b32 v41, s82, 26 +; SI-NEXT: v_writelane_b32 v41, s83, 27 +; SI-NEXT: v_writelane_b32 v41, s84, 28 +; SI-NEXT: v_writelane_b32 v41, s85, 29 +; SI-NEXT: v_writelane_b32 v41, s86, 30 +; SI-NEXT: v_writelane_b32 v41, s87, 31 +; SI-NEXT: v_writelane_b32 v41, s96, 32 +; SI-NEXT: v_writelane_b32 v41, s97, 33 +; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: v_writelane_b32 v41, s99, 35 +; SI-NEXT: s_mov_b32 s79, s26 +; SI-NEXT: v_readfirstlane_b32 s38, v20 +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s39, v19 +; SI-NEXT: v_writelane_b32 v42, s38, 0 +; SI-NEXT: v_readfirstlane_b32 s48, v25 +; SI-NEXT: v_writelane_b32 v42, s39, 1 +; SI-NEXT: v_readfirstlane_b32 s49, v26 +; SI-NEXT: v_writelane_b32 v42, s48, 2 +; SI-NEXT: v_readfirstlane_b32 s50, v24 +; SI-NEXT: v_writelane_b32 v42, s49, 3 +; SI-NEXT: v_readfirstlane_b32 s51, v23 +; SI-NEXT: v_writelane_b32 v42, s50, 4 +; SI-NEXT: v_readfirstlane_b32 s52, v29 +; SI-NEXT: v_writelane_b32 v42, s51, 5 +; SI-NEXT: v_readfirstlane_b32 s53, v30 +; SI-NEXT: v_writelane_b32 v42, s52, 6 +; SI-NEXT: v_readfirstlane_b32 s54, v28 +; SI-NEXT: v_writelane_b32 v42, s53, 7 +; SI-NEXT: v_readfirstlane_b32 s55, v27 +; SI-NEXT: v_writelane_b32 v42, s54, 8 +; SI-NEXT: v_writelane_b32 v42, s55, 9 +; SI-NEXT: v_readfirstlane_b32 s16, v1 +; SI-NEXT: 
v_readfirstlane_b32 s17, v2 +; SI-NEXT: v_readfirstlane_b32 s18, v5 +; SI-NEXT: v_readfirstlane_b32 s19, v6 +; SI-NEXT: v_readfirstlane_b32 s88, v4 +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: v_readfirstlane_b32 s90, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s6, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_writelane_b32 v43, s4, 4 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 +; SI-NEXT: v_writelane_b32 v43, s4, 5 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v43, s4, 6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v43, s4, 7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v43, s4, 8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 +; SI-NEXT: v_writelane_b32 v43, s4, 9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v43, s4, 10 +; SI-NEXT: v_readfirstlane_b32 s91, v10 +; SI-NEXT: v_readfirstlane_b32 s92, v8 +; SI-NEXT: v_readfirstlane_b32 s93, v7 +; SI-NEXT: v_readfirstlane_b32 s94, v13 +; 
SI-NEXT: v_readfirstlane_b32 s95, v14 +; SI-NEXT: v_readfirstlane_b32 s30, v17 +; SI-NEXT: v_readfirstlane_b32 s31, v18 +; SI-NEXT: v_readfirstlane_b32 s34, v16 +; SI-NEXT: v_readfirstlane_b32 s35, v15 +; SI-NEXT: v_readfirstlane_b32 s36, v21 +; SI-NEXT: v_readfirstlane_b32 s37, v22 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v43, s4, 11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v43, s4, 12 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_writelane_b32 v43, s4, 13 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_writelane_b32 v43, s4, 14 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s4, v51 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s75, v32 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s61, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 +; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s43, v34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s40, v35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s63, v37 +; SI-NEXT: buffer_load_dword v34, off, 
s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 +; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s59, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s42, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s73, v39 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s21, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s57, v49 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s13, v50 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s45, v51 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s47, v32 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s24, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v43, s4, 19 +; 
SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v43, s4, 20 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v43, s4, 21 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v43, s4, 22 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v43, s4, 23 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_writelane_b32 v43, s4, 24 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_writelane_b32 v43, s4, 25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s4, v51 +; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: v_writelane_b32 v43, s4, 28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: v_readfirstlane_b32 s4, v53 +; SI-NEXT: v_writelane_b32 
v43, s4, 30 +; SI-NEXT: v_readfirstlane_b32 s4, v54 +; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: v_readfirstlane_b32 s4, v55 +; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s4, v40 +; SI-NEXT: v_writelane_b32 v43, s4, 33 +; SI-NEXT: v_writelane_b32 v43, s22, 34 +; SI-NEXT: v_writelane_b32 v43, s23, 35 +; SI-NEXT: v_writelane_b32 v43, s72, 36 +; SI-NEXT: v_writelane_b32 v43, s20, 37 +; SI-NEXT: v_writelane_b32 v43, s79, 38 +; SI-NEXT: v_writelane_b32 v43, s76, 39 +; SI-NEXT: v_writelane_b32 v43, s25, 40 +; SI-NEXT: v_writelane_b32 v43, s60, 41 +; SI-NEXT: v_writelane_b32 v43, s29, 42 +; SI-NEXT: v_writelane_b32 v43, s77, 43 +; SI-NEXT: v_writelane_b32 v43, s16, 44 +; SI-NEXT: v_writelane_b32 v43, s17, 45 +; SI-NEXT: v_writelane_b32 v43, s18, 46 +; SI-NEXT: v_writelane_b32 v43, s19, 47 +; SI-NEXT: v_writelane_b32 v43, s88, 48 +; SI-NEXT: v_writelane_b32 v43, s89, 49 +; SI-NEXT: v_writelane_b32 v43, s90, 50 +; SI-NEXT: v_writelane_b32 v43, s91, 51 +; SI-NEXT: v_writelane_b32 v43, s92, 52 +; SI-NEXT: v_writelane_b32 v43, s93, 53 +; SI-NEXT: v_writelane_b32 v43, s94, 54 +; SI-NEXT: v_writelane_b32 v43, s95, 55 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s62, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s10, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s66, v35 +; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: v_readfirstlane_b32 s27, v32 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s58, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s69, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s14, v38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s68, v39 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v34, off, 
s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s11, v49 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s70, v50 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s71, v51 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 56 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 57 +; SI-NEXT: v_writelane_b32 v43, s30, 58 +; SI-NEXT: v_writelane_b32 v43, s31, 59 +; SI-NEXT: v_writelane_b32 v43, s34, 60 +; SI-NEXT: v_writelane_b32 v43, s35, 61 +; SI-NEXT: v_writelane_b32 v43, s36, 62 +; SI-NEXT: v_writelane_b32 v43, s37, 63 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s46, v32 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s96, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s98, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s41, v35 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s56, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s87, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s99, v38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s81, v39 +; SI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s26, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s83, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s82, v50 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s7, v51 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s97, v32 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s44, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s9, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s80, v35 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s86, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s85, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s8, v38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s12, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s65, v48 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s64, v49 +; SI-NEXT: v_writelane_b32 v42, s64, 10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s67, v50 +; SI-NEXT: 
v_writelane_b32 v42, s65, 11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s84, v51 +; SI-NEXT: v_writelane_b32 v42, s67, 12 +; SI-NEXT: v_writelane_b32 v42, s84, 13 +; SI-NEXT: v_writelane_b32 v42, s85, 14 +; SI-NEXT: v_writelane_b32 v42, s86, 15 +; SI-NEXT: v_writelane_b32 v42, s87, 16 +; SI-NEXT: v_writelane_b32 v42, s8, 17 +; SI-NEXT: v_writelane_b32 v42, s99, 18 +; SI-NEXT: v_writelane_b32 v42, s12, 19 +; SI-NEXT: v_writelane_b32 v42, s44, 20 +; SI-NEXT: v_writelane_b32 v42, s97, 21 +; SI-NEXT: v_writelane_b32 v42, s83, 22 +; SI-NEXT: v_writelane_b32 v42, s82, 23 +; SI-NEXT: v_writelane_b32 v42, s98, 24 +; SI-NEXT: v_writelane_b32 v42, s96, 25 +; SI-NEXT: v_writelane_b32 v42, s81, 26 +; SI-NEXT: v_writelane_b32 v42, s9, 27 +; SI-NEXT: v_writelane_b32 v42, s41, 28 +; SI-NEXT: v_writelane_b32 v42, s80, 29 +; SI-NEXT: v_writelane_b32 v42, s7, 30 +; SI-NEXT: v_writelane_b32 v42, s56, 31 +; SI-NEXT: v_writelane_b32 v42, s26, 32 +; SI-NEXT: v_writelane_b32 v42, s15, 33 +; SI-NEXT: v_writelane_b32 v42, s14, 34 +; SI-NEXT: v_writelane_b32 v42, s69, 35 +; SI-NEXT: v_writelane_b32 v42, s71, 36 +; SI-NEXT: v_writelane_b32 v42, s70, 37 +; SI-NEXT: v_writelane_b32 v42, s68, 38 +; SI-NEXT: v_writelane_b32 v42, s74, 39 +; SI-NEXT: v_writelane_b32 v42, s46, 40 +; SI-NEXT: v_writelane_b32 v42, s11, 41 +; SI-NEXT: v_writelane_b32 v42, s10, 42 +; SI-NEXT: v_writelane_b32 v42, s62, 43 +; SI-NEXT: v_writelane_b32 v42, s66, 44 +; SI-NEXT: v_writelane_b32 v42, s58, 45 +; SI-NEXT: v_writelane_b32 v42, s28, 46 +; SI-NEXT: v_writelane_b32 v42, s27, 47 +; SI-NEXT: v_writelane_b32 v42, s78, 48 +; SI-NEXT: v_writelane_b32 v42, s24, 49 +; SI-NEXT: s_cbranch_scc0 .LBB89_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_readlane_b32 s4, v43, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s5, v43, 2 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_writelane_b32 v42, s4, 56 +; SI-NEXT: 
v_readlane_b32 s4, v43, 1 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s5, v43, 0 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_writelane_b32 v42, s4, 57 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_mov_b32 s22, s6 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 58 +; SI-NEXT: s_or_b32 s4, s6, s5 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s25, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 59 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_writelane_b32 v42, s5, 60 +; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s76, 24 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_writelane_b32 v42, s5, 61 +; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s16, s17, 24 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: v_writelane_b32 v42, s6, 62 +; SI-NEXT: s_and_b32 s6, s89, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s16, s88, 24 +; SI-NEXT: s_mov_b32 s4, s47 +; SI-NEXT: s_or_b32 s47, s16, s6 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s16, s19, 24 +; SI-NEXT: s_or_b32 s25, s16, s6 +; SI-NEXT: s_and_b32 s6, s93, 0xff +; SI-NEXT: s_lshl_b32 s16, s92, 8 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s16, s90, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s91, 24 +; SI-NEXT: s_or_b32 s92, s17, s16 +; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 +; SI-NEXT: s_or_b32 s76, s17, s16 +; SI-NEXT: s_and_b32 s16, s94, 0xff +; SI-NEXT: s_lshl_b32 s16, 
s16, 16 +; SI-NEXT: s_lshl_b32 s17, s95, 24 +; SI-NEXT: s_or_b32 s91, s17, s16 +; SI-NEXT: s_and_b32 s16, s35, 0xff +; SI-NEXT: s_lshl_b32 s17, s34, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s30, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s31, 24 +; SI-NEXT: s_or_b32 s77, s18, s17 +; SI-NEXT: s_and_b32 s17, s39, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s38, 24 +; SI-NEXT: s_or_b32 s79, s18, s17 +; SI-NEXT: s_and_b32 s17, s36, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s37, 24 +; SI-NEXT: s_or_b32 s93, s18, s17 +; SI-NEXT: s_and_b32 s17, s51, 0xff +; SI-NEXT: s_lshl_b32 s18, s50, 8 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s48, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s49, 24 +; SI-NEXT: s_or_b32 s89, s19, s18 +; SI-NEXT: s_and_b32 s18, s55, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s54, 24 +; SI-NEXT: s_or_b32 s31, s19, s18 +; SI-NEXT: s_and_b32 s18, s52, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s53, 24 +; SI-NEXT: s_or_b32 s94, s19, s18 +; SI-NEXT: s_and_b32 s18, s84, 0xff +; SI-NEXT: s_lshl_b32 s19, s67, 8 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s64, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s65, 24 +; SI-NEXT: s_or_b32 s60, s20, s19 +; SI-NEXT: s_and_b32 s19, s12, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s8, 24 +; SI-NEXT: s_or_b32 s8, s20, s19 +; SI-NEXT: s_and_b32 s19, s85, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s86, 24 +; SI-NEXT: s_or_b32 s12, s20, s19 +; SI-NEXT: s_and_b32 s19, s80, 0xff +; SI-NEXT: s_lshl_b32 s20, s9, 8 +; SI-NEXT: s_or_b32 vcc_lo, s19, s20 +; SI-NEXT: s_and_b32 s19, s44, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s97, 24 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s15, 0xff +; SI-NEXT: 
s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s82, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s83, 24 +; SI-NEXT: s_or_b32 s23, s20, s19 +; SI-NEXT: s_and_b32 s19, s26, 0xff +; SI-NEXT: s_lshl_b32 s20, s81, 8 +; SI-NEXT: s_or_b32 vcc_hi, s19, s20 +; SI-NEXT: s_and_b32 s19, s99, 0xff +; SI-NEXT: v_writelane_b32 v42, s9, 50 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s87, 24 +; SI-NEXT: v_writelane_b32 v42, s7, 51 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s56, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s41, 24 +; SI-NEXT: v_writelane_b32 v42, s7, 52 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s98, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s96, 24 +; SI-NEXT: v_writelane_b32 v42, s7, 54 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s46, 0xff +; SI-NEXT: s_lshl_b32 s20, s74, 8 +; SI-NEXT: s_or_b32 s84, s19, s20 +; SI-NEXT: s_and_b32 s19, s71, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s70, 24 +; SI-NEXT: s_or_b32 s72, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s68, 24 +; SI-NEXT: v_writelane_b32 v42, s7, 53 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s14, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s69, 24 +; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_and_b32 s19, s58, 0xff +; SI-NEXT: s_lshl_b32 s20, s66, 8 +; SI-NEXT: s_or_b32 s85, s19, s20 +; SI-NEXT: s_and_b32 s19, s10, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s62, 24 +; SI-NEXT: s_or_b32 s49, s20, s19 +; SI-NEXT: s_and_b32 s19, s27, 0xff +; SI-NEXT: v_writelane_b32 v42, s9, 55 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s28, 24 +; SI-NEXT: v_readlane_b32 s9, v43, 33 +; SI-NEXT: s_or_b32 s50, s20, s19 +; SI-NEXT: 
s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 32 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v43, 31 +; SI-NEXT: s_or_b32 s51, s20, s19 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 30 +; SI-NEXT: s_lshl_b32 s20, s9, 8 +; SI-NEXT: v_readlane_b32 s9, v43, 29 +; SI-NEXT: s_or_b32 s86, s19, s20 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 28 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v43, 27 +; SI-NEXT: s_or_b32 s52, s20, s19 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 26 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v43, 25 +; SI-NEXT: s_or_b32 s53, s20, s19 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v43, 23 +; SI-NEXT: s_or_b32 s54, s20, s19 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 22 +; SI-NEXT: s_lshl_b32 s20, s9, 8 +; SI-NEXT: v_readlane_b32 s9, v43, 21 +; SI-NEXT: s_or_b32 s87, s19, s20 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: v_readlane_b32 s9, v43, 19 +; SI-NEXT: s_or_b32 s55, s20, s19 +; SI-NEXT: s_mov_b32 s58, s9 +; SI-NEXT: s_and_b32 s19, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 18 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s9, 24 +; SI-NEXT: s_or_b32 s64, s20, s19 +; SI-NEXT: s_and_b32 s19, s78, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s24, 24 +; SI-NEXT: s_or_b32 s65, s20, s19 +; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_lshl_b32 s20, s45, 8 +; SI-NEXT: s_or_b32 s26, s19, s20 +; SI-NEXT: s_and_b32 s19, s13, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: 
s_lshl_b32 s20, s57, 24 +; SI-NEXT: s_or_b32 s66, s20, s19 +; SI-NEXT: s_and_b32 s19, s21, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s73, 24 +; SI-NEXT: s_or_b32 s67, s20, s19 +; SI-NEXT: s_and_b32 s19, s42, 0xff +; SI-NEXT: v_readlane_b32 s88, v43, 17 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s59, 24 +; SI-NEXT: s_or_b32 s68, s20, s19 +; SI-NEXT: s_and_b32 s19, s63, 0xff +; SI-NEXT: s_lshl_b32 s20, s88, 8 +; SI-NEXT: s_or_b32 s27, s19, s20 +; SI-NEXT: s_and_b32 s19, s40, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s43, 24 +; SI-NEXT: s_or_b32 s69, s20, s19 +; SI-NEXT: s_and_b32 s19, s61, 0xff +; SI-NEXT: s_mov_b32 s39, s57 +; SI-NEXT: s_mov_b32 s57, s7 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s75, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 16 +; SI-NEXT: s_or_b32 s70, s20, s19 +; SI-NEXT: s_mov_b32 s10, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_mov_b32 s71, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 14 +; SI-NEXT: s_or_b32 s62, s20, s19 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 13 +; SI-NEXT: s_mov_b32 s41, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 12 +; SI-NEXT: s_or_b32 s29, s19, s20 +; SI-NEXT: s_mov_b32 s14, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 11 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 10 +; SI-NEXT: s_or_b32 s80, s20, s19 +; SI-NEXT: s_mov_b32 s56, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 9 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_mov_b32 s81, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 8 +; SI-NEXT: s_or_b32 s11, s20, s19 +; SI-NEXT: s_mov_b32 s82, s7 +; 
SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 7 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_mov_b32 s96, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 6 +; SI-NEXT: s_mov_b32 s36, s63 +; SI-NEXT: s_mov_b32 s63, s93 +; SI-NEXT: s_mov_b32 s93, s61 +; SI-NEXT: s_mov_b32 s61, s91 +; SI-NEXT: s_mov_b32 s91, s75 +; SI-NEXT: s_mov_b32 s75, s92 +; SI-NEXT: s_or_b32 s92, s20, s19 +; SI-NEXT: s_mov_b32 s98, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 5 +; SI-NEXT: s_mov_b32 s44, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 4 +; SI-NEXT: s_mov_b32 s48, s13 +; SI-NEXT: s_mov_b32 s13, s94 +; SI-NEXT: s_mov_b32 s94, s21 +; SI-NEXT: s_or_b32 s21, s19, s20 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: s_mov_b32 s95, s4 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s22, 24 +; SI-NEXT: v_readlane_b32 s4, v42, 58 +; SI-NEXT: s_mov_b32 s46, s45 +; SI-NEXT: s_mov_b32 s34, s73 +; SI-NEXT: s_mov_b32 s73, s12 +; SI-NEXT: s_mov_b32 s37, s42 +; SI-NEXT: s_mov_b32 s38, s59 +; SI-NEXT: s_mov_b32 s59, s8 +; SI-NEXT: s_mov_b32 s30, s88 +; SI-NEXT: s_mov_b32 s88, s31 +; SI-NEXT: s_mov_b32 s78, s40 +; SI-NEXT: s_mov_b32 s31, s43 +; SI-NEXT: s_mov_b32 s12, s7 +; SI-NEXT: s_mov_b32 s7, s22 +; SI-NEXT: s_or_b32 s83, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s4, 16 +; SI-NEXT: s_lshl_b32 s74, s5, 16 +; SI-NEXT: s_lshl_b32 s22, s6, 16 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s19, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s17, vcc_lo, 16 +; SI-NEXT: s_lshl_b32 s6, vcc_hi, 16 +; SI-NEXT: s_lshl_b32 s99, s84, 16 +; SI-NEXT: s_lshl_b32 s8, s85, 16 +; SI-NEXT: s_lshl_b32 s97, s86, 16 +; SI-NEXT: s_lshl_b32 s28, s87, 16 +; SI-NEXT: s_lshl_b32 s87, s26, 16 +; SI-NEXT: v_readlane_b32 s26, v42, 56 +; SI-NEXT: s_lshl_b32 s86, s27, 16 +; SI-NEXT: v_readlane_b32 s27, v42, 57 +; SI-NEXT: v_readlane_b32 s35, v42, 61 +; SI-NEXT: s_lshl_b32 s85, 
s29, 16 +; SI-NEXT: v_readlane_b32 s29, v42, 60 +; SI-NEXT: v_readlane_b32 s24, v42, 59 +; SI-NEXT: v_readlane_b32 s90, v42, 62 +; SI-NEXT: s_lshl_b32 s84, s21, 16 +; SI-NEXT: s_mov_b32 s21, s47 +; SI-NEXT: s_cbranch_execnz .LBB89_3 +; SI-NEXT: .LBB89_2: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s98, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_add_i32 s6, s12, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s5, s56, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_lshl_b32 s6, s81, 8 +; SI-NEXT: s_add_i32 s16, s82, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s6, s96, 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_or_b32 s6, s6, s16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s6, s15, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s16, s41, 8 +; SI-NEXT: s_add_i32 s17, s14, 3 +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_lshl_b32 s16, s9, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: s_add_i32 s16, s93, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s91, 8 +; SI-NEXT: s_add_i32 s18, s10, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s17, s71, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s17, s36, 3 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; 
SI-NEXT: s_lshl_b32 s18, s30, 8 +; SI-NEXT: s_add_i32 s19, s78, 3 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s19, s19, 0xff +; SI-NEXT: s_lshl_b32 s18, s31, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_addk_i32 s17, 0x300 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_add_i32 s18, s94, 3 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s19, s34, 8 +; SI-NEXT: s_add_i32 s20, s37, 3 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s19, s38, 24 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_add_i32 s19, s95, 3 +; SI-NEXT: s_and_b32 s19, s19, 0xff +; SI-NEXT: s_lshl_b32 s20, s46, 8 +; SI-NEXT: s_add_i32 s22, s48, 3 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_lshl_b32 s20, s39, 24 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_addk_i32 s19, 0x300 +; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_add_i32 s20, s58, 3 +; SI-NEXT: v_readlane_b32 s7, v43, 18 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s22, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 49 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_lshl_b32 s22, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 48 +; SI-NEXT: s_add_i32 s23, s7, 3 +; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: v_readlane_b32 s7, v43, 23 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_add_i32 s22, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_lshl_b32 s23, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 20 +; SI-NEXT: s_or_b32 s22, s23, 
s22 +; SI-NEXT: s_lshl_b32 s23, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 21 +; SI-NEXT: s_add_i32 s60, s7, 3 +; SI-NEXT: s_and_b32 s60, s60, 0xff +; SI-NEXT: s_lshl_b32 s60, s60, 16 +; SI-NEXT: s_addk_i32 s22, 0x300 +; SI-NEXT: s_or_b32 s23, s23, s60 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: v_readlane_b32 s7, v43, 27 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: s_add_i32 s23, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v43, 26 +; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_lshl_b32 s60, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 24 +; SI-NEXT: s_or_b32 s23, s60, s23 +; SI-NEXT: s_lshl_b32 s60, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 25 +; SI-NEXT: s_add_i32 s61, s7, 3 +; SI-NEXT: s_and_b32 s61, s61, 0xff +; SI-NEXT: s_lshl_b32 s61, s61, 16 +; SI-NEXT: s_addk_i32 s23, 0x300 +; SI-NEXT: s_or_b32 s60, s60, s61 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: v_readlane_b32 s7, v43, 31 +; SI-NEXT: s_or_b32 s23, s60, s23 +; SI-NEXT: s_add_i32 s60, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v43, 30 +; SI-NEXT: s_and_b32 s60, s60, 0xff +; SI-NEXT: s_lshl_b32 s61, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 28 +; SI-NEXT: s_or_b32 s60, s61, s60 +; SI-NEXT: s_lshl_b32 s61, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 29 +; SI-NEXT: s_add_i32 s62, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 47 +; SI-NEXT: s_and_b32 s62, s62, 0xff +; SI-NEXT: s_add_i32 s59, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 46 +; SI-NEXT: s_lshl_b32 s62, s62, 16 +; SI-NEXT: s_addk_i32 s60, 0x300 +; SI-NEXT: s_and_b32 s59, s59, 0xff +; SI-NEXT: s_lshl_b32 s58, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 32 +; SI-NEXT: s_or_b32 s61, s61, s62 +; SI-NEXT: s_and_b32 s60, s60, 0xffff +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_lshl_b32 s59, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 33 +; SI-NEXT: s_or_b32 s60, s61, s60 +; SI-NEXT: s_add_i32 s61, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 45 +; SI-NEXT: s_add_i32 s57, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 44 +; SI-NEXT: s_lshl_b32 s56, s7, 8 +; 
SI-NEXT: v_readlane_b32 s7, v42, 43 +; SI-NEXT: s_lshl_b32 s47, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 42 +; SI-NEXT: s_add_i32 s46, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 41 +; SI-NEXT: s_add_i32 s45, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 38 +; SI-NEXT: s_lshl_b32 s42, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 35 +; SI-NEXT: s_lshl_b32 s15, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 34 +; SI-NEXT: s_and_b32 s45, s45, 0xff +; SI-NEXT: s_add_i32 s14, s7, 3 +; SI-NEXT: s_or_b32 s42, s42, s45 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_addk_i32 s42, 0x300 +; SI-NEXT: v_readlane_b32 s7, v42, 40 +; SI-NEXT: s_and_b32 s57, s57, 0xff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s42, 0xffff +; SI-NEXT: s_add_i32 s44, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 39 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_or_b32 s57, s14, s15 +; SI-NEXT: s_and_b32 s14, s44, 0xff +; SI-NEXT: s_lshl_b32 s15, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 37 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_lshl_b32 s15, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 36 +; SI-NEXT: s_add_i32 s40, s7, 3 +; SI-NEXT: s_and_b32 s61, s61, 0xff +; SI-NEXT: s_and_b32 s40, s40, 0xff +; SI-NEXT: s_lshl_b32 s61, s61, 16 +; SI-NEXT: s_addk_i32 s58, 0x300 +; SI-NEXT: s_lshl_b32 s40, s40, 16 +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_or_b32 s59, s59, s61 +; SI-NEXT: s_and_b32 s58, s58, 0xffff +; SI-NEXT: s_or_b32 s15, s15, s40 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s58, s59, s58 +; SI-NEXT: s_or_b32 s59, s15, s14 +; SI-NEXT: s_add_i32 s14, s6, 0x3000000 +; SI-NEXT: v_readlane_b32 s6, v42, 31 +; SI-NEXT: s_add_i32 s11, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 28 +; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_lshl_b32 s8, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 25 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_lshl_b32 s8, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 24 +; SI-NEXT: s_add_i32 s24, s7, 3 +; SI-NEXT: 
s_and_b32 s11, s24, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_or_b32 s8, s8, s6 +; SI-NEXT: v_readlane_b32 s6, v42, 32 +; SI-NEXT: s_add_i32 s12, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 26 +; SI-NEXT: s_and_b32 s6, s12, 0xff +; SI-NEXT: s_lshl_b32 s11, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 16 +; SI-NEXT: s_or_b32 s6, s11, s6 +; SI-NEXT: s_lshl_b32 s11, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 18 +; SI-NEXT: s_add_i32 s12, s7, 3 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: v_readlane_b32 s7, v42, 33 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_add_i32 s13, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 30 +; SI-NEXT: s_or_b32 s6, s11, s6 +; SI-NEXT: s_and_b32 s11, s13, 0xff +; SI-NEXT: s_lshl_b32 s10, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v42, 22 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_lshl_b32 s11, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 23 +; SI-NEXT: s_add_i32 s25, s7, 3 +; SI-NEXT: s_and_b32 s12, s25, 0xff +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: v_readlane_b32 s7, v42, 29 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s9, s7, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 27 +; SI-NEXT: v_readlane_b32 s11, v42, 20 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: v_readlane_b32 s9, v42, 21 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: v_readlane_b32 s9, v42, 19 +; SI-NEXT: s_add_i32 s21, s9, 3 +; SI-NEXT: 
v_readlane_b32 s11, v42, 17 +; SI-NEXT: v_readlane_b32 s12, v42, 14 +; SI-NEXT: s_and_b32 s9, s21, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 8 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_readlane_b32 s11, v42, 15 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_lshl_b32 s11, s11, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_readlane_b32 s11, v42, 13 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: v_readlane_b32 s12, v42, 12 +; SI-NEXT: v_readlane_b32 s13, v42, 10 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_readlane_b32 s12, v42, 11 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_readlane_b32 s12, v42, 9 +; SI-NEXT: s_add_i32 s15, s16, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: v_readlane_b32 s13, v42, 8 +; SI-NEXT: v_readlane_b32 s16, v42, 6 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readlane_b32 s13, v42, 7 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_lshl_b32 s13, s13, 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s13, s16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readlane_b32 s13, v42, 5 +; SI-NEXT: s_add_i32 s40, s17, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: v_readlane_b32 s16, v42, 4 +; SI-NEXT: v_readlane_b32 s17, v42, 2 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_add_i32 s17, 
s17, 3 +; SI-NEXT: s_or_b32 s13, s16, s13 +; SI-NEXT: v_readlane_b32 s16, v42, 3 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_lshl_b32 s16, s16, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s13, s16, s13 +; SI-NEXT: v_readlane_b32 s16, v42, 1 +; SI-NEXT: s_add_i32 s41, s18, 0x3000000 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 0 +; SI-NEXT: v_readlane_b32 s18, v43, 62 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 63 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s17, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 61 +; SI-NEXT: s_add_i32 s42, s19, 0x3000000 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s18, v43, 60 +; SI-NEXT: v_readlane_b32 s19, v43, 58 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: v_readlane_b32 s18, v43, 59 +; SI-NEXT: s_and_b32 s19, s19, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_or_b32 s16, s18, s16 +; SI-NEXT: v_readlane_b32 s18, v43, 57 +; SI-NEXT: s_add_i32 s43, s20, 0x3000000 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_readlane_b32 s19, v43, 56 +; SI-NEXT: v_readlane_b32 s20, v43, 54 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 8 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_readlane_b32 s19, 
v43, 55 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: s_lshl_b32 s19, s19, 24 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_readlane_b32 s19, v43, 53 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_readlane_b32 s20, v43, 52 +; SI-NEXT: v_readlane_b32 s21, v43, 50 +; SI-NEXT: s_and_b32 s19, s19, 0xff +; SI-NEXT: s_lshl_b32 s20, s20, 8 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: v_readlane_b32 s20, v43, 51 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: s_addk_i32 s19, 0x300 +; SI-NEXT: s_lshl_b32 s20, s20, 24 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: v_readlane_b32 s20, v43, 49 +; SI-NEXT: s_add_i32 s44, s22, 0x3000000 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_readlane_b32 s21, v43, 48 +; SI-NEXT: v_readlane_b32 s22, v43, 46 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: v_readlane_b32 s21, v43, 47 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_lshl_b32 s21, s21, 24 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s21, s20, 0x3000000 +; SI-NEXT: v_readlane_b32 s20, v43, 43 +; SI-NEXT: s_add_i32 s45, s23, 0x3000000 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_readlane_b32 s22, v43, 42 +; SI-NEXT: v_readlane_b32 s23, v43, 44 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 8 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: v_readlane_b32 s22, v43, 45 +; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_lshl_b32 s22, s22, 
24 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s22, s22, s23 +; SI-NEXT: s_or_b32 s20, s22, s20 +; SI-NEXT: s_add_i32 s22, s20, 0x3000000 +; SI-NEXT: v_readlane_b32 s20, v43, 41 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_readlane_b32 s23, v43, 40 +; SI-NEXT: v_readlane_b32 s24, v43, 38 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s23, s23, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s20, s23, s20 +; SI-NEXT: v_readlane_b32 s23, v43, 39 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_lshl_b32 s23, s23, 24 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s23, s23, s24 +; SI-NEXT: s_or_b32 s20, s23, s20 +; SI-NEXT: s_add_i32 s23, s20, 0x3000000 +; SI-NEXT: v_readlane_b32 s20, v43, 37 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_readlane_b32 s24, v43, 36 +; SI-NEXT: v_readlane_b32 s25, v43, 34 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s24, s24, 8 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s20, s24, s20 +; SI-NEXT: v_readlane_b32 s24, v43, 35 +; SI-NEXT: s_and_b32 s25, s25, 0xff +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_lshl_b32 s24, s24, 24 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_and_b32 s46, s46, 0xff +; SI-NEXT: s_or_b32 s20, s24, s20 +; SI-NEXT: v_readlane_b32 s24, v43, 3 +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_readlane_b32 s25, v43, 2 +; SI-NEXT: v_readlane_b32 s26, v43, 1 +; SI-NEXT: s_or_b32 s46, s47, s46 +; SI-NEXT: s_and_b32 s47, s56, 0xffff +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_lshl_b32 s25, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s56, s46, s47 +; SI-NEXT: s_add_i32 s47, s58, 
0x3000000 +; SI-NEXT: s_add_i32 s58, s59, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: v_readlane_b32 s25, v43, 0 +; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: s_and_b32 s73, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s9, 16 +; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_addk_i32 s24, 0x300 +; SI-NEXT: s_lshl_b32 s25, s25, 24 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_and_b32 s63, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s17, 16 +; SI-NEXT: v_writelane_b32 v42, s9, 50 +; SI-NEXT: s_lshl_b32 s17, s7, 16 +; SI-NEXT: s_lshl_b32 s7, s10, 16 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: v_writelane_b32 v42, s7, 51 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: v_writelane_b32 v42, s7, 52 +; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s46, s60, 0x3000000 +; SI-NEXT: s_add_i32 s56, s56, 0x3000000 +; SI-NEXT: s_add_i32 s57, s57, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_add_i32 s16, s16, 0x3000000 +; SI-NEXT: s_add_i32 s18, s18, 0x3000000 +; SI-NEXT: s_add_i32 s19, s19, 0x3000000 +; SI-NEXT: s_add_i32 s20, s20, 0x3000000 +; SI-NEXT: s_add_i32 s24, s24, 0x3000000 +; SI-NEXT: v_writelane_b32 v42, s7, 53 +; SI-NEXT: s_lshl_b32 s7, s8, 16 +; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s24, 16 +; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s23, 16 +; SI-NEXT: s_and_b32 s90, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s22, 16 +; SI-NEXT: s_and_b32 s25, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: 
s_and_b32 s75, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s19, 16 +; SI-NEXT: s_and_b32 s61, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s18, 16 +; SI-NEXT: s_and_b32 s77, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s89, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s13, 16 +; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s12, 16 +; SI-NEXT: s_and_b32 s60, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s11, 16 +; SI-NEXT: s_and_b32 s23, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_writelane_b32 v42, s7, 54 +; SI-NEXT: s_and_b32 s72, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s99, s58, 16 +; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s57, 16 +; SI-NEXT: s_and_b32 s49, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: s_and_b32 s51, s47, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s47, 16 +; SI-NEXT: s_and_b32 s52, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s46, 16 +; SI-NEXT: s_and_b32 s54, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s53, s45, 16 +; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s44, 16 +; SI-NEXT: s_and_b32 s65, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s43, 16 +; SI-NEXT: s_and_b32 s66, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s87, s42, 16 +; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s67, s41, 16 +; SI-NEXT: s_and_b32 s69, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s86, s40, 16 +; SI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s15, 16 +; SI-NEXT: s_and_b32 s80, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s14, 16 +; SI-NEXT: s_and_b32 s92, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s5, 16 +; SI-NEXT: s_and_b32 s83, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s84, s4, 16 +; SI-NEXT: v_writelane_b32 v42, s7, 55 +; SI-NEXT: .LBB89_3: ; %end +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s26 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s21 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 
v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: v_readlane_b32 s4, v42, 50 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 +; SI-NEXT: v_readlane_b32 s4, v42, 51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
+; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: v_readlane_b32 s4, v42, 52 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: v_readlane_b32 s4, v42, 53 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v42, 54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: v_readlane_b32 s4, v42, 55 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s57 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s97 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s53 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s87 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86 +; 
SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s83 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s84 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v41, 35 +; SI-NEXT: v_readlane_b32 s98, v41, 34 +; SI-NEXT: v_readlane_b32 s97, v41, 33 +; SI-NEXT: v_readlane_b32 s96, v41, 32 +; SI-NEXT: v_readlane_b32 s87, v41, 31 +; SI-NEXT: v_readlane_b32 s86, v41, 30 +; SI-NEXT: v_readlane_b32 s85, v41, 29 +; SI-NEXT: v_readlane_b32 s84, v41, 28 +; SI-NEXT: v_readlane_b32 s83, v41, 27 +; SI-NEXT: v_readlane_b32 s82, v41, 26 +; SI-NEXT: v_readlane_b32 s81, v41, 25 +; SI-NEXT: v_readlane_b32 s80, v41, 24 +; SI-NEXT: v_readlane_b32 s71, v41, 23 +; SI-NEXT: v_readlane_b32 s70, v41, 22 +; SI-NEXT: v_readlane_b32 s69, v41, 21 +; 
SI-NEXT: v_readlane_b32 s68, v41, 20 +; SI-NEXT: v_readlane_b32 s67, v41, 19 +; SI-NEXT: v_readlane_b32 s66, v41, 18 +; SI-NEXT: v_readlane_b32 s65, v41, 17 +; SI-NEXT: v_readlane_b32 s64, v41, 16 +; SI-NEXT: v_readlane_b32 s55, v41, 15 +; SI-NEXT: v_readlane_b32 s54, v41, 14 +; SI-NEXT: v_readlane_b32 s53, v41, 13 +; SI-NEXT: v_readlane_b32 s52, v41, 12 +; SI-NEXT: v_readlane_b32 s51, v41, 11 +; SI-NEXT: v_readlane_b32 s50, v41, 10 +; SI-NEXT: v_readlane_b32 s49, v41, 9 +; SI-NEXT: v_readlane_b32 s48, v41, 8 +; SI-NEXT: v_readlane_b32 s39, v41, 7 +; SI-NEXT: v_readlane_b32 s38, v41, 6 +; SI-NEXT: v_readlane_b32 s37, v41, 5 +; SI-NEXT: v_readlane_b32 s36, v41, 4 +; SI-NEXT: v_readlane_b32 s35, v41, 3 +; SI-NEXT: v_readlane_b32 s34, v41, 2 +; SI-NEXT: v_readlane_b32 s31, v41, 1 +; SI-NEXT: v_readlane_b32 s30, v41, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB89_4: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: s_mov_b32 s7, s6 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: v_readlane_b32 s58, v43, 19 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_mov_b32 s95, s47 +; SI-NEXT: s_mov_b32 s94, s21 +; SI-NEXT: s_mov_b32 s93, s61 +; SI-NEXT: s_mov_b32 s34, s73 +; SI-NEXT: s_mov_b32 s91, s75 +; SI-NEXT: v_readlane_b32 s56, v43, 10 +; SI-NEXT: s_mov_b32 s36, s63 +; SI-NEXT: s_mov_b32 s38, s59 +; SI-NEXT: s_mov_b32 s37, s42 +; SI-NEXT: v_readlane_b32 s30, v43, 17 +; SI-NEXT: v_readlane_b32 s98, v43, 6 +; SI-NEXT: s_mov_b32 s46, s45 +; SI-NEXT: s_mov_b32 s31, s43 +; SI-NEXT: s_mov_b32 s78, s40 +; 
SI-NEXT: v_readlane_b32 s15, v43, 14 +; SI-NEXT: s_mov_b32 s39, s57 +; SI-NEXT: s_mov_b32 s48, s13 +; SI-NEXT: v_readlane_b32 s41, v43, 13 +; SI-NEXT: v_readlane_b32 s44, v43, 5 +; SI-NEXT: v_readlane_b32 s9, v43, 11 +; SI-NEXT: v_readlane_b32 s14, v43, 12 +; SI-NEXT: v_readlane_b32 s81, v43, 9 +; SI-NEXT: v_readlane_b32 s10, v43, 16 +; SI-NEXT: v_readlane_b32 s12, v43, 4 +; SI-NEXT: v_readlane_b32 s96, v43, 7 +; SI-NEXT: v_readlane_b32 s82, v43, 8 +; SI-NEXT: v_readlane_b32 s71, v43, 15 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 +; 
SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: s_branch .LBB89_2 +; +; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: 
buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: 
v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, 
v6 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 
offset:260 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, 
s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 
; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB89_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 
v3, v8 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v32, v1 +; VI-NEXT: v_or_b32_sdwa v1, v41, 
v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v37, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; 
VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v63, v0 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v57, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB89_3 +; VI-NEXT: .LBB89_2: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 +; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s26, 
s26, 3 +; VI-NEXT: s_lshl_b32 s5, s27, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s9, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 +; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 +; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 +; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 +; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) 
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; 
VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 +; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, 
vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 +; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 +; 
VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 +; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v9, v9, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 +; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 +; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: 
s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 +; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v44, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, 
vcc, 0x3000000, v5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 +; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v47, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v47, s4, v47 +; VI-NEXT: s_and_b32 s4, s26, 0xff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s16, 0xff +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v56 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v32, v16, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: 
v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v32 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v47 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 +; VI-NEXT: .LBB89_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB89_4: +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v63, v3 +; VI-NEXT: v_mov_b32_e32 v53, v28 +; 
VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB89_2 +; +; GFX9-LABEL: bitcast_v128i8_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v30, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort 
v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 
4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v45, 
off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v10, 
off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(36) +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v39, v16 +; GFX9-NEXT: v_or_b32_sdwa v17, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v42, v61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v55, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v50, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v49, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; 
GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; 
GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_mov_b32_e32 v46, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v45 +; GFX9-NEXT: v_mov_b32_e32 v45, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v54, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB89_3 +; GFX9-NEXT: .LBB89_2: +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB89_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB89_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_lshl_b32 s6, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshl_b32 s9, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_lshl_b32 s10, s19, 8 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, 
s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v25, v37, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v37, v51, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, 
v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s16, 0xff +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_and_b32 s9, s18, 0xff +; GFX9-NEXT: s_or_b32 s9, 
s10, s9 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 
v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 
16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v39, v36, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 
offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:656 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v23, v41, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 +; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 +; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 +; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 +; GFX9-NEXT: v_add_u32_e32 v39, 0x300, v49 +; GFX9-NEXT: v_add_u32_e32 v49, 0x300, v53 +; GFX9-NEXT: v_add_u32_e32 v50, 0x300, v55 +; GFX9-NEXT: v_add_u32_e32 v53, 0x300, v45 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: 
v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 +; GFX9-NEXT: .LBB89_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v60, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v164, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v39.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB89_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v2, 0xffff, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v68 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v148 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v20, 16, v21 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v149 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v151 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v161 +; 
GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v163 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v165 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v167 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v44 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v63 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v75 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v78 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v1, 0xff, v79 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v91 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v92 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB89_3 +; GFX11-TRUE16-NEXT: .LBB89_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, 
s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v88 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v78 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v77 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v76 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v74 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v61 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v59, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v60, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v57 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v58 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v57, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v56, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v45 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v46 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v45, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v44 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v41 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v180 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, 8, v177 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v167, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v176, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v165 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v166 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v165, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v161 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v162 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v149 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v20, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v4, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v70, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v69, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v64, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v53, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v38, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v37, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v34, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v53, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v49, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v165, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v161, 16, v32 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v14, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v59, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v35 +; GFX11-TRUE16-NEXT: .LBB89_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: 
scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB89_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB89_2 +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 
off, v44, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 
v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104 -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v72, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84 -; GFX11-FAKE16-NEXT: 
scratch_load_u16 v119, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, 
s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62 ; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92 ; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 
.LBB89_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v9, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v11, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56 -; 
GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v91 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v20, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v95 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v105 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v94 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v108 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v107 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v110 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v109 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v23, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v24, 
0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v31, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v121 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v123 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v120 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v125 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v124 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v126 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v127 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v30, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v32, v31, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v34, v33, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v36, v35, 0x5040100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 -; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 
0xff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v35, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v126, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v127, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v125, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v120, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v16, 0xff, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, 
v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB89_3 +; GFX11-FAKE16-NEXT: .LBB89_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; 
GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 
v3, v68, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v94, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v4, v180, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 
v133, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 -; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v54, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 
v52, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v53, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v50, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v5, 3, v99 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v49, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v49, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v48, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v32, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v39, 0x300, v1 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 -; 
GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 -; GFX11-FAKE16-NEXT: .LBB44_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 
offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v65, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, 
v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v64, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 
v32, 0xffff, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 +; GFX11-FAKE16-NEXT: .LBB89_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v75, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB89_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB89_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -79103,2093 +162547,2037 @@ end: } define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v128i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v49, off, 
s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v41 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte 
Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v46 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:620 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v1 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v45 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v12 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v13 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v14 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; 
kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: 
; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; 
GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; 
GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB45_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v28, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v24, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v25, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v26, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v30, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v35 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v40, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v39 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v42, v2, v3, 16 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v48 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v44, v2, v3, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v60 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v46, v2, v3, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v56, v2, v3, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v58, v2, v3, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; GCN-NEXT: v_alignbit_b32 v61, v2, v18, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_alignbit_b32 v15, v2, v19, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; GCN-NEXT: v_alignbit_b32 v14, v2, v17, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v31 -; GCN-NEXT: v_alignbit_b32 v13, v2, v27, 16 -; 
GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; GCN-NEXT: v_alignbit_b32 v59, v2, v33, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v35 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v39 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v48 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v60 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v20 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v22 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v34 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v11 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v43 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v47 -; GCN-NEXT: 
buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 24, v62 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v16 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v23 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v51, v52, v16, 16 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v35, v12, v16, 16 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v36, v36, v12, 16 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v37, v37, v12, 16 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v38, v38, v12, 16 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v39, v6, v12, 16 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v48, v3, v12, 16 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v50, v5, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v52 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v52, v8, v3, 16 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v53, v10, v3, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v54, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v55, v7, v2, 16 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v41, v9, v2, 16 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v45, v11, v2, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v57, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v4, v51, v28, 24 -; GCN-NEXT: v_alignbit_b32 v10, v51, v28, 16 -; GCN-NEXT: v_alignbit_b32 v3, v51, v28, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v35, v24, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v35, v24, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v8, v35, v24, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v36, v25, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v36, v25, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v12, v36, v25, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded 
Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_alignbit_b32 v1, v54, v61, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 16 -; 
GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 
-; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: 
$vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: .LBB45_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB45_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v29 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v59, v13, v12, 16 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v31 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v21 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v63 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v15, v17, v15, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v47 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v43 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v34 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v20 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v42, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v60 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v44, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_and_b32_e32 v46, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v62, 0xffff0000, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v4 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; 
GCN-NEXT: v_add_f32_e32 v60, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v18 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v23 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v27 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v31 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v33 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v35 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v34, 
0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v53 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v54 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v55 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v40 -; GCN-NEXT: v_add_f32_e32 v50, 0x40c00000, v41 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v42 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v43 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v44 -; GCN-NEXT: v_add_f32_e32 v52, 0x40c00000, v45 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v46 -; GCN-NEXT: v_add_f32_e32 v39, 0x40c00000, v47 -; GCN-NEXT: v_add_f32_e32 v51, 0x40c00000, v56 -; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v57 -; GCN-NEXT: v_add_f32_e32 v47, 0x40c00000, v58 -; GCN-NEXT: v_add_f32_e32 v48, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v53, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v61 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v62 -; GCN-NEXT: v_add_f32_e32 v49, 0x40c00000, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v61, v1, v6, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_alignbit_b32 v58, v1, v2, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; GCN-NEXT: v_alignbit_b32 v56, v1, v3, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; GCN-NEXT: v_alignbit_b32 v46, v1, v4, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; GCN-NEXT: v_alignbit_b32 v44, v1, v5, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 
16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_alignbit_b32 v42, v1, v33, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_alignbit_b32 v40, v1, v8, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; GCN-NEXT: v_alignbit_b32 v30, v1, v30, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v50 -; GCN-NEXT: v_alignbit_b32 v26, v1, v25, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v52 -; GCN-NEXT: v_alignbit_b32 v25, v1, v41, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; GCN-NEXT: v_alignbit_b32 v24, v1, v51, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; GCN-NEXT: v_alignbit_b32 v28, v1, v53, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 24, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 24, v20 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 24, v60 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 24, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v57, v20, v21, 16 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v45, v18, v20, 16 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v41, v10, v18, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v55, v1, v10, 16 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v54, v62, v1, 16 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v53, v12, v1, 16 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v52, v11, v1, 16 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v50, v4, v1, 16 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v3, v17, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v48, v2, v19, 16 -; GCN-NEXT: v_alignbit_b32 v39, v6, v16, 16 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v38, v29, v22, 16 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v37, v8, v23, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v36, v7, v43, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v35, v5, v47, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v31 -; GCN-NEXT: v_alignbit_b32 v51, v5, v9, 16 -; GCN-NEXT: v_alignbit_b32 v4, v51, v28, 24 -; GCN-NEXT: v_alignbit_b32 v10, v51, v28, 16 -; GCN-NEXT: v_alignbit_b32 v3, v51, v28, 8 -; GCN-NEXT: v_alignbit_b32 v1, v35, v24, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v35, v24, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v8, v35, v24, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v36, v25, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v36, v25, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v12, v36, v25, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 16 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded 
Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: .LBB45_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v51 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xff, 
v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v8 -; GCN-NEXT: v_or_b32_e32 v29, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v31, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v12 -; GCN-NEXT: v_or_b32_e32 v2, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v36 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v62, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v38 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: 
v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v40 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v8, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v42 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v9, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v48 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v44 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v11, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v49 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v16, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v46 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v17, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v50 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 
8, v3 -; GCN-NEXT: v_or_b32_e32 v18, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v56 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v52 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v20, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v21, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v22, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v61 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v23, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v54 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v24, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v15, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v55 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v25, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; 
GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v14, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v41 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v26, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v13 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v27, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v57 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v28, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v30, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v32, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v34, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v35, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v36, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v37, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v38, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v39, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v48, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v49, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v50, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v51, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v52, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v54, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v55, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v42, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v43, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v60, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v4, v1, v30 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v5, v1, v33 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v63, v2, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v61, v3, v34 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v35 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill 
-; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v36 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v29, v29, v37 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v30, v30, v38 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v39 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v48 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v50 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v11, v11, v52 
-; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v53 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v54 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v55 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v19, v40 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v41 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v21, v21, v42 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v43 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v23, v23, v44 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v45 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v46 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v47 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v26, v26, v56 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v13, v57 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v58 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v59 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; 
GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v28, v28, v60 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v50, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v21, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64bf16_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 
offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v36 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v44 +; SI-NEXT: 
v_mul_f32_e32 v19, 1.0, v63 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v1 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v5 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v10 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; 
SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed 
$vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: 
killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; 
kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB90_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_alignbit_b32 v38, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v35, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v32, v1, v2, 16 +; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v29, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v26, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v40, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v23, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v54, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v18, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v52, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v15, v1, v57, 16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v50, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v13, v1, v61, 16 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v48, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v11, v1, v60, 16 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v37, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: 
buffer_load_dword v49, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v8, v1, v51, 16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v34, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_alignbit_b32 v5, v1, v53, 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v31, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_alignbit_b32 v4, v1, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v28, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_alignbit_b32 v3, v1, v41, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_alignbit_b32 v25, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_alignbit_b32 v2, v1, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_alignbit_b32 v22, v1, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_alignbit_b32 v20, v7, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v44 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v40, v38, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v40, v38, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v54, v35, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v54, v35, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v54, v35, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v52, v32, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v52, v32, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v52, v32, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v50, v29, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v50, v29, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v50, v29, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v48, v26, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v48, v26, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v48, v26, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v37, v23, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v37, v23, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v37, v23, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v34, v18, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v34, v18, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v34, v18, 8 +; SI-NEXT: 
buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v31, v15, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v31, v15, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v31, v15, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v28, v13, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v28, v13, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v28, v13, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v25, v11, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v25, v11, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v25, v11, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v22, v8, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v22, v8, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v22, v8, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_alignbit_b32 v17, v7, v9, 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16 +; SI-NEXT: v_alignbit_b32 v14, v7, v27, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16 +; SI-NEXT: v_alignbit_b32 v10, v7, v39, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8 +; 
SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v44 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v43, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v7, v55, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v40 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v54 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v52 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 
offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v50 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v48 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v37 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v34 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v31 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v28 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v25 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v22 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v20 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v17 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v14 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v10 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, 
v7 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v58 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v33 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v47 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v57 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v42 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v59 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v49 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v51 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v62 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v45 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v24 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v9, 
off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v36 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB90_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB90_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; 
SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v5, v8, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v8, v11, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 
0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v11, v14, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 
0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v19 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v17, v17, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v54 +; SI-NEXT: 
s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v50 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v52 +; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v54 +; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v41 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v15, v20, v15, 16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v20, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v23, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v23, v26, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_alignbit_b32 v26, v29, v26, 16 +; SI-NEXT: v_lshrrev_b32_e32 
v29, 16, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v27 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_alignbit_b32 v29, v32, v29, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_alignbit_b32 v32, v35, v32, 16 +; 
SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v35, v34, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v33 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; SI-NEXT: v_alignbit_b32 v35, v38, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v36 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v37, v38, v37, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v36 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v38, 
0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; SI-NEXT: v_alignbit_b32 v38, v49, v38, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v39 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v48, v49, v48, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v39 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: v_alignbit_b32 v50, v50, v49, 16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: v_alignbit_b32 v52, v52, v49, 16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: v_alignbit_b32 v54, v54, v49, 16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: v_alignbit_b32 v40, v40, v49, 16 +; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v40, v38, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v40, v38, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v54, v35, 24 +; 
SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v54, v35, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v54, v35, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v52, v32, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v52, v32, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v52, v32, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v50, v29, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v50, v29, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v50, v29, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v48, v26, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v48, v26, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v48, v26, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v37, v23, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v37, v23, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v37, v23, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v34, v18, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v34, v18, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v34, v18, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v31, v15, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v31, v15, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v31, v15, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v28, v13, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v28, v13, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v28, v13, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v25, v11, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 
v6, v25, v11, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v25, v11, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v22, v8, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v22, v8, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v22, v8, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 24 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v40 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v54 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v52 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v50 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v48 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v37 +; 
SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v34 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v31 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v28 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v25 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v22 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v41 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v17 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v55 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v14 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v53 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v10 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v51 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 
offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v7 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: .LBB90_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; 
SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v35 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v54 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v32 +; SI-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v29 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) 
+; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v26 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v48 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v23 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v37 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v34 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 
v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v15 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v31 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v13 +; SI-NEXT: buffer_load_dword v12, 
off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v25 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, 
s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; 
SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v128i8: ; VI: ; %bb.0: @@ -81396,7 +164784,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB45_2 +; VI-NEXT: s_cbranch_execz .LBB90_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill @@ -81584,12 +164972,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v47, v34 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: .LBB45_2: ; %Flow +; VI-NEXT: .LBB90_2: ; %Flow ; VI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_xor_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB45_4 +; VI-NEXT: s_cbranch_execz .LBB90_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 @@ -82347,7 +165735,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v17 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: .LBB45_4: ; %end +; VI-NEXT: .LBB90_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -82951,7 +166339,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; 
implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB45_2 +; GFX9-NEXT: s_cbranch_execz .LBB90_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill @@ -83150,11 +166538,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB45_2: ; %Flow +; GFX9-NEXT: .LBB90_2: ; %Flow ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v58, v57 ; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB45_4 +; GFX9-NEXT: s_cbranch_execz .LBB90_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v18 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -83938,7 +167326,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v60 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v59 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v59 -; GFX9-NEXT: .LBB45_4: ; %end +; GFX9-NEXT: .LBB90_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -84483,7 +167871,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB45_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, 
v[31:32] @@ -84598,9 +167986,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v31.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v32.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v32.h -; GFX11-TRUE16-NEXT: .LBB45_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB45_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 @@ -85224,7 +168612,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v7 -; GFX11-TRUE16-NEXT: .LBB45_4: ; %end +; GFX11-TRUE16-NEXT: .LBB90_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v144.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v108.l @@ -85717,7 +169105,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB45_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] @@ -85816,9 +169204,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] -; GFX11-FAKE16-NEXT: .LBB45_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB90_2: ; %Flow ; GFX11-FAKE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB45_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB90_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v17 @@ -86087,676 +169475,9180 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v32, v48, vcc_lo ; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v51 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v51, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v51, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v96, v49, v48, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v51, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v50, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v53, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v51 +; GFX11-FAKE16-NEXT: v_perm_b32 v87, v29, v28, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v49, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v55, v51, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; GFX11-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v32, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v49, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v54 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v49, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v52, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v49 +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v98, v50, v32, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v52, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v51, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v98 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v98 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v50, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v55, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: 
v_add_f32_e32 v52, 0x40c00000, v54 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v53, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v51, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v52, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v100, v3, v50, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v53, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v66, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v53 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v97, v31, v30, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v103, v3, v51, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v3 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v6, v54 :: v_dual_add_f32 v6, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v54, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v52, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v102, v5, v53, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v54, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v99, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v52, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v67, v54, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v7, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v8, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v66 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v52, v67, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v8, 16, 1 +; 
GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_perm_b32 v182, v7, v54, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v8, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v55, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v183, v5, v6, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v52, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v112, v55, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v67, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: v_perm_b32 v101, v4, v49, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v10, v66 :: v_dual_add_f32 v10, 0x40c00000, v67 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v112 :: v_dual_lshlrev_b32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v176, v9, v55, 0x7060302 +; GFX11-FAKE16-NEXT: 
v_add_f32_e32 v9, 0x40c00000, v52 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v66, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add3_u32 v67, v67, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v9, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v52, v66 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v113, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v177, v7, v8, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v67, v112 :: v_dual_lshlrev_b32 v67, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v26 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v66 :: v_dual_add_f32 v52, 0x40c00000, v67 +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v112, v11, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v52, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v66, v67 :: v_dual_add_f32 v66, 0x40c00000, v112 +; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v52 +; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v52, v52 +; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v66, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v66 +; GFX11-FAKE16-NEXT: v_perm_b32 v162, v11, v9, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v67, v112, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v66, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v67, v112 :: v_dual_add_f32 v67, 0x40c00000, v114 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-FAKE16-NEXT: v_add3_u32 v112, v116, v13, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v96, v49, v48, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v51, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v50, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v149, v14, v52, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v67, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v113, v115, vcc_lo +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v113, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_perm_b32 v163, v12, v10, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v112, v113 :: v_dual_add_f32 v112, 0x40c00000, v115 +; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v67, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v67 +; GFX11-FAKE16-NEXT: v_bfe_u32 v115, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v112, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_perm_b32 v148, v13, v66, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v113, v114, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v114, v115, v16, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v116, v116, v112, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[96:97] +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v114, v115, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v112, v112 +; GFX11-FAKE16-NEXT: v_add3_u32 v113, v113, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[86:87] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[84:85] +; GFX11-FAKE16-NEXT: v_perm_b32 v135, v16, v67, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v112, v116, v117, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v52 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v53 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v113, v118, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX11-FAKE16-NEXT: v_perm_b32 v134, v15, v112, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v112 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[134:135] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[148:149] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[162:163] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[176:177] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[182:183] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[82:83] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v66 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v54 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[102:103] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[98:99] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[80:81] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v55 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[100:101] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, 
v[70:71] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[68:69] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v135 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v135 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v134 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v134 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v149 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v149 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v148 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v148 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v163 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v163 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v162 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v162 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v177 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v177 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v176 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v176 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v183 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v183 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v182 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v182 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v103 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v103 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v102 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v102 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v101 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v101 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v99 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v99 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v97 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v96 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 24, v87 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 
8, v87 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v86 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v85 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v84 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v82 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v81 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v77 +; GFX11-FAKE16-NEXT: .LBB90_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v65, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, 
v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v161 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v65, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v50, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v39, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v59 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 
v25, 8, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v115 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v86 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v25, 0xff, v113 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v100 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v112 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v33, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; 
GFX11-FAKE16-NEXT: s_clause 0x15 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:96 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v64bf16_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v39, off, 
s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 
4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v50 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v52 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v53 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v54 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v55 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; SI-NEXT: s_waitcnt 
vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s29 +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], 
s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:652 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB91_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v43, v36 +; SI-NEXT: v_alignbit_b32 v36, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_alignbit_b32 v6, v1, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_alignbit_b32 v2, v1, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_alignbit_b32 v5, v1, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_alignbit_b32 v3, v1, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_alignbit_b32 v16, v1, v57, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v13, v1, v58, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_alignbit_b32 v10, v1, v60, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, 
v11 +; SI-NEXT: v_alignbit_b32 v44, v19, v8, 16 +; SI-NEXT: v_alignbit_b32 v7, v1, v22, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24 +; SI-NEXT: v_alignbit_b32 v60, v1, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; SI-NEXT: v_alignbit_b32 v57, v1, v30, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8 +; SI-NEXT: v_alignbit_b32 v58, v22, v9, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v37, 16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v49 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8 +; SI-NEXT: v_alignbit_b32 v47, v25, v12, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_alignbit_b32 v8, v47, v2, 8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_alignbit_b32 v53, v1, v48, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_alignbit_b32 v50, v8, v59, 16 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v52, v1, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_mov_b32_e32 v17, v63 +; SI-NEXT: v_alignbit_b32 v1, v1, v41, 16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_alignbit_b32 v62, v8, v61, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 +; SI-NEXT: v_alignbit_b32 v55, v8, v63, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_alignbit_b32 v8, v55, v3, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v48, v62, v4, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_alignbit_b32 v38, v8, v45, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v38, v16, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_alignbit_b32 v35, v8, v18, 16 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v29, v35, v13, 8 +; SI-NEXT: v_alignbit_b32 v61, v38, v16, 24 +; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_alignbit_b32 v30, v8, v21, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_alignbit_b32 v27, v8, v24, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_alignbit_b32 v24, v8, v26, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_alignbit_b32 v21, v8, v14, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8 +; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_alignbit_b32 v18, v8, v15, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v18, v40, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v18, v40, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v8, v18, v40, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_alignbit_b32 v63, v8, v51, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v63, v53, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v63, v53, 16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v33 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v63, v53, 8 +; SI-NEXT: v_alignbit_b32 v12, v40, v43, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 
v8, v12, v52, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v42 +; SI-NEXT: v_mov_b32_e32 v15, v9 +; SI-NEXT: v_alignbit_b32 v9, v8, v54, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v49 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v46 +; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v56 +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v8 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v34 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v15 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 
v56, 24, v33 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v20 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v12 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v24 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v39 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v9 +; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55 +; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, v28 +; SI-NEXT: v_mov_b32_e32 v23, v48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v63 +; SI-NEXT: v_mov_b32_e32 v48, v33 +; SI-NEXT: v_mov_b32_e32 v34, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_branch .LBB91_3 +; SI-NEXT: .LBB91_2: +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; 
implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: 
; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; 
SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v48, v33 +; SI-NEXT: v_mov_b32_e32 v29, v28 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v17, v63 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; 
implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: .LBB91_3: ; %Flow +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v17 +; SI-NEXT: v_mov_b32_e32 v54, v61 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB91_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 
offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_alignbit_b32 v52, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v57, v7, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_alignbit_b32 v9, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte 
Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_alignbit_b32 v12, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v60, v10, v6, 16 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v7, v13, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_alignbit_b32 v63, v13, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v14, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_alignbit_b32 v18, v14, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v21, v15, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v16, v13, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v19, v16, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v24, v15, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 
4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v27, v15, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v3, v22, v19, 16 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v30, v15, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_alignbit_b32 v4, v25, v22, 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v40 +; SI-NEXT: v_alignbit_b32 v35, v45, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_alignbit_b32 v5, v28, v25, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v41 +; SI-NEXT: v_alignbit_b32 v38, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v56 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_alignbit_b32 v2, v33, v28, 16 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v43 +; SI-NEXT: v_alignbit_b32 v55, v61, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_alignbit_b32 v6, v36, v33, 16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v46 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; SI-NEXT: v_alignbit_b32 v62, v15, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_alignbit_b32 v36, v39, v36, 16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v42 +; SI-NEXT: v_alignbit_b32 v50, v17, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v56 +; SI-NEXT: v_alignbit_b32 v47, v25, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50 +; SI-NEXT: v_alignbit_b32 v58, v22, v14, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v56 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v63 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v46 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55 
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v43 +; SI-NEXT: v_alignbit_b32 v43, v38, v16, 8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v41 +; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v40 +; SI-NEXT: v_mov_b32_e32 v40, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v54 +; SI-NEXT: v_alignbit_b32 v54, v38, v16, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v20 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v35, v13, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23 +; SI-NEXT: v_alignbit_b32 v23, v62, v4, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v49 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshrrev_b32_e32 v8, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v26 +; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v44, v19, v14, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_alignbit_b32 v8, v30, v10, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v18, v51, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v18, v51, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v8, v18, v51, 8 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v63, v34, 24 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v9 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v63, v34, 16 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v63, v34, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_alignbit_b32 v8, v9, v1, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27 +; SI-NEXT: .LBB91_5: ; %end +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v28 +; SI-NEXT: v_or_b32_e32 v32, v36, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v36, 0xff, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v29 +; SI-NEXT: v_or_b32_e32 v36, v56, v36 +; SI-NEXT: v_or_b32_e32 v32, v32, v36 +; SI-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v51 +; SI-NEXT: v_or_b32_e32 v32, v32, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v36 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v14, v32, v14 +; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v14, v32, 
s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v14, v32 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v19 +; SI-NEXT: v_or_b32_e32 v32, v33, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v6 +; SI-NEXT: v_or_b32_e32 v14, v32, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v60 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) 
+; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v28, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v2 +; SI-NEXT: v_or_b32_e32 v14, v28, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v57 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, 
v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v2 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v23 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17 +; SI-NEXT: v_or_b32_e32 v14, v22, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17 +; SI-NEXT: v_or_b32_e32 v14, v22, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v46 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v43 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v54 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v38 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v2 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v20 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: 
v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v30 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], 
s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; 
SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, 
off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, 
v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v63, s68, 20 +; VI-NEXT: v_writelane_b32 v63, s69, 21 +; VI-NEXT: v_writelane_b32 v63, s70, 22 +; VI-NEXT: v_writelane_b32 v63, s71, 23 +; VI-NEXT: v_writelane_b32 v63, s80, 24 +; VI-NEXT: v_writelane_b32 v63, s81, 25 +; VI-NEXT: v_writelane_b32 v63, s82, 26 +; VI-NEXT: v_writelane_b32 v63, s83, 27 +; VI-NEXT: v_writelane_b32 v63, s84, 28 +; VI-NEXT: v_writelane_b32 v63, s85, 29 +; VI-NEXT: v_writelane_b32 v63, s86, 30 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 
+; VI-NEXT: v_writelane_b32 v63, s87, 31 +; VI-NEXT: v_readfirstlane_b32 s44, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s42, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s40, v7 +; VI-NEXT: v_readfirstlane_b32 s41, v8 +; VI-NEXT: v_readfirstlane_b32 s14, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s12, v11 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s10, v13 +; VI-NEXT: v_readfirstlane_b32 s11, v14 +; VI-NEXT: v_readfirstlane_b32 s8, v15 +; VI-NEXT: v_readfirstlane_b32 s9, v16 +; VI-NEXT: v_readfirstlane_b32 s6, v17 +; VI-NEXT: v_readfirstlane_b32 s7, v18 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; 
implicit-def: $vgpr62 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB91_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s25, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s24, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s24, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s23, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s22, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s22, 8 +; VI-NEXT: 
v_writelane_b32 v62, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s20, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s20, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s19, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s19, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s17, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s17, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s17, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s16, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 
v62, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s80, s13, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 0 +; VI-NEXT: s_lshr_b32 s81, s12, 8 +; VI-NEXT: s_lshr_b32 s82, s15, 24 +; VI-NEXT: s_lshr_b32 s83, s15, 16 +; VI-NEXT: s_lshr_b32 s85, s15, 8 +; VI-NEXT: s_lshr_b32 s84, s14, 16 +; VI-NEXT: s_lshr_b32 s86, s14, 8 +; VI-NEXT: s_lshr_b32 s87, s41, 24 +; VI-NEXT: s_lshr_b32 s50, s41, 16 +; VI-NEXT: s_lshr_b32 s52, s41, 8 +; VI-NEXT: s_lshr_b32 s51, s40, 16 +; VI-NEXT: s_lshr_b32 s53, s40, 8 +; VI-NEXT: s_lshr_b32 s54, s43, 24 +; VI-NEXT: s_lshr_b32 s55, s43, 16 +; VI-NEXT: s_lshr_b32 s65, s43, 8 +; VI-NEXT: s_lshr_b32 s64, s42, 16 +; VI-NEXT: s_lshr_b32 s66, s42, 8 +; VI-NEXT: s_lshr_b32 s67, s45, 24 +; VI-NEXT: s_lshr_b32 s68, s45, 16 +; VI-NEXT: s_lshr_b32 s70, s45, 8 +; VI-NEXT: s_lshr_b32 s69, s44, 16 +; VI-NEXT: s_lshr_b32 s71, s44, 8 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[38:39], 
s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 +; VI-NEXT: s_cbranch_execnz .LBB91_4 +; VI-NEXT: .LBB91_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s46, s45, 16 +; VI-NEXT: v_mov_b32_e32 v31, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s46, v31 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s45, s45, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s45, v31 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s45, s44, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s45, v31 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s44, s44, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s44, v31 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s44, s43, 16 +; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; VI-NEXT: v_add_f32_e32 v3, s44, v31 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s43, s43, 0xffff0000 +; VI-NEXT: 
v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s43, v31 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s43, s42, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s43, v31 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s42, s42, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s42, v31 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: s_lshl_b32 s42, s41, 16 +; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; VI-NEXT: v_add_f32_e32 v5, s42, v31 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_and_b32 s41, s41, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s41, v31 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: s_lshl_b32 s41, s40, 16 +; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16 +; VI-NEXT: v_add_f32_e32 v5, s41, v31 +; VI-NEXT: v_bfe_u32 v7, 
v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_and_b32 s40, s40, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; VI-NEXT: v_add_f32_e32 v7, s40, v31 +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: s_lshl_b32 s40, s15, 16 +; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16 +; VI-NEXT: v_add_f32_e32 v7, s40, v31 +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s15, s15, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s15, v31 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_lshl_b32 s15, s14, 16 +; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 +; VI-NEXT: v_add_f32_e32 v7, s15, v31 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s14, s14, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; VI-NEXT: v_add_f32_e32 v9, s14, v31 +; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 
vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: s_lshl_b32 s14, s13, 16 +; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 +; VI-NEXT: v_add_f32_e32 v9, s14, v31 +; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_and_b32 s13, s13, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; VI-NEXT: v_add_f32_e32 v10, s13, v31 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: s_lshl_b32 s13, s12, 16 +; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 +; VI-NEXT: v_add_f32_e32 v9, s13, v31 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_and_b32 s12, s12, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc +; VI-NEXT: v_add_f32_e32 v11, s12, v31 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: s_lshl_b32 s12, s11, 16 +; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 +; VI-NEXT: v_add_f32_e32 v11, s12, v31 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 
s11, s11, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_add_f32_e32 v12, s11, v31 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: s_lshl_b32 s11, s10, 16 +; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 +; VI-NEXT: v_add_f32_e32 v11, s11, v31 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s10, s10, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s10, v31 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: s_lshl_b32 s10, s9, 16 +; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 +; VI-NEXT: v_add_f32_e32 v13, s10, v31 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s9, s9, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_add_f32_e32 v14, s9, v31 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: s_lshl_b32 s9, 
s8, 16 +; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 +; VI-NEXT: v_add_f32_e32 v13, s9, v31 +; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s8, s8, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc +; VI-NEXT: v_add_f32_e32 v15, s8, v31 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: s_lshl_b32 s8, s7, 16 +; VI-NEXT: v_alignbit_b32 v13, v15, v13, 16 +; VI-NEXT: v_add_f32_e32 v15, s8, v31 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: s_and_b32 s7, s7, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s7, v31 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_lshl_b32 s7, s6, 16 +; VI-NEXT: v_alignbit_b32 v16, v16, v15, 16 +; VI-NEXT: v_add_f32_e32 v15, s7, v31 +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: s_and_b32 s6, s6, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: 
v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_alignbit_b32 v15, v17, v15, 16 +; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_add_f32_e32 v18, s6, v31 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: s_lshl_b32 s6, s16, 16 +; VI-NEXT: v_alignbit_b32 v18, v18, v17, 16 +; VI-NEXT: v_add_f32_e32 v17, s6, v31 +; VI-NEXT: v_bfe_u32 v19, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v17 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc +; VI-NEXT: v_add_f32_e32 v19, s6, v31 +; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_alignbit_b32 v17, v19, v17, 16 +; VI-NEXT: v_add_f32_e32 v19, s6, v31 +; VI-NEXT: v_bfe_u32 v20, v19, 
16, 1 +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc +; VI-NEXT: v_add_f32_e32 v20, s6, v31 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: s_lshl_b32 s6, s18, 16 +; VI-NEXT: v_alignbit_b32 v20, v20, v19, 16 +; VI-NEXT: v_add_f32_e32 v19, s6, v31 +; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v19, v21, v22, vcc +; VI-NEXT: v_add_f32_e32 v21, s6, v31 +; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_alignbit_b32 v19, v21, v19, 16 +; VI-NEXT: v_add_f32_e32 v21, s6, v31 +; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc +; VI-NEXT: v_add_f32_e32 v22, s6, v31 +; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; 
VI-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: s_lshl_b32 s6, s20, 16 +; VI-NEXT: v_alignbit_b32 v22, v22, v21, 16 +; VI-NEXT: v_add_f32_e32 v21, s6, v31 +; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v24, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v21, v23, v24, vcc +; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_alignbit_b32 v21, v23, v21, 16 +; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; VI-NEXT: v_add_f32_e32 v24, s6, v31 +; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: s_lshl_b32 s6, s22, 16 +; VI-NEXT: v_alignbit_b32 v24, v24, v23, 16 +; VI-NEXT: v_add_f32_e32 v23, s6, v31 +; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 
v26, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v23, v25, v26, vcc +; VI-NEXT: v_add_f32_e32 v25, s6, v31 +; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_alignbit_b32 v23, v25, v23, 16 +; VI-NEXT: v_add_f32_e32 v25, s6, v31 +; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc +; VI-NEXT: v_add_f32_e32 v26, s6, v31 +; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: s_lshl_b32 s6, s24, 16 +; VI-NEXT: v_alignbit_b32 v26, v26, v25, 16 +; VI-NEXT: v_add_f32_e32 v25, s6, v31 +; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v25, v27, v28, vcc +; VI-NEXT: v_add_f32_e32 v27, s6, v31 +; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v28, 
v29, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_alignbit_b32 v25, v27, v25, 16 +; VI-NEXT: v_add_f32_e32 v27, s6, v31 +; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc +; VI-NEXT: v_add_f32_e32 v28, s6, v31 +; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: s_lshl_b32 s6, s26, 16 +; VI-NEXT: v_alignbit_b32 v28, v28, v27, 16 +; VI-NEXT: v_add_f32_e32 v27, s6, v31 +; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v27, v29, v30, vcc +; VI-NEXT: v_add_f32_e32 v29, s6, v31 +; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: s_lshl_b32 s6, s29, 16 +; VI-NEXT: v_alignbit_b32 v27, v29, v27, 16 +; VI-NEXT: v_add_f32_e32 v29, s6, v31 +; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; VI-NEXT: 
v_cndmask_b32_e32 v29, v30, v32, vcc +; VI-NEXT: v_add_f32_e32 v30, s6, v31 +; VI-NEXT: v_bfe_u32 v32, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v30 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: s_lshl_b32 s6, s28, 16 +; VI-NEXT: v_alignbit_b32 v30, v30, v29, 16 +; VI-NEXT: v_add_f32_e32 v29, s6, v31 +; VI-NEXT: v_bfe_u32 v32, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v29 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc +; VI-NEXT: v_add_f32_e32 v32, s6, v31 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: s_lshl_b32 s6, s5, 16 +; VI-NEXT: v_alignbit_b32 v29, v32, v29, 16 +; VI-NEXT: v_add_f32_e32 v32, s6, v31 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_add_f32_e32 v33, s5, v31 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; VI-NEXT: s_lshl_b32 s5, s4, 16 +; VI-NEXT: v_alignbit_b32 v32, 
v33, v32, 16 +; VI-NEXT: v_add_f32_e32 v33, s5, v31 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v31, s4, v31 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v31 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded 
Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v2 +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v13 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, 
v8 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v10 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v9 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v3 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], 
s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v1 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: s_branch .LBB91_5 +; VI-NEXT: .LBB91_3: +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; 
implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; 
implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB91_2 +; VI-NEXT: .LBB91_4: +; VI-NEXT: v_mov_b32_e32 v33, s71 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s69 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 
4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s70 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s68 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s67 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s86 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s83 +; VI-NEXT: v_mov_b32_e32 v31, s4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s82 +; VI-NEXT: v_readlane_b32 s4, v62, 0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 1 +; VI-NEXT: v_mov_b32_e32 v40, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 2 +; VI-NEXT: v_mov_b32_e32 v44, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 3 +; VI-NEXT: v_mov_b32_e32 v54, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 4 +; VI-NEXT: v_mov_b32_e32 v53, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 6 +; VI-NEXT: v_mov_b32_e32 v51, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 7 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 9 +; VI-NEXT: v_mov_b32_e32 v56, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 10 +; VI-NEXT: v_mov_b32_e32 v47, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 11 +; VI-NEXT: v_mov_b32_e32 v48, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 12 +; VI-NEXT: v_mov_b32_e32 v43, s4 +; VI-NEXT: 
v_readlane_b32 s4, v62, 13 +; VI-NEXT: v_mov_b32_e32 v46, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 14 +; VI-NEXT: v_mov_b32_e32 v50, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 19 +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 20 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 21 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 22 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 23 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 24 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 25 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 27 +; 
VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 28 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 29 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 30 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 31 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 32 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 33 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 34 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 35 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 36 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 37 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 38 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 39 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; 
VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 40 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 41 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 42 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 43 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 44 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 45 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 46 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 47 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 48 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 49 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 50 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 51 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 52 +; VI-NEXT: 
buffer_store_dword v55, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 53 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 54 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 55 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 56 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 57 +; VI-NEXT: v_mov_b32_e32 v42, s54 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_mov_b32_e32 v41, s46 +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s56 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s58 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s60 +; VI-NEXT: v_mov_b32_e32 v45, s72 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s74 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s76 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s78 +; VI-NEXT: v_mov_b32_e32 v55, s88 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v36, s66 +; VI-NEXT: v_mov_b32_e32 v52, s64 +; VI-NEXT: v_mov_b32_e32 v55, v50 +; VI-NEXT: v_mov_b32_e32 v35, s30 +; VI-NEXT: v_mov_b32_e32 v59, s87 +; VI-NEXT: v_mov_b32_e32 v58, s34 +; VI-NEXT: v_mov_b32_e32 v45, s36 +; VI-NEXT: v_mov_b32_e32 v34, s38 +; VI-NEXT: v_mov_b32_e32 v1, s44 +; VI-NEXT: v_mov_b32_e32 v2, s45 +; VI-NEXT: v_mov_b32_e32 v3, s42 +; VI-NEXT: v_mov_b32_e32 v4, s43 +; VI-NEXT: v_mov_b32_e32 v5, s40 +; VI-NEXT: v_mov_b32_e32 v6, s41 +; VI-NEXT: v_mov_b32_e32 v7, s14 +; VI-NEXT: v_mov_b32_e32 v8, s15 +; VI-NEXT: v_mov_b32_e32 v9, s12 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v11, s10 +; VI-NEXT: v_mov_b32_e32 v12, s11 +; VI-NEXT: v_mov_b32_e32 v13, s8 +; VI-NEXT: v_mov_b32_e32 v14, s9 +; VI-NEXT: v_mov_b32_e32 v15, s6 +; VI-NEXT: v_mov_b32_e32 v16, s7 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v21, s20 +; VI-NEXT: v_mov_b32_e32 v22, s21 +; VI-NEXT: v_mov_b32_e32 v23, s22 +; VI-NEXT: v_mov_b32_e32 
v24, s23 +; VI-NEXT: v_mov_b32_e32 v25, s24 +; VI-NEXT: v_mov_b32_e32 v26, s25 +; VI-NEXT: v_mov_b32_e32 v27, s26 +; VI-NEXT: v_mov_b32_e32 v28, s27 +; VI-NEXT: v_mov_b32_e32 v29, s28 +; VI-NEXT: v_mov_b32_e32 v30, s29 +; VI-NEXT: v_mov_b32_e32 v32, s5 +; VI-NEXT: v_mov_b32_e32 v41, s62 +; VI-NEXT: v_mov_b32_e32 v57, s81 +; VI-NEXT: v_mov_b32_e32 v37, s84 +; VI-NEXT: v_mov_b32_e32 v60, s52 +; VI-NEXT: v_mov_b32_e32 v38, s51 +; VI-NEXT: v_mov_b32_e32 v61, s65 +; VI-NEXT: v_mov_b32_e32 v49, s66 +; VI-NEXT: v_mov_b32_e32 v39, s55 +; VI-NEXT: v_mov_b32_e32 v50, v46 +; VI-NEXT: v_mov_b32_e32 v46, v48 +; VI-NEXT: v_mov_b32_e32 v48, v47 +; VI-NEXT: v_mov_b32_e32 v47, v56 +; VI-NEXT: v_mov_b32_e32 v56, v51 +; VI-NEXT: v_mov_b32_e32 v51, s90 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s85 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, s48 +; VI-NEXT: v_mov_b32_e32 v51, v53 +; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: v_mov_b32_e32 v54, v40 +; VI-NEXT: v_mov_b32_e32 v40, s80 +; VI-NEXT: v_mov_b32_e32 v58, s50 +; VI-NEXT: v_mov_b32_e32 v45, s53 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: .LBB91_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_readlane_b32 s87, v63, 31 +; VI-NEXT: v_readlane_b32 s86, v63, 30 +; VI-NEXT: v_readlane_b32 s85, v63, 29 +; VI-NEXT: v_readlane_b32 s84, v63, 28 +; VI-NEXT: v_readlane_b32 s83, v63, 27 +; VI-NEXT: v_readlane_b32 s82, v63, 26 +; VI-NEXT: v_readlane_b32 s81, v63, 25 +; VI-NEXT: v_readlane_b32 s80, v63, 24 +; VI-NEXT: v_readlane_b32 s71, v63, 23 +; VI-NEXT: v_readlane_b32 s70, v63, 22 +; VI-NEXT: v_readlane_b32 s69, v63, 21 +; VI-NEXT: v_readlane_b32 s68, v63, 20 +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: 
v_lshlrev_b32_e32 v36, 8, v33 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], 
s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte 
Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v17, 
v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v18, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v17, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: 
s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v59 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; VI-NEXT: 
v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 
offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_writelane_b32 v63, s64, 16 +; GFX9-NEXT: v_writelane_b32 v63, s65, 17 +; GFX9-NEXT: v_writelane_b32 v63, s66, 18 +; GFX9-NEXT: v_writelane_b32 v63, s67, 19 +; GFX9-NEXT: v_writelane_b32 v63, s68, 20 +; GFX9-NEXT: v_writelane_b32 v63, s69, 21 +; GFX9-NEXT: v_writelane_b32 v63, s70, 22 +; GFX9-NEXT: v_writelane_b32 v63, s71, 23 +; GFX9-NEXT: v_writelane_b32 v63, s80, 24 +; GFX9-NEXT: v_writelane_b32 v63, s81, 25 +; GFX9-NEXT: v_writelane_b32 v63, s82, 26 +; GFX9-NEXT: v_writelane_b32 v63, s83, 27 +; GFX9-NEXT: v_writelane_b32 v63, s84, 28 +; GFX9-NEXT: v_writelane_b32 v63, s85, 29 +; GFX9-NEXT: v_writelane_b32 v63, s86, 30 +; GFX9-NEXT: v_writelane_b32 v63, s87, 31 +; GFX9-NEXT: v_writelane_b32 v63, s96, 32 +; GFX9-NEXT: v_writelane_b32 v63, s97, 33 +; GFX9-NEXT: v_writelane_b32 v63, 
s98, 34 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v63, s99, 35 +; GFX9-NEXT: v_readfirstlane_b32 s76, v3 +; GFX9-NEXT: v_readfirstlane_b32 s77, v4 +; GFX9-NEXT: v_readfirstlane_b32 s74, v5 +; GFX9-NEXT: v_readfirstlane_b32 s75, v6 +; GFX9-NEXT: v_readfirstlane_b32 s72, v7 +; GFX9-NEXT: v_readfirstlane_b32 s73, v8 +; GFX9-NEXT: v_readfirstlane_b32 s62, v9 +; GFX9-NEXT: v_readfirstlane_b32 s63, v10 +; GFX9-NEXT: v_readfirstlane_b32 s60, v11 +; GFX9-NEXT: v_readfirstlane_b32 s61, v12 +; GFX9-NEXT: v_readfirstlane_b32 s58, v13 +; GFX9-NEXT: v_readfirstlane_b32 s59, v14 +; GFX9-NEXT: v_readfirstlane_b32 s56, v15 +; GFX9-NEXT: v_readfirstlane_b32 s57, v16 +; GFX9-NEXT: v_readfirstlane_b32 s46, v17 +; GFX9-NEXT: v_readfirstlane_b32 s47, v18 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 
offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s6, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 18 +; GFX9-NEXT: s_lshr_b32 s6, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 17 +; GFX9-NEXT: s_lshr_b32 s6, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 19 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 20 +; GFX9-NEXT: s_lshr_b32 s6, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 21 +; GFX9-NEXT: s_lshr_b32 s6, s29, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 22 +; GFX9-NEXT: s_lshr_b32 s6, s29, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 16 +; GFX9-NEXT: s_lshr_b32 s6, s29, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 23 +; GFX9-NEXT: s_lshr_b32 s6, s28, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 24 +; GFX9-NEXT: s_lshr_b32 s6, s28, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 25 +; GFX9-NEXT: s_lshr_b32 s6, s27, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 26 +; GFX9-NEXT: s_lshr_b32 s6, s27, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 15 +; GFX9-NEXT: s_lshr_b32 s6, s27, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 27 +; GFX9-NEXT: s_lshr_b32 s6, s26, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 28 +; GFX9-NEXT: s_lshr_b32 s6, s26, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 29 +; GFX9-NEXT: s_lshr_b32 s6, s25, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 30 +; GFX9-NEXT: s_lshr_b32 s6, s25, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 14 +; GFX9-NEXT: s_lshr_b32 s6, s25, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 31 +; GFX9-NEXT: s_lshr_b32 s6, s24, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 32 +; GFX9-NEXT: s_lshr_b32 s6, s24, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 33 +; GFX9-NEXT: s_lshr_b32 s6, s23, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 34 +; GFX9-NEXT: s_lshr_b32 s6, s23, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 13 +; GFX9-NEXT: s_lshr_b32 s6, 
s23, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 35 +; GFX9-NEXT: s_lshr_b32 s6, s22, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 36 +; GFX9-NEXT: s_lshr_b32 s6, s22, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 37 +; GFX9-NEXT: s_lshr_b32 s6, s21, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 38 +; GFX9-NEXT: s_lshr_b32 s6, s21, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 12 +; GFX9-NEXT: s_lshr_b32 s6, s21, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 39 +; GFX9-NEXT: s_lshr_b32 s6, s20, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 40 +; GFX9-NEXT: s_lshr_b32 s6, s20, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 41 +; GFX9-NEXT: s_lshr_b32 s6, s19, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 42 +; GFX9-NEXT: s_lshr_b32 s6, s19, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 11 +; GFX9-NEXT: s_lshr_b32 s6, s19, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 43 +; GFX9-NEXT: s_lshr_b32 s6, s18, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 44 +; GFX9-NEXT: s_lshr_b32 s6, s18, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 45 +; GFX9-NEXT: s_lshr_b32 s6, s17, 24 +; GFX9-NEXT: v_writelane_b32 v62, s6, 46 +; GFX9-NEXT: s_lshr_b32 s6, s17, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 10 +; GFX9-NEXT: s_lshr_b32 s6, s17, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 47 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_writelane_b32 v62, s6, 48 +; GFX9-NEXT: s_lshr_b32 s6, s16, 8 +; GFX9-NEXT: v_writelane_b32 v62, s6, 49 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v62, s40, 8 +; GFX9-NEXT: v_writelane_b32 v62, s41, 9 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[28:29], 24 +; GFX9-NEXT: v_writelane_b32 v62, s40, 6 +; GFX9-NEXT: v_writelane_b32 v62, s41, 7 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX9-NEXT: v_writelane_b32 v62, s40, 4 +; GFX9-NEXT: v_writelane_b32 v62, s41, 5 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[24:25], 24 +; GFX9-NEXT: v_writelane_b32 v62, s40, 2 +; GFX9-NEXT: v_writelane_b32 v62, s41, 3 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[22:23], 24 +; GFX9-NEXT: v_writelane_b32 v62, s40, 
0 +; GFX9-NEXT: s_lshr_b32 s70, s47, 24 +; GFX9-NEXT: s_lshr_b32 s15, s47, 16 +; GFX9-NEXT: s_lshr_b32 s7, s47, 8 +; GFX9-NEXT: s_lshr_b32 s53, s46, 16 +; GFX9-NEXT: s_lshr_b32 s52, s46, 8 +; GFX9-NEXT: s_lshr_b32 s67, s57, 24 +; GFX9-NEXT: s_lshr_b32 s14, s57, 16 +; GFX9-NEXT: s_lshr_b32 s69, s57, 8 +; GFX9-NEXT: s_lshr_b32 s6, s56, 16 +; GFX9-NEXT: s_lshr_b32 s71, s56, 8 +; GFX9-NEXT: s_lshr_b32 s64, s59, 24 +; GFX9-NEXT: s_lshr_b32 s13, s59, 16 +; GFX9-NEXT: s_lshr_b32 s66, s59, 8 +; GFX9-NEXT: s_lshr_b32 s51, s58, 16 +; GFX9-NEXT: s_lshr_b32 s68, s58, 8 +; GFX9-NEXT: s_lshr_b32 s99, s61, 24 +; GFX9-NEXT: s_lshr_b32 s12, s61, 16 +; GFX9-NEXT: s_lshr_b32 s55, s61, 8 +; GFX9-NEXT: s_lshr_b32 s50, s60, 16 +; GFX9-NEXT: s_lshr_b32 s65, s60, 8 +; GFX9-NEXT: s_lshr_b32 s96, s63, 24 +; GFX9-NEXT: s_lshr_b32 s11, s63, 16 +; GFX9-NEXT: s_lshr_b32 s98, s63, 8 +; GFX9-NEXT: s_lshr_b32 s49, s62, 16 +; GFX9-NEXT: s_lshr_b32 s54, s62, 8 +; GFX9-NEXT: s_lshr_b32 s85, s73, 24 +; GFX9-NEXT: s_lshr_b32 s10, s73, 16 +; GFX9-NEXT: s_lshr_b32 s87, s73, 8 +; GFX9-NEXT: s_lshr_b32 s48, s72, 16 +; GFX9-NEXT: s_lshr_b32 s97, s72, 8 +; GFX9-NEXT: s_lshr_b32 s82, s75, 24 +; GFX9-NEXT: s_lshr_b32 s9, s75, 16 +; GFX9-NEXT: s_lshr_b32 s84, s75, 8 +; GFX9-NEXT: s_lshr_b32 s39, s74, 16 +; GFX9-NEXT: s_lshr_b32 s86, s74, 8 +; GFX9-NEXT: s_lshr_b32 s80, s77, 24 +; GFX9-NEXT: s_lshr_b32 s8, s77, 16 +; GFX9-NEXT: s_lshr_b32 s81, s77, 8 +; GFX9-NEXT: s_lshr_b32 s38, s76, 16 +; GFX9-NEXT: s_lshr_b32 s83, s76, 8 +; GFX9-NEXT: v_writelane_b32 v62, s41, 1 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[56:57], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[58:59], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[60:61], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[62:63], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[72:73], 24 +; GFX9-NEXT: 
s_lshr_b64 s[34:35], s[74:75], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[76:77], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB91_4 +; GFX9-NEXT: .LBB91_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s6, s77, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s77, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: s_and_b32 s6, s76, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v8, v5, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s76, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: buffer_store_dword 
v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: s_and_b32 s6, s75, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s75, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; GFX9-NEXT: s_and_b32 s6, s74, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v14, v5, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s74, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v33 +; GFX9-NEXT: s_and_b32 s6, s73, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v13, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; 
GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s73, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; GFX9-NEXT: s_and_b32 s6, s72, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v16, v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s72, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v36 +; GFX9-NEXT: s_and_b32 s6, s63, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v15, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s63, 16 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v37 +; GFX9-NEXT: s_and_b32 s6, s62, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s62, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v39 +; GFX9-NEXT: s_and_b32 s6, s61, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v17, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s61, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 
v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v48 +; GFX9-NEXT: s_and_b32 s6, s60, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v20, v38, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s60, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX9-NEXT: s_and_b32 s6, s59, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s59, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v51 +; GFX9-NEXT: s_and_b32 s6, s58, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; 
GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s58, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v53 +; GFX9-NEXT: s_and_b32 s6, s57, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s57, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v54 +; GFX9-NEXT: s_and_b32 s6, s56, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v24, v52, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s56, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: 
v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; GFX9-NEXT: s_and_b32 s6, s47, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v23, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s6, s47, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; GFX9-NEXT: s_and_b32 s6, s46, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s6, s46, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s6, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: 
v_and_b32_e32 v3, 0xffff, v42 +; GFX9-NEXT: s_and_b32 s6, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v25, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s11, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s17, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s17, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s16, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s16, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s16, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s19, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: 
s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s46, s16, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s12, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s19, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s19, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s18, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s18, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s18, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s21, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s56, s18, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 
s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s21, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s21, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s20, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s20, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s20, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s23, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s58, s20, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s14, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s23, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; 
GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s23, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s22, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s22, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s22, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s25, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s15, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s25, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: 
s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s25, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s24, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s24, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s24, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s27, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s76, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s27, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s27, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s26, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: 
v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s26, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s29, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s29, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s29, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s28, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s28, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s28, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s5, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s5, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: s_lshr_b32 s78, s6, 16 +; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_add_i32 s8, s6, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s5, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s6, s4, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s8, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s8, s8, s6 +; GFX9-NEXT: s_lshr_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s6, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s6, s10 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_bfe_u32 
s8, s4, 0x10010 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s10, s8, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s4, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] +; GFX9-NEXT: s_and_b64 s[8:9], vcc, exec +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 24, v[23:24] +; GFX9-NEXT: s_cselect_b32 s4, s4, s10 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16] +; GFX9-NEXT: s_pack_ll_b32_b16 s47, s17, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s57, s19, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s59, s21, s13 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14] +; GFX9-NEXT: s_pack_ll_b32_b16 s61, s23, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s63, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s73, s27, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s75, s29, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s5, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s4, s6 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[58:59], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[46:47], 24 +; GFX9-NEXT: v_lshrrev_b64 v[5:6], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; GFX9-NEXT: s_lshr_b64 s[34:35], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[74:75], 24 +; GFX9-NEXT: s_lshr_b64 s[38:39], s[72:73], 24 +; GFX9-NEXT: s_lshr_b64 s[48:49], s[62:63], 24 +; GFX9-NEXT: s_lshr_b64 s[50:51], s[60:61], 24 +; GFX9-NEXT: s_lshr_b32 s9, s7, 24 +; GFX9-NEXT: s_lshr_b32 s10, s7, 8 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s43, s6, 8 +; GFX9-NEXT: s_lshr_b32 s45, s75, 24 +; GFX9-NEXT: s_lshr_b32 s75, s75, 8 +; GFX9-NEXT: s_lshr_b32 s79, s74, 16 +; GFX9-NEXT: s_lshr_b32 s74, s74, 8 +; GFX9-NEXT: s_lshr_b32 s88, s73, 24 +; GFX9-NEXT: s_lshr_b32 s73, s73, 8 +; GFX9-NEXT: s_lshr_b32 s89, s72, 16 +; GFX9-NEXT: s_lshr_b32 s72, s72, 8 +; GFX9-NEXT: s_lshr_b32 s90, s63, 24 +; 
GFX9-NEXT: s_lshr_b32 s63, s63, 8 +; GFX9-NEXT: s_lshr_b32 s91, s62, 16 +; GFX9-NEXT: s_lshr_b32 s62, s62, 8 +; GFX9-NEXT: s_lshr_b32 s92, s61, 24 +; GFX9-NEXT: s_lshr_b32 s61, s61, 8 +; GFX9-NEXT: s_lshr_b32 s93, s60, 16 +; GFX9-NEXT: s_lshr_b32 s60, s60, 8 +; GFX9-NEXT: s_lshr_b32 s94, s59, 24 +; GFX9-NEXT: s_lshr_b32 s59, s59, 8 +; GFX9-NEXT: s_lshr_b32 s95, s58, 16 +; GFX9-NEXT: s_lshr_b32 s58, s58, 8 +; GFX9-NEXT: s_lshr_b32 vcc_lo, s57, 24 +; GFX9-NEXT: s_lshr_b32 s57, s57, 8 +; GFX9-NEXT: s_lshr_b32 vcc_hi, s56, 16 +; GFX9-NEXT: s_lshr_b32 s56, s56, 8 +; GFX9-NEXT: s_lshr_b32 s30, s47, 24 +; GFX9-NEXT: s_lshr_b32 s47, s47, 8 +; GFX9-NEXT: s_lshr_b32 s8, s46, 16 +; GFX9-NEXT: s_lshr_b32 s7, s46, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 24, 
v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_branch .LBB91_5 +; GFX9-NEXT: .LBB91_3: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s78, 0 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s79, 1 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: 
$sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s78, 2 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s79, 3 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s78, 4 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s79, 5 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s78, 6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s79, 7 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s78, 8 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: v_writelane_b32 v62, s79, 9 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: 
killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; kill: killed $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB91_2 +; GFX9-NEXT: .LBB91_4: +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v1, s76 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v1, s77 +; 
GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v46, s51 +; GFX9-NEXT: v_mov_b32_e32 v56, s50 +; GFX9-NEXT: v_mov_b32_e32 v58, s49 +; GFX9-NEXT: v_mov_b32_e32 v60, s48 +; GFX9-NEXT: v_mov_b32_e32 v27, s39 +; GFX9-NEXT: v_mov_b32_e32 v29, s38 +; GFX9-NEXT: v_mov_b32_e32 v10, s34 +; GFX9-NEXT: v_mov_b32_e32 v11, s36 +; GFX9-NEXT: v_readlane_b32 s34, v62, 8 +; GFX9-NEXT: v_readlane_b32 s36, v62, 6 +; GFX9-NEXT: v_readlane_b32 s38, v62, 4 +; GFX9-NEXT: v_readlane_b32 s48, v62, 2 +; GFX9-NEXT: v_readlane_b32 s50, v62, 0 +; GFX9-NEXT: v_mov_b32_e32 v42, s46 +; GFX9-NEXT: v_mov_b32_e32 v41, s47 +; GFX9-NEXT: v_mov_b32_e32 v55, s15 +; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: v_mov_b32_e32 v54, s57 +; GFX9-NEXT: v_mov_b32_e32 v52, s14 +; GFX9-NEXT: v_mov_b32_e32 v53, s58 +; GFX9-NEXT: v_mov_b32_e32 v51, s59 +; GFX9-NEXT: v_mov_b32_e32 v49, s13 +; GFX9-NEXT: v_mov_b32_e32 v50, s60 +; GFX9-NEXT: v_mov_b32_e32 v48, s61 +; GFX9-NEXT: v_mov_b32_e32 v38, s12 +; GFX9-NEXT: v_mov_b32_e32 v39, s62 +; GFX9-NEXT: v_mov_b32_e32 v37, s63 +; GFX9-NEXT: v_mov_b32_e32 v35, s11 +; GFX9-NEXT: v_mov_b32_e32 v36, s72 +; GFX9-NEXT: v_mov_b32_e32 v34, s73 +; GFX9-NEXT: v_mov_b32_e32 v32, s10 +; GFX9-NEXT: v_mov_b32_e32 v33, s74 +; GFX9-NEXT: v_mov_b32_e32 v31, s75 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v26, s53 +; GFX9-NEXT: v_mov_b32_e32 v25, s52 +; GFX9-NEXT: v_mov_b32_e32 v6, s70 +; GFX9-NEXT: v_mov_b32_e32 v12, s7 +; GFX9-NEXT: v_mov_b32_e32 v44, s6 +; GFX9-NEXT: v_mov_b32_e32 v23, s71 +; GFX9-NEXT: v_mov_b32_e32 v43, s67 +; GFX9-NEXT: v_mov_b32_e32 v24, s69 +; GFX9-NEXT: v_mov_b32_e32 v21, s68 +; GFX9-NEXT: v_mov_b32_e32 v45, s64 +; GFX9-NEXT: v_mov_b32_e32 v22, s66 +; GFX9-NEXT: v_mov_b32_e32 v19, s65 +; GFX9-NEXT: v_mov_b32_e32 v47, s99 +; GFX9-NEXT: v_mov_b32_e32 v20, s55 +; GFX9-NEXT: v_mov_b32_e32 v17, s54 +; 
GFX9-NEXT: v_mov_b32_e32 v57, s96 +; GFX9-NEXT: v_mov_b32_e32 v18, s98 +; GFX9-NEXT: v_mov_b32_e32 v15, s97 +; GFX9-NEXT: v_mov_b32_e32 v59, s85 +; GFX9-NEXT: v_mov_b32_e32 v16, s87 +; GFX9-NEXT: v_mov_b32_e32 v13, s86 +; GFX9-NEXT: v_mov_b32_e32 v61, s82 +; GFX9-NEXT: v_mov_b32_e32 v14, s84 +; GFX9-NEXT: v_mov_b32_e32 v7, s83 +; GFX9-NEXT: v_mov_b32_e32 v28, s80 +; GFX9-NEXT: v_mov_b32_e32 v8, s81 +; GFX9-NEXT: v_mov_b32_e32 v1, s78 +; GFX9-NEXT: v_mov_b32_e32 v2, s88 +; GFX9-NEXT: v_mov_b32_e32 v3, s90 +; GFX9-NEXT: v_mov_b32_e32 v4, s92 +; GFX9-NEXT: v_mov_b32_e32 v5, s94 +; GFX9-NEXT: v_mov_b32_e32 v9, s30 +; GFX9-NEXT: v_readlane_b32 s11, v62, 10 +; GFX9-NEXT: v_readlane_b32 s12, v62, 11 +; GFX9-NEXT: v_readlane_b32 s13, v62, 12 +; GFX9-NEXT: v_readlane_b32 s14, v62, 13 +; GFX9-NEXT: v_readlane_b32 s15, v62, 14 +; GFX9-NEXT: v_readlane_b32 s76, v62, 15 +; GFX9-NEXT: v_readlane_b32 s77, v62, 16 +; GFX9-NEXT: v_readlane_b32 s78, v62, 17 +; GFX9-NEXT: v_readlane_b32 s9, v62, 18 +; GFX9-NEXT: v_readlane_b32 s10, v62, 19 +; GFX9-NEXT: v_readlane_b32 s41, v62, 20 +; GFX9-NEXT: v_readlane_b32 s43, v62, 21 +; GFX9-NEXT: v_readlane_b32 s45, v62, 22 +; GFX9-NEXT: v_readlane_b32 s75, v62, 23 +; GFX9-NEXT: v_readlane_b32 s79, v62, 24 +; GFX9-NEXT: v_readlane_b32 s74, v62, 25 +; GFX9-NEXT: v_readlane_b32 s88, v62, 26 +; GFX9-NEXT: v_readlane_b32 s73, v62, 27 +; GFX9-NEXT: v_readlane_b32 s89, v62, 28 +; GFX9-NEXT: v_readlane_b32 s72, v62, 29 +; GFX9-NEXT: v_readlane_b32 s90, v62, 30 +; GFX9-NEXT: v_readlane_b32 s63, v62, 31 +; GFX9-NEXT: v_readlane_b32 s91, v62, 32 +; GFX9-NEXT: v_readlane_b32 s62, v62, 33 +; GFX9-NEXT: v_readlane_b32 s92, v62, 34 +; GFX9-NEXT: v_readlane_b32 s61, v62, 35 +; GFX9-NEXT: v_readlane_b32 s93, v62, 36 +; GFX9-NEXT: v_readlane_b32 s60, v62, 37 +; GFX9-NEXT: v_readlane_b32 s94, v62, 38 +; GFX9-NEXT: v_readlane_b32 s59, v62, 39 +; GFX9-NEXT: v_readlane_b32 s95, v62, 40 +; GFX9-NEXT: v_readlane_b32 s58, v62, 41 +; GFX9-NEXT: v_readlane_b32 vcc_lo, 
v62, 42 +; GFX9-NEXT: v_readlane_b32 s57, v62, 43 +; GFX9-NEXT: v_readlane_b32 vcc_hi, v62, 44 +; GFX9-NEXT: v_readlane_b32 s56, v62, 45 +; GFX9-NEXT: v_readlane_b32 s30, v62, 46 +; GFX9-NEXT: v_readlane_b32 s47, v62, 47 +; GFX9-NEXT: v_readlane_b32 s8, v62, 48 +; GFX9-NEXT: v_readlane_b32 s7, v62, 49 +; GFX9-NEXT: v_readlane_b32 s35, v62, 9 +; GFX9-NEXT: v_readlane_b32 s37, v62, 7 +; GFX9-NEXT: v_readlane_b32 s39, v62, 5 +; GFX9-NEXT: v_readlane_b32 s49, v62, 3 +; GFX9-NEXT: v_readlane_b32 s51, v62, 1 +; GFX9-NEXT: .LBB91_5: ; %end +; GFX9-NEXT: s_and_b32 s6, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s44, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s47, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s30, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s56, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, vcc_hi, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s42, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s57, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s8, vcc_lo, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 
0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s58, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s95, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s40, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s59, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s13, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s94, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s60, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s93, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s50, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s61, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s14, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s92, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s62, 8 +; GFX9-NEXT: s_or_b32 s6, 
s6, s7 +; GFX9-NEXT: s_and_b32 s7, s91, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s48, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s63, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s15, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s90, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s72, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s89, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s38, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s73, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s76, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s88, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s74, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s79, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s36, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen 
offset:44 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s6, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s75, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s45, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v30, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s43, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s41, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s34, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v30, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s10, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s78, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s9, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v30, s4 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v29, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v58, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_readlane_b32 s99, v63, 35 +; GFX9-NEXT: v_readlane_b32 s98, v63, 34 +; GFX9-NEXT: v_readlane_b32 s97, v63, 33 +; GFX9-NEXT: v_readlane_b32 s96, v63, 32 +; GFX9-NEXT: v_readlane_b32 s87, v63, 31 +; GFX9-NEXT: v_readlane_b32 s86, v63, 30 +; GFX9-NEXT: v_readlane_b32 s85, v63, 29 +; GFX9-NEXT: v_readlane_b32 s84, v63, 28 +; GFX9-NEXT: v_readlane_b32 s83, v63, 27 +; GFX9-NEXT: v_readlane_b32 s82, v63, 26 +; GFX9-NEXT: v_readlane_b32 s81, v63, 25 +; GFX9-NEXT: v_readlane_b32 s80, v63, 24 +; GFX9-NEXT: v_readlane_b32 s71, v63, 23 +; GFX9-NEXT: v_readlane_b32 s70, v63, 22 +; GFX9-NEXT: v_readlane_b32 s69, v63, 21 +; GFX9-NEXT: v_readlane_b32 s68, v63, 20 +; GFX9-NEXT: v_readlane_b32 s67, v63, 19 +; GFX9-NEXT: v_readlane_b32 s66, v63, 18 +; GFX9-NEXT: v_readlane_b32 s65, v63, 17 +; GFX9-NEXT: v_readlane_b32 s64, v63, 16 +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa 
v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v10 +; GFX9-NEXT: v_or_b32_sdwa v7, v33, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v27, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v7, v31, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v10, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword 
v7, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v60, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v7, v34, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v32, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v7, v39, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v5, v37, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v4, v48, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v38, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v4, v53, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], 
s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:12 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s72, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s73, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s62, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s63, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s60, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s61, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s59, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s44, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 
s45, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s42, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s43, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s102, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v41, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s84, 28 +; GFX11-TRUE16-NEXT: 
v_writelane_b32 v40, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s1, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s0, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s41, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s41, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s41, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s40, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s43, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s43, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s43, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s42, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s45, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s45, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s45, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s44, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s44, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 0 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s4, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s59, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s24, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s14, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s59, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s59, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s15, 9 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s58, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s58, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s61, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s61, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s61, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s60, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s60, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s63, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s63, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s63, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s62, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s73, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s73, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s73, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s72, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s72, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s29, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s29, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, 
s4, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s28, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[100:101], s[26:27], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[42:43], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[58:59], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[62:63], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[72:73], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s17, 8 +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s62, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s14, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s15, 26 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s14, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s15, 19 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[18:19], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s14, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s15, 12 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX11-TRUE16-NEXT: .LBB91_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s58, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s58, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; 
GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s41, 0xffff0000 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s58, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s41, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s77, s28, 0xffff0000 +; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s58, 0x10010 +; GFX11-TRUE16-NEXT: s_lshl_b32 s78, s28, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s6, s58 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s73, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s76, s73, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s74, s72, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s75, s72, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s12, s63, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s73, s63, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s63, s62, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s72, s62, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s62, s61, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s57, s61, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s47, s60, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s56, s60, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s46, s59, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s28, s59, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s45, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s45, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s45, s44, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s44, s44, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s29, s43, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s43, s43, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s13, s42, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s42, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s40, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s40, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_bitset1_b32 s58, 22 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 
0x7fff, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s58, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s40, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s77 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s78 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s1, 0x10010 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s1 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s1, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v4, v6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v51, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; 
GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s76 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s0, 0x10010 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v20, 16, v4 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s5, s0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s40, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_bitset1_b32 s0, 22 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v1, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v1, v7, 16, v3 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s75 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s40, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s74 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s3, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s3 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s3, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v8, v9 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 
0x400000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s73 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s12, s2, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s12, s2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s40, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 22 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s2, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v24, 16, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v9, 16, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v7 
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s72 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s17, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s63 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s17, 0x10010 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s40, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_bitset1_b32 s17, 22 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s17, s17, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s17, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s62 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s16, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v9, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s57 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s16, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s16 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s16, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s16, s16, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s16, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v6, 16, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s47 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v8 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s19, s19, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s56 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s40, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s19, 0x10010 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v11, v10 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: s_bitset1_b32 s19, 22 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s19, s19, s41 +; GFX11-TRUE16-NEXT: s_and_b32 s40, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s40 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s19, s19, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s46 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s15 +; GFX11-TRUE16-NEXT: s_bfe_u32 s41, s40, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, s40 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s40, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s41, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s40, s40, s41 +; GFX11-TRUE16-NEXT: s_lshl_b32 s18, s18, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s40, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s18, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v28, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v5, 16, v10 +; GFX11-TRUE16-NEXT: s_bfe_u32 s28, s18, 0x10010 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, s18 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s18, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s28, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo 
+; GFX11-TRUE16-NEXT: s_cselect_b32 s18, s18, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s28, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s28 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v11, 16, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s18, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v11 +; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s28, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s14, s15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s14, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v8, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v9, v12 +; GFX11-TRUE16-NEXT: s_bfe_u32 s21, s15, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, s15 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s15, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s21, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s14, s15, s21 +; 
GFX11-TRUE16-NEXT: s_and_b32 s15, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s14, 16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s0, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s11 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s2, s12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s11, 0x10010 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v33 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s11 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s11, s15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s20, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v6, 16, v11 +; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s14, 0x10010 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: 
s_add_i32 s15, s9, s14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s11, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s14, s15 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s11, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s45 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s44 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s14, s15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v8, v10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s11, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v9, v11 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 
0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s14, s15 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s11, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s43 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s29 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v71, v35, 16, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s13 +; GFX11-TRUE16-NEXT: s_bfe_u32 s15, s14, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, s14 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s15, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s14, s15 +; GFX11-TRUE16-NEXT: s_lshl_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v70, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v11 +; GFX11-TRUE16-NEXT: s_bfe_u32 s13, s14, 0x10010 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, s14 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s13, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s15, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s14, s13 +; GFX11-TRUE16-NEXT: s_and_b32 s14, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s13, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-TRUE16-NEXT: s_bfe_u32 s14, s10, 0x10010 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, s10 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s10, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s14, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s10, s10, s14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s10, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8 +; GFX11-TRUE16-NEXT: 
v_readfirstlane_b32 s8, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_bfe_u32 s13, s8, 0x10010 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v11, 16, 1 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, s8 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s13, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s8, s13 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s8, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v48, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s7, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, s7 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s7, s10 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s24, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: 
v_bfe_u32 v8, v10, 16, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s7, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v51, 16, 1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s4, s8, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, s8 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s8, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v52, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v12, v51 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s4, 16 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v50 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s6, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v8, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v51 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s4, 16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v66, v5, 16, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v67, v38, 16, v9 +; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s4, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v55, v48, 16, v8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s1, s58 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v54, v6, 16, v10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s3, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s17, s60 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s6, 0x10010 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s16, s42 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s6, s5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s13, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 
v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s27, s73 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[54:55] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[6:7], 24, v[66:67] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[70:71] +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s12, 0x10010 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[18:19] +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, s12 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s12, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s11, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s12, s12, s11 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[16:17] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[14:15] +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s12, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[1:2] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s26, s13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 24, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v66 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 24, v71 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v71 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v70 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v18 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[44:45], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[42:43], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[36:37], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[38:39], s[4:5], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s45, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s45, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s44, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s44, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s10, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 
s8, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s7, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s7, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s6, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s5, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s4, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s43, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s43, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s41, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s41, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s40, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 8 +; GFX11-TRUE16-NEXT: s_branch .LBB91_5 +; GFX11-TRUE16-NEXT: .LBB91_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr76_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s5, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v43, s4, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s4, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s74, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s75, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB91_2 +; 
GFX11-TRUE16-NEXT: .LBB91_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s94, v43, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v43, 18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s39 +; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v43, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v43, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v42, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v43, 11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s87 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s96 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s73 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s97 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s98 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s70 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s71 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v66.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s80 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s81 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s82 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s85 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, s86 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, s54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, s68 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s92 +; GFX11-TRUE16-NEXT: v_readlane_b32 s58, v43, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s59, v43, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s60, v43, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s61, v43, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s62, v43, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s63, v43, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s72, v42, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s73, v42, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s13, v42, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s15, v42, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s45, v42, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s44, v42, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s47, v42, 2 +; GFX11-TRUE16-NEXT: 
v_readlane_b32 s11, v42, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s57, v43, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s10, v43, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s74, v43, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s9, v43, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s75, v43, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s8, v43, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s76, v43, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s77, v43, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s78, v43, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s79, v43, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s88, v43, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s89, v43, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s90, v43, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s91, v43, 9 +; GFX11-TRUE16-NEXT: s_mov_b32 s92, s100 +; GFX11-TRUE16-NEXT: v_readlane_b32 s93, v43, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s43, v43, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s95, v43, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v42, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s42, v43, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s41, v43, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v43, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v43, 3 +; GFX11-TRUE16-NEXT: .LBB91_5: ; %end +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s104 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s56 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s103 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s58 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s102 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, 
s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s99 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s46 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s59 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s31 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s95 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s60 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s93 +; GFX11-TRUE16-NEXT: 
s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s91 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s90 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s38 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s89 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s61 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s88 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s79 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s78 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s30 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, 
s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s62 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s76 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s75 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s94 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s63 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s74 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_clause 
0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s57 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s36 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s72 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s47 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s45 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s92 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s15 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, 
s73 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s6, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s13 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 8, v100 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v101, s2 :: v_dual_lshlrev_b32 v12, 8, v12 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v102, s3 :: v_dual_and_b32 v99, 0xff, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v99, v12 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v99, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xff, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v98 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v87, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v22, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v26, 0xff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v29, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v24, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v84, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v26, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v21, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v22, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v12, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v82 
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v20, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v28, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v10, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v18, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v19, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v37 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v27, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v20, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v22, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v28, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v30, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v100, s1 :: v_dual_lshlrev_b32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v12, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v19, v22 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v13 +; GFX11-TRUE16-NEXT: s_clause 0x5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[99:102], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[18:21], off offset:112 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v41, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v41, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v41, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v41, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v41, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v41, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v41, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v41, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v41, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v40, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v40, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v40, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v40, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v40, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v40, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v40, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v40, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v40, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v40, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v40, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v40, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v40, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v40, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v40, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v40, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v40, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v40, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v40, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v40, 12 +; 
GFX11-TRUE16-NEXT: v_readlane_b32 s51, v40, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v40, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:12 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:12 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s72, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s73, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s62, v3 +; GFX11-FAKE16-NEXT: 
v_readfirstlane_b32 s63, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s60, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s61, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s58, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s59, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s56, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s57, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s46, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s45, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s102, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v41, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s66, 18 +; 
GFX11-FAKE16-NEXT: v_writelane_b32 v40, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[26:27], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 15 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s1, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s0, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s45, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s45, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s45, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s44, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s44, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s47, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s47, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, 
s47, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s46, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s57, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s57, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 13 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s57, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s56, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s56, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s59, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s59, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s59, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s58, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s58, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s61, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s61, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s61, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s60, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s60, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s63, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s63, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s63, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s62, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s62, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s73, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s73, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s73, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s72, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s52, s72, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s28, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[2:3], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[44:45], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[46:47], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[56:57], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[58:59], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[62:63], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[72:73], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s19, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s18, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 1 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s4, s18, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s17, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s3, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s46, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 7 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 5 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s12, 0 +; 
GFX11-FAKE16-NEXT: v_writelane_b32 v43, s13, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX11-FAKE16-NEXT: .LBB91_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX11-FAKE16-NEXT: s_and_b32 s14, s47, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s15, s47, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s45, 0xffff0000 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s47, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s45, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s78, s28, 0xffff0000 +; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s47, 0x10010 +; GFX11-FAKE16-NEXT: s_lshl_b32 s79, s28, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s6, s47 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s73, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s77, s73, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s75, s72, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s76, s72, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s63, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s74, s63, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s72, s62, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s73, s62, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s63, s61, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s62, s61, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s61, s60, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s60, s60, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s41, s59, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s40, s59, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s28, s58, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s29, s58, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s13, s57, 0xffff0000 
+; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s57, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s42, s56, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s43, s56, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s12, s46, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s46, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s44, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s44, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s47, 22 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s47, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s44, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v53, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v51 -; GFX11-FAKE16-NEXT: v_perm_b32 v87, v29, v28, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v49, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v55, v51, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v32, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v49, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v54 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 
v4, 0x40c00000, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v52, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v49, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v98, v50, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v51, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v98 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v98 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v50, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v55, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v54 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s78 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s79 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: 
v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v53, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v51, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v100, v3, v50, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v53, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v66, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v53 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v97, v31, v30, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v103, v3, v51, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v3 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v6, v54 :: v_dual_add_f32 v6, 0x40c00000, v55 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v52, v66, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v6, 16, 
1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s1, 0x10010 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s1 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s1, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_perm_b32 v102, v5, v53, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v54, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v99, v2, v1, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v52, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v67, v54, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v7 -; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v66, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v7, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v8, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v66 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v6 +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v52, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_perm_b32 v182, v7, v54, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v55, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v183, v5, v6, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v52, v66, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; 
GFX11-FAKE16-NEXT: v_bfe_u32 v52, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v112, v55, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: v_perm_b32 v101, v4, v49, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v4 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v10, v66 :: v_dual_add_f32 v10, 0x40c00000, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s77 +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s0, 0x10010 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v4 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s5, s0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s44, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s0, 22 +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v9 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, 
v49 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v112 :: v_dual_lshlrev_b32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v6, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v2, 16, v3 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s76 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s44, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s75 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 24, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s3, 0x10010 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v176, v9, v55, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v52 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v66, v10, 
0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v67, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v9, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v52, v66 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v113, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v177, v7, v8, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v67, v112 :: v_dual_lshlrev_b32 v67, 16, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v11, 16, 1 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s3 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s3, 22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v66 :: v_dual_add_f32 v52, 0x40c00000, v67 -; GFX11-FAKE16-NEXT: v_add3_u32 v66, v112, v11, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v11 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v13 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v66, v67 :: v_dual_add_f32 v66, 0x40c00000, v112 -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v66, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v66 -; GFX11-FAKE16-NEXT: v_perm_b32 v162, v11, v9, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v67, v112, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v66, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v13, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v67, v112 :: v_dual_add_f32 v67, 0x40c00000, v114 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-FAKE16-NEXT: v_add3_u32 v112, v116, v13, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v149, v14, v52, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v67, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v113, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, 
v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: v_perm_b32 v163, v12, v10, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v112, v113 :: v_dual_add_f32 v112, 0x40c00000, v115 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v67, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v67 -; GFX11-FAKE16-NEXT: v_bfe_u32 v115, v16, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 -; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v112, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v112 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_perm_b32 v148, v13, v66, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v113, v114, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v114, v115, v16, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v116, v116, v112, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[96:97] -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v114, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v112, v112 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v113, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[86:87] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[84:85] -; GFX11-FAKE16-NEXT: v_perm_b32 v135, v16, v67, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v112, v116, v117, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v52 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v5, 16, v53 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v113, v118, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX11-FAKE16-NEXT: v_perm_b32 v134, v15, v112, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v112 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v51 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[134:135] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[148:149] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[162:163] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[176:177] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[182:183] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[82:83] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v67 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v66 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v54 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[102:103] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[98:99] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[80:81] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v55 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v39 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[100:101] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[70:71] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[68:69] -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v129, 24, v135 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v135 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v134 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v134 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v149 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v149 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v148 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v148 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v163 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v163 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v162 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v162 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v177 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v177 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v176 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v176 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v183 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v183 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v103 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v103 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v102 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v102 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v101 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v101 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v100 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v100 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v99 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v99 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v97 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v96 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 24, v87 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v86 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v86, 8, v86 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v85 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v84 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v82 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v81 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v71 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v77 -; GFX11-FAKE16-NEXT: .LBB45_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v76 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v60 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v54 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v166 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v62 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v160 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v65, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v147 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 +; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s74 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v25, 16, v5 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v14 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s2, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s11, s2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s44, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 22 +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v3 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s73 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s17 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s72 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s17, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s44, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s17, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v27 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s17 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s17, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v28, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX11-FAKE16-NEXT: s_cselect_b32 s17, s17, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s17, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo +; 
GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 24, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v5, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v29 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s46, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s16, s16, 16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s44, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s16, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s62 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s16, 0x10010 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s16 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s16, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s16, s16, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s16, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s44, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s44, 0x10010 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s61 +; GFX11-FAKE16-NEXT: s_add_i32 s45, s45, s44 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s44, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s44, s44, s45 +; GFX11-FAKE16-NEXT: s_lshl_b32 s19, s19, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s19 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s44, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v9, v8 +; GFX11-FAKE16-NEXT: s_bfe_u32 s45, s19, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s45, 
s45, s19 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s19, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s44, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s19, s19, s45 +; GFX11-FAKE16-NEXT: s_and_b32 s44, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s44 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s19, s19, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s29 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s41 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s47, s17, s72 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s44, s41, 0x10010 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_add_i32 s44, s44, s41 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s41, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s45, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s41, s41, s44 +; GFX11-FAKE16-NEXT: s_lshl_b32 s18, s18, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v30, 16, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s18, v5 +; GFX11-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v1, 16, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_bfe_u32 s40, s18, 0x10010 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s28 +; GFX11-FAKE16-NEXT: s_add_i32 s44, s40, s18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s41, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s44, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s18, 22 +; GFX11-FAKE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s18, s18, s44 +; GFX11-FAKE16-NEXT: s_and_b32 s41, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s41 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v9, 16, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s18, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v10, 16, 1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s29, s28, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, s28 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s28, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s41, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s28, s28, s29 +; GFX11-FAKE16-NEXT: s_lshl_b32 s21, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s21 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s44, s2, s11 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s21, v11 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo 
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s29, s21, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, s21 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s21, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s28, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s21, s21, s29 +; GFX11-FAKE16-NEXT: s_and_b32 s28, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s28 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s21, 16 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s45, s3, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s46, s16, s46 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s13, 0x10010 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v34 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s13 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s13, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s29, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s13, s28 +; GFX11-FAKE16-NEXT: s_lshl_b32 s20, s20, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 
0x40c00000, s10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s20, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v2, 16, v9 +; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s20, 0x10010 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s10, s20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s13, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s20, 22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s20, s28 +; GFX11-FAKE16-NEXT: s_and_b32 s20, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s42 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s20 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s43 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s28, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_bfe_u32 s20, s28, 0x10010 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s20, s28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s13, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s29, 0x7fff +; GFX11-FAKE16-NEXT: s_bitset1_b32 s28, 22 +; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s28, 
s29 +; GFX11-FAKE16-NEXT: s_lshl_b32 s23, s23, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v4, v8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s13, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s23, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s28, s23, 0x10010 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, s23 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s23, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s28, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s23, s28 +; GFX11-FAKE16-NEXT: s_and_b32 s23, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s23 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s13, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v71, v37, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s12 +; GFX11-FAKE16-NEXT: s_bfe_u32 s15, s14, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, s14 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s15, 0x7fff +; 
GFX11-FAKE16-NEXT: s_and_b32 s13, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s14, s15 +; GFX11-FAKE16-NEXT: s_lshl_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s13, 16 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v70, v2, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s14, 0x10010 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v8 +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, s14 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s14, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s12, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s15, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s12, s14, s12 +; GFX11-FAKE16-NEXT: s_and_b32 s14, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s12, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: s_bfe_u32 s14, s9, 0x10010 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, s9 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s14, 0x7fff +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s9, s14 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s9, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s8, 0x10010 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, s8 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s12, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s8, s12 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s8, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v12, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s0, s5 +; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s7, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; GFX11-FAKE16-NEXT: 
s_add_i32 s9, s9, s7 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s7, s9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s24, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v12, 16, 1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s4, s8, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, s8 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s8, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v52, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s4, 16 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v9, 16, 1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 
s7, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s6, s7 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v4, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v49 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v66, v1, 16, v11 +; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s4, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v55, v50, 16, v4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s22, s13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v54, v2, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v48, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[8:9], 
24, v[17:18] +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s6, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[15:16] +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s14, s6, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s26, 16 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s20, s10 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s14, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7] +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s1, s58 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[66:67] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[70:71] +; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s11, 0x10010 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[4:5], 24, v[19:20] +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s19, s60 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s10, 16 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s18, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s23, s62 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 24, v55 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 8, v55 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v54 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 8, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v66 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v67, 24, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 8, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v70 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s21, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s25, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s57, s27, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s56, s26, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s24, s12 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[8:9], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[4:5], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[46:47], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[44:45], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 vcc, s[56:57], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[34:35], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s57, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s57, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s56, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s56, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s11, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s9, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, 
s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s8, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s7, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s7, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s5, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s4, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s47, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s47, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s46, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s46, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s45, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s45, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s99, s44, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s44, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s29, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 8 +; GFX11-FAKE16-NEXT: s_branch .LBB91_5 +; GFX11-FAKE16-NEXT: .LBB91_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s5, 1 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s4, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s5, 3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 
+; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s74, 4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s75, 5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s74, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v43, s75, 7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: s_branch .LBB91_2 +; GFX11-FAKE16-NEXT: .LBB91_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s94 :: v_dual_mov_b32 v11, s30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s94, v43, 2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s37 :: v_dual_mov_b32 v87, s34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s49 :: v_dual_mov_b32 v7, s35 +; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v43, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_lo, v43, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v43, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v43, 4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s44 :: v_dual_mov_b32 v51, s45 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v49, s46 
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v39, s47 :: v_dual_mov_b32 v48, s98 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s56 :: v_dual_mov_b32 v37, s97 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s57 :: v_dual_mov_b32 v35, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s59 :: v_dual_mov_b32 v33, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s60 :: v_dual_mov_b32 v31, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v29, s62 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s63 :: v_dual_mov_b32 v28, s96 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s72 :: v_dual_mov_b32 v25, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s73 :: v_dual_mov_b32 v23, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s29 :: v_dual_mov_b32 v22, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, s87 :: v_dual_mov_b32 v54, s86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s85 :: v_dual_mov_b32 v12, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s4 :: v_dual_mov_b32 v66, s48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, s81 :: v_dual_mov_b32 v64, s84 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v69, s83 :: v_dual_mov_b32 v70, s82 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s70 :: v_dual_mov_b32 v68, s80 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s71 :: v_dual_mov_b32 v19, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v71, s66 :: v_dual_mov_b32 v20, s69 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s68 :: v_dual_mov_b32 v17, s67 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s55 :: v_dual_mov_b32 v18, s65 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s38 :: v_dual_mov_b32 v15, s64 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v83, s51 :: v_dual_mov_b32 v16, s54 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v13, s52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v85, s36 :: v_dual_mov_b32 v14, s50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s74 :: v_dual_mov_b32 v2, s76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s78 :: v_dual_mov_b32 v4, s88 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s90 :: v_dual_mov_b32 v9, s92 +; GFX11-FAKE16-NEXT: s_mov_b32 s58, s11 
+; GFX11-FAKE16-NEXT: v_readlane_b32 s59, v43, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s72, v43, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s60, v43, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s61, v43, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s62, v43, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s63, v43, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s73, v43, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s13, v43, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s15, v43, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s41, v43, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s43, v43, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s56, v43, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s11, v43, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s57, v43, 21 +; GFX11-FAKE16-NEXT: v_readlane_b32 s10, v43, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s74, v43, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s9, v43, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s75, v43, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s8, v43, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s76, v43, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s77, v43, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s78, v43, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s79, v43, 30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s88, v43, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s89, v42, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s90, v42, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s91, v42, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s92, v42, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s47, v42, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s93, v42, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 vcc_hi, v43, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s46, v42, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v43, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s95, v42, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s45, v42, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v43, 5 +; GFX11-FAKE16-NEXT: .LBB91_5: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s104, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s103, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, 
s42, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s4 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s102, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s58, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s101, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s4 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s100, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s99, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s40, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s4 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s45, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s59, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s95, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s4 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s46, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s93, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s47, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s72, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s92, 
8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s91, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s90, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s89, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s60, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s88, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_mov_b32 v113, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_mov_b32 v115, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s79, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s78, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s30, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s77, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s61, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s76, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 
16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s75, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s94, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s9, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s62, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s74, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[97:100], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_mov_b32 v98, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_mov_b32 v100, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s10, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s34, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s11, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s25, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s3, s63, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s56, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s26, 0xff +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s4, s43, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s41, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, vcc_lo, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s15, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s27, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s13, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s73, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v112, s0 :: v_dual_and_b32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v113, s1 :: v_dual_lshlrev_b32 v6, 8, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s2 :: v_dual_lshlrev_b32 v11, 8, v11 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v115, s3 :: v_dual_and_b32 v96, 0xff, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v96, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v180 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v132 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 
v8, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v167 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v161 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v118 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v149 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v65, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v6, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v11, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v22, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v26, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v24, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v22, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v26, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v148 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v145 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v131 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v67 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v39, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v59 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v47 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v146 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v40 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v6, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v11, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v13, v14 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v6, v15, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v17, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v14, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v21, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v178 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v117 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v103 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v36, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v19, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v50 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v18, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v20, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v21, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v22, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v16 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v115 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v87 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v100 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v99 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, 
v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v112 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v33, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v3, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v10, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v11, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v5 ; GFX11-FAKE16-NEXT: s_clause 0x5 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x15 -; 
GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[97:100], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[112:115], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v41, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v41, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v41, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v41, 5 +; 
GFX11-FAKE16-NEXT: v_readlane_b32 s100, v41, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v41, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v41, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v41, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v41, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v40, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v40, 30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v40, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v40, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v40, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v40, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v40, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v40, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v40, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v40, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v40, 21 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v40, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v40, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v40, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v40, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v40, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v40, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v40, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v40, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v40, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v40, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v40, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v41, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:12 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -86777,2301 +178669,2298 @@ end: } define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v50, v27 -; GCN-NEXT: v_mov_b32_e32 v49, v25 
-; GCN-NEXT: v_mov_b32_e32 v39, v21 -; GCN-NEXT: v_mov_b32_e32 v48, v3 -; GCN-NEXT: v_mov_b32_e32 v37, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, 
off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v10 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v12 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v14 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v20 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 8, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:392 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v21 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, 
v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; 
GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v4 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 -; GCN-NEXT: 
v_lshlrev_b32_e32 v44, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v57, 8, v4 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:384 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v2 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:152 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:216 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:248 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:280 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:312 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:344 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:376 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:368 -; GCN-NEXT: s_waitcnt vmcnt(13) -; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v3 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; 
implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; 
implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB46_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_mov_b32_e32 v26, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v37 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v7 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v11 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v13 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v15 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 -; GCN-NEXT: v_or_b32_e32 v0, v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v19 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v39 -; GCN-NEXT: v_or_b32_e32 v25, v0, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v0, v24 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v49 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GCN-NEXT: v_or_b32_e32 v33, v0, v28 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v29 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v12 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v14 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v2, v4 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v2, v4 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v2, v4 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v2, v4 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v2, v4 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; 
GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v6, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v6, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v51 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: v_mov_b32_e32 v7, v8 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v12, v5 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v30 -; 
GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v12, v5 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v7 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v5 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v53, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v5 -; GCN-NEXT: buffer_load_dword 
v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v5 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v40, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v26 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v13 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v42, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v15 -; 
GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v15 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v43, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v27 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v46, v46, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v51, v51, v52 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_and_b32_e32 v52, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v52, v52, v54 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v54, v54, v55 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v56 -; GCN-NEXT: v_or_b32_e32 v55, v55, v41 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v41, v41, v44 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v44, v44, v45 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v45, v45, v47 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v58 -; GCN-NEXT: v_or_b32_e32 v47, v47, v57 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v56, v56, v62 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v27 -; GCN-NEXT: v_or_b32_e32 v31, v57, v31 -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v60 -; GCN-NEXT: v_or_b32_e32 v57, v57, v63 -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v61 -; GCN-NEXT: v_or_b32_e32 v1, v58, v1 -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v59 -; GCN-NEXT: v_or_b32_e32 v3, v58, v3 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], 
s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v25 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v23 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v32 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v33 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v34 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v36 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v49 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v50 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 
v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v42 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v43 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v46 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v3 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; 
implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: 
; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: 
; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; 
implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: .LBB46_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB46_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v59 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v6, v3, v2 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v61 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v10, v1, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v16, v63, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v20, v31, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v62, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v58 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v57, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v47, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v45, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v44, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v56 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v41, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v55, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v54, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v1, v52, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte 
Folded Spill -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v46 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v43 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 
v0, 0xff, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v3, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v0, v24 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v40 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v0, v26 -; GCN-NEXT: v_mov_b32_e32 v0, v37 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v4, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v53 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v1, v34 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v1 -; GCN-NEXT: 
v_and_b32_e32 v37, 0xff, v37 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v21, v39 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v51 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, v48 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v48 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v25, v49 -; GCN-NEXT: v_mov_b32_e32 v27, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GCN-NEXT: buffer_load_dword 
v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v49 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v50 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v50 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v51 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v51 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v52 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v52 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_or_b32_e32 v2, v2, v53 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v54 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v54 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v55 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v2, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v12 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v2, v41 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v42 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 
v43, 0xff, v43 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v4, v43 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v25 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v44 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v45 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v45 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v21 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 -; GCN-NEXT: v_or_b32_e32 v46, v22, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v19 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v2, v47 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v17 -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v56 -; GCN-NEXT: v_or_b32_e32 v56, v18, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 3, v15 -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v57 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v57, v2, v57 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v13 -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v58 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v58, v2, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v11 -; GCN-NEXT: v_and_b32_e32 v59, 0xff, v59 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_or_b32_e32 v59, v2, v59 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v9 -; GCN-NEXT: v_and_b32_e32 v60, 0xff, v60 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v60, v2, v60 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v7 -; GCN-NEXT: v_and_b32_e32 v61, 0xff, v61 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v2, v61 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v5 -; GCN-NEXT: v_and_b32_e32 v62, 0xff, v62 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v2, v62 -; GCN-NEXT: v_add_i32_e32 v63, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v63, 0xff, v63 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v1, v63 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v45, v0, v3 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 0x300, v6 -; GCN-NEXT: v_add_i32_e32 v43, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v42, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v41, vcc, s6, v20 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v40, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v53, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v49, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v39, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v38, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v36, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v35, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v33, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v24 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v26 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v0 -; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v46, vcc, s6, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, s6, v47 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s6, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, s6, v57 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s6, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s6, v59 -; GCN-NEXT: v_add_i32_e32 v60, vcc, s6, v60 -; GCN-NEXT: v_add_i32_e32 v61, vcc, s6, v61 -; GCN-NEXT: v_add_i32_e32 v62, vcc, s6, v62 -; GCN-NEXT: v_add_i32_e32 v63, vcc, s6, v63 -; GCN-NEXT: v_add_i32_e32 v45, vcc, s6, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v63 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v62 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v61 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v45, v60 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v58 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v57 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v56 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v47 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v46 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v14 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v16 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v20 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v22 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded 
Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v31 -; GCN-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:848 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v33 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v44 -; GCN-NEXT: .LBB46_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, v50, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v50 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v45, v1, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v44, v1, v0 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v47, v1, v0 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v50 -; GCN-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v46, v1, v0 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v56, vcc, 20, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; 
GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v18, vcc, 40, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v22, vcc, 44, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v63, v1, v0 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 48, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v60, v1, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 52, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v30, v1, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 56, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v61, v1, 
v0 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v7, v1, v0 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 64, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v62, v1, v0 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x44, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v5, v1, v0 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x48, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v52, v12, v0 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v50 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:848 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v54, v12, v0 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v32 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v1, v11, v0 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x54, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v11, v11, v0 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x58, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v15, v12, v0 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x5c, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v19, v6, v0 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x60, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v9, v6, v0 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v13, v2, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x68, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v2, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v17, v2, v0 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x6c, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v21, v2, v0 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x70, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v25, v2, v0 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x74, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v29, v2, v0 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x78, v50 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v50 -; GCN-NEXT: buffer_store_dword v45, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v48, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v14, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: buffer_store_dword v2, v18, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_mov_b32_e32 v43, v17 +; SI-NEXT: v_mov_b32_e32 v41, v7 +; SI-NEXT: v_mov_b32_e32 v55, v5 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: v_mov_b32_e32 v30, v0 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:392 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 +; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:176 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:200 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(1) 
+; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:384 +; 
SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v46, off, 
s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:344 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; 
SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 
+; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v48, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 +; SI-NEXT: v_or_b32_e32 v45, v46, v45 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v56, v56, v61 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v2, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v31, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v23, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v29, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v29 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v51, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v2, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v4, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v33, v6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v35, v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword 
v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v34, v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v8, v53 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v16, v8, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v20, v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, 
v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v24, v12, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v26, v12, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v28, v12, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v12, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v18, v0 +; SI-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v22, v22, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v32, v32, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v36, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v36, v36, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v37, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v37, v37, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 
; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v38, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v38, v38, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v39, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v39, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v48, v48, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v49, v49, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: ; 
implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v50, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v50, v50, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v9, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v54, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v54, v54, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v52, v52, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_and_b32_e32 v11, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v11, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v53, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v53, v53, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v55, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v55, v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v13, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v41, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v41, v41, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v40, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_or_b32_e32 v40, v40, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v42, v42, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v43, 0xff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v43, v43, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v34, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v44, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v44, v44, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v48, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v46, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v46, v46, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v58, 0xff, v21 +; SI-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v58, v58, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v59, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v59, v59, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v60, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v60, v60, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v61, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v21, v61, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v61, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v47, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v61, v61, v63 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v3 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; 
implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: 
; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; 
SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: .LBB92_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: 
v_or_b32_e32 v4, v3, v2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v5, v1, v2 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v35, 0xff, v35 +; SI-NEXT: v_mov_b32_e32 v1, v17 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_mov_b32_e32 v19, v44 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v23, v63, v2 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v63, 0xff, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v25, v47, v2 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v19 +; 
SI-NEXT: v_and_b32_e32 v47, 0xff, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v27, v62, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v29, v61, v2 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v61, 0xff, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v62, 0xff, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v31, v60, v2 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v33, v59, v2 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 +; SI-NEXT: v_or_b32_e32 v11, v12, v59 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v34, v58, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v36, v45, v2 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v37, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v38, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v48, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 
; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload 
+; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:752 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v0, v8 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v40, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v53, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v50, 0xff, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v52 +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v53, 0xff, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v32, v32, v42 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v48 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v0, v47 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v34 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v0, v56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v33 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v0, v57 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v31 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v0, v58 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: 
v_add_i32_e32 v58, vcc, s6, v29 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v0, v60 +; SI-NEXT: v_or_b32_e32 v0, v2, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v6, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v63, v0, v63 +; SI-NEXT: v_or_b32_e32 v0, v1, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; 
SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v0 +; 
SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v44 +; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v58 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v56 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: .LBB92_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v30 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v30 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 
; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, 
v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: 
v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 
v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: 
v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 
; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v64f16: ; VI: ; %bb.0: @@ -89405,7 +181294,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: s_cbranch_execz .LBB92_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -89887,9 +181776,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: .LBB46_2: ; %Flow +; VI-NEXT: .LBB92_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB46_4 +; VI-NEXT: s_cbranch_execz .LBB92_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload @@ -90276,7 +182165,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v39, 0x300, v48 ; VI-NEXT: v_or_b32_e32 v21, v39, v21 ; VI-NEXT: v_or_b32_e32 v31, v31, v54 -; VI-NEXT: .LBB46_4: ; %end +; VI-NEXT: .LBB92_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -90652,7 +182541,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB46_2 +; GFX9-NEXT: s_cbranch_execz .LBB92_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload @@ -91135,9 +183024,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: .LBB46_2: ; %Flow +; GFX9-NEXT: .LBB92_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB46_4 +; GFX9-NEXT: s_cbranch_execz .LBB92_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload @@ -91528,7 +183417,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v26, v37, v26, s6 ; GFX9-NEXT: v_perm_b32 v27, v36, v27, s6 ; GFX9-NEXT: v_perm_b32 v28, v35, v28, s6 -; GFX9-NEXT: .LBB46_4: ; %end +; GFX9-NEXT: .LBB92_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -91773,15 +183662,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB46_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB92_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB46_4 -; GFX11-TRUE16-NEXT: .LBB46_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB92_4 +; GFX11-TRUE16-NEXT: .LBB92_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB46_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB92_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l @@ -92040,8 +183929,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 -; GFX11-TRUE16-NEXT: .LBB46_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB92_2 +; GFX11-TRUE16-NEXT: .LBB92_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3 @@ -92581,7 +184470,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB92_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 @@ -92872,9 +184761,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 -; GFX11-FAKE16-NEXT: .LBB46_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB92_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, 
s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB92_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 @@ -92938,300 +184827,5754 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v94, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 +; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v54, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v52, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v53, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v50, 
3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v49, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v49, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v48, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v32, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v39, 0x300, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 +; GFX11-FAKE16-NEXT: .LBB92_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 
v88, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ 
%a3, %cmp.false ] + ret <64 x half> %phi +} + +define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 +; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s10, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v61, s29, 0 +; SI-NEXT: v_writelane_b32 v61, s28, 1 +; SI-NEXT: v_writelane_b32 v61, s27, 2 +; SI-NEXT: s_mov_b32 s61, s21 +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: 
v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: s_mov_b32 s67, s19 +; SI-NEXT: s_mov_b32 s54, s17 +; SI-NEXT: s_mov_b32 s35, s23 +; SI-NEXT: s_mov_b32 s39, s26 +; SI-NEXT: s_mov_b32 s62, s25 +; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s99, v1 +; SI-NEXT: v_readfirstlane_b32 s74, v24 +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s6, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v62, s74, 0 +; SI-NEXT: v_readfirstlane_b32 s12, v26 +; SI-NEXT: v_writelane_b32 v62, s6, 1 +; SI-NEXT: v_readfirstlane_b32 s14, v25 +; SI-NEXT: v_writelane_b32 v62, s12, 2 +; SI-NEXT: v_readfirstlane_b32 s46, v28 +; SI-NEXT: v_writelane_b32 v62, s14, 3 +; SI-NEXT: v_readfirstlane_b32 s56, v27 +; SI-NEXT: v_writelane_b32 v62, s46, 4 +; SI-NEXT: v_readfirstlane_b32 s57, v30 +; SI-NEXT: v_writelane_b32 v62, s56, 5 +; SI-NEXT: v_readfirstlane_b32 s59, v29 +; SI-NEXT: v_writelane_b32 v62, s57, 6 +; SI-NEXT: v_writelane_b32 v62, s59, 7 +; SI-NEXT: s_mov_b32 s60, s20 +; SI-NEXT: s_mov_b32 s63, s24 +; SI-NEXT: v_readfirstlane_b32 s95, v3 +; SI-NEXT: v_readfirstlane_b32 s31, v5 +; SI-NEXT: v_readfirstlane_b32 s24, v9 +; SI-NEXT: v_readfirstlane_b32 s38, v12 +; SI-NEXT: v_readfirstlane_b32 s36, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s27, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: v_readfirstlane_b32 s79, v15 +; SI-NEXT: v_readfirstlane_b32 s13, v18 +; SI-NEXT: v_readfirstlane_b32 s15, v17 +; SI-NEXT: v_readfirstlane_b32 s42, v20 +; SI-NEXT: v_readfirstlane_b32 s43, v19 +; SI-NEXT: v_readfirstlane_b32 s44, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 +; SI-NEXT: v_writelane_b32 v61, s4, 3 +; SI-NEXT: v_readfirstlane_b32 s45, v21 +; SI-NEXT: v_readfirstlane_b32 s98, v10 +; SI-NEXT: v_readfirstlane_b32 s90, v8 +; SI-NEXT: v_readfirstlane_b32 s88, v7 +; SI-NEXT: v_readfirstlane_b32 s91, v6 +; SI-NEXT: v_readfirstlane_b32 s93, v4 +; SI-NEXT: v_readfirstlane_b32 s55, v2 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 +; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 +; SI-NEXT: 
v_writelane_b32 v61, s4, 5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 +; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 +; SI-NEXT: v_writelane_b32 v61, s4, 7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:308 +; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 +; SI-NEXT: v_writelane_b32 v61, s4, 9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 +; SI-NEXT: v_writelane_b32 v61, s4, 10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 +; SI-NEXT: v_writelane_b32 v61, s4, 11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 +; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 +; SI-NEXT: v_writelane_b32 v61, s4, 13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 +; SI-NEXT: v_writelane_b32 v61, s4, 14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 +; SI-NEXT: v_writelane_b32 v61, s4, 15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 +; SI-NEXT: v_writelane_b32 v61, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 
s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 +; SI-NEXT: v_writelane_b32 v61, s4, 17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 +; SI-NEXT: v_writelane_b32 v61, s4, 18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 +; SI-NEXT: v_writelane_b32 v61, s4, 19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 +; SI-NEXT: v_writelane_b32 v61, s4, 20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 +; SI-NEXT: v_writelane_b32 v61, s4, 21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 +; SI-NEXT: v_writelane_b32 v61, s4, 22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 +; SI-NEXT: v_writelane_b32 v61, s4, 23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244 +; SI-NEXT: v_writelane_b32 v61, s4, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 +; SI-NEXT: v_writelane_b32 v61, s4, 25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236 +; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 +; SI-NEXT: v_writelane_b32 v61, s4, 27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228 +; SI-NEXT: 
v_writelane_b32 v61, s4, 28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 +; SI-NEXT: v_writelane_b32 v61, s4, 29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 +; SI-NEXT: v_writelane_b32 v61, s4, 30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 +; SI-NEXT: v_writelane_b32 v61, s4, 31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 +; SI-NEXT: v_writelane_b32 v61, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s16, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 +; SI-NEXT: v_writelane_b32 v61, s4, 33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s89, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 +; SI-NEXT: v_writelane_b32 v61, s4, 34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s73, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 +; SI-NEXT: v_writelane_b32 v61, s4, 35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s72, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s40, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s21, v31 +; 
SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s85, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s81, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s97, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s7, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s11, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s41, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s47, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s58, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s76, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s29, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: v_writelane_b32 v61, s4, 36 +; SI-NEXT: v_writelane_b32 v61, s54, 37 +; SI-NEXT: v_writelane_b32 v61, s10, 38 +; SI-NEXT: v_writelane_b32 v61, s67, 39 +; SI-NEXT: v_writelane_b32 v61, s18, 40 +; SI-NEXT: v_writelane_b32 v61, s61, 41 +; SI-NEXT: v_writelane_b32 v61, s60, 42 +; SI-NEXT: v_writelane_b32 v61, s35, 43 +; SI-NEXT: v_writelane_b32 v61, s22, 44 +; SI-NEXT: v_writelane_b32 v61, s62, 45 +; SI-NEXT: v_writelane_b32 v61, s63, 46 +; 
SI-NEXT: v_writelane_b32 v61, s39, 47 +; SI-NEXT: v_writelane_b32 v61, s99, 48 +; SI-NEXT: v_writelane_b32 v61, s95, 49 +; SI-NEXT: v_writelane_b32 v61, s31, 50 +; SI-NEXT: v_writelane_b32 v61, s24, 51 +; SI-NEXT: v_writelane_b32 v61, s38, 52 +; SI-NEXT: v_writelane_b32 v61, s36, 53 +; SI-NEXT: v_writelane_b32 v61, s8, 54 +; SI-NEXT: v_writelane_b32 v61, s27, 55 +; SI-NEXT: v_writelane_b32 v61, s9, 56 +; SI-NEXT: v_writelane_b32 v61, s79, 57 +; SI-NEXT: v_writelane_b32 v61, s13, 58 +; SI-NEXT: v_writelane_b32 v61, s15, 59 +; SI-NEXT: v_writelane_b32 v61, s42, 60 +; SI-NEXT: v_writelane_b32 v61, s43, 61 +; SI-NEXT: v_writelane_b32 v61, s44, 62 +; SI-NEXT: v_writelane_b32 v61, s45, 63 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s37, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s50, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s48, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s19, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s64, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s17, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s65, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s71, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s70, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_readfirstlane_b32 s83, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s49, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s80, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s82, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s87, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s84, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s51, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s86, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s94, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s96, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s68, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s34, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s77, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s66, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s78, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: v_readfirstlane_b32 s53, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s69, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s52, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s23, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s26, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s25, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: v_writelane_b32 v62, s25, 8 +; SI-NEXT: v_writelane_b32 v62, s28, 9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s92, v31 +; SI-NEXT: v_writelane_b32 v62, s92, 10 +; SI-NEXT: v_writelane_b32 v62, s75, 11 +; SI-NEXT: v_writelane_b32 v62, s26, 12 +; SI-NEXT: v_writelane_b32 v62, s30, 13 +; SI-NEXT: v_writelane_b32 v62, s23, 14 +; SI-NEXT: v_writelane_b32 v62, s52, 15 +; SI-NEXT: v_writelane_b32 v62, s64, 16 +; SI-NEXT: v_writelane_b32 v62, s17, 17 +; SI-NEXT: v_writelane_b32 v62, s65, 18 +; SI-NEXT: v_writelane_b32 v62, s70, 19 +; SI-NEXT: v_writelane_b32 v62, s71, 20 +; SI-NEXT: v_writelane_b32 v62, s49, 21 +; SI-NEXT: v_writelane_b32 v62, s83, 22 +; SI-NEXT: v_writelane_b32 v62, s80, 23 +; SI-NEXT: v_writelane_b32 v62, s82, 24 +; SI-NEXT: v_writelane_b32 v62, s84, 25 +; SI-NEXT: v_writelane_b32 
v62, s87, 26 +; SI-NEXT: v_writelane_b32 v62, s86, 27 +; SI-NEXT: v_writelane_b32 v62, s51, 28 +; SI-NEXT: v_writelane_b32 v62, s96, 29 +; SI-NEXT: v_writelane_b32 v62, s34, 30 +; SI-NEXT: v_writelane_b32 v62, s94, 31 +; SI-NEXT: v_writelane_b32 v62, s53, 32 +; SI-NEXT: v_writelane_b32 v62, s66, 33 +; SI-NEXT: v_writelane_b32 v62, s68, 34 +; SI-NEXT: v_writelane_b32 v62, s69, 35 +; SI-NEXT: v_writelane_b32 v62, s77, 36 +; SI-NEXT: v_writelane_b32 v62, s78, 37 +; SI-NEXT: s_cbranch_scc0 .LBB93_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s67, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s60, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s35, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s5, v61, 2 +; SI-NEXT: s_and_b32 s4, s39, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 1 +; SI-NEXT: v_readlane_b32 s5, v61, 0 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s99, 
0xff +; SI-NEXT: s_lshl_b32 s5, s55, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s95, 0xff +; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s31, 0xff +; SI-NEXT: s_lshl_b32 s5, s91, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s98, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s36, 0xff +; SI-NEXT: s_lshl_b32 s5, s38, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s79, 0xff +; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s74, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s57, 8 +; 
SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_and_b32 s4, s52, 0xff +; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_and_b32 s4, s69, 0xff +; SI-NEXT: s_lshl_b32 s5, s53, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_and_b32 s4, s78, 0xff +; SI-NEXT: s_lshl_b32 s5, s66, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_and_b32 s4, s77, 0xff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_and_b32 s4, s94, 0xff +; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_and_b32 s4, s51, 0xff +; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_and_b32 s4, s87, 0xff +; SI-NEXT: s_lshl_b32 s5, s82, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_and_b32 s4, s80, 0xff +; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_and_b32 s4, s83, 0xff +; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_and_b32 s4, s71, 0xff +; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; 
SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s64, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_and_b32 s4, s50, 0xff +; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s8, v61, 36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_and_b32 s4, s76, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s41, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_and_b32 s4, s97, 0xff +; SI-NEXT: s_lshl_b32 s5, s81, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s69, v61, 35 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_and_b32 s4, s69, 0xff +; SI-NEXT: s_lshl_b32 s5, s73, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s68, v61, 34 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s66, v61, 33 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: 
v_readlane_b32 s53, v61, 32 +; SI-NEXT: v_readlane_b32 s94, v61, 31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_and_b32 s4, s53, 0xff +; SI-NEXT: s_lshl_b32 s5, s94, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s34, v61, 30 +; SI-NEXT: v_readlane_b32 s96, v61, 29 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s51, v61, 28 +; SI-NEXT: v_readlane_b32 s86, v61, 27 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_and_b32 s4, s51, 0xff +; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s87, v61, 26 +; SI-NEXT: v_readlane_b32 s84, v61, 25 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: s_and_b32 s4, s87, 0xff +; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s82, v61, 24 +; SI-NEXT: v_readlane_b32 s80, v61, 23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: s_and_b32 s4, s82, 0xff +; SI-NEXT: s_lshl_b32 s5, s80, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s83, v61, 22 +; SI-NEXT: v_readlane_b32 s49, v61, 21 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_and_b32 s4, s83, 0xff +; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s71, v61, 20 +; SI-NEXT: v_readlane_b32 s70, v61, 19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_and_b32 s4, s71, 0xff +; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s65, v61, 18 +; SI-NEXT: v_readlane_b32 s54, v61, 17 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: s_and_b32 s4, s65, 0xff +; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: s_mov_b32 s17, s19 +; SI-NEXT: s_mov_b32 s19, s50 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s67, v61, 16 +; SI-NEXT: v_readlane_b32 s50, v61, 15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_and_b32 s4, s67, 0xff +; SI-NEXT: s_lshl_b32 s5, s50, 8 
+; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s64, v61, 14 +; SI-NEXT: v_readlane_b32 s52, v61, 13 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_and_b32 s4, s64, 0xff +; SI-NEXT: s_lshl_b32 s5, s52, 8 +; SI-NEXT: s_mov_b32 s23, s48 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s35, v61, 12 +; SI-NEXT: v_readlane_b32 s48, v61, 11 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_and_b32 s4, s35, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s30, v61, 10 +; SI-NEXT: v_readlane_b32 s39, v61, 9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s30, 0xff +; SI-NEXT: s_lshl_b32 s5, s39, 8 +; SI-NEXT: s_mov_b32 s26, s37 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s37, v61, 8 +; SI-NEXT: v_readlane_b32 s75, v61, 7 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: s_and_b32 s4, s37, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s92, v61, 6 +; SI-NEXT: v_readlane_b32 s77, v61, 5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_lshl_b32 s5, s77, 8 +; SI-NEXT: s_mov_b32 s28, s29 +; SI-NEXT: s_mov_b32 s29, s76 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s78, v61, 4 +; SI-NEXT: v_readlane_b32 s76, v61, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s78, 0xff +; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_mov_b32 s99, s55 +; SI-NEXT: s_mov_b32 s20, s88 +; SI-NEXT: s_mov_b32 s24, s98 +; SI-NEXT: s_mov_b32 s59, s58 +; SI-NEXT: s_mov_b32 s56, s47 +; SI-NEXT: s_mov_b32 s46, s41 +; SI-NEXT: s_mov_b32 s12, s11 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s97 +; SI-NEXT: s_mov_b32 s97, s81 +; SI-NEXT: s_mov_b32 s81, s85 +; SI-NEXT: s_mov_b32 s6, s40 +; SI-NEXT: s_mov_b32 s40, s72 +; SI-NEXT: 
s_mov_b32 s45, s73 +; SI-NEXT: s_mov_b32 s15, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_mov_b32 s55, s93 +; SI-NEXT: s_mov_b32 s95, s91 +; SI-NEXT: s_mov_b32 s31, s90 +; SI-NEXT: s_cbranch_execnz .LBB93_3 +; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s78, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s5, s92, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_lshl_b32 vcc_lo, s77, 8 +; SI-NEXT: s_or_b32 s5, vcc_lo, s5 +; SI-NEXT: s_add_i32 vcc_lo, s37, 3 +; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 vcc_hi, s75, 8 +; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo +; SI-NEXT: s_add_i32 vcc_hi, s30, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s60, s39, 8 +; SI-NEXT: s_or_b32 s60, s60, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s35, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s61, s48, 8 +; SI-NEXT: s_or_b32 s61, s61, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s64, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s62, s52, 8 +; SI-NEXT: s_or_b32 s62, s62, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s67, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s63, s50, 8 +; SI-NEXT: s_or_b32 s10, s63, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s65, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s72, s54, 8 +; SI-NEXT: s_or_b32 s72, s72, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s71, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s73, s70, 8 +; SI-NEXT: s_or_b32 s73, s73, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s83, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s74, s49, 8 +; SI-NEXT: s_or_b32 s74, s74, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s82, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s75, s80, 8 +; SI-NEXT: s_or_b32 s75, s75, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s87, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; 
SI-NEXT: s_lshl_b32 s76, s84, 8 +; SI-NEXT: s_or_b32 s76, s76, vcc_hi +; SI-NEXT: s_add_i32 vcc_hi, s51, 3 +; SI-NEXT: s_add_i32 s93, s53, 3 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s77, s86, 8 +; SI-NEXT: s_add_i32 s89, s34, 3 +; SI-NEXT: s_and_b32 s93, s93, 0xff +; SI-NEXT: s_lshl_b32 s78, s94, 8 +; SI-NEXT: s_add_i32 s34, s66, 3 +; SI-NEXT: s_or_b32 s77, s77, vcc_hi +; SI-NEXT: s_and_b32 s89, s89, 0xff +; SI-NEXT: s_lshl_b32 vcc_hi, s96, 8 +; SI-NEXT: s_or_b32 s22, s78, s93 +; SI-NEXT: s_and_b32 s93, s34, 0xff +; SI-NEXT: s_lshl_b32 s92, s16, 8 +; SI-NEXT: s_add_i32 s53, s68, 3 +; SI-NEXT: s_or_b32 s89, vcc_hi, s89 +; SI-NEXT: s_or_b32 s92, s92, s93 +; SI-NEXT: s_and_b32 s93, s53, 0xff +; SI-NEXT: s_lshl_b32 vcc_hi, s15, 8 +; SI-NEXT: s_add_i32 s66, s69, 3 +; SI-NEXT: s_or_b32 s93, vcc_hi, s93 +; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff +; SI-NEXT: s_lshl_b32 s34, s45, 8 +; SI-NEXT: s_add_i32 s68, s6, 3 +; SI-NEXT: s_or_b32 vcc_hi, s34, vcc_hi +; SI-NEXT: s_and_b32 s34, s68, 0xff +; SI-NEXT: s_lshl_b32 s39, s40, 8 +; SI-NEXT: s_add_i32 s69, s81, 3 +; SI-NEXT: s_or_b32 s34, s39, s34 +; SI-NEXT: s_and_b32 s39, s69, 0xff +; SI-NEXT: s_lshl_b32 s52, s21, 8 +; SI-NEXT: s_add_i32 s81, s7, 3 +; SI-NEXT: s_or_b32 s39, s52, s39 +; SI-NEXT: s_and_b32 s52, s81, 0xff +; SI-NEXT: s_lshl_b32 s53, s97, 8 +; SI-NEXT: s_add_i32 s85, s12, 3 +; SI-NEXT: s_or_b32 s52, s53, s52 +; SI-NEXT: s_and_b32 s53, s85, 0xff +; SI-NEXT: s_lshl_b32 s64, s11, 8 +; SI-NEXT: s_add_i32 s97, s56, 3 +; SI-NEXT: s_or_b32 s53, s64, s53 +; SI-NEXT: s_and_b32 s64, s97, 0xff +; SI-NEXT: s_lshl_b32 s66, s46, 8 +; SI-NEXT: s_add_i32 s21, s29, 3 +; SI-NEXT: s_or_b32 s64, s66, s64 +; SI-NEXT: s_and_b32 s21, s21, 0xff +; SI-NEXT: s_lshl_b32 s66, s59, 8 +; SI-NEXT: s_add_i32 s25, s8, 3 +; SI-NEXT: s_or_b32 s66, s66, s21 +; SI-NEXT: s_and_b32 s21, s25, 0xff +; SI-NEXT: s_lshl_b32 s6, s28, 8 +; SI-NEXT: s_add_i32 s29, s19, 3 +; SI-NEXT: s_or_b32 s67, s6, s21 +; SI-NEXT: s_and_b32 s6, s29, 
0xff +; SI-NEXT: s_lshl_b32 s18, s26, 8 +; SI-NEXT: s_add_i32 s28, s17, 3 +; SI-NEXT: s_or_b32 s68, s18, s6 +; SI-NEXT: s_and_b32 s6, s28, 0xff +; SI-NEXT: s_lshl_b32 s18, s23, 8 +; SI-NEXT: s_or_b32 s69, s18, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 17 +; SI-NEXT: s_add_i32 s7, s6, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 15 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v62, 16 +; SI-NEXT: s_add_i32 s27, s16, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 13 +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_lshl_b32 s23, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v62, 14 +; SI-NEXT: s_mov_b32 s91, s24 +; SI-NEXT: s_or_b32 s70, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 20 +; SI-NEXT: s_add_i32 s24, s16, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 11 +; SI-NEXT: s_add_i32 s11, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 18 +; SI-NEXT: s_lshl_b32 s19, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v62, 12 +; SI-NEXT: s_mov_b32 s90, s20 +; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s20, s16, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 9 +; SI-NEXT: s_or_b32 s71, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 22 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s17, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v62, 10 +; SI-NEXT: s_add_i32 s12, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 19 +; SI-NEXT: s_or_b32 s17, s17, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s20, v62, 8 +; SI-NEXT: s_and_b32 s6, s12, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s20, s20, 8 +; SI-NEXT: s_or_b32 s81, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 23 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_or_b32 s16, s20, s16 +; SI-NEXT: v_readlane_b32 s20, v62, 7 +; SI-NEXT: s_add_i32 s14, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 21 +; SI-NEXT: s_or_b32 s19, s19, s24 +; SI-NEXT: s_add_i32 s98, s20, 3 +; SI-NEXT: v_readlane_b32 s24, v62, 6 +; SI-NEXT: s_and_b32 s6, s14, 0xff +; SI-NEXT: 
s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s20, s98, 0xff +; SI-NEXT: s_lshl_b32 s24, s24, 8 +; SI-NEXT: s_or_b32 s83, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 26 +; SI-NEXT: s_and_b32 s27, s27, 0xff +; SI-NEXT: s_or_b32 s20, s24, s20 +; SI-NEXT: v_readlane_b32 s24, v62, 5 +; SI-NEXT: s_add_i32 s41, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 24 +; SI-NEXT: s_or_b32 s23, s23, s27 +; SI-NEXT: s_add_i32 s86, s24, 3 +; SI-NEXT: v_readlane_b32 s27, v62, 4 +; SI-NEXT: s_and_b32 s6, s41, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s24, s86, 0xff +; SI-NEXT: s_lshl_b32 s27, s27, 8 +; SI-NEXT: s_or_b32 s85, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 28 +; SI-NEXT: s_or_b32 s24, s27, s24 +; SI-NEXT: v_readlane_b32 s27, v62, 3 +; SI-NEXT: s_add_i32 s46, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 25 +; SI-NEXT: s_add_i32 s12, s73, 0x300 +; SI-NEXT: s_add_i32 s82, s27, 3 +; SI-NEXT: v_readlane_b32 s73, v62, 2 +; SI-NEXT: s_and_b32 s6, s46, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s27, s82, 0xff +; SI-NEXT: s_lshl_b32 s73, s73, 8 +; SI-NEXT: s_or_b32 s96, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 31 +; SI-NEXT: s_or_b32 s27, s73, s27 +; SI-NEXT: v_readlane_b32 s73, v62, 1 +; SI-NEXT: s_add_i32 s47, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 27 +; SI-NEXT: s_add_i32 s13, s74, 0x300 +; SI-NEXT: s_add_i32 s65, s73, 3 +; SI-NEXT: v_readlane_b32 s74, v62, 0 +; SI-NEXT: s_and_b32 s6, s47, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s73, s65, 0xff +; SI-NEXT: s_lshl_b32 s74, s74, 8 +; SI-NEXT: s_or_b32 s97, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 34 +; SI-NEXT: s_or_b32 s73, s74, s73 +; SI-NEXT: v_readlane_b32 s74, v61, 63 +; SI-NEXT: s_add_i32 s56, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 29 +; SI-NEXT: s_add_i32 s14, s75, 0x300 +; SI-NEXT: s_add_i32 s54, s74, 3 +; SI-NEXT: v_readlane_b32 s75, v61, 62 +; SI-NEXT: s_and_b32 s6, s56, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s74, s54, 0xff +; SI-NEXT: s_lshl_b32 
s75, s75, 8 +; SI-NEXT: s_or_b32 s63, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 36 +; SI-NEXT: s_or_b32 s74, s75, s74 +; SI-NEXT: v_readlane_b32 s75, v61, 61 +; SI-NEXT: s_add_i32 s58, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 30 +; SI-NEXT: s_add_i32 s15, s76, 0x300 +; SI-NEXT: s_add_i32 s50, s75, 3 +; SI-NEXT: v_readlane_b32 s76, v61, 60 +; SI-NEXT: s_and_b32 s6, s58, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s75, s50, 0xff +; SI-NEXT: s_lshl_b32 s76, s76, 8 +; SI-NEXT: s_or_b32 s79, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 37 +; SI-NEXT: s_or_b32 s75, s76, s75 +; SI-NEXT: v_readlane_b32 s76, v61, 59 +; SI-NEXT: s_add_i32 s59, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 33 +; SI-NEXT: s_add_i32 s18, s77, 0x300 +; SI-NEXT: s_add_i32 s48, s76, 3 +; SI-NEXT: v_readlane_b32 s77, v61, 58 +; SI-NEXT: s_and_b32 s6, s59, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s76, s48, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 8 +; SI-NEXT: s_or_b32 s78, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 35 +; SI-NEXT: s_or_b32 s76, s77, s76 +; SI-NEXT: v_readlane_b32 s77, v61, 57 +; SI-NEXT: s_add_i32 s57, s6, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 32 +; SI-NEXT: s_add_i32 s11, s72, 0x300 +; SI-NEXT: s_add_i32 s72, s79, 0x300 +; SI-NEXT: s_add_i32 s37, s77, 3 +; SI-NEXT: v_readlane_b32 s79, v61, 56 +; SI-NEXT: s_and_b32 s6, s57, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s77, s37, 0xff +; SI-NEXT: s_lshl_b32 s79, s79, 8 +; SI-NEXT: s_or_b32 s88, s7, s6 +; SI-NEXT: s_or_b32 s77, s79, s77 +; SI-NEXT: v_readlane_b32 s79, v61, 55 +; SI-NEXT: s_add_i32 s21, s89, 0x300 +; SI-NEXT: s_add_i32 s89, s88, 0x300 +; SI-NEXT: s_add_i32 s35, s79, 3 +; SI-NEXT: v_readlane_b32 s88, v61, 54 +; SI-NEXT: s_and_b32 s79, s35, 0xff +; SI-NEXT: s_lshl_b32 s88, s88, 8 +; SI-NEXT: s_or_b32 s79, s88, s79 +; SI-NEXT: v_readlane_b32 s88, v61, 53 +; SI-NEXT: s_add_i32 s25, s92, 0x300 +; SI-NEXT: s_add_i32 s30, s88, 3 +; SI-NEXT: v_readlane_b32 s92, v61, 52 +; SI-NEXT: 
s_and_b32 s88, s30, 0xff +; SI-NEXT: s_lshl_b32 s92, s92, 8 +; SI-NEXT: s_or_b32 s88, s92, s88 +; SI-NEXT: v_readlane_b32 s92, v61, 51 +; SI-NEXT: s_add_i32 s94, s92, 3 +; SI-NEXT: s_and_b32 s92, s94, 0xff +; SI-NEXT: s_lshl_b32 s91, s91, 8 +; SI-NEXT: s_add_i32 s90, s90, 3 +; SI-NEXT: s_or_b32 s91, s91, s92 +; SI-NEXT: s_and_b32 s90, s90, 0xff +; SI-NEXT: s_lshl_b32 s92, s31, 8 +; SI-NEXT: s_or_b32 s90, s92, s90 +; SI-NEXT: v_readlane_b32 s92, v61, 50 +; SI-NEXT: s_add_i32 s92, s92, 3 +; SI-NEXT: s_add_i32 s26, s93, 0x300 +; SI-NEXT: s_and_b32 s92, s92, 0xff +; SI-NEXT: s_lshl_b32 s93, s95, 8 +; SI-NEXT: s_or_b32 s92, s93, s92 +; SI-NEXT: v_readlane_b32 s93, v61, 49 +; SI-NEXT: s_add_i32 s93, s93, 3 +; SI-NEXT: s_and_b32 s93, s93, 0xff +; SI-NEXT: s_lshl_b32 s94, s55, 8 +; SI-NEXT: s_or_b32 s93, s94, s93 +; SI-NEXT: v_readlane_b32 s94, v61, 48 +; SI-NEXT: s_add_i32 s94, s94, 3 +; SI-NEXT: s_and_b32 s94, s94, 0xff +; SI-NEXT: s_lshl_b32 s95, s99, 8 +; SI-NEXT: s_or_b32 s94, s95, s94 +; SI-NEXT: v_readlane_b32 s95, v61, 1 +; SI-NEXT: s_add_i32 s95, s95, 3 +; SI-NEXT: v_readlane_b32 s30, v61, 0 +; SI-NEXT: s_add_i32 s6, vcc_lo, 0x300 +; SI-NEXT: s_and_b32 s95, s95, 0xff +; SI-NEXT: s_lshl_b32 vcc_lo, s30, 8 +; SI-NEXT: v_readlane_b32 s30, v61, 47 +; SI-NEXT: s_or_b32 s95, vcc_lo, s95 +; SI-NEXT: s_add_i32 vcc_lo, s30, 3 +; SI-NEXT: v_readlane_b32 s30, v61, 2 +; SI-NEXT: s_add_i32 s28, vcc_hi, 0x300 +; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 vcc_hi, s30, 8 +; SI-NEXT: v_readlane_b32 s30, v61, 46 +; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo +; SI-NEXT: s_add_i32 vcc_hi, s30, 3 +; SI-NEXT: v_readlane_b32 s30, v61, 45 +; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s30, s30, 8 +; SI-NEXT: s_or_b32 vcc_hi, s30, vcc_hi +; SI-NEXT: v_readlane_b32 s30, v61, 44 +; SI-NEXT: s_add_i32 s30, s30, 3 +; SI-NEXT: v_readlane_b32 s31, v61, 43 +; SI-NEXT: s_and_b32 s30, s30, 0xff +; SI-NEXT: s_lshl_b32 s31, s31, 8 +; SI-NEXT: s_or_b32 s30, s31, 
s30 +; SI-NEXT: v_readlane_b32 s31, v61, 42 +; SI-NEXT: s_add_i32 s29, s34, 0x300 +; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: v_readlane_b32 s34, v61, 41 +; SI-NEXT: s_and_b32 s31, s31, 0xff +; SI-NEXT: s_lshl_b32 s34, s34, 8 +; SI-NEXT: s_or_b32 s31, s34, s31 +; SI-NEXT: s_addk_i32 s31, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s31 +; SI-NEXT: s_addk_i32 s30, 0x300 +; SI-NEXT: s_addk_i32 vcc_hi, 0x300 +; SI-NEXT: v_readlane_b32 s34, v61, 40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s30 +; SI-NEXT: s_add_i32 s34, s34, 3 +; SI-NEXT: v_readlane_b32 s35, v61, 39 +; SI-NEXT: s_and_b32 s34, s34, 0xff +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi +; SI-NEXT: s_lshl_b32 s35, s35, 8 +; SI-NEXT: s_addk_i32 vcc_lo, 0x300 +; SI-NEXT: s_or_b32 s34, s35, s34 +; SI-NEXT: v_readlane_b32 s35, v61, 38 +; SI-NEXT: s_add_i32 s35, s35, 3 +; SI-NEXT: v_readlane_b32 s36, v61, 37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_lo +; SI-NEXT: s_and_b32 s35, s35, 0xff +; SI-NEXT: s_lshl_b32 s36, s36, 8 +; SI-NEXT: s_or_b32 s35, s36, s35 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_add_i32 s7, s60, 0x300 +; SI-NEXT: s_add_i32 s8, s61, 0x300 +; SI-NEXT: s_add_i32 s9, s62, 0x300 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_addk_i32 s22, 0x300 +; SI-NEXT: s_add_i32 s40, s39, 0x300 +; SI-NEXT: s_add_i32 s41, s52, 0x300 +; SI-NEXT: s_add_i32 s42, s53, 0x300 +; SI-NEXT: s_add_i32 s43, s64, 0x300 +; SI-NEXT: s_add_i32 s44, s66, 0x300 +; SI-NEXT: s_add_i32 s45, s67, 0x300 +; SI-NEXT: s_add_i32 s46, s68, 0x300 +; SI-NEXT: s_add_i32 s47, s69, 0x300 +; SI-NEXT: s_add_i32 s56, s70, 0x300 +; SI-NEXT: s_add_i32 s57, s71, 0x300 +; SI-NEXT: 
s_add_i32 s58, s81, 0x300 +; SI-NEXT: s_add_i32 s59, s83, 0x300 +; SI-NEXT: s_add_i32 s60, s85, 0x300 +; SI-NEXT: s_add_i32 s61, s96, 0x300 +; SI-NEXT: s_add_i32 s62, s97, 0x300 +; SI-NEXT: s_addk_i32 s63, 0x300 +; SI-NEXT: s_addk_i32 s78, 0x300 +; SI-NEXT: s_addk_i32 s23, 0x300 +; SI-NEXT: s_addk_i32 s19, 0x300 +; SI-NEXT: s_addk_i32 s17, 0x300 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_addk_i32 s24, 0x300 +; SI-NEXT: s_addk_i32 s27, 0x300 +; SI-NEXT: s_addk_i32 s73, 0x300 +; SI-NEXT: s_addk_i32 s74, 0x300 +; SI-NEXT: s_addk_i32 s75, 0x300 +; SI-NEXT: s_addk_i32 s76, 0x300 +; SI-NEXT: s_addk_i32 s77, 0x300 +; SI-NEXT: s_addk_i32 s79, 0x300 +; SI-NEXT: s_addk_i32 s88, 0x300 +; SI-NEXT: s_addk_i32 s91, 0x300 +; SI-NEXT: s_addk_i32 s90, 0x300 +; SI-NEXT: s_addk_i32 s92, 0x300 +; SI-NEXT: s_addk_i32 s93, 0x300 +; SI-NEXT: s_addk_i32 s94, 0x300 +; SI-NEXT: s_addk_i32 s95, 0x300 +; SI-NEXT: s_addk_i32 s34, 0x300 +; SI-NEXT: s_addk_i32 s35, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s35 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s89 +; SI-NEXT: 
v_cvt_f32_f16_e32 v29, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 
v1, 16, v1 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 +; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v18 +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v24 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 
+; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: 
v_add_i32_e32 v7, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x74, v0 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_mov_b32 
s17, s19 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_mov_b32 s19, s50 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_mov_b32 s23, s48 +; SI-NEXT: s_mov_b32 s26, s37 +; SI-NEXT: s_mov_b32 s28, s29 +; SI-NEXT: s_mov_b32 s29, s76 +; SI-NEXT: s_mov_b32 s59, s58 +; SI-NEXT: s_mov_b32 s56, s47 +; SI-NEXT: s_mov_b32 s46, s41 +; SI-NEXT: s_mov_b32 s12, s11 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s97 +; SI-NEXT: s_mov_b32 s97, s81 +; SI-NEXT: s_mov_b32 s81, s85 +; SI-NEXT: s_mov_b32 s6, s40 +; SI-NEXT: s_mov_b32 s40, s72 +; SI-NEXT: s_mov_b32 s45, s73 +; SI-NEXT: s_mov_b32 s15, s89 +; SI-NEXT: s_mov_b32 s24, s98 +; SI-NEXT: s_mov_b32 s20, s88 +; SI-NEXT: s_mov_b32 s99, s55 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_readlane_b32 s75, v61, 7 +; SI-NEXT: v_readlane_b32 s76, v61, 3 +; SI-NEXT: v_readlane_b32 s77, v61, 5 +; SI-NEXT: v_readlane_b32 s78, v61, 4 +; SI-NEXT: v_readlane_b32 s92, v61, 6 +; SI-NEXT: v_readlane_b32 s39, v61, 9 +; SI-NEXT: v_readlane_b32 s37, v61, 8 +; SI-NEXT: v_readlane_b32 s30, v61, 10 +; SI-NEXT: v_readlane_b32 s48, v61, 11 +; SI-NEXT: v_readlane_b32 s52, v61, 13 +; SI-NEXT: v_readlane_b32 s35, v61, 12 +; SI-NEXT: v_readlane_b32 s50, v61, 15 +; SI-NEXT: v_readlane_b32 s64, v61, 14 +; SI-NEXT: v_readlane_b32 s54, v61, 17 +; SI-NEXT: v_readlane_b32 s67, v61, 16 +; SI-NEXT: v_readlane_b32 s65, v61, 18 +; SI-NEXT: v_readlane_b32 s70, v61, 19 +; SI-NEXT: v_readlane_b32 s49, v61, 21 +; SI-NEXT: v_readlane_b32 s71, v61, 20 +; SI-NEXT: v_readlane_b32 s80, v61, 23 +; SI-NEXT: v_readlane_b32 s83, v61, 22 +; SI-NEXT: v_readlane_b32 s84, v61, 25 +; SI-NEXT: v_readlane_b32 s82, v61, 24 +; SI-NEXT: v_readlane_b32 s87, v61, 26 +; SI-NEXT: v_readlane_b32 s86, v61, 27 +; SI-NEXT: v_readlane_b32 s96, v61, 29 +; SI-NEXT: v_readlane_b32 s51, v61, 28 +; SI-NEXT: s_mov_b32 s55, s93 +; SI-NEXT: s_mov_b32 s95, s91 +; SI-NEXT: v_readlane_b32 s94, v61, 31 +; SI-NEXT: s_mov_b32 
s31, s90 +; SI-NEXT: v_readlane_b32 s34, v61, 30 +; SI-NEXT: v_readlane_b32 s53, v61, 32 +; SI-NEXT: v_readlane_b32 s66, v61, 33 +; SI-NEXT: v_readlane_b32 s68, v61, 34 +; SI-NEXT: v_readlane_b32 s69, v61, 35 +; SI-NEXT: v_readlane_b32 s8, v61, 36 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: 
$vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_branch .LBB93_2 +; +; VI-LABEL: bitcast_v128i8_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, 
s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 
offset:72 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, 
v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 +; VI-NEXT: 
buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 
offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 
offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB93_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, v8 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte 
Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v32, v1 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:592 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v37, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v52, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v63, v0 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v57, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB93_3 +; VI-NEXT: .LBB93_2: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 +; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s27, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s9, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; 
VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 +; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 +; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 +; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v23, v41, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 +; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v20, v49, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v31, v2, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 +; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt 
vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa 
v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 +; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 +; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 +; 
VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 +; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v9, v9, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 +; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 +; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 
+; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v44, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword 
v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) 
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 +; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v47, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v47, s4, v47 +; VI-NEXT: s_and_b32 s4, s26, 0xff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s16, 0xff +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v56 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v32, v16, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v32 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; 
VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v47 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 +; VI-NEXT: .LBB93_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB93_4: +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v63, v3 +; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB93_2 +; +; GFX9-LABEL: bitcast_v128i8_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 
offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; 
GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 
v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 +; 
GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(36) +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill 
+; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, 
off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded 
Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v39, v16 +; GFX9-NEXT: v_or_b32_sdwa v17, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v42, v61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v55, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v50, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v49, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload 
+; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, 
v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_mov_b32_e32 v46, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: 
s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v45 +; GFX9-NEXT: v_mov_b32_e32 v45, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v54, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB93_3 +; GFX9-NEXT: .LBB93_2: +; GFX9-NEXT: buffer_load_dword v55, 
off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB93_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB93_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_lshl_b32 s6, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshl_b32 s9, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_lshl_b32 s10, s19, 8 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v25, v37, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v37, v51, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v4, 
v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s16, 0xff +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_and_b32 s9, s18, 0xff +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; 
GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: 
v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v39, v36, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v23, v41, v23 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 +; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_mov_b32_e32 v0, 
s8 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 +; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 +; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 +; GFX9-NEXT: v_add_u32_e32 v39, 0x300, v49 +; GFX9-NEXT: v_add_u32_e32 v49, 0x300, v53 +; GFX9-NEXT: v_add_u32_e32 v50, 0x300, v55 +; GFX9-NEXT: v_add_u32_e32 v53, 0x300, v45 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, 
v48, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 +; GFX9-NEXT: .LBB93_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 
offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:252 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:192 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, 
off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v99, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 
vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB93_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v2, 0xffff, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v68 +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v6, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v2, 8, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v148 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v20, 16, v21 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v149 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v151 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v161 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v163 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v165 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v167 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v44 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v63 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v75 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v78 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v79 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v91 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v92 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB93_3 +; GFX11-TRUE16-NEXT: .LBB93_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, 
s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v88 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v78 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v77 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v0, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v76 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v74 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v61 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v59, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v60, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v57 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v58 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v57, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v56, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v45 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, 
v46 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v45, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v44 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v41 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v180 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v177 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v167, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v176, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v165 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v166 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v165, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v161 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v162 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v149 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v81 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v9, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v70, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v69, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v64, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v53, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v38, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v37, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v34, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v53, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v49, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v165, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v161, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v60 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v14, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v59, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v35 +; GFX11-TRUE16-NEXT: .LBB93_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, 
off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB93_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB93_2 +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 
offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: 
v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 
v75, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72 +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB93_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: 
s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 
0xff, v162 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB93_3 +; GFX11-FAKE16-NEXT: .LBB93_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v94, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v3, v90, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v181, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v1, v75, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v54, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v52, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v53, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v50, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v49, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v49, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v48, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 
0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 -; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v36, v38, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v32, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v39, 0x300, v1 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, 
v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 -; GFX11-FAKE16-NEXT: .LBB46_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 
offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:576 -; GFX11-FAKE16-NEXT: 
scratch_load_b32 v40, off, s32 offset:580 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v65, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v64, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 +; GFX11-FAKE16-NEXT: .LBB93_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB93_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB93_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ 
-93250,2056 +190593,1992 @@ end: } define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v128i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; GCN-NEXT: 
buffer_load_dword v48, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) 
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v16 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v20 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v24 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v23 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v25 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v28 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v27 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v29 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v44 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v37 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v36 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v62 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v53 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v61 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v59 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v56 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v57 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v43 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v42 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v41 -; GCN-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v40 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v55 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v54 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v52 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v51 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v50 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v38 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v35 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v8 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v46 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v15, off, 
s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v47 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v14 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v15 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v16 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; 
GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; 
implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; 
GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; kill: killed $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; GCN-NEXT: v_or_b32_e32 v61, v32, v14 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v35 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v15, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v55, v5, v6 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v59 -; GCN-NEXT: v_or_b32_e32 v43, v7, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v63 -; GCN-NEXT: v_or_b32_e32 v41, v9, v5 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v23 -; GCN-NEXT: v_or_b32_e32 v47, v10, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v2, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; GCN-NEXT: v_or_b32_e32 v58, v11, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_or_b32_e32 v54, v4, v1 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_or_b32_e32 v60, v12, v1 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; GCN-NEXT: v_or_b32_e32 v44, v13, v2 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v5, v1 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, 
v15 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v34 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v45, v6, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v36 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v7, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v37 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v46, v9, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v10, v1 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v48 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v56, v11, v5 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v49 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v12, v3 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v50 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v57, v13, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v52 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v33, v7 -; GCN-NEXT: v_bfe_u32 v7, v35, 8, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v7, v4 -; GCN-NEXT: v_bfe_u32 v4, v59, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v59, v4, v9 -; GCN-NEXT: v_bfe_u32 v4, v23, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v4, v1 -; GCN-NEXT: v_bfe_u32 v1, v32, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v1, v10 -; GCN-NEXT: v_bfe_u32 v1, v31, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_or_b32_e32 v9, v1, v5 -; GCN-NEXT: v_bfe_u32 v1, v22, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v1, v11 -; GCN-NEXT: v_bfe_u32 v1, v16, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v1, v3 -; GCN-NEXT: v_bfe_u32 v1, v15, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v15, v8, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v1, v21, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v1, v6 -; GCN-NEXT: v_bfe_u32 v1, v20, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v16, v38, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v1, v18, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v29, v2 -; GCN-NEXT: v_bfe_u32 v2, v14, 8, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v2, v24, 8, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v2, v30, 8, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v2, v19, 8, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v62, v61, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v62, v61, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v62, v61, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v43, v55, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v43, v55, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v43, v55, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v47, v41, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v47, v41, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v47, v41, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v58, v40, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v58, v40, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v58, v40, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill 
-; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v60, v54, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v60, v54, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v60, v54, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v51, v44, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v51, v44, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v51, v44, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v25, v45, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v25, v45, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v25, v45, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v26, v46, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v26, v46, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v26, v46, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_alignbit_b32 v2, v27, v56, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v27, v56, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v27, v56, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v28, v57, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v28, v57, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v28, v57, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v35, v42, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v35, v42, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v35, v42, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v23, v59, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v23, v59, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v23, v59, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v9, 
v7, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v9, v7, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v2, v9, v7, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v3, v15, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v3, v15, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v3, v15, 8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v1, v16, 24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v1, v16, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v1, v16, 8 -; GCN-NEXT: buffer_store_dword v2, 
off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v62 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v43 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v47 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v58 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v60 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v51 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v23 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v9 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v5 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v53, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v14, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v1, v17, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed 
$vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; kill: killed $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: .LBB47_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v38 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; 
GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v29 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v15 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v16 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v15 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v19 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v8 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v14 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v8, v49 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v16 -; GCN-NEXT: v_or_b32_e32 v8, v8, v19 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v48 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v19, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword 
v14, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v18 -; GCN-NEXT: v_or_b32_e32 v8, v8, v21 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v39 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 
v43, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: 
v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; GCN-NEXT: v_add_f32_e32 v57, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; GCN-NEXT: v_add_f32_e32 v58, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v59, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; GCN-NEXT: v_add_f32_e32 v61, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; GCN-NEXT: v_add_f32_e32 v62, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; GCN-NEXT: v_add_f32_e32 v63, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v27 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v9, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v53 -; GCN-NEXT: v_mov_b32_e32 v53, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v33 -; GCN-NEXT: v_mov_b32_e32 v33, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; 
GCN-NEXT: v_or_b32_e32 v59, v23, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GCN-NEXT: v_or_b32_e32 v23, v19, v5 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; GCN-NEXT: v_or_b32_e32 v42, v9, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; GCN-NEXT: v_or_b32_e32 v35, v35, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: v_or_b32_e32 v57, v28, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; GCN-NEXT: v_or_b32_e32 v28, v27, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v36 -; GCN-NEXT: v_or_b32_e32 v56, v13, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v29 -; GCN-NEXT: v_or_b32_e32 v27, v22, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; GCN-NEXT: v_or_b32_e32 v46, v24, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v32 -; GCN-NEXT: v_or_b32_e32 v26, v26, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v45, v34, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v37 -; GCN-NEXT: v_or_b32_e32 v25, v25, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v62 -; GCN-NEXT: v_or_b32_e32 v44, v12, v14 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_bfe_u32 v5, v62, 8, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v51, v40, v38 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v5, v18, 8, 8 
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v54, v54, v13 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v5, v60, 8, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_or_b32_e32 v60, v52, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v1, v1, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v40, v50, v22 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v58, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_or_b32_e32 v58, v43, v30 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v41, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_or_b32_e32 v41, v61, v24 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v55, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v47, v47, v32 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v49, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v55, v4, v31 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:256 ; 4-byte 
Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v48, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v43, v3, v36 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v11, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v61, v2, v34 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v21, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v62, v15, v37 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v20, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v53, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_bfe_u32 v1, v33, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_bfe_u32 v1, v19, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v62, v61, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v62, v61, 16 -; 
GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v62, v61, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v43, v55, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v43, v55, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v33, v43, v55, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v47, v41, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v47, v41, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v47, v41, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v58, v40, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v58, v40, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v58, v40, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v60, v54, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v60, v54, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v60, 
v54, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v44, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v44, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v44, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v25, v45, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v25, v45, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v25, v45, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v26, v46, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v26, v46, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v26, v46, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v27, v56, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v27, v56, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v27, v56, 8 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v28, v57, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v28, v57, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v28, v57, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v35, v42, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v35, v42, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v35, v42, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v23, v59, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v23, v59, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v23, v59, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 16 -; GCN-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 8 -; GCN-NEXT: v_mov_b32_e32 v5, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 8 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v3, v15, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v3, v15, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v3, v15, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v16, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v16, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_alignbit_b32 v1, v2, v16, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; GCN-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v53, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v1, v17, 8, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: .LBB47_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v61 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v62 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v33 -; GCN-NEXT: v_or_b32_e32 v29, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v43 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v30, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v41 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v2, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v61, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v40 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v62, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v54 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v44 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v8, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v9, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v46 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v11, 
v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v56 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v22, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v57 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v24, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v25, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v42 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v26, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v27, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v28, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v23 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 
s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v23, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_mov_b32_e32 v5, v19 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v21, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v17 -; GCN-NEXT: v_or_b32_e32 v17, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v20, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v15, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v18, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v16, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v31, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v32, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v34, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v35, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v36, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v37, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v38, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v39, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v48, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v49, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v50, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v51, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v52, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v54, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v55, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v42, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v43, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v60, v3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v4, v1, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v5, v1, v33 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v63, v2, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v61 -; GCN-NEXT: v_or_b32_e32 v61, v3, v34 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v6, v6, v35 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v36 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v29, v37 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_and_b32_e32 v30, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v30, v38 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v39 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v48 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v50 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v52 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v13, v13, v54 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v22, v22, v55 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v40 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v25, v25, v41 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v42 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v27, v27, v43 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 -; GCN-NEXT: 
v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v23, v23, v44 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v19, v19, v45 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v21, v21, v46 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v17, v17, v47 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v20, v20, v56 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v15, v15, v57 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v18, v58 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v59 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v16, v60 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 
offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64f16_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, 
off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; 
SI-NEXT: v_cvt_f16_f32_e32 v60, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: 
s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v46, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; 
SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; 
kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed 
$vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_mov_b32_e32 v45, v46 +; SI-NEXT: v_mov_b32_e32 v46, v6 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v44, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v41, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v54, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v53, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v51, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v52, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v49, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v50, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v48, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v41, v44, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v37, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:584 ; 4-byte Folded 
Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v38, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v41, v44, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v35, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v36, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v34, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v53, v54, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:628 ; 
4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v53, v54, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v53, v54, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v24, v47, v14 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v25, v5, v14 +; SI-NEXT: v_alignbit_b32 v5, v52, v51, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 +; SI-NEXT: v_or_b32_e32 v22, v58, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56 +; SI-NEXT: v_or_b32_e32 v23, v57, v5 +; SI-NEXT: v_alignbit_b32 v5, v52, v51, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v20, 
v61, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v21, v60, v5 +; SI-NEXT: v_alignbit_b32 v5, v52, v51, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v18, v40, v5 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_or_b32_e32 v19, v55, v5 +; SI-NEXT: v_alignbit_b32 v5, v50, v49, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 +; SI-NEXT: v_or_b32_e32 v16, v1, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_or_b32_e32 v17, v42, v1 +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v14, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_or_b32_e32 v15, v2, v1 +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v30, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v30, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v30, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v26, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v26, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v24, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v24, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v24, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v20, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v20, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v20, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, 
v19, v18, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v56, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v47, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v46, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: 
$vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: .LBB94_2: ; %Flow +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v14, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v16, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v17, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v18, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v19, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v20, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 
v55, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v21, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v22, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_or_b32_e32 v24, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte 
Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v26, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v28, v4, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v27, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_or_b32_e32 v29, v1, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_or_b32_e32 v30, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v34, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v35, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v37, v4, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 
0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v36, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_or_b32_e32 v38, v1, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v48, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v39, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v49, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v51, v4, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v50, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_or_b32_e32 v52, v1, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_or_b32_e32 v54, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v53, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v44, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_or_b32_e32 v41, v3, v1 +; SI-NEXT: v_alignbit_b32 v1, v41, v44, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v44, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v52, v51, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v52, v51, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v52, v51, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v37, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_alignbit_b32 v1, v36, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v30, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v30, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v30, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v26, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v26, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v24, 24 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v24, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v24, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v20, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v20, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v20, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v42, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v55, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v46, 8, 8 +; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 +; SI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: .LBB94_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 
0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 
v2, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], 
s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, 
v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v128i8: ; VI: ; %bb.0: @@ -95500,7 +192779,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill @@ -95644,9 +192923,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v33, v31 ; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[19:20] ; VI-NEXT: v_mov_b32_e32 v51, v34 -; VI-NEXT: .LBB47_2: ; %Flow +; VI-NEXT: .LBB94_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB47_4 +; VI-NEXT: s_cbranch_execz .LBB94_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v63, 0x200 ; VI-NEXT: v_add_f16_sdwa v31, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -95984,7 +193263,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v32, v41, 8, 8 ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: .LBB47_4: ; %end +; VI-NEXT: .LBB94_4: ; %end ; VI-NEXT: s_or_b64 exec, 
exec, s[4:5] ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -96576,7 +193855,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill @@ -96759,9 +194038,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 ; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] ; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; GFX9-NEXT: .LBB47_2: ; %Flow +; GFX9-NEXT: .LBB94_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB47_4 +; GFX9-NEXT: s_cbranch_execz .LBB94_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] @@ -96977,7 +194256,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 -; GFX9-NEXT: .LBB47_4: ; %end +; GFX9-NEXT: .LBB94_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -97417,7 +194696,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; 
%cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -97484,9 +194763,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 -; GFX11-TRUE16-NEXT: .LBB47_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB94_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB47_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] @@ -97586,7 +194865,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 -; GFX11-TRUE16-NEXT: .LBB47_4: ; %end +; GFX11-TRUE16-NEXT: .LBB94_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l @@ -98034,7 +195313,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] @@ -98133,462 +195412,7092 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] 
-; GFX11-FAKE16-NEXT: .LBB47_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB94_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB47_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 
op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 
0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v147, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v134, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 +; GFX11-FAKE16-NEXT: .LBB94_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v63 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v54, v67, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v64, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, 
v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v151 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v64, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v73 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v39, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v41 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v182 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v180 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v113 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v33, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off 
offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <128 x i8> + br 
label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, 
s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: v_mov_b32_e32 v59, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v51 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v52 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v51, v55 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v40 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; 
SI-NEXT: v_cvt_f16_f32_e32 v40, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:452 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB95_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v13, v13, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v44 +; SI-NEXT: v_or_b32_e32 v55, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v57, v16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_or_b32_e32 v17, v14, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_or_b32_e32 v16, v19, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_or_b32_e32 v19, v23, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; SI-NEXT: v_or_b32_e32 v47, v60, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_or_b32_e32 v43, v42, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_or_b32_e32 v14, v63, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 +; SI-NEXT: v_or_b32_e32 v42, v58, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v60, v12, 
v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_or_b32_e32 v22, v2, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_or_b32_e32 v12, v46, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v2, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_or_b32_e32 v34, v34, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_or_b32_e32 v3, v59, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_or_b32_e32 v59, v56, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v62, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_or_b32_e32 v62, v25, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v2, v27, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_or_b32_e32 v25, v28, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v36, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_or_b32_e32 v23, v35, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v39, v11 +; SI-NEXT: v_mov_b32_e32 v36, v2 +; SI-NEXT: v_mov_b32_e32 v35, v1 +; SI-NEXT: v_alignbit_b32 v1, v55, v13, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v13, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v13, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v57, 24 +; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v57, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v57, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v16, v21, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v16, v21, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v16, v21, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v19, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v19, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v14, v43, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v14, v43, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v14, v43, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v60, v42, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v60, v42, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v60, v42, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v24, v22, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v24, v22, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v24, v22, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v4, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v4, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v34, v4, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v59, v3, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v59, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v59, v3, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_or_b32_e32 v61, v50, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v49 +; SI-NEXT: v_or_b32_e32 v2, v48, v11 +; SI-NEXT: 
buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v62, v49, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v62, v49, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v62, v49, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v36, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v25, v36, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v23, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v61, v18, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v61, v18, 16 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_or_b32_e32 v58, v54, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_or_b32_e32 v6, v53, v11 +; SI-NEXT: 
buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v61, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v58, v2, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v58, v2, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v59 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v61 +; SI-NEXT: v_or_b32_e32 v54, v40, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v44, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v15, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v29, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v38, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v48, 8, 8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v37, 8, 8 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v52, 8, 8 +; SI-NEXT: v_alignbit_b32 v28, v58, v2, 24 +; SI-NEXT: v_alignbit_b32 v2, v54, v6, 24 +; SI-NEXT: v_alignbit_b32 v39, v54, v6, 16 +; SI-NEXT: v_alignbit_b32 v40, v54, v6, 8 +; SI-NEXT: v_alignbit_b32 v27, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v56, v12, v11, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 8 +; SI-NEXT: v_mov_b32_e32 v20, v29 +; SI-NEXT: 
v_mov_b32_e32 v15, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v9, v7 +; SI-NEXT: v_bfe_u32 v29, v7, 8, 8 +; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: v_mov_b32_e32 v8, v5 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_branch .LBB95_3 +; SI-NEXT: .LBB95_2: +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed 
$vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: 
killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v20, v29 +; SI-NEXT: v_mov_b32_e32 v15, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v9, v7 +; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: v_mov_b32_e32 v8, v5 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; 
implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: .LBB95_3: ; %Flow +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, v44 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, 
s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v5, v8 +; SI-NEXT: v_mov_b32_e32 v6, v7 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: v_mov_b32_e32 v8, v10 +; SI-NEXT: v_mov_b32_e32 v9, v31 +; SI-NEXT: v_mov_b32_e32 v31, v33 +; SI-NEXT: v_mov_b32_e32 v44, v15 +; SI-NEXT: v_mov_b32_e32 v33, v20 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v50, v2 +; SI-NEXT: v_mov_b32_e32 v53, v40 +; SI-NEXT: v_mov_b32_e32 v40, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mov_b32_e32 v2, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v11, v27 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_mov_b32_e32 v27, v52 +; SI-NEXT: v_mov_b32_e32 v30, v29 +; SI-NEXT: v_mov_b32_e32 v29, v26 +; SI-NEXT: s_cbranch_vccnz .LBB95_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: 
v_lshlrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; 
SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; SI-NEXT: v_or_b32_e32 v56, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v36, v14, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v54, v14, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; 
SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v52, v17, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v58, v17, v19 +; SI-NEXT: v_alignbit_b32 v40, v58, v52, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v11, v21, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded 
Reload +; SI-NEXT: v_or_b32_e32 v61, v21, v22 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v16, v23, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v48, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: 
v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v53, v26, v27 +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v62, v28, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v27, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v59, v29, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v3, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v44 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: 
v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v4, v34, v30 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_or_b32_e32 v34, v35, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v46 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v22, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_mov_b32_e32 v35, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v24, v37, v36 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v38 +; SI-NEXT: v_or_b32_e32 v42, v39, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 +; SI-NEXT: v_mov_b32_e32 v36, v48 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v60, v37, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_alignbit_b32 v39, v54, v29, 16 +; SI-NEXT: v_or_b32_e32 v43, v48, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:560 ; 4-byte Folded 
Reload +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v14, v49, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v28, v14, v43, 8 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_or_b32_e32 v19, v48, v37 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 +; SI-NEXT: v_or_b32_e32 v47, v49, v37 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; SI-NEXT: v_or_b32_e32 v21, v50, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; 
SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_or_b32_e32 v16, v37, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_alignbit_b32 v50, v54, v29, 24 +; SI-NEXT: v_or_b32_e32 v57, v48, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_or_b32_e32 v17, v49, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_mov_b32_e32 v49, v53 +; SI-NEXT: v_alignbit_b32 v53, v54, v29, 8 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_or_b32_e32 v13, v48, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_or_b32_e32 v55, v51, v37 +; SI-NEXT: v_alignbit_b32 v10, v55, v13, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v55, v13, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v17, v57, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v17, v57, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 
offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v17, v57, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v16, v21, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v16, v21, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v16, v21, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v47, v19, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v47, v19, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v47, v19, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v14, v43, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v14, v43, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v60, v42, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v60, v42, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v60, v42, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v24, v22, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v24, v22, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v24, v22, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v34, v4, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v34, v4, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v34, v4, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v59, v3, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v59, v3, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v59, v3, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v62, v49, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v62, v49, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v62, v49, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, 
v25, v36, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v25, v36, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v25, v36, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v23, v35, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v23, v35, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v23, v35, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v61, v11, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v61, v11, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v10, v61, v11, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v10, v58, v52, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v10, v58, v52, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v56 +; SI-NEXT: v_alignbit_b32 v11, 
v12, v10, 24 +; SI-NEXT: v_alignbit_b32 v56, v12, v10, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v38, v12, v10, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v55 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v17 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v47 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v14 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v60 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v59 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v62 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v23 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshrrev_b32_e32 v10, 8, v61 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v58 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v54 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v12 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v20, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v18, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v15, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v33, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v44, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v31, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v30, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v9, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v8, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v6, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v5, 8, 8 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_bfe_u32 v10, v26, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v2, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v1, 8, 8 +; SI-NEXT: v_alignbit_b32 v48, v55, v13, 24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 +; SI-NEXT: v_bfe_u32 v30, v7, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v27, 8, 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: .LBB95_5: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v37, 0xff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v48 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, 
v7 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v37, v37, v51 +; SI-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v51, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_or_b32_e32 v37, v37, v51 +; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v37, 0xff, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v37, v37, v51 +; SI-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v10 +; SI-NEXT: v_or_b32_e32 v20, v48, v20 +; SI-NEXT: v_or_b32_e32 v20, v37, v20 +; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v20, v37, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v37, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:348 
; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v10 +; SI-NEXT: v_or_b32_e32 v37, v48, v37 +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v20, v37, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v20, v37 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v37, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v10 +; SI-NEXT: v_or_b32_e32 v20, v37, v20 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v16 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v15, v20, v15 +; SI-NEXT: v_or_b32_e32 v15, v18, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 
v18, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v28 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_and_b32_e32 v18, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen 
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v13 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v10, v15, v10 +; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v13 +; SI-NEXT: v_or_b32_e32 v15, v18, v15 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v29 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v13 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: 
v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v3 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v15, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v3 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v4 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v36 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v7, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v35 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v4 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, 
off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v40 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; 
SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 
; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, 
s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; 
VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v63, s68, 20 +; VI-NEXT: v_writelane_b32 v63, s69, 21 +; VI-NEXT: v_writelane_b32 v63, s70, 22 +; VI-NEXT: v_writelane_b32 v63, s71, 23 +; VI-NEXT: v_writelane_b32 v63, s80, 24 +; VI-NEXT: v_writelane_b32 v63, s81, 25 +; VI-NEXT: v_writelane_b32 v63, s82, 26 +; VI-NEXT: v_writelane_b32 v63, s83, 27 +; VI-NEXT: v_writelane_b32 v63, s84, 28 +; VI-NEXT: v_writelane_b32 v63, s85, 29 +; VI-NEXT: v_writelane_b32 v63, s86, 30 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v63, s87, 31 +; VI-NEXT: v_readfirstlane_b32 s44, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s42, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s40, v7 +; VI-NEXT: v_readfirstlane_b32 s41, v8 +; VI-NEXT: v_readfirstlane_b32 s14, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s12, v11 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s10, v13 +; VI-NEXT: v_readfirstlane_b32 s11, v14 +; VI-NEXT: v_readfirstlane_b32 s8, v15 +; VI-NEXT: v_readfirstlane_b32 s9, v16 +; VI-NEXT: v_readfirstlane_b32 s6, v17 +; VI-NEXT: v_readfirstlane_b32 s7, v18 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB95_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 36 
+; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s25, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s24, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s24, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s23, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s22, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s22, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s21, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s20, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s20, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s19, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s19, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s17, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s17, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s17, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 0 +; VI-NEXT: s_lshr_b32 s46, s16, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 24 +; VI-NEXT: 
s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v62, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s41, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s40, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s43, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s42, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v62, s46, 57 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: s_lshr_b32 s52, s7, 16 +; VI-NEXT: s_lshr_b32 s53, s6, 16 +; VI-NEXT: s_lshr_b32 s84, s9, 16 +; VI-NEXT: s_lshr_b32 s85, s8, 16 +; VI-NEXT: s_lshr_b32 s80, s11, 24 +; VI-NEXT: s_lshr_b32 s86, s11, 16 +; VI-NEXT: s_lshr_b32 s87, s10, 16 +; VI-NEXT: s_lshr_b32 s81, s13, 24 +; VI-NEXT: s_lshr_b32 s54, s13, 16 +; VI-NEXT: s_lshr_b32 s55, s12, 16 +; VI-NEXT: s_lshr_b32 s82, s15, 24 +; VI-NEXT: s_lshr_b32 s64, s15, 16 +; VI-NEXT: s_lshr_b32 s65, s14, 16 +; VI-NEXT: s_lshr_b32 s83, s41, 24 +; VI-NEXT: s_lshr_b32 s66, s41, 16 +; VI-NEXT: s_lshr_b32 s67, s40, 16 +; VI-NEXT: s_lshr_b32 s50, s43, 24 +; VI-NEXT: s_lshr_b32 s68, s43, 16 +; VI-NEXT: s_lshr_b32 s69, s42, 16 +; VI-NEXT: s_lshr_b32 s51, s45, 24 +; VI-NEXT: s_lshr_b32 s70, s45, 16 +; VI-NEXT: 
s_lshr_b32 s71, s44, 16 +; VI-NEXT: v_writelane_b32 v62, s46, 56 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 +; VI-NEXT: s_cbranch_execnz .LBB95_4 +; VI-NEXT: .LBB95_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_mov_b32_e32 v7, 0x200 +; VI-NEXT: v_add_f16_e32 v1, s46, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s45, v7 +; VI-NEXT: s_lshr_b32 s45, s44, 16 +; VI-NEXT: v_or_b32_e32 v23, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s45, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s44, v7 +; VI-NEXT: s_lshr_b32 s44, s43, 16 +; VI-NEXT: v_or_b32_e32 v22, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s44, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s43, v7 +; VI-NEXT: s_lshr_b32 s43, s42, 16 +; VI-NEXT: v_or_b32_e32 v25, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s43, v7 +; VI-NEXT: buffer_store_dword 
v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s42, v7 +; VI-NEXT: s_lshr_b32 s42, s41, 16 +; VI-NEXT: v_or_b32_e32 v24, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s42, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s41, v7 +; VI-NEXT: s_lshr_b32 s41, s40, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v27, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s41, v7 +; VI-NEXT: v_add_f16_e32 v2, s40, v7 +; VI-NEXT: s_lshr_b32 s40, s15, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v53, s40, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v26, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; VI-NEXT: v_add_f16_e32 v2, s15, v7 +; VI-NEXT: s_lshr_b32 s15, s14, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v29, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s15, v7 +; VI-NEXT: v_add_f16_e32 v2, s14, v7 +; VI-NEXT: s_lshr_b32 s14, s13, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v43, s14, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v28, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; VI-NEXT: v_add_f16_e32 v2, s13, v7 +; VI-NEXT: s_lshr_b32 s13, s12, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v6, 
v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s13, v7 +; VI-NEXT: v_add_f16_e32 v2, s12, v7 +; VI-NEXT: s_lshr_b32 s12, s11, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v37, s12, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; VI-NEXT: v_add_f16_e32 v2, s11, v7 +; VI-NEXT: s_lshr_b32 s11, s10, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v31, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s11, v7 +; VI-NEXT: v_add_f16_e32 v2, s10, v7 +; VI-NEXT: s_lshr_b32 s10, s9, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v52, s10, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v30, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; VI-NEXT: v_add_f16_e32 v2, s9, v7 +; VI-NEXT: s_lshr_b32 s9, s8, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s9, v7 +; VI-NEXT: v_add_f16_e32 v2, s8, v7 +; VI-NEXT: s_lshr_b32 s8, s7, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v50, s8, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; VI-NEXT: v_add_f16_e32 v2, s7, v7 +; VI-NEXT: s_lshr_b32 s7, s6, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s7, v7 +; VI-NEXT: v_add_f16_e32 v8, s6, v7 +; VI-NEXT: s_lshr_b32 s6, 
s17, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v36, s6, v7 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; VI-NEXT: v_add_f16_e32 v9, s17, v7 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_or_b32_e32 v33, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s6, v7 +; VI-NEXT: s_lshr_b32 s6, s19, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s16, v7 +; VI-NEXT: v_add_f16_e32 v38, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v32, v9, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; VI-NEXT: v_add_f16_e32 v9, s19, v7 +; VI-NEXT: s_lshr_b32 s6, s18, 16 +; VI-NEXT: v_or_b32_e32 v21, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s6, v7 +; VI-NEXT: s_lshr_b32 s6, s21, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s18, v7 +; VI-NEXT: v_add_f16_e32 v61, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v20, v9, v8 +; VI-NEXT: s_lshr_b32 s7, s20, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v61 +; VI-NEXT: v_add_f16_e32 v9, s21, v7 +; VI-NEXT: v_or_b32_e32 v35, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s7, v7 +; VI-NEXT: s_lshr_b32 s6, s23, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; 
VI-NEXT: v_add_f16_e32 v9, s20, v7 +; VI-NEXT: v_add_f16_e32 v45, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v34, v9, v8 +; VI-NEXT: s_lshr_b32 s7, s22, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 +; VI-NEXT: v_add_f16_e32 v9, s23, v7 +; VI-NEXT: v_or_b32_e32 v19, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s7, v7 +; VI-NEXT: s_lshr_b32 s6, s25, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s22, v7 +; VI-NEXT: v_add_f16_e32 v47, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v18, v9, v8 +; VI-NEXT: s_lshr_b32 s7, s24, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; VI-NEXT: v_add_f16_e32 v9, s25, v7 +; VI-NEXT: v_or_b32_e32 v16, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s7, v7 +; VI-NEXT: s_lshr_b32 s6, s27, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s24, v7 +; VI-NEXT: v_add_f16_e32 v57, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v15, v9, v8 +; VI-NEXT: s_lshr_b32 s7, s26, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v57 +; VI-NEXT: v_add_f16_e32 v9, s27, v7 +; VI-NEXT: v_or_b32_e32 v13, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s7, v7 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s26, v7 +; VI-NEXT: v_add_f16_e32 v59, s6, v7 +; VI-NEXT: 
buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v12, v9, v8 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; VI-NEXT: v_add_f16_e32 v9, s29, v7 +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; VI-NEXT: v_or_b32_e32 v10, v9, v8 +; VI-NEXT: v_add_f16_e32 v8, s7, v7 +; VI-NEXT: s_lshr_b32 s7, s4, 16 +; VI-NEXT: v_add_f16_e32 v51, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_f16_e32 v9, s28, v7 +; VI-NEXT: v_add_f16_e32 v54, s5, v7 +; VI-NEXT: v_add_f16_e32 v11, s7, v7 +; VI-NEXT: v_add_f16_e32 v55, s4, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v9, v9, v8 +; VI-NEXT: v_or_b32_e32 v8, v54, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v7, v55, v7 +; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v8 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v10 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v9 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v13 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v35 +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v23 +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[22:23] +; VI-NEXT: v_bfe_u32 v23, v50, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v23, v52, 8, 8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v23, v37, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v23, v43, 8, 8 +; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[9:10] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 
offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v23, v53, 8, 8 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[24:25] +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[30:31] +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v20 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[20:21] +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v21 +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[32:33] +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v6 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[28:29] +; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[26:27] +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v22 +; VI-NEXT: v_bfe_u32 v25, v51, 8, 8 +; VI-NEXT: v_bfe_u32 v27, v59, 8, 8 +; VI-NEXT: v_bfe_u32 v6, v57, 8, 8 +; VI-NEXT: v_bfe_u32 v12, v47, 8, 8 +; VI-NEXT: v_bfe_u32 v15, v45, 8, 8 +; VI-NEXT: v_bfe_u32 v1, v61, 8, 8 +; VI-NEXT: v_bfe_u32 v22, v38, 8, 8 +; VI-NEXT: v_bfe_u32 v2, v36, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_bfe_u32 v26, v50, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_bfe_u32 v23, v23, 8, 8 +; VI-NEXT: s_waitcnt 
vmcnt(0) +; VI-NEXT: v_bfe_u32 v24, v24, 8, 8 +; VI-NEXT: s_branch .LBB95_5 +; VI-NEXT: .LBB95_3: +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 
+; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: 
; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB95_2 +; VI-NEXT: .LBB95_4: +; VI-NEXT: v_mov_b32_e32 v1, s44 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s45 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; 
VI-NEXT: v_mov_b32_e32 v1, s43 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s40 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s41 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s14 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s18 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte 
Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s20 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s22 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s71 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s69 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s68 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s67 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s66 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s65 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s64 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s55 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s87 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s85 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s53 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s52 +; VI-NEXT: v_readlane_b32 s6, v62, 0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 1 +; VI-NEXT: v_mov_b32_e32 v36, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 2 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 3 +; VI-NEXT: v_mov_b32_e32 v38, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 5 +; VI-NEXT: v_mov_b32_e32 v61, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 6 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 7 +; VI-NEXT: v_mov_b32_e32 v45, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 9 +; VI-NEXT: v_mov_b32_e32 v47, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 10 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 11 +; VI-NEXT: v_mov_b32_e32 v57, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 12 +; VI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 13 +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 16 +; VI-NEXT: v_mov_b32_e32 v59, s6 +; VI-NEXT: v_readlane_b32 s6, v62, 14 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v22, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 18 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 19 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 20 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 21 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 22 +; VI-NEXT: v_mov_b32_e32 v27, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 23 +; VI-NEXT: v_mov_b32_e32 v25, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 24 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 25 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: v_mov_b32_e32 v46, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 27 +; VI-NEXT: v_mov_b32_e32 v41, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 28 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s80 +; VI-NEXT: v_mov_b32_e32 v60, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 29 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s81 +; VI-NEXT: v_mov_b32_e32 v40, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 30 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s82 +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 31 
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 32 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 33 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 34 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 35 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 36 +; VI-NEXT: v_mov_b32_e32 v48, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 37 +; VI-NEXT: v_mov_b32_e32 v49, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 38 +; VI-NEXT: v_mov_b32_e32 v44, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 39 +; VI-NEXT: v_mov_b32_e32 v42, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 40 +; VI-NEXT: v_mov_b32_e32 v56, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 41 +; VI-NEXT: v_mov_b32_e32 v58, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 42 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 43 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 44 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 45 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 46 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_readlane_b32 s4, 
v62, 47 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s78 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_readlane_b32 s4, v62, 48 +; VI-NEXT: v_mov_b32_e32 v31, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 49 +; VI-NEXT: v_mov_b32_e32 v30, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 50 +; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 51 +; VI-NEXT: v_mov_b32_e32 v32, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 52 +; VI-NEXT: v_mov_b32_e32 v29, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 53 +; VI-NEXT: v_mov_b32_e32 v28, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 54 +; VI-NEXT: v_mov_b32_e32 v34, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 55 +; VI-NEXT: v_mov_b32_e32 v9, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 56 +; VI-NEXT: v_mov_b32_e32 v3, s88 +; VI-NEXT: v_readlane_b32 s6, v62, 15 +; VI-NEXT: v_mov_b32_e32 v21, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 57 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s70 +; VI-NEXT: v_mov_b32_e32 v43, s54 +; VI-NEXT: v_mov_b32_e32 v37, s86 +; VI-NEXT: v_mov_b32_e32 v52, s84 +; VI-NEXT: v_mov_b32_e32 v51, s6 +; VI-NEXT: v_mov_b32_e32 v54, s5 +; VI-NEXT: v_mov_b32_e32 v23, s83 +; VI-NEXT: v_mov_b32_e32 v24, s50 +; VI-NEXT: v_mov_b32_e32 v26, s51 +; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_mov_b32_e32 v20, s76 +; VI-NEXT: v_mov_b32_e32 v19, s74 +; VI-NEXT: v_mov_b32_e32 v18, s72 +; VI-NEXT: v_mov_b32_e32 v17, s62 +; VI-NEXT: v_mov_b32_e32 v16, s60 +; VI-NEXT: v_mov_b32_e32 v13, s58 +; VI-NEXT: v_mov_b32_e32 v10, s56 +; VI-NEXT: v_mov_b32_e32 v7, s46 +; VI-NEXT: 
v_mov_b32_e32 v3, s90 +; VI-NEXT: v_mov_b32_e32 v4, s30 +; VI-NEXT: v_mov_b32_e32 v5, s34 +; VI-NEXT: v_mov_b32_e32 v8, s36 +; VI-NEXT: v_mov_b32_e32 v11, s38 +; VI-NEXT: v_mov_b32_e32 v14, s48 +; VI-NEXT: .LBB95_5: ; %end +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_readlane_b32 s87, v63, 31 +; VI-NEXT: v_readlane_b32 s86, v63, 30 +; VI-NEXT: v_readlane_b32 s85, v63, 29 +; VI-NEXT: v_readlane_b32 s84, v63, 28 +; VI-NEXT: v_readlane_b32 s83, v63, 27 +; VI-NEXT: v_readlane_b32 s82, v63, 26 +; VI-NEXT: v_readlane_b32 s81, v63, 25 +; VI-NEXT: v_readlane_b32 s80, v63, 24 +; VI-NEXT: v_readlane_b32 s71, v63, 23 +; VI-NEXT: v_readlane_b32 s70, v63, 22 +; VI-NEXT: v_readlane_b32 s69, v63, 21 +; VI-NEXT: v_readlane_b32 s68, v63, 20 +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 
+; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v58, v53, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v53, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v58, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v46 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v46, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v20, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v19, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_e32 v19, 8, v22 +; VI-NEXT: v_or_b32_sdwa v19, v38, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v19, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v2, v18, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v12 +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; 
VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v25 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 
4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; VI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; 
VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; 
VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, 
s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64f16_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_writelane_b32 v63, s64, 16 +; GFX9-NEXT: v_writelane_b32 v63, s65, 17 +; GFX9-NEXT: v_writelane_b32 v63, s66, 18 +; GFX9-NEXT: v_writelane_b32 
v63, s67, 19 +; GFX9-NEXT: v_writelane_b32 v63, s68, 20 +; GFX9-NEXT: v_writelane_b32 v63, s69, 21 +; GFX9-NEXT: v_writelane_b32 v63, s70, 22 +; GFX9-NEXT: v_writelane_b32 v63, s71, 23 +; GFX9-NEXT: v_writelane_b32 v63, s80, 24 +; GFX9-NEXT: v_writelane_b32 v63, s81, 25 +; GFX9-NEXT: v_writelane_b32 v63, s82, 26 +; GFX9-NEXT: v_writelane_b32 v63, s83, 27 +; GFX9-NEXT: v_writelane_b32 v63, s84, 28 +; GFX9-NEXT: v_writelane_b32 v63, s85, 29 +; GFX9-NEXT: v_writelane_b32 v63, s86, 30 +; GFX9-NEXT: v_writelane_b32 v63, s87, 31 +; GFX9-NEXT: v_writelane_b32 v63, s96, 32 +; GFX9-NEXT: v_writelane_b32 v63, s97, 33 +; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v63, s99, 35 +; GFX9-NEXT: v_readfirstlane_b32 s44, v3 +; GFX9-NEXT: v_readfirstlane_b32 s45, v4 +; GFX9-NEXT: v_readfirstlane_b32 s42, v5 +; GFX9-NEXT: v_readfirstlane_b32 s43, v6 +; GFX9-NEXT: v_readfirstlane_b32 s40, v7 +; GFX9-NEXT: v_readfirstlane_b32 s41, v8 +; GFX9-NEXT: v_readfirstlane_b32 s14, v9 +; GFX9-NEXT: v_readfirstlane_b32 s15, v10 +; GFX9-NEXT: v_readfirstlane_b32 s12, v11 +; GFX9-NEXT: v_readfirstlane_b32 s13, v12 +; GFX9-NEXT: v_readfirstlane_b32 s10, v13 +; GFX9-NEXT: v_readfirstlane_b32 s11, v14 +; GFX9-NEXT: v_readfirstlane_b32 s8, v15 +; GFX9-NEXT: v_readfirstlane_b32 s9, v16 +; GFX9-NEXT: v_readfirstlane_b32 s6, v17 +; GFX9-NEXT: v_readfirstlane_b32 s7, v18 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s28, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s27, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s27, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s26, 16 +; GFX9-NEXT: 
v_writelane_b32 v62, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s26, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s25, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s25, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s24, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s24, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s23, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s23, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s22, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s22, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s21, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s20, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s20, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s19, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s19, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s18, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s17, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s17, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s17, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s16, 
16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s16, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 1 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 0 +; GFX9-NEXT: s_lshr_b32 s82, s11, 24 +; GFX9-NEXT: s_lshr_b32 s83, s11, 16 +; GFX9-NEXT: s_lshr_b32 s85, s11, 8 +; GFX9-NEXT: s_lshr_b32 s84, s10, 16 +; GFX9-NEXT: s_lshr_b32 s86, s10, 8 +; GFX9-NEXT: s_lshr_b32 s87, s13, 24 +; GFX9-NEXT: s_lshr_b32 s96, s13, 16 +; GFX9-NEXT: s_lshr_b32 s98, s13, 8 +; GFX9-NEXT: s_lshr_b32 s97, s12, 16 +; GFX9-NEXT: s_lshr_b32 s99, s12, 8 +; GFX9-NEXT: s_lshr_b32 s38, s15, 24 +; GFX9-NEXT: s_lshr_b32 s39, s15, 16 +; GFX9-NEXT: s_lshr_b32 s49, s15, 8 +; GFX9-NEXT: s_lshr_b32 s48, s14, 16 +; GFX9-NEXT: s_lshr_b32 s50, s14, 8 +; GFX9-NEXT: s_lshr_b32 s51, s41, 24 +; GFX9-NEXT: s_lshr_b32 s52, s41, 16 +; GFX9-NEXT: s_lshr_b32 s54, s41, 8 +; GFX9-NEXT: s_lshr_b32 s53, s40, 16 +; GFX9-NEXT: s_lshr_b32 s55, s40, 8 +; GFX9-NEXT: s_lshr_b32 s64, s43, 24 +; GFX9-NEXT: s_lshr_b32 s65, s43, 16 +; GFX9-NEXT: s_lshr_b32 s67, s43, 8 +; GFX9-NEXT: s_lshr_b32 s66, s42, 16 +; GFX9-NEXT: s_lshr_b32 s68, s42, 8 +; GFX9-NEXT: s_lshr_b32 s69, s45, 24 +; GFX9-NEXT: s_lshr_b32 s70, s45, 16 +; GFX9-NEXT: s_lshr_b32 s80, s45, 8 +; GFX9-NEXT: s_lshr_b32 s71, s44, 
16 +; GFX9-NEXT: s_lshr_b32 s81, s44, 8 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB95_4 +; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v15, 0x200 +; GFX9-NEXT: v_pk_add_f16 v26, s5, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, s4, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, s45, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, s44, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, s43, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s42, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s41, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s40, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s15, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s14, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s13, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s12, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s11, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s10, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s9, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s8, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s7, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s6, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v49, s17, v15 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v48, s16, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v38, s19, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v37, s18, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v36, s21, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v35, s20, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v34, s23, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v33, s22, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, s25, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, s24, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, s27, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, s26, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, s29, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, s28, v15 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 
v[15:16], 24, v[35:36] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[37:38] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[48:49] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] 
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte 
Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v28 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v30 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v12 +; GFX9-NEXT: buffer_store_dword v15, off, 
s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v32 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v34 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v33 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], 
s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v36 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v38 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v38 +; GFX9-NEXT: 
buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v4 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: s_branch .LBB95_5 +; GFX9-NEXT: .LBB95_3: +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; 
implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; 
kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; 
implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB95_2 +; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: v_mov_b32_e32 v15, s71 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s80 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s70 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s69 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s68 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s66 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s67 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s65 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s64 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s55 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s53 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s54 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s52 +; 
GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s51 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s50 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s48 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s49 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s39 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s38 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s99 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s97 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s98 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s96 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s87 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s86 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s84 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s85 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s83 +; GFX9-NEXT: v_mov_b32_e32 v25, s4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; 
GFX9-NEXT: v_mov_b32_e32 v15, s82 +; GFX9-NEXT: v_readlane_b32 s4, v62, 0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 2 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 3 +; GFX9-NEXT: v_mov_b32_e32 v55, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 5 +; GFX9-NEXT: v_mov_b32_e32 v53, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 6 +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 7 +; GFX9-NEXT: v_mov_b32_e32 v51, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 8 +; GFX9-NEXT: v_mov_b32_e32 v50, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: v_mov_b32_e32 v24, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 10 +; GFX9-NEXT: v_mov_b32_e32 v20, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 11 +; GFX9-NEXT: v_mov_b32_e32 v42, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 12 +; GFX9-NEXT: v_mov_b32_e32 v18, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 13 +; GFX9-NEXT: v_mov_b32_e32 v39, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 14 +; GFX9-NEXT: v_mov_b32_e32 v43, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 15 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 16 +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 17 +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 18 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 19 +; GFX9-NEXT: v_mov_b32_e32 v54, 
s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 20 +; GFX9-NEXT: v_mov_b32_e32 v61, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 21 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 22 +; GFX9-NEXT: v_mov_b32_e32 v60, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 23 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 24 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 25 +; GFX9-NEXT: v_mov_b32_e32 v23, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 26 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 27 +; GFX9-NEXT: v_mov_b32_e32 v59, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 28 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 29 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: v_mov_b32_e32 v58, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 31 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 32 +; GFX9-NEXT: v_mov_b32_e32 v57, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 33 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 34 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 35 +; 
GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 36 +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 37 +; GFX9-NEXT: v_mov_b32_e32 v56, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 38 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 39 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 40 +; GFX9-NEXT: v_mov_b32_e32 v47, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 41 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 42 +; GFX9-NEXT: v_mov_b32_e32 v46, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 43 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 44 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 45 +; GFX9-NEXT: v_mov_b32_e32 v45, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 46 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 47 +; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 48 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 49 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; 
GFX9-NEXT: v_mov_b32_e32 v41, s46 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s56 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s58 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s60 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s62 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s72 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s74 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s76 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s78 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill 
+; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s88 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s90 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s92 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s94 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s30 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s34 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s36 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s44 +; GFX9-NEXT: v_mov_b32_e32 v22, s45 +; GFX9-NEXT: v_mov_b32_e32 v13, s42 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v11, s40 +; GFX9-NEXT: v_mov_b32_e32 v12, s41 +; GFX9-NEXT: v_mov_b32_e32 
v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s15 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v48, s16 +; GFX9-NEXT: v_mov_b32_e32 v49, s17 +; GFX9-NEXT: v_mov_b32_e32 v37, s18 +; GFX9-NEXT: v_mov_b32_e32 v38, s19 +; GFX9-NEXT: v_mov_b32_e32 v35, s20 +; GFX9-NEXT: v_mov_b32_e32 v36, s21 +; GFX9-NEXT: v_mov_b32_e32 v33, s22 +; GFX9-NEXT: v_mov_b32_e32 v34, s23 +; GFX9-NEXT: v_mov_b32_e32 v31, s24 +; GFX9-NEXT: v_mov_b32_e32 v32, s25 +; GFX9-NEXT: v_mov_b32_e32 v29, s26 +; GFX9-NEXT: v_mov_b32_e32 v30, s27 +; GFX9-NEXT: v_mov_b32_e32 v27, s28 +; GFX9-NEXT: v_mov_b32_e32 v28, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s5 +; GFX9-NEXT: v_mov_b32_e32 v41, v50 +; GFX9-NEXT: v_mov_b32_e32 v50, v51 +; GFX9-NEXT: v_mov_b32_e32 v51, v52 +; GFX9-NEXT: v_mov_b32_e32 v52, v53 +; GFX9-NEXT: v_mov_b32_e32 v53, v55 +; GFX9-NEXT: v_mov_b32_e32 v55, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s81 +; GFX9-NEXT: .LBB95_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v16, v37, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v35, v35, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v36, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 
8, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v23, v33, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_readlane_b32 s99, v63, 35 +; GFX9-NEXT: v_readlane_b32 s98, v63, 34 +; GFX9-NEXT: v_readlane_b32 s97, v63, 33 +; GFX9-NEXT: v_readlane_b32 s96, v63, 32 +; GFX9-NEXT: v_readlane_b32 
s87, v63, 31 +; GFX9-NEXT: v_readlane_b32 s86, v63, 30 +; GFX9-NEXT: v_readlane_b32 s85, v63, 29 +; GFX9-NEXT: v_readlane_b32 s84, v63, 28 +; GFX9-NEXT: v_readlane_b32 s83, v63, 27 +; GFX9-NEXT: v_readlane_b32 s82, v63, 26 +; GFX9-NEXT: v_readlane_b32 s81, v63, 25 +; GFX9-NEXT: v_readlane_b32 s80, v63, 24 +; GFX9-NEXT: v_readlane_b32 s71, v63, 23 +; GFX9-NEXT: v_readlane_b32 s70, v63, 22 +; GFX9-NEXT: v_readlane_b32 s69, v63, 21 +; GFX9-NEXT: v_readlane_b32 s68, v63, 20 +; GFX9-NEXT: v_readlane_b32 s67, v63, 19 +; GFX9-NEXT: v_readlane_b32 s66, v63, 18 +; GFX9-NEXT: v_readlane_b32 s65, v63, 17 +; GFX9-NEXT: v_readlane_b32 s64, v63, 16 +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v38, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v19, v42, v19 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v19, v39, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v40, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, 
v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v28, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v15, off, 
s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded 
Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 
+; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, 
v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v53, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64f16_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, 
v77, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:88 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_clause 0x12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:52 +; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s70, 22 +; GFX11-TRUE16-NEXT: 
v_writelane_b32 v75, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[0:1], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s5, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s5, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s4, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s7, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s7, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s6, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s9, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s8, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, 
s11, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s10, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s13, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s13, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s12, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s15, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s15, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s14, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s41, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s41, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s41, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s40, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s29, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; 
GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s42, s17, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s2, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s74, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 
v78, s75, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-TRUE16-NEXT: .LBB95_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v51, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v50, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v36, 0x200, s19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s40 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v55, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v49, 0x200, s17 
op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v48, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[35:36] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[48:49] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[54:55] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v176, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 16, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 16, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 24, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 16, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 8, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 16, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 16, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v10 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v86, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v21 +; GFX11-TRUE16-NEXT: s_branch .LBB95_5 +; GFX11-TRUE16-NEXT: .LBB95_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s43, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 15 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB95_2 +; GFX11-TRUE16-NEXT: .LBB95_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s2 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v51.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 10 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, s104 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, s103 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, s102 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 11 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, s101 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, s100 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, s99 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, s98 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 12 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, s97 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, s96 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, s87 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, s86 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, s85 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, s84 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v115.l, s83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, s82 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, s81 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, s80 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, s71 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, s70 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, s69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, s68 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, s67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, s66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, s65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s72 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v52.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 26 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.l, s0 +; GFX11-TRUE16-NEXT: 
v_readlane_b32 s0, v78, 31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v78, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s0 +; GFX11-TRUE16-NEXT: .LBB95_5: ; %end +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, 
v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v54, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v60, 0xff, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v69, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v72 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, v60, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v57, 8, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v55, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v54, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v69, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v50, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v59 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v55, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v51, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v81, v57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xff, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v56, v50, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v48, v80 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v81, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v44 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v57, v50, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, v49, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v70, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v42 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v48, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, v35, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v69, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, v49, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v36, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v67, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v67, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v180 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v35, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v36, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v69 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v176 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v167 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v36, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, v67, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[54:57], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[48:51], off offset:16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v151 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v66, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, v36, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v49, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v27, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v147 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xff, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v38, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v49, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v52, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, v31, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v27, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v28, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v21, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v32, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, 
v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v129 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v32, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v128 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v116 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v49, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v51, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v17, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v18, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v102 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, 
v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v103 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v101 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v27, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v86 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v165 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v70, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v24, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v26, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v67, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v23, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v20 +; GFX11-TRUE16-NEXT: s_clause 0x5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[35:38], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[48:51], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-TRUE16-NEXT: s_clause 0x12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:56 +; 
GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:72 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v76, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v76, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v76, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v76, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v76, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v76, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v76, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v76, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v76, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v75, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v75, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v75, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v75, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v75, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v75, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v75, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v75, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v75, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 s70, v75, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v75, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v75, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v75, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v75, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v75, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v75, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v75, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v75, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v75, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v75, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v75, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v75, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v75, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v75, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v75, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v75, 6 +; 
GFX11-TRUE16-NEXT: v_readlane_b32 s37, v75, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v75, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v75, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v75, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v75, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:88 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:88 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; 
GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 s99, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 +; 
GFX11-FAKE16-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[0:1], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s5, 24 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s35, s5, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s5, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s4, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s7, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s7, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s9, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s8, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s8, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s11, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s11, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s10, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s13, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s13, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s12, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s12, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s15, 24 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s84, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s15, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s41, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s41, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s40, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; 
GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s42, s3, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s0, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s74, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s75, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s99 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-FAKE16-NEXT: .LBB95_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v39, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v38, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v51, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v50, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v33, 0x200, s21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, s23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 
v28, 0x200, s22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, s20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s41 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s40 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v53, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v52, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v37, 0x200, s19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v36, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[28:29] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[50:51] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[20:21] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[24:25] +; GFX11-FAKE16-NEXT: 
v_lshrrev_b64 v[70:71], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[52:53] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 24, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 24, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v39 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 8, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 -; GFX11-FAKE16-NEXT: .LBB47_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v60 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v47 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v54 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v62 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v43 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v67 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v53, v64, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v177 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v151 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v53, 8, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v116 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v64, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v15 +; GFX11-FAKE16-NEXT: s_branch .LBB95_5 +; GFX11-FAKE16-NEXT: .LBB95_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: s_mov_b32 s99, -1 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s43, 1 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: 
; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: s_branch .LBB95_2 +; GFX11-FAKE16-NEXT: 
.LBB95_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v71, s50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s41 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v74, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s14 :: v_dual_mov_b32 v12, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s12 :: v_dual_mov_b32 v10, s13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v73, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v55, s48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s10 :: v_dual_mov_b32 v8, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v6, s9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v72, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v49, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v62, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s2 :: v_dual_mov_b32 v51, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s16 :: v_dual_mov_b32 v39, s17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v63, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s38 :: v_dual_mov_b32 v36, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s19 :: v_dual_mov_b32 v32, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s21 :: v_dual_mov_b32 v60, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 8 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_mov_b32 v29, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v61, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v21, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v146, s42 :: v_dual_mov_b32 v145, s104 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v59, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v144, vcc_hi :: v_dual_mov_b32 v135, s103 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v134, s102 :: v_dual_mov_b32 v133, s101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v57, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s36 :: v_dual_mov_b32 v132, s98 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v131, s100 :: v_dual_mov_b32 v130, s97 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v129, s96 :: v_dual_mov_b32 v58, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s37 :: v_dual_mov_b32 v128, s87 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v119, s85 :: v_dual_mov_b32 v118, s86 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v117, s84 :: v_dual_mov_b32 v56, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v116, s83 :: v_dual_mov_b32 v115, s82 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s80 :: v_dual_mov_b32 v113, s81 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v47, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s35 :: v_dual_mov_b32 v112, s71 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v103, s70 :: v_dual_mov_b32 v102, s69 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v101, s67 :: v_dual_mov_b32 v46, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s34 :: v_dual_mov_b32 v100, s68 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s66 :: v_dual_mov_b32 v98, s65 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s64 :: v_dual_mov_b32 v44, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s54 :: v_dual_mov_b32 v87, s55 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v85, s52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v45, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s51 :: v_dual_mov_b32 v83, s49 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v147, s43 :: v_dual_mov_b32 v22, s78 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v43, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v42, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v41, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v48, s62 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v54, s72 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v64, s60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s56 :: v_dual_mov_b32 v183, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v80, s46 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v18, s76 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v40, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v182, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v181, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v180, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v178, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v179, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v177, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v176, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v167, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v165, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 31 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v166, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v164, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v163, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v162, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 3 
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v160, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v161, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v151, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v150, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v149, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v148, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v78, 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v82, s0 +; GFX11-FAKE16-NEXT: .LBB95_5: ; %end +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v52, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v57, 0xff, v57 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v69, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, 
v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v69, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v60, 0xff, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, v50, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, v60, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v52, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v59 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v52, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v66, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v57, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v52, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v38, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v82, v80 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v39, v80 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v81, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v38, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v80, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v82, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v183 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v38, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v70, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v81, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v181 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v80, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v66, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v179 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v165 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v165, 8, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v67, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v80, v64 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v82, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69 ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v100 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v97 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v82 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v70 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v73 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v39, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v46 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v41 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v182 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v180 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v176 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v166 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v38, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v160 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[50:53], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[36:39], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v32, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v33, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v28, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v29, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v163 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v50, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xff, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v50, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v53, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v20, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v21, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v15, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v144 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v132 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v25, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v24 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v21, 0xff, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v25, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v99 -; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v20, v20, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v69 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v31, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v33, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v33, v34 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v13, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v14, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v112 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v20, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v26, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 
0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v21, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v5, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v20, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v10, v10, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v22, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v17 ; GFX11-FAKE16-NEXT: s_clause 0x5 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 -; GFX11-FAKE16-NEXT: 
scratch_load_b32 v58, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[36:39], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[50:53], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[64:67], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:52 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:72 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v76, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v76, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v76, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v76, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v76, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v76, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v76, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v76, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v76, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v75, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v75, 30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v75, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v75, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v75, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v75, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v75, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v75, 24 +; GFX11-FAKE16-NEXT: v_readlane_b32 s71, v75, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v75, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v75, 21 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v75, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v75, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v75, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v75, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v75, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v75, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v75, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v75, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v75, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v75, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v75, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v75, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v75, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, 
v75, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v75, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v75, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v75, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v75, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v75, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v75, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -98609,2336 +202518,2164 @@ end: } define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v128i8_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v49, v7 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v55, v1 -; GCN-NEXT: v_mov_b32_e32 v60, v0 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v14 
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 24, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 24, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v30 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v63, 24, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v10 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:392 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v23 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v21 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v19 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v9 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v13 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v7 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 
24, v8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill 
-; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v24, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v28, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:344 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:376 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:372 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:368 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 8, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; GCN-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:192 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v22, 
8, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:328 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:360 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:384 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v4 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; 
implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; kill: killed $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; kill: killed $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; kill: killed $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; kill: killed $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; kill: killed $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; 
implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; kill: killed $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; kill: killed $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; kill: killed $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; kill: killed $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; kill: killed $vgpr25 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_or_b32_e32 v10, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v32, v1, v24 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v28, v1, v28 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v30 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v1 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v17 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v25, v1 -; GCN-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v25, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v25, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v25, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v25, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 
v0, v25, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v25, v45 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v13, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v13, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v12, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v12, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v56, v48, v12 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v50, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v2, v36, v13 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:992 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v53, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v3, v40, v22 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v47, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v1, v26 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v1, v63, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v1, v33 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v1, v38 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v1, v48 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v59, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v44, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v58, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v46, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v45, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v41, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v47, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload 
-; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v1, v53 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v1, v54 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v30 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v1 -; 
GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v1 -; GCN-NEXT: v_mov_b32_e32 v1, v56 -; GCN-NEXT: v_or_b32_e32 v8, v49, v1 -; GCN-NEXT: v_or_b32_e32 v31, v51, v2 -; GCN-NEXT: v_or_b32_e32 v56, v57, v3 -; GCN-NEXT: v_or_b32_e32 v4, v55, v0 -; GCN-NEXT: v_or_b32_e32 v5, v40, v62 -; GCN-NEXT: v_or_b32_e32 v6, v37, v17 -; GCN-NEXT: v_or_b32_e32 v7, v25, v61 -; GCN-NEXT: v_or_b32_e32 v37, v34, v59 -; GCN-NEXT: v_or_b32_e32 v25, v9, v63 -; GCN-NEXT: v_or_b32_e32 v38, v10, v44 -; GCN-NEXT: v_or_b32_e32 v51, v11, v52 -; GCN-NEXT: v_or_b32_e32 v55, v12, v50 -; GCN-NEXT: v_or_b32_e32 v49, v13, v48 -; GCN-NEXT: v_or_b32_e32 v40, v14, v41 -; GCN-NEXT: v_or_b32_e32 v11, v15, v39 -; GCN-NEXT: v_or_b32_e32 v57, v16, v53 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v36, v12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], 
s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v18, v14 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v19, v13 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v20, v34 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v21, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v22, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v23, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v23 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v33, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v58 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v27, v46 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v45 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v29, v43 -; GCN-NEXT: buffer_store_dword v15, 
off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v30, v42 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v35, v47 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v54 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GCN-NEXT: v_mov_b32_e32 v1, v8 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v62 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v61 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v59 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v63 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v44 -; 
GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v50 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v48 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v41 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v39 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v61, v1, v12, 16 -; GCN-NEXT: v_mov_b32_e32 v12, v33 -; GCN-NEXT: v_alignbit_b32 v33, v31, v14, 16 -; GCN-NEXT: v_mov_b32_e32 v27, v56 -; GCN-NEXT: v_alignbit_b32 v59, v56, v13, 16 -; GCN-NEXT: v_mov_b32_e32 v13, v19 -; GCN-NEXT: v_alignbit_b32 v29, v4, v34, 16 -; GCN-NEXT: v_alignbit_b32 v0, v5, v20, 16 -; GCN-NEXT: v_alignbit_b32 v14, v6, v21, 16 -; GCN-NEXT: v_mov_b32_e32 v21, v18 -; GCN-NEXT: v_mov_b32_e32 v18, v10 -; GCN-NEXT: v_mov_b32_e32 v56, v16 -; GCN-NEXT: v_mov_b32_e32 v16, v36 -; GCN-NEXT: v_mov_b32_e32 v10, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v7, v22, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v62, v37 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v37, v23, 16 -; GCN-NEXT: v_mov_b32_e32 v23, v9 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16 -; GCN-NEXT: 
buffer_store_dword v15, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v10, v58, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v51 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v51, v46, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v58, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v55, v45, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v52, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v49, v43, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v46, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v15, v40, v42, 16 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v41, v11 -; GCN-NEXT: v_alignbit_b32 v11, v11, v47, 16 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v3, v57, v54, 16 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v53 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; kill: killed $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; kill: killed $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; kill: killed $vgpr15 -; 
GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: 
$vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; kill: killed $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; 
implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; 
GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: .LBB48_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 
s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v25, v27 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v30, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v38, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v1, v20, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v1, v51, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v1, v28, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:848 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v6, v22, v7 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v1, v8 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v45, v9 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v9, v24, v10 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_add_i32_e32 v13, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v1, v13 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v1, v14 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v1, v18 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v1, v19 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v1, v20 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v1, v21 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v1, v22 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:640 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_mov_b32_e32 v2, v35 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v51, v1, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v17 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_mov_b32_e32 v4, v49 -; GCN-NEXT: v_mov_b32_e32 v43, v42 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v42, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_mov_b32_e32 v44, v40 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v46, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v57, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v1, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v59, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v62, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v61, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload 
-; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v1, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v23 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v23 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword 
v24, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v23 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v23 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_mov_b32_e32 v52, v36 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v39, v24, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_mov_b32_e32 v45, v53 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v24, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v55, v24, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v27, v25 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v31, v29 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v17 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v17 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v34, v36, v34 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v11 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v2, v37 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v49, v2, v49 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v63, v54 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v43 -; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v2, v40 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v41, v47, v41 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_add_i32_e32 v43, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v43, v44, v43 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v15 -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v45, v52, v45 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v3 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v47, v50, v47 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v4 -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v56, v48, v56 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v61, v61, v2 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v0, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v4, v1, v4 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s7, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s7, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, s7, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, s7, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, s7, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, s7, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, s7, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, s7, v38 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v59 -; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v62 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 
0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v38, v1 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v38, v6 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v38, v7 -; GCN-NEXT: v_or_b32_e32 v8, v39, v8 -; GCN-NEXT: v_or_b32_e32 v9, v53, v9 -; GCN-NEXT: v_or_b32_e32 v10, v55, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: v_or_b32_e32 v13, v25, v13 -; GCN-NEXT: v_or_b32_e32 v14, v27, v14 -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: v_or_b32_e32 v16, v29, v16 -; GCN-NEXT: v_or_b32_e32 v17, v31, v17 -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 
-; GCN-NEXT: v_or_b32_e32 v19, v33, v19 -; GCN-NEXT: v_or_b32_e32 v20, v34, v20 -; GCN-NEXT: v_or_b32_e32 v21, v36, v21 -; GCN-NEXT: v_or_b32_e32 v22, v37, v22 -; GCN-NEXT: v_or_b32_e32 v24, v49, v26 -; GCN-NEXT: v_or_b32_e32 v25, v54, v30 -; GCN-NEXT: v_or_b32_e32 v26, v40, v35 -; GCN-NEXT: v_or_b32_e32 v28, v41, v51 -; GCN-NEXT: v_or_b32_e32 v30, v43, v42 -; GCN-NEXT: v_or_b32_e32 v33, v44, v46 -; GCN-NEXT: v_or_b32_e32 v34, v45, v57 -; GCN-NEXT: v_or_b32_e32 v38, v47, v58 -; GCN-NEXT: v_or_b32_e32 v39, v56, v59 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v61 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v49, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v35, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v36, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v53, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v30 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s6, v33 -; GCN-NEXT: v_add_i32_e32 v31, vcc, s6, v34 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v38 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v39 -; GCN-NEXT: v_alignbit_b32 v61, v1, v16, 16 -; 
GCN-NEXT: v_alignbit_b32 v33, v31, v56, 16 -; GCN-NEXT: v_alignbit_b32 v59, v25, v18, 16 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v29, v4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v0, v5, v21, 16 -; GCN-NEXT: v_alignbit_b32 v2, v6, v13, 16 -; GCN-NEXT: v_alignbit_b32 v19, v7, v23, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v17, v14, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v19, v15, v12, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v10, v53, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v11, v52, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v48, v51, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v36, v50, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 
offset:816 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v35, v49, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v3, v32, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v19, v27, v37, 16 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v31 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v62, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: 
buffer_store_dword v15, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v58, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v48 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v52, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v46, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v41, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v27 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; GCN-NEXT: .LBB48_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v61 -; GCN-NEXT: v_or_b32_e32 v8, v8, v19 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v19 -; GCN-NEXT: buffer_store_dword v8, v60, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_add_i32_e32 v8, vcc, 4, v60 -; GCN-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v33 -; GCN-NEXT: v_or_b32_e32 v56, v1, v8 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v44, v1, v2 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 8, v60 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 12, v60 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GCN-NEXT: v_or_b32_e32 v59, v1, v2 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 16, v60 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 20, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v4, vcc, 24, v60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v5 
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; GCN-NEXT: v_or_b32_e32 v61, v2, v3 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 36, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v6 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v47, v2, v3 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 40, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v23, v3, v5 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 48, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v15, v3, v5 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; 
GCN-NEXT: v_or_b32_e32 v57, v3, v5 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v12 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v21, v3, v5 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v37 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v3, v5 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v25, v3, v5 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v10 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v27, v3, v5 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v30, v3, v5 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v32, v3, v5 -; 
GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v34, v3, v5 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v58 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v29, v3, v5 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x58, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v37, v3, v5 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x5c, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v3, v5 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v48, v3, v7 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v46 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v50, v3, v7 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v55, v3, v7 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v60 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v41 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v54, v3, v7 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x70, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v41, v3, v7 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v52, v3, v7 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x78, v60 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v60 -; GCN-NEXT: buffer_store_dword v56, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen -; GCN-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v128i8_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 
offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:648 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:392 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; 
kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:360 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v9 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 
24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:332 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v4, 24, v29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 
offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v6, 8, v7 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v7 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 
v34, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:348 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:368 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 +; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; 
implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v20, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v2, v1 
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v41, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v55, v24, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v24, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_or_b32_e32 v13, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v26, v30, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v21, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v32, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, 
v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v35, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v63, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v48, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v50, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v53, v4, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v1, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v43, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v44, v4, v10 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v46, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v56, v4, v10 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v57, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v61, v4, v10 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v49, v10, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v4, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v23, v10, v22 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v33, v10, v22 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload 
+; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: v_mov_b32_e32 v10, v29 +; SI-NEXT: v_or_b32_e32 v29, v22, v34 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v34, v37, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v37, v51, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v51, v22, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v47, v59, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v54, v22, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 
v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v22, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v39, v22, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v42, v22, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v59, v22, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v60, v22, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v36, v22, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_or_b32_e32 v10, v22, v10 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v38, v22, v24 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v58, v22, v24 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v52, v22, v24 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v24, v22, v24 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v40, v22, v40 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v15, v22, v15 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v45 +; SI-NEXT: v_or_b32_e32 v45, v22, v62 +; SI-NEXT: v_mov_b32_e32 v62, v18 +; SI-NEXT: v_or_b32_e32 v20, v20, v62 +; SI-NEXT: v_or_b32_e32 v22, v19, v3 +; SI-NEXT: v_alignbit_b32 v3, v20, v3, 16 +; 
SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v3, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v3, v9 +; SI-NEXT: v_alignbit_b32 v3, v5, v9, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v41, v3, v55 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v18, v3, v13 +; SI-NEXT: v_alignbit_b32 v3, v41, v13, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v13, v3, v26 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v39, v3, v21 +; SI-NEXT: v_alignbit_b32 v3, v13, v21, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v32, v3, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v42, v3, v27 +; SI-NEXT: v_alignbit_b32 v3, v32, v27, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v27, v3, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v3, v3, v31 +; SI-NEXT: buffer_store_dword 
v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v27, v31, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v21, v3, v48 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v31, v3, v50 +; SI-NEXT: v_alignbit_b32 v3, v21, v50, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v3, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v11, v53, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_or_b32_e32 v6, v6, v43 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v3, v43, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v9, v6, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v6, v6, v46 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v9, v46, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v63, v6, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v52 +; 
SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v6, v63, v57, 16 +; SI-NEXT: v_or_b32_e32 v57, v4, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v52, v4, v33 +; SI-NEXT: v_alignbit_b32 v4, v57, v33, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v46, v4, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v38, v4, v37 +; SI-NEXT: v_alignbit_b32 v4, v46, v37, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v44, v4, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: 
killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; 
kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: 
; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_or_b32_e32 v36, v4, v47 +; SI-NEXT: v_alignbit_b32 v4, v44, v47, 16 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v54 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_or_b32_e32 v43, v4, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_or_b32_e32 v12, v4, v14 +; SI-NEXT: v_alignbit_b32 v4, v43, v14, 16 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_mov_b32_e32 v23, v12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; 
implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v61 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v61, v6, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v24 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: v_or_b32_e32 v58, v6, v49 +; SI-NEXT: v_alignbit_b32 v6, v61, v49, 16 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v34 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB96_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v3, v19, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], 
s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v43, vcc, s7, v2 +; SI-NEXT: v_mov_b32_e32 v28, v24 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v15, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 
16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_or_b32_e32 v4, v59, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v44, vcc, s7, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v44 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v5 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v46, vcc, s7, v6 +; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v3 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v7 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v57, 
vcc, s7, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v9 +; SI-NEXT: v_mov_b32_e32 v58, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v61, vcc, s7, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 
0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v63, vcc, s7, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, 
s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v14 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 
; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v46 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v18, 
v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v18 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, s7, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v23 +; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v24, v24, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: 
v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v25, v25, v21 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v20 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 
offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; 
SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v28, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v41, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v57 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v1 +; SI-NEXT: v_alignbit_b32 v1, v20, v22, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v5, v25, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v18, 16 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v13, v39, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v27, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v11, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v3, v12, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v9, v10, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v63, v8, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v61, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v57, v6, 16 +; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v46, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v44, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: .LBB96_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v25 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: buffer_store_dword 
v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 
v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v64i16: ; VI: ; %bb.0: @@ -101272,7 +205009,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: s_cbranch_execz .LBB96_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -101754,9 +205491,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_4 +; 
VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload @@ -102143,7 +205880,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v39, 0x300, v48 ; VI-NEXT: v_or_b32_e32 v21, v39, v21 ; VI-NEXT: v_or_b32_e32 v31, v31, v54 -; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -102519,7 +206256,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload @@ -103002,9 +206739,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: .LBB96_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: s_cbranch_execz .LBB96_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload @@ -103395,7 +207132,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> 
%a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v26, v37, v26, s6 ; GFX9-NEXT: v_perm_b32 v27, v36, v27, s6 ; GFX9-NEXT: v_perm_b32 v28, v35, v28, s6 -; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: .LBB96_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -103640,15 +207377,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB48_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB96_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB48_4 -; GFX11-TRUE16-NEXT: .LBB48_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB96_4 +; GFX11-TRUE16-NEXT: .LBB96_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB48_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB96_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l @@ -103907,8 +207644,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 -; GFX11-TRUE16-NEXT: .LBB48_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_2 +; GFX11-TRUE16-NEXT: .LBB96_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3 @@ -104182,2316 
+207919,19162 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16: +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v88, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 
:: v_dual_mov_b32 v39, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 
+; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112 
+; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v147, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, 
v72 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v104, 8, v94 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v1, v1, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v34 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v9, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v11, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v91 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v24, v24, v104 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v95 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v105 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v94 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v108 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v107 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v110 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v109 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v23, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v31, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v121 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v123 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v120 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v125 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v124 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v126 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v127 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v32, v31, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v34, v33, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v36, v35, 0x5040100 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: 
; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 +; GFX11-FAKE16-NEXT: .LBB96_2: ; %Flow +; GFX11-FAKE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v35, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v126, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v127, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v125, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v120, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 
v101, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v94, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 +; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v22, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 +; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v97, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v58, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v40, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v54, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v52, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v53, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v50, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v49, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v49, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v48, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v32, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v39, 0x300, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 +; GFX11-FAKE16-NEXT: .LBB96_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:444 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v105, off, s32 offset:448 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:452 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:460 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:464 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:468 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:476 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:480 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:484 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:492 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:496 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:500 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:532 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:540 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:544 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:548 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:552 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:556 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:560 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:564 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:568 
+; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:572 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:576 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v128i8_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, 
s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 
offset:232 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:196 +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v62, s28, 0 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v62, s27, 1 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v62, s25, 2 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v62, s24, 3 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v62, s23, 4 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v62, s22, 5 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 v62, s21, 6 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_writelane_b32 v62, s20, 7 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v62, s19, 8 +; SI-NEXT: v_writelane_b32 
v63, s83, 27 +; SI-NEXT: v_writelane_b32 v62, s18, 9 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_writelane_b32 v62, s16, 10 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v29, v5 +; SI-NEXT: v_readfirstlane_b32 s76, v18 +; SI-NEXT: v_readfirstlane_b32 s40, v25 +; SI-NEXT: v_readfirstlane_b32 s16, v24 +; SI-NEXT: v_readfirstlane_b32 s42, v23 +; SI-NEXT: v_readfirstlane_b32 s52, v20 +; SI-NEXT: v_readfirstlane_b32 s8, v19 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s50, v35 +; SI-NEXT: v_readfirstlane_b32 s31, v36 +; SI-NEXT: v_readfirstlane_b32 s53, v37 +; SI-NEXT: v_readfirstlane_b32 s82, v48 +; SI-NEXT: v_readfirstlane_b32 s7, v49 +; SI-NEXT: v_readfirstlane_b32 s79, v52 +; SI-NEXT: v_readfirstlane_b32 s78, v55 +; SI-NEXT: v_readfirstlane_b32 s4, v41 +; SI-NEXT: v_writelane_b32 v62, s4, 11 +; SI-NEXT: v_readfirstlane_b32 s4, v43 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:136 +; SI-NEXT: v_readfirstlane_b32 s4, v45 +; SI-NEXT: v_writelane_b32 v62, s4, 13 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: 
v_readfirstlane_b32 s4, v56 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_readfirstlane_b32 s4, v57 +; SI-NEXT: v_writelane_b32 v62, s4, 15 +; SI-NEXT: v_readfirstlane_b32 s4, v60 +; SI-NEXT: v_readfirstlane_b32 s86, v31 +; SI-NEXT: v_readfirstlane_b32 s36, v32 +; SI-NEXT: v_readfirstlane_b32 s71, v33 +; SI-NEXT: v_readfirstlane_b32 s77, v59 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v61 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: v_readfirstlane_b32 s98, v50 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s67, v53 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: v_writelane_b32 v62, s4, 17 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v62, s4, 18 +; SI-NEXT: v_readfirstlane_b32 s81, v34 +; SI-NEXT: v_readfirstlane_b32 s75, v39 +; SI-NEXT: v_readfirstlane_b32 s68, v42 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 +; SI-NEXT: v_readfirstlane_b32 s49, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 +; SI-NEXT: v_readfirstlane_b32 s51, v54 +; SI-NEXT: v_readfirstlane_b32 s97, v51 +; SI-NEXT: v_readfirstlane_b32 s35, v27 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; 
SI-NEXT: v_readfirstlane_b32 s28, v28 +; SI-NEXT: v_readfirstlane_b32 s87, v26 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_readfirstlane_b32 s96, v17 +; SI-NEXT: v_readfirstlane_b32 s99, v16 +; SI-NEXT: v_readfirstlane_b32 s89, v15 +; SI-NEXT: v_readfirstlane_b32 s88, v12 +; SI-NEXT: v_readfirstlane_b32 s30, v11 +; SI-NEXT: v_readfirstlane_b32 s64, v10 +; SI-NEXT: v_readfirstlane_b32 s55, v9 +; SI-NEXT: v_readfirstlane_b32 s65, v8 +; SI-NEXT: v_readfirstlane_b32 s80, v7 +; SI-NEXT: v_readfirstlane_b32 s21, v2 +; SI-NEXT: v_readfirstlane_b32 s74, v1 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 +; SI-NEXT: v_readfirstlane_b32 s93, v36 +; SI-NEXT: v_readfirstlane_b32 s24, v37 +; SI-NEXT: v_readfirstlane_b32 s27, v48 +; SI-NEXT: v_readfirstlane_b32 s84, v43 +; SI-NEXT: v_readfirstlane_b32 s83, v44 +; SI-NEXT: v_readfirstlane_b32 s85, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s20, v47 +; SI-NEXT: v_readfirstlane_b32 s4, v58 +; SI-NEXT: v_writelane_b32 v62, s4, 19 +; SI-NEXT: v_readfirstlane_b32 s23, v49 +; SI-NEXT: v_readfirstlane_b32 s92, v52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 +; SI-NEXT: v_readfirstlane_b32 s90, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 
+; SI-NEXT: v_readfirstlane_b32 s38, v32 +; SI-NEXT: v_readfirstlane_b32 s70, v33 +; SI-NEXT: v_readfirstlane_b32 s54, v59 +; SI-NEXT: v_readfirstlane_b32 s57, v60 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s56, v61 +; SI-NEXT: v_readfirstlane_b32 s59, v55 +; SI-NEXT: v_readfirstlane_b32 s61, v41 +; SI-NEXT: v_readfirstlane_b32 s19, v45 +; SI-NEXT: v_readfirstlane_b32 s34, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 +; SI-NEXT: v_writelane_b32 v62, s4, 20 +; SI-NEXT: v_readfirstlane_b32 s25, v53 +; SI-NEXT: v_readfirstlane_b32 s91, v40 +; SI-NEXT: v_readfirstlane_b32 s37, v34 +; SI-NEXT: v_readfirstlane_b32 s47, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s46, v57 +; SI-NEXT: v_readfirstlane_b32 s22, v42 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v22 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v5 +; SI-NEXT: v_readfirstlane_b32 s72, v39 +; SI-NEXT: v_readfirstlane_b32 s94, v51 +; SI-NEXT: v_readfirstlane_b32 s48, v54 +; SI-NEXT: v_readfirstlane_b32 s66, v43 +; SI-NEXT: v_readfirstlane_b32 s69, v44 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s45, v46 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_writelane_b32 v62, s4, 21 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: v_writelane_b32 v62, s4, 22 +; SI-NEXT: v_writelane_b32 v62, s17, 23 +; SI-NEXT: v_writelane_b32 v62, s40, 24 +; SI-NEXT: v_writelane_b32 v62, s16, 25 +; SI-NEXT: 
v_writelane_b32 v62, s42, 26 +; SI-NEXT: v_writelane_b32 v62, s46, 27 +; SI-NEXT: v_writelane_b32 v62, s47, 28 +; SI-NEXT: v_writelane_b32 v62, s56, 29 +; SI-NEXT: v_writelane_b32 v62, s57, 30 +; SI-NEXT: v_writelane_b32 v62, s45, 31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s58, v52 +; SI-NEXT: v_writelane_b32 v62, s49, 32 +; SI-NEXT: v_writelane_b32 v62, s58, 33 +; SI-NEXT: v_writelane_b32 v62, s59, 34 +; SI-NEXT: v_writelane_b32 v62, s52, 35 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s60, v38 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_writelane_b32 v62, s60, 36 +; SI-NEXT: v_writelane_b32 v62, s61, 37 +; SI-NEXT: v_writelane_b32 v62, s93, 38 +; SI-NEXT: v_writelane_b32 v62, s8, 39 +; SI-NEXT: v_readfirstlane_b32 s62, v47 +; SI-NEXT: v_writelane_b32 v62, s72, 40 +; SI-NEXT: v_readfirstlane_b32 s73, v58 +; SI-NEXT: v_writelane_b32 v62, s62, 41 +; SI-NEXT: v_writelane_b32 v62, s73, 42 +; SI-NEXT: v_writelane_b32 v62, s35, 43 +; SI-NEXT: v_writelane_b32 v62, s94, 44 +; SI-NEXT: v_writelane_b32 v62, s48, 45 +; SI-NEXT: v_writelane_b32 v62, s91, 46 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v60 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v31 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v59 +; 
SI-NEXT: v_lshlrev_b32_e32 v27, 24, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v50 +; SI-NEXT: v_writelane_b32 v62, s66, 47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v18 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB97_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s4, v62, 7 +; SI-NEXT: v_readlane_b32 s5, v62, 6 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_writelane_b32 v62, s4, 48 +; SI-NEXT: v_readlane_b32 s4, v62, 5 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s5, v62, 4 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_or_b32 s63, s5, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 9 +; SI-NEXT: s_and_b32 s5, s4, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 8 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s4, 24 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: s_or_b32 
s9, s9, s5 +; SI-NEXT: s_and_b32 s5, s4, 0xff +; SI-NEXT: s_lshl_b32 s10, s29, 8 +; SI-NEXT: s_or_b32 s4, s5, s10 +; SI-NEXT: v_writelane_b32 v62, s4, 49 +; SI-NEXT: s_and_b32 s5, s74, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_writelane_b32 v62, s54, 50 +; SI-NEXT: s_lshl_b32 s11, s21, 24 +; SI-NEXT: s_mov_b32 s18, s22 +; SI-NEXT: s_mov_b32 s22, s21 +; SI-NEXT: s_or_b32 s21, s11, s5 +; SI-NEXT: s_and_b32 s11, s26, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 1 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s4, 24 +; SI-NEXT: s_or_b32 s14, s12, s11 +; SI-NEXT: s_and_b32 s11, s80, 0xff +; SI-NEXT: s_lshl_b32 s12, s65, 8 +; SI-NEXT: s_or_b32 s12, s11, s12 +; SI-NEXT: s_and_b32 s11, s55, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s13, s64, 24 +; SI-NEXT: s_or_b32 s41, s13, s11 +; SI-NEXT: s_and_b32 s11, s89, 0xff +; SI-NEXT: s_lshl_b32 s13, s99, 8 +; SI-NEXT: s_or_b32 s13, s11, s13 +; SI-NEXT: s_and_b32 s11, s96, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s76, 24 +; SI-NEXT: s_or_b32 s43, s15, s11 +; SI-NEXT: s_and_b32 s11, s42, 0xff +; SI-NEXT: s_lshl_b32 s15, s16, 8 +; SI-NEXT: s_or_b32 s16, s11, s15 +; SI-NEXT: s_and_b32 s11, s40, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s87, 24 +; SI-NEXT: s_or_b32 s44, s15, s11 +; SI-NEXT: s_and_b32 s11, s73, 0xff +; SI-NEXT: s_lshl_b32 s15, s62, 8 +; SI-NEXT: s_or_b32 s62, s11, s15 +; SI-NEXT: s_and_b32 s11, s58, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s45, 24 +; SI-NEXT: s_or_b32 s45, s15, s11 +; SI-NEXT: s_and_b32 s11, s48, 0xff +; SI-NEXT: s_lshl_b32 s15, s94, 8 +; SI-NEXT: s_or_b32 s10, s11, s15 +; SI-NEXT: s_and_b32 s11, s46, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s47, 24 +; SI-NEXT: s_or_b32 s46, s15, s11 +; SI-NEXT: s_and_b32 s11, s25, 0xff +; SI-NEXT: s_lshl_b32 s15, s34, 8 +; SI-NEXT: s_or_b32 s94, s11, s15 +; SI-NEXT: s_and_b32 s11, s72, 0xff +; SI-NEXT: s_lshl_b32 
s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s60, 24 +; SI-NEXT: s_or_b32 s47, s15, s11 +; SI-NEXT: s_and_b32 s11, s61, 0xff +; SI-NEXT: s_lshl_b32 s15, s59, 8 +; SI-NEXT: s_or_b32 s73, s11, s15 +; SI-NEXT: s_and_b32 s11, s56, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s57, 24 +; SI-NEXT: v_readlane_b32 s4, v62, 20 +; SI-NEXT: s_or_b32 s56, s15, s11 +; SI-NEXT: s_and_b32 s11, s38, 0xff +; SI-NEXT: s_lshl_b32 s15, s4, 8 +; SI-NEXT: s_or_b32 s48, s11, s15 +; SI-NEXT: s_and_b32 s11, s92, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s23, 24 +; SI-NEXT: s_or_b32 vcc_lo, s15, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xff +; SI-NEXT: s_lshl_b32 s15, s85, 8 +; SI-NEXT: s_or_b32 s72, s11, s15 +; SI-NEXT: s_and_b32 s11, s83, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s84, 24 +; SI-NEXT: s_or_b32 vcc_hi, s15, s11 +; SI-NEXT: s_and_b32 s11, s93, 0xff +; SI-NEXT: s_lshl_b32 s15, s97, 8 +; SI-NEXT: s_or_b32 s57, s11, s15 +; SI-NEXT: s_and_b32 s11, s67, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s51, 24 +; SI-NEXT: v_writelane_b32 v62, s67, 51 +; SI-NEXT: s_mov_b32 s67, s51 +; SI-NEXT: s_mov_b32 s51, s74 +; SI-NEXT: s_or_b32 s74, s15, s11 +; SI-NEXT: s_and_b32 s11, s98, 0xff +; SI-NEXT: s_lshl_b32 s15, s75, 8 +; SI-NEXT: v_readlane_b32 s4, v62, 18 +; SI-NEXT: v_writelane_b32 v62, s87, 52 +; SI-NEXT: s_or_b32 s58, s11, s15 +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: v_writelane_b32 v62, s25, 53 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s81, 24 +; SI-NEXT: v_readlane_b32 s4, v62, 15 +; SI-NEXT: s_mov_b32 s54, s75 +; SI-NEXT: s_or_b32 s75, s15, s11 +; SI-NEXT: s_and_b32 s11, s77, 0xff +; SI-NEXT: s_lshl_b32 s15, s4, 8 +; SI-NEXT: v_readlane_b32 s4, v62, 14 +; SI-NEXT: s_or_b32 s59, s11, s15 +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: v_readlane_b32 s4, v62, 13 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s4, 24 +; SI-NEXT: v_readlane_b32 s4, 
v62, 12 +; SI-NEXT: s_mov_b32 s95, s69 +; SI-NEXT: s_mov_b32 s69, s76 +; SI-NEXT: s_or_b32 s76, s15, s11 +; SI-NEXT: s_and_b32 s11, s86, 0xff +; SI-NEXT: s_lshl_b32 s15, s4, 8 +; SI-NEXT: v_readlane_b32 s4, v62, 11 +; SI-NEXT: s_or_b32 s60, s11, s15 +; SI-NEXT: s_and_b32 s11, s4, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s78, 24 +; SI-NEXT: s_mov_b32 s93, s99 +; SI-NEXT: s_mov_b32 s99, s84 +; SI-NEXT: s_mov_b32 s84, s77 +; SI-NEXT: s_or_b32 s77, s15, s11 +; SI-NEXT: s_and_b32 s11, s82, 0xff +; SI-NEXT: s_lshl_b32 s15, s53, 8 +; SI-NEXT: s_or_b32 s61, s11, s15 +; SI-NEXT: s_and_b32 s11, s31, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s15, s50, 24 +; SI-NEXT: s_mov_b32 s4, s85 +; SI-NEXT: s_mov_b32 s85, s83 +; SI-NEXT: s_mov_b32 s83, s82 +; SI-NEXT: s_mov_b32 s82, s53 +; SI-NEXT: s_mov_b32 s53, s50 +; SI-NEXT: s_mov_b32 s50, s31 +; SI-NEXT: s_mov_b32 s31, s78 +; SI-NEXT: s_or_b32 s78, s15, s11 +; SI-NEXT: v_readlane_b32 s11, v62, 10 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s15, s17, 8 +; SI-NEXT: s_or_b32 s11, s11, s15 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_mov_b32_e32 v51, s9 +; SI-NEXT: s_or_b32 s6, s11, s9 +; SI-NEXT: v_readlane_b32 s9, v62, 3 +; SI-NEXT: v_readlane_b32 s11, v62, 2 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s15, s11, 8 +; SI-NEXT: s_or_b32 s9, s9, s15 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s17, s9, s14 +; SI-NEXT: v_readlane_b32 s9, v62, 22 +; SI-NEXT: v_mov_b32_e32 v52, s14 +; SI-NEXT: s_and_b32 s14, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 21 +; SI-NEXT: s_lshl_b32 s15, s9, 8 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v53, v6, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v50, s14, v53 +; SI-NEXT: s_and_b32 s14, s30, 0xff +; SI-NEXT: s_lshl_b32 s15, s88, 8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: s_or_b32 s14, 
s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v54, v14, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v17, s14, v54 +; SI-NEXT: s_and_b32 s14, s8, 0xff +; SI-NEXT: s_lshl_b32 s15, s52, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v55, v40, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v16, s14, v55 +; SI-NEXT: s_and_b32 s14, s35, 0xff +; SI-NEXT: s_lshl_b32 s15, s28, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_or_b32_e32 v40, v42, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v15, s14, v40 +; SI-NEXT: s_and_b32 s14, s95, 0xff +; SI-NEXT: s_lshl_b32 s15, s66, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v41, v61, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v12, s14, v41 +; SI-NEXT: s_and_b32 s14, s18, 0xff +; SI-NEXT: s_lshl_b32 s15, s91, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v57 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v42, v60, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v11, s14, v42 +; SI-NEXT: s_and_b32 s14, s37, 0xff +; SI-NEXT: s_lshl_b32 s15, s19, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_mov_b32 s91, s6 +; SI-NEXT: v_or_b32_e32 v59, v31, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 50 +; SI-NEXT: v_or_b32_e32 v10, s14, v59 +; SI-NEXT: s_and_b32 s14, s6, 0xff +; SI-NEXT: s_lshl_b32 s15, s70, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; 
SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v5, v24, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 19 +; SI-NEXT: v_or_b32_e32 v9, s14, v5 +; SI-NEXT: s_and_b32 s14, s90, 0xff +; SI-NEXT: s_lshl_b32 s15, s6, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v13, v25, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v8, s14, v13 +; SI-NEXT: s_and_b32 s14, s27, 0xff +; SI-NEXT: s_lshl_b32 s15, s24, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v26, v31 +; SI-NEXT: v_or_b32_e32 v31, v27, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v7, s14, v31 +; SI-NEXT: s_and_b32 s14, s49, 0xff +; SI-NEXT: s_lshl_b32 s15, s68, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v25, v60 +; SI-NEXT: v_or_b32_e32 v60, v43, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 17 +; SI-NEXT: v_or_b32_e32 v4, s14, v60 +; SI-NEXT: s_and_b32 s14, s6, 0xff +; SI-NEXT: v_readlane_b32 s6, v62, 16 +; SI-NEXT: s_lshl_b32 s15, s6, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v23, v27 +; SI-NEXT: v_mov_b32_e32 v27, v24 +; SI-NEXT: v_mov_b32_e32 v24, v61 +; SI-NEXT: v_or_b32_e32 v61, v44, v1 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: v_or_b32_e32 v2, s14, v61 +; SI-NEXT: s_and_b32 s14, s71, 0xff +; SI-NEXT: s_lshl_b32 s15, s36, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_readlane_b32 s6, v62, 48 +; SI-NEXT: v_or_b32_e32 v6, v45, v1 +; SI-NEXT: 
s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_and_b32 s8, s6, 0xffff +; SI-NEXT: v_readlane_b32 s6, v62, 49 +; SI-NEXT: v_or_b32_e32 v1, s14, v6 +; SI-NEXT: s_and_b32 s14, s79, 0xff +; SI-NEXT: s_lshl_b32 s15, s7, 8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: s_or_b32 s42, s8, s63 +; SI-NEXT: s_and_b32 s8, s6, 0xffff +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s40, s8, s21 +; SI-NEXT: s_and_b32 s8, s12, 0xffff +; SI-NEXT: v_or_b32_e32 v14, v46, v3 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s8, s41 +; SI-NEXT: s_and_b32 s8, s13, 0xffff +; SI-NEXT: v_or_b32_e32 v3, s14, v14 +; SI-NEXT: s_or_b32 s14, s8, s43 +; SI-NEXT: s_and_b32 s8, s16, 0xffff +; SI-NEXT: s_and_b32 s16, s73, 0xffff +; SI-NEXT: s_or_b32 s13, s8, s44 +; SI-NEXT: s_and_b32 s8, s62, 0xffff +; SI-NEXT: s_or_b32 s35, s16, s56 +; SI-NEXT: s_and_b32 s16, s48, 0xffff +; SI-NEXT: s_or_b32 s12, s8, s45 +; SI-NEXT: s_and_b32 s8, s10, 0xffff +; SI-NEXT: s_or_b32 s52, s16, vcc_lo +; SI-NEXT: s_and_b32 s16, s72, 0xffff +; SI-NEXT: s_or_b32 s10, s8, s46 +; SI-NEXT: s_and_b32 s8, s94, 0xffff +; SI-NEXT: s_or_b32 s94, s16, vcc_hi +; SI-NEXT: s_and_b32 s16, s57, 0xffff +; SI-NEXT: s_or_b32 s49, s16, s74 +; SI-NEXT: s_and_b32 s16, s58, 0xffff +; SI-NEXT: s_or_b32 s48, s16, s75 +; SI-NEXT: s_and_b32 s16, s59, 0xffff +; SI-NEXT: s_mov_b32 s25, s23 +; SI-NEXT: s_or_b32 s11, s16, s76 +; SI-NEXT: s_and_b32 s16, s60, 0xffff +; SI-NEXT: s_and_b32 s23, s61, 0xffff +; SI-NEXT: s_mov_b32 s87, s34 +; SI-NEXT: s_mov_b32 s34, s55 +; SI-NEXT: s_mov_b32 s55, s22 +; SI-NEXT: s_or_b32 s8, s8, s47 +; SI-NEXT: s_or_b32 s9, s16, s77 +; SI-NEXT: s_or_b32 s16, s23, s78 +; SI-NEXT: s_mov_b32 s22, s18 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_mov_b32_e32 v18, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v44 +; SI-NEXT: v_mov_b32_e32 v35, v45 +; SI-NEXT: v_mov_b32_e32 
v22, v46 +; SI-NEXT: v_alignbit_b32 v57, s42, v51, 16 +; SI-NEXT: v_alignbit_b32 v58, s40, v52, 16 +; SI-NEXT: v_alignbit_b32 v56, s15, v53, 16 +; SI-NEXT: v_alignbit_b32 v47, s14, v54, 16 +; SI-NEXT: v_alignbit_b32 v46, s13, v55, 16 +; SI-NEXT: v_alignbit_b32 v45, s12, v40, 16 +; SI-NEXT: v_alignbit_b32 v44, s10, v41, 16 +; SI-NEXT: v_alignbit_b32 v43, s8, v42, 16 +; SI-NEXT: v_alignbit_b32 v42, s35, v59, 16 +; SI-NEXT: v_alignbit_b32 v41, s52, v5, 16 +; SI-NEXT: v_alignbit_b32 v40, s94, v13, 16 +; SI-NEXT: v_alignbit_b32 v55, s49, v31, 16 +; SI-NEXT: v_mov_b32_e32 v31, v26 +; SI-NEXT: v_alignbit_b32 v54, s48, v60, 16 +; SI-NEXT: v_mov_b32_e32 v60, v25 +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: v_alignbit_b32 v53, s11, v61, 16 +; SI-NEXT: v_mov_b32_e32 v61, v24 +; SI-NEXT: v_mov_b32_e32 v24, v27 +; SI-NEXT: v_alignbit_b32 v52, s9, v6, 16 +; SI-NEXT: v_alignbit_b32 v51, s16, v14, 16 +; SI-NEXT: s_lshr_b32 s73, s63, 16 +; SI-NEXT: s_lshr_b32 s72, s21, 16 +; SI-NEXT: s_lshr_b32 s63, s41, 16 +; SI-NEXT: s_lshr_b32 s62, s43, 16 +; SI-NEXT: s_lshr_b32 s61, s44, 16 +; SI-NEXT: s_lshr_b32 s60, s45, 16 +; SI-NEXT: s_lshr_b32 s59, s46, 16 +; SI-NEXT: s_lshr_b32 s58, s47, 16 +; SI-NEXT: s_lshr_b32 s57, s56, 16 +; SI-NEXT: s_lshr_b32 s56, vcc_lo, 16 +; SI-NEXT: s_lshr_b32 s47, vcc_hi, 16 +; SI-NEXT: s_lshr_b32 s46, s74, 16 +; SI-NEXT: s_mov_b32 s74, s51 +; SI-NEXT: s_mov_b32 s51, s67 +; SI-NEXT: v_readlane_b32 s67, v62, 51 +; SI-NEXT: s_lshr_b32 s45, s75, 16 +; SI-NEXT: s_mov_b32 s23, s25 +; SI-NEXT: s_mov_b32 s21, s55 +; SI-NEXT: s_mov_b32 s55, s34 +; SI-NEXT: s_mov_b32 s75, s54 +; SI-NEXT: s_mov_b32 s34, s87 +; SI-NEXT: v_readlane_b32 s25, v62, 53 +; SI-NEXT: v_readlane_b32 s87, v62, 52 +; SI-NEXT: s_lshr_b32 s44, s76, 16 +; SI-NEXT: v_readlane_b32 s54, v62, 50 +; SI-NEXT: s_lshr_b32 s43, s77, 16 +; SI-NEXT: s_mov_b32 s76, s69 +; SI-NEXT: s_mov_b32 s69, s95 +; SI-NEXT: s_mov_b32 s77, s84 +; SI-NEXT: s_mov_b32 s84, s99 +; SI-NEXT: s_mov_b32 s99, s93 +; SI-NEXT: 
s_lshr_b32 s41, s78, 16 +; SI-NEXT: s_mov_b32 s78, s31 +; SI-NEXT: s_mov_b32 s31, s50 +; SI-NEXT: s_mov_b32 s50, s53 +; SI-NEXT: s_mov_b32 s53, s82 +; SI-NEXT: s_mov_b32 s82, s83 +; SI-NEXT: s_mov_b32 s83, s85 +; SI-NEXT: s_mov_b32 s85, s4 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v6, v20 +; SI-NEXT: v_mov_b32_e32 v13, v19 +; SI-NEXT: v_mov_b32_e32 v14, v29 +; SI-NEXT: s_branch .LBB97_3 +; SI-NEXT: .LBB97_2: +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v46 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v45 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v44 +; SI-NEXT: v_mov_b32_e32 v18, v43 +; SI-NEXT: v_mov_b32_e32 v23, v27 +; SI-NEXT: v_mov_b32_e32 v21, v29 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr60 +; 
SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: .LBB97_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v5, v39 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB97_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s79, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, 
v1 +; SI-NEXT: s_add_i32 s4, s82, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s53, 8 +; SI-NEXT: s_add_i32 s8, s31, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s50, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s71, 0xff +; SI-NEXT: s_lshl_b32 s8, s36, 8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_readlane_b32 s7, v62, 12 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: s_add_i32 s36, s86, 3 +; SI-NEXT: s_lshl_b32 s8, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v62, 11 +; SI-NEXT: v_or_b32_e32 v2, s5, v2 +; SI-NEXT: s_and_b32 s5, s36, 0xff +; SI-NEXT: s_add_i32 s9, s7, 3 +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s8, s78, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_add_i32 s16, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 17 +; SI-NEXT: s_add_i32 s9, s5, 0x3000000 +; SI-NEXT: s_add_i32 s79, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 16 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v2 +; SI-NEXT: s_and_b32 s4, s79, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v5 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_add_i32 s4, s77, 3 +; SI-NEXT: 
v_readlane_b32 s5, v62, 15 +; SI-NEXT: v_readlane_b32 s6, v62, 14 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s8, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 13 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s11, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 32 +; SI-NEXT: s_add_i32 s53, s4, 3 +; SI-NEXT: s_and_b32 s4, s53, 0xff +; SI-NEXT: s_lshl_b32 s5, s68, 8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v38 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_add_i32 s93, s98, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 18 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s93, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_add_i32 s8, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s81, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s86, s27, 3 +; SI-NEXT: s_add_i32 s48, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s86, 0xff +; SI-NEXT: s_lshl_b32 s5, s24, 8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v37 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v5, v23, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_readlane_b32 s4, v62, 38 +; SI-NEXT: s_add_i32 s68, s4, 3 +; SI-NEXT: s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 s5, s97, 8 +; SI-NEXT: 
s_add_i32 s8, s67, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s51, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s50, s90, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 19 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v5 +; SI-NEXT: s_add_i32 s49, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s50, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: s_add_i32 s94, s20, 3 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s94, 0xff +; SI-NEXT: s_lshl_b32 s5, s85, 8 +; SI-NEXT: s_add_i32 s8, s83, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s84, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s54, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v5 +; SI-NEXT: s_add_i32 s94, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v49 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v5, v24, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s98, s38, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 20 +; SI-NEXT: s_and_b32 s4, s98, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; 
SI-NEXT: s_add_i32 s8, s92, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s17, s37, 3 +; SI-NEXT: s_add_i32 s52, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_readlane_b32 s5, v62, 34 +; SI-NEXT: v_readlane_b32 s6, v62, 29 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s38, s6, 3 +; SI-NEXT: s_and_b32 s8, s38, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s85, s25, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 40 +; SI-NEXT: s_add_i32 s70, s6, 3 +; SI-NEXT: s_and_b32 s7, s70, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s19, s69, 3 +; SI-NEXT: s_add_i32 s51, s30, 3 +; SI-NEXT: s_add_i32 s95, s89, 3 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v39, s9 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_mov_b32_e32 v28, s11 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4 +; SI-NEXT: v_mov_b32_e32 v27, s48 +; SI-NEXT: v_mov_b32_e32 v26, s49 +; SI-NEXT: v_mov_b32_e32 v25, s94 +; SI-NEXT: v_mov_b32_e32 v24, s52 +; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16 +; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16 +; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v28, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v39, v1, 16 +; SI-NEXT: v_alignbit_b32 v51, v30, v3, 16 +; SI-NEXT: s_lshr_b32 s56, s52, 16 +; SI-NEXT: s_lshr_b32 s47, s94, 16 +; SI-NEXT: s_lshr_b32 s46, s49, 16 +; SI-NEXT: s_lshr_b32 s45, s48, 16 +; SI-NEXT: s_lshr_b32 s44, s11, 16 +; SI-NEXT: s_lshr_b32 s43, s9, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v31, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s4, v62, 37 +; SI-NEXT: s_add_i32 s67, s4, 3 +; SI-NEXT: s_and_b32 s4, s67, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 30 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 46 +; SI-NEXT: s_add_i32 s35, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: v_mov_b32_e32 v23, s35 +; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16 +; SI-NEXT: s_lshr_b32 s57, s35, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 36 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 47 +; SI-NEXT: s_add_i32 s8, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_readlane_b32 
s5, v62, 44 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_mov_b32_e32 v22, s8 +; SI-NEXT: v_alignbit_b32 v43, v22, v11, 16 +; SI-NEXT: s_lshr_b32 s58, s8, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s4, v62, 45 +; SI-NEXT: s_add_i32 s6, s4, 3 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: v_readlane_b32 s6, v62, 27 +; SI-NEXT: s_add_i32 s34, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 28 +; SI-NEXT: s_and_b32 s6, s34, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s10, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 43 +; SI-NEXT: s_add_i32 s97, s4, 3 +; SI-NEXT: s_and_b32 s4, s97, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_readlane_b32 s5, v62, 41 +; SI-NEXT: v_readlane_b32 s6, v62, 33 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s54, s6, 3 +; SI-NEXT: s_and_b32 s6, s54, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_alignbit_b32 v44, v20, v12, 16 +; SI-NEXT: s_lshr_b32 s59, s10, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v14, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 
offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s4, v62, 42 +; SI-NEXT: s_add_i32 s81, s4, 3 +; SI-NEXT: s_and_b32 s4, s81, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 31 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s12, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 39 +; SI-NEXT: s_add_i32 s69, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 35 +; SI-NEXT: s_and_b32 s4, s69, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_readlane_b32 s5, v62, 25 +; SI-NEXT: v_readlane_b32 s6, v62, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s92, s6, 3 +; SI-NEXT: s_and_b32 s6, s92, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_mov_b32_e32 v19, s12 +; SI-NEXT: v_alignbit_b32 v45, v19, v15, 16 +; SI-NEXT: s_lshr_b32 s60, s12, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v13, v5 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_readlane_b32 s4, v62, 26 +; SI-NEXT: s_add_i32 s31, s4, 3 +; SI-NEXT: s_and_b32 s4, s31, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s87, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s13, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s51, 0xff +; SI-NEXT: s_lshl_b32 s5, s88, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; 
SI-NEXT: s_lshl_b32 s5, s99, 8 +; SI-NEXT: s_add_i32 s6, s96, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v18, s13 +; SI-NEXT: v_alignbit_b32 v46, v18, v16, 16 +; SI-NEXT: s_lshr_b32 s61, s13, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s95, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s76, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s14, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 22 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 21 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: s_add_i32 s6, s55, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v5 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_alignbit_b32 v47, v5, v17, 16 +; SI-NEXT: s_lshr_b32 s62, s14, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_add_i32 s4, s80, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s64, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s15, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 2 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: 
s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 1 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s17, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s6, s74, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s21, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s40, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 10 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 23 +; SI-NEXT: v_readlane_b32 s6, v62, 9 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 8 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s91, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v62, 7 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 6 +; SI-NEXT: v_readlane_b32 s6, v62, 5 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, 
s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s42, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v13, s91 +; SI-NEXT: v_add_i32_e32 v50, vcc, 0x3000000, v6 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_alignbit_b32 v57, s42, v13, 16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_alignbit_b32 v58, s40, v13, 16 +; SI-NEXT: v_alignbit_b32 v56, v6, v50, 16 +; SI-NEXT: s_lshr_b32 s73, s42, 16 +; SI-NEXT: s_lshr_b32 s72, s40, 16 +; SI-NEXT: s_lshr_b32 s63, s15, 16 +; SI-NEXT: .LBB97_5: ; %end +; SI-NEXT: s_and_b32 s4, s91, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v58 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v56 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; 
SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v45 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 
v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v43 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v42 +; SI-NEXT: s_and_b32 s4, s35, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v41 +; SI-NEXT: s_and_b32 s4, s52, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 +; SI-NEXT: s_and_b32 s4, s94, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v55 +; SI-NEXT: s_and_b32 s4, s49, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; SI-NEXT: s_and_b32 s4, s48, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 
s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: 
v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 +; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: 
s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt 
vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v46, off, 
s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, 
off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 
offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB97_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt 
vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, v8 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte 
Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v32, v1 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v34, v0 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v37, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v49, v1 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v35, v1 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v56, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v38, v0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v52, v0 +; 
VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v63, v0 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v57, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, 
v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: 
v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB97_3 +; VI-NEXT: .LBB97_2: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 +; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s5, s27, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s9, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshl_b32 s10, s17, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 +; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 +; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 +; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; 
VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 +; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v20, 
v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v1, vcc, 
3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 +; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 +; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 
v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 +; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 +; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: 
s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v9, v9, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 +; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 +; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 +; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: 
s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v44, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 +; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v47, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v47, s4, v47 +; VI-NEXT: s_and_b32 s4, s26, 0xff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: 
s_and_b32 s7, s20, 0xff +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s18, 0xff +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s16, 0xff +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v56 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_addk_i32 s9, 0x300 +; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v32, v16, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v32 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s8, s8, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v47 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 +; VI-NEXT: .LBB97_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 
offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB97_4: +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte 
Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v63, v3 +; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: v_mov_b32_e32 v54, v22 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB97_2 +; +; GFX9-LABEL: bitcast_v128i8_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, 
s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 
offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v43 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: 
s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 
offset:200 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort 
v13, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: 
s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; GFX9-NEXT: 
buffer_load_ushort v54, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; 
GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(36) +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte 
Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(41) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB97_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v39, v16 +; GFX9-NEXT: v_or_b32_sdwa v17, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 
4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v42, v61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v55, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v50, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v49, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 
v19, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_mov_b32_e32 v46, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v45 +; GFX9-NEXT: v_mov_b32_e32 v45, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v38, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: 
v_lshl_or_b32 v25, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v54, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_branch .LBB97_3 +; GFX9-NEXT: .LBB97_2: +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v45 +; GFX9-NEXT: buffer_load_dword v35, 
off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: .LBB97_3: ; %Flow +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: s_cbranch_vccnz .LBB97_5 +; GFX9-NEXT: ; %bb.4: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded 
Reload +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_lshl_b32 s6, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshl_b32 s9, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_lshl_b32 s10, s19, 8 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v25, v37, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v37, v51, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s16, 0xff +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_and_b32 s9, s18, 0xff +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, 
v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: 
v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v39, v36, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 
4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 
+; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v23, v41, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 +; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 
; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 +; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 +; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 +; GFX9-NEXT: v_add_u32_e32 v39, 0x300, v49 +; GFX9-NEXT: v_add_u32_e32 v49, 0x300, v53 +; GFX9-NEXT: v_add_u32_e32 v50, 0x300, v55 +; GFX9-NEXT: v_add_u32_e32 v53, 0x300, v45 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; 
GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 +; GFX9-NEXT: .LBB97_5: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:436 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, 
v42, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 
offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:192 +; 
GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v130, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64 +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v2, 0xffff, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v68 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 0xff, v98 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v148 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v20, 16, v21 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v147 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v149 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; 
GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v151 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v161 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v163 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v165 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v167 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v183 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v41 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v43 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v44 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v63 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v28, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v75 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v78 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v79 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v91 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v92 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_3 +; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; 
GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 
+; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v90 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v88 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v78 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v77 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v76 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v74 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v60 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v61 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v59 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v59, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v60, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v57 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v58 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v57, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v56 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v56, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v45 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v46 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v45, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v44 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v41 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v42 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v40 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, 
v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v180 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v177 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v167 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v167, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v176, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v165 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v166 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v165, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v164 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v164, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v163 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v161 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v162 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v161, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v160 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v149 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v150 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v148 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v70, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v69, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v5, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v64, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v53, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v49, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v38, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v37, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 8, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v5 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v5, 3, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v34, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v53, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v49, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, 
v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v165, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v161, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v14, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v59, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v35 +; GFX11-TRUE16-NEXT: .LBB97_3: ; %end +; GFX11-TRUE16-NEXT: s_clause 0x1d +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:320 +; GFX11-TRUE16-NEXT: 
scratch_load_b32 v92, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:332 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:340 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:376 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:380 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:384 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:392 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:396 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:400 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:404 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:408 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:412 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:416 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:420 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:424 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:428 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:432 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:436 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB97_4: +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB97_2 +; +; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v79, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_store_b32 
off, v61, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:516 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:472 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_store_b32 
off, v123, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:392 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:384 -; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:380 -; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 offset:376 -; GFX11-FAKE16-NEXT: scratch_load_u16 v134, off, s32 offset:372 -; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:368 -; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:364 -; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:360 -; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:356 -; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:352 -; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:348 -; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:344 -; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:340 -; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:336 -; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:332 -; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:328 -; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:324 -; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:320 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:316 -; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:312 -; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:308 -; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:304 -; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:300 -; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:296 -; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:292 -; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:260 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v150, off, s32 offset:388 -; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 
offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_u16 v30, 
off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240 ; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_u16 v42, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_u16 v177, off, s32 offset:164 -; 
GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:20 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v84, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v82, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v115, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v128, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v113, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v132, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v100, 8, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v161, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v160, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v176, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v167, 8, v27 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v127, 8, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v126, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v124, 8, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v125, 8, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v120, 8, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v123, 8, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v121, 8, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v122, 8, v14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v106, 8, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v111, 8, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v109, 8, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v110, 8, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v107, 8, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v88, 8, v88 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60) +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v182, 8, v45 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45) -; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v58, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v60, 8, v61 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v62, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v47, 8, v72 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v72, 8, v73 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v63, 8, v74 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v74, 8, v75 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v73, 8, v76 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v75, 8, v77 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v61, 8, v78 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v78, 8, v79 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v77, 8, v89 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v79, 8, v90 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v76, 8, v95 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v90, 8, v104 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v89, 8, v105 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v104, 8, v94 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v95, 8, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v105, 8, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v94, 8, v28 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v51 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v34 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 
v2, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v49 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v132 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v100 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v161 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v176 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v167 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v5, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v9, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v11, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 
0xff, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v135 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v40 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v43 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v182 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v46 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v57 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v56 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 -; 
GFX11-FAKE16-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v149 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v146 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v148 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v58 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v47 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v72 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v73 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v13, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v19, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v177 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v180 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v17, v75 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v61 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v78 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v77 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v79 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v76 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v90 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v92 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v91 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v20, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v22, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v68 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v67 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v104 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v95 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v105 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v94 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v108 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v107 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v110 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v109 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v23, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 
v26, v31, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v111 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v106 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v122 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v30, v121 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v123 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v120 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v125 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, v34, v124 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v35, v126 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v36, v127 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v30, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v32, v31, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v34, v33, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v36, v35, 0x5040100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB97_3 +; GFX11-FAKE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 
s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v135, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, 
v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v65, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v64, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 
v5, v7, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 +; GFX11-FAKE16-NEXT: .LBB97_3: ; %end +; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376 
+; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB97_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB97_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v64i16_to_v128i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; 
implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: 
$vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; 
implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; 
implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v47 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v46, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v45, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v44, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte 
Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v43, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v42, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v41, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v40, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v55, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v54, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v53, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v52, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v51, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v50, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v49, v2, v27 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v48, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v39, v2, v19 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v38, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v37, v2, v15 +; SI-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v36, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v35, v2, v29 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v34, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v33, v2, v21 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v31, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v32, v2, v17 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v26, v2, v25 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v30, v2, v13 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr13 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v18, v2, v11 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v22, v2, v9 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v14, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v10, v2, v5 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v6, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v16, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v24, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v28, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v60, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v58, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v59, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v57, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v63, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v56, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v61, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v62, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v47, 8, 8 +; SI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v44, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v44, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v44, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v42, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v42, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v42, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v40, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v40, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v40, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v51, v52, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v51, v52, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v51, v52, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v49, v50, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v49, v50, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v49, v50, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 24 +; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v26, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v26, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v22, v18, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v22, v18, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v22, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v10, v14, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v10, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v10, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v2, v6, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v2, v6, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v2, v6, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 
8, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; 
SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB98_2: ; %Flow +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], 
s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v12, v2, v6, 24 +; SI-NEXT: v_alignbit_b32 v20, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v47, v2, v6, 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v62, v22, v18, 24 +; SI-NEXT: v_alignbit_b32 v63, v22, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v4 +; SI-NEXT: v_alignbit_b32 v56, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v57, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v61, v10, v14, 8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; 
SI-NEXT: v_add_i32_e32 v43, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v1 +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v45, v46, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v44, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v44, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v44, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v42, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v42, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v41, v42, 8 +; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v40, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v40, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v55, v40, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v51, v52, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v51, v52, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v51, v52, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v49, v50, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v49, v50, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v49, v50, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v37, v38, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v35, v36, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 
v1, v32, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v26, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v26, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v26, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v22, v18, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49 
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; SI-NEXT: v_alignbit_b32 v4, v45, v46, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: .LBB98_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v45 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 
v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 
v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; 
SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) 
+; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 
0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 
v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: 
v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v128i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: 
v_lshrrev_b32_e32 v32, 16, v3 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; 
implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; 
implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; kill: killed $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], 
s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB98_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; 
VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 +; VI-NEXT: buffer_store_dword v31, 
off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16] +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[13:14] +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12] +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10] +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v31, v7 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v10 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v11 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v12 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v13 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v14 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: 
v_mov_b32_e32 v9, v16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v8 +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v7, v6 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 +; VI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v36 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v37 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 
; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; VI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v22 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v20 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v17 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v46 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17 +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: 
$vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: .LBB98_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB98_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v31, 3 +; VI-NEXT: v_add_u16_sdwa v51, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v32, 3, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v18, v32, v18 +; VI-NEXT: v_add_u16_e32 v32, 3, v17 +; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v17, v32, v17 +; VI-NEXT: v_add_u16_e32 v32, 3, v20 +; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v20, v32, v20 +; VI-NEXT: v_add_u16_e32 v32, 3, v19 +; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_add_u16_sdwa v48, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v19, v32, v19 +; VI-NEXT: v_add_u16_e32 v32, 3, v22 +; 
VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; VI-NEXT: v_add_u16_sdwa v53, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v22, v32, v22 +; VI-NEXT: v_add_u16_e32 v32, 3, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; VI-NEXT: v_add_u16_sdwa v61, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v21, v32, v21 +; VI-NEXT: v_add_u16_e32 v32, 3, v24 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v24, v32, v24 +; VI-NEXT: v_add_u16_e32 v32, 3, v23 +; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_add_u16_sdwa v58, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v23, v32, v23 +; VI-NEXT: v_add_u16_e32 v32, 3, v26 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v26, v32, v26 +; VI-NEXT: v_add_u16_e32 v32, 3, v25 +; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_add_u16_sdwa v39, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: 
v_or_b32_e32 v25, v32, v25 +; VI-NEXT: v_add_u16_e32 v32, 3, v28 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v28, v32, v28 +; VI-NEXT: v_add_u16_e32 v32, 3, v27 +; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_add_u16_sdwa v35, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: v_add_u16_e32 v33, 3, v30 +; VI-NEXT: v_add_u16_e32 v34, 3, v29 +; VI-NEXT: v_add_u16_sdwa v32, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 +; VI-NEXT: v_add_u16_sdwa v52, v37, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v30, v33, v29 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 +; VI-NEXT: v_add_u16_e32 v33, 3, v37 +; VI-NEXT: v_add_u16_sdwa v50, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v29, v34, v29 +; VI-NEXT: v_add_u16_e32 v34, 3, v36 +; VI-NEXT: v_or_b32_e32 v37, v33, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v50 +; VI-NEXT: v_add_u16_sdwa v57, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, 
s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v36, v34, v32 +; VI-NEXT: v_add_u16_e32 v33, 3, v2 +; VI-NEXT: v_add_u16_e32 v34, 3, v1 +; VI-NEXT: v_add_u16_sdwa v32, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; VI-NEXT: v_or_b32_e32 v2, v33, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_u16_sdwa v56, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v1, v34, v1 +; VI-NEXT: v_add_u16_e32 v33, 3, v4 +; VI-NEXT: v_add_u16_e32 v34, 3, v3 +; VI-NEXT: v_add_u16_sdwa v32, v3, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; VI-NEXT: v_or_b32_e32 v4, v33, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; VI-NEXT: v_add_u16_sdwa v47, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v3, v34, v3 +; VI-NEXT: v_add_u16_e32 v33, 3, v6 +; VI-NEXT: v_add_u16_e32 v34, 3, v5 +; VI-NEXT: v_add_u16_sdwa v32, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; VI-NEXT: v_or_b32_e32 v6, v33, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v5, v34, v5 +; VI-NEXT: v_add_u16_sdwa v34, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v38, 3, v8 +; VI-NEXT: v_add_u16_e32 v33, 3, v7 +; VI-NEXT: v_add_u16_sdwa v32, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; VI-NEXT: v_or_b32_e32 v8, v38, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; VI-NEXT: v_add_u16_sdwa v59, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v7, v33, v7 +; VI-NEXT: v_add_u16_e32 v33, 3, v10 +; VI-NEXT: v_add_u16_e32 v38, 3, v9 +; VI-NEXT: v_add_u16_sdwa v32, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; VI-NEXT: v_or_b32_e32 v10, v33, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; VI-NEXT: v_add_u16_sdwa v63, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v9, v38, v9 +; VI-NEXT: v_add_u16_e32 v33, 3, v12 +; VI-NEXT: v_add_u16_e32 v38, 3, v11 +; VI-NEXT: v_add_u16_sdwa v32, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v12, v33, v11 +; VI-NEXT: v_lshlrev_b32_e32 
v11, 16, v32 +; VI-NEXT: v_add_u16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v11, v38, v11 +; VI-NEXT: v_add_u16_e32 v38, 3, v14 +; VI-NEXT: v_add_u16_e32 v49, 3, v13 +; VI-NEXT: v_add_u16_sdwa v32, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; VI-NEXT: v_add_u16_sdwa v60, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v38, v13 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; VI-NEXT: v_add_u16_sdwa v31, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_add_u16_e32 v32, 3, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v16, v16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v15, v32, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16] +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v13, v49, v13 +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v14 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14] +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 +; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 
4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 
4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v60, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v33, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v63, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v59, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v34, 8, 8 +; VI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v47, 8, 8 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v56, 8, 8 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v57, 8, 8 +; VI-NEXT: v_mov_b32_e32 v46, v35 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v52, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v46, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v39, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v49, v53 +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v52, v51 +; VI-NEXT: v_bfe_u32 v31, v51, 8, 8 +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17 +; VI-NEXT: v_bfe_u32 v35, v58, 8, 8 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v39, v61, 8, 8 +; VI-NEXT: v_bfe_u32 v58, v48, 8, 8 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_bfe_u32 v61, v53, 8, 8 
+; VI-NEXT: .LBB98_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; 
VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 
4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt 
vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31 +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; 
VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41 +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 
v2, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 
; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i16_to_v128i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill 
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; 
GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; 
GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte 
Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed 
$vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB98_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(45) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(46) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; 
GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, 
s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX9-NEXT: .LBB98_2: ; %Flow +; 
GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB98_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 
0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; 
GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 
4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte 
Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte 
Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX9-NEXT: .LBB98_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49 +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte 
Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 
4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: 
v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64i16_to_v128i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v130, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: 
v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GFX11-TRUE16-NEXT: .LBB98_2: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_4 +; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GFX11-TRUE16-NEXT: .LBB98_4: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v1.h, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v69 +; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, 
v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v51, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v68 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v51, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v51, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v12.h, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v65, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, 
v7.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16 +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b16 
v13.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15 +; 
GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25 +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v71.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v67 +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24 +; GFX11-TRUE16-NEXT: s_clause 0x5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; 
GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr79 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr90 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr89 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr92 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr93 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr88 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr104 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr95 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr105 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr94 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr107 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr109 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr111 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr106 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr122 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr121 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr120 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr125 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr124 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr126 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr127 -; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v134, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v118, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v131, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v116, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v129, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, v35, 3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v126, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v127, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v125, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v124, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, v33, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v31, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v98, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v116, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v30, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v98, 0x300, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v112, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v99, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v103, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v123, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v120, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v122, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v29, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v121, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v99, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v111, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v28, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v81, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v81, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v101, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v27, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v86, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v97, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v83, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v106, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v110, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v109, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v83, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v108, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v26, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v107, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v86, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v85, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v25, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v67, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v80, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v68, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v69, 3 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v105, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v94, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v104, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v24, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v95, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v68, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v23, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v64, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v64, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v42, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v22, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v65, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v183, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v180, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v88, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v91, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v65, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v90, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v69, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v179, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v165, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v80, 0x300, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v177, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v163, 3 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v166, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v79, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v76, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v78, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v85, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v75, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v145, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v97, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v151, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v148, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v178, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v164, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v61, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v74, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v101, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v103, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v162, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v146, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v112, 0x300, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) -; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v149, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v144, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v147, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v62, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v47, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v118, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v58, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v119, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v119, 0x300, v2 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v135, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] +; GFX11-FAKE16-NEXT: .LBB98_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: 
v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, 
v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 +; GFX11-FAKE16-NEXT: .LBB98_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v74 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v64 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v117, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v47 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v117, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v46, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v45, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v129, 0x300, v0 -; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v114, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v96, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v181 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v64, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, 
v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v151 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v119 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v117 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v64, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v97 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v82 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v70 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v39, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v44 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v41 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v180 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 
v20, 8, v160 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v148 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v146 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v114 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v113 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v99 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v102, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v87, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v55, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v43, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v182, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v41, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v40, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v55, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v181, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v37, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v54, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v52, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v53, 3 -; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v50, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v150, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v176, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v167, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v50, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v161, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v160, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v52, 0x300, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v51, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v49, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v49, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v48, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v36, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v132, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v115, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v34, 0x300, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v100, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v113, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v39, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, v38, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, v32, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v31, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v33, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v73, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: 
s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword 
v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: s_mov_b32 s6, s18 +; SI-NEXT: ; implicit-def: 
$vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s62, v30 +; SI-NEXT: v_readfirstlane_b32 s63, v29 +; SI-NEXT: v_readfirstlane_b32 s59, v26 +; SI-NEXT: v_readfirstlane_b32 s60, v25 +; SI-NEXT: v_readfirstlane_b32 s98, v22 +; SI-NEXT: v_readfirstlane_b32 s61, v21 +; SI-NEXT: v_readfirstlane_b32 s99, v18 +; SI-NEXT: v_readfirstlane_b32 s58, v17 +; SI-NEXT: v_readfirstlane_b32 s96, v14 +; SI-NEXT: v_readfirstlane_b32 s97, v13 +; SI-NEXT: v_readfirstlane_b32 s86, v10 +; SI-NEXT: v_readfirstlane_b32 s87, v9 +; SI-NEXT: v_readfirstlane_b32 s84, v6 +; SI-NEXT: v_readfirstlane_b32 s85, v5 +; SI-NEXT: v_readfirstlane_b32 s81, v2 +; SI-NEXT: v_readfirstlane_b32 s82, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s88, v36 +; SI-NEXT: v_readfirstlane_b32 s18, v37 +; SI-NEXT: v_readfirstlane_b32 s78, v38 +; SI-NEXT: v_readfirstlane_b32 s79, v39 +; SI-NEXT: v_readfirstlane_b32 s76, v48 +; SI-NEXT: v_readfirstlane_b32 s77, v49 +; SI-NEXT: v_readfirstlane_b32 s74, v50 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s75, v51 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s72, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s73, v53 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 
v22, 16, v43 +; SI-NEXT: v_writelane_b32 v62, s6, 0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB99_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s56, s4, s5 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s57, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s56 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: v_alignbit_b32 v8, s57, v1, 24 +; SI-NEXT: v_alignbit_b32 v50, s57, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, s57, v1, 8 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s46 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, s47, v1, 24 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, s47, v1, 16 +; SI-NEXT: v_alignbit_b32 v51, s47, v1, 8 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, s45, v1, 24 +; SI-NEXT: s_or_b32 s42, s4, s5 +; 
SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, s45, v1, 16 +; SI-NEXT: v_alignbit_b32 v49, s45, v1, 8 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s42 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, s43, v1, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, s43, v1, 16 +; SI-NEXT: v_alignbit_b32 v48, s43, v1, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: s_and_b32 s4, s85, 0xffff +; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: v_or_b32_e32 v16, v1, v2 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: v_alignbit_b32 v1, s41, v16, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s41, v16, 16 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_lshl_b32 s5, s86, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s41, v16, 8 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: s_or_b32 s15, s4, s5 +; SI-NEXT: s_and_b32 s4, s58, 0xffff +; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: v_or_b32_e32 v14, v1, v4 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s61, 0xffff +; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: v_alignbit_b32 v1, s40, v14, 24 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s60, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s40, v14, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s63, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s40, v14, 8 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s73, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s75, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: v_or_b32_e32 v12, v1, v5 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s77, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: v_alignbit_b32 v1, s15, v12, 24 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s79, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s15, v12, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s15, v12, 8 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_lshr_b32 s4, s11, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_writelane_b32 v62, s4, 1 +; SI-NEXT: s_lshr_b32 s4, s10, 8 +; SI-NEXT: v_or_b32_e32 v10, v1, v6 +; SI-NEXT: v_writelane_b32 v62, s4, 3 +; SI-NEXT: s_lshr_b32 s4, s9, 8 +; SI-NEXT: v_alignbit_b32 v1, s14, v10, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 6 +; 
SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s14, v10, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 9 +; SI-NEXT: s_lshr_b32 s4, s7, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, s14, v10, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: s_lshr_b32 s4, s6, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_writelane_b32 v62, s4, 15 +; SI-NEXT: s_and_b32 s4, s72, 0xffff +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v1, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: s_and_b32 s4, s74, 0xffff +; SI-NEXT: v_or_b32_e32 v5, v1, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_writelane_b32 v62, s4, 5 +; SI-NEXT: s_and_b32 s4, s76, 0xffff +; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: v_or_b32_e32 v13, v1, v17 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_writelane_b32 v62, s4, 8 +; SI-NEXT: s_and_b32 s4, s78, 0xffff +; SI-NEXT: v_mov_b32_e32 v26, v9 +; SI-NEXT: v_or_b32_e32 v9, v1, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_writelane_b32 v62, s4, 11 +; SI-NEXT: s_and_b32 s4, s88, 0xffff +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_or_b32_e32 v6, v1, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: s_bfe_u32 s4, s74, 0x80008 +; SI-NEXT: v_or_b32_e32 v4, v1, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_writelane_b32 v62, s4, 4 +; SI-NEXT: s_bfe_u32 s4, s76, 0x80008 +; SI-NEXT: v_or_b32_e32 v2, v1, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, 
v35 +; SI-NEXT: v_writelane_b32 v62, s4, 7 +; SI-NEXT: s_bfe_u32 s4, s78, 0x80008 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: s_bfe_u32 s4, s88, 0x80008 +; SI-NEXT: v_mov_b32_e32 v29, v17 +; SI-NEXT: v_mov_b32_e32 v30, v18 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v21 +; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: s_lshr_b32 s68, s57, 8 +; SI-NEXT: s_lshr_b32 s65, s47, 8 +; SI-NEXT: s_lshr_b32 s54, s45, 8 +; SI-NEXT: s_lshr_b32 s51, s43, 8 +; SI-NEXT: s_lshr_b32 s48, s41, 8 +; SI-NEXT: s_lshr_b32 s37, s40, 8 +; SI-NEXT: s_lshr_b32 s34, s15, 8 +; SI-NEXT: s_lshr_b32 s95, s14, 8 +; SI-NEXT: s_lshr_b32 s92, s13, 8 +; SI-NEXT: s_lshr_b32 s89, s12, 8 +; SI-NEXT: s_and_b32 s71, s19, 0xffff +; SI-NEXT: s_and_b32 s69, s23, 0xffff +; SI-NEXT: s_and_b32 s66, s27, 0xffff +; SI-NEXT: s_and_b32 s55, s81, 0xffff +; SI-NEXT: s_and_b32 s52, s84, 0xffff +; SI-NEXT: s_and_b32 s49, s86, 0xffff +; SI-NEXT: s_and_b32 s38, s96, 0xffff +; SI-NEXT: s_and_b32 s35, s99, 0xffff +; SI-NEXT: s_and_b32 s30, s98, 0xffff +; SI-NEXT: s_and_b32 s93, s59, 0xffff +; SI-NEXT: s_and_b32 s90, s62, 0xffff +; SI-NEXT: s_bfe_u32 s83, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s80, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s70, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s67, s81, 0x80008 +; SI-NEXT: s_bfe_u32 s64, s84, 0x80008 +; SI-NEXT: s_bfe_u32 s53, s86, 0x80008 +; SI-NEXT: s_bfe_u32 s50, s96, 0x80008 +; SI-NEXT: s_bfe_u32 s39, s99, 0x80008 +; SI-NEXT: s_bfe_u32 s36, s98, 0x80008 +; SI-NEXT: s_bfe_u32 s31, s59, 0x80008 +; SI-NEXT: s_bfe_u32 s94, s62, 0x80008 +; SI-NEXT: s_bfe_u32 s91, s72, 0x80008 +; SI-NEXT: v_writelane_b32 v62, s4, 13 +; SI-NEXT: v_alignbit_b32 v45, s13, v8, 24 +; SI-NEXT: v_alignbit_b32 v47, s13, v8, 16 +; SI-NEXT: v_alignbit_b32 v57, s13, v8, 8 +; SI-NEXT: v_alignbit_b32 v41, s12, v5, 24 +; SI-NEXT: v_alignbit_b32 v43, s12, v5, 16 +; SI-NEXT: v_alignbit_b32 v44, s12, v5, 8 +; SI-NEXT: v_alignbit_b32 
v21, s11, v13, 24 +; SI-NEXT: v_alignbit_b32 v22, s11, v13, 16 +; SI-NEXT: v_alignbit_b32 v24, s11, v13, 8 +; SI-NEXT: v_alignbit_b32 v17, s10, v9, 24 +; SI-NEXT: v_alignbit_b32 v18, s10, v9, 16 +; SI-NEXT: v_alignbit_b32 v20, s10, v9, 8 +; SI-NEXT: v_alignbit_b32 v59, s9, v6, 24 +; SI-NEXT: v_alignbit_b32 v60, s9, v6, 16 +; SI-NEXT: v_alignbit_b32 v61, s9, v6, 8 +; SI-NEXT: v_alignbit_b32 v46, s8, v4, 24 +; SI-NEXT: v_alignbit_b32 v56, s8, v4, 16 +; SI-NEXT: v_alignbit_b32 v58, s8, v4, 8 +; SI-NEXT: v_alignbit_b32 v55, s7, v2, 24 +; SI-NEXT: v_alignbit_b32 v40, s7, v2, 16 +; SI-NEXT: v_alignbit_b32 v42, s7, v2, 8 +; SI-NEXT: v_alignbit_b32 v52, s6, v1, 24 +; SI-NEXT: v_alignbit_b32 v53, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, s6, v1, 8 +; SI-NEXT: s_cbranch_execnz .LBB99_3 +; SI-NEXT: .LBB99_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s79, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s77, s77, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s77, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s75, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s73, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x30000 
+; SI-NEXT: s_and_b32 s4, s63, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_add_i32 s11, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s60, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s61, 0xffff +; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_add_i32 s13, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s58, 0xffff +; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s97, s97, 3 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s87, s87, 3 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_lshl_b32 s5, s86, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s85, s85, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s85, 0xffff +; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s82, s82, 3 +; SI-NEXT: s_add_i32 s42, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s43, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s44, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s45, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; 
SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s47, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s56, s4, 0x30000 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s57, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v21, s56 +; SI-NEXT: v_alignbit_b32 v22, s57, v21, 24 +; SI-NEXT: v_alignbit_b32 v50, s57, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s57, v21, 8 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v21, s46 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s47, v21, 24 +; SI-NEXT: s_lshr_b32 s4, s11, 8 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s47, v21, 16 +; SI-NEXT: v_alignbit_b32 v51, s47, v21, 8 +; SI-NEXT: v_mov_b32_e32 v21, s44 +; SI-NEXT: v_writelane_b32 v62, s4, 1 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s45, v21, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: s_lshr_b32 s4, s10, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: 
s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s45, v21, 16 +; SI-NEXT: v_alignbit_b32 v49, s45, v21, 8 +; SI-NEXT: v_mov_b32_e32 v21, s42 +; SI-NEXT: v_writelane_b32 v62, s4, 3 +; SI-NEXT: s_lshr_b32 s4, s9, 24 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v3 +; SI-NEXT: v_mov_b32_e32 v3, s41 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s43, v21, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 4 +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s43, v21, 16 +; SI-NEXT: v_alignbit_b32 v48, s43, v21, 8 +; SI-NEXT: v_alignbit_b32 v21, v3, v16, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 5 +; SI-NEXT: s_lshr_b32 s4, s9, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v7 +; SI-NEXT: v_mov_b32_e32 v7, s40 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v3, v16, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v16, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 6 +; SI-NEXT: s_lshr_b32 s4, s8, 24 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v7, v14, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 7 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_add_i32_e32 
v9, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v27 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v7, v14, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 8 +; SI-NEXT: s_lshr_b32 s4, s8, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v11 +; SI-NEXT: v_mov_b32_e32 v11, s15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v7, v14, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 9 +; SI-NEXT: s_lshr_b32 s4, s7, 24 +; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v11, v12, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19 +; SI-NEXT: v_or_b32_e32 v10, v25, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v11, v12, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 11 +; SI-NEXT: s_lshr_b32 s4, s7, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 +; SI-NEXT: v_mov_b32_e32 v15, s14 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_alignbit_b32 v3, v11, v12, 8 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: s_lshr_b32 s4, s6, 24 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v15, v10, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 13 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_mov_b32_e32 v35, s6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_mov_b32_e32 v34, s7 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_mov_b32_e32 v33, s8 +; SI-NEXT: v_mov_b32_e32 v32, s9 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_mov_b32_e32 v17, s11 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_mov_b32_e32 v19, s13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v15, v10, 16 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: s_lshr_b32 s4, s6, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v15, v10, 8 +; SI-NEXT: v_alignbit_b32 v45, v19, v8, 24 +; SI-NEXT: v_alignbit_b32 v47, v19, v8, 16 +; SI-NEXT: v_alignbit_b32 v57, v19, v8, 8 +; SI-NEXT: v_alignbit_b32 v41, v18, v5, 24 +; SI-NEXT: v_alignbit_b32 v43, v18, v5, 16 +; SI-NEXT: v_alignbit_b32 v44, v18, v5, 8 +; SI-NEXT: v_alignbit_b32 v21, v17, v13, 24 +; SI-NEXT: v_alignbit_b32 v22, v17, v13, 16 +; SI-NEXT: v_alignbit_b32 v24, v17, v13, 8 
+; SI-NEXT: v_alignbit_b32 v17, v20, v9, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v9, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v9, 8 +; SI-NEXT: v_alignbit_b32 v59, v32, v6, 24 +; SI-NEXT: v_alignbit_b32 v60, v32, v6, 16 +; SI-NEXT: v_alignbit_b32 v61, v32, v6, 8 +; SI-NEXT: v_alignbit_b32 v46, v33, v4, 24 +; SI-NEXT: v_alignbit_b32 v56, v33, v4, 16 +; SI-NEXT: v_alignbit_b32 v58, v33, v4, 8 +; SI-NEXT: v_alignbit_b32 v55, v34, v2, 24 +; SI-NEXT: v_alignbit_b32 v40, v34, v2, 16 +; SI-NEXT: v_alignbit_b32 v42, v34, v2, 8 +; SI-NEXT: v_alignbit_b32 v52, v35, v1, 24 +; SI-NEXT: v_alignbit_b32 v53, v35, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, v35, v1, 8 +; SI-NEXT: s_lshr_b32 s83, s57, 24 +; SI-NEXT: s_lshr_b32 s71, s57, 16 +; SI-NEXT: s_lshr_b32 s68, s57, 8 +; SI-NEXT: s_lshr_b32 s80, s47, 24 +; SI-NEXT: s_lshr_b32 s69, s47, 16 +; SI-NEXT: s_lshr_b32 s65, s47, 8 +; SI-NEXT: s_lshr_b32 s70, s45, 24 +; SI-NEXT: s_lshr_b32 s66, s45, 16 +; SI-NEXT: s_lshr_b32 s54, s45, 8 +; SI-NEXT: s_lshr_b32 s67, s43, 24 +; SI-NEXT: s_lshr_b32 s55, s43, 16 +; SI-NEXT: s_lshr_b32 s51, s43, 8 +; SI-NEXT: s_lshr_b32 s64, s41, 24 +; SI-NEXT: s_lshr_b32 s52, s41, 16 +; SI-NEXT: s_lshr_b32 s48, s41, 8 +; SI-NEXT: s_lshr_b32 s53, s40, 24 +; SI-NEXT: s_lshr_b32 s49, s40, 16 +; SI-NEXT: s_lshr_b32 s37, s40, 8 +; SI-NEXT: s_lshr_b32 s50, s15, 24 +; SI-NEXT: s_lshr_b32 s38, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s15, 8 +; SI-NEXT: s_lshr_b32 s39, s14, 24 +; SI-NEXT: s_lshr_b32 s35, s14, 16 +; SI-NEXT: s_lshr_b32 s95, s14, 8 +; SI-NEXT: s_lshr_b32 s36, s13, 24 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 8 +; SI-NEXT: s_lshr_b32 s31, s12, 24 +; SI-NEXT: s_lshr_b32 s93, s12, 16 +; SI-NEXT: s_lshr_b32 s89, s12, 8 +; SI-NEXT: s_lshr_b32 s94, s11, 24 +; SI-NEXT: s_lshr_b32 s90, s11, 16 +; SI-NEXT: s_lshr_b32 s91, s10, 24 +; SI-NEXT: v_writelane_b32 v62, s4, 15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_lshl_b32 s5, s68, 8 +; SI-NEXT: s_lshl_b32 s16, s83, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s71, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 
+; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s69, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s80, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: s_lshl_b32 s16, s70, 24 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s44, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v49 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s51, 8 +; SI-NEXT: s_lshl_b32 s16, s67, 24 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s42, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s55, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: s_lshl_b32 s16, s64, 24 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_or_b32 
s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s52, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_lshl_b32 s16, s53, 24 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s49, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; 
SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s38, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s15, s50, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s15, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s95, 8 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword 
v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s35, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s39, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v57 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v47 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s30, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v45 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s13, s36, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s13, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; 
SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v44 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v43 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s93, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v41 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s31, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: v_readlane_b32 s5, v62, 1 +; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v24 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s90, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v21 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s11, s94, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s11, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: 
v_readlane_b32 s5, v62, 3 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v20 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 2 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s10, s91, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s10, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 6 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v61 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v60 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v59 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: v_or_b32_e32 
v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 9 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v58 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 8 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v56 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_readlane_b32 s8, v62, 7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v46 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 12 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v42 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 11 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v40 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_readlane_b32 s7, v62, 10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v55 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s7, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 
+; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 15 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 14 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_readlane_b32 s6, v62, 13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB99_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v37, v21 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v30, v18 
+; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v29, v17 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v26, v9 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; kill: killed $vcc_lo +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; 
implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; kill: killed $vcc_lo +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; kill: killed $vcc_lo +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: 
; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB99_2 +; +; VI-LABEL: bitcast_v64i16_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s34, 2 +; VI-NEXT: v_writelane_b32 v20, s35, 3 +; VI-NEXT: v_writelane_b32 v20, s36, 4 +; VI-NEXT: v_writelane_b32 v20, s37, 5 +; VI-NEXT: v_writelane_b32 v20, s38, 6 +; VI-NEXT: v_writelane_b32 v20, s39, 7 +; VI-NEXT: v_writelane_b32 v20, s48, 8 +; VI-NEXT: v_writelane_b32 v20, s49, 9 +; VI-NEXT: v_writelane_b32 v20, 
s50, 10 +; VI-NEXT: v_writelane_b32 v20, s51, 11 +; VI-NEXT: v_writelane_b32 v20, s52, 12 +; VI-NEXT: v_writelane_b32 v20, s53, 13 +; VI-NEXT: v_writelane_b32 v20, s54, 14 +; VI-NEXT: v_writelane_b32 v20, s55, 15 +; VI-NEXT: v_writelane_b32 v20, s64, 16 +; VI-NEXT: v_writelane_b32 v20, s65, 17 +; VI-NEXT: v_writelane_b32 v20, s66, 18 +; VI-NEXT: v_writelane_b32 v20, s67, 19 +; VI-NEXT: v_writelane_b32 v20, s68, 20 +; VI-NEXT: v_writelane_b32 v20, s69, 21 +; VI-NEXT: v_writelane_b32 v20, s70, 22 +; VI-NEXT: v_writelane_b32 v20, s71, 23 +; VI-NEXT: v_writelane_b32 v20, s80, 24 +; VI-NEXT: v_writelane_b32 v20, s81, 25 +; VI-NEXT: v_writelane_b32 v20, s82, 26 +; VI-NEXT: v_writelane_b32 v20, s83, 27 +; VI-NEXT: v_writelane_b32 v20, s84, 28 +; VI-NEXT: v_writelane_b32 v20, s85, 29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v20, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s43, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 +; VI-NEXT: v_readfirstlane_b32 s4, v17 +; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s45, v2 +; VI-NEXT: v_writelane_b32 v20, s87, 31 +; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; VI-NEXT: s_cbranch_scc0 .LBB99_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 0 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; 
VI-NEXT: v_writelane_b32 v21, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s24, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s24, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s23, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s22, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s22, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: 
v_writelane_b32 v21, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 35 +; VI-NEXT: s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: v_writelane_b32 
v21, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: s_lshr_b32 s80, s21, 16 +; VI-NEXT: s_lshr_b32 s82, s21, 8 +; VI-NEXT: s_lshr_b32 s84, s20, 16 +; VI-NEXT: s_lshr_b32 s86, s20, 8 +; VI-NEXT: s_lshr_b32 s51, s19, 24 +; VI-NEXT: s_lshr_b32 s53, s19, 16 +; VI-NEXT: s_lshr_b32 s54, s19, 8 +; VI-NEXT: s_lshr_b32 s65, s18, 16 +; VI-NEXT: s_lshr_b32 s66, s18, 8 +; VI-NEXT: s_lshr_b32 s67, s17, 24 +; VI-NEXT: s_lshr_b32 s68, s17, 16 +; VI-NEXT: s_lshr_b32 s69, s17, 8 +; VI-NEXT: s_lshr_b32 s70, s16, 16 +; VI-NEXT: s_lshr_b32 s71, s16, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 57 +; VI-NEXT: s_lshr_b32 s81, s41, 8 +; VI-NEXT: s_lshr_b32 s83, s40, 16 +; VI-NEXT: s_lshr_b32 s85, s40, 8 +; VI-NEXT: s_lshr_b32 s87, s43, 24 +; VI-NEXT: s_lshr_b32 s50, s43, 16 +; VI-NEXT: s_lshr_b32 s52, s43, 8 +; VI-NEXT: s_lshr_b32 s55, s42, 16 +; VI-NEXT: s_lshr_b32 s64, s42, 8 +; VI-NEXT: s_lshr_b64 s[76:77], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[42:43], 24 +; VI-NEXT: s_cbranch_execnz .LBB99_3 +; VI-NEXT: .LBB99_2: ; %cmp.true +; VI-NEXT: s_and_b32 s46, s43, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_and_b32 s43, s43, 0xffff +; VI-NEXT: s_or_b32 s43, s46, s43 +; VI-NEXT: s_and_b32 
s46, s42, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_or_b32 s42, s46, s42 +; VI-NEXT: s_and_b32 s46, s41, 0xffff0000 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_and_b32 s41, s41, 0xffff +; VI-NEXT: s_or_b32 s41, s46, s41 +; VI-NEXT: s_and_b32 s46, s40, 0xffff0000 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_and_b32 s40, s40, 0xffff +; VI-NEXT: s_or_b32 s40, s46, s40 +; VI-NEXT: s_and_b32 s46, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_or_b32 s15, s46, s15 +; VI-NEXT: s_and_b32 s46, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_or_b32 s14, s46, s14 +; VI-NEXT: s_and_b32 s46, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_or_b32 s13, s46, s13 +; VI-NEXT: s_and_b32 s46, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_or_b32 s12, s46, s12 +; VI-NEXT: s_and_b32 s46, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_or_b32 s11, s46, s11 +; VI-NEXT: s_and_b32 s46, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_or_b32 s10, s46, s10 +; VI-NEXT: s_and_b32 s46, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_or_b32 s9, s46, s9 +; VI-NEXT: s_and_b32 s46, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_or_b32 s8, s46, s8 +; VI-NEXT: s_and_b32 s46, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s7, s46, s7 +; VI-NEXT: s_and_b32 s46, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s6, s46, s6 +; VI-NEXT: s_and_b32 s46, s5, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s5, 3 +; VI-NEXT: 
s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s5, s46, s5 +; VI-NEXT: s_and_b32 s46, s4, 0xffff0000 +; VI-NEXT: s_add_i32 s4, s4, 3 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s4, s46, s4 +; VI-NEXT: s_and_b32 s46, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_or_b32 s17, s46, s17 +; VI-NEXT: s_and_b32 s46, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_or_b32 s16, s46, s16 +; VI-NEXT: s_and_b32 s46, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_or_b32 s19, s46, s19 +; VI-NEXT: s_and_b32 s46, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_or_b32 s18, s46, s18 +; VI-NEXT: s_and_b32 s46, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_or_b32 s21, s46, s21 +; VI-NEXT: s_and_b32 s46, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_or_b32 s20, s46, s20 +; VI-NEXT: s_and_b32 s46, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_or_b32 s23, s46, s23 +; VI-NEXT: s_and_b32 s46, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_or_b32 s22, s46, s22 +; VI-NEXT: s_and_b32 s46, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_or_b32 s25, s46, s25 +; VI-NEXT: s_and_b32 s46, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_or_b32 s24, s46, s24 +; VI-NEXT: s_and_b32 s46, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_or_b32 s27, s46, s27 +; VI-NEXT: s_and_b32 s46, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_or_b32 
s26, s46, s26 +; VI-NEXT: s_and_b32 s46, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_or_b32 s29, s46, s29 +; VI-NEXT: s_and_b32 s46, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_or_b32 s28, s46, s28 +; VI-NEXT: s_and_b32 s46, s45, 0xffff0000 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_and_b32 s45, s45, 0xffff +; VI-NEXT: s_or_b32 s45, s46, s45 +; VI-NEXT: s_and_b32 s46, s44, 0xffff0000 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_add_i32 s45, s45, 0x30000 +; VI-NEXT: s_and_b32 s44, s44, 0xffff +; VI-NEXT: s_or_b32 s44, s46, s44 +; VI-NEXT: s_lshr_b32 s46, s45, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 0 +; VI-NEXT: s_lshr_b32 s46, s45, 16 +; VI-NEXT: s_add_i32 s44, s44, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 1 +; VI-NEXT: s_lshr_b32 s46, s45, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 2 +; VI-NEXT: s_lshr_b32 s46, s44, 16 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 3 +; VI-NEXT: s_lshr_b32 s46, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 4 +; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 5 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 6 +; VI-NEXT: s_lshr_b32 s46, s29, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 7 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: s_lshr_b32 s46, s28, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 9 +; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 11 +; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 13 +; VI-NEXT: s_lshr_b32 s46, s26, 8 +; 
VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: s_lshr_b32 s46, s25, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 15 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 17 +; VI-NEXT: s_lshr_b32 s46, s24, 16 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 18 +; VI-NEXT: s_lshr_b32 s46, s24, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 19 +; VI-NEXT: s_lshr_b32 s46, s23, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 20 +; VI-NEXT: s_lshr_b32 s46, s23, 16 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 21 +; VI-NEXT: s_lshr_b32 s46, s23, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 22 +; VI-NEXT: s_lshr_b32 s46, s22, 16 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 23 +; VI-NEXT: s_lshr_b32 s46, s22, 8 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 24 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 25 +; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 26 +; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 27 +; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 28 +; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 29 +; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 30 +; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 31 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 32 +; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 33 +; VI-NEXT: s_lshr_b32 s46, s6, 16 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 34 +; VI-NEXT: s_lshr_b32 s46, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 35 +; VI-NEXT: 
s_lshr_b32 s46, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 36 +; VI-NEXT: s_lshr_b32 s46, s9, 16 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 37 +; VI-NEXT: s_lshr_b32 s46, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 38 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 39 +; VI-NEXT: s_lshr_b32 s46, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 40 +; VI-NEXT: s_lshr_b32 s46, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 41 +; VI-NEXT: s_lshr_b32 s46, s11, 16 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 42 +; VI-NEXT: s_lshr_b32 s46, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 43 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 44 +; VI-NEXT: s_lshr_b32 s46, s10, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 45 +; VI-NEXT: s_lshr_b32 s46, s13, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 46 +; VI-NEXT: s_lshr_b32 s46, s13, 16 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 47 +; VI-NEXT: s_lshr_b32 s46, s13, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 48 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 49 +; VI-NEXT: s_lshr_b32 s46, s12, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 50 +; VI-NEXT: s_lshr_b32 s46, s15, 24 +; VI-NEXT: v_writelane_b32 v21, s46, 51 +; VI-NEXT: s_lshr_b32 s46, s15, 16 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 52 +; VI-NEXT: s_lshr_b32 s46, s15, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 53 +; VI-NEXT: s_lshr_b32 s46, s14, 16 +; VI-NEXT: s_add_i32 s41, s41, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 54 +; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 55 +; VI-NEXT: s_lshr_b32 s46, s41, 24 +; VI-NEXT: s_add_i32 s43, s43, 0x30000 +; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_add_i32 s40, s40, 0x30000 +; 
VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 56 +; VI-NEXT: s_lshr_b32 s46, s41, 16 +; VI-NEXT: s_lshr_b32 s80, s21, 16 +; VI-NEXT: s_lshr_b32 s82, s21, 8 +; VI-NEXT: s_lshr_b32 s84, s20, 16 +; VI-NEXT: s_lshr_b32 s86, s20, 8 +; VI-NEXT: s_lshr_b32 s51, s19, 24 +; VI-NEXT: s_lshr_b32 s53, s19, 16 +; VI-NEXT: s_lshr_b32 s54, s19, 8 +; VI-NEXT: s_lshr_b32 s65, s18, 16 +; VI-NEXT: s_lshr_b32 s66, s18, 8 +; VI-NEXT: s_lshr_b32 s67, s17, 24 +; VI-NEXT: s_lshr_b32 s68, s17, 16 +; VI-NEXT: s_lshr_b32 s69, s17, 8 +; VI-NEXT: s_lshr_b32 s70, s16, 16 +; VI-NEXT: s_lshr_b32 s71, s16, 8 +; VI-NEXT: v_writelane_b32 v21, s46, 57 +; VI-NEXT: s_lshr_b32 s81, s41, 8 +; VI-NEXT: s_lshr_b32 s83, s40, 16 +; VI-NEXT: s_lshr_b32 s85, s40, 8 +; VI-NEXT: s_lshr_b32 s87, s43, 24 +; VI-NEXT: s_lshr_b32 s50, s43, 16 +; VI-NEXT: s_lshr_b32 s52, s43, 8 +; VI-NEXT: s_lshr_b32 s55, s42, 16 +; VI-NEXT: s_lshr_b32 s64, s42, 8 +; VI-NEXT: s_lshr_b64 s[76:77], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[42:43], 24 +; VI-NEXT: .LBB99_3: ; %end +; VI-NEXT: s_lshl_b32 s47, s71, 8 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_or_b32 s16, s16, s47 +; VI-NEXT: s_lshl_b32 s47, s48, 8 
+; VI-NEXT: s_and_b32 s57, s70, 0xff +; VI-NEXT: s_or_b32 s47, s57, s47 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s47, s47, 16 +; VI-NEXT: s_or_b32 s16, s16, s47 +; VI-NEXT: v_mov_b32_e32 v1, s16 +; VI-NEXT: s_and_b32 s16, s17, 0xff +; VI-NEXT: s_lshl_b32 s17, s69, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s68, 0xff +; VI-NEXT: s_lshl_b32 s47, s67, 8 +; VI-NEXT: s_or_b32 s17, s17, s47 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_lshl_b32 s16, s66, 8 +; VI-NEXT: s_and_b32 s17, s18, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s38, 8 +; VI-NEXT: s_and_b32 s18, s65, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v3, s16 +; VI-NEXT: s_and_b32 s16, s19, 0xff +; VI-NEXT: s_lshl_b32 s17, s54, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s53, 0xff +; VI-NEXT: s_lshl_b32 s18, s51, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v4, s16 +; VI-NEXT: s_lshl_b32 s16, s86, 8 +; VI-NEXT: s_and_b32 s17, s20, 0xff +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s36, 8 +; VI-NEXT: s_and_b32 s18, s84, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v5, s16 +; VI-NEXT: s_and_b32 s16, s21, 0xff +; VI-NEXT: s_lshl_b32 s17, s82, 8 +; VI-NEXT: v_readlane_b32 s18, v21, 25 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s80, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 
s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v6, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 24 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s22, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 23 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s34, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 22 +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: s_and_b32 s16, s23, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 21 +; VI-NEXT: v_readlane_b32 s18, v21, 20 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v8, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 19 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s24, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 18 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s30, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 17 +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: s_and_b32 s16, s25, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 16 +; VI-NEXT: v_readlane_b32 s18, v21, 15 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v10, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 14 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s26, 0xff +; 
VI-NEXT: v_readlane_b32 s18, v21, 13 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s90, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 12 +; VI-NEXT: v_mov_b32_e32 v11, s16 +; VI-NEXT: s_and_b32 s16, s27, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 11 +; VI-NEXT: v_readlane_b32 s18, v21, 10 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v12, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 9 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s28, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 8 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s88, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 7 +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: s_and_b32 s16, s29, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 6 +; VI-NEXT: v_readlane_b32 s18, v21, 5 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_mov_b32_e32 v14, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 4 +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s17, s44, 0xff +; VI-NEXT: v_readlane_b32 s18, v21, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_lshl_b32 s17, s76, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 2 +; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: v_mov_b32_e32 v15, s16 +; VI-NEXT: s_and_b32 s16, s45, 0xff +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: v_readlane_b32 s17, v21, 1 +; VI-NEXT: v_readlane_b32 s18, v21, 0 +; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s42, 0xff +; VI-NEXT: s_lshl_b32 s17, s64, 8 +; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s55, 0xff +; VI-NEXT: s_lshl_b32 s18, s78, 8 +; VI-NEXT: 
buffer_store_dword v13, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s43, 0xff +; VI-NEXT: s_lshl_b32 s17, s52, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s50, 0xff +; VI-NEXT: s_lshl_b32 s18, s87, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s40, 0xff +; VI-NEXT: s_lshl_b32 s17, s85, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, s83, 0xff +; VI-NEXT: s_lshl_b32 s18, s74, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: s_and_b32 s16, s41, 0xff +; VI-NEXT: s_lshl_b32 s17, s81, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: v_readlane_b32 s17, v21, 57 +; VI-NEXT: v_readlane_b32 s18, v21, 56 +; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_lshl_b32 s17, s17, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x48, v0 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_readlane_b32 s16, 
v21, 55 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: v_readlane_b32 s16, v21, 54 +; VI-NEXT: s_and_b32 s16, s16, 0xff +; VI-NEXT: s_lshl_b32 s17, s72, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x4c, v0 +; VI-NEXT: s_or_b32 s14, s14, s16 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: s_and_b32 s14, s15, 0xff +; VI-NEXT: v_readlane_b32 s15, v21, 53 +; VI-NEXT: s_lshl_b32 s15, s15, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: v_readlane_b32 s15, v21, 52 +; VI-NEXT: v_readlane_b32 s16, v21, 51 +; VI-NEXT: s_and_b32 s15, s15, 0xff +; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_or_b32 s15, s15, s16 +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v0 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_readlane_b32 s14, v21, 50 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: v_readlane_b32 s14, v21, 49 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_lshl_b32 s15, s62, 8 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; VI-NEXT: s_or_b32 s12, s12, s14 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: s_and_b32 s12, s13, 0xff +; VI-NEXT: v_readlane_b32 s13, v21, 48 +; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: v_readlane_b32 s13, v21, 47 +; VI-NEXT: v_readlane_b32 s14, v21, 46 +; VI-NEXT: s_and_b32 s13, s13, 0xff +; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_or_b32 s13, s13, s14 +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; 
VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v0 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s12 +; VI-NEXT: v_readlane_b32 s12, v21, 45 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: v_readlane_b32 s12, v21, 44 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_lshl_b32 s13, s60, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v0 +; VI-NEXT: s_or_b32 s10, s10, s12 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: s_and_b32 s10, s11, 0xff +; VI-NEXT: v_readlane_b32 s11, v21, 43 +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: v_readlane_b32 s11, v21, 42 +; VI-NEXT: v_readlane_b32 s12, v21, 41 +; VI-NEXT: s_and_b32 s11, s11, 0xff +; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_or_b32 s11, s11, s12 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_readlane_b32 s10, v21, 40 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: v_readlane_b32 s10, v21, 39 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_lshl_b32 s11, s58, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v0 +; VI-NEXT: s_or_b32 s8, s8, s10 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_and_b32 s8, s9, 0xff +; VI-NEXT: v_readlane_b32 s9, v21, 38 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: 
v_readlane_b32 s9, v21, 37 +; VI-NEXT: v_readlane_b32 s10, v21, 36 +; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v0 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_readlane_b32 s8, v21, 35 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: v_readlane_b32 s8, v21, 34 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_lshl_b32 s9, s56, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 +; VI-NEXT: s_or_b32 s6, s6, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_and_b32 s6, s7, 0xff +; VI-NEXT: v_readlane_b32 s7, v21, 33 +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_readlane_b32 s7, v21, 32 +; VI-NEXT: v_readlane_b32 s8, v21, 31 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x70, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_readlane_b32 s6, v21, 30 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_readlane_b32 s6, v21, 29 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s7, s46, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: 
s_and_b32 s4, s5, 0xff +; VI-NEXT: v_readlane_b32 s5, v21, 28 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_readlane_b32 s5, v21, 27 +; VI-NEXT: v_readlane_b32 s6, v21, 26 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s87, v20, 31 +; VI-NEXT: v_readlane_b32 s86, v20, 30 +; VI-NEXT: v_readlane_b32 s85, v20, 29 +; VI-NEXT: v_readlane_b32 s84, v20, 28 +; VI-NEXT: v_readlane_b32 s83, v20, 27 +; VI-NEXT: v_readlane_b32 s82, v20, 26 +; VI-NEXT: v_readlane_b32 s81, v20, 25 +; VI-NEXT: v_readlane_b32 s80, v20, 24 +; VI-NEXT: v_readlane_b32 s71, v20, 23 +; VI-NEXT: v_readlane_b32 s70, v20, 22 +; VI-NEXT: v_readlane_b32 s69, v20, 21 +; VI-NEXT: v_readlane_b32 s68, v20, 20 +; VI-NEXT: v_readlane_b32 s67, v20, 19 +; VI-NEXT: v_readlane_b32 s66, v20, 18 +; VI-NEXT: v_readlane_b32 s65, v20, 17 +; VI-NEXT: v_readlane_b32 s64, v20, 16 +; VI-NEXT: v_readlane_b32 s55, v20, 15 +; VI-NEXT: v_readlane_b32 s54, v20, 14 +; VI-NEXT: v_readlane_b32 s53, v20, 13 +; VI-NEXT: v_readlane_b32 s52, v20, 12 +; VI-NEXT: v_readlane_b32 s51, v20, 11 +; VI-NEXT: v_readlane_b32 s50, v20, 10 +; VI-NEXT: v_readlane_b32 s49, v20, 9 +; VI-NEXT: v_readlane_b32 s48, v20, 8 +; VI-NEXT: v_readlane_b32 s39, v20, 7 +; VI-NEXT: v_readlane_b32 s38, v20, 6 +; VI-NEXT: v_readlane_b32 s37, v20, 5 +; VI-NEXT: v_readlane_b32 s36, v20, 4 +; VI-NEXT: v_readlane_b32 s35, v20, 3 +; VI-NEXT: v_readlane_b32 s34, v20, 2 +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], 
s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB99_4: +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; 
VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; 
implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB99_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v128i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_writelane_b32 v63, s64, 16 +; GFX9-NEXT: v_writelane_b32 v63, s65, 17 +; GFX9-NEXT: v_writelane_b32 v63, s66, 18 +; GFX9-NEXT: v_writelane_b32 v63, s67, 19 +; GFX9-NEXT: v_writelane_b32 v63, s68, 20 +; GFX9-NEXT: v_writelane_b32 v63, s69, 21 +; GFX9-NEXT: v_writelane_b32 v63, s70, 22 +; GFX9-NEXT: v_writelane_b32 v63, s71, 23 +; GFX9-NEXT: v_writelane_b32 v63, s80, 24 +; GFX9-NEXT: v_writelane_b32 v63, s81, 25 +; GFX9-NEXT: v_writelane_b32 v63, s82, 26 +; GFX9-NEXT: v_writelane_b32 v63, s83, 27 +; GFX9-NEXT: v_writelane_b32 v63, s84, 28 +; GFX9-NEXT: v_writelane_b32 v63, s85, 29 +; GFX9-NEXT: v_writelane_b32 v63, s86, 30 +; GFX9-NEXT: v_writelane_b32 v63, s87, 31 +; GFX9-NEXT: v_writelane_b32 v63, s96, 32 +; GFX9-NEXT: v_writelane_b32 v63, s97, 33 +; GFX9-NEXT: v_writelane_b32 v63, s98, 34 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_writelane_b32 v63, s99, 35 +; GFX9-NEXT: v_readfirstlane_b32 s44, v3 +; GFX9-NEXT: v_readfirstlane_b32 s45, v4 +; GFX9-NEXT: v_readfirstlane_b32 s42, v5 +; GFX9-NEXT: v_readfirstlane_b32 s43, v6 +; GFX9-NEXT: v_readfirstlane_b32 s40, v7 +; GFX9-NEXT: v_readfirstlane_b32 s41, v8 +; GFX9-NEXT: 
v_readfirstlane_b32 s14, v9 +; GFX9-NEXT: v_readfirstlane_b32 s15, v10 +; GFX9-NEXT: v_readfirstlane_b32 s12, v11 +; GFX9-NEXT: v_readfirstlane_b32 s13, v12 +; GFX9-NEXT: v_readfirstlane_b32 s10, v13 +; GFX9-NEXT: v_readfirstlane_b32 s11, v14 +; GFX9-NEXT: v_readfirstlane_b32 s8, v15 +; GFX9-NEXT: v_readfirstlane_b32 s9, v16 +; GFX9-NEXT: v_readfirstlane_b32 s6, v17 +; GFX9-NEXT: v_readfirstlane_b32 s7, v18 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX9-NEXT: s_cbranch_scc0 .LBB99_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 49 +; GFX9-NEXT: s_lshr_b32 s46, s5, 16 
+; GFX9-NEXT: v_writelane_b32 v62, s46, 48 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 47 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 46 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 45 +; GFX9-NEXT: s_lshr_b32 s46, s29, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 44 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 43 +; GFX9-NEXT: s_lshr_b32 s46, s29, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 42 +; GFX9-NEXT: s_lshr_b32 s46, s28, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s28, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s27, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s27, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s27, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s26, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s26, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s25, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s25, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s24, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s24, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s23, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s23, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s23, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s22, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s22, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 24 +; GFX9-NEXT: s_lshr_b32 
s46, s21, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s21, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s20, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s20, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s19, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s19, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s18, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s17, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s17, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s17, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 12 +; GFX9-NEXT: s_lshr_b32 s46, s16, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 11 +; GFX9-NEXT: s_lshr_b32 s46, s16, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 10 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 9 +; GFX9-NEXT: s_lshr_b32 s46, s7, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 7 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 6 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 5 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: v_writelane_b32 v62, s46, 4 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 3 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 2 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: v_writelane_b32 v62, s46, 1 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: v_writelane_b32 v62, s46, 0 +; GFX9-NEXT: s_lshr_b32 s82, s11, 24 +; GFX9-NEXT: s_lshr_b32 s83, s11, 16 +; GFX9-NEXT: s_lshr_b32 s85, s11, 
8 +; GFX9-NEXT: s_lshr_b32 s84, s10, 16 +; GFX9-NEXT: s_lshr_b32 s86, s10, 8 +; GFX9-NEXT: s_lshr_b32 s87, s13, 24 +; GFX9-NEXT: s_lshr_b32 s96, s13, 16 +; GFX9-NEXT: s_lshr_b32 s98, s13, 8 +; GFX9-NEXT: s_lshr_b32 s97, s12, 16 +; GFX9-NEXT: s_lshr_b32 s99, s12, 8 +; GFX9-NEXT: s_lshr_b32 s38, s15, 24 +; GFX9-NEXT: s_lshr_b32 s39, s15, 16 +; GFX9-NEXT: s_lshr_b32 s49, s15, 8 +; GFX9-NEXT: s_lshr_b32 s48, s14, 16 +; GFX9-NEXT: s_lshr_b32 s50, s14, 8 +; GFX9-NEXT: s_lshr_b32 s51, s41, 24 +; GFX9-NEXT: s_lshr_b32 s52, s41, 16 +; GFX9-NEXT: s_lshr_b32 s54, s41, 8 +; GFX9-NEXT: s_lshr_b32 s53, s40, 16 +; GFX9-NEXT: s_lshr_b32 s55, s40, 8 +; GFX9-NEXT: s_lshr_b32 s64, s43, 24 +; GFX9-NEXT: s_lshr_b32 s65, s43, 16 +; GFX9-NEXT: s_lshr_b32 s67, s43, 8 +; GFX9-NEXT: s_lshr_b32 s66, s42, 16 +; GFX9-NEXT: s_lshr_b32 s68, s42, 8 +; GFX9-NEXT: s_lshr_b32 s69, s45, 24 +; GFX9-NEXT: s_lshr_b32 s70, s45, 16 +; GFX9-NEXT: s_lshr_b32 s80, s45, 8 +; GFX9-NEXT: s_lshr_b32 s71, s44, 16 +; GFX9-NEXT: s_lshr_b32 s81, s44, 8 +; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB99_4 +; GFX9-NEXT: .LBB99_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v26, s5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 
v[15:16], 24, v[25:26] +; GFX9-NEXT: v_pk_add_u16 v28, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[27:28] +; GFX9-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[29:30] +; GFX9-NEXT: v_pk_add_u16 v32, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[31:32] +; GFX9-NEXT: v_pk_add_u16 v34, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v33, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[33:34] +; GFX9-NEXT: v_pk_add_u16 v36, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v35, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[35:36] +; GFX9-NEXT: v_pk_add_u16 v38, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v37, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 
4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[37:38] +; GFX9-NEXT: v_pk_add_u16 v49, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v48, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[48:49] +; GFX9-NEXT: v_pk_add_u16 v2, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_pk_add_u16 v4, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_pk_add_u16 v6, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: v_pk_add_u16 v8, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX9-NEXT: v_pk_add_u16 v10, s15, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_pk_add_u16 v12, s41, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s40, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] +; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] +; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b32_e32 v19, 24, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v28 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte 
Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v30 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v12 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v32 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v11 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded 
Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v34 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v14 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v33 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v36 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v13 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill 
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v22 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v38 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v38 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v4 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: s_branch .LBB99_5 +; GFX9-NEXT: .LBB99_3: +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: 
$sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; 
kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; 
implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB99_2 +; GFX9-NEXT: .LBB99_4: +; GFX9-NEXT: v_mov_b32_e32 v15, s71 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s80 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s70 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s69 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s68 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_mov_b32_e32 v15, s66 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s67 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s65 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s64 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s55 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s53 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s54 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s52 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s51 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s50 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s48 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s49 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s39 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s38 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s99 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s97 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_mov_b32_e32 v15, s98 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s96 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s87 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s86 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s84 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s85 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s83 +; GFX9-NEXT: v_mov_b32_e32 v25, s4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s82 +; GFX9-NEXT: v_readlane_b32 s4, v62, 0 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 1 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 2 +; GFX9-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 3 +; GFX9-NEXT: v_mov_b32_e32 v55, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 5 +; GFX9-NEXT: v_mov_b32_e32 v53, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 6 +; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 7 +; GFX9-NEXT: v_mov_b32_e32 v51, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 8 +; GFX9-NEXT: v_mov_b32_e32 v50, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: v_mov_b32_e32 v24, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 10 
+; GFX9-NEXT: v_mov_b32_e32 v20, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 11 +; GFX9-NEXT: v_mov_b32_e32 v42, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 12 +; GFX9-NEXT: v_mov_b32_e32 v18, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 13 +; GFX9-NEXT: v_mov_b32_e32 v39, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 14 +; GFX9-NEXT: v_mov_b32_e32 v43, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 15 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 16 +; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 17 +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 18 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 19 +; GFX9-NEXT: v_mov_b32_e32 v54, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 20 +; GFX9-NEXT: v_mov_b32_e32 v61, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 21 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 22 +; GFX9-NEXT: v_mov_b32_e32 v60, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 23 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 24 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 25 +; GFX9-NEXT: v_mov_b32_e32 v23, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 26 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 27 +; GFX9-NEXT: v_mov_b32_e32 v59, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 28 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded 
Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 29 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: v_mov_b32_e32 v58, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 31 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 32 +; GFX9-NEXT: v_mov_b32_e32 v57, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 33 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 34 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 35 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 36 +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 37 +; GFX9-NEXT: v_mov_b32_e32 v56, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 38 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 39 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 40 +; GFX9-NEXT: v_mov_b32_e32 v47, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 41 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 42 +; GFX9-NEXT: v_mov_b32_e32 v46, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 43 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; 
GFX9-NEXT: v_readlane_b32 s4, v62, 44 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 45 +; GFX9-NEXT: v_mov_b32_e32 v45, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 46 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 47 +; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 48 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 49 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s46 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s56 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s58 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s60 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s62 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s72 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s74 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s76 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s78 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s88 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s90 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s92 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s94 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s30 +; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s34 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v41, s36 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s44 +; GFX9-NEXT: v_mov_b32_e32 v22, s45 +; GFX9-NEXT: v_mov_b32_e32 v13, s42 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v11, s40 +; GFX9-NEXT: v_mov_b32_e32 v12, s41 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s15 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s13 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v48, s16 +; GFX9-NEXT: v_mov_b32_e32 v49, s17 +; GFX9-NEXT: v_mov_b32_e32 v37, s18 +; GFX9-NEXT: v_mov_b32_e32 v38, s19 +; GFX9-NEXT: v_mov_b32_e32 v35, s20 +; GFX9-NEXT: v_mov_b32_e32 v36, s21 +; GFX9-NEXT: v_mov_b32_e32 v33, s22 +; GFX9-NEXT: v_mov_b32_e32 v34, s23 +; GFX9-NEXT: v_mov_b32_e32 v31, s24 +; GFX9-NEXT: v_mov_b32_e32 v32, s25 +; GFX9-NEXT: v_mov_b32_e32 v29, s26 +; GFX9-NEXT: v_mov_b32_e32 v30, s27 +; GFX9-NEXT: v_mov_b32_e32 v27, s28 +; GFX9-NEXT: v_mov_b32_e32 v28, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s5 +; GFX9-NEXT: v_mov_b32_e32 v41, v50 +; GFX9-NEXT: v_mov_b32_e32 v50, v51 +; GFX9-NEXT: v_mov_b32_e32 v51, v52 +; GFX9-NEXT: v_mov_b32_e32 v52, v53 +; GFX9-NEXT: v_mov_b32_e32 v53, v55 +; GFX9-NEXT: v_mov_b32_e32 
v55, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, s81 +; GFX9-NEXT: .LBB99_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v16, v37, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v35, v35, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v36, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v23, v33, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v56 +; GFX9-NEXT: 
v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_readlane_b32 s99, v63, 35 +; GFX9-NEXT: v_readlane_b32 s98, v63, 34 +; GFX9-NEXT: v_readlane_b32 s97, v63, 33 +; GFX9-NEXT: v_readlane_b32 s96, v63, 32 +; GFX9-NEXT: v_readlane_b32 s87, v63, 31 +; GFX9-NEXT: v_readlane_b32 s86, v63, 30 +; GFX9-NEXT: v_readlane_b32 s85, v63, 29 +; GFX9-NEXT: v_readlane_b32 s84, v63, 28 +; GFX9-NEXT: v_readlane_b32 s83, v63, 27 +; GFX9-NEXT: v_readlane_b32 s82, v63, 26 +; GFX9-NEXT: v_readlane_b32 s81, v63, 25 +; GFX9-NEXT: v_readlane_b32 s80, v63, 24 +; GFX9-NEXT: v_readlane_b32 s71, v63, 23 +; GFX9-NEXT: v_readlane_b32 s70, v63, 22 +; GFX9-NEXT: v_readlane_b32 s69, v63, 21 +; GFX9-NEXT: v_readlane_b32 s68, v63, 20 +; GFX9-NEXT: v_readlane_b32 s67, v63, 19 +; GFX9-NEXT: v_readlane_b32 s66, v63, 18 +; GFX9-NEXT: v_readlane_b32 s65, v63, 17 +; GFX9-NEXT: v_readlane_b32 s64, v63, 16 +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, 
v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v38, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v19, v42, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v19, v39, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v40, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen 
offset:8 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 
offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v28, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, 
v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte 
Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v8, off, 
s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v53, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64i16_to_v128i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:88 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s30, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-TRUE16-NEXT: 
v_readfirstlane_b32 s6, v11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_hi, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_clause 0x12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-TRUE16-NEXT: v_writelane_b32 
v75, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s103, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v76, s104, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s54, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s55, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s64, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s65, 17 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s66, 18 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s67, 19 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s68, 20 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s69, 21 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s70, 22 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s71, 23 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s80, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s81, 25 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s82, 26 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s83, 27 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s84, 28 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s85, 29 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s86, 30 +; GFX11-TRUE16-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[0:1], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s5, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s5, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s5, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s4, 8 +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s7, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s7, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s6, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s9, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s8, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s11, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s11, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s11, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s10, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s13, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s13, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s12, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s12, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s15, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s15, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s15, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s14, 8 +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s41, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s41, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s41, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s40, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s29, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-TRUE16-NEXT: 
v_writelane_b32 v78, s42, 21 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s42, s2, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s74, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s75, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB99_4 +; GFX11-TRUE16-NEXT: .LBB99_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v51, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v50, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v36, s19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, 
s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s40, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v55, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v49, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v48, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[35:36] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[48:49] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[54:55] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; GFX11-TRUE16-NEXT: 
v_lshrrev_b64 v[65:66], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 16, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 16, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 24, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 16, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 8, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 16, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v50 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, 
v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 16, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 
16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v21 +; GFX11-TRUE16-NEXT: s_branch .LBB99_5 +; GFX11-TRUE16-NEXT: .LBB99_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr67_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s43, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 7 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB99_2 +; GFX11-TRUE16-NEXT: .LBB99_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, 
s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 10 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, s104 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, s103 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e64 v134.l, s102 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 11 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, s101 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, s100 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, s99 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, s98 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 12 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, s97 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, s96 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, s87 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, s86 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, s85 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, s84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, s83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, s82 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, s81 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, s80 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, s71 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, s70 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, s69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, s68 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, s67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, s66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, s65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s51 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 25 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 26 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 7 
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, s0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s1, v78, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s0 +; GFX11-TRUE16-NEXT: .LBB99_5: ; %end +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v54, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v60, 0xff, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v69, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v72 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, v60, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v57, 8, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v55, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, v54, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v69, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 
0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v50, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v59 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, v55, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v51, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v81, v57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xff, v47 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v56, v50, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v48, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v81, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v44 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v57, v50, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, v49, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v70, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v43 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v42 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v48, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, v35, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v69, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, v49, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v36, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v67, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v67, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v180 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v35, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v36, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v176 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v167 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v36, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, v67, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[54:57], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[48:51], off offset:16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v164 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v160 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v151 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v66, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, v36, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v49, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v27, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v147 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xff, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v38, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v49, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v52, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, 
v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, v31, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, v27, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, v28, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, v21, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v32, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v131 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v129 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v32, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v128 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xff, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xff, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v116 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v49, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, 
v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v51, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, v17, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, v18, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v102 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v103 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v101 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v27, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v165 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v70, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v24, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v26, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v67, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v23, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v20 +; GFX11-TRUE16-NEXT: s_clause 0x5 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[35:38], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[48:51], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-TRUE16-NEXT: s_clause 0x12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:4 +; 
GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:72 +; GFX11-TRUE16-NEXT: v_readlane_b32 s104, v76, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s103, v76, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s102, v76, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s101, v76, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s100, v76, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s99, v76, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s98, v76, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s97, v76, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s96, v76, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s87, v75, 31 +; GFX11-TRUE16-NEXT: v_readlane_b32 s86, v75, 30 +; GFX11-TRUE16-NEXT: v_readlane_b32 s85, v75, 29 +; GFX11-TRUE16-NEXT: v_readlane_b32 s84, v75, 28 +; GFX11-TRUE16-NEXT: v_readlane_b32 s83, v75, 27 +; GFX11-TRUE16-NEXT: v_readlane_b32 s82, v75, 26 +; GFX11-TRUE16-NEXT: v_readlane_b32 s81, v75, 25 +; GFX11-TRUE16-NEXT: v_readlane_b32 s80, v75, 24 +; GFX11-TRUE16-NEXT: v_readlane_b32 s71, v75, 23 +; GFX11-TRUE16-NEXT: v_readlane_b32 
s70, v75, 22 +; GFX11-TRUE16-NEXT: v_readlane_b32 s69, v75, 21 +; GFX11-TRUE16-NEXT: v_readlane_b32 s68, v75, 20 +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v75, 19 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v75, 18 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v75, 17 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v75, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v75, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v75, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v75, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v75, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v75, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v75, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v75, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v75, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v75, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v75, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v75, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v75, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v75, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v75, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v75, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:88 +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:88 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s40, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s41, v2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-FAKE16-NEXT: s_mov_b32 s99, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:52 +; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v46, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s103, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v76, s104, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s54, 14 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s55, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s64, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s65, 17 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s66, 18 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s67, 19 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s68, 20 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s69, 21 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s70, 22 +; GFX11-FAKE16-NEXT: v_writelane_b32 
v75, s71, 23 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s80, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s81, 25 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s82, 26 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s83, 27 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s84, 28 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s85, 29 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s86, 30 +; GFX11-FAKE16-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB99_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[0:1], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s5, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s5, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s5, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s4, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s7, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s50, s7, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s51, s6, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s52, s9, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s53, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s55, s9, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s54, s8, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s64, s8, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s65, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s66, s11, 16 +; 
GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s68, s11, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s67, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s69, s10, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s70, s13, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s71, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s81, s13, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s80, s12, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s82, s12, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s83, s15, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s84, s15, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s86, s15, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s85, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s87, s14, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s96, s41, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s97, s41, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s100, s41, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s98, s40, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s101, s40, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s102, s29, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s103, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s29, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s104, s28, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; 
GFX11-FAKE16-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s2, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s2, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s0, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s74, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 
v78, s75, 1 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s99 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB99_4 +; GFX11-FAKE16-NEXT: .LBB99_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v39, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v38, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v51, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v50, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v33, s21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, s23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, s22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, s20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s41, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s40, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v53, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v52, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v37, s19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, 
s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v36, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[28:29] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[81:82], 24, v[50:51] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[20:21] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[24:25] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[52:53] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 24, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 24, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 
24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 8, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 16, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 8, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v5 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v15 +; GFX11-FAKE16-NEXT: s_branch .LBB99_5 +; GFX11-FAKE16-NEXT: .LBB99_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: s_mov_b32 s99, -1 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr104 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi 
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr97 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr87 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v78, s43, 1 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; kill: killed $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: s_branch .LBB99_2 +; GFX11-FAKE16-NEXT: .LBB99_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 2 +; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v71, s50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s41 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v74, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s14 :: v_dual_mov_b32 v12, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s12 :: v_dual_mov_b32 v10, s13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v73, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v55, s48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s10 :: v_dual_mov_b32 v8, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v6, s9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v72, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v49, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v62, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s2 :: v_dual_mov_b32 v51, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s16 :: v_dual_mov_b32 v39, s17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v63, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s38 :: v_dual_mov_b32 v36, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s19 :: v_dual_mov_b32 v32, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s21 :: v_dual_mov_b32 v60, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_mov_b32 v29, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, 
s25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v61, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v21, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v146, s42 :: v_dual_mov_b32 v145, s104 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v59, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v144, vcc_hi :: v_dual_mov_b32 v135, s103 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v134, s102 :: v_dual_mov_b32 v133, s101 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v57, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s36 :: v_dual_mov_b32 v132, s98 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v131, s100 :: v_dual_mov_b32 v130, s97 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v129, s96 :: v_dual_mov_b32 v58, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s37 :: v_dual_mov_b32 v128, s87 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v119, s85 :: v_dual_mov_b32 v118, s86 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v117, s84 :: v_dual_mov_b32 v56, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v116, s83 :: v_dual_mov_b32 v115, s82 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v114, s80 :: v_dual_mov_b32 v113, s81 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v47, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s35 :: v_dual_mov_b32 v112, s71 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v103, s70 :: v_dual_mov_b32 v102, s69 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v101, s67 :: v_dual_mov_b32 v46, s0 +; GFX11-FAKE16-NEXT: 
v_readlane_b32 s0, v78, 15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s34 :: v_dual_mov_b32 v100, s68 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v99, s66 :: v_dual_mov_b32 v98, s65 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v97, s64 :: v_dual_mov_b32 v44, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s54 :: v_dual_mov_b32 v87, s55 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v85, s52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v45, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s51 :: v_dual_mov_b32 v83, s49 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v147, s43 :: v_dual_mov_b32 v22, s78 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v43, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v42, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v41, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v48, s62 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v54, s72 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v64, s60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s56 :: v_dual_mov_b32 v183, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v80, s46 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v18, s76 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v40, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 22 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v182, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v181, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v180, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v178, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v179, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v177, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v176, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v167, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v165, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 31 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v166, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v164, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v163, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v162, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v160, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v161, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v151, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v150, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v149, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v148, s0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s1, v78, 1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v82, s0 +; GFX11-FAKE16-NEXT: .LBB99_5: ; %end +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xff, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v63 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v52, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v57, 0xff, v57 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v69, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v62 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v67, 8, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, v71, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v82, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v70, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v66, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v35, 0x300, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v33, 0x300, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v38, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v32, 0x300, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v36, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v39, 0x300, v1 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 
-; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 -; GFX11-FAKE16-NEXT: .LBB48_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:404 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:408 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:412 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:416 -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:420 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:424 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:428 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:432 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:436 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:440 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:444 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:448 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:452 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:456 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:460 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:464 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:468 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 
offset:472 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:476 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:480 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:484 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:488 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:492 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:496 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:500 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:504 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:532 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:536 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:540 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:544 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:548 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:552 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:556 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:560 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:564 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:568 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:572 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:576 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v69, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v60, 0xff, v61 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v28, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, v50, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, v60, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v52, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v59 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v52, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v66, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v57, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v47 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v52, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v38, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v82, v80 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v53, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v39, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v81, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v42 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v38, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v80, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v69, v82, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xff, v183 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v38, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v70, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v81, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v181 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v80, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v33, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v38, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v66, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v179 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xff, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v167 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xff, v165 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v165, 8, v166 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v67, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v80, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, v82, v165 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v69, 16, v69 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[50:53], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[36:39], off offset:16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v32, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, v33, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, v28, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v29, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v163 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v29, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v50, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xff, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v146 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xff, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v32, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, v50, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, v53, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v24, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v20, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v21, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v15, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v144 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v25, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v132 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v131 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v25, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, 
v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v33, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v13, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, v14, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v112 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v103 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v102 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v20, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v96 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 8, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v26, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v21, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v5, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v10, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v20, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v22, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v17 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[36:39], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[50:53], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[64:67], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 +; 
GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:72 +; GFX11-FAKE16-NEXT: v_readlane_b32 s104, v76, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s103, v76, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s102, v76, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s101, v76, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s100, v76, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s99, v76, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s98, v76, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s97, v76, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s96, v76, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s87, v75, 31 +; GFX11-FAKE16-NEXT: v_readlane_b32 s86, v75, 30 +; GFX11-FAKE16-NEXT: v_readlane_b32 s85, v75, 29 +; GFX11-FAKE16-NEXT: v_readlane_b32 s84, v75, 28 +; GFX11-FAKE16-NEXT: v_readlane_b32 s83, v75, 27 +; GFX11-FAKE16-NEXT: v_readlane_b32 s82, v75, 26 +; GFX11-FAKE16-NEXT: v_readlane_b32 s81, v75, 25 +; GFX11-FAKE16-NEXT: v_readlane_b32 s80, v75, 24 +; GFX11-FAKE16-NEXT: 
v_readlane_b32 s71, v75, 23 +; GFX11-FAKE16-NEXT: v_readlane_b32 s70, v75, 22 +; GFX11-FAKE16-NEXT: v_readlane_b32 s69, v75, 21 +; GFX11-FAKE16-NEXT: v_readlane_b32 s68, v75, 20 +; GFX11-FAKE16-NEXT: v_readlane_b32 s67, v75, 19 +; GFX11-FAKE16-NEXT: v_readlane_b32 s66, v75, 18 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v75, 17 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v75, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s55, v75, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v75, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v75, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v75, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v75, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v75, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v75, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v75, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v75, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v75, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v75, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v75, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v75, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v75, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v75, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <128 x i8> %a, splat (i8 3) - %a2 = bitcast <128 x i8> %a1 to <64 x i16> + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <128 x i8> br label %end cmp.false: - %a3 = bitcast <128 x i8> %a to <64 x i16> + %a3 = bitcast <64 x i16> %a to <128 x i8> br 
label %end end: - %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <64 x i16> %phi + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi } define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; 
GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; GCN-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 -; GCN-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v61 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v62 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; 
implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; kill: killed $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; 
GCN-NEXT: s_cbranch_execz .LBB49_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: v_mov_b32_e32 v5, v4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_lshrrev_b32_e32 v13, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; GCN-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v47 -; GCN-NEXT: 
v_lshrrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v61 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v9 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v18 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v17, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v35 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v50 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v51 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v53 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v55 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v40 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v41 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v43 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v44 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v46 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v47 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v56 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v59 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v63 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; 
GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; 
implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: .LBB49_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v60 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v58 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v56 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v46 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v42 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 -; 
GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v42, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v44, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v62, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; 
GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v31 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v35 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v39, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v48, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v49, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v50, 0x40c00000, v53 -; GCN-NEXT: v_add_f32_e32 v51, 0x40c00000, v54 -; GCN-NEXT: v_add_f32_e32 v52, 0x40c00000, v55 -; GCN-NEXT: v_add_f32_e32 v53, 0x40c00000, v40 -; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, 
v41 -; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v42 -; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v43 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v44 -; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v45 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v46 -; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v47 -; GCN-NEXT: v_add_f32_e32 v45, 0x40c00000, v56 -; GCN-NEXT: v_add_f32_e32 v46, 0x40c00000, v57 -; GCN-NEXT: v_add_f32_e32 v47, 0x40c00000, v58 -; GCN-NEXT: v_add_f32_e32 v56, 0x40c00000, v59 -; GCN-NEXT: v_add_f32_e32 v57, 0x40c00000, v60 -; GCN-NEXT: v_add_f32_e32 v58, 0x40c00000, v61 -; GCN-NEXT: v_add_f32_e32 v59, 0x40c00000, v62 -; GCN-NEXT: v_add_f32_e32 v60, 0x40c00000, v63 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v63, 0x40c00000, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; 
GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 
v42, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v61 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v31 -; GCN-NEXT: v_mov_b32_e32 v31, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v29 -; GCN-NEXT: v_mov_b32_e32 v29, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v27, 
v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v25, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v24, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v22, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v21, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v19, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, 
v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v16, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v13, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v11, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v7, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], 
s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GCN-NEXT: .LBB49_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v4, v2 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v2, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v2, v1 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v2, v1 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v2, v1 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v4, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v6, v6, v2 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v8, v8, v2 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v10, v10, v2 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v23 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v12, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v12, v12, v2 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v14, v14, v2 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v16, v16, v2 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v18, v18, v2 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v20, v20, v2 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v22, v2 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: 
v_or_b32_e32 v25, v26, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v29, v30, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v33, v34, v33 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v35, v36, v35 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v37, v38, v37 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v39, v48, v39 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v49, v50, v49 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: 
v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v51, v52, v51 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v53, v54, v53 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v55, v40, v55 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v41, v42, v41 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v43, v3 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v45, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v57, s[0:3], 0 
offen -; GCN-NEXT: buffer_store_dword v46, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; 
GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64bf16_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 
offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v47, 
off, s[0:3], s32 offset:92 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v62, off, 
s[0:3], s32 offset:120 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v44 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v45 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v1 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v56 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v61 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: ; 
implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: 
; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; 
SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 
+; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, 
v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: .LBB100_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: 
v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: 
v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], 
s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], 
s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v56 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; 
SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: .LBB100_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; 
SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, 
s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, 
v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 
; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64f16: ; VI: ; %bb.0: @@ -106519,7 +227102,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_2 +; VI-NEXT: s_cbranch_execz .LBB100_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -107105,7 +227688,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB49_2: ; %end +; VI-NEXT: .LBB100_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -107126,1607 +227709,5513 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v64bf16_to_v64f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], 
s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB49_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 
vcc, v32, v32 -; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_add3_u32 v34, v34, v33, s6 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v34, v34, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_add3_u32 v35, v35, v34, s6 -; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; GFX9-NEXT: v_bfe_u32 v35, v18, 16, 1 -; GFX9-NEXT: v_add3_u32 v35, v35, v18, s6 -; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_add3_u32 v36, v36, v35, s6 -; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v35, v35 -; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc -; GFX9-NEXT: v_bfe_u32 v36, v19, 16, 1 -; GFX9-NEXT: v_add3_u32 v36, v36, v19, s6 -; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_add3_u32 v37, v37, v36, s6 -; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 -; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc -; GFX9-NEXT: v_bfe_u32 v37, v20, 16, 1 -; GFX9-NEXT: v_add3_u32 v37, v37, v20, s6 -; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_add3_u32 v38, v38, v37, s6 -; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 -; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc -; GFX9-NEXT: v_bfe_u32 v38, v21, 16, 1 -; GFX9-NEXT: v_add3_u32 v38, v38, v21, s6 -; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_add3_u32 v39, v39, v38, s6 -; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 -; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 
-; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc -; GFX9-NEXT: v_bfe_u32 v39, v22, 16, 1 -; GFX9-NEXT: v_add3_u32 v39, v39, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_add3_u32 v48, v48, v39, s6 -; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 -; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc -; GFX9-NEXT: v_bfe_u32 v48, v23, 16, 1 -; GFX9-NEXT: v_add3_u32 v48, v48, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_add3_u32 v49, v49, v48, s6 -; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 -; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc -; GFX9-NEXT: v_bfe_u32 v49, v24, 16, 1 -; GFX9-NEXT: v_add3_u32 v49, v49, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v49, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_add3_u32 v50, v50, v49, s6 -; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 -; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc -; GFX9-NEXT: v_bfe_u32 v50, 
v25, 16, 1 -; GFX9-NEXT: v_add3_u32 v50, v50, v25, s6 -; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v50, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 -; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_add3_u32 v51, v51, v50, s6 -; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 -; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc -; GFX9-NEXT: v_bfe_u32 v51, v26, 16, 1 -; GFX9-NEXT: v_add3_u32 v51, v51, v26, s6 -; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v51, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 -; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_add3_u32 v52, v52, v51, s6 -; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 -; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc -; GFX9-NEXT: v_bfe_u32 v52, v27, 16, 1 -; GFX9-NEXT: v_add3_u32 v52, v52, v27, s6 -; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v52, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_add3_u32 v53, v53, v52, s6 -; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 -; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc -; GFX9-NEXT: v_bfe_u32 v53, v28, 16, 1 -; GFX9-NEXT: v_add3_u32 v53, v53, v28, s6 -; GFX9-NEXT: 
v_or_b32_e32 v54, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v53, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_add3_u32 v54, v54, v53, s6 -; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 -; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc -; GFX9-NEXT: v_bfe_u32 v54, v29, 16, 1 -; GFX9-NEXT: v_add3_u32 v54, v54, v29, s6 -; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v54, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 -; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_add3_u32 v55, v55, v54, s6 -; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 -; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc -; GFX9-NEXT: v_bfe_u32 v55, v30, 16, 1 -; GFX9-NEXT: v_add3_u32 v55, v55, v30, s6 -; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 -; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX9-NEXT: v_add3_u32 v40, v40, v55, s6 -; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 -; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc -; GFX9-NEXT: v_bfe_u32 v40, v31, 16, 1 -; GFX9-NEXT: v_add3_u32 v40, v40, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 -; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v40, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 -; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add3_u32 v41, v41, v40, s6 -; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 -; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc -; GFX9-NEXT: v_bfe_u32 v41, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v41, v41, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v41, v42, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v41, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 -; GFX9-NEXT: v_bfe_u32 v42, v41, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v42, v42, v41, s6 -; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v41 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v41, v41 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc -; GFX9-NEXT: v_bfe_u32 v42, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v42, v42, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v1 +; GFX9-LABEL: bitcast_v64bf16_to_v64f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB100_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: 
v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v34, v34, v33, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v34, v34, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v35, v35, v34, s6 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; GFX9-NEXT: v_bfe_u32 v35, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v35, v35, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v36, v36, v35, s6 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; GFX9-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v36, v36, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 +; 
GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v37, v37, v36, s6 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; GFX9-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v37, v37, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v38, v38, v37, s6 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; GFX9-NEXT: v_bfe_u32 v38, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v38, v38, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v39, v39, v38, s6 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; GFX9-NEXT: v_bfe_u32 v39, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v39, v39, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v48, 
v48, v39, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; GFX9-NEXT: v_bfe_u32 v48, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v48, v48, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v49, v49, v48, s6 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; GFX9-NEXT: v_bfe_u32 v49, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v49, v49, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v50, v50, v49, s6 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; GFX9-NEXT: v_bfe_u32 v50, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v50, v50, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v51, v51, v50, s6 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v50, v50 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; GFX9-NEXT: v_bfe_u32 v51, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v51, v51, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v52, v52, v51, s6 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; GFX9-NEXT: v_bfe_u32 v52, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v52, v52, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v53, v53, v52, s6 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; GFX9-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v53, v53, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v54, v54, v53, s6 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 
+; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; GFX9-NEXT: v_bfe_u32 v54, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v54, v54, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v54, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v55, v55, v54, s6 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; GFX9-NEXT: v_bfe_u32 v55, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v55, v55, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v40, v40, v55, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; GFX9-NEXT: v_bfe_u32 v40, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v40, v40, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v41, v41, v40, s6 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc 
+; GFX9-NEXT: v_bfe_u32 v41, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v41, v41, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v41, v42, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 +; GFX9-NEXT: v_bfe_u32 v42, v41, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v42, v42, v41, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v41 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v41, v41 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc +; GFX9-NEXT: v_bfe_u32 v42, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v42, v42, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 +; GFX9-NEXT: v_bfe_u32 v43, v42, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v43, v43, v42, s6 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v42 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v42, v42 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v42, v43, v44, vcc +; GFX9-NEXT: v_bfe_u32 v43, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v43, v43, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v43, v44, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 +; GFX9-NEXT: v_bfe_u32 v44, v43, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v44, v44, v43, s6 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v43 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v43, v43 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v43, v44, v45, vcc +; GFX9-NEXT: v_bfe_u32 
v44, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v44, v44, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v44, v45, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 +; GFX9-NEXT: v_bfe_u32 v45, v44, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v45, v45, v44, s6 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v44 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v44, v44 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v44, v45, v46, vcc +; GFX9-NEXT: v_bfe_u32 v45, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v45, v45, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v45, v46, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 +; GFX9-NEXT: v_bfe_u32 v46, v45, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v46, v46, v45, s6 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v45 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v45, v45 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v45, v46, v47, vcc +; GFX9-NEXT: v_bfe_u32 v46, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v46, v46, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v46, v47, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 +; GFX9-NEXT: v_bfe_u32 v47, v46, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v47, v47, v46, s6 +; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v46 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v46, v46 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v46, v47, v56, vcc +; GFX9-NEXT: v_bfe_u32 v47, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v47, v47, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v6 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v47, v56, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 +; GFX9-NEXT: v_bfe_u32 v56, v47, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v56, v56, v47, s6 +; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v47 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v47, v47 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v47, v56, v57, vcc +; GFX9-NEXT: v_bfe_u32 v56, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v56, v56, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v56, v57, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v56, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 +; GFX9-NEXT: v_bfe_u32 v57, v56, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v57, v57, v56, s6 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v56 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v56, v56 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v56, v57, v58, vcc +; GFX9-NEXT: v_bfe_u32 v57, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v57, v57, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v57, v58, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 +; GFX9-NEXT: v_bfe_u32 v58, v57, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v58, v58, v57, s6 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v57 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v57, v57 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v57, v58, v59, vcc +; GFX9-NEXT: v_bfe_u32 v58, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v58, v58, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v58, v59, vcc +; GFX9-NEXT: 
v_lshlrev_b32_e32 v58, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 +; GFX9-NEXT: v_bfe_u32 v59, v58, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v59, v59, v58, s6 +; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v58 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v58, v58 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v58, v59, v60, vcc +; GFX9-NEXT: v_bfe_u32 v59, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v59, v59, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v59, v60, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 +; GFX9-NEXT: v_bfe_u32 v60, v59, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v60, v60, v59, s6 +; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v59 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v59, v59 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v59, v60, v61, vcc +; GFX9-NEXT: v_bfe_u32 v60, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v60, v60, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v60, v61, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 +; GFX9-NEXT: v_bfe_u32 v61, v60, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v61, v61, v60, s6 +; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v60 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v60, v60 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v60, v61, v62, vcc +; GFX9-NEXT: v_bfe_u32 v61, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v61, v61, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v61, v62, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 
+; GFX9-NEXT: v_bfe_u32 v62, v61, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v62, v62, v61, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v61 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v61, v61 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v61, v62, v63, vcc +; GFX9-NEXT: v_bfe_u32 v62, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v62, v62, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v62, v63, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_add3_u32 v63, v63, v62, s6 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_bfe_u32 v62, v14, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v63, v0, vcc +; GFX9-NEXT: v_add3_u32 v62, v62, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v62, v63, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_add3_u32 v63, v63, v62, s6 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_bfe_u32 v62, v15, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v63, v0, vcc +; GFX9-NEXT: v_add3_u32 v62, v62, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v62, v63, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v0, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v1, s6 +; GFX9-NEXT: v_perm_b32 v1, v16, v41, s6 +; GFX9-NEXT: 
v_perm_b32 v0, v17, v40, s6 +; GFX9-NEXT: v_perm_b32 v17, v32, v33, s6 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v13, v13, v61, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v60, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v59, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v58, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v57, s6 +; GFX9-NEXT: v_perm_b32 v8, v8, v56, s6 +; GFX9-NEXT: v_perm_b32 v7, v7, v47, s6 +; GFX9-NEXT: v_perm_b32 v6, v6, v46, s6 +; GFX9-NEXT: v_perm_b32 v5, v5, v45, s6 +; GFX9-NEXT: v_perm_b32 v4, v4, v44, s6 +; GFX9-NEXT: v_perm_b32 v3, v3, v43, s6 +; GFX9-NEXT: v_perm_b32 v2, v2, v42, s6 +; GFX9-NEXT: v_perm_b32 v31, v31, v55, s6 +; GFX9-NEXT: v_perm_b32 v30, v30, v54, s6 +; GFX9-NEXT: v_perm_b32 v29, v29, v53, s6 +; GFX9-NEXT: v_perm_b32 v28, v28, v52, s6 +; GFX9-NEXT: v_perm_b32 v27, v27, v51, s6 +; GFX9-NEXT: v_perm_b32 v26, v26, v50, s6 +; GFX9-NEXT: v_perm_b32 v25, v25, v49, s6 +; GFX9-NEXT: v_perm_b32 v24, v24, v48, s6 +; GFX9-NEXT: v_perm_b32 v23, v23, v39, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v38, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v37, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v36, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v35, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v34, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v16, v32, v16, s6 +; GFX9-NEXT: .LBB100_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB100_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v71, 0x40c00000, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v16, v16, v32, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, 
v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v17, v17, v39 :: v_dual_lshlrev_b32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v34, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v37, v32 :: v_dual_lshlrev_b32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v37, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v80, 0x40c00000, v80 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v38, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_add_f32 v39, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: 
v_dual_cndmask_b32 v19, v34, v35 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v39, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v82, 0x40c00000, v82 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_lshlrev_b32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v36, v39, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v39 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v49 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v35, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v38, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v48, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v84, 0x40c00000, v84 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v37, v48, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v48 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v49, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_lshlrev_b32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v86, 0x40c00000, v86 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v37, v38, v39, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v39 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_add_f32 v87, 0x40c00000, v87 :: v_dual_lshlrev_b32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v96, 0x40c00000, v96 :: v_dual_cndmask_b32 v21, v37, v38 +; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v49, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v49 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v15 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v22, v37, v38 :: v_dual_add_f32 v49, 0x40c00000, v51 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v50, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v50 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v51, 0x40c00000, v23 :: v_dual_add_f32 v14, 0x40c00000, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v50, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v48, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v49, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v98, 0x40c00000, v98 +; GFX11-TRUE16-NEXT: v_bfe_u32 v101, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v39, v48, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v48 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-TRUE16-NEXT: v_add3_u32 v101, v101, v14, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v102, v98, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v98 +; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v23, v38, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v49, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v49 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v52 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v51, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-TRUE16-NEXT: v_add3_u32 v102, v102, v98, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v50, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v48, v51, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v51 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v24 :: v_dual_lshlrev_b32 v25, 16, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v39, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v49, v50, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v50 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v53 +; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v52, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v51, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v48, v49, v52, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v52 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v54 :: v_dual_add_f32 v53, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v25, v48, v49 :: v_dual_and_b32 v26, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_add3_u32 v48, v50, v51, 0x7fff +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v51 +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v53, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v52, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v49, v50, v53, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v53 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v54, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v50, v51, v52, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v52 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v49.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v50, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v50, v53, v54, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v54 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v55, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v54, 0x40c00000, v64 :: v_dual_add_f32 v53, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v50, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v55, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v55 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v53, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 
v64, 0x40c00000, v28 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v51, v52, v53, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v53 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v51, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v51, v55, v54, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v54 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v65 +; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v64, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v55, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v52, v53, v64, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v64 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v65, 0x40c00000, v29 :: v_dual_lshlrev_b32 v30, 16, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v52, v54, v55, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v55 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v66 +; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v65, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v64, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v53, v54, v65, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v65 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v65, 0x40c00000, v67 :: v_dual_add_f32 v66, 0x40c00000, v30 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v30, v53, v54 :: v_dual_and_b32 v31, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v53, v55, v64, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v64 +; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v66, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-TRUE16-NEXT: v_bfe_u32 v64, v65, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v54, v55, v66, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v66 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v67, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v55, v64, v65, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v65 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v54.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v55, v64, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v55, v66, v67, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v67 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v68, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v67, 0x40c00000, v69 :: v_dual_add_f32 v66, 0x40c00000, v0 +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v55, v64, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v68, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v68 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 +; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v66, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v68, v67, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v69, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v64, v65, v66, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v66 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v70, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v65, v68, v67, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v68, v69, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-TRUE16-NEXT: v_bfe_u32 v67, v70, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v64.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v65, v66, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v65, v68, v69, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v69 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v69, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v68, 0x40c00000, v68 :: v_dual_cndmask_b32 v65, v65, v66 +; GFX11-TRUE16-NEXT: v_add3_u32 v66, v67, v70, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v70 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 +; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v68, 16, 1 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v65.l, v65.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v66, v67, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v67, v69, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v65, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v28.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v67, v69, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v67, v70, v68, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v68 +; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v68, v70, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v67, v69, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v71, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v27.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v65, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v25.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v68, v69, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v68, v70, v71, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v71 +; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 +; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v80, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v49, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v68, v69, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v69, v70, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v67 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v66, v26 +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v48, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v69, v70 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v69, v71, v80, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v80 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 +; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v81, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v70, v71, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v68 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v0.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v64, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v30.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v70, v71, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v70, v80, v81, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v81 +; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 +; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v82, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v29.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v55, v52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v23.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v71, v80, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v64, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v24.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v71, v80 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_add3_u32 v71, v81, v82, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v82 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v83, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v54, v53 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v71, v80, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_add3_u32 v80, v81, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v22.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v50, v39 +; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v51, v38 +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v52, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v80, v82, v83, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v83 +; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v84, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v19.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v80, v81, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v81, v82, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v18.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v71 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v70 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v81, v82 :: v_dual_add_f32 
v9, 0x40c00000, v9 +; GFX11-TRUE16-NEXT: v_add3_u32 v81, v83, v84, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v84 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v85, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v69 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v81, v82, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add3_u32 v82, v83, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v80 +; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v68, v31 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v53, v21 +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v36, v35 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v82, v83, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v82, v84, v85, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v85 +; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 +; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v86, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v38, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v83, v84, v10, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v81 +; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v39, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v10, v83, v84 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-TRUE16-NEXT: v_add3_u32 v83, v85, v86, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v86 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; 
GFX11-TRUE16-NEXT: v_bfe_u32 v86, v87, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v83, v84, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v84, v85, v11, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v82 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v84, v85, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v84, v86, v87, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v87 +; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-TRUE16-NEXT: v_bfe_u32 v87, v96, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v84, v85, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v85, v86, v12, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v83 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v12, v85, v86 :: v_dual_add_f32 v13, 0x40c00000, v13 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_add3_u32 v85, v87, v96, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v101, v112 :: v_dual_add_f32 v87, 0x40c00000, v97 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v86, v86, v13, 0x7fff +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v100, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v87, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v98, v102, v114 :: v_dual_add_f32 v15, 0x40c00000, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v103, 0x400000, v87 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-TRUE16-NEXT: v_add3_u32 v99, v99, v87, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v97, 0x400000, v96 +; GFX11-TRUE16-NEXT: v_bfe_u32 v113, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v98.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v99, v103, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: v_add3_u32 v101, v113, v15, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v84 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v101, v112, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v87 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v98, v15 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v86, v100, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v85, v97, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v85 +; GFX11-TRUE16-NEXT: .LBB100_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v64f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB100_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v69, 0x40c00000, v69 :: v_dual_lshlrev_b32 v70, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v10 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v87, 0x40c00000, v87 +; GFX11-FAKE16-NEXT: 
v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v16, v16, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v11 +; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v17, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add3_u32 v33, v34, v36, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v37, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v37, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, 
v33, v17, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v34, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v34, v37, v38, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v39, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v20 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v38, v37, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v38, v48, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v49, 0x40c00000, v20 :: v_dual_add_f32 v48, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v38, v39, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, 
v39 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v49, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v35, v19, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v49 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v49, 0x40c00000, v22 :: v_dual_lshlrev_b32 v22, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v37, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v39, v48, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v49, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v37, v21, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v50, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v38, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v48, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v51, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v50, 16, 1 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v48, v51, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v51 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v23 :: v_dual_add_f32 v51, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v39, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v49, v50, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v51, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v52, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v39, v23, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v49, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v52 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v48, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v50, v51, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v51 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v53, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v52, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v50, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v50, 0x400000, v53 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v54, 0x40c00000, v25 :: v_dual_add_f32 v53, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v49, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v51, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v52 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v53, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v48, v24, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v54, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v49, v25, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v51, v54, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v54 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v50, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v52, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v53 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v55, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v54, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v52, v55, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v55 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v64, 0x40c00000, v27 :: v_dual_add_f32 v55, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v28, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v51, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v53, v54, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v54 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v55, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v65, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v50, v26, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v64, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v51, v27, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v53, v64, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v64 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v52, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v54, v55, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v55 +; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v65, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v64, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v54, v65, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v65 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v66, 0x40c00000, v29 :: v_dual_add_f32 v65, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v53, v54, vcc_lo +; 
GFX11-FAKE16-NEXT: v_add3_u32 v53, v55, v64, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v64 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v65, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v52, v28, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v66, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v53, v29, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v54, v55, v66, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v66 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v66, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v54, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v54, v64, v65, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v65 +; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v67, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v66, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v55, v64, v67, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v67 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v67, 16, v1 +; GFX11-FAKE16-NEXT: v_perm_b32 v30, v54, v30, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v55, v64, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v67 +; GFX11-FAKE16-NEXT: v_add3_u32 v55, v65, v66, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, 
v66 +; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v68, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v0, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v64, v65, v68, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v68 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 +; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v67, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v31, v55, v31, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v65, v66, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v65, v66 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v65, v68, v67, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v67 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v64, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v65, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v68, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v69, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v66, v67, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v68, v69, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v69 +; 
GFX11-FAKE16-NEXT: v_bfe_u32 v68, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v65, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v66, v67, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v67, v68, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v67, v68 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v66, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v69, v70, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v70 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 +; GFX11-FAKE16-NEXT: v_add3_u32 v67, v69, v70, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v69, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v68, v69, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v68, v69 :: v_dual_and_b32 v4, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v67, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v70, v71, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 
0x400000, v71 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v68, v70, v71, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v70, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v68, v69, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v69, v70, v4, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v69, v70 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v68, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v80, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v80 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 +; GFX11-FAKE16-NEXT: v_add3_u32 v69, v71, v80, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v70, v71, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v70, v71 :: v_dual_and_b32 v6, 0xffff0000, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, 
v69, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v81, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v81 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v70, v80, v81, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v71, v80, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v71, v80 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v70, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v81, v82, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v82 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-FAKE16-NEXT: v_add3_u32 v71, v81, v82, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v81, v7, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v71, v71, v80, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v80, v81, v7, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v80, v81 :: v_dual_and_b32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v71, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v83, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v83 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v80, v82, v83, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v80, v81, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v81, v82, v8, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v81, v82 :: v_dual_and_b32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v80, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v84, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v84 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX11-FAKE16-NEXT: v_add3_u32 v81, v83, v84, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v9, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v81, v82, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v82, v83, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v82, v83 :: v_dual_and_b32 v10, 0xffff0000, v10 +; 
GFX11-FAKE16-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v81, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v84, v85, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, 0x400000, v85 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v82, v84, v85, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v84, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v83, v84, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v83, v84 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v82, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v86, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v86 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_add3_u32 v83, v85, v86, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v11, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v83, v83, v84 :: v_dual_lshlrev_b32 v84, 16, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v87, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v97, 0x400000, v87 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v84, 0x40c00000, v84 +; GFX11-FAKE16-NEXT: v_add3_u32 v86, v86, v87, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v98, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v84, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v85, v85, v11, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v86, v97, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v97, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_add3_u32 v87, v98, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v98, v99, v84, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 16, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v84 +; GFX11-FAKE16-NEXT: v_bfe_u32 v101, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v87, v97 :: v_dual_add_f32 v87, 0x40c00000, v99 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v97, v101, v13, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v86, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v87, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v84, v98, v100, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v101, 0x400000, v87 +; GFX11-FAKE16-NEXT: v_bfe_u32 v102, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v99, v99, v87, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v98, 0x40c00000, v98 :: v_dual_cndmask_b32 v87, v99, v101 +; GFX11-FAKE16-NEXT: v_add3_u32 v101, v102, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v102, 
0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v103, v98, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v98 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v101, v102 :: v_dual_add_f32 v15, 0x40c00000, v15 +; GFX11-FAKE16-NEXT: v_add3_u32 v103, v103, v98, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v87, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v98, v103, v112, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v99, v99, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v99, v113, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v98, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v97, v100, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v84, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v83, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB100_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = 
bitcast <64 x bfloat> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v16 +; SI-NEXT: s_waitcnt expcnt(2) +; 
SI-NEXT: v_mul_f32_e32 v62, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill 
+; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 
offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_mov_b32_e32 v55, v50 +; SI-NEXT: v_mov_b32_e32 v40, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v24, v47 +; SI-NEXT: v_mov_b32_e32 v23, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v25, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v26, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: 
v_mov_b32_e32 v36, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_mov_b32_e32 v35, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v38, v10 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v19, v28 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v39, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_mov_b32_e32 v47, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v49 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: v_mov_b32_e32 v15, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_mov_b32_e32 v51, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_mov_b32_e32 v37, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; 
SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v54 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: .LBB101_2: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 
; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 
0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, 
v14 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: 
buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v38, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v43 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v25 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v26 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: 
v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 
4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: 
v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v48 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v51 
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 +; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v53 +; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v49 +; SI-NEXT: v_add_i32_e32 v16, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v18 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v39 +; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v35 +; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v21 +; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v36 +; SI-NEXT: v_add_i32_e32 v16, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v47 +; SI-NEXT: v_add_i32_e32 v16, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v59 +; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v60 +; SI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; 
SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 +; SI-NEXT: v_add_i32_e32 v16, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v62 +; SI-NEXT: v_add_i32_e32 v16, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v44 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v46 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB101_4: +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v40, v52 +; SI-NEXT: v_mov_b32_e32 v55, v50 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v26, v57 +; SI-NEXT: v_mov_b32_e32 v25, v56 +; SI-NEXT: v_mov_b32_e32 v24, v47 +; SI-NEXT: v_mov_b32_e32 v23, v46 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: 
$vgpr17 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_branch .LBB101_2 +; +; VI-LABEL: bitcast_v64bf16_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v42, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_writelane_b32 v42, s31, 1 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; 
VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB101_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB101_4 +; VI-NEXT: .LBB101_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, 
vcc, v5, v4 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: 
v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, 
v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v18, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: s_lshl_b32 
s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v34, vcc, v34, v17 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; VI-NEXT: v_bfe_u32 v35, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, 
v36 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39 +; VI-NEXT: v_add_u32_e32 
v48, vcc, 0x7fff, v48 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 +; 
VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_or_b32_e32 v54, 
0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 +; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 +; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; VI-NEXT: 
v_bfe_u32 v40, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 +; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 +; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 +; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 +; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 +; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 +; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 +; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 +; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 +; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 +; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 +; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 +; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 +; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 +; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 +; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: s_branch .LBB101_5 +; VI-NEXT: .LBB101_3: +; VI-NEXT: s_branch .LBB101_2 +; VI-NEXT: .LBB101_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 
v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB101_5: ; %end +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_readlane_b32 s31, v42, 1 +; VI-NEXT: v_readlane_b32 s30, v42, 0 +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v43, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_writelane_b32 v43, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB101_4 +; GFX9-NEXT: .LBB101_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 -; GFX9-NEXT: v_bfe_u32 v43, v42, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_add3_u32 v43, v43, v42, s6 -; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v42 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v42, v42 -; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v42, v43, v44, vcc -; GFX9-NEXT: v_bfe_u32 v43, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v43, v43, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v2 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v43, v44, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v43, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 -; GFX9-NEXT: v_bfe_u32 v44, v43, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_add3_u32 v44, v44, v43, s6 -; 
GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v43 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v43, v43 -; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v43, v44, v45, vcc -; GFX9-NEXT: v_bfe_u32 v44, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v44, v44, v3, s6 -; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v3 +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v44, v45, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 -; GFX9-NEXT: v_bfe_u32 v45, v44, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_add3_u32 v45, v45, v44, s6 -; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v44 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v44, v44 -; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v44, v45, v46, vcc -; GFX9-NEXT: v_bfe_u32 v45, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v45, v45, v4, s6 -; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v4 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v45, v46, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 -; GFX9-NEXT: v_bfe_u32 v46, v45, 16, 1 -; GFX9-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_add3_u32 v46, v46, v45, s6 -; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v45 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v45, v45 -; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v45, v46, v47, vcc -; GFX9-NEXT: v_bfe_u32 v46, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v46, v46, v5, s6 -; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v46, v47, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 -; GFX9-NEXT: v_bfe_u32 v47, v46, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_add3_u32 v47, v47, v46, s6 -; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v46 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v46, v46 -; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v46, v47, v56, vcc -; GFX9-NEXT: v_bfe_u32 v47, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v47, v47, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v47, v56, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 -; GFX9-NEXT: v_bfe_u32 v56, v47, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_add3_u32 v56, v56, v47, s6 -; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v47 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v47, v47 -; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v47, v56, v57, vcc -; GFX9-NEXT: v_bfe_u32 v56, v7, 16, 1 -; GFX9-NEXT: v_add3_u32 v56, v56, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v56, v57, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v56, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 -; GFX9-NEXT: v_bfe_u32 v57, v56, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_add3_u32 v57, v57, v56, s6 -; GFX9-NEXT: v_or_b32_e32 v58, 
0x400000, v56 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v56, v56 -; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v56, v57, v58, vcc -; GFX9-NEXT: v_bfe_u32 v57, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v57, v57, v8, s6 -; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v57, v58, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 -; GFX9-NEXT: v_bfe_u32 v58, v57, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v58, v58, v57, s6 -; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v57 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v57, v57 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v57, v58, v59, vcc -; GFX9-NEXT: v_bfe_u32 v58, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v58, v58, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v58, v59, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 -; GFX9-NEXT: v_bfe_u32 v59, v58, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_add3_u32 v59, v59, v58, s6 -; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v58 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v58, v58 -; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v58, v59, v60, vcc -; GFX9-NEXT: v_bfe_u32 v59, v10, 16, 1 -; GFX9-NEXT: v_add3_u32 v59, v59, v10, s6 -; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v59, v60, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 -; GFX9-NEXT: v_bfe_u32 v60, v59, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v60, v60, v59, s6 -; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v59 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v59, v59 -; GFX9-NEXT: v_add_f32_e32 v11, 
0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v59, v60, v61, vcc -; GFX9-NEXT: v_bfe_u32 v60, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v60, v60, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v60, v61, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 -; GFX9-NEXT: v_bfe_u32 v61, v60, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v61, v61, v60, s6 -; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v60 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v60, v60 -; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v60, v61, v62, vcc -; GFX9-NEXT: v_bfe_u32 v61, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v61, v61, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v61, v62, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 -; GFX9-NEXT: v_bfe_u32 v62, v61, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v62, v62, v61, s6 -; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v61 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v61, v61 -; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v61, v62, v63, vcc -; GFX9-NEXT: v_bfe_u32 v62, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v62, v62, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v62, v63, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_add3_u32 v63, v63, v62, s6 -; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 -; GFX9-NEXT: v_bfe_u32 v62, v14, 16, 1 -; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v63, v0, vcc -; GFX9-NEXT: v_add3_u32 v62, v62, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v62, v63, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_add3_u32 v63, v63, v62, s6 -; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 -; GFX9-NEXT: v_bfe_u32 v62, v15, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v63, v0, vcc -; GFX9-NEXT: v_add3_u32 v62, v62, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v62, v63, vcc -; GFX9-NEXT: s_mov_b32 s6, 0x7060302 -; GFX9-NEXT: v_perm_b32 v15, v15, v0, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v1, s6 -; GFX9-NEXT: v_perm_b32 v1, v16, v41, s6 -; GFX9-NEXT: v_perm_b32 v0, v17, v40, s6 -; GFX9-NEXT: v_perm_b32 v17, v32, v33, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v13, v13, v61, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v60, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v59, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v58, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v57, s6 -; GFX9-NEXT: v_perm_b32 v8, v8, v56, s6 -; GFX9-NEXT: v_perm_b32 v7, v7, v47, s6 -; GFX9-NEXT: v_perm_b32 v6, v6, v46, s6 -; GFX9-NEXT: v_perm_b32 v5, v5, v45, s6 -; GFX9-NEXT: v_perm_b32 v4, v4, v44, s6 -; GFX9-NEXT: v_perm_b32 v3, v3, v43, s6 -; GFX9-NEXT: v_perm_b32 v2, v2, v42, s6 -; GFX9-NEXT: v_perm_b32 v31, v31, v55, s6 -; GFX9-NEXT: v_perm_b32 v30, v30, v54, s6 -; GFX9-NEXT: v_perm_b32 v29, v29, v53, s6 -; GFX9-NEXT: v_perm_b32 v28, v28, v52, s6 -; GFX9-NEXT: v_perm_b32 v27, v27, v51, s6 -; 
GFX9-NEXT: v_perm_b32 v26, v26, v50, s6 -; GFX9-NEXT: v_perm_b32 v25, v25, v49, s6 -; GFX9-NEXT: v_perm_b32 v24, v24, v48, s6 -; GFX9-NEXT: v_perm_b32 v23, v23, v39, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v38, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v37, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v36, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v35, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v34, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v16, v32, v16, s6 -; GFX9-NEXT: .LBB49_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 
s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v3, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 
v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; 
GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 
v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 
v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v33, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v2, v33, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v2 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v1 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v33, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v33 +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 
v35, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_and_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v35, v35, v34 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; GFX9-NEXT: v_bfe_u32 v35, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v35, v35, v17 +; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v35, v36, vcc +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v36, v36, v35 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; 
GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; GFX9-NEXT: v_bfe_u32 v36, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v36, v36, v32 +; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v36, v37, vcc +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v37, v37, v36 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; GFX9-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v37, v37, v19 +; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v37, v38, vcc +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v38, v38, v37 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; GFX9-NEXT: v_bfe_u32 v38, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v38, v38, v20 +; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v38, v39, vcc +; GFX9-NEXT: v_and_b32_e32 v38, 
0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v39, v39, v38 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; GFX9-NEXT: v_bfe_u32 v39, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v39, v39, v21 +; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v39, v48, vcc +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v48, v48, v39 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; GFX9-NEXT: v_bfe_u32 v48, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v48, v48, v22 +; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v49, v49, v48 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; GFX9-NEXT: v_bfe_u32 v49, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v49, v49, v23 +; GFX9-NEXT: v_add_u32_e32 v49, 
0x7fff, v49 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v49, v50, vcc +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v50, v50, v49 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; GFX9-NEXT: v_bfe_u32 v50, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v50, v50, v24 +; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v50, v51, vcc +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v51, v51, v50 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; GFX9-NEXT: v_bfe_u32 v51, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v51, v51, v25 +; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v51, v52, vcc +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v52, v52, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; GFX9-NEXT: v_add_f32_e32 
v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; GFX9-NEXT: v_bfe_u32 v52, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v52, v52, v26 +; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v52, v53, vcc +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v53, v53, v52 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; GFX9-NEXT: v_bfe_u32 v53, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v53, v53, v27 +; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v53, v54, vcc +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v54, v54, v53 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; GFX9-NEXT: v_bfe_u32 v54, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v54, v54, v28 +; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v54, v55, vcc +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v55, v55, v54 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; GFX9-NEXT: v_bfe_u32 v55, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v55, v55, v29 +; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v55, v40, vcc +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v40, v40, v55 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; GFX9-NEXT: v_bfe_u32 v40, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v40, v40, v30 +; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v40, v41, vcc +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v41, v41, v40 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc +; GFX9-NEXT: v_bfe_u32 v41, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v41, v41, v31 +; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v41, v42, vcc +; 
GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v40 +; GFX9-NEXT: v_and_b32_sdwa v31, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; GFX9-NEXT: v_and_b32_sdwa v30, v18, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GFX9-NEXT: v_and_b32_sdwa v29, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v53 +; GFX9-NEXT: v_and_b32_sdwa v28, v18, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; GFX9-NEXT: v_and_b32_sdwa v27, v18, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; GFX9-NEXT: v_and_b32_sdwa v26, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; GFX9-NEXT: v_and_b32_sdwa v25, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; GFX9-NEXT: v_and_b32_sdwa v24, v18, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GFX9-NEXT: v_and_b32_sdwa v23, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GFX9-NEXT: v_and_b32_sdwa v22, v18, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GFX9-NEXT: v_and_b32_sdwa v21, v18, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX9-NEXT: v_and_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX9-NEXT: v_and_b32_sdwa v19, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GFX9-NEXT: v_and_b32_sdwa v32, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX9-NEXT: v_and_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v31, v40, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v30, v55, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v28, v53, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v27, v52, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v26, v51, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v24, v49, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v22, v39, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v21, v38, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v19, v36, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v32, v35, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v16 +; GFX9-NEXT: s_branch .LBB101_5 +; GFX9-NEXT: .LBB101_3: +; GFX9-NEXT: s_branch .LBB101_2 +; GFX9-NEXT: .LBB101_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB101_5: ; %end +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], 
s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_readlane_b32 s31, v43, 1 +; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16: +; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v71, 0x40c00000, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, 
v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v32, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v16, v16, v32, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v16, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v17, v17, v39 :: v_dual_lshlrev_b32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v34, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 
0x400000, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v29 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v32, v37, v32 :: v_dual_lshlrev_b32 v29, 16, v29 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v31, 16, v31 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v37, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v38 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v80, 0x40c00000, v80 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_lshlrev_b32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v38, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_add_f32 v39, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v19, v34, v35 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v39, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v82, 0x40c00000, v82 -; 
GFX11-TRUE16-NEXT: v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_lshlrev_b32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v36, v39, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v39 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v49 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v35, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v38, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v48, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v84, 0x40c00000, v84 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v37, v48, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v48 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v49, 16, 1 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_lshlrev_b32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v86, 0x40c00000, v86 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v37, v38, v39, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v39 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v36.h -; GFX11-TRUE16-NEXT: v_dual_add_f32 v87, 0x40c00000, v87 :: v_dual_lshlrev_b32 v12, 16, v12 -; GFX11-TRUE16-NEXT: 
v_dual_add_f32 v96, 0x40c00000, v96 :: v_dual_cndmask_b32 v21, v37, v38 -; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v49, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v49 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v15 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v22, v37, v38 :: v_dual_add_f32 v49, 0x40c00000, v51 -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v50, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v50 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v51, 0x40c00000, v23 :: v_dual_add_f32 v14, 0x40c00000, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v50, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v48, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v49, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v98, 0x40c00000, v98 -; GFX11-TRUE16-NEXT: v_bfe_u32 v101, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v39, v48, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v48 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-TRUE16-NEXT: v_add3_u32 v101, v101, v14, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_bfe_u32 v102, v98, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v98 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v38, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v38, v50, v49, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v49 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v52 -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v51, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-TRUE16-NEXT: v_add3_u32 v102, v102, v98, 0x7fff 
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v50, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v48, v51, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v51 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v24 :: v_dual_lshlrev_b32 v25, 16, v25 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v39, v48, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v49, v50, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v50 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v53 -; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v52, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v51, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v48, v49, v52, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v52 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v54 :: v_dual_add_f32 v53, 0x40c00000, v25 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v25, v48, v49 :: v_dual_and_b32 v26, 0xffff0000, v27 -; GFX11-TRUE16-NEXT: v_add3_u32 v48, v50, v51, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v51 -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v53, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-TRUE16-NEXT: v_bfe_u32 v51, v52, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 
v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX11-TRUE16-NEXT: .LBB101_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v49, v50, v53, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v53 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v53, v53 -; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v54, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v50, v51, v52, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v52 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v49.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v50, v51, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v50, v53, v54, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v54 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v55, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v54, 0x40c00000, v64 :: v_dual_add_f32 v53, 0x40c00000, v27 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v50, v51, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v55, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v55 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v53, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v54, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v51, v52, v53, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v53 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v51, v52, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v51, v55, v54, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 
0x400000, v54 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v65 -; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v64, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v55, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v52, v53, v64, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v64 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v65, 0x40c00000, v29 :: v_dual_lshlrev_b32 v30, 16, v30 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v52, v53, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v52, v54, v55, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v55 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v66 -; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v65, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v64, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v53, v54, v65, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v65 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v65, 0x40c00000, v67 :: v_dual_add_f32 v66, 0x40c00000, v30 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v30, v53, v54 :: v_dual_and_b32 v31, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v53, v55, v64, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v54, 0x400000, v64 -; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v66, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 
-; GFX11-TRUE16-NEXT: v_bfe_u32 v64, v65, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v3, 16, v17 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v5, v8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v54, v55, v66, 
0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v66 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v67, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v55, v64, v65, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v65 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v54.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v55, v64, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v55, v66, v67, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v67 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v68, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v67, 0x40c00000, v69 :: v_dual_add_f32 v66, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v55, v64, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v68, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v68 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 -; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v66, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v68, v67, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v69, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v64, v65, v66, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v66 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v70, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v67 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v65, v68, v67, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v68, v69, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 -; GFX11-TRUE16-NEXT: v_bfe_u32 v67, v70, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v11, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v64.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v65, v66, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v65, v68, v69, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v69 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v69, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v68, 0x40c00000, v68 :: v_dual_cndmask_b32 v65, v65, v66 -; GFX11-TRUE16-NEXT: v_add3_u32 v66, v67, v70, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v70 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 -; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v68, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v65.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v66, v67, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v67, v69, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, 
v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v65, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v28.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v67, v69, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v67, v70, v68, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v68 -; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v68, v70, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v67, v69, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v71, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v66 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v27.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v65, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v25.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v68, v69, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v68, v70, v71, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v71 -; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 -; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v80, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 
v3.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v49, v48 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v68, v69, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v69, v70, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v4, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v8, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v5 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v67 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v66, v26 -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v48, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v69, v70 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v69, v71, v80, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v80 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 -; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v81, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_and_b32 
v35, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, v4, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v70, v71, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v68 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v0.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v64, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v30.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v70, v71, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v70, v80, v81, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v81 -; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 -; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v82, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v29.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v55, v52 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v23.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v71, v80, v6, 0x7fff -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v80, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v36, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v64, v51 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v24.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_nc_u32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v2 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_and_b32 v7, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v5 :: v_dual_add_nc_u32 v5, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_lshlrev_b32 v2, 16, v25 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v5, v7 :: v_dual_add_nc_u32 v1, v3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_nc_u32 v3, v3, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v39, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v6 :: v_dual_and_b32 v5, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v71, v80 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_add3_u32 v71, v81, v82, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v82 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 -; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v83, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v54, v53 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v71, v80, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add3_u32 v80, v81, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v22.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v50, v39 -; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v51, v38 -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v52, v37 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v80, v82, v83, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, 0x400000, v83 -; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 -; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v84, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v19.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v80, v81, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v81, v82, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v18.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v7, v71 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v6, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v4, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v3, 16, v28 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v4, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v8, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_add_f32 v1, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v5 :: v_dual_add_f32 v3, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v7, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v81, v82 :: v_dual_add_f32 v9, 0x40c00000, v9 -; GFX11-TRUE16-NEXT: v_add3_u32 v81, v83, v84, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v84 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 -; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v85, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v9, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v5, v69 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v81, v82, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_add3_u32 v82, v83, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v9 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v80 -; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v68, v31 -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v53, v21 -; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v36, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v82, v83, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v82, v84, v85, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v83, 0x400000, v85 -; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v10, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 -; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v86, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v38, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v83, v84, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v10 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v9, v81 -; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v39, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6 +; GFX11-TRUE16-NEXT: 
v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v4, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_add_nc_u32 v5, 0x7fff, v7 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v10, v83, v84 :: v_dual_add_f32 v11, 0x40c00000, v11 -; GFX11-TRUE16-NEXT: v_add3_u32 v83, v85, v86, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v86 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 -; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v87, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v11, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v10.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v83, v84, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v84, v85, v11, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v11 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v10, v82 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v84, v85, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v84, v86, v87, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v87 -; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v12, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 -; GFX11-TRUE16-NEXT: v_bfe_u32 v87, v96, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v11.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v84, v85, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v85, v86, v12, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v11, v83 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v12, v85, v86 :: v_dual_add_f32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_add3_u32 v85, v87, v96, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v12.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v101, v112 :: v_dual_add_f32 v87, 0x40c00000, v97 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v86, v86, v13, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v13 -; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v87, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v98, v102, v114 :: v_dual_add_f32 v15, 0x40c00000, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v103, 0x400000, v87 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 -; GFX11-TRUE16-NEXT: v_add3_u32 v99, v99, v87, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v97, 0x400000, v96 -; GFX11-TRUE16-NEXT: v_bfe_u32 v113, v15, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v98.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v99, v103, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v101, v113, v15, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v14.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v12, v84 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v101, v112, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v87 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v98, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v86, v100, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v13.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v85, v97, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v85 -; GFX11-TRUE16-NEXT: .LBB49_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v64f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v32 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v69, 0x40c00000, v69 :: v_dual_lshlrev_b32 v70, 16, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v10 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v33, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v87, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v18 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v87, 0x40c00000, v87 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v2, 0x40c00000, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_add3_u32 v16, v16, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v11 -; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v17, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_add3_u32 v33, v34, v36, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v37, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 
0x400000, v35 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v18 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v37, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v33, v17, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v34, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v37, v38, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v36, v39, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v19 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v20 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v38, v37, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v38, v48, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v48, v48 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v49, 0x40c00000, v20 :: v_dual_add_f32 v48, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v36, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v38, v39, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v35, v19, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v38, v49, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v49, 0x40c00000, v22 :: v_dual_lshlrev_b32 v22, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v37, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v39, v48, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v22 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v37, v21, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v39, v50, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v23 -; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v38, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v48, v49, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v51, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v39, v48, v51, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v23 :: v_dual_add_f32 v51, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v39, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v39, v49, v50, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v51, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v39, v23, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v49, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v25 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v48, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v50, v51, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v51 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v53, 
16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v50, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v53 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v54, 0x40c00000, v25 :: v_dual_add_f32 v53, 0x40c00000, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v49, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v51, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v53, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v26 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v48, v24, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v54, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v49, v25, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v51, v54, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v54 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v27 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v50, v51, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v52, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v53 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v55, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v54, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 
0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v52, v55, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v55 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v64, 0x40c00000, v27 :: v_dual_add_f32 v55, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v51, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v53, v54, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v54 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v55, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v65, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v50, v26, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v64, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v51, v27, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v53, v64, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v64 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v29 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v52, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v54, v55, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v55 -; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v65, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v64, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v53, v54, v65, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v65 -; GFX11-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v66, 0x40c00000, v29 :: v_dual_add_f32 v65, 0x40c00000, v30 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v53, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v53, v55, v64, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v64 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 -; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v65, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v52, v28, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v66, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v53, v29, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v54, v55, v66, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v66 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v66, 0x40c00000, v31 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v54, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v54, v64, v65, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v65 -; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v67, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v66, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v55, v64, v67, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v67 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v31 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 
v67, 16, v1 -; GFX11-FAKE16-NEXT: v_perm_b32 v30, v54, v30, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v55, v64, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v67 -; GFX11-FAKE16-NEXT: v_add3_u32 v55, v65, v66, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v66 -; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v68, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v5, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v9, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v9 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 
0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v7, v8 :: v_dual_add_nc_u32 v7, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v11, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v8, v9 :: v_dual_add_nc_u32 v8, v10, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v12, v13 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, 
v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v12, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v82, v82, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v83, v12, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v68, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v82, v83, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v82, v83, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 16, v13 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v83 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v82, v83 +; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v13 +; GFX11-TRUE16-NEXT: v_bfe_u32 v86, v82, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; 
GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v14, v85 :: v_dual_add_nc_u32 v15, 0x7fff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v86, v86, v82 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v83, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v83 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v15, v85, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v86 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v86, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s26, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v82 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v86 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v96, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v97, v82, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v85, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v86, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v96, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v98, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v97, v97, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v103, 0x400000, v82 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v99, v99, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v85, v85, v86 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97 +; GFX11-TRUE16-NEXT: v_bfe_u32 v101, v98, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v82, v97, v103 :: v_dual_add_nc_u32 v85, 0x7fff, v85 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, 
v96 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v101, v101, v98 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v83 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v99, v112, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v101 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v101, 0x400000, v98 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v96 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v85, v102, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v65, 16, v67 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v86, v97, v101 :: v_dual_and_b32 v65, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v85 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff, v80 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v86 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v14, v100, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff, v96 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v68 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v83, 16, v82 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v85, 16, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v87, 16, v83 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v9, 16, v86 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v97, 16, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v81, 16, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v84, 16, v85 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v69, 16, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v64, 16, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v70, 16, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v55, 16, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v50, 16, v65 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v52, 16, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v38, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v71, 16, v80 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v83 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v70 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v53, 16, v54 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v66 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v48, 16, v49 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v52 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v34, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v17, 16, v38 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB101_3: +; 
GFX11-TRUE16-NEXT: s_branch .LBB101_2 +; GFX11-TRUE16-NEXT: .LBB101_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v64f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX11-FAKE16-NEXT: .LBB101_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 
0xffff0000, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v64, v65, v68, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v68 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 -; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v67, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v31, v55, v31, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v65, v66, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v3, 16, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v65, v66 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v65, v68, v67, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v67 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v64, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v65, v66, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v5, v8 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v66, v68, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v69, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v66, v67, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v66, v68, v69, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v69 -; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v65, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v66, v67, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v68, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v2 
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v67, v68 :: v_dual_and_b32 v3, 0xffff0000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v66, 0x7060302 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v11, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v3, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v4, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v69, v70, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v70 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v69, v70, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v69, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v2, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v8, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 
v1, v1, v2 :: v_dual_add_nc_u32 v2, v4, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_nc_u32 v0, v1, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_and_b32 v7, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v5 :: v_dual_add_nc_u32 v5, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_lshlrev_b32 v2, 16, v25 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v5, v7 :: v_dual_add_nc_u32 v1, v3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 
16, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_nc_u32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v3, v6 :: v_dual_and_b32 v5, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v7, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v2, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v4, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 
0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v3, 16, v28 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_cndmask_b32 v0, v0, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_add_nc_u32 v0, v4, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v5, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v5, 0x40c00000, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v68, v69, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v68, v69 :: v_dual_and_b32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v8, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v2, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v67, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v70, v71, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 0x400000, v71 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v68, v70, v71, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v70, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v68, v68, v69, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v69, v70, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v69, v70 :: v_dual_and_b32 v5, 
0xffff0000, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v68, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v80, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v80 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 -; GFX11-FAKE16-NEXT: v_add3_u32 v69, v71, v80, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v71, v5, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v70, v71, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v70, v71 :: v_dual_and_b32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v69, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v81, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v81 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 -; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v70, v80, v81, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v80, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v71, v80, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_add_f32 v1, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v4 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v3, v5 :: v_dual_add_f32 v3, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6 +; GFX11-FAKE16-NEXT: 
v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v4, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v71, v80 :: v_dual_and_b32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v70, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v81, v82, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v82 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 -; GFX11-FAKE16-NEXT: v_add3_u32 v71, v81, v82, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v81, v7, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 
v71, v71, v80, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v80, v81, v7, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v7 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_add_nc_u32 v5, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v8, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v80, v81 :: v_dual_and_b32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v71, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v83, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v81, 0x400000, v83 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v80, v82, v83, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v80, v81, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v81, v82, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v81, v82 :: v_dual_and_b32 v9, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v8, v80, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v84, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v84 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 -; GFX11-FAKE16-NEXT: v_add3_u32 v81, v83, v84, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v9, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v81, v82, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v82, v83, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, 0x400000, v9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s15, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v6 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v82, v83 :: v_dual_and_b32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_add_f32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v9, v81, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v84, v85, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, 0x400000, v85 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v82, v84, v85, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v84, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v83, v84, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v83, v84 :: v_dual_and_b32 v11, 0xffff0000, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v86, 0x40c00000, v86 :: 
v_dual_add_f32 v11, 0x40c00000, v11 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v10, v82, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v86, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v86 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_add3_u32 v83, v85, v86, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v11, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v83, v83, v84 :: v_dual_lshlrev_b32 v84, 16, v13 -; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v87, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v97, 0x400000, v87 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v84, 0x40c00000, v84 -; GFX11-FAKE16-NEXT: v_add3_u32 v86, v86, v87, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_bfe_u32 v98, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v84, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v85, v85, v11, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v86, v97, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v97, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_add3_u32 v87, v98, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v98, v99, v84, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 16, v14 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v84 -; GFX11-FAKE16-NEXT: v_bfe_u32 v101, v13, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v87, v97 :: v_dual_add_f32 v87, 0x40c00000, v99 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v97, v101, v13, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v12, v86, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v87, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v84, v98, v100, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v101, 0x400000, v87 -; GFX11-FAKE16-NEXT: v_bfe_u32 v102, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v99, v99, v87, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 
v10, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v9 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v98, 0x40c00000, v98 :: v_dual_cndmask_b32 v87, v99, v101 -; GFX11-FAKE16-NEXT: v_add3_u32 v101, v102, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_bfe_u32 v103, v98, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v98 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v7, v8 :: v_dual_add_nc_u32 v7, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v11, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v101, v102 :: v_dual_add_f32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_add3_u32 v103, v103, v98, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v98, v98 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v14, v87, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v98, v103, v112, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add3_u32 v99, v99, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v99, v113, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v8, v9 :: v_dual_add_nc_u32 v8, v10, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, 
v11, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s20, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v98, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v97, v100, vcc_lo +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v12 +; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s22, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s23, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v12, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v82, v82, v14 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v13, v84, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v83, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB49_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_bfe_u32 v83, v12, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v68, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v82, v83, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s24, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v82, v83, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v83 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v82, v83 +; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s25, 16 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v82, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v83, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v85 :: v_dual_add_nc_u32 v15, 0x7fff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v86, v86, v82 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v83, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v83 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v85, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v86 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v86, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s26, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, 0x400000, v82 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v82, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v86 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v96, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v97, v82, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v85, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v85, v86, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v99, v96, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v98, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, v97, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v103, 0x400000, v82 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, v99, v96 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v85, v85, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97 +; 
GFX11-FAKE16-NEXT: v_bfe_u32 v101, v98, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v96 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v82, v97, v103 :: v_dual_add_nc_u32 v85, 0x7fff, v85 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v101, v101, v98 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v14, v83 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v99, v112, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v101 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v101, 0x400000, v98 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v96 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v85, v102, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v65, 16, v67 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v86, v97, v101 :: v_dual_and_b32 v65, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v85 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff, v80 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v86 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v100, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff, v96 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v1, 16, v68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v83, 16, v82 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v85, 16, v86 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v83, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v87, 16, v83 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v9, 16, v86 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v97, 16, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v81, 16, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v81, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v83, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v84, 16, v85 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v69, 16, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v64, 16, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v70, 16, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v55, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v50, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v52, 16, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v38, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v71, 16, v80 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v83 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v53, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v49 
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v17, 16, v38 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB101_3: +; GFX11-FAKE16-NEXT: s_branch .LBB101_2 +; GFX11-FAKE16-NEXT: .LBB101_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -108746,1280 +233235,1299 @@ end: } define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 
offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], 
s32 offset:40 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 
-; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v31 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v62, v62 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; 
GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; 
kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded 
Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 16, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; GCN-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; 
implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: .LBB50_2: ; %Flow -; GCN-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v21 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v3, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v63 -; GCN-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; GCN-NEXT: v_add_f32_e32 v61, 0x38000000, v61 -; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; GCN-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; GCN-NEXT: v_add_f32_e32 
v58, 0x38000000, v58 -; GCN-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: 
v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, 
v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded 
Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v19 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v12 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; GCN-NEXT: buffer_store_dword v6, 
off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v18 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; GCN-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: .LBB50_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, 
s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v45, v1, v2, 16 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v44, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; GCN-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v47, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v2, v1, v2, 16 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 20, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v3, v1, v3, 16 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v5, v1, v5, 16 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v7, v1, v7, 16 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v9, v1, v9, 16 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v11, v1, v11, 16 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v13, v1, v13, 16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; 
GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v15, v1, v15, 16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v17, v1, v17, 16 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v19, v1, v19, 16 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v21, v1, v21, 16 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v23, v1, v23, 16 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v25, v1, v25, 16 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v27, v1, v27, 16 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v29, v1, v29, 16 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v31, v1, v31, 16 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v33, v1, v33, 16 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v35, v1, v35, 16 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v37, v1, v37, 16 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v39, v1, v39, 16 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v49, v1, v49, 16 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x64, v0 -; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v51, v1, v51, 16 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v53, v1, v53, 16 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v55, v1, v55, 16 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v41, v1, v41, 16 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v45, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v0, 
s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64f16_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: 
buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; 
SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: 
v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; 
implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: 
; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; 
implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB102_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_4 +; SI-NEXT: ; %bb.3: ; 
%cmp.true +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v63 +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v60 +; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v61 +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v47 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v57 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v56 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v42 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v44 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v45 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v40 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v41 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: 
v_cvt_f32_f16_e32 v53, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v53 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v38 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:452 ; 
4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, 
v3 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v3, v24 +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: .LBB102_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: 
v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, 
v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, 
s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64bf16: ; VI: ; %bb.0: @@ -110031,7 +234539,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: s_cbranch_execz .LBB102_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 0x200 ; VI-NEXT: v_add_f16_e32 v33, 0x200, v15 @@ -110131,7 +234639,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 ; VI-NEXT: v_or_b32_e32 v17, v33, v17 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -110146,7 +234654,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: s_cbranch_execz .LBB102_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -110182,7 +234690,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 
op_sel_hi:[1,0] -; GFX9-NEXT: .LBB50_2: ; %end +; GFX9-NEXT: .LBB102_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -110199,7 +234707,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -110234,7 +234742,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB50_2: ; %end +; GFX11-NEXT: .LBB102_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -110255,1184 +234763,2661 @@ end: ret <64 x bfloat> %phi } +define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 +; SI-NEXT: 
buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_mov_b32_e32 v46, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s24 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v38 +; SI-NEXT: s_waitcnt vmcnt(13) +; 
SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v48 +; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB103_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v50, 
v19 +; SI-NEXT: v_mov_b32_e32 v51, v22 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_mov_b32_e32 v37, v45 +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_mov_b32_e32 v49, v47 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v58, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v44 +; SI-NEXT: v_mov_b32_e32 v44, v18 +; SI-NEXT: v_mov_b32_e32 v5, v43 +; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: s_branch .LBB103_3 +; SI-NEXT: .LBB103_2: +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: 
$vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: v_mov_b32_e32 v49, v47 +; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: 
v_mov_b32_e32 v37, v45 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v51, v22 +; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: v_mov_b32_e32 v5, v6 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: .LBB103_3: ; %Flow +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v36, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v43, v9 +; SI-NEXT: v_mov_b32_e32 v12, v31 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v31, v11 +; SI-NEXT: v_mov_b32_e32 v9, v17 +; SI-NEXT: s_cbranch_vccnz .LBB103_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v15 +; SI-NEXT: v_mov_b32_e32 v6, v37 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; 
SI-NEXT: v_cvt_f32_f16_e32 v28, v50 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v18 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_add_f32_e32 v17, 
0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: 
v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload 
+; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, 
v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: 
v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v46 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: 
v_lshlrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_mov_b32_e32 v3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; SI-NEXT: 
.LBB103_5: ; %end +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, 
v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: 
v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB103_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB103_3 +; VI-NEXT: .LBB103_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v18, 0x200 +; VI-NEXT: 
v_add_f16_e32 v33, 0x200, v15 +; VI-NEXT: v_add_f16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v33, v15 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v14 +; VI-NEXT: v_add_f16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v33, v14 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v13 +; VI-NEXT: v_add_f16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v33, v13 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v12 +; VI-NEXT: v_add_f16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v33, v12 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v11 +; VI-NEXT: v_add_f16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v33, v11 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v10 +; VI-NEXT: v_add_f16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v33, v10 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v33, v9 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v33, v8 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v33, v7 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v33, v6 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v33, v5 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v33, v4 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v33, v3 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v33, v2 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v33, v1 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v33, v0 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v31 +; VI-NEXT: v_add_f16_sdwa v31, v31, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v33, v31 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v30 +; VI-NEXT: v_add_f16_sdwa v30, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v33, v30 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v29 +; VI-NEXT: v_add_f16_sdwa v29, v29, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v33, v29 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v28 +; VI-NEXT: v_add_f16_sdwa v28, v28, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v33, v28 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v27 +; VI-NEXT: v_add_f16_sdwa v27, v27, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v33, v27 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v26 +; VI-NEXT: 
v_add_f16_sdwa v26, v26, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v33, v26 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v25 +; VI-NEXT: v_add_f16_sdwa v25, v25, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v33, v25 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v24 +; VI-NEXT: v_add_f16_sdwa v24, v24, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v33, v24 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v23 +; VI-NEXT: v_add_f16_sdwa v23, v23, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v33, v23 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v22 +; VI-NEXT: v_add_f16_sdwa v22, v22, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v33, v22 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v21 +; VI-NEXT: v_add_f16_sdwa v21, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v33, v21 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v20 +; VI-NEXT: v_add_f16_sdwa v20, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v33, v20 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v19 +; VI-NEXT: v_add_f16_sdwa v19, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v33, v19 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v32 +; VI-NEXT: v_add_f16_sdwa v32, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v32, v33, v32 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v18, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v33, v17 +; 
VI-NEXT: v_or_b32_e32 v16, v16, v18 +; VI-NEXT: .LBB103_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB103_4: +; VI-NEXT: s_branch .LBB103_2 +; +; GFX9-LABEL: bitcast_v64f16_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB103_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB103_3 +; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 
v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, v31, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB103_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: s_branch .LBB103_2 +; +; GFX11-LABEL: bitcast_v64f16_to_v64bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; 
GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-NEXT: .LBB103_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: 
v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB103_3: +; GFX11-NEXT: s_branch .LBB103_2 +; GFX11-NEXT: .LBB103_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v64bf16_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: 
buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; 
GCN-NEXT: v_mul_f32_e32 v10, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v27 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v28 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v30 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword 
v12, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v55 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v38 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v59 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v58 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v57 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v56 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v46 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v45 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v43 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_mul_f32_e32 v5, 1.0, v42 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v54 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v39 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v5 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v26 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v11 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: 
killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr53 -; 
GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v36 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v35 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v34 -; 
GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v33 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v32 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v31 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded 
Spill -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v59 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: 
; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; 
GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; kill: killed $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: .LBB51_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v35 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v35 -; GCN-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v33 -; GCN-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v31 -; 
GCN-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GCN-NEXT: v_alignbit_b32 v10, v26, v10, 16 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GCN-NEXT: v_alignbit_b32 v8, v10, v8, 16 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; GCN-NEXT: v_alignbit_b32 v4, v8, v4, 16 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v61 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_alignbit_b32 v4, v4, v8, 16 -; 
GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v25 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_alignbit_b32 v8, v8, v10, 16 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v24 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GCN-NEXT: v_alignbit_b32 v10, v10, v24, 16 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v10 -; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v37 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; 
GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v15 -; GCN-NEXT: v_alignbit_b32 v18, v21, v18, 16 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v60 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v43 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v34 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v37 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v51 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v53 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v55 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v40 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 
0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v32 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v34 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v36 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v38 -; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v48 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v49 -; GCN-NEXT: v_add_f32_e32 v39, 0x40c00000, v50 -; GCN-NEXT: v_add_f32_e32 v56, 0x40c00000, v51 -; GCN-NEXT: v_add_f32_e32 v48, 0x40c00000, v52 -; GCN-NEXT: v_add_f32_e32 v57, 0x40c00000, v53 -; GCN-NEXT: v_add_f32_e32 v49, 0x40c00000, v54 -; GCN-NEXT: v_add_f32_e32 v59, 0x40c00000, v55 -; GCN-NEXT: v_add_f32_e32 v50, 0x40c00000, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v27 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; 
GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v28 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v30 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v37 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v48 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v49 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v50 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v35 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v39, v51, v12, 16 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v49, v53, v7, 16 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v50, v54, v16, 16 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_alignbit_b32 v51, v21, v18, 16 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_alignbit_b32 v53, v23, v22, 16 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_alignbit_b32 v54, v25, v24, 16 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v55, v26, v41, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v40 -; GCN-NEXT: v_alignbit_b32 v40, v26, v42, 16 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v42, v44, v43, 16 
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_alignbit_b32 v44, v45, v32, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v45, v27, v34, 16 -; GCN-NEXT: v_alignbit_b32 v46, v28, v36, 16 -; GCN-NEXT: v_alignbit_b32 v47, v29, v38, 16 -; GCN-NEXT: v_alignbit_b32 v56, v30, v56, 16 -; GCN-NEXT: v_alignbit_b32 v58, v48, v57, 16 -; GCN-NEXT: v_alignbit_b32 v62, v52, v59, 16 -; GCN-NEXT: v_alignbit_b32 v7, v62, v20, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v58, v19, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v56, v17, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v47, v9, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v6, v46, v6, 16 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v3, v45, v3, 16 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v1, v44, v1, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v42, v2, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v40, v4, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v55, v8, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v10, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v53, v15, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v14, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v50, v13, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v49, v5, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v11, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: .LBB51_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v59, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; GCN-NEXT: v_or_b32_e32 v57, v1, v2 -; GCN-NEXT: v_add_i32_e32 v63, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v58, v1, v2 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GCN-NEXT: v_or_b32_e32 v56, v1, v2 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v28 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; 
GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; GCN-NEXT: v_or_b32_e32 v10, v10, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v44 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v42 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v26 -; GCN-NEXT: v_or_b32_e32 v22, v22, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v55 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v28, v28, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v54 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v30, v30, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v32, v32, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v53 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v34, v34, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v36, v36, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff, v51 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v38, v38, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v51, v51, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v50, v50, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v54, v54, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v49 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v49, v49, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v41, v41, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v39, v39, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v59, v63, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64bf16_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, 
s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mul_f32_e32 
v33, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: 
v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v46 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v59 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr27 
+; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v47 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v62 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v63 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v45 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 
+; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 
; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, 
v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v44 +; SI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; 
implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v31 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v40 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v55 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v54 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v51 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:416 ; 4-byte 
Folded Reload +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: .LBB104_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v33 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v55 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v32 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v51 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v54 +; SI-NEXT: 
v_add_f32_e32 v51, 0x40c00000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v51 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_alignbit_b32 v15, v26, v15, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_alignbit_b32 v15, v19, v15, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_alignbit_b32 v15, v18, v15, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_alignbit_b32 v15, v17, v15, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v10, 
0x40c00000, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_alignbit_b32 v10, v15, v10, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v6, v8, v6, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_alignbit_b32 v3, v8, v3, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v28 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, 
v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_alignbit_b32 v3, v10, v3, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v2, v4, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v19, v10, v4, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v3, v15, v3, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v21, v10, v8, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; 
SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v22, v10, v8, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v10, v8, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v3, v16, v3, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v23 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v16, v3, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v5, v20, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v22, v1, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 
offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v23, v10, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v7, v23, v7, 16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_alignbit_b32 v24, v45, v8, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v8, v24, v8, 16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v15, 
0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v41, v42, v15, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 +; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16 +; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_alignbit_b32 v26, v34, v9, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v9, v26, v9, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v35, v36, v10, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; 
SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_alignbit_b32 v37, v38, v11, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v11, v37, v11, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v39, v48, v12, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v12, v39, v12, 16 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v49, v50, v13, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v13, v49, v13, 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v52, v53, v14, 16 +; SI-NEXT: v_and_b32_e32 
v14, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v14, v52, v14, 16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v21, v6, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v19, v4, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v18, v2, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v27, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: .LBB104_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64i16: ; VI: ; %bb.0: @@ -111460,7 +237445,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: s_cbranch_execz .LBB104_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -112046,7 +238031,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: .LBB104_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -112093,7 +238078,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: s_cbranch_execz .LBB104_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v32, 
0x40c00000, v32 @@ -112583,7 +238568,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v18, v18, v34, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v16, v32, v16, s6 -; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: .LBB104_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -112616,7 +238601,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v16 @@ -113182,7 +239167,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v37, 16, v38 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v39, 16, v16 -; GFX11-TRUE16-NEXT: .LBB51_2: ; %end +; GFX11-TRUE16-NEXT: .LBB104_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -113199,7 +239184,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v16 @@ -113709,7 +239694,7 @@ define 
<64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v83, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB51_2: ; %end +; GFX11-FAKE16-NEXT: .LBB104_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -113730,865 +239715,3837 @@ end: ret <64 x i16> %phi } +define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64bf16_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 +; SI-NEXT: buffer_store_dword 
v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: 
v_mul_f32_e32 v48, 1.0, v50 +; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v53 +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v42 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB105_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; SI-NEXT: buffer_store_dword v7, 
off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v42, v62 +; SI-NEXT: v_mov_b32_e32 v43, v63 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_mov_b32_e32 v25, v60 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v40, v20 +; SI-NEXT: v_mov_b32_e32 v51, v61 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_mov_b32_e32 v29, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_mov_b32_e32 v24, v56 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_mov_b32_e32 v52, v10 +; SI-NEXT: v_mov_b32_e32 v53, v59 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_lshrrev_b32_e32 v45, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_mov_b32_e32 v62, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v41, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v48 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; SI-NEXT: s_branch .LBB105_3 +; SI-NEXT: .LBB105_2: +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_mov_b32_e32 v51, v61 +; SI-NEXT: v_mov_b32_e32 v42, v62 +; SI-NEXT: v_mov_b32_e32 v29, v31 +; SI-NEXT: v_mov_b32_e32 v25, v60 +; SI-NEXT: v_mov_b32_e32 v24, v56 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v40, v20 +; SI-NEXT: v_mov_b32_e32 v43, v63 +; SI-NEXT: v_mov_b32_e32 v52, v10 +; SI-NEXT: v_mov_b32_e32 v53, v59 +; SI-NEXT: v_mov_b32_e32 v39, v4 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; 
SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: .LBB105_3: ; %Flow +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB105_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_alignbit_b32 v1, 
v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v1, 
0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_alignbit_b32 v1, v11, v1, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_alignbit_b32 v1, v14, v1, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; SI-NEXT: v_alignbit_b32 v51, v16, v1, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 +; SI-NEXT: 
v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v18, v20, v1, 16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_alignbit_b32 v1, v22, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v1, v23, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v1, v26, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v29 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v33 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 +; SI-NEXT: 
buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v1, v27, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v1, v28, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v52, v30, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v36, v35, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v32 +; SI-NEXT: v_alignbit_b32 v48, v49, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v28, v59, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v26, v28, v26, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_alignbit_b32 v46, v61, v31, 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v21, v30, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: 
v_lshrrev_b32_e32 v62, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v23, v10, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v63, v23, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, v21, v12, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v57, v58, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v17, v1, v20, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 +; SI-NEXT: v_alignbit_b32 v56, v47, v20, 16 +; SI-NEXT: v_alignbit_b32 v20, v62, v11, 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v16, v56, v16, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v22, v45, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v29 +; SI-NEXT: v_alignbit_b32 v13, v60, v25, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; SI-NEXT: v_alignbit_b32 v24, v44, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v9, v11, v9, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v15 +; SI-NEXT: v_mov_b32_e32 v15, v24 +; SI-NEXT: v_add_f32_e32 v50, 
0x40c00000, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v3, v3, v39, 16 +; SI-NEXT: v_alignbit_b32 v4, v3, v4, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v9, v5, 16 +; SI-NEXT: v_alignbit_b32 v5, v36, v7, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, v46, v33, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v24, v38, 16 +; SI-NEXT: v_alignbit_b32 v38, v48, v8, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v4, v22, v37, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v57, v32, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v4, v20, v34, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v52 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v13, v14, 16 +; SI-NEXT: v_mov_b32_e32 v14, v51 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 
0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, 
v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; 
VI-LABEL: bitcast_v64bf16_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v42, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_writelane_b32 v42, s31, 1 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB105_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB105_4 +; VI-NEXT: .LBB105_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, 
vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 
v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v18, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v18, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: 
v_or_b32_e32 v33, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v18, v33, vcc +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; VI-NEXT: v_add_f32_e32 v18, s4, v0 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: 
v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; VI-NEXT: v_bfe_u32 v35, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v32 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, 
v19 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x7fff, v37 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x7fff, v38 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_add_f32_e32 
v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x7fff, v39 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x7fff, v48 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x7fff, v49 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 +; 
VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x7fff, v50 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 +; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x7fff, v51 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x7fff, v52 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; VI-NEXT: v_add_f32_e32 v52, 
0x40c00000, v52 +; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x7fff, v53 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x7fff, v54 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 +; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x7fff, v55 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 +; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_add_u32_e32 v40, vcc, 0x7fff, v40 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 +; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 +; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 +; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 +; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 +; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 +; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 +; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 +; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 +; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 +; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 +; VI-NEXT: 
v_alignbit_b32 v20, v20, v36, 16 +; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 +; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 +; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 +; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: s_branch .LBB105_5 +; VI-NEXT: .LBB105_3: +; VI-NEXT: s_branch .LBB105_2 +; VI-NEXT: .LBB105_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB105_5: ; %end +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: v_readlane_b32 s31, v42, 1 +; VI-NEXT: v_readlane_b32 s30, v42, 0 +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v43, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_writelane_b32 v43, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 
+; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB105_4 +; GFX9-NEXT: .LBB105_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 
v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v18, 0xffff0000 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_and_or_b32 v14, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_and_or_b32 v15, v3, v18, v4 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v13, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; 
GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v12, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v11, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v10, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 
+; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v9, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v8, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v7, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 
0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v6, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v5, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v4, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v33, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v3, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v33, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v2 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v2, v1, v18, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v33, v33, v1 +; GFX9-NEXT: v_add_u32_e32 v33, 0x7fff, v33 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_add_f32_e32 
v33, s4, v0 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v1, v1, v18, v33 +; GFX9-NEXT: v_add_f32_e32 v33, s4, v0 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v0 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v34, v35, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v33, v18, v0 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v34, v34, v16 +; GFX9-NEXT: v_add_u32_e32 v34, 0x7fff, v34 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX9-NEXT: v_bfe_u32 
v35, v34, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v35, v35, v34 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; GFX9-NEXT: v_bfe_u32 v35, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v35, v35, v17 +; GFX9-NEXT: v_add_u32_e32 v35, 0x7fff, v35 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v35, v36, vcc +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v36, v36, v35 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; GFX9-NEXT: v_bfe_u32 v36, v32, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v36, v36, v32 +; GFX9-NEXT: v_add_u32_e32 v36, 0x7fff, v36 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v36, v37, vcc +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v37, v37, v36 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; GFX9-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v37, v37, v19 +; GFX9-NEXT: v_add_u32_e32 v37, 0x7fff, v37 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 
vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v37, v38, vcc +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v38, v38, v37 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; GFX9-NEXT: v_bfe_u32 v38, v20, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v38, v38, v20 +; GFX9-NEXT: v_add_u32_e32 v38, 0x7fff, v38 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v38, v39, vcc +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v39, v39, v38 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; GFX9-NEXT: v_bfe_u32 v39, v21, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v39, v39, v21 +; GFX9-NEXT: v_add_u32_e32 v39, 0x7fff, v39 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v39, v48, vcc +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v48, v48, v39 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; GFX9-NEXT: 
v_bfe_u32 v48, v22, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v48, v48, v22 +; GFX9-NEXT: v_add_u32_e32 v48, 0x7fff, v48 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v49, v49, v48 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; GFX9-NEXT: v_bfe_u32 v49, v23, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v49, v49, v23 +; GFX9-NEXT: v_add_u32_e32 v49, 0x7fff, v49 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v49, v50, vcc +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v50, v50, v49 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; GFX9-NEXT: v_bfe_u32 v50, v24, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v50, v50, v24 +; GFX9-NEXT: v_add_u32_e32 v50, 0x7fff, v50 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v50, v51, vcc +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v51, v51, v50 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 +; GFX9-NEXT: 
v_or_b32_e32 v52, 0x400000, v50 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; GFX9-NEXT: v_bfe_u32 v51, v25, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v51, v51, v25 +; GFX9-NEXT: v_add_u32_e32 v51, 0x7fff, v51 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v51, v52, vcc +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v52, v52, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; GFX9-NEXT: v_bfe_u32 v52, v26, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v52, v52, v26 +; GFX9-NEXT: v_add_u32_e32 v52, 0x7fff, v52 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v52, v53, vcc +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v53, v53, v52 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; GFX9-NEXT: v_bfe_u32 v53, v27, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v53, v53, v27 +; GFX9-NEXT: v_add_u32_e32 v53, 0x7fff, v53 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v53, v54, vcc +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 
+; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v54, v54, v53 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; GFX9-NEXT: v_bfe_u32 v54, v28, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v54, v54, v28 +; GFX9-NEXT: v_add_u32_e32 v54, 0x7fff, v54 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v54, v55, vcc +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v55, v55, v54 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; GFX9-NEXT: v_bfe_u32 v55, v29, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v55, v55, v29 +; GFX9-NEXT: v_add_u32_e32 v55, 0x7fff, v55 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v55, v40, vcc +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v40, v40, v55 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; GFX9-NEXT: v_bfe_u32 v40, v30, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v40, v40, v30 +; GFX9-NEXT: v_add_u32_e32 v40, 0x7fff, v40 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v30 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v40, v41, vcc +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v41, v41, v40 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc +; GFX9-NEXT: v_bfe_u32 v41, v31, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v41, v41, v31 +; GFX9-NEXT: v_add_u32_e32 v41, 0x7fff, v41 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v41, v42, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_and_or_b32 v31, v40, v18, v31 +; GFX9-NEXT: v_and_or_b32 v30, v55, v18, v30 +; GFX9-NEXT: v_and_or_b32 v29, v54, v18, v29 +; GFX9-NEXT: v_and_or_b32 v28, v53, v18, v28 +; GFX9-NEXT: v_and_or_b32 v27, v52, v18, v27 +; GFX9-NEXT: v_and_or_b32 v26, v51, v18, v26 +; GFX9-NEXT: v_and_or_b32 v25, v50, v18, v25 +; GFX9-NEXT: v_and_or_b32 v24, v49, v18, v24 +; GFX9-NEXT: v_and_or_b32 v23, v48, v18, v23 +; GFX9-NEXT: v_and_or_b32 v22, v39, v18, v22 
+; GFX9-NEXT: v_and_or_b32 v21, v38, v18, v21 +; GFX9-NEXT: v_and_or_b32 v20, v37, v18, v20 +; GFX9-NEXT: v_and_or_b32 v19, v36, v18, v19 +; GFX9-NEXT: v_and_or_b32 v32, v35, v18, v32 +; GFX9-NEXT: v_and_or_b32 v17, v34, v18, v17 +; GFX9-NEXT: v_and_or_b32 v16, v33, v18, v16 +; GFX9-NEXT: s_branch .LBB105_5 +; GFX9-NEXT: .LBB105_3: +; GFX9-NEXT: s_branch .LBB105_2 +; GFX9-NEXT: .LBB105_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB105_5: ; %end +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: v_readlane_b32 s31, v43, 1 +; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64bf16_to_v64i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: 
v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-NEXT: .LBB105_2: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 +; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s24, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v0 +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v5, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v11, v3 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v6, v9, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v7, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v6 :: v_dual_add_nc_u32 v2, v2, v0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v18, v1, v5 :: v_dual_lshlrev_b32 v5, 16, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v2, v6 :: v_dual_add_nc_u32 v2, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v19, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v20, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v35, v0, v6, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v6, 16, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v0, v1, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v21, v0, v1, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7 +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v37, v0, v6 :: v_dual_add_nc_u32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v23, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: 
v_cndmask_b32_e32 v38, v0, v1, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v24, v1, v3 +; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v7 :: v_dual_add_f32 v3, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v39, v0, v6 :: v_dual_add_nc_u32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v25, v0, v1 :: v_dual_add_nc_u32 v0, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v48, v0, v1, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; GFX11-NEXT: v_cndmask_b32_e32 v49, v1, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 +; GFX11-NEXT: v_dual_cndmask_b32 v26, v0, v6 :: v_dual_add_nc_u32 v1, v1, v4 +; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5 +; GFX11-NEXT: v_dual_cndmask_b32 v27, v0, v1 :: v_dual_lshlrev_b32 v6, 16, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: 
v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v50, v0, v1, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v28, v1, v3 :: v_dual_add_f32 v3, 0x40c00000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v2, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v51, v0, v6, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v29, v0, v1 :: v_dual_lshlrev_b32 v6, 16, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v52, v0, v1 :: v_dual_add_nc_u32 v1, 0x7fff, v4 +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; 
GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v30, v1, v3 :: v_dual_add_f32 v3, 0x40c00000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v53, v0, v5 :: v_dual_add_f32 v0, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v3 +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v54, v1, v2 :: v_dual_add_nc_u32 v1, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v5, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v1, v2, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: 
v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v1 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v55, v2, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v6 +; GFX11-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_add_nc_u32 v1, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v3 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: s_lshl_b32 s0, s15, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, 
v4 +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v64, v5, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v10 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v65, v6, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v65 +; GFX11-NEXT: v_cndmask_b32_e32 v66, v4, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v67, v6, v7, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v4 +; GFX11-NEXT: v_bfe_u32 v8, v10, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_cndmask_b32_e32 v68, v5, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v10 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v11, 
0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v9, v5 +; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v69, v6, v7, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v10, v6 +; GFX11-NEXT: v_bfe_u32 v10, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v70, v7, v8, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v12 +; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_lshl_b32 s0, s20, 16 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; GFX11-NEXT: v_and_or_b32 v5, 0xffff0000, v69, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v10 +; 
GFX11-NEXT: v_add_nc_u32_e32 v9, v11, v7 +; GFX11-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v13 +; GFX11-NEXT: v_bfe_u32 v71, v11, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v80, v8, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v13 +; GFX11-NEXT: s_lshl_b32 s0, s21, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v15, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v9, v71, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; GFX11-NEXT: v_bfe_u32 v71, v12, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v8, v10 :: v_dual_add_nc_u32 v9, 0x7fff, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v10, v14, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s22, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v81, v9, v15, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v9, v71, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v14 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX11-NEXT: v_bfe_u32 v71, v11, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v14 +; GFX11-NEXT: v_bfe_u32 v83, v13, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v71, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v14, v83, v13 +; GFX11-NEXT: 
v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v82, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v71, 0x400000, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_lshl_b32 s0, s23, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v13 +; GFX11-NEXT: v_bfe_u32 v83, v15, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v84, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v71, v12, v71, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v83, v15 +; GFX11-NEXT: v_bfe_u32 v13, v84, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v83, 0x400000, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v14, v82, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_add_f32_e64 v82, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v84 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_bfe_u32 v85, v14, 16, 1 +; GFX11-NEXT: v_bfe_u32 v86, v82, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s25, 0xffff0000 +; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v82 +; GFX11-NEXT: v_dual_cndmask_b32 v83, v12, v83 :: v_dual_add_nc_u32 v12, 0x7fff, v13 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v84 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v85, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX11-NEXT: v_add_nc_u32_e32 v85, v86, v82 +; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v12, v13 :: v_dual_add_nc_u32 v15, 0x7fff, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v85, 0x7fff, v85 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s25, 16 +; GFX11-NEXT: v_and_or_b32 v6, 0xffff0000, v70, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v84, v15, v84, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 
+; GFX11-NEXT: v_add_f32_e64 v87, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v86, v13, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v82, v85, v96, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v85, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s26, 16 +; GFX11-NEXT: v_bfe_u32 v15, v87, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v96, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-NEXT: v_bfe_u32 v97, v85, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v98, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s27, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v99, v96, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v100, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v113, 0x400000, v96 +; GFX11-NEXT: v_bfe_u32 v101, v98, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 +; GFX11-NEXT: v_add_nc_u32_e32 v99, v99, v96 +; GFX11-NEXT: v_add_nc_u32_e32 v97, v97, v85 +; GFX11-NEXT: v_bfe_u32 v103, v100, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v101, v101, v98 +; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v98 +; GFX11-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v99 +; GFX11-NEXT: v_add_nc_u32_e32 v97, 0x7fff, v97 +; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v85 +; GFX11-NEXT: v_add_nc_u32_e32 v101, 0x7fff, v101 +; GFX11-NEXT: v_add_nc_u32_e32 v103, v103, v100 +; GFX11-NEXT: v_cndmask_b32_e32 v96, v99, v113, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v87 +; GFX11-NEXT: v_add_nc_u32_e32 v14, v86, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v99, 0x7fff, v103 +; GFX11-NEXT: v_or_b32_e32 v103, 0x400000, v100 +; GFX11-NEXT: v_cndmask_b32_e32 v98, v101, v114, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v87 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v85, v97, v112, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v100, v100 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v96 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v64, v65 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v55, v69 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v97, v99, v103, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v28 +; GFX11-NEXT: v_and_or_b32 v4, 0xffff0000, v68, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v87, v15, v102, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v66, v67 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v27 +; GFX11-NEXT: v_and_or_b32 v29, 0xffff0000, v52, v55 +; GFX11-NEXT: v_and_or_b32 v28, 0xffff0000, v51, v64 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v14, v86, vcc_lo +; GFX11-NEXT: v_and_or_b32 v14, 0xffff0000, v85, v96 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v87 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v11 +; GFX11-NEXT: v_and_or_b32 v27, 0xffff0000, v50, v65 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v98 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v82 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v10 +; GFX11-NEXT: v_and_or_b32 v10, 0xffff0000, v71, v87 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v81 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_or_b32 v30, 0xffff0000, v53, v54 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; GFX11-NEXT: v_and_or_b32 v25, 0xffff0000, v48, v49 +; GFX11-NEXT: v_and_or_b32 v24, 0xffff0000, v39, v50 +; GFX11-NEXT: v_and_or_b32 v23, 0xffff0000, v38, v51 +; GFX11-NEXT: v_and_or_b32 v22, 0xffff0000, v37, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v48, 16, v17 +; GFX11-NEXT: v_and_or_b32 v15, 0xffff0000, v97, v98 +; GFX11-NEXT: v_and_or_b32 v13, 0xffff0000, v13, v85 +; GFX11-NEXT: v_and_or_b32 v12, 0xffff0000, v84, v82 +; GFX11-NEXT: v_and_or_b32 v11, 0xffff0000, v83, v86 +; GFX11-NEXT: v_and_or_b32 v9, 0xffff0000, v9, v96 +; GFX11-NEXT: v_and_or_b32 v8, 0xffff0000, v8, v71 +; GFX11-NEXT: v_and_or_b32 v7, 0xffff0000, v80, v7 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v1, v68 +; GFX11-NEXT: v_and_or_b32 v31, 0xffff0000, v31, v70 +; GFX11-NEXT: v_and_or_b32 v26, 0xffff0000, v26, v66 +; GFX11-NEXT: v_and_or_b32 v21, 0xffff0000, v21, v53 +; GFX11-NEXT: v_and_or_b32 v20, 0xffff0000, v35, v36 +; GFX11-NEXT: v_and_or_b32 v19, 0xffff0000, v34, v37 +; GFX11-NEXT: v_and_or_b32 v18, 0xffff0000, v33, v38 +; GFX11-NEXT: v_and_or_b32 v17, 0xffff0000, v32, v39 +; GFX11-NEXT: v_and_or_b32 v16, 0xffff0000, v16, v48 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB105_3: +; GFX11-NEXT: s_branch .LBB105_2 +; GFX11-NEXT: .LBB105_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x 
i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i16_to_v64bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword 
v48, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded 
Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v38 -; GCN-NEXT: buffer_store_dword v18, off, 
s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v55 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v54 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v35 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v18 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v16 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; 
implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; kill: killed $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 
16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:332 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v33 -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v10 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: .LBB52_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v55, v10 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v54, v14 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: 
v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v53, v12 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v16, v52, v16 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, 
v6 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v18, v51, v18 -; GCN-NEXT: v_or_b32_e32 v20, v50, v20 -; GCN-NEXT: v_or_b32_e32 v8, v49, v8 -; GCN-NEXT: v_or_b32_e32 v22, v48, v22 -; GCN-NEXT: v_or_b32_e32 v24, v39, v24 -; GCN-NEXT: v_or_b32_e32 v26, v38, v26 -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v30, v31 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v32, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v32, v33 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v33, v29 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v33, v27 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v33, v25 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v33, v23 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v33, v21 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v33, v19 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v33, v17 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v33, v15 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v33, v13 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v33, v11 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v33, v9 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v33, v7 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; 
GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v33, v3 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v33, v1 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v33, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v34, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v30, vcc, s6, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, s6, v31 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v32 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GCN-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:276 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v31 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; 
GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v33 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v10 -; GCN-NEXT: .LBB52_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v44, v2, v3, 16 -; 
GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v15, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v11, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v18, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v7, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v45, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v46, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v47, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v56, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v57, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v25, 
vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v58, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v59 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v34, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v33, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v61 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v32, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v31, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v41, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x48, v0 -; GCN-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x4c, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v30 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x50, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v38 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v8, 16 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x54, v0 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v39 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v10, 16 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x58, v0 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v48 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x5c, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v49 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, 
v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v50 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v50, 16 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x64, v0 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_alignbit_b32 v16, v51, v16, 16 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_alignbit_b32 v22, v52, v22, 16 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x6c, v0 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_alignbit_b32 v24, v53, v24, 16 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x70, v0 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_alignbit_b32 v26, v54, v26, 16 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x74, v0 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_alignbit_b32 v28, v55, v28, 16 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v18, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v5, s[0:3], 0 
offen -; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i16_to_v64bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 +; SI-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, 
v28 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 
16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; 
SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: 
$vgpr45 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB106_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v46, vcc, 0x30000, v3 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v50, v3 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 
+; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v33, v3 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v18, v11 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v54, v5 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v53, v7 +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v52, v9 +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; 
SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v53 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v50 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: 
v_and_b32_e32 v51, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v55 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 
v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v24, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v22, v9 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v20, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 
0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: .LBB106_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v1, 
v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; 
SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 +; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v64bf16: ; VI: ; %bb.0: @@ -114600,7 +243557,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: s_cbranch_execz .LBB106_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 3 ; VI-NEXT: v_add_u16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -114700,7 +243657,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_e32 v17, v17, v33 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB52_2: ; %end +; VI-NEXT: 
.LBB106_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -114715,7 +243672,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: s_cbranch_execz .LBB106_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -114750,7 +243707,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB52_2: ; %end +; GFX9-NEXT: .LBB106_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -114767,7 +243724,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -114802,7 +243759,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB52_2: ; %end +; GFX11-NEXT: .LBB106_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -114823,717 +243780,1996 @@ end: ret <64 x bfloat> %phi } +define inreg <64 x bfloat> 
@bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v64bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v40, s30, 0 +; SI-NEXT: v_writelane_b32 v40, s31, 1 +; SI-NEXT: v_writelane_b32 v40, s34, 2 +; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v40, s37, 5 +; SI-NEXT: v_writelane_b32 v40, s38, 6 +; SI-NEXT: v_writelane_b32 v40, s39, 7 +; SI-NEXT: v_writelane_b32 v40, s48, 8 +; SI-NEXT: v_writelane_b32 v40, s49, 9 +; SI-NEXT: v_writelane_b32 v40, s50, 10 +; SI-NEXT: v_writelane_b32 v40, s51, 11 +; SI-NEXT: v_writelane_b32 v40, s52, 12 +; SI-NEXT: v_writelane_b32 v40, s53, 13 +; SI-NEXT: v_writelane_b32 v40, s54, 14 +; SI-NEXT: v_writelane_b32 v40, s55, 15 +; SI-NEXT: v_writelane_b32 v40, s64, 16 +; SI-NEXT: v_writelane_b32 v40, s65, 17 +; SI-NEXT: v_writelane_b32 v40, s66, 18 +; SI-NEXT: v_writelane_b32 v40, s67, 19 +; SI-NEXT: v_writelane_b32 v40, s68, 20 +; SI-NEXT: v_writelane_b32 v40, s69, 21 +; SI-NEXT: v_writelane_b32 v40, s70, 22 +; SI-NEXT: v_writelane_b32 v40, s71, 23 +; SI-NEXT: v_writelane_b32 v40, s80, 24 +; SI-NEXT: 
v_writelane_b32 v40, s81, 25 +; SI-NEXT: v_writelane_b32 v40, s82, 26 +; SI-NEXT: v_writelane_b32 v40, s83, 27 +; SI-NEXT: v_writelane_b32 v40, s84, 28 +; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s60, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v41, s17, 0 +; SI-NEXT: s_mov_b32 s61, s19 +; SI-NEXT: v_writelane_b32 v41, s60, 1 +; SI-NEXT: s_mov_b32 s63, s18 +; SI-NEXT: v_writelane_b32 v41, s61, 2 +; SI-NEXT: s_mov_b32 s72, s21 +; SI-NEXT: v_writelane_b32 v41, s63, 3 +; SI-NEXT: v_writelane_b32 v41, s72, 4 +; SI-NEXT: s_mov_b32 s74, s23 +; SI-NEXT: v_writelane_b32 v41, s20, 5 +; SI-NEXT: v_writelane_b32 v41, s74, 6 +; SI-NEXT: s_mov_b32 s75, s25 +; SI-NEXT: v_writelane_b32 v41, s22, 7 +; SI-NEXT: v_writelane_b32 v41, s75, 8 +; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: v_writelane_b32 v41, s24, 9 +; SI-NEXT: v_writelane_b32 v41, s76, 10 +; SI-NEXT: s_mov_b32 s93, s29 +; SI-NEXT: v_writelane_b32 v41, s26, 11 +; SI-NEXT: v_writelane_b32 v41, s93, 12 +; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_writelane_b32 v41, s28, 13 +; SI-NEXT: v_readfirstlane_b32 s73, v4 +; SI-NEXT: v_writelane_b32 v41, s16, 14 +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: v_writelane_b32 v41, s73, 15 +; SI-NEXT: v_readfirstlane_b32 s90, v6 +; SI-NEXT: v_writelane_b32 v41, s89, 16 +; SI-NEXT: v_readfirstlane_b32 s91, v5 +; SI-NEXT: v_writelane_b32 v41, s90, 17 +; SI-NEXT: v_readfirstlane_b32 s34, v8 +; SI-NEXT: v_writelane_b32 v41, s91, 18 +; SI-NEXT: v_readfirstlane_b32 s35, v7 +; SI-NEXT: v_writelane_b32 v41, s34, 19 +; SI-NEXT: v_readfirstlane_b32 s36, v10 +; SI-NEXT: v_writelane_b32 v41, s35, 20 +; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s37, v9 +; SI-NEXT: v_writelane_b32 v41, s36, 21 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s62, v31 +; SI-NEXT: 
s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s80, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s69, v33 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s84, v34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s68, v35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s83, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s87, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: v_readfirstlane_b32 s6, v37 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: v_writelane_b32 v40, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s38, v12 +; SI-NEXT: v_writelane_b32 v41, s37, 22 +; SI-NEXT: v_writelane_b32 v40, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s14, v30 +; SI-NEXT: v_readfirstlane_b32 s15, v29 +; SI-NEXT: v_readfirstlane_b32 s12, v28 +; SI-NEXT: v_readfirstlane_b32 s13, v27 +; SI-NEXT: v_readfirstlane_b32 s10, v26 +; SI-NEXT: v_readfirstlane_b32 s11, v25 +; SI-NEXT: v_readfirstlane_b32 s8, v24 +; SI-NEXT: v_readfirstlane_b32 s9, v23 +; SI-NEXT: v_readfirstlane_b32 s88, v22 +; SI-NEXT: v_readfirstlane_b32 s29, v21 +; SI-NEXT: v_readfirstlane_b32 s79, v20 +; SI-NEXT: v_readfirstlane_b32 s27, v19 +; SI-NEXT: v_readfirstlane_b32 s78, v18 +; SI-NEXT: v_readfirstlane_b32 s25, v17 +; 
SI-NEXT: v_readfirstlane_b32 s77, v16 +; SI-NEXT: v_readfirstlane_b32 s23, v15 +; SI-NEXT: v_readfirstlane_b32 s39, v14 +; SI-NEXT: v_readfirstlane_b32 s21, v13 +; SI-NEXT: v_readfirstlane_b32 s19, v11 +; SI-NEXT: v_readfirstlane_b32 s18, v1 +; SI-NEXT: v_writelane_b32 v41, s38, 23 +; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: v_writelane_b32 v41, s39, 24 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s58, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s59, v32 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s56, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s57, v39 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s47, v49 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s44, v50 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s45, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s42, v34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s43, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s40, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s41, v37 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB107_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s4, s60, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 25 +; SI-NEXT: s_lshl_b32 s4, s63, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 26 +; SI-NEXT: s_lshl_b32 s4, s20, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 27 +; SI-NEXT: s_lshl_b32 s4, s22, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 28 +; SI-NEXT: s_lshl_b32 s4, s24, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 29 +; SI-NEXT: s_lshl_b32 s4, s26, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 30 +; SI-NEXT: s_lshl_b32 s4, s28, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 31 +; SI-NEXT: s_lshl_b32 s4, s18, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 32 +; SI-NEXT: 
s_lshl_b32 s4, s89, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 33 +; SI-NEXT: s_lshl_b32 s4, s91, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 34 +; SI-NEXT: s_lshl_b32 s4, s35, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 35 +; SI-NEXT: s_lshl_b32 s4, s37, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s96, s61, 16 +; SI-NEXT: s_lshl_b32 s99, s72, 16 +; SI-NEXT: s_lshl_b32 s97, s74, 16 +; SI-NEXT: s_lshl_b32 s92, s75, 16 +; SI-NEXT: s_lshl_b32 s94, s76, 16 +; SI-NEXT: s_lshl_b32 s95, s93, 16 +; SI-NEXT: s_lshl_b32 s93, s16, 16 +; SI-NEXT: s_lshl_b32 s30, s73, 16 +; SI-NEXT: s_lshl_b32 s31, s90, 16 +; SI-NEXT: s_lshl_b32 s34, s34, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 36 +; SI-NEXT: s_lshl_b32 s35, s36, 16 +; SI-NEXT: s_lshl_b32 s86, s19, 16 +; SI-NEXT: s_lshl_b32 s36, s38, 16 +; SI-NEXT: s_lshl_b32 s22, s21, 16 +; SI-NEXT: s_lshl_b32 s37, s39, 16 +; SI-NEXT: s_lshl_b32 s24, s23, 16 +; SI-NEXT: s_lshl_b32 s38, s77, 16 +; SI-NEXT: s_lshl_b32 s28, s25, 16 +; SI-NEXT: s_lshl_b32 s39, s78, 16 +; SI-NEXT: s_lshl_b32 s61, s27, 16 +; SI-NEXT: s_lshl_b32 s48, s79, 16 +; SI-NEXT: s_lshl_b32 s89, s29, 16 +; SI-NEXT: s_lshl_b32 s49, s88, 16 +; SI-NEXT: s_lshl_b32 s60, s9, 16 +; SI-NEXT: s_lshl_b32 s50, s8, 16 +; SI-NEXT: s_lshl_b32 s90, s11, 16 +; SI-NEXT: s_lshl_b32 s91, s10, 16 +; SI-NEXT: s_lshl_b32 s70, s13, 16 +; SI-NEXT: s_lshl_b32 s51, s12, 16 +; SI-NEXT: s_lshl_b32 s71, s15, 16 +; SI-NEXT: s_lshl_b32 s52, s14, 16 +; SI-NEXT: s_lshl_b32 s20, s41, 16 +; SI-NEXT: s_lshl_b32 s53, s40, 16 +; SI-NEXT: s_lshl_b32 s81, s43, 16 +; SI-NEXT: s_lshl_b32 s54, s42, 16 +; SI-NEXT: s_lshl_b32 s63, s45, 16 +; SI-NEXT: s_lshl_b32 s55, s44, 16 +; SI-NEXT: s_lshl_b32 s72, s47, 16 +; SI-NEXT: s_lshl_b32 s64, s46, 16 +; SI-NEXT: s_lshl_b32 s82, s57, 16 +; SI-NEXT: s_lshl_b32 s65, s56, 16 +; SI-NEXT: s_lshl_b32 s74, s59, 16 +; SI-NEXT: s_lshl_b32 s66, s58, 16 +; SI-NEXT: s_lshl_b32 s75, s87, 16 +; SI-NEXT: s_mov_b32 s73, s6 +; SI-NEXT: s_lshl_b32 s67, s6, 16 +; SI-NEXT: s_lshl_b32 s76, 
s83, 16 +; SI-NEXT: s_mov_b32 s16, s68 +; SI-NEXT: s_lshl_b32 s68, s68, 16 +; SI-NEXT: s_lshl_b32 s85, s84, 16 +; SI-NEXT: s_mov_b32 s98, s69 +; SI-NEXT: s_lshl_b32 s69, s69, 16 +; SI-NEXT: s_lshl_b32 s17, s80, 16 +; SI-NEXT: s_mov_b32 s6, s62 +; SI-NEXT: s_lshl_b32 s26, s62, 16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB107_3 +; SI-NEXT: .LBB107_2: +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: s_mov_b32 s16, s68 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: s_mov_b32 s73, s6 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: s_mov_b32 s6, s62 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: s_mov_b32 s98, s69 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr99 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr70 +; 
SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: .LBB107_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_mov_b32 s5, s17 +; SI-NEXT: s_mov_b32 s17, s86 +; SI-NEXT: s_mov_b32 s86, s7 +; SI-NEXT: s_cbranch_vccnz .LBB107_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 24 +; SI-NEXT: s_lshl_b32 s20, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 23 +; SI-NEXT: s_lshl_b32 s17, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 22 +; SI-NEXT: s_lshl_b32 s61, s16, 16 +; SI-NEXT: s_add_i32 s16, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 21 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 20 +; SI-NEXT: s_or_b32 s7, s7, s16 +; SI-NEXT: s_add_i32 s6, s6, 3 +; 
SI-NEXT: v_readlane_b32 s16, v41, 19 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_or_b32 s6, s16, s6 +; SI-NEXT: v_readlane_b32 s16, v41, 18 +; SI-NEXT: s_lshl_b32 s60, s98, 16 +; SI-NEXT: s_or_b32 s17, s17, s19 +; SI-NEXT: s_add_i32 s98, s16, 3 +; SI-NEXT: v_readlane_b32 s19, v41, 17 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_and_b32 s16, s98, 0xffff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s16, s19, s16 +; SI-NEXT: v_readlane_b32 s19, v41, 16 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: s_add_i32 s96, s19, 3 +; SI-NEXT: v_readlane_b32 s21, v41, 15 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s29, 0xffff +; SI-NEXT: s_lshl_b32 s11, s88, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_and_b32 s19, s96, 0xffff +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s11, s27, 0xffff +; SI-NEXT: s_lshl_b32 s13, s79, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_or_b32 s19, s21, s19 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_readlane_b32 s21, v41, 14 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s78, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: 
s_or_b32 s13, s15, s13 +; SI-NEXT: s_and_b32 s15, s23, 0xffff +; SI-NEXT: s_lshl_b32 s22, s77, 16 +; SI-NEXT: s_or_b32 s18, s21, s18 +; SI-NEXT: v_readlane_b32 s21, v41, 13 +; SI-NEXT: s_or_b32 s15, s22, s15 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_readlane_b32 s22, v41, 12 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: v_readlane_b32 s22, v41, 11 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_readlane_b32 s23, v41, 10 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: v_readlane_b32 s23, v41, 9 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_readlane_b32 s24, v41, 8 +; SI-NEXT: s_and_b32 s23, s23, 0xffff +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_or_b32 s23, s24, s23 +; SI-NEXT: v_readlane_b32 s24, v41, 7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_readlane_b32 s25, v41, 6 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: v_readlane_b32 s25, v41, 5 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_readlane_b32 s26, v41, 4 +; SI-NEXT: s_and_b32 s25, s25, 0xffff +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 s25, s26, s25 +; SI-NEXT: v_readlane_b32 s26, v41, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_readlane_b32 s27, v41, 2 +; SI-NEXT: s_and_b32 s26, s26, 0xffff +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: v_readlane_b32 s27, v41, 1 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_readlane_b32 s28, v41, 0 +; SI-NEXT: s_and_b32 s27, s27, 0xffff +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_or_b32 s27, s28, s27 +; SI-NEXT: s_add_i32 s27, s27, 0x30000 +; SI-NEXT: s_add_i32 s26, s26, 0x30000 +; SI-NEXT: s_and_b32 s86, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: s_add_i32 s25, s25, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s27, 25 +; SI-NEXT: s_and_b32 s96, s26, 
0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_add_i32 s24, s24, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s26, 26 +; SI-NEXT: s_and_b32 s99, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_add_i32 s23, s23, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s25, 27 +; SI-NEXT: s_and_b32 s97, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_add_i32 s80, s80, 3 +; SI-NEXT: s_add_i32 s22, s22, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s24, 28 +; SI-NEXT: s_and_b32 s92, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s4, s80, 0xffff +; SI-NEXT: s_add_i32 s84, s84, 3 +; SI-NEXT: s_add_i32 s21, s21, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s23, 29 +; SI-NEXT: s_and_b32 s94, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s84, 0xffff +; SI-NEXT: s_add_i32 s83, s83, 3 +; SI-NEXT: s_add_i32 s18, s18, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s22, 30 +; SI-NEXT: s_and_b32 s95, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_or_b32 s5, s60, s5 +; SI-NEXT: s_and_b32 s60, s83, 0xffff +; SI-NEXT: s_add_i32 s87, s87, 3 +; SI-NEXT: s_add_i32 s59, s59, 3 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s19, s19, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s21, 31 +; SI-NEXT: s_and_b32 s93, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s76, s61, s60 +; SI-NEXT: s_and_b32 s60, s87, 0xffff +; SI-NEXT: s_lshl_b32 s61, s73, 16 +; SI-NEXT: s_and_b32 s59, s59, 0xffff +; SI-NEXT: s_lshl_b32 s58, s58, 16 +; SI-NEXT: s_and_b32 s57, s57, 0xffff +; SI-NEXT: s_lshl_b32 s56, s56, 16 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: s_lshl_b32 
s42, s42, 16 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s40, s40, 16 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s18, 32 +; SI-NEXT: s_lshl_b32 s18, s19, 16 +; SI-NEXT: s_or_b32 s75, s61, s60 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: s_or_b32 s42, s42, s43 +; SI-NEXT: s_or_b32 s40, s40, s41 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s18, 33 +; SI-NEXT: s_and_b32 s31, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s76, s76, 0x30000 +; SI-NEXT: s_add_i32 s75, s75, 0x30000 +; SI-NEXT: s_add_i32 s58, s58, 0x30000 +; SI-NEXT: s_add_i32 s56, s56, 0x30000 +; SI-NEXT: s_add_i32 s46, s46, 0x30000 +; SI-NEXT: s_add_i32 s44, s44, 0x30000 +; SI-NEXT: s_add_i32 s42, s42, 0x30000 +; SI-NEXT: s_add_i32 s40, s40, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s17, s17, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s16, 34 +; SI-NEXT: s_and_b32 s34, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s30, s19, 0xffff0000 +; SI-NEXT: v_writelane_b32 v41, s6, 35 +; SI-NEXT: s_and_b32 s35, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s7, 16 +; SI-NEXT: s_and_b32 s36, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s37, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s20, 16 +; SI-NEXT: s_and_b32 s38, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s15, 16 +; SI-NEXT: s_and_b32 s39, s13, 0xffff0000 +; 
SI-NEXT: s_lshl_b32 s28, s13, 16 +; SI-NEXT: s_and_b32 s48, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s11, 16 +; SI-NEXT: s_and_b32 s49, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s89, s9, 16 +; SI-NEXT: s_and_b32 s50, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s8, 16 +; SI-NEXT: s_and_b32 s91, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s10, 16 +; SI-NEXT: s_and_b32 s51, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s12, 16 +; SI-NEXT: s_and_b32 s52, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s71, s14, 16 +; SI-NEXT: s_and_b32 s53, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s20, s40, 16 +; SI-NEXT: s_and_b32 s54, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s42, 16 +; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s44, 16 +; SI-NEXT: s_and_b32 s64, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s72, s46, 16 +; SI-NEXT: s_and_b32 s65, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s82, s56, 16 +; SI-NEXT: s_and_b32 s66, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s58, 16 +; SI-NEXT: s_and_b32 s67, s75, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s75, 16 +; SI-NEXT: s_and_b32 s68, s76, 0xffff0000 +; SI-NEXT: s_lshl_b32 s76, s76, 16 +; SI-NEXT: s_and_b32 s69, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s5, 16 +; SI-NEXT: s_and_b32 s26, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s4, 16 +; SI-NEXT: v_writelane_b32 v41, s6, 36 +; SI-NEXT: .LBB107_5: ; %end +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 +; SI-NEXT: v_readlane_b32 s4, v41, 25 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 +; SI-NEXT: v_readlane_b32 s4, v41, 26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 
v1, 1.0, s99 +; SI-NEXT: v_readlane_b32 s4, v41, 27 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_readlane_b32 s4, v41, 28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_readlane_b32 s4, v41, 29 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_readlane_b32 s4, v41, 30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_readlane_b32 s4, v41, 31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 +; SI-NEXT: v_readlane_b32 s4, v41, 32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: 
v_readlane_b32 s4, v41, 33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_readlane_b32 s4, v41, 34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_readlane_b32 s4, v41, 35 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_readlane_b32 s4, v41, 36 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 
16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71 
+; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_lshrrev_b32_e32 v1, 
16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s99, v40, 35 +; SI-NEXT: v_readlane_b32 s98, v40, 34 +; SI-NEXT: v_readlane_b32 s97, v40, 33 +; SI-NEXT: v_readlane_b32 s96, v40, 32 +; SI-NEXT: v_readlane_b32 s87, v40, 31 +; SI-NEXT: v_readlane_b32 s86, v40, 30 +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: 
v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v64bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v32, s30, 0 +; VI-NEXT: v_writelane_b32 v32, s31, 1 +; VI-NEXT: v_writelane_b32 v32, s34, 2 +; VI-NEXT: v_writelane_b32 v32, s35, 3 +; VI-NEXT: v_writelane_b32 v32, s36, 4 +; VI-NEXT: v_writelane_b32 v32, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_writelane_b32 v32, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: v_readfirstlane_b32 s46, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s44, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s42, v7 +; VI-NEXT: v_readfirstlane_b32 s41, v8 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s12, v13 +; VI-NEXT: v_readfirstlane_b32 s11, v14 +; VI-NEXT: v_readfirstlane_b32 s10, v15 +; VI-NEXT: 
v_readfirstlane_b32 s9, v16 +; VI-NEXT: v_readfirstlane_b32 s8, v17 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_writelane_b32 v32, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB107_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB107_3 +; VI-NEXT: .LBB107_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s47, 3 +; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_and_b32 s61, s40, 0xffff0000 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: 
s_and_b32 s91, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s38, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s39, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s40, s40, 0xffff +; VI-NEXT: s_and_b32 s41, s41, 0xffff +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_and_b32 s43, s43, 0xffff +; VI-NEXT: s_and_b32 s44, s44, 0xffff +; VI-NEXT: s_and_b32 s45, s45, 0xffff +; 
VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s7, s39, s7 +; VI-NEXT: s_or_b32 s6, s38, s6 +; VI-NEXT: s_or_b32 s29, s37, s29 +; VI-NEXT: s_or_b32 s28, s36, s28 +; VI-NEXT: s_or_b32 s27, s35, s27 +; VI-NEXT: s_or_b32 s26, s34, s26 +; VI-NEXT: s_or_b32 s25, s31, s25 +; VI-NEXT: s_or_b32 s24, s30, s24 +; VI-NEXT: s_or_b32 s23, vcc_hi, s23 +; VI-NEXT: s_or_b32 s22, vcc_lo, s22 +; VI-NEXT: s_or_b32 s21, s91, s21 +; VI-NEXT: s_or_b32 s20, s90, s20 +; VI-NEXT: s_or_b32 s19, s89, s19 +; VI-NEXT: s_or_b32 s18, s88, s18 +; VI-NEXT: s_or_b32 s17, s79, s17 +; VI-NEXT: s_or_b32 s16, s78, s16 +; VI-NEXT: s_or_b32 s8, s77, s8 +; VI-NEXT: s_or_b32 s9, s76, s9 +; VI-NEXT: s_or_b32 s10, s75, s10 +; VI-NEXT: s_or_b32 s11, s74, s11 +; VI-NEXT: s_or_b32 s12, s73, s12 +; VI-NEXT: s_or_b32 s13, s72, s13 +; VI-NEXT: s_or_b32 s14, s63, s14 +; VI-NEXT: s_or_b32 s15, s62, s15 +; VI-NEXT: s_or_b32 s40, s61, s40 +; VI-NEXT: s_or_b32 s41, s60, s41 +; VI-NEXT: s_or_b32 s42, s59, s42 +; VI-NEXT: s_or_b32 s43, s58, s43 +; VI-NEXT: s_or_b32 s44, s57, s44 +; VI-NEXT: s_or_b32 s45, s56, s45 +; VI-NEXT: s_or_b32 s46, s47, s46 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: 
s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s40, s40, 0x30000 +; VI-NEXT: s_add_i32 s41, s41, 0x30000 +; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_add_i32 s43, s43, 0x30000 +; VI-NEXT: s_add_i32 s44, s44, 0x30000 +; VI-NEXT: s_add_i32 s45, s45, 0x30000 +; VI-NEXT: s_add_i32 s46, s46, 0x30000 +; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: .LBB107_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v16, s47 +; VI-NEXT: v_mov_b32_e32 v17, s46 +; VI-NEXT: v_mov_b32_e32 v18, s45 +; VI-NEXT: v_mov_b32_e32 v19, s44 +; VI-NEXT: v_mov_b32_e32 v20, s43 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v22, s41 +; VI-NEXT: v_mov_b32_e32 v23, s40 +; VI-NEXT: v_mov_b32_e32 v24, s15 +; VI-NEXT: v_mov_b32_e32 v25, s14 +; VI-NEXT: v_mov_b32_e32 v26, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v28, s11 +; VI-NEXT: v_mov_b32_e32 v29, s10 +; VI-NEXT: v_mov_b32_e32 v30, s9 +; VI-NEXT: v_mov_b32_e32 v31, s8 +; VI-NEXT: v_readlane_b32 s39, v32, 7 +; VI-NEXT: v_readlane_b32 s38, v32, 6 +; VI-NEXT: v_readlane_b32 s37, v32, 5 +; VI-NEXT: v_readlane_b32 s36, v32, 4 +; VI-NEXT: v_readlane_b32 s35, v32, 3 +; VI-NEXT: v_readlane_b32 s34, v32, 2 +; VI-NEXT: v_readlane_b32 s31, v32, 1 +; VI-NEXT: v_readlane_b32 s30, v32, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v32, off, 
s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB107_4: +; VI-NEXT: s_branch .LBB107_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v64bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB107_3 +; GFX9-NEXT: .LBB107_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB107_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB107_4: +; GFX9-NEXT: s_branch .LBB107_2 +; +; GFX11-LABEL: bitcast_v64i16_to_v64bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; 
GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB107_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB107_4 +; GFX11-NEXT: .LBB107_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 
op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB107_3: +; GFX11-NEXT: s_branch .LBB107_2 +; GFX11-NEXT: .LBB107_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v64f16_to_v64i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v49, off, 
s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v53 -; GCN-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v15 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v13 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v25, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v51 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v40 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v9 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v29 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; 
GCN-NEXT: v_mov_b32_e32 v42, v56 -; GCN-NEXT: v_mov_b32_e32 v49, v57 -; GCN-NEXT: v_mov_b32_e32 v54, v58 -; GCN-NEXT: v_mov_b32_e32 v51, v62 -; GCN-NEXT: v_mov_b32_e32 v48, v4 -; GCN-NEXT: v_mov_b32_e32 v36, v5 -; GCN-NEXT: v_mov_b32_e32 v46, v6 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v12 -; GCN-NEXT: v_or_b32_e32 v9, v9, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v13 -; GCN-NEXT: v_or_b32_e32 v11, v11, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 
v29, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v29 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; GCN-NEXT: v_or_b32_e32 v3, v3, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; GCN-NEXT: v_or_b32_e32 v4, v5, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v8 -; GCN-NEXT: v_or_b32_e32 v5, v7, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v46 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v31 -; GCN-NEXT: v_or_b32_e32 v6, v10, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v32 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; 
GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v33 -; GCN-NEXT: v_or_b32_e32 v32, v29, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 -; GCN-NEXT: v_or_b32_e32 v34, v29, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v61, 
v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; GCN-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; GCN-NEXT: v_add_f32_e32 v61, 0x38000000, v61 -; GCN-NEXT: v_add_f32_e32 v63, 0x38000000, v63 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v57, 
0x38000000, v57 -; GCN-NEXT: v_add_f32_e32 v62, 0x38000000, v62 -; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; GCN-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 
16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v30, v30, v29 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v61, v38, v36 -; GCN-NEXT: v_or_b32_e32 v49, v49, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v52, v51 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v42, v42, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v46, v44 -; GCN-NEXT: v_mov_b32_e32 v46, v6 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v58, v56 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v63, v2 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v57, v28 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v45, v45, v62 -; GCN-NEXT: v_or_b32_e32 v41, v41, v60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v53, v59 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; 
GCN-NEXT: v_or_b32_e32 v50, v50, v47 -; GCN-NEXT: v_or_b32_e32 v39, v39, v43 -; GCN-NEXT: v_or_b32_e32 v37, v37, v40 -; GCN-NEXT: v_or_b32_e32 v1, v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: v_alignbit_b32 v63, v34, v29, 16 -; GCN-NEXT: v_alignbit_b32 v36, v32, v36, 16 -; GCN-NEXT: v_alignbit_b32 v48, v46, v48, 16 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_alignbit_b32 v51, v5, v51, 16 -; GCN-NEXT: v_mov_b32_e32 v7, v4 -; GCN-NEXT: v_alignbit_b32 v54, v4, v54, 16 -; GCN-NEXT: v_alignbit_b32 v29, v3, v44, 16 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; GCN-NEXT: v_alignbit_b32 v29, v1, v56, 16 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v2, v26, v2, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v28, v24, v28, 16 -; GCN-NEXT: v_alignbit_b32 v53, v22, v62, 16 -; GCN-NEXT: v_alignbit_b32 v60, v20, v60, 16 -; GCN-NEXT: v_alignbit_b32 v59, v18, v59, 16 -; GCN-NEXT: v_alignbit_b32 v47, v16, v47, 16 -; GCN-NEXT: v_alignbit_b32 v43, v14, v43, 16 -; GCN-NEXT: v_alignbit_b32 v40, v11, v40, 16 -; GCN-NEXT: v_alignbit_b32 v55, v9, v55, 16 -; GCN-NEXT: .LBB53_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v30, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v35 -; GCN-NEXT: v_or_b32_e32 v2, v2, v29 -; GCN-NEXT: buffer_store_dword v1, 
v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; GCN-NEXT: v_or_b32_e32 v56, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: v_or_b32_e32 v44, v1, v2 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; GCN-NEXT: v_or_b32_e32 v61, v1, v2 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; GCN-NEXT: v_or_b32_e32 v58, v1, v2 -; GCN-NEXT: v_add_i32_e32 v63, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: v_or_b32_e32 v57, v1, v2 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GCN-NEXT: v_or_b32_e32 v10, v1, v2 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: v_or_b32_e32 v31, v1, v2 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v7, v1, v2 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v33, v1, v2 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v5, v1, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v35, v1, v2 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v1, v1, v30 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 
-; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v53 -; GCN-NEXT: v_or_b32_e32 v49, v30, v49 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v60 -; GCN-NEXT: v_or_b32_e32 v45, v30, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v59 -; GCN-NEXT: v_or_b32_e32 v53, v30, v53 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v47 -; GCN-NEXT: v_or_b32_e32 v50, v30, v50 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v43 -; GCN-NEXT: v_or_b32_e32 v39, v30, v39 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x68, v0 -; GCN-NEXT: 
v_and_b32_e32 v30, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v40 -; GCN-NEXT: v_or_b32_e32 v30, v30, v37 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v11, v11, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v37, v37, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v9, v9, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v56, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v63, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v41, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v53, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64f16_to_v64i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], 
s32 offset:24 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded 
Spill +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v51 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:104 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 +; SI-NEXT: 
v_cvt_f16_f32_e32 v38, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v58 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v60 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v63 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v45, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v41, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v36 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v59, v29 +; SI-NEXT: v_mov_b32_e32 v29, v27 +; SI-NEXT: v_mov_b32_e32 v57, v23 +; 
SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_mov_b32_e32 v62, v3 +; SI-NEXT: v_mov_b32_e32 v63, v4 +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB108_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; 
SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v33, v33, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v61 +; SI-NEXT: v_or_b32_e32 v58, v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v28 +; SI-NEXT: v_or_b32_e32 v31, v31, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_or_b32_e32 v18, v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v39, v36, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_or_b32_e32 v52, v36, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; 
SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_or_b32_e32 v55, v36, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_or_b32_e32 v43, v36, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 
+; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v38, v38, v47 +; SI-NEXT: v_or_b32_e32 v54, v54, v42 +; SI-NEXT: v_or_b32_e32 v49, v49, v51 +; SI-NEXT: v_or_b32_e32 v45, v45, v50 +; SI-NEXT: v_or_b32_e32 v41, v41, v30 +; SI-NEXT: v_or_b32_e32 v46, v46, v32 +; SI-NEXT: v_alignbit_b32 v47, v16, v47, 16 +; SI-NEXT: v_alignbit_b32 v42, v11, v42, 16 +; SI-NEXT: v_alignbit_b32 v51, v58, v51, 16 +; SI-NEXT: v_alignbit_b32 v50, v14, v50, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v30, 16 +; SI-NEXT: v_alignbit_b32 v32, v6, v32, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_or_b32_e32 v2, v36, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v43, v34, 16 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v37, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v63 +; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v55, v1, 16 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v63, v37, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v9, v39, v9, 16 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v29, v29, v23 +; SI-NEXT: v_or_b32_e32 v2, v36, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_mov_b32_e32 v2, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v62, v56, v36 +; SI-NEXT: 
v_cvt_f32_f16_e32 v56, v60 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v60, v56, v37 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v57, v56, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v59 +; SI-NEXT: v_alignbit_b32 v26, v31, v26, 16 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v59, v56, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 +; SI-NEXT: v_alignbit_b32 v2, v18, v36, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v19, v37, 16 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v24, v23, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v56, v56, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v21, v27, 16 +; SI-NEXT: v_alignbit_b32 v35, v33, v35, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: .LBB108_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v36, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v34, v36, v34 +; SI-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v44 +; SI-NEXT: v_or_b32_e32 v34, v34, v36 +; SI-NEXT: 
v_add_i32_e32 v36, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v34, v36, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: 
v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, 
v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, 
v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v64i16: ; VI: ; %bb.0: @@ -115545,7 +245781,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 
-; VI-NEXT: s_cbranch_execz .LBB53_2 +; VI-NEXT: s_cbranch_execz .LBB108_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 0x200 ; VI-NEXT: v_add_f16_e32 v33, 0x200, v15 @@ -115645,7 +245881,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 ; VI-NEXT: v_or_b32_e32 v17, v33, v17 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: .LBB108_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -115660,7 +245896,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_2 +; GFX9-NEXT: s_cbranch_execz .LBB108_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -115696,7 +245932,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: .LBB108_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -115713,7 +245949,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB53_2 +; GFX11-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -115748,7 +245984,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GFX11-NEXT: 
v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB53_2: ; %end +; GFX11-NEXT: .LBB108_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -115769,1080 +246005,2076 @@ end: ret <64 x i16> %phi } +define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64f16_to_v64i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v9, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v61 +; SI-NEXT: 
v_cvt_f16_f32_e32 v24, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v48 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v51, v55 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v40 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v47, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB109_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB109_3 +; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, 
v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: 
v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_or_b32_e32 v18, v18, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v31, v31, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v33 +; SI-NEXT: v_or_b32_e32 v32, v32, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v36 +; SI-NEXT: v_or_b32_e32 v35, v35, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v39 +; SI-NEXT: v_or_b32_e32 v38, v38, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, v51 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v49, v48, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: 
v_lshlrev_b32_e32 v51, 16, v53 +; SI-NEXT: v_or_b32_e32 v52, v48, v51 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v59 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v46, v48, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v57 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v61 +; SI-NEXT: v_or_b32_e32 v57, v48, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v63 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v45 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v60, v48, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v62 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_or_b32_e32 v59, v54, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, 
v54 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_or_b32_e32 v56, v54, v48 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v45, v40, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_or_b32_e32 v7, v41, v55 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; 
SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v14, v14, v13 +; SI-NEXT: v_or_b32_e32 v23, v23, v17 +; SI-NEXT: v_or_b32_e32 v34, v34, v21 +; SI-NEXT: v_alignbit_b32 v4, v57, v4, 16 +; SI-NEXT: v_alignbit_b32 v63, v46, v51, 16 +; SI-NEXT: v_alignbit_b32 v62, v29, v48, 16 +; SI-NEXT: v_alignbit_b32 v61, v52, v54, 16 +; SI-NEXT: v_alignbit_b32 v44, v49, v55, 16 +; SI-NEXT: v_alignbit_b32 v13, v32, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v2, v21, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v41, v10 +; 
SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v7, v41, v20 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v20, v31, v20, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v7, v41, v28 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v28, v15, v28, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 +; SI-NEXT: v_or_b32_e32 v7, v41, v27 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v11, v27, 16 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v41, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 
v42, v42 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v43, v42, v24 +; SI-NEXT: v_alignbit_b32 v26, v8, v26, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v41, v37 +; SI-NEXT: v_mov_b32_e32 v51, v7 +; SI-NEXT: v_alignbit_b32 v7, v38, v40, 16 +; SI-NEXT: v_alignbit_b32 v24, v5, v24, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v37, v1, v37, 16 +; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: v_and_b32_e32 v48, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v58 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_add_i32_e32 v48, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v63 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_add_i32_e32 v48, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_add_i32_e32 v48, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_add_i32_e32 v48, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v53 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 
0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 +; SI-NEXT: 
buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: s_branch .LBB109_2 +; +; VI-LABEL: bitcast_v64f16_to_v64i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v31, v17 +; VI-NEXT: v_mov_b32_e32 v30, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 
v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB109_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB109_3 +; VI-NEXT: .LBB109_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v18, 0x200 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v15 +; VI-NEXT: v_add_f16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v33, v15 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v14 +; VI-NEXT: v_add_f16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v33, v14 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v13 +; VI-NEXT: v_add_f16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v33, v13 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v12 +; VI-NEXT: v_add_f16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v33, v12 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v11 +; VI-NEXT: v_add_f16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v33, v11 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v10 +; VI-NEXT: v_add_f16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v33, v10 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v33, v9 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v33, v8 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v33, v7 +; 
VI-NEXT: v_add_f16_e32 v33, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v33, v6 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v33, v5 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v33, v4 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v33, v3 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v33, v2 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v33, v1 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v33, v0 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v31 +; VI-NEXT: v_add_f16_sdwa v31, v31, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v33, v31 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v30 +; VI-NEXT: v_add_f16_sdwa v30, v30, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v33, v30 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v29 +; VI-NEXT: v_add_f16_sdwa v29, v29, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v33, v29 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v28 +; VI-NEXT: v_add_f16_sdwa v28, v28, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v33, v28 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v27 +; VI-NEXT: v_add_f16_sdwa v27, v27, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v33, v27 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v26 +; VI-NEXT: v_add_f16_sdwa v26, v26, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v33, v26 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v25 +; VI-NEXT: v_add_f16_sdwa v25, v25, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v33, v25 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v24 +; VI-NEXT: v_add_f16_sdwa v24, v24, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v33, v24 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v23 +; VI-NEXT: v_add_f16_sdwa v23, v23, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v33, v23 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v22 +; VI-NEXT: v_add_f16_sdwa v22, v22, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v33, v22 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v21 +; VI-NEXT: v_add_f16_sdwa v21, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v33, v21 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v20 +; VI-NEXT: v_add_f16_sdwa v20, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v33, v20 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v19 +; VI-NEXT: v_add_f16_sdwa v19, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v33, v19 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v32 +; VI-NEXT: v_add_f16_sdwa v32, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v32, v33, v32 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v17 +; 
VI-NEXT: v_add_f16_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v18, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v33, v17 +; VI-NEXT: v_or_b32_e32 v16, v16, v18 +; VI-NEXT: .LBB109_3: ; %end +; VI-NEXT: v_mov_b32_e32 v18, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB109_4: +; VI-NEXT: s_branch .LBB109_2 +; +; GFX9-LABEL: bitcast_v64f16_to_v64i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB109_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB109_3 +; GFX9-NEXT: .LBB109_2: 
; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, v31, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB109_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB109_4: +; GFX9-NEXT: s_branch .LBB109_2 +; +; GFX11-LABEL: 
bitcast_v64f16_to_v64i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-NEXT: .LBB109_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 
v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB109_3: +; GFX11-NEXT: s_branch .LBB109_2 +; GFX11-NEXT: .LBB109_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ 
%a3, %cmp.false ] + ret <64 x i16> %phi +} + define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i16_to_v64f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 -; GCN-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 -; GCN-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; 
GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; 
kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, 
v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; 
implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: .LBB54_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v45, vcc, 
3, v45 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v63 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: 
v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v63, vcc, 3, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded 
Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, 
v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: .LBB54_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: buffer_load_dword 
v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v2, v1 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v2, v1 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v2, v1 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 20, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: 
v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, 
off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; 
GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; 
GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; 
GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v45, v59, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v44, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i16_to_v64f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 
offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; SI-NEXT: 
buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; 
kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: 
; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: ; implicit-def: 
$vgpr46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: ; 
implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: .LBB110_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v52, vcc, 
3, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_add_i32_e32 v7, 
vcc, 3, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: .LBB110_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 
offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, 
vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v64f16: ; VI: ; %bb.0: @@ -116854,7 +248086,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_2 +; VI-NEXT: s_cbranch_execz .LBB110_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 3 ; VI-NEXT: v_add_u16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -116954,7 +248186,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; VI-NEXT: 
v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_e32 v17, v17, v33 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB54_2: ; %end +; VI-NEXT: .LBB110_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -116969,7 +248201,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_2 +; GFX9-NEXT: s_cbranch_execz .LBB110_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -117004,7 +248236,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB54_2: ; %end +; GFX9-NEXT: .LBB110_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -117021,7 +248253,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -117056,7 +248288,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB54_2: ; %end +; GFX11-NEXT: .LBB110_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -117076,3 +248308,1288 @@ end: %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <64 x half> %phi } + +define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v64f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 +; 
SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 +; SI-NEXT: v_mov_b32_e32 v42, v4 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v41 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB111_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s20 +; SI-NEXT: v_mov_b32_e32 v3, v10 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: buffer_store_dword v41, off, 
s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s24 +; SI-NEXT: v_mov_b32_e32 v24, v43 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 +; SI-NEXT: v_mov_b32_e32 v61, v30 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 
v8, v53 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v48 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_mov_b32_e32 v25, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v23 +; SI-NEXT: v_mov_b32_e32 v23, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v34 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: s_branch .LBB111_3 +; SI-NEXT: .LBB111_2: +; SI-NEXT: ; implicit-def: $vgpr4 
+; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v61, v30 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v3, v10 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; 
implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: .LBB111_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v42 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v46, v56 +; SI-NEXT: v_mov_b32_e32 v56, v58 +; SI-NEXT: v_mov_b32_e32 v58, v5 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: v_mov_b32_e32 v11, v13 +; SI-NEXT: v_mov_b32_e32 v13, v15 +; SI-NEXT: v_mov_b32_e32 
v15, v17 +; SI-NEXT: v_mov_b32_e32 v17, v19 +; SI-NEXT: v_mov_b32_e32 v19, v1 +; SI-NEXT: s_cbranch_vccnz .LBB111_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v2 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s18 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s20 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s21 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 +; SI-NEXT: 
v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s24 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v60 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v4 +; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v22, 
vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 
v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: .LBB111_5: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v64f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v32, s30, 0 +; VI-NEXT: v_writelane_b32 v32, s31, 1 +; VI-NEXT: v_writelane_b32 v32, s34, 2 +; VI-NEXT: v_writelane_b32 v32, s35, 3 +; VI-NEXT: v_writelane_b32 v32, s36, 4 +; VI-NEXT: v_writelane_b32 v32, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_writelane_b32 v32, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s47, v2 +; VI-NEXT: v_readfirstlane_b32 s46, v3 +; VI-NEXT: v_readfirstlane_b32 s45, v4 +; VI-NEXT: v_readfirstlane_b32 s44, v5 +; VI-NEXT: v_readfirstlane_b32 s43, v6 +; VI-NEXT: v_readfirstlane_b32 s42, v7 +; VI-NEXT: v_readfirstlane_b32 s41, v8 +; VI-NEXT: v_readfirstlane_b32 s40, v9 +; VI-NEXT: v_readfirstlane_b32 s15, v10 +; VI-NEXT: v_readfirstlane_b32 s14, v11 +; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s12, v13 +; VI-NEXT: v_readfirstlane_b32 s11, v14 +; VI-NEXT: v_readfirstlane_b32 s10, v15 +; VI-NEXT: v_readfirstlane_b32 s9, v16 +; VI-NEXT: v_readfirstlane_b32 s8, v17 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: v_writelane_b32 v32, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB111_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB111_3 +; VI-NEXT: .LBB111_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s47, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s47, 3 +; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: s_and_b32 s56, s45, 0xffff0000 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_and_b32 s57, s44, 0xffff0000 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_and_b32 s58, s43, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_and_b32 s59, s42, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_and_b32 s60, s41, 0xffff0000 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_and_b32 s61, 
s40, 0xffff0000 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_and_b32 s62, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_and_b32 s63, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_and_b32 s72, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_and_b32 s73, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_and_b32 s74, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_and_b32 s75, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_and_b32 s76, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_and_b32 s77, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_and_b32 s78, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s79, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_and_b32 s88, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_and_b32 s89, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_and_b32 s90, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_and_b32 s91, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_and_b32 vcc_lo, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_and_b32 vcc_hi, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_and_b32 s30, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_and_b32 s31, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s34, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s35, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s36, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s37, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s38, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s39, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 
0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s24, s24, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s16, s16, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s12, s12, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s14, s14, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s40, s40, 0xffff +; VI-NEXT: s_and_b32 s41, s41, 0xffff +; VI-NEXT: s_and_b32 s42, s42, 0xffff +; VI-NEXT: s_and_b32 s43, s43, 0xffff +; VI-NEXT: s_and_b32 s44, s44, 0xffff +; VI-NEXT: s_and_b32 s45, s45, 0xffff +; VI-NEXT: s_and_b32 s46, s46, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s7, s39, s7 +; VI-NEXT: s_or_b32 s6, s38, s6 +; VI-NEXT: s_or_b32 s29, s37, s29 +; VI-NEXT: s_or_b32 s28, s36, s28 +; VI-NEXT: s_or_b32 s27, s35, s27 +; VI-NEXT: s_or_b32 s26, s34, s26 +; VI-NEXT: s_or_b32 s25, s31, s25 +; VI-NEXT: s_or_b32 s24, s30, s24 +; VI-NEXT: s_or_b32 s23, vcc_hi, s23 +; VI-NEXT: s_or_b32 s22, vcc_lo, s22 +; VI-NEXT: s_or_b32 s21, s91, s21 +; VI-NEXT: s_or_b32 s20, s90, s20 +; VI-NEXT: s_or_b32 s19, s89, s19 +; VI-NEXT: s_or_b32 s18, s88, s18 +; VI-NEXT: s_or_b32 s17, s79, s17 +; VI-NEXT: s_or_b32 s16, s78, s16 +; VI-NEXT: s_or_b32 s8, s77, s8 +; VI-NEXT: s_or_b32 s9, s76, s9 +; VI-NEXT: s_or_b32 s10, s75, s10 +; VI-NEXT: s_or_b32 s11, s74, s11 +; VI-NEXT: s_or_b32 s12, s73, s12 +; VI-NEXT: s_or_b32 s13, s72, s13 +; VI-NEXT: s_or_b32 s14, s63, s14 +; VI-NEXT: s_or_b32 s15, s62, s15 +; VI-NEXT: s_or_b32 s40, 
s61, s40 +; VI-NEXT: s_or_b32 s41, s60, s41 +; VI-NEXT: s_or_b32 s42, s59, s42 +; VI-NEXT: s_or_b32 s43, s58, s43 +; VI-NEXT: s_or_b32 s44, s57, s44 +; VI-NEXT: s_or_b32 s45, s56, s45 +; VI-NEXT: s_or_b32 s46, s47, s46 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 +; VI-NEXT: s_add_i32 s23, s23, 0x30000 +; VI-NEXT: s_add_i32 s22, s22, 0x30000 +; VI-NEXT: s_add_i32 s21, s21, 0x30000 +; VI-NEXT: s_add_i32 s20, s20, 0x30000 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 +; VI-NEXT: s_add_i32 s18, s18, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s40, s40, 0x30000 +; VI-NEXT: s_add_i32 s41, s41, 0x30000 +; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_add_i32 s43, s43, 0x30000 +; VI-NEXT: s_add_i32 s44, s44, 0x30000 +; VI-NEXT: s_add_i32 s45, s45, 0x30000 +; VI-NEXT: s_add_i32 s46, s46, 0x30000 +; VI-NEXT: s_add_i32 s47, s4, 0x30000 +; VI-NEXT: .LBB111_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; 
VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: v_mov_b32_e32 v16, s47 +; VI-NEXT: v_mov_b32_e32 v17, s46 +; VI-NEXT: v_mov_b32_e32 v18, s45 +; VI-NEXT: v_mov_b32_e32 v19, s44 +; VI-NEXT: v_mov_b32_e32 v20, s43 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v22, s41 +; VI-NEXT: v_mov_b32_e32 v23, s40 +; VI-NEXT: v_mov_b32_e32 v24, s15 +; VI-NEXT: v_mov_b32_e32 v25, s14 +; VI-NEXT: v_mov_b32_e32 v26, s13 +; VI-NEXT: v_mov_b32_e32 v27, s12 +; VI-NEXT: v_mov_b32_e32 v28, s11 +; VI-NEXT: v_mov_b32_e32 v29, s10 +; VI-NEXT: v_mov_b32_e32 v30, s9 +; VI-NEXT: v_mov_b32_e32 v31, s8 +; VI-NEXT: v_readlane_b32 s39, v32, 7 +; VI-NEXT: v_readlane_b32 s38, v32, 6 +; VI-NEXT: v_readlane_b32 s37, v32, 5 +; VI-NEXT: v_readlane_b32 s36, v32, 4 +; VI-NEXT: v_readlane_b32 s35, v32, 3 +; VI-NEXT: v_readlane_b32 s34, v32, 2 +; VI-NEXT: v_readlane_b32 s31, v32, 1 +; VI-NEXT: v_readlane_b32 s30, v32, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB111_4: +; VI-NEXT: s_branch .LBB111_2 +; +; GFX9-LABEL: bitcast_v64i16_to_v64f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v31, v17 +; GFX9-NEXT: v_mov_b32_e32 v30, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; 
GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB111_3 +; GFX9-NEXT: .LBB111_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 
3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB111_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v18, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB111_4: +; GFX9-NEXT: s_branch .LBB111_2 +; +; GFX11-LABEL: bitcast_v64i16_to_v64f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-NEXT: v_dual_mov_b32 v31, v13 :: v_dual_mov_b32 v30, v12 +; GFX11-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v28, v10 +; GFX11-NEXT: v_dual_mov_b32 v27, v9 :: v_dual_mov_b32 v26, v8 +; GFX11-NEXT: v_dual_mov_b32 v25, v7 :: v_dual_mov_b32 v24, v6 +; GFX11-NEXT: v_dual_mov_b32 v23, v5 :: v_dual_mov_b32 v22, v4 +; GFX11-NEXT: v_dual_mov_b32 v21, v3 :: v_dual_mov_b32 v20, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB111_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB111_4 +; GFX11-NEXT: .LBB111_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] 
+; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB111_3: +; GFX11-NEXT: s_branch .LBB111_2 +; GFX11-NEXT: .LBB111_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 
+; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index c0577b1c1a2b5..18fdc267851f6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -1,28 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <4 x float> @bitcast_v4i32_to_v4f32(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v4f32: ; VI: ; %bb.0: @@ -89,23 +88,124 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v4i32_to_v4f32_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v4i32_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 
+; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v4i32_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: 
s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <4 x i32> @bitcast_v4f32_to_v4i32(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v4i32: ; VI: ; %bb.0: @@ -170,23 +270,127 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v4f32_to_v4i32_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 
.LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <2 x i64> @bitcast_v4i32_to_v2i64(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v2i64: ; VI: ; %bb.0: @@ -253,23 +457,124 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v4i32_to_v2i64_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v4i32_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; 
VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v4i32_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <4 x i32> @bitcast_v2i64_to_v4i32(<2 x i64> %a, i32 %b) { -; GCN-LABEL: 
bitcast_v2i64_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v4i32: ; VI: ; %bb.0: @@ -337,23 +642,124 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v2i64_to_v4i32_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; 
SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v2i64_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v2i64_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; 
GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <2 x double> @bitcast_v4i32_to_v2f64(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v2f64: ; VI: ; %bb.0: @@ -420,21 +826,122 @@ 
end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v4i32_to_v2f64_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v4i32_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 
v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v4i32_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v2f64_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v4i32: ; VI: ; %bb.0: @@ -443,11 +950,11 @@ define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -458,11 +965,11 @@ define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -474,11 +981,11 @@ define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 
v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -498,42 +1005,137 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v2f64_to_v4i32_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_4 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_3: +; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: 
v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <8 x i16> @bitcast_v4i32_to_v8i16(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v8 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: 
v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v8i16: ; VI: ; %bb.0: @@ -600,65 +1202,185 @@ end: ret <8 x i16> %phi } +define inreg <8 x i16> @bitcast_v4i32_to_v8i16_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v4i32_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: 
.LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v4i32_to_v8i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label 
%cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_or_b32_e32 v1, v1, v11 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, 
vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_or_b32_e32 v1, v11, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; 
implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v4i32: ; VI: ; %bb.0: @@ -667,7 +1389,7 @@ define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 3 ; VI-NEXT: v_add_u16_e32 v4, 3, v3 @@ -682,7 +1404,7 @@ define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v4, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -735,70 +1457,218 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v8i16_to_v4i32_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v8i16_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_and_b32 s4, s19, 
0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; 
GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <8 x half> @bitcast_v4i32_to_v8f16(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false 
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; 
implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v8f16: ; VI: ; %bb.0: @@ -865,83 +1735,213 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v4i32_to_v8f16_scalar(<4 x i32> inreg 
%a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v4i32_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; 
%cmp.true +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v4i32_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, 
label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v10, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; 
GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v6 -; GCN-NEXT: v_or_b32_e32 v3, v4, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; 
SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; 
SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v4i32: ; VI: ; %bb.0: @@ -950,7 +1950,7 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 0x200 ; VI-NEXT: v_add_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -965,7 +1965,7 @@ define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1019,62 +2019,225 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v8f16_to_v4i32_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s22 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: 
v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v8f16_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; 
VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <8 x bfloat> @bitcast_v4i32_to_v8bf16(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v11, v3 -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: 
$vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v11 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v3 +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 
+; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v8bf16: ; VI: ; %bb.0: @@ -1141,75 +2304,205 @@ end: ret <8 x bfloat> %phi } +define inreg <8 x bfloat> @bitcast_v4i32_to_v8bf16_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v8bf16_scalar: +; SI: ; 
%bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s19, 16 +; SI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s18, 16 +; SI-NEXT: s_and_b32 s10, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: s_and_b32 s12, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s19, 16 +; SI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s18, 16 +; SI-NEXT: s_and_b32 s10, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: s_and_b32 s12, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v4i32_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: 
s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v4i32_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB21_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: s_branch .LBB21_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, 
splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_4 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB11_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: .LBB11_4: ; %cmp.true -; GCN-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_4 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; 
SI-NEXT: .LBB22_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: .LBB22_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v4i32: ; VI: ; %bb.0: @@ -1218,7 +2511,7 @@ define <4 x i32> 
@bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -1293,7 +2586,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1304,7 +2597,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -1368,7 +2661,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; GFX9-NEXT: v_perm_b32 v0, v4, v0, s7 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1380,7 +2673,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ 
-1455,7 +2748,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0 -; GFX11-TRUE16-NEXT: .LBB11_2: ; %end +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1467,7 +2760,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1533,7 +2826,7 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB11_2: ; %end +; GFX11-FAKE16-NEXT: .LBB22_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1553,66 +2846,427 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 
1.0, s22 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v8bf16_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; 
VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; 
VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; 
GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v1 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s4, s3, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, 
s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-NEXT: 
v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; 
GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i32_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: 
v_lshrrev_b32_e32 v5, 8, v16 -; GCN-NEXT: .LBB12_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 -; GCN-NEXT: .LBB12_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i32_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: 
v_alignbit_b32 v3, v16, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; SI-NEXT: .LBB24_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; SI-NEXT: .LBB24_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i32_to_v16i8: ; VI: ; %bb.0: @@ -1636,7 +3290,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -1650,9 +3304,9 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, 
v18 -; VI-NEXT: .LBB12_2: ; %Flow +; VI-NEXT: .LBB24_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_4 +; VI-NEXT: s_cbranch_execz .LBB24_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 @@ -1670,7 +3324,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: .LBB24_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 ; VI-NEXT: v_mov_b32_e32 v4, v19 @@ -1700,7 +3354,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -1714,9 +3368,9 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: .LBB24_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: s_cbranch_execz .LBB24_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 @@ -1734,7 +3388,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: .LBB24_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, 
v19 @@ -1758,7 +3412,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -1768,9 +3422,9 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 @@ -1786,7 +3440,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB12_4: ; %end +; GFX11-TRUE16-NEXT: .LBB24_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -1819,7 +3473,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v14, 16, v17 @@ -1833,9 +3487,9 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 @@ -1855,7 +3509,7 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB12_4: ; %end +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 @@ -1879,127 +3533,484 @@ end: ret <16 x i8> %phi } +define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i32_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s19, 24 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 8 +; SI-NEXT: s_lshr_b32 s9, s17, 24 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 8 +; 
SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s19, 24 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 8 +; SI-NEXT: s_lshr_b32 s9, s17, 24 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 8 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v14, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v4i32_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s12, s19, 8 +; VI-NEXT: s_lshr_b32 s13, s18, 16 +; VI-NEXT: 
s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s21, s17, 8 +; VI-NEXT: s_lshr_b32 s22, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s12, s19, 8 +; VI-NEXT: s_lshr_b32 s13, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s21, s17, 8 +; VI-NEXT: s_lshr_b32 s22, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s22 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s12 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: 
$sgpr10 +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v4i32_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s12, s19, 8 +; GFX9-NEXT: s_lshr_b32 s13, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s21, s17, 8 +; GFX9-NEXT: s_lshr_b32 s22, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s12, s19, 8 +; GFX9-NEXT: s_lshr_b32 s13, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s21, s17, 8 +; GFX9-NEXT: s_lshr_b32 s22, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: v_mov_b32_e32 v12, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s12 +; GFX9-NEXT: v_mov_b32_e32 v14, s11 +; 
GFX9-NEXT: v_mov_b32_e32 v15, s10 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-TRUE16-LABEL: bitcast_v4i32_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-TRUE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 8 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-TRUE16-NEXT: .LBB25_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s8 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB25_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB25_2 +; +; GFX11-FAKE16-LABEL: bitcast_v4i32_to_v16i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; 
GFX11-FAKE16-NEXT: s_mov_b32 s18, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-FAKE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-FAKE16-NEXT: .LBB25_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v13, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v15, s8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB25_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: s_branch .LBB25_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 
-; GCN-NEXT: s_cbranch_execnz .LBB13_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_4 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB13_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v0, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: .LBB13_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v19, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v21, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 
0x3000000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v4i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_4 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB26_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; 
SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: .LBB26_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: 
v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v4i32: ; VI: ; %bb.0: @@ -2018,14 +4029,14 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: s_cbranch_execnz .LBB26_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB13_4 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB26_4 +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB13_3: ; %cmp.false +; VI-NEXT: .LBB26_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2055,8 +4066,8 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz 
.LBB13_2 -; VI-NEXT: .LBB13_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: .LBB26_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2106,14 +4117,14 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: s_cbranch_execnz .LBB26_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB13_4 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB26_4 +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB13_3: ; %cmp.false +; GFX9-NEXT: .LBB26_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2143,8 +4154,8 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: .LBB13_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: .LBB26_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2198,14 +4209,14 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; 
GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-TRUE16-NEXT: .LBB13_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -2257,8 +4268,8 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-TRUE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -2328,14 +4339,14 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-FAKE16-NEXT: .LBB13_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -2382,8 +4393,8 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-FAKE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -2448,23 +4459,472 @@ end: ret <4 x i32> %phi } +define inreg <4 x i32> @bitcast_v16i8_to_v4i32_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v4i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: 
s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: 
s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v16i8_to_v4i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: 
s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 
+; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v4i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 
s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; 
GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v4i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 
+; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, 
s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB27_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB27_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + define <2 x i64> @bitcast_v4f32_to_v2i64(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; 
VI-LABEL: bitcast_v4f32_to_v2i64: ; VI: ; %bb.0: @@ -2529,23 +4989,127 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v4f32_to_v2i64_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB29_4 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_3: +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: 
v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <4 x float> @bitcast_v2i64_to_v4f32(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz 
.LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v4f32: ; VI: ; %bb.0: @@ -2613,23 +5177,124 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v2i64_to_v4f32_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v2i64_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; 
%cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-LABEL: bitcast_v2i64_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB31_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: 
s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <2 x double> @bitcast_v4f32_to_v2f64(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v2f64: ; VI: ; %bb.0: @@ -2694,21 +5359,125 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v4f32_to_v2f64_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: 
s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_3: +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v4f32: ; VI: ; %bb.0: @@ -2717,11 +5486,11 @@ define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2732,11 +5501,11 @@ define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2748,11 +5517,11 @@ define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; 
GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: .LBB34_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2772,42 +5541,137 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v2f64_to_v4f32_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB35_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB35_4 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_3: +; SI-NEXT: s_branch .LBB35_2 +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], 
s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <8 x i16> @bitcast_v4f32_to_v8i16(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v8 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.4: ; %end +; 
SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8i16: ; VI: ; %bb.0: @@ -2841,19 +5705,140 @@ define <8 x i16> @bitcast_v4f32_to_v8i16(<4 x float> %a, i32 %b) { ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v4f32_to_v8i16: +; GFX11-LABEL: bitcast_v4f32_to_v8i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + +define inreg <8 x i16> @bitcast_v4f32_to_v8i16_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 
1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_3: +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; 
GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v8i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2873,64 +5858,64 @@ end: } define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GCN-NEXT: 
v_lshlrev_b32_e32 v11, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_4 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB19_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_or_b32_e32 v1, v1, v11 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: .LBB19_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_or_b32_e32 v1, v11, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: s_or_b64 exec, exec, 
s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_4 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB38_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: .LBB38_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 
+; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v4f32: ; VI: ; %bb.0: @@ -2939,7 +5924,7 @@ define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 3 ; VI-NEXT: v_add_u16_e32 v4, 3, v3 @@ -2954,7 +5939,7 @@ define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v4, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3007,70 +5992,218 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v8i16_to_v4f32_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; 
SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v8i16_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: 
s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 
0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <8 x half> @bitcast_v4f32_to_v8f16(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_4 -; GCN-NEXT: .LBB20_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB20_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; 
GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: .LBB20_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_4 +; SI-NEXT: .LBB40_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB40_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; 
SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: .LBB40_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8f16: ; VI: ; %bb.0: @@ -3135,83 +6268,215 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v4f32_to_v8f16_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; 
SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v4f32_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: +; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: .LBB41_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; 
GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: +; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v10, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 
0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v6 -; GCN-NEXT: v_or_b32_e32 v3, v4, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_or_b32_e32 
v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v4f32: ; VI: ; %bb.0: @@ -3220,7 +6485,7 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 0x200 ; VI-NEXT: v_add_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3235,7 +6500,7 @@ define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3289,62 +6554,225 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v8f16_to_v4f32_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s22 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, 
v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v8f16_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_4 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 
v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_3: +; VI-NEXT: s_branch .LBB43_2 +; VI-NEXT: .LBB43_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; 
GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <8 x bfloat> @bitcast_v4f32_to_v8bf16(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v11, v3 -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 
16, v11 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v11 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v3 +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; 
SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8bf16: ; VI: ; %bb.0: @@ -3409,75 +6837,208 @@ end: ret <8 x bfloat> %phi } +define inreg <8 x bfloat> @bitcast_v4f32_to_v8bf16_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s19, 16 +; SI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s18, 16 +; SI-NEXT: s_and_b32 s10, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: s_and_b32 s12, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: 
s_cbranch_execnz .LBB45_4 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_3: +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB45_2 +; SI-NEXT: .LBB45_4: +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_4 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_3: +; VI-NEXT: s_branch .LBB45_2 +; VI-NEXT: .LBB45_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
bitcast_v4f32_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_4 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_3: +; GFX9-NEXT: s_branch .LBB45_2 +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + define <4 x float> 
@bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; 
GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, 
v11, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v4f32: ; VI: ; %bb.0: @@ -3486,7 +7047,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; 
%cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -3561,7 +7122,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3572,7 +7133,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -3636,7 +7197,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; GFX9-NEXT: v_perm_b32 v0, v4, v0, s7 -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3648,7 +7209,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -3723,7 +7284,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0 -; GFX11-TRUE16-NEXT: 
.LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3735,7 +7296,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3801,7 +7362,7 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3821,66 +7382,427 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: 
v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v8bf16_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: 
v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s16, 16 
+; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; 
GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v1 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; 
GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s4, s3, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 
0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_nc_u32_e32 
v11, v11, v9 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, 
label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f32_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: 
v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f32_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: 
v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v16i8: ; VI: ; %bb.0: @@ -3904,7 +7826,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -3918,9 +7840,9 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; 
VI-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -3938,7 +7860,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 ; VI-NEXT: v_mov_b32_e32 v4, v19 @@ -3968,7 +7890,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -3982,9 +7904,9 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -4002,7 +7924,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, v19 @@ -4026,7 +7948,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -4036,9 +7958,9 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v10, 1.0, v10 @@ -4052,7 +7974,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB24_4: ; %end +; GFX11-TRUE16-NEXT: .LBB48_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -4085,7 +8007,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -4099,9 +8021,9 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] 
; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v18, 1.0, v18 @@ -4119,7 +8041,7 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 @@ -4143,127 +8065,513 @@ end: ret <16 x i8> %phi } +define inreg <16 x i8> @bitcast_v4f32_to_v16i8_scalar(<4 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f32_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB49_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s9, s19, 24 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB49_4 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 
+; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s18, 1.0 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: .LBB49_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v13, s11 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s13, s19, 8 +; VI-NEXT: s_lshr_b32 s12, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; 
VI-NEXT: s_lshr_b32 s22, s17, 8 +; VI-NEXT: s_lshr_b32 s21, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v19, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v18, s16, 1.0 +; VI-NEXT: v_add_f32_e64 v17, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v16, s18, 1.0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: s_branch .LBB49_5 +; VI-NEXT: .LBB49_3: +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s21 +; VI-NEXT: v_mov_b32_e32 v5, s22 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s12 +; VI-NEXT: v_mov_b32_e32 v13, s13 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; 
VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB49_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s13, s19, 8 +; GFX9-NEXT: s_lshr_b32 s12, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s22, s17, 8 +; GFX9-NEXT: s_lshr_b32 s21, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v19, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v18, s16, 1.0 +; GFX9-NEXT: v_add_f32_e64 v17, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v16, s18, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: s_branch .LBB49_5 +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; 
GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s21 +; GFX9-NEXT: v_mov_b32_e32 v5, s22 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s12 +; GFX9-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-NEXT: v_mov_b32_e32 v14, s11 +; GFX9-NEXT: v_mov_b32_e32 v15, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB49_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4f32_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; 
GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s0, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB49_5 +; GFX11-TRUE16-NEXT: .LBB49_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB49_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4f32_to_v16i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, s1, 1.0 +; 
GFX11-FAKE16-NEXT: v_add_f32_e64 v17, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, s0, 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-FAKE16-NEXT: s_branch .LBB49_5 +; GFX11-FAKE16-NEXT: .LBB49_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v17, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s18 :: v_dual_mov_b32 v2, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s14 :: 
v_dual_mov_b32 v10, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v14, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s12 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s9 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB49_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: 
v_and_b32_e32 v0, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v0, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, 
vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v19, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v21, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: 
v_lshlrev_b32_e32 v22, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; 
implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; 
SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v4f32: ; VI: ; %bb.0: @@ -4282,14 +8590,14 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: s_cbranch_execnz .LBB50_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB25_4 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB50_4 +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB25_3: ; %cmp.false +; VI-NEXT: .LBB50_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4319,8 +8627,8 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 -; VI-NEXT: .LBB25_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: .LBB50_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4370,14 
+8678,14 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: s_cbranch_execnz .LBB50_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB25_4 -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB50_4 +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB25_3: ; %cmp.false +; GFX9-NEXT: .LBB50_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4407,8 +8715,8 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 -; GFX9-NEXT: .LBB25_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: .LBB50_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4462,14 +8770,14 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: 
s_cbranch_execnz .LBB25_4 -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -4521,8 +8829,8 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-TRUE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -4592,14 +8900,14 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -4646,8 +8954,8 
@@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-FAKE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -4712,23 +9020,472 @@ end: ret <4 x float> %phi } +define inreg <4 x float> @bitcast_v16i8_to_v4f32_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v4f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 
8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: 
s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v16i8_to_v4f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 
+; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; 
VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v4f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 
s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v4f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: 
s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB51_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: 
.LBB51_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + define <2 x double> @bitcast_v2i64_to_v2f64(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v2f64: ; VI: ; %bb.0: @@ -4796,21 +9553,121 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v2i64_to_v2f64_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v2i64_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v2i64_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] 
+; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-LABEL: bitcast_v2i64_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: .LBB53_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v2i64: ; VI: ; %bb.0: @@ -4819,11 +9676,11 @@ define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4834,11 +9691,11 @@ define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4850,11 +9707,11 @@ define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4874,42 +9731,137 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v2f64_to_v2i64_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB55_4 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_3: +; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: 
s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <8 x i16> @bitcast_v2i64_to_v8i16(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: 
v_alignbit_b32 v5, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v8 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: 
v_mov_b32_e32 v4, v8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v8i16: ; VI: ; %bb.0: @@ -4977,65 +9929,185 @@ end: ret <8 x i16> %phi } +define inreg <8 x i16> @bitcast_v2i64_to_v8i16_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v2i64_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, 
s17, 0 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_3 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB57_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: s_branch .LBB57_2 +; +; GFX11-LABEL: bitcast_v2i64_to_v8i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_3 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB57_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: s_branch .LBB57_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <8 x i16> + br label %end + 
+cmp.false: + %a3 = bitcast <2 x i64> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_4 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB29_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_or_b32_e32 v1, v1, v11 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: .LBB29_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_or_b32_e32 v1, v11, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_4 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB58_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: .LBB58_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v2i64: ; VI: ; %bb.0: @@ -5044,7 +10116,7 @@ define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 3 ; VI-NEXT: v_add_u16_e32 v4, 3, v3 @@ -5059,7 +10131,7 @@ define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v4, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5112,70 +10184,218 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v8i16_to_v2i64_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false 
+; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v8i16_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 
0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: 
v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <8 x half> @bitcast_v2i64_to_v8f16(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v3 -; GCN-NEXT: v_mov_b32_e32 v9, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_4 -; GCN-NEXT: .LBB30_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB30_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; 
GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_2 -; GCN-NEXT: .LBB30_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; 
SI-NEXT: s_cbranch_execnz .LBB60_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_4 +; SI-NEXT: .LBB60_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB60_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: .LBB60_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v8f16: ; VI: ; %bb.0: @@ -5243,83 +10463,213 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v2i64_to_v8f16_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_add_u32 s8, s18, 3 +; SI-NEXT: s_addc_u32 s9, s19, 0 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_lshr_b32 s11, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v2i64_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB61_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB61_3 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 
s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB61_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB61_4: +; VI-NEXT: s_branch .LBB61_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB61_3 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB61_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: s_branch .LBB61_2 +; +; GFX11-LABEL: bitcast_v2i64_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB61_3 +; GFX11-NEXT: .LBB61_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB61_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB61_4: +; GFX11-NEXT: s_branch .LBB61_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> 
%a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_4 -; GCN-NEXT: .LBB31_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB31_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v10, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_2 -; GCN-NEXT: .LBB31_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v6 -; GCN-NEXT: v_or_b32_e32 v3, v4, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: 
s_cbranch_execnz .LBB62_4 +; SI-NEXT: .LBB62_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB62_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: .LBB62_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; 
SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v2i64: ; VI: ; %bb.0: @@ -5328,7 +10678,7 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB31_2 +; VI-NEXT: s_cbranch_execz .LBB62_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 0x200 ; VI-NEXT: v_add_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5343,7 +10693,7 @@ define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: .LBB31_2: ; %end +; VI-NEXT: .LBB62_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5397,62 +10747,225 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v8f16_to_v2i64_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s22 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 
v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB63_2 +; +; VI-LABEL: bitcast_v8f16_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB63_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB63_4 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_3: +; VI-NEXT: s_branch .LBB63_2 +; VI-NEXT: .LBB63_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB63_4 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_3: +; GFX9-NEXT: s_branch .LBB63_2 +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX11-LABEL: bitcast_v8f16_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX11-NEXT: .LBB63_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB63_3: +; GFX11-NEXT: s_branch .LBB63_2 +; GFX11-NEXT: .LBB63_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <8 x bfloat> @bitcast_v2i64_to_v8bf16(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v11, v3 -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 
-; GCN-NEXT: s_cbranch_execnz .LBB32_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_4 -; GCN-NEXT: .LBB32_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB32_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_2 -; GCN-NEXT: .LBB32_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v3 +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; 
implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_4 +; SI-NEXT: .LBB64_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB64_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: .LBB64_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v8bf16: ; VI: ; %bb.0: @@ -5520,75 +11033,205 @@ end: ret <8 x bfloat> %phi } +define inreg <8 x bfloat> @bitcast_v2i64_to_v8bf16_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s19, 16 +; SI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s18, 16 +; SI-NEXT: s_and_b32 s10, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: s_and_b32 s12, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_add_u32 s9, s18, 3 +; SI-NEXT: s_addc_u32 s7, s19, 0 +; SI-NEXT: s_and_b32 s6, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s8, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s10, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s5, 16 +; SI-NEXT: s_and_b32 s12, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s4, 16 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v2i64_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB65_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_3 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, 
s17, 0 +; VI-NEXT: .LBB65_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_4: +; VI-NEXT: s_branch .LBB65_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_3 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB65_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: s_branch .LBB65_2 +; +; GFX11-LABEL: bitcast_v2i64_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB65_3 +; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB65_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_4: +; GFX11-NEXT: s_branch .LBB65_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <8 x bfloat> + br label %end + 
+cmp.false: + %a3 = bitcast <2 x i64> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_4 -; GCN-NEXT: .LBB33_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB33_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_2 -; GCN-NEXT: .LBB33_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, 
v13 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_4 +; SI-NEXT: .LBB66_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB66_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: .LBB66_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v2i64: ; VI: ; %bb.0: @@ -5597,7 +11240,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -5672,7 +11315,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5683,7 +11326,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: s_cbranch_execz .LBB66_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -5747,7 +11390,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; GFX9-NEXT: v_perm_b32 v0, v4, v0, s7 -; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: .LBB66_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5759,7 +11402,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -5834,7 +11477,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, 
i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0 -; GFX11-TRUE16-NEXT: .LBB33_2: ; %end +; GFX11-TRUE16-NEXT: .LBB66_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5846,7 +11489,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5912,7 +11555,7 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB33_2: ; %end +; GFX11-FAKE16-NEXT: .LBB66_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5932,66 +11575,427 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: ; %bb.1: ; %cmp.false 
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB67_2 +; +; VI-LABEL: bitcast_v8bf16_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB67_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_4 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; 
VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 
0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_3: +; VI-NEXT: s_branch .LBB67_2 +; VI-NEXT: .LBB67_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_4 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; 
GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v1 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_3: +; GFX9-NEXT: s_branch .LBB67_2 +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX11-NEXT: .LBB67_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s4, s3, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: 
v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e64 
v3, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB67_3: +; GFX11-NEXT: s_branch .LBB67_2 +; GFX11-NEXT: .LBB67_4: +; 
GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i64_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 -; GCN-NEXT: .LBB34_2: ; 
%Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 -; GCN-NEXT: .LBB34_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i64_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; SI-NEXT: 
v_alignbit_b32 v2, v16, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; SI-NEXT: .LBB68_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; SI-NEXT: .LBB68_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i64_to_v16i8: ; VI: ; %bb.0: @@ -6015,7 +12019,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: s_cbranch_execz .LBB68_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -6029,9 +12033,9 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB34_2: ; 
%Flow +; VI-NEXT: .LBB68_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_4 +; VI-NEXT: s_cbranch_execz .LBB68_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc @@ -6049,7 +12053,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB34_4: ; %end +; VI-NEXT: .LBB68_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 ; VI-NEXT: v_mov_b32_e32 v4, v19 @@ -6079,7 +12083,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -6093,9 +12097,9 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB34_2: ; %Flow +; GFX9-NEXT: .LBB68_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_4 +; GFX9-NEXT: s_cbranch_execz .LBB68_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc @@ -6113,7 +12117,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB34_4: ; %end +; GFX9-NEXT: .LBB68_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 
v4, v19 @@ -6137,7 +12141,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -6147,9 +12151,9 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB34_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB68_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -6165,7 +12169,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB34_4: ; %end +; GFX11-TRUE16-NEXT: .LBB68_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -6198,7 +12202,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -6212,9 +12216,9 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB34_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB68_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -6234,7 +12238,7 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB34_4: ; %end +; GFX11-FAKE16-NEXT: .LBB68_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 @@ -6258,127 +12262,484 @@ end: ret <16 x i8> %phi } +define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i64_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s19, 24 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 8 +; 
SI-NEXT: s_lshr_b32 s9, s17, 24 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s19, 24 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 8 +; SI-NEXT: s_lshr_b32 s9, s17, 24 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 8 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v14, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v2i64_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB69_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: 
s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s12, s19, 8 +; VI-NEXT: s_lshr_b32 s13, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s21, s17, 8 +; VI-NEXT: s_lshr_b32 s22, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB69_3 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s12, s19, 8 +; VI-NEXT: s_lshr_b32 s13, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s21, s17, 8 +; VI-NEXT: s_lshr_b32 s22, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: .LBB69_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s22 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s12 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_4: +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: 
$sgpr4 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB69_2 +; +; GFX9-LABEL: bitcast_v2i64_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s12, s19, 8 +; GFX9-NEXT: s_lshr_b32 s13, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s21, s17, 8 +; GFX9-NEXT: s_lshr_b32 s22, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB69_3 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s12, s19, 8 +; GFX9-NEXT: s_lshr_b32 s13, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s21, s17, 8 +; GFX9-NEXT: s_lshr_b32 s22, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: .LBB69_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s22 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; 
GFX9-NEXT: v_mov_b32_e32 v12, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s12 +; GFX9-NEXT: v_mov_b32_e32 v14, s11 +; GFX9-NEXT: v_mov_b32_e32 v15, s10 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB69_2 +; +; GFX11-TRUE16-LABEL: bitcast_v2i64_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s8, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-TRUE16-NEXT: .LBB69_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s8 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB69_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB69_2 +; +; GFX11-FAKE16-LABEL: bitcast_v2i64_to_v16i8_scalar: +; GFX11-FAKE16: ; 
%bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s18 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-FAKE16-NEXT: .LBB69_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 
v6, s14 :: v_dual_mov_b32 v7, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v13, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v15, s8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB69_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: s_branch .LBB69_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v15 -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_4 -; GCN-NEXT: .LBB35_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB35_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v0, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: 
$vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_2 -; GCN-NEXT: .LBB35_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v19, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v21, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: 
v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_4 +; SI-NEXT: .LBB70_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB70_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; 
SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: .LBB70_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v2i64: ; VI: ; %bb.0: @@ -6397,14 +12758,14 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: s_cbranch_execnz .LBB70_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_4 -; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB70_4 +; VI-NEXT: .LBB70_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB35_3: ; %cmp.false +; VI-NEXT: .LBB70_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6434,8 +12795,8 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; 
implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_2 -; VI-NEXT: .LBB35_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB70_2 +; VI-NEXT: .LBB70_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6485,14 +12846,14 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: s_cbranch_execnz .LBB70_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_4 -; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB70_4 +; GFX9-NEXT: .LBB70_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB35_3: ; %cmp.false +; GFX9-NEXT: .LBB70_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6522,8 +12883,8 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_2 -; GFX9-NEXT: .LBB35_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB70_2 +; GFX9-NEXT: .LBB70_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6577,14 +12938,14 @@ define <2 x 
i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-TRUE16-NEXT: .LBB35_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-TRUE16-NEXT: .LBB70_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -6636,8 +12997,8 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-TRUE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -6707,14 +13068,14 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-FAKE16-NEXT: .LBB35_2: ; %end +; GFX11-FAKE16-NEXT: 
s_cbranch_execnz .LBB70_4 +; GFX11-FAKE16-NEXT: .LBB70_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -6761,8 +13122,8 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-FAKE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-FAKE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -6827,44 +13188,492 @@ end: ret <2 x i64> %phi } +define inreg <2 x i64> @bitcast_v16i8_to_v2i64_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v2i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, 
s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 
+; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v16i8_to_v2i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; 
VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 
s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v2i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: 
s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, 
s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v2i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 
+; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-NEXT: .LBB71_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 
0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB71_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB71_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v10, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v11, v10, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: .LBB36_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 
v1, v11, v10, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: .LBB36_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v10 -; GCN-NEXT: v_mov_b32_e32 v2, v11 -; GCN-NEXT: v_mov_b32_e32 v4, v8 -; GCN-NEXT: v_mov_b32_e32 v6, v9 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v10 +; SI-NEXT: v_mov_b32_e32 v2, v11 +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: v_mov_b32_e32 v6, v9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8i16: ; VI: ; %bb.0: @@ -6873,11 +13682,11 @@ define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: s_cbranch_execz 
.LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: .LBB72_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6888,11 +13697,11 @@ define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: .LBB72_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6904,11 +13713,11 @@ define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: .LBB72_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6928,65 +13737,183 @@ end: ret <8 x i16> %phi } +define inreg <8 x i16> @bitcast_v2f64_to_v8i16_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB73_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 
16 +; SI-NEXT: s_cbranch_execnz .LBB73_4 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[16:17], 1.0 +; SI-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_branch .LBB73_5 +; SI-NEXT: .LBB73_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB73_2 +; SI-NEXT: .LBB73_4: +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: .LBB73_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v10 +; SI-NEXT: v_mov_b32_e32 v2, v11 +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: v_mov_b32_e32 v6, v9 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB73_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB73_4 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB73_3: +; VI-NEXT: s_branch .LBB73_2 +; VI-NEXT: .LBB73_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB73_4 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: 
v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB73_3: +; GFX9-NEXT: s_branch .LBB73_2 +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v8i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-NEXT: .LBB73_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB73_3: +; GFX11-NEXT: s_branch .LBB73_2 +; GFX11-NEXT: .LBB73_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_4 -; GCN-NEXT: .LBB37_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB37_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_or_b32_e32 v1, v1, v11 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_2 -; GCN-NEXT: .LBB37_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_or_b32_e32 v1, v11, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v2 +; 
SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_4 +; SI-NEXT: .LBB74_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB74_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: .LBB74_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v8, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; 
SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v2f64: ; VI: ; %bb.0: @@ -6995,7 +13922,7 @@ define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB37_2 +; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 3 ; VI-NEXT: v_add_u16_e32 v4, 3, v3 @@ -7010,7 +13937,7 @@ define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v4, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: .LBB37_2: ; %end +; VI-NEXT: .LBB74_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7063,62 +13990,210 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v8i16_to_v2f64_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB75_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_cbranch_execnz .LBB75_3 +; SI-NEXT: .LBB75_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; 
SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: .LBB75_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB75_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; SI-NEXT: s_branch .LBB75_2 +; +; VI-LABEL: bitcast_v8i16_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB75_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB75_3 +; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB75_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: 
v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB75_4: +; VI-NEXT: s_branch .LBB75_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB75_4 +; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB75_3: +; GFX9-NEXT: s_branch .LBB75_2 +; GFX9-NEXT: .LBB75_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX11-NEXT: .LBB75_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB75_3: +; GFX11-NEXT: s_branch .LBB75_2 +; GFX11-NEXT: .LBB75_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <2 x double> + br label %end + 
+cmp.false: + %a3 = bitcast <8 x i16> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: .LBB38_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: .LBB38_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v9 -; GCN-NEXT: v_mov_b32_e32 v1, v11 -; GCN-NEXT: v_mov_b32_e32 v2, v8 -; GCN-NEXT: v_mov_b32_e32 v3, v10 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: 
v_cvt_f32_f16_e32 v10, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v10 +; SI-NEXT: v_mov_b32_e32 v1, v11 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_mov_b32_e32 v3, v8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8f16: ; VI: ; %bb.0: @@ -7127,11 +14202,11 @@ define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: s_cbranch_execz .LBB76_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: .LBB76_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7142,11 +14217,11 @@ define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: s_cbranch_execz .LBB76_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: .LBB76_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7158,11 +14233,11 @@ define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: s_cbranch_execz .LBB76_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], 
v[0:1], 1.0 -; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: .LBB76_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7182,83 +14257,207 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v2f64_to_v8f16_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB77_2 +; 
+; VI-LABEL: bitcast_v2f64_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB77_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB77_4 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_3: +; VI-NEXT: s_branch .LBB77_2 +; VI-NEXT: .LBB77_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB77_4 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_3: +; GFX9-NEXT: s_branch .LBB77_2 +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-NEXT: .LBB77_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB77_3: +; GFX11-NEXT: s_branch .LBB77_2 +; GFX11-NEXT: .LBB77_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_4 -; GCN-NEXT: .LBB39_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB39_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v10, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; 
implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_2 -; GCN-NEXT: .LBB39_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v6 -; GCN-NEXT: v_or_b32_e32 v3, v4, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; 
SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_4 +; SI-NEXT: .LBB78_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB78_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: .LBB78_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v10 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v6 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v2f64: ; VI: ; %bb.0: @@ -7267,7 +14466,7 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB39_2 +; VI-NEXT: s_cbranch_execz .LBB78_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 0x200 ; VI-NEXT: v_add_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7282,7 +14481,7 @@ define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: .LBB78_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7336,54 +14535,216 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v8f16_to_v2f64_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s22 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: 
; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB79_2 +; +; VI-LABEL: bitcast_v8f16_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: 
s_cbranch_scc0 .LBB79_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB79_4 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_3: +; VI-NEXT: s_branch .LBB79_2 +; VI-NEXT: .LBB79_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB79_4 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; 
GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_3: +; GFX9-NEXT: s_branch .LBB79_2 +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB79_4 +; GFX11-NEXT: .LBB79_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB79_3: +; GFX11-NEXT: s_branch .LBB79_2 +; GFX11-NEXT: .LBB79_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; 
implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: .LBB40_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GCN-NEXT: .LBB40_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v11 -; GCN-NEXT: v_mov_b32_e32 v1, v10 -; GCN-NEXT: v_mov_b32_e32 v2, v9 -; GCN-NEXT: v_mov_b32_e32 v3, v8 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; SI-NEXT: .LBB80_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v11 +; SI-NEXT: v_mov_b32_e32 v1, v10 +; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_mov_b32_e32 v3, v8 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v8bf16: ; VI: ; %bb.0: @@ -7392,11 +14753,11 @@ define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: s_cbranch_execz .LBB80_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: .LBB80_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7407,11 +14768,11 @@ define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) { ; 
GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: s_cbranch_execz .LBB80_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: .LBB80_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7423,11 +14784,11 @@ define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: s_cbranch_execz .LBB80_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: .LBB80_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7447,75 +14808,200 @@ end: ret <8 x bfloat> %phi } +define inreg <8 x bfloat> @bitcast_v2f64_to_v8bf16_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB81_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s13, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s19, 16 +; SI-NEXT: s_and_b32 s11, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s18, 16 +; SI-NEXT: s_and_b32 s9, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s17, 16 +; SI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB81_4 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[16:17], 1.0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: s_branch .LBB81_2 +; SI-NEXT: .LBB81_4: +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB81_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB81_4 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_3: +; VI-NEXT: s_branch .LBB81_2 +; VI-NEXT: .LBB81_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB81_4 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 
1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_3: +; GFX9-NEXT: s_branch .LBB81_2 +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB81_4 +; GFX11-NEXT: .LBB81_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB81_3: +; GFX11-NEXT: s_branch .LBB81_2 +; GFX11-NEXT: .LBB81_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: 
; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_4 -; GCN-NEXT: .LBB41_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB41_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_2 -; GCN-NEXT: .LBB41_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 
v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_4 +; SI-NEXT: .LBB82_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB82_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: .LBB82_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 
0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v2f64: ; VI: ; %bb.0: @@ -7524,7 +15010,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -7599,7 +15085,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7610,7 +15096,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x 
bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: s_cbranch_execz .LBB82_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -7674,7 +15160,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc ; GFX9-NEXT: v_perm_b32 v0, v4, v0, s7 -; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7686,7 +15172,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -7761,7 +15247,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0 -; GFX11-TRUE16-NEXT: .LBB41_2: ; %end +; GFX11-TRUE16-NEXT: .LBB82_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7773,7 +15259,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB82_2 ; 
GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7839,7 +15325,7 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB41_2: ; %end +; GFX11-FAKE16-NEXT: .LBB82_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7859,68 +15345,429 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 
+; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB83_2 +; +; VI-LABEL: bitcast_v8bf16_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB83_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB83_4 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: 
s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: 
v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_3: +; VI-NEXT: s_branch .LBB83_2 +; VI-NEXT: .LBB83_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB83_4 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 
v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v1 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v0 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v0 +; GFX9-NEXT: 
v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_3: +; GFX9-NEXT: s_branch .LBB83_2 +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB83_4 +; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s4, s3, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s3 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v3 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v9 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v13 :: v_dual_add_nc_u32 v10, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v7, 16, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB83_3: +; GFX11-NEXT: s_branch .LBB83_2 +; GFX11-NEXT: .LBB83_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f64_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v3 -; GCN-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NEXT: v_mov_b32_e32 v18, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v17, v16, 24 -; GCN-NEXT: v_alignbit_b32 v10, v17, v16, 16 -; GCN-NEXT: v_alignbit_b32 v9, v17, v16, 8 -; GCN-NEXT: v_alignbit_b32 v3, v19, v18, 24 -; GCN-NEXT: v_alignbit_b32 v2, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v1, v19, v18, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v19 -; GCN-NEXT: .LBB42_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_alignbit_b32 v11, v17, v16, 24 -; GCN-NEXT: v_alignbit_b32 v10, v17, v16, 16 -; GCN-NEXT: v_alignbit_b32 v9, v17, v16, 8 -; GCN-NEXT: v_alignbit_b32 v3, v19, v18, 24 -; GCN-NEXT: v_alignbit_b32 v2, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v1, v19, v18, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v17 -; GCN-NEXT: 
v_lshrrev_b32_e32 v7, 24, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v19 -; GCN-NEXT: .LBB42_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v18 -; GCN-NEXT: v_mov_b32_e32 v4, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v16 -; GCN-NEXT: v_mov_b32_e32 v12, v17 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f64_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v19, v1 +; SI-NEXT: v_mov_b32_e32 v18, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v17, v16, 24 +; SI-NEXT: v_alignbit_b32 v10, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v17, v16, 8 +; SI-NEXT: v_alignbit_b32 v3, v19, v18, 24 +; SI-NEXT: v_alignbit_b32 v2, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; SI-NEXT: .LBB84_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: 
v_alignbit_b32 v11, v17, v16, 24 +; SI-NEXT: v_alignbit_b32 v10, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v17, v16, 8 +; SI-NEXT: v_alignbit_b32 v3, v19, v18, 24 +; SI-NEXT: v_alignbit_b32 v2, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; SI-NEXT: .LBB84_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v18 +; SI-NEXT: v_mov_b32_e32 v4, v19 +; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: v_mov_b32_e32 v12, v17 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v16i8: ; VI: ; %bb.0: @@ -7944,7 +15791,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: s_cbranch_execz .LBB84_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -7958,9 +15805,9 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB42_2: ; %Flow +; VI-NEXT: .LBB84_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_4 +; VI-NEXT: s_cbranch_execz .LBB84_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -7976,7 +15823,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB42_4: ; %end +; VI-NEXT: .LBB84_4: ; %end ; 
VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 ; VI-NEXT: v_mov_b32_e32 v4, v19 @@ -8006,7 +15853,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: s_cbranch_execz .LBB84_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -8020,9 +15867,9 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: .LBB84_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: s_cbranch_execz .LBB84_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -8038,7 +15885,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB42_4: ; %end +; GFX9-NEXT: .LBB84_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, v19 @@ -8062,7 +15909,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -8072,9 +15919,9 @@ define <16 x i8> 
@bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB42_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB84_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -8087,7 +15934,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB42_4: ; %end +; GFX11-TRUE16-NEXT: .LBB84_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -8120,7 +15967,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -8134,9 +15981,9 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB42_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB84_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz 
.LBB84_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -8153,7 +16000,7 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB42_4: ; %end +; GFX11-FAKE16-NEXT: .LBB84_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 @@ -8177,127 +16024,505 @@ end: ret <16 x i8> %phi } +define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f64_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB85_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 8 +; SI-NEXT: s_lshr_b32 s8, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB85_4 +; SI-NEXT: .LBB85_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[16:17], 1.0 +; SI-NEXT: v_alignbit_b32 v11, v17, v16, 24 +; SI-NEXT: v_alignbit_b32 v10, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v17, v16, 8 +; SI-NEXT: v_alignbit_b32 v3, v19, v18, 24 +; SI-NEXT: v_alignbit_b32 v2, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; SI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; SI-NEXT: s_branch .LBB85_5 +; SI-NEXT: .LBB85_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: s_branch .LBB85_2 +; SI-NEXT: .LBB85_4: +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: .LBB85_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v18 +; SI-NEXT: v_mov_b32_e32 v4, v19 +; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: v_mov_b32_e32 v12, v17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB85_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s15, s19, 24 +; VI-NEXT: s_lshr_b32 s14, s19, 16 +; VI-NEXT: s_lshr_b32 s13, s19, 8 +; VI-NEXT: s_lshr_b32 s21, s18, 16 +; VI-NEXT: s_lshr_b32 s20, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s11, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s23, s16, 16 +; VI-NEXT: s_lshr_b32 s22, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB85_4 +; VI-NEXT: .LBB85_2: ; 
%cmp.true +; VI-NEXT: v_add_f64 v[16:17], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[18:19], s[16:17], 1.0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: s_branch .LBB85_5 +; VI-NEXT: .LBB85_3: +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: s_branch .LBB85_2 +; VI-NEXT: .LBB85_4: +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v2, s23 +; VI-NEXT: v_mov_b32_e32 v1, s22 +; VI-NEXT: v_mov_b32_e32 v10, s21 +; VI-NEXT: v_mov_b32_e32 v9, s20 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v15, s15 +; VI-NEXT: v_mov_b32_e32 v14, s14 +; VI-NEXT: v_mov_b32_e32 v13, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v6, s11 +; VI-NEXT: v_mov_b32_e32 v5, s10 +; VI-NEXT: .LBB85_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s15, s19, 24 +; GFX9-NEXT: s_lshr_b32 s14, s19, 16 +; GFX9-NEXT: s_lshr_b32 s13, s19, 8 +; GFX9-NEXT: s_lshr_b32 s21, s18, 16 +; GFX9-NEXT: s_lshr_b32 s20, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s11, s17, 16 +; GFX9-NEXT: s_lshr_b32 s10, s17, 8 +; GFX9-NEXT: s_lshr_b32 s23, s16, 16 +; GFX9-NEXT: s_lshr_b32 s22, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB85_4 +; GFX9-NEXT: .LBB85_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[16:17], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], s[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: s_branch .LBB85_5 +; GFX9-NEXT: .LBB85_3: +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: s_branch .LBB85_2 +; GFX9-NEXT: .LBB85_4: +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; 
GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-NEXT: v_mov_b32_e32 v10, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s15 +; GFX9-NEXT: v_mov_b32_e32 v14, s14 +; GFX9-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-NEXT: .LBB85_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v2f64_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-TRUE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, 
v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB85_5 +; GFX11-TRUE16-NEXT: .LBB85_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB85_2 +; GFX11-TRUE16-NEXT: .LBB85_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s13 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v5.l, s14 +; GFX11-TRUE16-NEXT: .LBB85_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2f64_to_v16i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-FAKE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-FAKE16-NEXT: s_branch .LBB85_5 +; GFX11-FAKE16-NEXT: .LBB85_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB85_2 +; GFX11-FAKE16-NEXT: .LBB85_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v17, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v1, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v9, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v14, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s6 :: v_dual_mov_b32 v6, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s14 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s12 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, s11 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s9 +; GFX11-FAKE16-NEXT: .LBB85_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <16 x i8> + br label %end + +end: + 
%phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v2f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB43_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB43_4 -; GCN-NEXT: .LBB43_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB43_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v0, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_and_b32_e32 v4, 
0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_2 -; GCN-NEXT: .LBB43_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v19, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v21, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; 
GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v8 -; GCN-NEXT: v_or_b32_e32 v7, v9, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB86_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB86_4 +; SI-NEXT: .LBB86_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB86_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, 
v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_2 +; SI-NEXT: .LBB86_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v2f64: ; VI: ; %bb.0: @@ -8316,14 +16541,14 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: s_cbranch_execnz .LBB86_3 ; VI-NEXT: ; %bb.1: ; %Flow ; 
VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_4 -; VI-NEXT: .LBB43_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB86_4 +; VI-NEXT: .LBB86_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB43_3: ; %cmp.false +; VI-NEXT: .LBB86_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -8353,8 +16578,8 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB43_2 -; VI-NEXT: .LBB43_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB86_2 +; VI-NEXT: .LBB86_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -8404,14 +16629,14 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: s_cbranch_execnz .LBB86_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_4 -; GFX9-NEXT: .LBB43_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB86_4 +; GFX9-NEXT: .LBB86_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB43_3: ; %cmp.false +; GFX9-NEXT: .LBB86_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -8441,8 +16666,8 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB43_2 -; GFX9-NEXT: .LBB43_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB86_2 +; GFX9-NEXT: .LBB86_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -8496,14 +16721,14 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-TRUE16-NEXT: .LBB43_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-TRUE16-NEXT: .LBB86_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -8555,8 +16780,8 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-TRUE16-NEXT: .LBB43_4: ; %cmp.true +; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -8626,14 +16851,14 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-FAKE16-NEXT: .LBB43_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-FAKE16-NEXT: .LBB86_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -8680,8 +16905,8 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-FAKE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-FAKE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -8746,74 +16971,524 @@ end: ret <2 x double> %phi } +define inreg <2 x double> @bitcast_v16i8_to_v2f64_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v2f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; SI-NEXT: s_cbranch_scc0 .LBB87_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 
s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB87_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; SI-NEXT: s_branch .LBB87_2 +; +; VI-LABEL: bitcast_v16i8_to_v2f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: s_cbranch_scc0 .LBB87_4 +; VI-NEXT: ; %bb.1: ; 
%cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_cbranch_execnz .LBB87_3 +; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; 
VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: .LBB87_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB87_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB87_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v2f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: 
s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB87_3 +; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, 
s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB87_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB87_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB87_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v2f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, 
s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-NEXT: .LBB87_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; 
GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB87_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB87_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB87_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v16, v7 -; GCN-NEXT: v_mov_b32_e32 v9, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v11, v4 -; GCN-NEXT: 
v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v13, v2 -; GCN-NEXT: v_mov_b32_e32 v14, v1 -; GCN-NEXT: v_mov_b32_e32 v15, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB44_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB44_4 -; GCN-NEXT: .LBB44_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB44_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_2 -; GCN-NEXT: .LBB44_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v11, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB88_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB88_4 +; SI-NEXT: .LBB88_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB88_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: .LBB88_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v8f16: ; VI: ; %bb.0: @@ -8822,7 +17497,7 @@ define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: s_cbranch_execz .LBB88_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 3 ; VI-NEXT: v_add_u16_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -8837,7 +17512,7 @@ define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v7 ; VI-NEXT: v_or_b32_e32 v1, v1, v6 ; VI-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NEXT: .LBB44_2: ; %end +; VI-NEXT: .LBB88_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8890,61 +17565,204 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v8i16_to_v8f16_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB89_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; 
SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: s_cbranch_execnz .LBB89_3 +; SI-NEXT: .LBB89_2: ; %cmp.true +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: .LBB89_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB89_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB89_2 +; +; VI-LABEL: bitcast_v8i16_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB89_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB89_3 +; VI-NEXT: .LBB89_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, 
s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB89_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB89_4: +; VI-NEXT: s_branch .LBB89_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB89_4 +; GFX9-NEXT: .LBB89_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB89_3: +; GFX9-NEXT: s_branch .LBB89_2 +; GFX9-NEXT: .LBB89_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB89_4 +; GFX11-NEXT: .LBB89_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; 
GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB89_3: +; GFX11-NEXT: s_branch .LBB89_2 +; GFX11-NEXT: .LBB89_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB45_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 
-; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v8 -; GCN-NEXT: v_or_b32_e32 v2, v2, v9 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: .LBB45_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB90_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 
v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: .LBB90_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v8i16: ; VI: ; %bb.0: @@ -8953,7 +17771,7 @@ define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB45_2 +; VI-NEXT: s_cbranch_execz .LBB90_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 0x200 ; VI-NEXT: v_add_f16_e32 v4, 0x200, v0 @@ -8968,42 +17786,198 @@ define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v7, v2 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: .LBB45_2: ; %end +; VI-NEXT: .LBB90_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v8f16_to_v8i16: +; GFX9-LABEL: bitcast_v8f16_to_v8i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v8i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + +define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; 
SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: .LBB91_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB91_4: +; SI-NEXT: s_branch .LBB91_2 +; +; VI-LABEL: bitcast_v8f16_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB91_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB91_4 
+; VI-NEXT: .LBB91_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_add_f16_e32 v4, s16, v0 +; VI-NEXT: v_add_f16_sdwa v5, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v6, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v7, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v0, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v0 +; VI-NEXT: v_or_b32_e32 v2, v2, v7 +; VI-NEXT: v_or_b32_e32 v1, v1, v6 +; VI-NEXT: v_or_b32_e32 v0, v4, v5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB91_3: +; VI-NEXT: s_branch .LBB91_2 +; VI-NEXT: .LBB91_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v8i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_movk_i32 s6, 0x200 -; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB91_4 +; GFX9-NEXT: .LBB91_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB91_3: +; GFX9-NEXT: s_branch .LBB91_2 +; GFX9-NEXT: .LBB91_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v8f16_to_v8i16: +; GFX11-LABEL: bitcast_v8f16_to_v8i16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX11-NEXT: .LBB91_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: 
.LBB91_3: +; GFX11-NEXT: s_branch .LBB91_2 +; GFX11-NEXT: .LBB91_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9023,70 +17997,70 @@ end: } define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v11, v6 -; GCN-NEXT: v_mov_b32_e32 v12, v4 -; GCN-NEXT: v_mov_b32_e32 v9, v2 -; GCN-NEXT: v_mov_b32_e32 v10, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_4 -; GCN-NEXT: .LBB46_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB46_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB46_2 -; GCN-NEXT: .LBB46_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v9 -; 
GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v7, v0 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v6 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_4 +; SI-NEXT: .LBB92_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB92_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_2 +; SI-NEXT: .LBB92_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v8bf16: ; VI: ; %bb.0: @@ -9095,7 +18069,7 @@ define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: s_cbranch_execz .LBB92_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 3 ; VI-NEXT: v_add_u16_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD @@ -9110,7 +18084,7 @@ define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v7 ; VI-NEXT: v_or_b32_e32 v1, v1, v6 ; VI-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NEXT: .LBB46_2: ; %end +; VI-NEXT: .LBB92_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9163,86 +18137,249 @@ end: ret <8 x bfloat> %phi } +define inreg <8 x bfloat> @bitcast_v8i16_to_v8bf16_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB93_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_lshl_b32 s9, s19, 16 +; SI-NEXT: s_lshl_b32 s10, s20, 16 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_lshl_b32 s13, s22, 16 +; SI-NEXT: s_lshl_b32 s12, s23, 16 +; SI-NEXT: s_cbranch_execnz .LBB93_3 +; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xffff +; SI-NEXT: s_lshl_b32 s6, s21, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s19, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s8, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s11, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s5, 16 
+; SI-NEXT: s_and_b32 s12, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s4, 16 +; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: s_branch .LBB93_2 +; +; VI-LABEL: bitcast_v8i16_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB93_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB93_3 +; VI-NEXT: .LBB93_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB93_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB93_4: +; 
VI-NEXT: s_branch .LBB93_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB93_4 +; GFX9-NEXT: .LBB93_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB93_3: +; GFX9-NEXT: s_branch .LBB93_2 +; GFX9-NEXT: .LBB93_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB93_4 +; GFX11-NEXT: .LBB93_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB93_3: +; GFX11-NEXT: s_branch .LBB93_2 +; GFX11-NEXT: .LBB93_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <8 x bfloat> + br label %end + +end: 
+ %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB47_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB47_4 -; GCN-NEXT: .LBB47_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB47_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; 
GCN-NEXT: s_cbranch_execz .LBB47_2 -; GCN-NEXT: .LBB47_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_alignbit_b32 v0, v9, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v10, v2, 16 -; GCN-NEXT: v_alignbit_b32 v6, v7, v8, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v5, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v11, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v8i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: 
$vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB94_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB94_4 +; SI-NEXT: .LBB94_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB94_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_2 +; SI-NEXT: .LBB94_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: 
v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v8i16: ; VI: ; %bb.0: @@ -9251,7 +18388,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -9326,7 +18463,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v5, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; VI-NEXT: .LBB47_2: ; %end +; VI-NEXT: .LBB94_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9337,7 +18474,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -9401,7 +18538,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v6, v2, s6 ; GFX9-NEXT: v_perm_b32 v1, v5, v1, s6 ; 
GFX9-NEXT: v_perm_b32 v0, v4, v0, s6 -; GFX9-NEXT: .LBB47_2: ; %end +; GFX9-NEXT: .LBB94_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9413,7 +18550,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 @@ -9492,7 +18629,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v6, 16, v5 -; GFX11-TRUE16-NEXT: .LBB47_2: ; %end +; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9504,7 +18641,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9569,7 +18706,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB47_2: ; %end +; GFX11-FAKE16-NEXT: .LBB94_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9589,109 +18726,473 @@ end: ret <8 x i16> %phi } +define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: s_cbranch_scc0 .LBB95_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; SI-NEXT: s_cbranch_execnz .LBB95_3 +; SI-NEXT: .LBB95_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; 
SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB95_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB95_2 +; +; VI-LABEL: bitcast_v8bf16_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB95_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB95_4 +; VI-NEXT: .LBB95_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, 
v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16 +; VI-NEXT: 
v_alignbit_b32 v1, v6, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB95_3: +; VI-NEXT: s_branch .LBB95_2 +; VI-NEXT: .LBB95_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB95_4 +; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v3 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX9-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v3 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v3, v3, v8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX9-NEXT: v_and_or_b32 v2, v2, v8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_and_or_b32 v0, v4, v8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB95_3: +; GFX9-NEXT: s_branch .LBB95_2 +; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 
v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v8i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-NEXT: .LBB95_2: ; %cmp.true +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4 +; GFX11-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, 
v4 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v4, v10, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v9 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v5 +; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, 
v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v5, v4 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v3, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v6, v8 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB95_3: +; GFX11-NEXT: s_branch .LBB95_2 +; GFX11-NEXT: .LBB95_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i16_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v7 -; GCN-NEXT: v_mov_b32_e32 v16, v6 -; GCN-NEXT: v_mov_b32_e32 v17, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 
-; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB48_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB48_4 -; GCN-NEXT: .LBB48_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB48_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 -; GCN-NEXT: v_bfe_u32 v7, v21, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v24 -; GCN-NEXT: v_or_b32_e32 v4, v1, v25 -; GCN-NEXT: v_or_b32_e32 v8, v2, v22 -; GCN-NEXT: v_or_b32_e32 v12, v3, v23 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_bfe_u32 v15, v20, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz 
.LBB48_2 -; GCN-NEXT: .LBB48_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v18 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_or_b32_e32 v2, v24, v2 -; GCN-NEXT: v_or_b32_e32 v3, v25, v3 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v3 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i16_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v7 +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 
+; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB96_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB96_4 +; SI-NEXT: .LBB96_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB96_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v8, v5, v24 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_or_b32_e32 v4, v1, v23 +; SI-NEXT: v_or_b32_e32 v12, v5, v22 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v20 +; SI-NEXT: v_bfe_u32 v7, v21, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v20, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_2 +; SI-NEXT: 
.LBB96_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i16_to_v16i8: ; VI: ; %bb.0: @@ -9717,7 +19218,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: s_cbranch_execz .LBB96_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v19 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v19 @@ -9733,9 +19234,9 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v21, v19 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 3 ; VI-NEXT: v_add_u16_sdwa v6, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9762,7 +19263,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v0 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v16 ; VI-NEXT: v_mov_b32_e32 v1, v20 @@ -9792,7 +19293,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -9806,9 +19307,9 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: .LBB96_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: s_cbranch_execz .LBB96_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -9826,7 +19327,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: .LBB96_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, 
v19 @@ -9850,7 +19351,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -9860,9 +19361,9 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB96_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] @@ -9878,7 +19379,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB48_4: ; %end +; GFX11-TRUE16-NEXT: .LBB96_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -9911,7 +19412,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -9925,9 +19426,9 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB96_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] @@ -9947,7 +19448,7 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 @@ -9971,140 +19472,569 @@ end: ret <16 x i8> %phi } +define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i16_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB97_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: v_alignbit_b32 v3, s8, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, 
s8, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s8, v0, 8 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_alignbit_b32 v11, s9, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s9, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s9, v0, 8 +; SI-NEXT: s_lshr_b32 s10, s8, 8 +; SI-NEXT: s_lshr_b32 s13, s9, 8 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_and_b32 s14, s23, 0xffff +; SI-NEXT: s_bfe_u32 s12, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s15, s23, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB97_3 +; SI-NEXT: .LBB97_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v3, s8, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s8, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s8, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_alignbit_b32 v11, s9, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s9, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s9, v0, 8 +; SI-NEXT: s_lshr_b32 s12, s8, 24 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_lshr_b32 s10, s8, 8 +; SI-NEXT: s_lshr_b32 s15, s9, 24 +; SI-NEXT: s_lshr_b32 s14, s9, 16 +; SI-NEXT: s_lshr_b32 s13, s9, 8 +; SI-NEXT: .LBB97_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s11 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 
v8, s7 +; SI-NEXT: v_mov_b32_e32 v12, s9 +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB97_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: s_branch .LBB97_2 +; +; VI-LABEL: bitcast_v8i16_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB97_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s12, s19, 8 +; VI-NEXT: s_lshr_b32 s13, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s21, s17, 8 +; VI-NEXT: s_lshr_b32 s22, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB97_3 +; VI-NEXT: .LBB97_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, 
s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s12, s19, 8 +; VI-NEXT: s_lshr_b32 s13, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s21, s17, 8 +; VI-NEXT: s_lshr_b32 s22, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: .LBB97_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s22 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s13 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s12 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB97_4: +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB97_2 +; +; GFX9-LABEL: bitcast_v8i16_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB97_3 +; 
GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s13, s19, 8 +; GFX9-NEXT: s_lshr_b32 s12, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s22, s17, 8 +; GFX9-NEXT: s_lshr_b32 s21, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB97_4 +; GFX9-NEXT: .LBB97_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v19, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: s_branch .LBB97_5 +; GFX9-NEXT: .LBB97_3: +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB97_2 +; GFX9-NEXT: .LBB97_4: +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: 
v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s21 +; GFX9-NEXT: v_mov_b32_e32 v5, s22 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s12 +; GFX9-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-NEXT: v_mov_b32_e32 v14, s11 +; GFX9-NEXT: v_mov_b32_e32 v15, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB97_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v8i16_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB97_5 +; GFX11-TRUE16-NEXT: .LBB97_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB97_2 +; GFX11-TRUE16-NEXT: .LBB97_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s13 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB97_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v8i16_to_v16i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-FAKE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 
24, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-FAKE16-NEXT: s_branch .LBB97_5 +; GFX11-FAKE16-NEXT: .LBB97_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: s_branch .LBB97_2 +; GFX11-FAKE16-NEXT: .LBB97_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v17, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s18 :: v_dual_mov_b32 v2, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s14 :: v_dual_mov_b32 v10, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v14, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s12 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s9 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; 
GFX11-FAKE16-NEXT: .LBB97_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v8i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v9 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v1, v1, v20 -; GCN-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v4, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v7, v7, v23 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v3, v16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v17, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v8, v18, v5 -; GCN-NEXT: v_or_b32_e32 v5, v19, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v11, v1, v3 -; GCN-NEXT: v_or_b32_e32 v15, v4, v8 -; GCN-NEXT: v_or_b32_e32 v9, v0, v2 -; GCN-NEXT: v_or_b32_e32 v13, v6, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v1, v11, v2, 16 -; GCN-NEXT: v_alignbit_b32 v5, v15, v5, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: .LBB49_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v10 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: 
v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v5, v21, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v20, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; GCN-NEXT: v_or_b32_e32 v3, v19, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_or_b32_e32 v7, v18, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_or_b32_e32 v2, v17, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v6, v16, v6 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_or_b32_e32 v3, v7, v5 -; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_or_b32_e32 v2, v6, v4 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v2 -; GCN-NEXT: v_alignbit_b32 v1, v11, v9, 16 -; GCN-NEXT: v_alignbit_b32 v5, v15, v13, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-NEXT: .LBB49_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v9 -; GCN-NEXT: v_mov_b32_e32 v2, v11 -; GCN-NEXT: v_mov_b32_e32 v4, v13 -; GCN-NEXT: v_mov_b32_e32 v6, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v8i16: +; SI: 
; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v16, v2 +; SI-NEXT: v_or_b32_e32 v2, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_or_b32_e32 v4, v21, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v7, v15, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v5, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v13, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_or_b32_e32 v10, v23, v5 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v10, 16 +; SI-NEXT: v_or_b32_e32 v4, v0, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: .LBB98_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v14 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: 
v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v0 +; SI-NEXT: v_alignbit_b32 v1, v2, v13, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: .LBB98_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v13 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8i16: ; VI: ; %bb.0: @@ -10123,14 +20053,14 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: s_cbranch_execnz .LBB98_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB49_4 -; VI-NEXT: .LBB49_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB98_4 +; VI-NEXT: .LBB98_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB49_3: ; %cmp.false +; VI-NEXT: .LBB98_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -10160,8 +20090,8 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_2 -; VI-NEXT: .LBB49_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB98_2 +; VI-NEXT: .LBB98_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -10211,14 +20141,14 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: s_cbranch_execnz .LBB98_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB49_4 -; GFX9-NEXT: .LBB49_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB98_4 +; GFX9-NEXT: .LBB98_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB49_3: ; %cmp.false +; GFX9-NEXT: .LBB98_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -10248,8 +20178,8 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB49_2 -; GFX9-NEXT: .LBB49_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB98_2 +; GFX9-NEXT: .LBB98_4: ; 
%cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -10303,14 +20233,14 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-TRUE16-NEXT: .LBB49_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-TRUE16-NEXT: .LBB98_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -10362,8 +20292,8 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-TRUE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -10433,14 +20363,14 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_3 +; 
GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-FAKE16-NEXT: .LBB49_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-FAKE16-NEXT: .LBB98_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -10487,8 +20417,8 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-FAKE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-FAKE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -10553,90 +20483,561 @@ end: ret <8 x i16> %phi } +define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v8i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: s_cbranch_scc0 .LBB99_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 
s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_or_b32 s4, s7, s4 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s8, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s6, 24 +; SI-NEXT: s_or_b32 s13, s11, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s27, 24 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s11, s16, 0xff +; SI-NEXT: s_lshl_b32 s12, s17, 8 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_or_b32 s11, s11, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s12, s25, 8 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: v_alignbit_b32 v1, s10, v0, 16 +; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_alignbit_b32 v5, s7, v0, 16 +; SI-NEXT: s_or_b32 s9, s4, s9 +; SI-NEXT: s_lshr_b32 s12, s5, 16 +; SI-NEXT: s_lshr_b32 s13, s13, 16 +; SI-NEXT: s_cbranch_execnz .LBB99_3 +; SI-NEXT: .LBB99_2: ; %cmp.true +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s5, s6, 24 +; SI-NEXT: s_and_b32 s6, s8, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 
0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s11, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s10, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_alignbit_b32 v1, s10, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_alignbit_b32 v5, s7, v0, 16 +; SI-NEXT: s_lshr_b32 s12, s10, 16 +; SI-NEXT: s_lshr_b32 s13, s7, 16 +; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s12 +; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB99_4: +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: s_branch .LBB99_2 +; +; VI-LABEL: bitcast_v16i8_to_v8i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; 
VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: s_cbranch_scc0 .LBB99_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_cbranch_execnz .LBB99_3 +; VI-NEXT: .LBB99_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 
+; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: .LBB99_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB99_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB99_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v8i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: 
s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB99_3 +; GFX9-NEXT: .LBB99_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, 
s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB99_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB99_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB99_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v8i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; 
GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-NEXT: .LBB99_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: 
s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB99_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB99_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB99_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB50_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB50_4 -; GCN-NEXT: .LBB50_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB50_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_2 -; GCN-NEXT: .LBB50_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v11 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v5, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v8 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v7 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB100_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB100_4 +; SI-NEXT: .LBB100_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB100_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: .LBB100_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; 
SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v8bf16: ; VI: ; %bb.0: @@ -10645,7 +21046,7 @@ define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: s_cbranch_execz .LBB100_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 0x200 ; VI-NEXT: v_add_f16_e32 v4, 0x200, v0 @@ -10660,7 +21061,7 @@ define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v7, v2 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: .LBB100_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -10714,98 +21115,268 @@ end: ret <8 x bfloat> %phi } +define inreg <8 x bfloat> @bitcast_v8f16_to_v8bf16_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s23 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: .LBB101_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB101_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB101_2 +; 
+; VI-LABEL: bitcast_v8f16_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB101_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB101_4 +; VI-NEXT: .LBB101_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_add_f16_e32 v4, s16, v0 +; VI-NEXT: v_add_f16_sdwa v5, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v6, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v7, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v0, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v0 +; VI-NEXT: v_or_b32_e32 v2, v2, v7 +; VI-NEXT: v_or_b32_e32 v1, v1, v6 +; VI-NEXT: v_or_b32_e32 v0, v4, v5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB101_3: +; VI-NEXT: s_branch .LBB101_2 +; VI-NEXT: .LBB101_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB101_4 +; GFX9-NEXT: .LBB101_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB101_3: +; GFX9-NEXT: s_branch .LBB101_2 +; GFX9-NEXT: .LBB101_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX11-NEXT: .LBB101_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB101_3: +; GFX11-NEXT: s_branch .LBB101_2 +; GFX11-NEXT: .LBB101_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, 
v0 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB51_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB51_4 -; GCN-NEXT: .LBB51_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB51_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_2 -; GCN-NEXT: 
.LBB51_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; 
implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB102_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB102_4 +; SI-NEXT: .LBB102_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB102_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: .LBB102_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, 
v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v8f16: ; VI: ; %bb.0: @@ -10814,7 +21385,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: s_cbranch_execz .LBB102_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -10889,7 +21460,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v6, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v5, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -10900,7 +21471,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: s_cbranch_execz .LBB102_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -10964,7 +21535,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v6, v2, s6 ; GFX9-NEXT: v_perm_b32 v1, v5, v1, s6 ; GFX9-NEXT: v_perm_b32 v0, v4, v0, s6 -; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: .LBB102_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10976,7 +21547,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11051,7 +21622,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v7, v3 ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v4 -; GFX11-TRUE16-NEXT: .LBB51_2: ; %end +; GFX11-TRUE16-NEXT: .LBB102_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11063,7 +21634,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-FAKE16-NEXT: 
s_cbranch_execz .LBB102_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11128,7 +21699,7 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB51_2: ; %end +; GFX11-FAKE16-NEXT: .LBB102_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11148,115 +21719,503 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s23 +; SI-NEXT: s_cbranch_scc0 .LBB103_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: 
s_cbranch_execnz .LBB103_3 +; SI-NEXT: .LBB103_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB103_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB103_2 +; +; VI-LABEL: bitcast_v8bf16_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB103_3 +; 
VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB103_4 +; VI-NEXT: .LBB103_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v7, vcc, v7, v3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16 +; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16 +; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB103_3: +; VI-NEXT: s_branch .LBB103_2 +; VI-NEXT: .LBB103_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB103_4 +; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v3 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX9-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v3 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: 
v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB103_3: +; GFX9-NEXT: s_branch .LBB103_2 +; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-NEXT: .LBB103_2: ; %cmp.true +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; 
GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s4 +; GFX11-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX11-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v10, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v9 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-NEXT: v_bfe_u32 v3, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v5 +; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; 
GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v9 +; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v6, 16, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB103_3: +; GFX11-NEXT: s_branch .LBB103_2 +; GFX11-NEXT: .LBB103_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f16_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v6 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v9 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: 
$vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB52_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB52_4 -; GCN-NEXT: .LBB52_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB52_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v17, v0 -; GCN-NEXT: v_or_b32_e32 v4, v16, v1 -; GCN-NEXT: v_or_b32_e32 v8, v19, v2 -; GCN-NEXT: v_or_b32_e32 v12, v18, v3 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_bfe_u32 v15, v14, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_2 -; GCN-NEXT: .LBB52_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 
0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: v_or_b32_e32 v8, v1, v0 -; GCN-NEXT: v_or_b32_e32 v12, v2, v9 -; GCN-NEXT: v_or_b32_e32 v0, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v5, v10 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_bfe_u32 v15, v14, 8, 8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f16_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; 
implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB104_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB104_4 +; SI-NEXT: .LBB104_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB104_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v8, v20, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_or_b32_e32 v4, v16, v1 +; SI-NEXT: v_or_b32_e32 v12, v19, v5 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_2 +; SI-NEXT: .LBB104_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: 
v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v8, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f16_to_v16i8: ; VI: ; %bb.0: @@ -11280,7 +22239,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: s_cbranch_execz .LBB104_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -11290,9 +22249,9 @@ define <16 x i8> 
@bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB52_2: ; %Flow +; VI-NEXT: .LBB104_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: s_cbranch_execz .LBB104_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 0x200 ; VI-NEXT: v_add_f16_sdwa v6, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -11319,7 +22278,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: .LBB104_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 ; VI-NEXT: v_mov_b32_e32 v4, v19 @@ -11349,7 +22308,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: s_cbranch_execz .LBB104_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -11363,9 +22322,9 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB52_2: ; %Flow +; GFX9-NEXT: .LBB104_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: s_cbranch_execz .LBB104_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] @@ -11384,7 +22343,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; 
GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB52_4: ; %end +; GFX9-NEXT: .LBB104_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, v19 @@ -11408,7 +22367,7 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 @@ -11418,79 +22377,481 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB52_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB104_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] +; GFX11-TRUE16-NEXT: 
v_lshrrev_b64 v[3:4], 24, v[10:11] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; GFX11-TRUE16-NEXT: .LBB104_4: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v17.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v8f16_to_v16i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-FAKE16-NEXT: .LBB104_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-FAKE16-NEXT: .LBB104_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 
v8, v16 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + +define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f16_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s22 +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v8, v20, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_or_b32_e32 v4, v16, v1 +; SI-NEXT: v_or_b32_e32 v12, v19, v5 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: .LBB105_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; 
SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v8, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB105_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 
+; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB105_2 +; +; VI-LABEL: bitcast_v8f16_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB105_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s19, 24 +; VI-NEXT: s_lshr_b32 s20, s19, 16 +; VI-NEXT: s_lshr_b32 s11, s19, 8 +; VI-NEXT: s_lshr_b32 s21, s18, 16 +; VI-NEXT: s_lshr_b32 s13, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s22, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s23, s16, 16 +; VI-NEXT: s_lshr_b32 s12, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB105_4 +; VI-NEXT: .LBB105_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_e32 v6, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_f16_e32 v17, s17, v1 +; VI-NEXT: v_add_f16_e32 v2, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v19, v17, v0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, s16, v1 +; VI-NEXT: v_add_f16_e32 v14, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v18, v0, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; VI-NEXT: v_add_f16_e32 v16, s19, v1 +; VI-NEXT: v_add_f16_e32 v10, s4, v1 +; VI-NEXT: v_or_b32_e32 v21, v16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; VI-NEXT: v_add_f16_e32 v8, s18, v1 +; VI-NEXT: v_or_b32_e32 v20, v8, v3 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[20:21] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: v_bfe_u32 v15, v14, 
8, 8 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: s_branch .LBB105_5 +; VI-NEXT: .LBB105_3: +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB105_2 +; VI-NEXT: .LBB105_4: +; VI-NEXT: v_mov_b32_e32 v2, s23 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v10, s21 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v17, s17 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v15, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v9, s13 +; VI-NEXT: v_mov_b32_e32 v13, s11 +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: v_mov_b32_e32 v5, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB105_5: ; %end +; VI-NEXT: v_mov_b32_e32 v4, v17 +; VI-NEXT: v_mov_b32_e32 v12, v16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s19, 24 +; GFX9-NEXT: s_lshr_b32 s11, s19, 16 +; GFX9-NEXT: s_lshr_b32 s13, s19, 8 +; GFX9-NEXT: s_lshr_b32 s12, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s15, s17, 24 +; GFX9-NEXT: s_lshr_b32 s20, s17, 16 +; GFX9-NEXT: s_lshr_b32 s22, s17, 8 +; GFX9-NEXT: s_lshr_b32 s21, s16, 16 +; GFX9-NEXT: s_lshr_b32 s23, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB105_4 
+; GFX9-NEXT: .LBB105_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v19, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: s_branch .LBB105_5 +; GFX9-NEXT: .LBB105_3: +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB105_2 +; GFX9-NEXT: .LBB105_4: +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-NEXT: v_mov_b32_e32 v2, s21 +; GFX9-NEXT: v_mov_b32_e32 v5, s22 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v10, s12 +; GFX9-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-NEXT: v_mov_b32_e32 v14, s11 +; GFX9-NEXT: v_mov_b32_e32 v15, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; 
GFX9-NEXT: .LBB105_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v8f16_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-TRUE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[16:17] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB52_4: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v17.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB105_5 +; GFX11-TRUE16-NEXT: .LBB105_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB105_2 +; GFX11-TRUE16-NEXT: .LBB105_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB105_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v8f16_to_v16i8: +; GFX11-FAKE16-LABEL: bitcast_v8f16_to_v16i8_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr9 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; 
GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB105_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-FAKE16-NEXT: .LBB105_2: ; %cmp.true 
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, s0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 @@ -11505,8 +22866,33 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB52_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_branch .LBB105_5 +; GFX11-FAKE16-NEXT: .LBB105_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: s_branch .LBB105_2 +; GFX11-FAKE16-NEXT: .LBB105_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v17, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s18 :: v_dual_mov_b32 v2, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s14 :: v_dual_mov_b32 v10, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v14, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s12 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 
v15, s9 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB105_5: ; %end ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v16 @@ -11530,122 +22916,122 @@ end: } define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v0, v16 -; GCN-NEXT: v_or_b32_e32 v1, v1, v18 -; GCN-NEXT: v_or_b32_e32 v2, v2, v19 -; GCN-NEXT: v_or_b32_e32 v3, v3, v20 -; GCN-NEXT: v_or_b32_e32 v4, v4, v21 -; GCN-NEXT: v_or_b32_e32 v5, v5, v22 -; GCN-NEXT: v_or_b32_e32 v6, v6, v23 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 
v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: .LBB53_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v14 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v17 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v0, v15, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_or_b32_e32 v3, v22, v3 -; GCN-NEXT: v_or_b32_e32 v5, v21, v5 -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: v_or_b32_e32 v4, v19, v4 -; GCN-NEXT: v_or_b32_e32 v2, v18, v2 -; GCN-NEXT: 
v_or_b32_e32 v7, v16, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x300, v0 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v8 -; GCN-NEXT: .LBB53_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v11 -; GCN-NEXT: v_mov_b32_e32 v4, v9 -; GCN-NEXT: v_mov_b32_e32 v6, v13 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; 
SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_or_b32_e32 v4, v4, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: .LBB106_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; 
SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: .LBB106_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v9 +; SI-NEXT: v_mov_b32_e32 v6, v13 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8f16: ; VI: ; %bb.0: @@ -11664,14 +23050,14 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: s_cbranch_execnz .LBB106_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_4 -; VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB106_4 +; VI-NEXT: .LBB106_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB53_3: ; %cmp.false +; VI-NEXT: 
.LBB106_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -11701,8 +23087,8 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB53_2 -; VI-NEXT: .LBB53_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB106_2 +; VI-NEXT: .LBB106_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -11752,14 +23138,14 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: s_cbranch_execnz .LBB106_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_4 -; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB106_4 +; GFX9-NEXT: .LBB106_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB53_3: ; %cmp.false +; GFX9-NEXT: .LBB106_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -11789,8 +23175,8 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; 
GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_2 -; GFX9-NEXT: .LBB53_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB106_2 +; GFX9-NEXT: .LBB106_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -11844,14 +23230,14 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-TRUE16-NEXT: .LBB53_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-TRUE16-NEXT: .LBB106_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -11903,8 +23289,8 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-TRUE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -11974,14 +23360,14 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x 
i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-FAKE16-NEXT: .LBB53_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-FAKE16-NEXT: .LBB106_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -12028,8 +23414,8 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-FAKE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-FAKE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -12094,110 +23480,555 @@ end: ret <8 x half> %phi } +define inreg <8 x half> @bitcast_v16i8_to_v8f16_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v8f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s7, v0 +; SI-NEXT: s_cbranch_scc0 .LBB107_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, 
s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_cbranch_execnz .LBB107_3 +; SI-NEXT: .LBB107_2: ; %cmp.true +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s27, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s24, 0xff +; SI-NEXT: s_lshl_b32 s8, s25, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s22, 0xff +; SI-NEXT: s_lshl_b32 s9, s23, 8 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s20, 0xff +; SI-NEXT: s_lshl_b32 s10, s21, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s18, 0xff +; SI-NEXT: s_lshl_b32 s11, 
s19, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s16, 0xff +; SI-NEXT: s_lshl_b32 s12, s17, 8 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: .LBB107_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB107_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB107_2 +; +; VI-LABEL: bitcast_v16i8_to_v8f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: s_cbranch_scc0 .LBB107_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, 
s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_cbranch_execnz .LBB107_3 +; VI-NEXT: .LBB107_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: 
s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: .LBB107_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB107_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB107_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v8f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: 
s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB107_3 +; GFX9-NEXT: .LBB107_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, 
s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB107_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB107_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB107_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v8f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: 
s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-NEXT: .LBB107_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, 
s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB107_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB107_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB107_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v8bf16_to_v16i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; 
implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB54_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB54_4 -; GCN-NEXT: .LBB54_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB54_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v20 -; GCN-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v16, 16 -; GCN-NEXT: v_alignbit_b32 v8, v1, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v14, v19, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_2 -; GCN-NEXT: .LBB54_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v2, 
0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; GCN-NEXT: v_alignbit_b32 v8, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v12, v14, v2, 16 -; GCN-NEXT: v_alignbit_b32 v0, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8bf16_to_v16i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 
+; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB108_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB108_4 +; SI-NEXT: .LBB108_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB108_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_alignbit_b32 v0, v0, v20, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v18, 16 +; SI-NEXT: v_alignbit_b32 v8, v5, v23, 16 +; SI-NEXT: v_alignbit_b32 v12, v14, v21, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: 
s_cbranch_execz .LBB108_2 +; SI-NEXT: .LBB108_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8bf16_to_v16i8: ; VI: ; %bb.0: @@ -12221,7 +24052,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_2 +; VI-NEXT: s_cbranch_execz .LBB108_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; 
VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -12235,9 +24066,9 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB54_2: ; %Flow +; VI-NEXT: .LBB108_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_4 +; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -12324,7 +24155,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; VI-NEXT: .LBB54_4: ; %end +; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 ; VI-NEXT: v_mov_b32_e32 v4, v19 @@ -12354,7 +24185,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_2 +; GFX9-NEXT: s_cbranch_execz .LBB108_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -12368,9 +24199,9 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX9-NEXT: .LBB54_2: ; %Flow +; GFX9-NEXT: .LBB108_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_4 +; GFX9-NEXT: s_cbranch_execz .LBB108_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -12450,7 +24281,7 @@ define <16 x 
i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX9-NEXT: .LBB54_4: ; %end +; GFX9-NEXT: .LBB108_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v18 ; GFX9-NEXT: v_mov_b32_e32 v4, v19 @@ -12481,7 +24312,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v11 @@ -12497,9 +24328,9 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v11.h -; GFX11-TRUE16-NEXT: .LBB54_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -12578,7 +24409,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[2:3] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GFX11-TRUE16-NEXT: .LBB54_4: ; %end +; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h @@ -12612,7 +24443,7 @@ 
define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 @@ -12626,9 +24457,9 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; GFX11-FAKE16-NEXT: .LBB54_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB108_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v19 @@ -12717,7 +24548,7 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v7 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-FAKE16-NEXT: .LBB54_4: ; %end +; GFX11-FAKE16-NEXT: .LBB108_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v18 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v19 @@ -12741,139 +24572,839 @@ end: ret <16 x i8> %phi } +define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8bf16_to_v16i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s18 +; 
SI-NEXT: v_mul_f32_e64 v22, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s22 +; SI-NEXT: s_cbranch_scc0 .LBB109_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_alignbit_b32 v0, v0, v19, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v17, 16 +; SI-NEXT: v_alignbit_b32 v8, v5, v23, 16 +; SI-NEXT: v_alignbit_b32 v12, v14, v21, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: s_cbranch_execnz .LBB109_3 +; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v7, 
0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB109_2 +; +; VI-LABEL: bitcast_v8bf16_to_v16i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s19, 24 +; VI-NEXT: s_lshr_b32 s11, s19, 16 +; VI-NEXT: s_lshr_b32 s13, s19, 8 +; VI-NEXT: s_lshr_b32 s12, s18, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s15, s17, 24 +; VI-NEXT: s_lshr_b32 s20, s17, 16 +; VI-NEXT: s_lshr_b32 s22, s17, 8 +; VI-NEXT: s_lshr_b32 s21, s16, 16 +; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: .LBB109_2: ; %cmp.true +; VI-NEXT: 
s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v19, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v18, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: 
v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v17, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v16, v0, v1, 16 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: s_branch .LBB109_5 +; VI-NEXT: .LBB109_3: +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr20 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: 
s_branch .LBB109_2 +; VI-NEXT: .LBB109_4: +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v1, s23 +; VI-NEXT: v_mov_b32_e32 v2, s21 +; VI-NEXT: v_mov_b32_e32 v5, s22 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v9, s14 +; VI-NEXT: v_mov_b32_e32 v10, s12 +; VI-NEXT: v_mov_b32_e32 v13, s13 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB109_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v16i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s12, s19, 24 +; GFX9-NEXT: s_lshr_b32 s23, s19, 16 +; GFX9-NEXT: s_lshr_b32 s15, s19, 8 +; GFX9-NEXT: s_lshr_b32 s21, s18, 16 +; GFX9-NEXT: s_lshr_b32 s20, s18, 8 +; GFX9-NEXT: s_lshr_b32 s10, s17, 24 +; GFX9-NEXT: s_lshr_b32 s22, s17, 16 +; GFX9-NEXT: s_lshr_b32 s11, s17, 8 +; GFX9-NEXT: s_lshr_b32 s14, s16, 16 +; GFX9-NEXT: s_lshr_b32 s13, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB109_4 +; GFX9-NEXT: .LBB109_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s19 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v3 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v3 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, 
vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_lshl_or_b32 v10, v14, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v3 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: s_branch .LBB109_5 +; GFX9-NEXT: .LBB109_3: +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr22 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr20 +; GFX9-NEXT: ; implicit-def: $sgpr21 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr23 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: s_branch .LBB109_2 +; GFX9-NEXT: .LBB109_4: +; 
GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v14, s23 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v17, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v10, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s12 +; GFX9-NEXT: v_mov_b32_e32 v13, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB109_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v4, v17 +; GFX9-NEXT: v_mov_b32_e32 v12, v16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v16i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0 +; 
GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v7, v9 :: v_dual_and_b32 v1, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v11, v8 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v12, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v11, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, 
v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v6, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v7, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v14, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-TRUE16-NEXT: s_branch .LBB109_5 +; GFX11-TRUE16-NEXT: .LBB109_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB109_2 +; GFX11-TRUE16-NEXT: .LBB109_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v6.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s4 +; GFX11-TRUE16-NEXT: .LBB109_5: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v16i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; 
GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s4, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s2 +; GFX11-FAKE16-NEXT: 
v_bfe_u32 v10, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v10, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v1 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v14, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: s_branch .LBB109_5 +; GFX11-FAKE16-NEXT: .LBB109_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: s_branch .LBB109_2 +; GFX11-FAKE16-NEXT: .LBB109_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s3 :: v_dual_mov_b32 v9, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v1, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v7, s9 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s13 :: v_dual_mov_b32 v5, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB109_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v17 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v16 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i8_to_v8bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v15 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, 
v12 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v11, v1, v0 -; GCN-NEXT: v_or_b32_e32 v15, v18, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v19, v4 -; GCN-NEXT: v_or_b32_e32 v13, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v20, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GCN-NEXT: v_or_b32_e32 v7, v21, v9 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: .LBB55_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v12 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v14 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; 
GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v23, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GCN-NEXT: v_or_b32_e32 v4, v22, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v3 -; GCN-NEXT: v_or_b32_e32 v3, v21, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v7 -; GCN-NEXT: v_or_b32_e32 v7, v20, v8 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v6, v19, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_or_b32_e32 v1, v18, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v5 -; GCN-NEXT: v_or_b32_e32 v4, v6, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v2 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GCN-NEXT: .LBB55_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v11 -; GCN-NEXT: v_mov_b32_e32 v1, v15 -; GCN-NEXT: v_mov_b32_e32 v2, v17 -; GCN-NEXT: v_mov_b32_e32 v4, v13 -; GCN-NEXT: v_mov_b32_e32 v6, v16 -; GCN-NEXT: 
s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i8_to_v8bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v20, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; SI-NEXT: v_or_b32_e32 v11, v4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v5, v21, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v7, v13, v2 +; SI-NEXT: ; implicit-def: $vgpr17 +; 
SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: .LBB110_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; SI-NEXT: .LBB110_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v15 +; SI-NEXT: v_mov_b32_e32 v4, v11 +; SI-NEXT: v_mov_b32_e32 v6, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i8_to_v8bf16: ; VI: ; %bb.0: @@ -12892,14 +25423,14 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: s_cbranch_execnz .LBB110_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB55_4 -; VI-NEXT: .LBB55_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB110_4 +; VI-NEXT: .LBB110_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB55_3: ; %cmp.false +; VI-NEXT: .LBB110_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, 
v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -12929,8 +25460,8 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB55_2 -; VI-NEXT: .LBB55_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB110_2 +; VI-NEXT: .LBB110_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v17 ; VI-NEXT: v_add_u16_e32 v1, 3, v18 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -12980,14 +25511,14 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: s_cbranch_execnz .LBB110_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB55_4 -; GFX9-NEXT: .LBB55_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB110_4 +; GFX9-NEXT: .LBB110_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB55_3: ; %cmp.false +; GFX9-NEXT: .LBB110_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13017,8 +25548,8 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB55_2 -; GFX9-NEXT: .LBB55_4: ; 
%cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB110_2 +; GFX9-NEXT: .LBB110_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -13072,14 +25603,14 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-TRUE16-NEXT: .LBB55_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-TRUE16-NEXT: .LBB110_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l @@ -13131,8 +25662,8 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-TRUE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 @@ -13202,14 +25733,14 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-FAKE16-NEXT: .LBB55_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-FAKE16-NEXT: .LBB110_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v18 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -13256,8 +25787,8 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-FAKE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-FAKE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v17, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v18, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -13321,3 +25852,468 @@ end: %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <8 x bfloat> %phi } + +define inreg <8 x bfloat> @bitcast_v16i8_to_v8bf16_scalar(<16 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i8_to_v8bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s11, v0 +; SI-NEXT: s_cbranch_scc0 .LBB111_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: 
s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_or_b32 s10, s5, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s25, 24 +; SI-NEXT: s_or_b32 s12, s5, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_or_b32 s13, s5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s14, s4, 16 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s9, 24 +; SI-NEXT: s_or_b32 s15, s5, s4 +; SI-NEXT: s_cbranch_execnz .LBB111_3 +; SI-NEXT: .LBB111_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s9, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 
s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s22, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s8, s6, 0x3000000 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s9, s18, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s10, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s13, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s5, 16 +; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s4, 16 +; SI-NEXT: .LBB111_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB111_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: s_branch .LBB111_2 +; +; VI-LABEL: bitcast_v16i8_to_v8bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; 
VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: s_cbranch_scc0 .LBB111_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_cbranch_execnz .LBB111_3 +; VI-NEXT: .LBB111_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; 
VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s11, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: .LBB111_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB111_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB111_2 +; +; GFX9-LABEL: bitcast_v16i8_to_v8bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: 
s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB111_3 +; GFX9-NEXT: .LBB111_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 
s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s11, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: .LBB111_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB111_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB111_2 +; +; GFX11-LABEL: bitcast_v16i8_to_v8bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s16, 
0xff +; GFX11-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s6, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s21, 8 +; GFX11-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-NEXT: s_and_b32 s9, s9, 0xffff +; GFX11-NEXT: s_lshl_b32 s10, s10, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s9, s10 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-NEXT: .LBB111_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; 
GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s21, 8 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s23, 8 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s25, 8 +; GFX11-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s7, s6 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_or_b32 s7, s2, s3 +; GFX11-NEXT: .LBB111_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB111_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB111_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll index 25bf7b2255e5c..c87d52c1e6907 100644 --- 
a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll @@ -1,28 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <5 x float> @bitcast_v5i32_to_v5f32(<5 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i32_to_v5f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5i32_to_v5f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i32_to_v5f32: ; VI: ; %bb.0: @@ -92,24 +92,133 @@ end: ret <5 x float> %phi } +define inreg <5 x float> @bitcast_v5i32_to_v5f32_scalar(<5 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i32_to_v5f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s21, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v5i32_to_v5f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; 
VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v5i32_to_v5f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v5i32_to_v5f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s16 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i32> %a, splat (i32 3) + %a2 = bitcast <5 x i32> %a1 to <5 x float> + br label %end + +cmp.false: + %a3 = bitcast <5 x i32> %a to <5 x float> + br label %end + +end: + 
%phi = phi <5 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x float> %phi +} + define <5 x i32> @bitcast_v5f32_to_v5i32(<5 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f32_to_v5i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5f32_to_v5i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f32_to_v5i32: ; VI: ; %bb.0: @@ -177,50 +286,164 @@ end: ret <5 x i32> %phi } +define inreg <5 x i32> @bitcast_v5f32_to_v5i32_scalar(<5 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f32_to_v5i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s21, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 
v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f32_to_v5i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f32_to_v5i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f32_to_v5i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <5 x float> %a1 to <5 x i32> + br label %end + +cmp.false: + %a3 = bitcast <5 x float> %a to <5 x i32> + br label %end + +end: + %phi = phi <5 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i32> %phi +} + define <10 x i16> @bitcast_v5i32_to_v10i16(<5 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i32_to_v10i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; 
GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5i32_to_v10i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; 
SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i32_to_v10i16: ; VI: ; %bb.0: @@ -290,76 +513,209 @@ end: ret <10 x i16> %phi } +define inreg <10 x i16> @bitcast_v5i32_to_v10i16_scalar(<5 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i32_to_v10i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s21, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s4, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: .LBB5_3: ; 
%end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v5i32_to_v10i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v5i32_to_v10i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; 
GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v5i32_to_v10i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s16 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i32> %a, splat (i32 3) + %a2 = bitcast <5 x i32> %a1 to <10 x i16> + br label %end + +cmp.false: + %a3 = bitcast <5 x i32> %a to <10 x i16> + br label %end + +end: + %phi = phi <10 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i16> %phi +} + define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i16_to_v5i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v13, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz 
.LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v10 -; GCN-NEXT: v_or_b32_e32 v1, v1, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v10, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: 
v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i16_to_v5i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i16_to_v5i32: ; VI: ; %bb.0: @@ -368,7 +724,7 @@ define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 3 ; VI-NEXT: v_add_u16_e32 v5, 3, v4 @@ -386,7 +742,7 @@ define <5 x i32> @bitcast_v10i16_to_v5i32(<10 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v5, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v5, v0 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -441,81 +797,249 @@ end: ret <5 x i32> %phi } +define inreg <5 x i32> @bitcast_v10i16_to_v5i32_scalar(<10 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i16_to_v5i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: 
s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v10i16_to_v5i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; 
%bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v10i16_to_v5i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 
v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i16_to_v5i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i16> %a, splat (i16 3) + %a2 = bitcast <10 x i16> %a1 to <5 x i32> + br label %end + +cmp.false: + %a3 = bitcast <10 x i16> %a to <5 x i32> + br label %end + +end: + %phi = phi <5 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i32> %phi +} + define <10 x half> @bitcast_v5i32_to_v10f16(<5 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i32_to_v10f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v4 -; GCN-NEXT: v_mov_b32_e32 v13, v3 -; GCN-NEXT: v_mov_b32_e32 v12, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v10, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; 
implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 
v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5i32_to_v10f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v13, v3 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; 
SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i32_to_v10f16: ; VI: ; %bb.0: @@ -585,97 +1109,242 @@ end: ret <10 x half> %phi } +define inreg <10 x half> @bitcast_v5i32_to_v10f16_scalar(<5 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i32_to_v10f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s21, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 
16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v5i32_to_v10f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 
+; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v5i32_to_v10f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v5i32_to_v10f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s16 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 
x i32> %a, splat (i32 3) + %a2 = bitcast <5 x i32> %a1 to <10 x half> + br label %end + +cmp.false: + %a3 = bitcast <5 x i32> %a to <10 x half> + br label %end + +end: + %phi = phi <10 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x half> %phi +} + define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f16_to_v5i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v8 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v14, v0 -; GCN-NEXT: v_or_b32_e32 v1, v12, v1 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_or_b32_e32 v3, v6, v3 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: 
$vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v8, v4 -; GCN-NEXT: v_or_b32_e32 v3, v6, v9 -; GCN-NEXT: v_or_b32_e32 v4, v5, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f16_to_v5i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v5i32: ; VI: ; %bb.0: @@ -684,7 +1353,7 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 0x200 ; VI-NEXT: v_add_f16_sdwa v6, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -702,7 +1371,7 @@ define <5 x i32> @bitcast_v10f16_to_v5i32(<10 x half> %a, 
i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v6 ; VI-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -758,50 +1427,236 @@ end: ret <5 x i32> %phi } +define inreg <5 x i32> @bitcast_v10f16_to_v5i32_scalar(<10 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f16_to_v5i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s24 +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 
v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v10f16_to_v5i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; 
VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v5, v1 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f16_to_v5i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 
+; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f16_to_v5i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x half> %a, splat (half 0xH0200) + %a2 = bitcast <10 x half> %a1 to <5 x i32> + br label %end + +cmp.false: + %a3 = bitcast <10 x half> %a to <5 x i32> + br label %end + +end: + %phi = phi <5 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i32> %phi +} + define <10 x i16> @bitcast_v5f32_to_v10i16(<5 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f32_to_v10i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_and_saveexec_b64 
s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_4 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB6_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: .LBB6_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5f32_to_v10i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_4 +; SI-NEXT: .LBB12_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB12_3: ; %cmp.false +; 
SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: .LBB12_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f32_to_v10i16: ; VI: ; %bb.0: @@ -869,76 +1724,217 @@ end: ret <10 x i16> %phi } +define inreg <10 x i16> @bitcast_v5f32_to_v10i16_scalar(<5 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f32_to_v10i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s21, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_4 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: 
s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_branch .LBB13_2 +; SI-NEXT: .LBB13_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f32_to_v10i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_4 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_3: +; VI-NEXT: s_branch .LBB13_2 +; VI-NEXT: .LBB13_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f32_to_v10i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_4 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: 
.LBB13_3: +; GFX9-NEXT: s_branch .LBB13_2 +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f32_to_v10i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: +; GFX11-NEXT: s_branch .LBB13_2 +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <5 x float> %a1 to <10 x i16> + br label %end + +cmp.false: + %a3 = bitcast <5 x float> %a to <10 x i16> + br label %end + +end: + %phi = phi <10 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i16> %phi +} + define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i16_to_v5f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_mov_b32_e32 v13, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v10 -; GCN-NEXT: v_or_b32_e32 v1, v1, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 
0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v10, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i16_to_v5f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; 
SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i16_to_v5f32: ; VI: ; %bb.0: @@ -947,7 +1943,7 @@ define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 3 ; VI-NEXT: v_add_u16_e32 v5, 3, v4 @@ -965,7 +1961,7 @@ define <5 x float> @bitcast_v10i16_to_v5f32(<10 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v5, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v5, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1020,81 +2016,249 @@ end: ret <5 x float> %phi } +define inreg <5 x float> @bitcast_v10i16_to_v5f32_scalar(<10 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i16_to_v5f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: 
s_add_i32 s8, s8, 0x30000 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v10i16_to_v5f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v10i16_to_v5f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i16_to_v5f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i16> %a, splat (i16 3) + %a2 = bitcast <10 x i16> %a1 to <5 x float> + br label %end + +cmp.false: + %a3 = bitcast 
<10 x i16> %a to <5 x float> + br label %end + +end: + %phi = phi <5 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x float> %phi +} + define <10 x half> @bitcast_v5f32_to_v10f16(<5 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f32_to_v10f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v4 -; GCN-NEXT: v_mov_b32_e32 v13, v3 -; GCN-NEXT: v_mov_b32_e32 v12, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v10, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 
-; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5f32_to_v10f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v13, v3 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; 
SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f32_to_v10f16: ; VI: ; %bb.0: @@ -1162,97 +2326,252 @@ end: ret <10 x half> %phi } +define inreg <10 x half> @bitcast_v5f32_to_v10f16_scalar(<5 
x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f32_to_v10f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s21, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: 
$vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v5f32_to_v10f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_4 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_3: +; VI-NEXT: s_branch .LBB17_2 +; VI-NEXT: .LBB17_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f32_to_v10f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_3: +; GFX9-NEXT: s_branch .LBB17_2 +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_v5f32_to_v10f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_4 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: +; GFX11-NEXT: s_branch .LBB17_2 +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <5 x float> %a1 to <10 x half> + br label %end + +cmp.false: + %a3 = bitcast <5 x float> %a to <10 x half> + br label %end + +end: + %phi = phi <10 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x half> %phi +} + define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f16_to_v5f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, 
v8 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v14, v0 -; GCN-NEXT: v_or_b32_e32 v1, v12, v1 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_or_b32_e32 v3, v6, v3 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; 
GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v8, v4 -; GCN-NEXT: v_or_b32_e32 v3, v6, v9 -; GCN-NEXT: v_or_b32_e32 v4, v5, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f16_to_v5f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; 
SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 
0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v5f32: ; VI: ; %bb.0: @@ -1261,7 +2580,7 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 0x200 ; VI-NEXT: v_add_f16_sdwa v6, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1279,7 +2598,7 @@ define <5 x float> @bitcast_v10f16_to_v5f32(<10 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v6 ; VI-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1335,86 +2654,272 @@ end: ret <5 x float> %phi } +define inreg <5 x float> @bitcast_v10f16_to_v5f32_scalar(<10 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f16_to_v5f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s25 +; SI-NEXT: 
v_cvt_f16_f32_e32 v5, s24 +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_or_b32_e32 v2, v9, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: 
v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v10f16_to_v5f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v5, v1 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 
+; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f16_to_v5f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f16_to_v5f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s5 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: 
.LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x half> %a, splat (half 0xH0200) + %a2 = bitcast <10 x half> %a1 to <5 x float> + br label %end + +cmp.false: + %a3 = bitcast <10 x half> %a to <5 x float> + br label %end + +end: + %phi = phi <5 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x float> %phi +} + define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i16_to_v10f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v19, v9 -; GCN-NEXT: v_mov_b32_e32 v18, v8 -; GCN-NEXT: v_mov_b32_e32 v17, v7 -; GCN-NEXT: v_mov_b32_e32 v16, v6 -; GCN-NEXT: v_mov_b32_e32 v15, v5 -; GCN-NEXT: v_mov_b32_e32 v14, v4 -; GCN-NEXT: v_mov_b32_e32 v13, v3 -; GCN-NEXT: v_mov_b32_e32 v12, v2 -; GCN-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NEXT: v_mov_b32_e32 v20, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v2, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i16_to_v10f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v9 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v17, v7 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_mov_b32_e32 v15, v5 +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: 
v_mov_b32_e32 v13, v3 +; SI-NEXT: v_mov_b32_e32 v12, v2 +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, 
v14 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i16_to_v10f16: ; VI: ; %bb.0: @@ -1423,7 +2928,7 @@ define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 3 ; VI-NEXT: v_add_u16_sdwa v6, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1441,7 +2946,7 @@ define <10 x half> @bitcast_v10i16_to_v10f16(<10 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v8 ; VI-NEXT: v_or_b32_e32 v1, v1, v7 ; VI-NEXT: v_or_b32_e32 v0, v0, v6 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1496,71 +3001,237 @@ end: ret <10 x half> %phi } +define inreg <10 x half> @bitcast_v10i16_to_v10f16_scalar(<10 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i16_to_v10f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: 
v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v10i16_to_v10f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, 
s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v10i16_to_v10f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX11-LABEL: bitcast_v10i16_to_v10f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i16> %a, splat (i16 3) + %a2 = bitcast <10 x i16> %a1 to <10 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x i16> %a to <10 x half> + br label %end + +end: + %phi = phi <10 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x half> %phi +} + define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f16_to_v10i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v10 -; GCN-NEXT: v_or_b32_e32 v6, v6, v11 -; GCN-NEXT: v_or_b32_e32 v2, v2, v12 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: 
.LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f16_to_v10i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 
v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f16_to_v10i16: ; VI: ; %bb.0: @@ -1569,7 +3240,7 @@ define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 0x200 ; VI-NEXT: v_add_f16_e32 v5, 0x200, v0 @@ -1587,7 +3258,7 @@ define <10 x i16> @bitcast_v10f16_to_v10i16(<10 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v8, v2 ; VI-NEXT: v_or_b32_e32 v1, v7, v1 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1642,3 +3313,190 @@ end: %phi = phi <10 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <10 x i16> %phi } + +define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f16_to_v10i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 
+; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: 
+; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v10f16_to_v10i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s21, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: v_add_f16_e32 v5, s16, v0 +; VI-NEXT: v_add_f16_sdwa v6, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v7, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v8, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v9, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v0 +; VI-NEXT: v_or_b32_e32 v3, v3, v9 +; VI-NEXT: v_or_b32_e32 v2, v2, v8 +; VI-NEXT: v_or_b32_e32 v1, v1, v7 +; VI-NEXT: v_or_b32_e32 v0, v5, v6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 
s[30:31] +; +; GFX9-LABEL: bitcast_v10f16_to_v10i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f16_to_v10i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 
:: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x half> %a, splat (half 0xH0200) + %a2 = bitcast <10 x half> %a1 to <10 x i16> + br label %end + +cmp.false: + %a3 = bitcast <10 x half> %a to <10 x i16> + br label %end + +end: + %phi = phi <10 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll index 27d32fc05e428..7556d355a3844 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll @@ -1,37 +1,37 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define half @bitcast_i16_to_f16(i16 %a, i32 %b) { -; GCN-LABEL: bitcast_i16_to_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB0_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB0_4 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB0_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: .LBB0_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i16_to_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB0_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB0_4 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB0_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: .LBB0_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i16_to_f16: ; VI: ; %bb.0: @@ -111,23 +111,126 @@ end: ret half %phi } +define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i16_to_f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_i16_to_f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_i16_to_f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-TRUE16-LABEL: bitcast_i16_to_f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s0 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-TRUE16-NEXT: .LBB1_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s0, 3 +; GFX11-TRUE16-NEXT: .LBB1_3: ; %end +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB1_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr1_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB1_2 +; +; GFX11-FAKE16-LABEL: bitcast_i16_to_f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB1_3: +; GFX11-FAKE16-NEXT: .LBB1_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i16 %a, 3 + %a2 = bitcast i16 %a1 to half + br label %end + +cmp.false: + %a3 = bitcast i16 %a to half + br label %end + +end: + %phi = phi half [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret half %phi +} + define i16 @bitcast_f16_to_i16(half %a, i32 %b) { -; GCN-LABEL: bitcast_f16_to_i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; 
GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f16_to_i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f16_to_i16: ; VI: ; %bb.0: @@ -168,10 +271,10 @@ define i16 @bitcast_f16_to_i16(half %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB1_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 0x200, v0.l -; GFX11-TRUE16-NEXT: .LBB1_4: ; %end +; GFX11-TRUE16-NEXT: .LBB2_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l @@ -207,22 +310,127 @@ end: ret i16 %phi } +define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f16_to_i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_f16_to_i16_scalar: +; VI: ; %bb.0: +; 
VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f16_to_i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_add_f16_e32 v0, s16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_f16_to_i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-TRUE16-NEXT: .LBB3_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, 0x200, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB3_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB3_2 +; GFX11-TRUE16-NEXT: .LBB3_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_f16_to_i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-FAKE16-NEXT: .LBB3_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, 0x200, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB3_3: +; GFX11-FAKE16-NEXT: s_branch .LBB3_2 +; GFX11-FAKE16-NEXT: .LBB3_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd half %a, 0xH0200 + %a2 = bitcast half %a1 to i16 + br label %end + +cmp.false: + %a3 = bitcast half %a to i16 + br label %end + +end: + %phi = phi i16 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i16 %phi +} + define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) { -; GCN-LABEL: bitcast_i16_to_bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i16_to_bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: ; %bb.2: 
; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i16_to_bf16: ; VI: ; %bb.0: @@ -263,10 +471,10 @@ define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v0.l, 3 -; GFX11-TRUE16-NEXT: .LBB2_4: ; %end +; GFX11-TRUE16-NEXT: .LBB4_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l @@ -302,33 +510,138 @@ end: ret bfloat %phi } +define inreg bfloat @bitcast_i16_to_bf16_scalar(i16 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i16_to_bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s7, s6, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_lshl_b32 s4, s6, 16 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_i16_to_bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_i16_to_bf16_scalar: +; GFX9: ; 
%bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-TRUE16-LABEL: bitcast_i16_to_bf16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s0 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-TRUE16-NEXT: .LBB5_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s0, 3 +; GFX11-TRUE16-NEXT: .LBB5_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB5_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr1_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB5_2 +; +; GFX11-FAKE16-LABEL: bitcast_i16_to_bf16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB5_3: +; GFX11-FAKE16-NEXT: .LBB5_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i16 %a, 3 + %a2 = bitcast i16 %a1 to bfloat + br label %end + +cmp.false: + %a3 = bitcast i16 %a to bfloat + br label %end + +end: + %phi = phi bfloat [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret bfloat %phi +} + define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) { -; GCN-LABEL: bitcast_bf16_to_i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_bf16_to_i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; 
SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_bf16_to_i16: ; VI: ; %bb.0: @@ -386,7 +699,7 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB3_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -397,7 +710,7 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: .LBB3_4: ; %end +; GFX11-TRUE16-NEXT: .LBB6_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h @@ -411,7 +724,7 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -424,7 +737,7 @@ define i16 
@bitcast_bf16_to_i16(bfloat %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: .LBB3_2: ; %end +; GFX11-FAKE16-NEXT: .LBB6_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -444,34 +757,180 @@ end: ret i16 %phi } +define inreg i16 @bitcast_bf16_to_i16_scalar(bfloat inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_bf16_to_i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_bf16_to_i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_4 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_3: +; VI-NEXT: s_branch .LBB7_2 +; VI-NEXT: .LBB7_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_bf16_to_i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_bf16_to_i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-TRUE16-NEXT: .LBB7_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB7_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB7_2 +; GFX11-TRUE16-NEXT: .LBB7_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_bf16_to_i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-FAKE16-NEXT: .LBB7_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB7_3: +; GFX11-FAKE16-NEXT: s_branch .LBB7_2 +; GFX11-FAKE16-NEXT: .LBB7_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd bfloat %a, 0xR40C0 + 
%a2 = bitcast bfloat %a1 to i16 + br label %end + +cmp.false: + %a3 = bitcast bfloat %a to i16 + br label %end + +end: + %phi = phi i16 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i16 %phi +} + define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) { -; GCN-LABEL: bitcast_f16_to_bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f16_to_bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 
+; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f16_to_bf16: ; VI: ; %bb.0: @@ -512,10 +971,10 @@ define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 0x200, v0.l -; GFX11-TRUE16-NEXT: .LBB4_4: ; %end +; GFX11-TRUE16-NEXT: .LBB8_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l @@ -551,35 +1010,144 @@ end: ret bfloat %phi } +define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f16_to_bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, s16 +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_f16_to_bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: 
s_cbranch_execnz .LBB9_4 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_3: +; VI-NEXT: s_branch .LBB9_2 +; VI-NEXT: .LBB9_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f16_to_bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_4 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_add_f16_e32 v0, s16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_3: +; GFX9-NEXT: s_branch .LBB9_2 +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_f16_to_bf16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB9_4 +; GFX11-TRUE16-NEXT: .LBB9_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, 0x200, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB9_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB9_2 +; GFX11-TRUE16-NEXT: .LBB9_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_f16_to_bf16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; 
GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB9_4 +; GFX11-FAKE16-NEXT: .LBB9_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, 0x200, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB9_3: +; GFX11-FAKE16-NEXT: s_branch .LBB9_2 +; GFX11-FAKE16-NEXT: .LBB9_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd half %a, 0xH0200 + %a2 = bitcast half %a1 to bfloat + br label %end + +cmp.false: + %a3 = bitcast half %a to bfloat + br label %end + +end: + %phi = phi bfloat [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret bfloat %phi +} + define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { -; GCN-LABEL: bitcast_bf16_to_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_bf16_to_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_bf16_to_f16: ; VI: ; %bb.0: @@ -637,7 +1205,7 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB5_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -648,7 +1216,7 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: .LBB5_4: ; %end +; GFX11-TRUE16-NEXT: .LBB10_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h 
@@ -662,7 +1230,7 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -675,7 +1243,7 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: .LBB5_2: ; %end +; GFX11-FAKE16-NEXT: .LBB10_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -690,6 +1258,154 @@ cmp.false: %a3 = bitcast bfloat %a to half br label %end +end: + %phi = phi half [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret half %phi +} + +define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_bf16_to_f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_bf16_to_f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_bf16_to_f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_bf16_to_f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; 
GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-TRUE16-NEXT: .LBB11_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB11_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB11_2 +; GFX11-TRUE16-NEXT: .LBB11_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_bf16_to_f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-FAKE16-NEXT: .LBB11_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB11_3: +; GFX11-FAKE16-NEXT: s_branch .LBB11_2 +; GFX11-FAKE16-NEXT: .LBB11_4: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd bfloat %a, 0xR40C0 + %a2 = bitcast bfloat %a1 to half + br label %end + +cmp.false: + %a3 = bitcast bfloat %a to half + br label %end + end: %phi = phi half [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret half %phi diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll index 32d21e19e8e01..c3ace0ac5af71 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll @@ -1,29 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <6 x float> @bitcast_v6i32_to_v6f32(<6 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i32_to_v6f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i32_to_v6f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v6f32: ; VI: ; %bb.0: @@ -96,25 +96,141 @@ end: ret <6 x float> %phi } +define inreg <6 x float> @bitcast_v6i32_to_v6f32_scalar(<6 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i32_to_v6f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; 
%cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v6i32_to_v6f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v6i32_to_v6f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v6i32_to_v6f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + define <6 x i32> @bitcast_v6f32_to_v6i32(<6 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f32_to_v6i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; 
%bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f32_to_v6i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v6i32: ; VI: ; %bb.0: @@ -184,25 +300,146 @@ end: ret <6 x i32> %phi } +define inreg <6 x i32> @bitcast_v6f32_to_v6i32_scalar(<6 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f32_to_v6i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: 
v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f32_to_v6i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v6i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f32_to_v6i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: 
s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <6 x i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + define <3 x i64> @bitcast_v6i32_to_v3i64(<6 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i32_to_v3i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: 
s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i32_to_v3i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v3i64: ; VI: ; %bb.0: @@ -275,25 +512,141 @@ end: ret <3 x i64> %phi } +define inreg <3 x i64> @bitcast_v6i32_to_v3i64_scalar(<6 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i32_to_v3i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v6i32_to_v3i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s21, s21, 3 +; 
VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v6i32_to_v3i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v6i32_to_v3i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <3 x i64> + br label %end + +end: + %phi = phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} + define <6 x i32> @bitcast_v3i64_to_v6i32(<3 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i64_to_v6i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i64_to_v6i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v6i32: ; VI: ; %bb.0: @@ -368,25 +721,141 @@ end: ret <6 x i32> %phi } +define inreg <6 x i32> @bitcast_v3i64_to_v6i32_scalar(<3 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i64_to_v6i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v3i64_to_v6i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v3i64_to_v6i32_scalar: +; GFX9: 
; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v3i64_to_v6i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <3 x i64> %a to <6 x 
i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + define <3 x double> @bitcast_v6i32_to_v3f64(<6 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i32_to_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i32_to_v3f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v3f64: ; VI: ; %bb.0: @@ -459,22 +928,138 @@ end: ret <3 x double> %phi } +define inreg <3 x double> @bitcast_v6i32_to_v3f64_scalar(<6 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i32_to_v3f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz 
.LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v6i32_to_v3f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v6i32_to_v3f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v6i32_to_v3f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f64_to_v6i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 
v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f64_to_v6i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v6i32: ; VI: ; %bb.0: @@ -483,12 +1068,12 @@ define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -499,12 +1084,12 @@ define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -516,12 
+1101,12 @@ define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -541,52 +1126,161 @@ end: ret <6 x i32> %phi } +define inreg <6 x i32> @bitcast_v3f64_to_v6i32_scalar(<3 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f64_to_v6i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_4 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_3: +; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f64_to_v6i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: 
.LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v6i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v6i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, 
s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <6 x i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + define <12 x i16> @bitcast_v6i32_to_v12i16(<6 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i32_to_v12i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: 
v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v6, v12 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i32_to_v12i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v6, v12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v12i16: ; VI: ; %bb.0: @@ 
-659,85 +1353,229 @@ end: ret <12 x i16> %phi } +define inreg <12 x i16> @bitcast_v6i32_to_v12i16_scalar(<6 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i32_to_v12i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v6i32_to_v12i16_scalar: +; VI: ; %bb.0: +; 
VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v6i32_to_v12i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v6i32_to_v12i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; 
GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <12 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <12 x i16> + br label %end + +end: + %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i16> %phi +} + define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i16_to_v6i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v4 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 
0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v17 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v0, v12, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v17, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 
0x30000, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i16_to_v6i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v6i32: ; VI: ; %bb.0: @@ -746,7 +1584,7 @@ define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 3 ; VI-NEXT: v_add_u16_e32 v6, 3, v5 @@ -767,7 +1605,7 @@ define <6 x i32> @bitcast_v12i16_to_v6i32(<12 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v6, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v6, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, 
s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -824,92 +1662,279 @@ end: ret <6 x i32> %phi } +define inreg <6 x i32> @bitcast_v12i16_to_v6i32_scalar(<12 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i16_to_v6i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 
s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v12i16_to_v6i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; 
VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v12i16_to_v6i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i16_to_v6i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: 
v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i16> %a, splat (i16 3) + %a2 = bitcast <12 x i16> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x i16> %a to <6 x i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + define <12 x half> @bitcast_v6i32_to_v12f16(<6 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i32_to_v12f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v15, v3 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v1 -; GCN-NEXT: v_mov_b32_e32 v12, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: 
v_lshrrev_b32_e32 v0, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i32_to_v12f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v1 +; SI-NEXT: v_mov_b32_e32 v12, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: ; implicit-def: $vgpr12 
+; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i32_to_v12f16: ; VI: ; %bb.0: @@ -982,125 +2007,284 @@ end: ret <12 x half> %phi } -define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f16_to_v6i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v10 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v16, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_or_b32_e32 v2, v13, v2 -; GCN-NEXT: v_or_b32_e32 v3, v12, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v4 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v11, v10 -; GCN-NEXT: v_or_b32_e32 v4, v7, v9 -; GCN-NEXT: v_or_b32_e32 v5, v6, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <12 x half> @bitcast_v6i32_to_v12f16_scalar(<6 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i32_to_v12f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: 
; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB17_2 ; -; VI-LABEL: bitcast_v12f16_to_v6i32: +; VI-LABEL: bitcast_v6i32_to_v12f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v6, 0x200 -; VI-NEXT: v_add_f16_sdwa v7, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v7 +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v6i32_to_v12f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v6i32_to_v12f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <12 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <12 x half> + br label %end + +end: + %phi = phi <12 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x half> %phi +} + +define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v12f16_to_v6i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, 
v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 
v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f16_to_v6i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: 
s_cbranch_execz .LBB18_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v6, 0x200 +; VI-NEXT: v_add_f16_sdwa v7, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v7 ; VI-NEXT: v_add_f16_sdwa v7, v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 ; VI-NEXT: v_or_b32_e32 v4, v4, v7 @@ -1116,7 +2300,7 @@ define <6 x i32> @bitcast_v12f16_to_v6i32(<12 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v7 ; VI-NEXT: v_or_b32_e32 v0, v0, v6 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1174,25 +2358,232 @@ end: ret <6 x i32> %phi } +define inreg <6 x i32> @bitcast_v12f16_to_v6i32_scalar(<12 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f16_to_v6i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s26 +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, 
v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: 
v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v12f16_to_v6i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: 
v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f16_to_v6i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f16_to_v6i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, 
s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x half> %a, splat (half 0xH0200) + %a2 = bitcast <12 x half> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x half> %a to <6 x i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + define <3 x i64> @bitcast_v6f32_to_v3i64(<6 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f32_to_v3i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f32_to_v3i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v3i64: ; VI: ; %bb.0: @@ -1262,25 +2653,146 @@ end: ret <3 x i64> %phi } +define inreg <3 x i64> @bitcast_v6f32_to_v3i64_scalar(<6 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f32_to_v3i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_4 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_3: +; SI-NEXT: s_branch .LBB21_2 +; SI-NEXT: .LBB21_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f32_to_v3i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_4 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: 
.LBB21_3: +; VI-NEXT: s_branch .LBB21_2 +; VI-NEXT: .LBB21_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v3i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f32_to_v3i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: 
s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <3 x i64> + br label %end + +end: + %phi = phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} + define <6 x float> @bitcast_v3i64_to_v6f32(<3 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i64_to_v6f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i64_to_v6f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 
v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v6f32: ; VI: ; %bb.0: @@ -1355,25 +2867,141 @@ end: ret <6 x float> %phi } +define inreg <6 x float> @bitcast_v3i64_to_v6f32_scalar(<3 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i64_to_v6f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v3i64_to_v6f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v3i64_to_v6f32_scalar: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v3i64_to_v6f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: s_branch .LBB23_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <3 
x i64> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + define <3 x double> @bitcast_v6f32_to_v3f64(<6 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f32_to_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f32_to_v3f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v3f64: ; VI: ; %bb.0: @@ -1443,22 +3071,143 @@ end: ret <3 x double> %phi } +define inreg <3 x double> @bitcast_v6f32_to_v3f64_scalar(<6 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f32_to_v3f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz 
.LBB25_4 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_3: +; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: .LBB25_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f32_to_v3f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: +; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: .LBB25_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v3f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 
+; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: +; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f32_to_v3f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f64_to_v6f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f64_to_v6f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v6f32: ; VI: ; %bb.0: @@ -1467,12 +3216,12 @@ define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1483,12 +3232,12 @@ define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; 
GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1500,12 +3249,12 @@ define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1525,52 +3274,161 @@ end: ret <6 x float> %phi } +define inreg <6 x float> @bitcast_v3f64_to_v6f32_scalar(<3 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f64_to_v6f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB27_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_4 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_3: +; SI-NEXT: s_branch .LBB27_2 +; SI-NEXT: .LBB27_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f64_to_v6f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 
+; VI-NEXT: s_cbranch_scc0 .LBB27_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_4 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_3: +; VI-NEXT: s_branch .LBB27_2 +; VI-NEXT: .LBB27_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v6f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_4 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_3: +; GFX9-NEXT: s_branch .LBB27_2 +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v6f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 
v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + define <12 x i16> @bitcast_v6f32_to_v12i16(<6 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f32_to_v12i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 
1.0, v0 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v6, v12 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f32_to_v12i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: 
v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v6, v12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v12i16: ; VI: ; %bb.0: @@ -1640,85 +3498,236 @@ end: ret <12 x i16> %phi } +define inreg <12 x i16> @bitcast_v6f32_to_v12i16_scalar(<6 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f32_to_v12i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB29_4 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: 
v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f32_to_v12i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v12i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 
v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f32_to_v12i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <12 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <12 x i16> + br label %end + +end: + %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i16> %phi +} + define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i16_to_v6f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v4 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, 
v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_4 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB15_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v17 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: .LBB15_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, 
v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v0, v12, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v17, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i16_to_v6f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_4 +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB30_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: 
v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v6f32: ; VI: ; %bb.0: @@ -1727,7 +3736,7 @@ define <6 x float> 
@bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 3 ; VI-NEXT: v_add_u16_e32 v6, 3, v5 @@ -1748,7 +3757,7 @@ define <6 x float> @bitcast_v12i16_to_v6f32(<12 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v6, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v6, v0 -; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: .LBB30_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1805,92 +3814,279 @@ end: ret <6 x float> %phi } +define inreg <6 x float> @bitcast_v12i16_to_v6f32_scalar(<12 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i16_to_v6f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; 
SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v12i16_to_v6f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; 
VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v12i16_to_v6f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_4 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_3: +; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_v12i16_to_v6f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: s_branch .LBB31_2 +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i16> %a, splat (i16 3) + %a2 = bitcast <12 x i16> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <12 x i16> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + define <12 x half> @bitcast_v6f32_to_v12f16(<6 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f32_to_v12f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v15, v3 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v1 -; GCN-NEXT: v_mov_b32_e32 v12, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: 
$vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_4 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB16_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: .LBB16_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v16 -; 
GCN-NEXT: v_add_f32_e32 v11, 1.0, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f32_to_v12f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v1 +; SI-NEXT: v_mov_b32_e32 v12, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_4 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB32_3: ; %cmp.false +; SI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: .LBB32_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] 
+; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v12f16: ; VI: ; %bb.0: @@ -1960,111 +4156,279 @@ end: ret <12 x half> %phi } +define inreg <12 x half> @bitcast_v6f32_to_v12f16_scalar(<6 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f32_to_v12f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 
v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v6f32_to_v12f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v12f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: 
v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f32_to_v12f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <12 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <12 x half> + br label %end + +end: + %phi = phi <12 
x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x half> %phi +} + define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f16_to_v6f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v10 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_4 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB17_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v16, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_or_b32_e32 v2, v13, v2 -; GCN-NEXT: v_or_b32_e32 v3, v12, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v4 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; 
GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: .LBB17_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v11, v10 -; GCN-NEXT: v_or_b32_e32 v4, v7, v9 -; GCN-NEXT: v_or_b32_e32 v5, v6, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f16_to_v6f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_4 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB34_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; 
SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: .LBB34_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; 
SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v6f32: ; VI: ; %bb.0: @@ -2073,7 +4437,7 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 0x200 ; VI-NEXT: v_add_f16_sdwa v7, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -2094,7 +4458,7 @@ define <6 x float> @bitcast_v12f16_to_v6f32(<12 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v7 ; VI-NEXT: v_or_b32_e32 v0, v0, v6 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2152,25 +4516,232 @@ end: ret <6 x float> %phi } +define inreg <6 x float> @bitcast_v12f16_to_v6f32_scalar(<12 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f16_to_v6f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s26 +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; 
SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; 
SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v12f16_to_v6f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, 
v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f16_to_v6f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f16_to_v6f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 
s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x half> %a, splat (half 0xH0200) + %a2 = bitcast <12 x half> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <12 x half> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + define <3 x double> @bitcast_v3i64_to_v3f64(<3 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i64_to_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: 
v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i64_to_v3f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v3f64: ; VI: ; %bb.0: @@ -2245,22 +4816,137 @@ end: ret <3 x double> %phi } +define inreg <3 x double> @bitcast_v3i64_to_v3f64_scalar(<3 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i64_to_v3f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v3i64_to_v3f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: 
s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v3i64_to_v3f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v3i64_to_v3f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_3 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 
s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: .LBB37_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: s_branch .LBB37_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <3 x i64> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f64_to_v3i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f64_to_v3i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, 
s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v3i64: ; VI: ; %bb.0: @@ -2269,12 +4955,12 @@ define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2285,12 +4971,12 @@ define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2302,12 +4988,12 @@ define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2327,52 +5013,161 @@ end: ret <3 x i64> %phi } +define inreg <3 x i64> 
@bitcast_v3f64_to_v3i64_scalar(<3 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f64_to_v3i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB39_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_4 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_3: +; SI-NEXT: s_branch .LBB39_2 +; SI-NEXT: .LBB39_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f64_to_v3i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_4 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_3: +; VI-NEXT: s_branch .LBB39_2 +; VI-NEXT: .LBB39_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v3i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; 
GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v3i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <3 x i64> + br label %end + +end: + %phi = phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} + define <12 x i16> @bitcast_v3i64_to_v12i16(<3 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i64_to_v12i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 
-; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v6, v12 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i64_to_v12i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; 
SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v12, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v6, v12 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v12i16: ; VI: ; %bb.0: @@ -2447,85 +5242,229 @@ end: ret <12 x i16> %phi } +define inreg <12 x i16> @bitcast_v3i64_to_v12i16_scalar(<3 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i64_to_v12i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; 
SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v3i64_to_v12i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 
v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v3i64_to_v12i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-LABEL: bitcast_v3i64_to_v12i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB41_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false 
+ +cmp.true: + %a1 = add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <12 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x i64> %a to <12 x i16> + br label %end + +end: + %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i16> %phi +} + define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i16_to_v3i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v4 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v17 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; 
implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v0, v12, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v17, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i16_to_v3i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_mov_b32 
s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v3i64: ; VI: ; %bb.0: @@ -2534,7 +5473,7 @@ define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 3 ; VI-NEXT: v_add_u16_e32 v6, 3, v5 @@ -2555,7 +5494,7 @@ define <3 x i64> @bitcast_v12i16_to_v3i64(<12 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v6, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v6, v0 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2612,92 +5551,279 @@ end: ret <3 x i64> %phi } +define inreg <3 x i64> @bitcast_v12i16_to_v3i64_scalar(<12 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i16_to_v3i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: 
s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v12i16_to_v3i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v12i16_to_v3i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i16_to_v3i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i16> %a, splat (i16 3) + %a2 = bitcast <12 x i16> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <12 x i16> %a to <3 x i64> + br label %end + +end: + %phi = 
phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} + define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i64_to_v12f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v5 -; GCN-NEXT: v_mov_b32_e32 v13, v4 -; GCN-NEXT: v_mov_b32_e32 v16, v3 -; GCN-NEXT: v_mov_b32_e32 v15, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v1 -; GCN-NEXT: v_mov_b32_e32 v12, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v14, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i64_to_v12f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v5 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: 
$vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v13, vcc +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i64_to_v12f16: ; VI: ; %bb.0: @@ -2735,26 +5861,185 @@ define <12 x half> @bitcast_v3i64_to_v12f16(<3 x i64> %a, i32 %b) { ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v3i64_to_v12f16: +; GFX11-LABEL: bitcast_v3i64_to_v12f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 
= add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <12 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x i64> %a to <12 x half> + br label %end + +end: + %phi = phi <12 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x half> %phi +} + +define inreg <12 x half> @bitcast_v3i64_to_v12f16_scalar(<3 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i64_to_v12f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_add_u32 s8, s18, 3 +; SI-NEXT: s_addc_u32 s9, s19, 0 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_lshr_b32 s11, s9, 16 +; SI-NEXT: s_add_u32 s12, s20, 3 +; SI-NEXT: s_addc_u32 s13, s21, 0 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: s_lshr_b32 s15, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; 
SI-NEXT: v_cvt_f32_f16_e32 v9, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v3i64_to_v12f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v3i64_to_v12f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; 
GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-LABEL: bitcast_v3i64_to_v12f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB45_3: ; %end ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 
v5, s17 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: s_branch .LBB45_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2773,110 +6058,110 @@ end: } define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f16_to_v3i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v10 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v16, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_or_b32_e32 v2, v13, v2 -; GCN-NEXT: v_or_b32_e32 v3, v12, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v4 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; 
implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 
16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v11, v10 -; GCN-NEXT: v_or_b32_e32 v4, v7, v9 -; GCN-NEXT: v_or_b32_e32 v5, v6, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f16_to_v3i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; 
implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; 
SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v3i64: ; VI: ; %bb.0: @@ -2885,7 +6170,7 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 0x200 ; VI-NEXT: v_add_f16_sdwa v7, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -2906,7 +6191,7 @@ define <3 x i64> @bitcast_v12f16_to_v3i64(<12 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v7 ; VI-NEXT: v_or_b32_e32 v0, v0, v6 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2964,55 +6249,262 @@ end: ret <3 x i64> %phi } +define inreg <3 x i64> @bitcast_v12f16_to_v3i64_scalar(<12 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f16_to_v3i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 
v7, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s26 +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; 
SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v12f16_to_v3i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f16_to_v3i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_v12f16_to_v3i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x half> %a, splat (half 0xH0200) + %a2 = bitcast <12 x half> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <12 x half> %a to <3 x i64> + br label %end + +end: + %phi = phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} + define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f64_to_v12i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v13, v5 -; GCN-NEXT: v_mov_b32_e32 v12, v4 -; GCN-NEXT: v_mov_b32_e32 v15, v3 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; 
GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v5, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v1, v17, v16, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v5, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v1, v17, v16, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v16 -; GCN-NEXT: v_mov_b32_e32 v2, v17 -; GCN-NEXT: v_mov_b32_e32 v4, v14 -; GCN-NEXT: v_mov_b32_e32 v6, v15 -; GCN-NEXT: v_mov_b32_e32 v8, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f64_to_v12i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v5 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: 
$vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v9, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v5, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_alignbit_b32 v9, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v5, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v2, v17 +; SI-NEXT: v_mov_b32_e32 v4, v14 +; SI-NEXT: v_mov_b32_e32 v6, v15 +; SI-NEXT: v_mov_b32_e32 v8, v12 +; SI-NEXT: v_mov_b32_e32 v10, v13 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v12i16: ; VI: ; %bb.0: @@ -3021,12 +6513,12 @@ define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: .LBB48_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3037,12 +6529,12 @@ define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) { ; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: .LBB48_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3054,12 +6546,12 @@ define <12 x i16> @bitcast_v3f64_to_v12i16(<3 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: .LBB48_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3079,85 +6571,231 @@ end: ret <12 x i16> %phi } +define inreg <12 x i16> @bitcast_v3f64_to_v12i16_scalar(<3 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f64_to_v12i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB49_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB49_4 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[12:13], 
s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[18:19], 1.0 +; SI-NEXT: v_alignbit_b32 v9, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v5, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: s_branch .LBB49_5 +; SI-NEXT: .LBB49_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: .LBB49_4: +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: .LBB49_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v2, v17 +; SI-NEXT: v_mov_b32_e32 v4, v14 +; SI-NEXT: v_mov_b32_e32 v6, v15 +; SI-NEXT: v_mov_b32_e32 v8, v12 +; SI-NEXT: v_mov_b32_e32 v10, v13 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f64_to_v12i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_3: +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: 
v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v12i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v12i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-NEXT: .LBB49_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: +; GFX11-NEXT: s_branch .LBB49_2 +; GFX11-NEXT: .LBB49_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label 
%cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <12 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <12 x i16> + br label %end + +end: + %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i16> %phi +} + define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i16_to_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v4 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v17 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; 
implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v0, v12, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v17, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i16_to_v3f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v4 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: 
v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v3f64: ; VI: ; %bb.0: @@ -3166,7 +6804,7 @@ define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 3 ; VI-NEXT: v_add_u16_e32 v6, 3, v5 @@ -3187,7 +6825,7 @@ define <3 x double> @bitcast_v12i16_to_v3f64(<12 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v6, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v6, v0 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3244,82 +6882,269 @@ end: ret <3 x double> %phi } +define inreg <3 x double> @bitcast_v12i16_to_v3f64_scalar(<12 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i16_to_v3f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 
s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v12i16_to_v3f64_scalar: +; 
VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v12i16_to_v3f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB51_4 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_3: +; GFX9-NEXT: s_branch .LBB51_2 +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i16_to_v3f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_3: +; GFX11-NEXT: s_branch .LBB51_2 +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i16> %a, splat (i16 3) + %a2 = bitcast <12 x i16> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <12 x 
i16> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f64_to_v12f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; 
GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v0 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NEXT: v_mov_b32_e32 v1, v17 -; GCN-NEXT: v_mov_b32_e32 v2, v13 -; GCN-NEXT: v_mov_b32_e32 v3, v16 -; GCN-NEXT: v_mov_b32_e32 v4, v14 -; GCN-NEXT: v_mov_b32_e32 v5, v15 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f64_to_v12f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, 
v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v15 +; SI-NEXT: v_mov_b32_e32 v1, v17 +; SI-NEXT: v_mov_b32_e32 v2, v13 +; SI-NEXT: v_mov_b32_e32 v3, v16 +; SI-NEXT: v_mov_b32_e32 v4, v12 +; SI-NEXT: v_mov_b32_e32 v5, v14 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f64_to_v12f16: ; VI: ; %bb.0: @@ -3328,12 
+7153,12 @@ define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3344,12 +7169,12 @@ define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3361,12 +7186,12 @@ define <12 x half> @bitcast_v3f64_to_v12f16(<3 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3386,111 +7211,267 @@ end: ret <12 x half> %phi } +define inreg <12 x half> @bitcast_v3f64_to_v12f16_scalar(<3 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v3f64_to_v12f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[14:15], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[18:19], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_add_f64 v[12:13], s[16:17], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: 
$vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v3f64_to_v12f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_4 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_3: +; VI-NEXT: s_branch .LBB53_2 +; VI-NEXT: .LBB53_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v12f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: +; GFX9-NEXT: s_branch .LBB53_2 +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 
+; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v12f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: +; GFX11-NEXT: s_branch .LBB53_2 +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <12 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <12 x half> + br label %end + +end: + %phi = phi <12 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x half> %phi +} + define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f16_to_v3f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v10 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_4 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB27_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v16, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_or_b32_e32 v2, v13, v2 -; GCN-NEXT: v_or_b32_e32 v3, v12, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v4 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: .LBB27_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v11, v10 -; GCN-NEXT: v_or_b32_e32 v4, v7, v9 -; GCN-NEXT: v_or_b32_e32 v5, v6, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f16_to_v3f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v4 +; SI-NEXT: 
v_cvt_f16_f32_e32 v14, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v10 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_4 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB54_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: .LBB54_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v3f64: ; VI: ; %bb.0: @@ -3499,7 +7480,7 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz 
.LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 0x200 ; VI-NEXT: v_add_f16_sdwa v7, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3520,7 +7501,7 @@ define <3 x double> @bitcast_v12f16_to_v3f64(<12 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v7 ; VI-NEXT: v_or_b32_e32 v0, v0, v6 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3578,98 +7559,306 @@ end: ret <3 x double> %phi } +define inreg <3 x double> @bitcast_v12f16_to_v3f64_scalar(<12 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f16_to_v3f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s26 +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v8, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 +; SI-NEXT: s_branch 
.LBB55_2 +; +; VI-LABEL: bitcast_v12f16_to_v3f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; 
VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f16_to_v3f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f16_to_v3f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 
op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x half> %a, splat (half 0xH0200) + %a2 = bitcast <12 x half> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <12 x half> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i16_to_v12f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v11 -; GCN-NEXT: v_mov_b32_e32 v22, v10 -; GCN-NEXT: v_mov_b32_e32 v21, v9 -; GCN-NEXT: v_mov_b32_e32 v20, v8 -; GCN-NEXT: v_mov_b32_e32 v19, v7 -; GCN-NEXT: v_mov_b32_e32 v18, v6 -; GCN-NEXT: v_mov_b32_e32 v17, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v15, v3 -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v1 -; GCN-NEXT: v_mov_b32_e32 v24, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB28_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB28_4 -; GCN-NEXT: .LBB28_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB28_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: .LBB28_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i16_to_v12f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v11 +; SI-NEXT: v_mov_b32_e32 v22, v10 +; SI-NEXT: v_mov_b32_e32 v21, v9 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v19, v7 +; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_mov_b32_e32 v17, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v1 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB56_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB56_4 +; SI-NEXT: .LBB56_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB56_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_cvt_f32_f16_e32 
v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: .LBB56_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i16_to_v12f16: ; VI: ; %bb.0: @@ -3678,7 +7867,7 @@ define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: 
s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v6, 3 ; VI-NEXT: v_add_u16_sdwa v7, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3699,7 +7888,7 @@ define <12 x half> @bitcast_v12i16_to_v12f16(<12 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v9 ; VI-NEXT: v_or_b32_e32 v1, v1, v8 ; VI-NEXT: v_or_b32_e32 v0, v0, v7 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3756,82 +7945,265 @@ end: ret <12 x half> %phi } +define inreg <12 x half> @bitcast_v12i16_to_v12f16_scalar(<12 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i16_to_v12f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v12i16_to_v12f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: 
s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v12i16_to_v12f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i16_to_v12f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-NEXT: ; %bb.1: ; 
%Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_3: +; GFX11-NEXT: s_branch .LBB57_2 +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i16> %a, splat (i16 3) + %a2 = bitcast <12 x i16> %a1 to <12 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x i16> %a to <12 x half> + br label %end + +end: + %phi = phi <12 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x half> %phi +} + define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f16_to_v12i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; 
GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v10, v10, 
v12 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v2, v2, v14 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f16_to_v12i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; 
SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f16_to_v12i16: ; VI: ; %bb.0: @@ -3840,7 +8212,7 @@ define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 0x200 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v0 @@ -3861,7 +8233,7 @@ define <12 x i16> @bitcast_v12f16_to_v12i16(<12 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v9, v2 ; VI-NEXT: v_or_b32_e32 v1, v8, v1 ; VI-NEXT: v_or_b32_e32 v0, v6, v0 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, 
s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3918,3 +8290,209 @@ end: %phi = phi <12 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <12 x i16> %phi } + +define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f16_to_v12i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 
+; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v12f16_to_v12i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s22, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_add_f16_e32 v6, s16, v0 +; VI-NEXT: v_add_f16_sdwa v7, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v8, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v9, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v10, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v11, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_add_f16_sdwa v0, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v0 +; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_or_b32_e32 v3, v3, v10 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_or_b32_e32 v1, v1, v8 +; VI-NEXT: v_or_b32_e32 v0, v6, v7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f16_to_v12i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] 
+; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f16_to_v12i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x half> %a, splat (half 0xH0200) + %a2 = bitcast <12 x half> %a1 to <12 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x half> %a to <12 x i16> + br label %end + +end: + %phi = phi <12 
x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll index 3ec705baa9c82..c830d6b344b6f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll @@ -1,30 +1,30 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <7 x float> @bitcast_v7i32_to_v7f32(<7 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i32_to_v7f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: 
v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i32_to_v7f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i32_to_v7f32: ; VI: ; %bb.0: @@ -101,26 +101,150 @@ end: ret <7 x float> %phi } +define inreg <7 x float> @bitcast_v7i32_to_v7f32_scalar(<7 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i32_to_v7f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: 
bitcast_v7i32_to_v7f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v7i32_to_v7f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v7i32_to_v7f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: 
s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_mov_b32_e32 v6, s18 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i32> %a, splat (i32 3) + %a2 = bitcast <7 x i32> %a1 to <7 x float> + br label %end + +cmp.false: + %a3 = bitcast <7 x i32> %a to <7 x float> + br label %end + +end: + %phi = phi <7 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x float> %phi +} + define <7 x i32> @bitcast_v7f32_to_v7i32(<7 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f32_to_v7i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; 
SI-LABEL: bitcast_v7f32_to_v7i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v7i32: ; VI: ; %bb.0: @@ -193,60 +317,191 @@ end: ret <7 x i32> %phi } +define inreg <7 x i32> @bitcast_v7f32_to_v7i32_scalar(<7 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f32_to_v7i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f32_to_v7i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; 
%bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f32_to_v7i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f32_to_v7i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: 
s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <7 x float> %a1 to <7 x i32> + br label %end + +cmp.false: + %a3 = bitcast <7 x float> %a to <7 x i32> + br label %end + +end: + %phi = phi <7 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i32> %phi +} + define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i32_to_v14i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v13, s4, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v13, s4, v12, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i32_to_v14i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 
+; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i32_to_v14i16: ; VI: ; %bb.0: @@ -294,7 +549,7 @@ define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 ; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 @@ -303,7 +558,7 @@ define <14 x i16> @bitcast_v7i32_to_v14i16(<7 x i32> %a, i32 %b) 
{ ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -323,95 +578,252 @@ end: ret <14 x i16> %phi } +define inreg <14 x i16> @bitcast_v7i32_to_v14i16_scalar(<7 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i32_to_v14i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s4, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: 
v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v7i32_to_v14i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v7i32_to_v14i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v7i32_to_v14i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_mov_b32_e32 v6, s18 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i32> %a, splat (i32 3) + %a2 = bitcast <7 x i32> %a1 to <14 x i16> + br label %end + +cmp.false: + %a3 = bitcast <7 x i32> %a to <14 x i16> + br label %end + +end: + %phi = phi <14 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i16> %phi +} + define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i16_to_v7i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v6 -; GCN-NEXT: v_mov_b32_e32 v17, v4 -; GCN-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NEXT: v_mov_b32_e32 v15, 
v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v0, v0, v14 -; GCN-NEXT: v_or_b32_e32 v1, v1, v19 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true 
-; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v15 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v14, v0 -; GCN-NEXT: v_or_b32_e32 v1, v19, v1 -; GCN-NEXT: v_or_b32_e32 v2, v20, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i16_to_v7i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v2, v20, v2 +; SI-NEXT: v_or_b32_e32 v3, v19, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i16_to_v7i32: ; VI: ; %bb.0: @@ -420,7 +832,7 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 3 ; VI-NEXT: v_add_u16_e32 v7, 3, v6 @@ -444,7 +856,7 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v7, v0 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -475,7 +887,7 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v5, 
v5, 3 op_sel_hi:[1,0] @@ -484,7 +896,7 @@ define <7 x i32> @bitcast_v14i16_to_v7i32(<14 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -504,103 +916,311 @@ end: ret <7 x i32> %phi } +define inreg <7 x i32> @bitcast_v14i16_to_v7i32_scalar(<14 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i16_to_v7i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 
0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v14i16_to_v7i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 
0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v14i16_to_v7i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: 
s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14i16_to_v7i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i16> %a, splat (i16 3) + %a2 = bitcast <14 x i16> %a1 to <7 x i32> + br label %end + +cmp.false: + %a3 = bitcast <14 x i16> %a to <7 x i32> + br label %end + +end: + %phi = phi <7 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i32> %phi +} + define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) { -; 
GCN-LABEL: bitcast_v7i32_to_v14f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v5 -; GCN-NEXT: v_mov_b32_e32 v18, v4 -; GCN-NEXT: v_mov_b32_e32 v17, v3 -; GCN-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NEXT: v_mov_b32_e32 v15, v1 -; GCN-NEXT: v_mov_b32_e32 v14, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v5, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i32_to_v14f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: 
v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 
+; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i32_to_v14f16: ; VI: ; %bb.0: @@ -648,7 +1268,7 @@ define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 ; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 @@ -657,7 +1277,7 @@ define <14 x half> @bitcast_v7i32_to_v14f16(<7 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: 
v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -677,125 +1297,299 @@ end: ret <14 x half> %phi } +define inreg <14 x half> @bitcast_v7i32_to_v14f16_scalar(<7 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i32_to_v14f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 
3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v7i32_to_v14f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v7i32_to_v14f16_scalar: +; GFX9: ; %bb.0: 
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v7i32_to_v14f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_mov_b32_e32 v6, s18 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x 
i32> %a, splat (i32 3) + %a2 = bitcast <7 x i32> %a1 to <14 x half> + br label %end + +cmp.false: + %a3 = bitcast <7 x i32> %a to <14 x half> + br label %end + +end: + %phi = phi <14 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x half> %phi +} + define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f16_to_v7i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v12 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GCN-NEXT: v_or_b32_e32 v0, v20, v0 -; GCN-NEXT: v_or_b32_e32 v1, v18, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v15, v3 -; GCN-NEXT: v_or_b32_e32 v4, v14, v4 -; GCN-NEXT: v_or_b32_e32 v5, 
v8, v5 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v10, v6 -; GCN-NEXT: v_or_b32_e32 v4, v13, v12 -; GCN-NEXT: v_or_b32_e32 v5, v8, v11 -; GCN-NEXT: v_or_b32_e32 v6, v7, v9 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f16_to_v7i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_or_b32_e32 v4, v15, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; 
SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v7i32: ; VI: ; %bb.0: @@ -804,7 +1598,7 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; 
VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 0x200 ; VI-NEXT: v_add_f16_sdwa v8, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -828,7 +1622,7 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v8 ; VI-NEXT: v_or_b32_e32 v0, v0, v7 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -860,7 +1654,7 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] @@ -869,7 +1663,7 @@ define <7 x i32> @bitcast_v14f16_to_v7i32(<14 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -889,60 +1683,292 @@ end: ret <7 x i32> %phi } +define inreg <7 x i32> @bitcast_v14f16_to_v7i32_scalar(<14 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f16_to_v7i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s25 +; SI-NEXT: 
v_cvt_f16_f32_e32 v11, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: 
v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v14f16_to_v7i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; 
VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v7, v1 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: v_add_f16_sdwa v7, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f16_to_v7i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; 
%bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f16_to_v7i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; 
GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x half> %a, splat (half 0xH0200) + %a2 = bitcast <14 x half> %a1 to <7 x i32> + br label %end + +cmp.false: + %a3 = bitcast <14 x half> %a to <7 x i32> + br label %end + +end: + %phi = phi <7 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i32> %phi +} + define <14 x i16> @bitcast_v7f32_to_v14i16(<7 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f32_to_v14i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_4 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB6_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v13, s4, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: 
s_cbranch_execz .LBB6_2 -; GCN-NEXT: .LBB6_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v13, s4, v12, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f32_to_v14i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_4 +; SI-NEXT: .LBB12_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB12_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: .LBB12_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v14i16: ; VI: ; %bb.0: @@ -1015,95 +2041,256 @@ end: ret <14 x i16> %phi } +define inreg <14 x i16> @bitcast_v7f32_to_v14i16_scalar(<7 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f32_to_v14i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_4 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, 
v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_branch .LBB13_2 +; SI-NEXT: .LBB13_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f32_to_v14i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_4 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_3: +; VI-NEXT: s_branch .LBB13_2 +; VI-NEXT: .LBB13_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f32_to_v14i16_scalar: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_4 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_branch .LBB13_2 +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f32_to_v14i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: +; GFX11-NEXT: s_branch .LBB13_2 +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: 
v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <7 x float> %a1 to <14 x i16> + br label %end + +cmp.false: + %a3 = bitcast <7 x float> %a to <14 x i16> + br label %end + +end: + %phi = phi <14 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i16> %phi +} + define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i16_to_v7f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v6 -; GCN-NEXT: v_mov_b32_e32 v17, v4 -; GCN-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NEXT: v_mov_b32_e32 v15, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v0, v0, v14 -; GCN-NEXT: 
v_or_b32_e32 v1, v1, v19 -; GCN-NEXT: v_or_b32_e32 v2, v2, v20 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v15 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v14, v0 -; GCN-NEXT: v_or_b32_e32 v1, v19, v1 -; GCN-NEXT: v_or_b32_e32 v2, v20, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; GCN-NEXT: v_add_i32_e32 v5, 
vcc, 0x30000, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i16_to_v7f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: 
$vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v2, v20, v2 +; SI-NEXT: v_or_b32_e32 v3, v19, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i16_to_v7f32: ; VI: ; %bb.0: @@ -1112,7 +2299,7 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: 
v_mov_b32_e32 v8, 3 ; VI-NEXT: v_add_u16_e32 v7, 3, v6 @@ -1136,7 +2323,7 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v7, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1167,7 +2354,7 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] @@ -1176,7 +2363,7 @@ define <7 x float> @bitcast_v14i16_to_v7f32(<14 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1196,103 +2383,311 @@ end: ret <7 x float> %phi } +define inreg <7 x float> @bitcast_v14i16_to_v7f32_scalar(<14 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i16_to_v7f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, 
s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: 
v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v14i16_to_v7f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: 
v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v14i16_to_v7f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14i16_to_v7f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: 
v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i16> %a, splat (i16 3) + %a2 = bitcast <14 x i16> %a1 to <7 x float> + br label %end + +cmp.false: + %a3 = bitcast <14 x i16> %a to <7 x float> + br label %end + +end: + %phi = phi <7 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x float> %phi +} + define <14 x half> @bitcast_v7f32_to_v14f16(<7 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f32_to_v14f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v5 -; GCN-NEXT: v_mov_b32_e32 v18, v4 -; GCN-NEXT: v_mov_b32_e32 v17, v3 -; GCN-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NEXT: v_mov_b32_e32 v15, v1 -; GCN-NEXT: v_mov_b32_e32 v14, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v14 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: 
v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f32_to_v14f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 
16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: 
v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v14f16: ; VI: ; %bb.0: @@ -1365,125 +2760,306 @@ end: ret <14 x half> %phi } +define inreg <14 x half> @bitcast_v7f32_to_v14f16_scalar(<7 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f32_to_v14f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s23, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v7f32_to_v14f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_4 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_3: +; VI-NEXT: s_branch .LBB17_2 +; VI-NEXT: .LBB17_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; 
VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f32_to_v14f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_3: +; GFX9-NEXT: s_branch .LBB17_2 +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f32_to_v14f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_4 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, 
s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: +; GFX11-NEXT: s_branch .LBB17_2 +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <7 x float> %a1 to <14 x half> + br label %end + +cmp.false: + %a3 = bitcast <7 x float> %a to <14 x half> + br label %end + +end: + %phi = phi <14 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x half> %phi +} + define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f16_to_v7f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v12 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: 
.LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GCN-NEXT: v_or_b32_e32 v0, v20, v0 -; GCN-NEXT: v_or_b32_e32 v1, v18, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v15, v3 -; GCN-NEXT: v_or_b32_e32 v4, v14, v4 -; GCN-NEXT: v_or_b32_e32 v5, v8, v5 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 
-; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v10, v6 -; GCN-NEXT: v_or_b32_e32 v4, v13, v12 -; GCN-NEXT: v_or_b32_e32 v5, v8, v11 -; GCN-NEXT: v_or_b32_e32 v6, v7, v9 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f16_to_v7f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v7 +; SI-NEXT: v_cvt_f16_f32_e32 
v17, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v2, v19, v2 +; SI-NEXT: v_or_b32_e32 v3, v17, v3 +; SI-NEXT: v_or_b32_e32 v4, v15, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v7f32: ; VI: ; %bb.0: @@ -1492,7 +3068,7 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 0x200 ; VI-NEXT: v_add_f16_sdwa v8, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1516,7 +3092,7 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v8 ; VI-NEXT: v_or_b32_e32 v0, v0, v7 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1548,7 +3124,7 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] @@ -1557,7 +3133,7 @@ define <7 x float> @bitcast_v14f16_to_v7f32(<14 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1577,110 +3153,342 @@ end: ret <7 x float> 
%phi } +define inreg <7 x float> @bitcast_v14f16_to_v7f32_scalar(<14 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f16_to_v7f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v2, v15, v2 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_or_b32_e32 v5, v9, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v14f16_to_v7f32_scalar: +; 
VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v7, v1 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: v_add_f16_sdwa v7, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, 
v0, v7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f16_to_v7f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f16_to_v7f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; 
%cmp.true +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_mov_b32_e32 v6, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x half> %a, splat (half 0xH0200) + %a2 = bitcast <14 x half> %a1 to <7 x float> + br label %end + +cmp.false: + %a3 = bitcast <14 x half> %a to <7 x float> + br label %end + +end: + %phi = phi <7 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x float> %phi +} + define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i16_to_v14f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v27, v13 -; GCN-NEXT: v_mov_b32_e32 v26, v12 -; GCN-NEXT: v_mov_b32_e32 v25, v11 -; GCN-NEXT: v_mov_b32_e32 v24, v10 -; GCN-NEXT: v_mov_b32_e32 v23, v9 -; GCN-NEXT: v_mov_b32_e32 v22, v8 -; GCN-NEXT: v_mov_b32_e32 v21, v7 -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v5 -; GCN-NEXT: v_mov_b32_e32 v18, v4 -; GCN-NEXT: v_mov_b32_e32 v17, v3 -; GCN-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NEXT: v_mov_b32_e32 v15, v1 -; GCN-NEXT: v_mov_b32_e32 v28, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; 
GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 
v13, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i16_to_v14f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; 
SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 +; SI-NEXT: 
v_add_i32_e32 v10, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i16_to_v14f16: ; VI: ; %bb.0: @@ -1689,7 +3497,7 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v7, 3 ; VI-NEXT: v_add_u16_sdwa v8, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1713,7 +3521,7 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v10 ; VI-NEXT: v_or_b32_e32 v1, v1, v9 ; VI-NEXT: v_or_b32_e32 v0, v0, v8 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1744,7 +3552,7 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; 
GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] @@ -1753,7 +3561,7 @@ define <14 x half> @bitcast_v14i16_to_v14f16(<14 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1773,92 +3581,293 @@ end: ret <14 x half> %phi } +define inreg <14 x half> @bitcast_v14i16_to_v14f16_scalar(<14 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i16_to_v14f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; 
SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v14i16_to_v14f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: 
s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v14i16_to_v14f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14i16_to_v14f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i16> %a, splat (i16 3) + %a2 = bitcast <14 x i16> %a1 to <14 x half> + br label %end + +cmp.false: + %a3 = bitcast 
<14 x i16> %a to <14 x half> + br label %end + +end: + %phi = phi <14 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x half> %phi +} + define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f16_to_v14i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: 
v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v14 -; GCN-NEXT: v_or_b32_e32 v10, v10, v15 -; GCN-NEXT: v_or_b32_e32 v6, v6, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v17 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f16_to_v14i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; 
SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f16_to_v14i16: ; VI: ; %bb.0: @@ -1867,7 +3876,7 @@ define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 0x200 ; VI-NEXT: v_add_f16_e32 v7, 0x200, v0 @@ -1891,7 +3900,7 @@ define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v10, v2 ; VI-NEXT: v_or_b32_e32 v1, v9, v1 ; VI-NEXT: v_or_b32_e32 v0, v7, v0 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1923,7 +3932,7 @@ define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] @@ -1932,7 +3941,7 @@ define <14 x i16> @bitcast_v14f16_to_v14i16(<14 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1951,3 +3960,229 @@ end: %phi = phi <14 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <14 x i16> %phi } + +define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f16_to_v14i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, 
v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: 
v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v14f16_to_v14i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s23, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_e32 v7, s16, v0 +; VI-NEXT: v_add_f16_sdwa v8, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v9, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v10, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v11, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v12, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_add_f16_sdwa v13, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 
s22, v0 +; VI-NEXT: v_add_f16_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v0 +; VI-NEXT: v_or_b32_e32 v5, v5, v13 +; VI-NEXT: v_or_b32_e32 v4, v4, v12 +; VI-NEXT: v_or_b32_e32 v3, v3, v11 +; VI-NEXT: v_or_b32_e32 v2, v2, v10 +; VI-NEXT: v_or_b32_e32 v1, v1, v9 +; VI-NEXT: v_or_b32_e32 v0, v7, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f16_to_v14i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_v14f16_to_v14i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x half> %a, splat (half 0xH0200) + %a2 = bitcast <14 x half> %a1 to <14 x i16> + br label %end + +cmp.false: + %a3 = bitcast <14 x half> %a to <14 x i16> + br label %end + +end: + %phi = phi <14 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index cc32c19b267bf..4a52cb9f6459a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -1,32 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <8 x float> @bitcast_v8i32_to_v8f32(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; 
GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i32_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v8f32: ; VI: ; %bb.0: @@ -106,27 +105,158 @@ end: ret <8 x float> %phi } +define inreg <8 x float> @bitcast_v8i32_to_v8f32_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v8i32_to_v8f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v8i32_to_v8f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + define <8 x i32> @bitcast_v8f32_to_v8i32(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: 
v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f32_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v8i32: ; VI: ; %bb.0: @@ -201,27 +331,165 @@ end: ret <8 x i32> %phi } +define inreg <8 x i32> @bitcast_v8f32_to_v8i32_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: 
v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v8i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: 
v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v8i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v7, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i32_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v4i64: ; VI: ; %bb.0: @@ -271,7 +539,7 @@ define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 @@ -281,7 +549,7 @@ define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: 
v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -301,27 +569,158 @@ end: ret <4 x i64> %phi } +define inreg <4 x i64> @bitcast_v8i32_to_v4i64_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v8i32_to_v4i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 
+; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v8i32_to_v4i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i64_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, 
v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v8i32: ; VI: ; %bb.0: @@ -371,7 +770,7 @@ define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -383,7 +782,7 @@ define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -403,27 +802,158 @@ end: ret <8 x i32> %phi } +define inreg <8 x i32> @bitcast_v4i64_to_v8i32_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 
+; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v4i64_to_v8i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v4i64_to_v8i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i32_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v4f64: ; VI: ; %bb.0: @@ -473,7 +1003,7 @@ define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 @@ -483,7 +1013,7 @@ define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: 
v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -503,23 +1033,154 @@ end: ret <4 x double> %phi } +define inreg <4 x double> @bitcast_v8i32_to_v4f64_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v8i32_to_v4f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: 
v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v8i32_to_v4f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: 
.LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f64_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: 
s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v8i32: ; VI: ; %bb.0: @@ -528,13 +1189,13 @@ define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -545,13 +1206,13 @@ define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -563,13 +1224,13 @@ define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -589,62 +1250,184 @@ end: ret <8 x i32> %phi } +define inreg <8 x i32> @bitcast_v4f64_to_v8i32_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_4 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_3: +; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v8i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
bitcast_v4f64_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v8i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: 
v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i32_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 
3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v16i16: ; VI: ; %bb.0: @@ -694,7 +1477,7 @@ define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 @@ -704,7 +1487,7 @@ define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB6_2: ; %end +; GFX11-NEXT: .LBB12_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -724,104 +1507,272 @@ end: ret <16 x i16> %phi } +define inreg <16 x i16> @bitcast_v8i32_to_v16i16_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: 
v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: s_lshr_b32 s7, s21, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: s_lshr_b32 s7, s21, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 
+; +; VI-LABEL: bitcast_v8i32_to_v16i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: 
bitcast_v8i32_to_v16i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v4 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; GCN-NEXT: 
v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v1, v1, v23 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v16 -; GCN-NEXT: v_or_b32_e32 v3, v3, v21 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: 
v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i16_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: 
v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v21, v4 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i16_to_v8i32: ; VI: ; %bb.0: @@ -830,7 +1781,7 @@ define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 3 ; VI-NEXT: v_add_u16_e32 v8, 3, v7 @@ -857,7 +1808,7 @@ define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v8, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v8, v0 -; 
VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -889,7 +1840,7 @@ define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] @@ -899,7 +1850,7 @@ define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -919,130 +1870,363 @@ end: ret <8 x i32> %phi } -define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v7 -; GCN-NEXT: v_mov_b32_e32 v18, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v5 -; GCN-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v22, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: 
$vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v23 -; GCN-NEXT: 
v_add_i32_e32 v2, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <8 x i32> @bitcast_v16i16_to_v8i32_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; 
SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v9 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 
s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v8i32_to_v16f16: +; VI-LABEL: bitcast_v16i16_to_v8i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, 
s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v8i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> 
%a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + +define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v8i32_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; 
SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, 
v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i32_to_v16f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: ; %bb.2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1076,7 +2260,7 @@ define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 @@ -1086,7 +2270,7 @@ define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: .LBB16_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1106,139 +2290,327 @@ end: ret <16 x half> %phi } +define inreg <16 x half> 
@bitcast_v8i32_to_v16f16_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: 
v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v8i32_to_v16f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: 
.LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v8i32_to_v16f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; 
GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: v_or_b32_e32 v0, v25, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; 
GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v19, v2 -; GCN-NEXT: v_or_b32_e32 v3, v17, v3 -; GCN-NEXT: v_or_b32_e32 v4, v16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v11, v12 -; GCN-NEXT: v_or_b32_e32 v6, v9, v13 -; GCN-NEXT: v_or_b32_e32 v7, v8, v10 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f16_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v21, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v17, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: 
; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: 
v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v8i32: ; VI: ; %bb.0: @@ -1247,7 +2619,7 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 0x200 ; VI-NEXT: v_add_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1274,7 +2646,7 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v9 ; VI-NEXT: v_or_b32_e32 v0, v0, v8 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1307,7 +2679,7 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; 
GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] @@ -1317,7 +2689,7 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1337,98 +2709,352 @@ end: ret <8 x i32> %phi } +define inreg <8 x i32> @bitcast_v16f16_to_v8i32_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: 
v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; 
SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v16f16_to_v8i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_add_f16_sdwa v8, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-LABEL: bitcast_v16f16_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v8i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 
v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v7 -; GCN-NEXT: v_mov_b32_e32 v22, v6 -; GCN-NEXT: v_mov_b32_e32 v21, v5 -; GCN-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NEXT: v_mov_b32_e32 v19, v3 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: 
; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: 
v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i32_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 
exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i32_to_v16bf16: ; VI: ; %bb.0: @@ -1478,7 +3104,7 @@ define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 @@ -1488,7 +3114,7 @@ define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1508,123 +3134,311 @@ end: ret <16 x bfloat> %phi } +define inreg <16 x bfloat> @bitcast_v8i32_to_v16bf16_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_and_b32 s8, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s22, 16 +; SI-NEXT: s_and_b32 s10, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s20, 16 +; SI-NEXT: s_and_b32 s14, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s19, 16 +; SI-NEXT: s_and_b32 s24, s18, 0xffff0000 +; SI-NEXT: 
s_lshl_b32 s25, s18, 16 +; SI-NEXT: s_and_b32 s26, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s17, 16 +; SI-NEXT: s_and_b32 s28, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_and_b32 s8, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s22, 16 +; SI-NEXT: s_and_b32 s10, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s20, 16 +; SI-NEXT: s_and_b32 s14, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s19, 16 +; SI-NEXT: s_and_b32 s24, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s18, 16 +; SI-NEXT: s_and_b32 s26, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s17, 16 +; SI-NEXT: s_and_b32 s28, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s16, 16 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s29 +; SI-NEXT: v_mov_b32_e32 v1, s28 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: v_mov_b32_e32 v3, s26 +; SI-NEXT: v_mov_b32_e32 v4, s25 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v8, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v12, s9 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v14, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 
+; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v8i32_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v8i32_to_v16bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB21_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: s_branch .LBB21_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_4 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB11_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_alignbit_b32 v0, v0, v26, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v24, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; 
GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: .LBB11_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; 
GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v13, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16bf16_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_4 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: 
s_setpc_b64 s[30:31] +; SI-NEXT: .LBB22_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: .LBB22_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, 
v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v8i32: ; VI: ; %bb.0: @@ -1633,7 +3447,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; 
%cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -1780,7 +3594,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1791,7 +3605,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -1915,7 +3729,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v8, s7 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1927,7 +3741,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2071,7 +3885,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v11 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v14, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 
v0, 0xffff, v12, v0 -; GFX11-TRUE16-NEXT: .LBB11_2: ; %end +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2083,7 +3897,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v7 @@ -2214,7 +4028,7 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v13, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v12, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB11_2: ; %end +; GFX11-FAKE16-NEXT: .LBB22_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2234,222 +4048,875 @@ end: ret <8 x i32> %phi } -define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i32_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v20, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; 
implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: .LBB12_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; 
GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: .LBB12_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 +; 
SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 
16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB23_2 ; -; VI-LABEL: bitcast_v8i32_to_v32i8: +; VI-LABEL: bitcast_v16bf16_to_v8i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: ; implicit-def: $vgpr5 -; 
VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, 
v[0:1] -; VI-NEXT: .LBB12_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v35 -; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 -; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 -; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: .LBB12_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v8, v34 -; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: 
bitcast_v8i32_to_v32i8: +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: 
v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: 
v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; 
GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v1 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v9, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; 
GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v8i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s8, s7, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; 
GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3 +; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8 +; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2 +; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1 +; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <8 x i32> + br label %end + +end: + %phi 
= phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + +define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v8i32_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: 
v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: .LBB24_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: 
v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: .LBB24_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i32_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 
v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v35 +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 
v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v8, v34 +; VI-NEXT: v_mov_b32_e32 v12, v35 +; VI-NEXT: v_mov_b32_e32 v16, v32 +; VI-NEXT: v_mov_b32_e32 v20, v33 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v28, v7 +; VI-NEXT: v_mov_b32_e32 v1, v38 +; VI-NEXT: v_mov_b32_e32 v6, v37 +; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i32_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v33, v5 @@ -2483,7 +4950,7 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -2509,9 +4976,9 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: .LBB24_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: s_cbranch_execz .LBB24_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 @@ -2545,7 +5012,7 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: .LBB24_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: 
v_mov_b32_e32 v8, v34 @@ -2585,7 +5052,7 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[32:33] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[26:27] @@ -2603,9 +5070,9 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[18:19] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 3, v33 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v32 @@ -2631,7 +5098,7 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB12_4: ; %end +; GFX11-TRUE16-NEXT: .LBB24_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -2689,7 +5156,7 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -2715,9 +5182,9 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 3, v39 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 3, v37 @@ -2751,7 +5218,7 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB12_4: ; %end +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 @@ -2779,228 +5246,856 @@ end: ret <32 x i8> %phi } +define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i32_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 +; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: 
v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s23, 24 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 8 +; SI-NEXT: s_lshr_b32 s8, s21, 24 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 8 +; SI-NEXT: s_lshr_b32 s15, s17, 24 +; SI-NEXT: s_lshr_b32 s24, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 +; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s23, 24 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 8 +; SI-NEXT: s_lshr_b32 s8, s21, 24 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 8 +; SI-NEXT: s_lshr_b32 s15, s17, 24 +; SI-NEXT: s_lshr_b32 s24, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 8 
+; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s25 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v13, s14 +; SI-NEXT: v_mov_b32_e32 v14, s13 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v21, s12 +; SI-NEXT: v_mov_b32_e32 v22, s10 +; SI-NEXT: v_mov_b32_e32 v23, s8 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v28, s23 +; SI-NEXT: v_mov_b32_e32 v29, s9 +; SI-NEXT: v_mov_b32_e32 v30, s7 +; SI-NEXT: v_mov_b32_e32 v31, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v8i32_to_v32i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s24, s23, 8 +; VI-NEXT: 
s_lshr_b32 s25, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s21, 8 +; VI-NEXT: s_lshr_b32 s40, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 8 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s24, s23, 8 +; VI-NEXT: s_lshr_b32 s25, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s21, 8 +; VI-NEXT: s_lshr_b32 s40, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 8 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, 
s16, 8 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s59 +; VI-NEXT: v_mov_b32_e32 v2, s58 +; VI-NEXT: v_mov_b32_e32 v3, s10 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s57 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s47 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s46 +; VI-NEXT: v_mov_b32_e32 v10, s45 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s44 +; VI-NEXT: v_mov_b32_e32 v14, s43 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s41 +; VI-NEXT: v_mov_b32_e32 v18, s40 +; VI-NEXT: v_mov_b32_e32 v19, s6 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v21, s29 +; VI-NEXT: v_mov_b32_e32 v22, s28 +; VI-NEXT: v_mov_b32_e32 v23, s27 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s26 +; VI-NEXT: v_mov_b32_e32 v26, s25 +; VI-NEXT: v_mov_b32_e32 v27, s4 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v29, s24 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v31, s14 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: 
; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v8i32_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s24, s23, 8 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s29, s21, 8 +; GFX9-NEXT: s_lshr_b32 s40, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s44, s19, 8 +; GFX9-NEXT: s_lshr_b32 s45, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s17, 8 +; GFX9-NEXT: s_lshr_b32 s58, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s24, s23, 8 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; 
GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s29, s21, 8 +; GFX9-NEXT: s_lshr_b32 s40, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s44, s19, 8 +; GFX9-NEXT: s_lshr_b32 s45, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s17, 8 +; GFX9-NEXT: s_lshr_b32 s58, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s59 +; GFX9-NEXT: v_mov_b32_e32 v2, s58 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s57 +; GFX9-NEXT: v_mov_b32_e32 v6, s56 +; GFX9-NEXT: v_mov_b32_e32 v7, s47 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s46 +; GFX9-NEXT: v_mov_b32_e32 v10, s45 +; GFX9-NEXT: v_mov_b32_e32 v11, s8 +; GFX9-NEXT: v_mov_b32_e32 v12, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s44 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v15, s42 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s41 +; GFX9-NEXT: v_mov_b32_e32 v18, s40 +; GFX9-NEXT: v_mov_b32_e32 v19, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s21 +; GFX9-NEXT: v_mov_b32_e32 v21, s29 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s25 +; GFX9-NEXT: v_mov_b32_e32 v27, s4 +; GFX9-NEXT: v_mov_b32_e32 v28, s23 +; GFX9-NEXT: v_mov_b32_e32 v29, s24 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; 
implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-TRUE16-LABEL: bitcast_v8i32_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 8 +; 
GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-TRUE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-TRUE16-NEXT: .LBB25_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s44 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB25_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB25_2 +; +; GFX11-FAKE16-LABEL: bitcast_v8i32_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-FAKE16-NEXT: 
s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-FAKE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: .LBB25_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s45 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s44 :: v_dual_mov_b32 v3, s10 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s29 :: v_dual_mov_b32 v11, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v13, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s27 :: v_dual_mov_b32 v15, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s17 :: v_dual_mov_b32 v21, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s15 :: v_dual_mov_b32 v27, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s19 :: v_dual_mov_b32 v29, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB25_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: s_branch .LBB25_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v8i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_3 -; GCN-NEXT: ; %bb.1: ; 
%Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_4 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB13_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v0, v0, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v12, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; 
GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: .LBB13_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, 
vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v0, v37, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v39, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v48, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v49, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v21, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v23, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v25, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_add_i32_e32 v2, 
vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x300, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i8_to_v8i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: 
v_lshlrev_b32_e32 v52, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_4 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB26_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; 
SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: 
$vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: .LBB26_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, 
vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v15, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; 
SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v8i32: ; VI: ; %bb.0: @@ -3033,14 +6128,14 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: s_cbranch_execnz .LBB26_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB13_4 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB26_4 +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB13_3: ; %cmp.false +; VI-NEXT: .LBB26_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3098,8 +6193,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: .LBB13_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: .LBB26_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3191,14 +6286,14 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; 
GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: s_cbranch_execnz .LBB26_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB13_4 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB26_4 +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB13_3: ; %cmp.false +; GFX9-NEXT: .LBB26_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3256,8 +6351,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: .LBB13_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: .LBB26_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3357,14 +6452,14 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-TRUE16-NEXT: .LBB13_2: ; %end +; 
GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h @@ -3456,8 +6551,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-TRUE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 @@ -3582,14 +6677,14 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-FAKE16-NEXT: .LBB13_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -3679,8 +6774,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { 
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-FAKE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -3788,27 +6883,951 @@ end: ret <8 x i32> %phi } +define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v8i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v17 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, 
v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; 
SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v21 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v32i8_to_v8i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v4 +; VI-NEXT: v_mov_b32_e32 v20, v2 
+; VI-NEXT: v_mov_b32_e32 v19, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 
s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v19 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v12 
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v8i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; 
%bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, 
s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: 
s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v19 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v8i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-TRUE16-NEXT: 
; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 
0xff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-TRUE16-NEXT: .LBB27_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; 
GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 
0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB27_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB27_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB27_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v8i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v17, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v15, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; 
GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v6, v6, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v20 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-FAKE16-NEXT: .LBB27_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; 
GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v17 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v18 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v14, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v15 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: .LBB27_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB27_4: +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB27_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + define <4 x i64> @bitcast_v8f32_to_v4i64(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f32_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: 
s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v4i64: ; VI: ; %bb.0: @@ -3883,27 +7902,165 @@ end: ret <4 x i64> %phi } +define inreg <4 x i64> @bitcast_v8f32_to_v4i64_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB29_4 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_3: +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v4i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, 
s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v4i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v7, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; 
GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i64_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v8f32: ; VI: ; %bb.0: @@ -3953,7 +8110,7 @@ define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3965,7 +8122,7 @@ define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB15_2: ; %end +; GFX11-NEXT: .LBB30_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3985,27 +8142,158 @@ end: ret <8 x float> %phi } +define inreg <8 x float> @bitcast_v4i64_to_v8f32_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: 
s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v4i64_to_v8f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: 
s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-LABEL: bitcast_v4i64_to_v8f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB31_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <8 x float> + br label %end + +end: + %phi 
= phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + define <4 x double> @bitcast_v8f32_to_v4f64(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f32_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v4f64: ; VI: ; %bb.0: @@ -4080,23 +8368,161 @@ end: ret <4 x double> %phi } +define inreg <4 x double> @bitcast_v8f32_to_v4f64_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; 
SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_3: +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v4f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v4f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v7, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: 
v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f64_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v8f32: ; VI: ; %bb.0: @@ -4105,13 +8531,13 @@ define <8 x float> 
@bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4122,13 +8548,13 @@ define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4140,13 +8566,13 @@ define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: .LBB34_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4166,62 +8592,184 @@ end: ret <8 x float> %phi } +define inreg <8 x 
float> @bitcast_v4f64_to_v8f32_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB35_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB35_4 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_3: +; SI-NEXT: s_branch .LBB35_2 +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v8f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: 
s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v8f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 
1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + define <16 x i16> @bitcast_v8f32_to_v16i16(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; 
GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f32_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; 
SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB36_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16i16: ; VI: ; %bb.0: @@ -4296,117 +8844,288 @@ end: ret <16 x i16> %phi } -define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v4 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_4 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB19_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v1, v1, v23 -; 
GCN-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v16 -; GCN-NEXT: v_or_b32_e32 v3, v3, v21 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: .LBB19_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: 
v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x i16> @bitcast_v8f32_to_v16i16_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: 
v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v16i16_to_v8f32: +; VI-LABEL: bitcast_v8f32_to_v16i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v9, 3 -; VI-NEXT: v_add_u16_e32 v8, 3, v7 -; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 
1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_3: +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v16i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: 
s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v7, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + +define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v16i16_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_4 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB38_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: .LBB38_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v21, v4 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v9, 3 +; VI-NEXT: v_add_u16_e32 v8, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v8, v7 ; VI-NEXT: v_add_u16_e32 v8, 3, v6 ; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4429,7 +9148,7 @@ define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v8, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v8, v0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4461,7 +9180,7 @@ define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] @@ -4471,7 +9190,7 @@ define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4491,114 +9210,347 @@ end: ret <8 x float> %phi } +define inreg <8 x float> @bitcast_v16i16_to_v8f32_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: 
s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v9 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: 
v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v16i16_to_v8f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; 
VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v8f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, 
s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v7 -; GCN-NEXT: v_mov_b32_e32 v18, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v5 -; GCN-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v22, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; 
GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_4 -; GCN-NEXT: .LBB20_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB20_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 -; GCN-NEXT: ; 
implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: .LBB20_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f32_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v7 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v5 +; SI-NEXT: v_mov_b32_e32 
v19, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_4 +; SI-NEXT: .LBB40_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB40_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: .LBB40_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16f16: ; VI: ; %bb.0: @@ -4673,139 +9625,333 @@ end: ret <16 x half> %phi } +define inreg <16 x half> @bitcast_v8f32_to_v16f16_scalar(<8 x float> inreg %a, i32 
inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: 
v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v8f32_to_v16f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: +; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: .LBB41_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; 
VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: +; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v16f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v7, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 
v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: 
s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: v_or_b32_e32 v0, v25, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v19, v2 -; GCN-NEXT: v_or_b32_e32 v3, v17, v3 -; GCN-NEXT: v_or_b32_e32 v4, v16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 
v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v11, v12 -; GCN-NEXT: v_or_b32_e32 v6, v9, v13 -; GCN-NEXT: v_or_b32_e32 v7, v8, v10 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f16_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v21, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, 
v17, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, 
v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v8f32: ; VI: ; %bb.0: @@ -4814,7 +9960,7 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 0x200 ; VI-NEXT: v_add_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4841,7 +9987,7 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; 
VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v9 ; VI-NEXT: v_or_b32_e32 v0, v0, v8 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4874,7 +10020,7 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] @@ -4884,7 +10030,7 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: .LBB42_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4904,98 +10050,352 @@ end: ret <8 x float> %phi } +define inreg <8 x float> @bitcast_v16f16_to_v8f32_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: 
v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v16f16_to_v8f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_4 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; 
VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_add_f16_sdwa v8, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_3: +; VI-NEXT: s_branch .LBB43_2 +; 
VI-NEXT: .LBB43_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v8f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; 
GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v7 -; GCN-NEXT: v_mov_b32_e32 v22, v6 -; GCN-NEXT: v_mov_b32_e32 v21, v5 -; GCN-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NEXT: v_mov_b32_e32 v19, v3 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: 
$vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v16 -; GCN-NEXT: 
v_add_f32_e32 v1, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v23 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f32_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: 
$vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v23 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: 
v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16bf16: ; VI: ; %bb.0: @@ -5070,123 +10470,318 @@ end: ret <16 x bfloat> %phi } +define inreg <16 x bfloat> @bitcast_v8f32_to_v16bf16_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_and_b32 s8, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s22, 16 +; SI-NEXT: s_and_b32 s10, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s20, 16 +; SI-NEXT: s_and_b32 s14, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s19, 16 +; SI-NEXT: s_and_b32 s24, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s18, 16 +; SI-NEXT: s_and_b32 s26, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s17, 16 +; SI-NEXT: s_and_b32 s28, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB45_4 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: 
v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_3: +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB45_2 +; SI-NEXT: .LBB45_4: +; SI-NEXT: v_mov_b32_e32 v0, s29 +; SI-NEXT: v_mov_b32_e32 v1, s28 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: v_mov_b32_e32 v3, s26 +; SI-NEXT: v_mov_b32_e32 v4, s25 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v8, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v12, s9 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v14, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: 
s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_4 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_3: +; VI-NEXT: s_branch .LBB45_2 +; VI-NEXT: .LBB45_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_4 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_3: +; GFX9-NEXT: s_branch .LBB45_2 +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, 
s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v16bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v7, s7, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s6, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s5, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s3, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: 
v_mul_f32_e32 v25, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_alignbit_b32 v0, v0, v26, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v24, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; 
implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; 
GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v13, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16bf16_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; 
%cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, 
v23 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v8f32: ; VI: ; %bb.0: @@ -5195,7 +10790,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, 
v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -5342,7 +10937,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5353,7 +10948,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -5477,7 +11072,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v8, s7 -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5489,7 +11084,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5633,7 +11228,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v11 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v14, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v12, v0 -; 
GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5645,7 +11240,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v7 @@ -5776,7 +11371,7 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v13, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v12, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5796,179 +11391,832 @@ end: ret <8 x float> %phi } -define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f32_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v20, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; 
GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 
-; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 +; SI-NEXT: 
v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, 
v16 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB47_2 ; -; VI-LABEL: bitcast_v8f32_to_v32i8: +; VI-LABEL: bitcast_v16bf16_to_v8f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; 
implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: .LBB24_2: ; %Flow -; VI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: v_add_f32_e32 v35, 1.0, v35 -; VI-NEXT: v_add_f32_e32 v34, 1.0, v34 +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, 
v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, 
v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, 
s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v1 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v9, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: 
v_lshl_or_b32 v0, v9, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v8f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s8, s7, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000 +; 
GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3 +; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8 +; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2 +; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: 
v_add_nc_u32_e32 v3, v3, v9 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1 +; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9 +; 
GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label 
%cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + +define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v8f32_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, 
v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: 
v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 
v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v35, 1.0, v35 +; VI-NEXT: v_add_f32_e32 v34, 1.0, v34 ; VI-NEXT: v_add_f32_e32 v33, 1.0, v33 ; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 @@ -5997,7 +12245,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 @@ -6045,7 +12293,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ 
-6071,9 +12319,9 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 @@ -6107,7 +12355,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -6147,7 +12395,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[32:33] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[26:27] @@ -6165,9 +12413,9 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[18:19] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 
v32, 1.0, v32 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 1.0, v33 :: v_dual_add_f32 v10, 1.0, v10 @@ -6191,7 +12439,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB24_4: ; %end +; GFX11-TRUE16-NEXT: .LBB48_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -6249,7 +12497,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -6275,9 +12523,9 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v39, 1.0, v39 :: v_dual_add_f32 v32, 1.0, v32 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 1.0, v37 :: v_dual_add_f32 v34, 1.0, v34 @@ -6309,7 +12557,7 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: 
.LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 @@ -6337,268 +12585,937 @@ end: ret <32 x i8> %phi } -define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v8f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: 
v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v0, v0, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v12, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: 
v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 -; GCN-NEXT: 
v_add_i32_e32 v12, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v0, v37, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v39, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v48, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v49, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v21, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v23, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v25, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_add_i32_e32 v10, 
vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x300, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f32_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB49_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 +; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: 
v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s15, s23, 24 +; SI-NEXT: s_lshr_b32 s24, s23, 16 +; SI-NEXT: s_lshr_b32 s25, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s9, s19, 24 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB49_4 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s22, 1.0 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 
+; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: .LBB49_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v28, s23 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v13, s11 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v21, s14 +; SI-NEXT: v_mov_b32_e32 v22, s13 +; SI-NEXT: v_mov_b32_e32 v23, s12 +; SI-NEXT: v_mov_b32_e32 v29, s25 +; SI-NEXT: v_mov_b32_e32 v30, s24 +; SI-NEXT: v_mov_b32_e32 v31, s15 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v32i8_to_v8f32: +; VI-LABEL: bitcast_v8f32_to_v32i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v32, v2 -; 
VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; VI-NEXT: v_mov_b32_e32 v34, v6 -; VI-NEXT: v_mov_b32_e32 v33, v4 -; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s25, s23, 8 +; VI-NEXT: s_lshr_b32 s24, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s40, s21, 8 +; VI-NEXT: s_lshr_b32 s29, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s45, s19, 8 +; VI-NEXT: s_lshr_b32 s44, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 8 +; VI-NEXT: s_lshr_b32 s57, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz 
.LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v17, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v16, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v25, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v24, s22, 1.0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; VI-NEXT: s_branch .LBB49_5 +; VI-NEXT: .LBB49_3: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; 
implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v35, s59 +; VI-NEXT: v_mov_b32_e32 v2, s57 +; VI-NEXT: v_mov_b32_e32 v5, s58 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s47 +; VI-NEXT: v_mov_b32_e32 v34, s46 +; VI-NEXT: v_mov_b32_e32 v10, s44 +; VI-NEXT: v_mov_b32_e32 v13, s45 +; VI-NEXT: v_mov_b32_e32 v14, s43 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v33, s41 +; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v21, s40 +; VI-NEXT: v_mov_b32_e32 v22, s28 +; VI-NEXT: v_mov_b32_e32 v23, s27 +; VI-NEXT: v_mov_b32_e32 v32, s26 +; VI-NEXT: v_mov_b32_e32 v26, s24 +; VI-NEXT: v_mov_b32_e32 v29, s25 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v31, s14 +; VI-NEXT: v_mov_b32_e32 v27, s10 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB49_5: ; %end +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v12, v9 +; VI-NEXT: v_mov_b32_e32 v20, v17 +; VI-NEXT: v_mov_b32_e32 v28, v25 +; VI-NEXT: v_mov_b32_e32 v1, v35 +; VI-NEXT: v_mov_b32_e32 v9, v34 +; VI-NEXT: v_mov_b32_e32 v17, v33 +; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 
+; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s23, 8 +; GFX9-NEXT: s_lshr_b32 s24, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 8 +; GFX9-NEXT: s_lshr_b32 s29, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s45, s19, 8 +; GFX9-NEXT: s_lshr_b32 s44, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 8 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v17, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v16, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v25, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v24, s22, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; GFX9-NEXT: s_branch .LBB49_5 +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v35, s59 +; GFX9-NEXT: v_mov_b32_e32 v2, s57 +; GFX9-NEXT: v_mov_b32_e32 v5, s58 +; GFX9-NEXT: v_mov_b32_e32 v6, s56 +; GFX9-NEXT: v_mov_b32_e32 v7, s47 +; GFX9-NEXT: v_mov_b32_e32 v34, s46 +; 
GFX9-NEXT: v_mov_b32_e32 v10, s44 +; GFX9-NEXT: v_mov_b32_e32 v13, s45 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v15, s42 +; GFX9-NEXT: v_mov_b32_e32 v33, s41 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-NEXT: v_mov_b32_e32 v32, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s24 +; GFX9-NEXT: v_mov_b32_e32 v29, s25 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: v_mov_b32_e32 v27, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB49_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-NEXT: v_mov_b32_e32 v28, v25 +; GFX9-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-NEXT: v_mov_b32_e32 v17, v33 +; GFX9-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v8f32_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 
s41, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v20, s17, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v28, s19, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v27, s18, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, s16, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s0, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB49_5 +; GFX11-TRUE16-NEXT: .LBB49_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB49_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v8f32_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 
24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v39, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v37, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v35, s17, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v33, s19, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, s18, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, s16, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v36, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v38, s0, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v29, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-FAKE16-NEXT: s_branch .LBB49_5 +; GFX11-FAKE16-NEXT: .LBB49_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s0 :: v_dual_mov_b32 v39, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s2 :: v_dual_mov_b32 v37, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s16 :: v_dual_mov_b32 v35, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v33, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v2, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v6, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s42 :: v_dual_mov_b32 v10, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s41 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v18, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v22, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v26, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s25 :: v_dual_mov_b32 v30, s14 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v23, s22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v25, s21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v29, s20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, s13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v27, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v19, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB49_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 
to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + +define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v32i8_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: 
v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 
16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v15, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: 
v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v2 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: s_cbranch_execnz .LBB50_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB25_4 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB50_4 +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB25_3: ; %cmp.false +; VI-NEXT: .LBB50_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6656,8 +13573,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 -; VI-NEXT: .LBB25_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: .LBB50_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6749,14 +13666,14 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: s_cbranch_execnz .LBB50_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB25_4 -; GFX9-NEXT: .LBB25_2: ; %end +; 
GFX9-NEXT: s_cbranch_execnz .LBB50_4 +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB25_3: ; %cmp.false +; GFX9-NEXT: .LBB50_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6814,8 +13731,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 -; GFX9-NEXT: .LBB25_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: .LBB50_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6915,14 +13832,14 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 
0xff, v19.h @@ -7014,8 +13931,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-TRUE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 @@ -7140,14 +14057,14 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -7237,8 +14154,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-FAKE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: 
v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -7346,27 +14263,951 @@ end: ret <8 x float> %phi } +define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v8f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v17 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; 
SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: 
s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v21 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 
+; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v32i8_to_v8f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v4 +; VI-NEXT: v_mov_b32_e32 v20, v2 +; VI-NEXT: v_mov_b32_e32 v19, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: 
v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; 
VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 
s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v19 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, 
v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v8f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, 
v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; 
GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v19 +; GFX9-NEXT: 
s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v8f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 
s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: 
s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, 
v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: 
s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v8f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v17, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v15, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 
s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v20 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: 
s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 
s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v17 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v18 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v14, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v15 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <8 x float> + br label %end + +end: + %phi = phi <8 
x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i64_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v4f64: ; VI: ; %bb.0: @@ -7416,7 +15257,7 @@ define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -7428,7 +15269,7 @@ define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7448,23 +15289,153 @@ end: ret <4 x double> %phi } +define inreg <4 x double> @bitcast_v4i64_to_v4f64_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v4i64_to_v4f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: 
; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-LABEL: bitcast_v4i64_to_v4f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-NEXT: ; 
%bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: .LBB53_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f64_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v4i64: ; VI: ; %bb.0: @@ -7473,13 +15444,13 @@ define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7490,13 +15461,13 @@ define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7508,13 +15479,13 @@ define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7534,62 +15505,184 @@ end: ret <4 x i64> %phi } +define inreg <4 x i64> @bitcast_v4f64_to_v4i64_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB55_4 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_3: +; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v4i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: s_setpc_b64 
s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v4i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 
v[6:7], s[6:7], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: 
v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i64_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; 
SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v16i16: ; VI: ; %bb.0: @@ -7639,7 +15732,7 @@ define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -7651,7 +15744,7 @@ define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB28_2: ; %end 
+; GFX11-NEXT: .LBB56_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7671,117 +15764,285 @@ end: ret <16 x i16> %phi } -define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v4 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_4 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB29_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v1, v1, v23 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v16 -; GCN-NEXT: v_or_b32_e32 v3, v3, v21 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, 
v7, v15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: .LBB29_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 
0x30000, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x i16> @bitcast_v4i64_to_v16i16_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: s_lshr_b32 s7, s21, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: s_lshr_b32 s7, s21, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v8, 
s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB57_2 ; -; VI-LABEL: bitcast_v16i16_to_v4i64: +; VI-LABEL: bitcast_v4i64_to_v16i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v9, 3 -; VI-NEXT: v_add_u16_e32 v8, 3, v7 -; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v16i16_scalar: 
+; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_3 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB57_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: s_branch .LBB57_2 +; +; GFX11-LABEL: bitcast_v4i64_to_v16i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_3 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB57_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 
s[30:31] +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: s_branch .LBB57_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + +define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v16i16_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_4 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB58_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: 
v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: .LBB58_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v21, v4 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v4i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB58_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v9, 3 +; VI-NEXT: v_add_u16_e32 v8, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v8, v7 ; VI-NEXT: v_add_u16_e32 v8, 3, v6 ; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7804,7 +16065,7 @@ define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v8, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v8, v0 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7836,7 +16097,7 @@ define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] @@ -7846,7 +16107,7 @@ define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 
op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: .LBB58_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7866,114 +16127,347 @@ end: ret <4 x i64> %phi } +define inreg <4 x i64> @bitcast_v16i16_to_v4i64_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v9 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 
16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v16i16_to_v4i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: 
.LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v4i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 
op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v18, v7 -; GCN-NEXT: v_mov_b32_e32 v17, v6 -; GCN-NEXT: v_mov_b32_e32 v20, v5 -; GCN-NEXT: v_mov_b32_e32 v19, v4 -; GCN-NEXT: v_mov_b32_e32 v22, v3 -; GCN-NEXT: v_mov_b32_e32 v21, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_3 -; GCN-NEXT: ; 
%bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_4 -; GCN-NEXT: .LBB30_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB30_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_2 -; GCN-NEXT: .LBB30_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v18, 
vcc -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i64_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v7 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; 
SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_4 +; SI-NEXT: .LBB60_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB60_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: .LBB60_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v18 +; SI-NEXT: 
v_addc_u32_e32 v8, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v16f16: ; VI: ; %bb.0: @@ -8023,7 +16517,7 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-NEXT: s_cbranch_execz .LBB60_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -8035,7 +16529,7 @@ define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB30_2: ; %end +; GFX11-NEXT: .LBB60_2: ; %end ; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8055,139 +16549,327 @@ end: ret <16 x half> %phi } +define inreg <16 x half> @bitcast_v4i64_to_v16f16_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_add_u32 s8, s18, 3 +; SI-NEXT: s_addc_u32 s9, s19, 0 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_lshr_b32 s11, s9, 16 +; SI-NEXT: s_add_u32 s12, s20, 3 +; SI-NEXT: s_addc_u32 s13, s21, 0 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: s_lshr_b32 s15, s13, 16 +; SI-NEXT: s_add_u32 s16, s22, 3 +; SI-NEXT: s_addc_u32 s17, s23, 0 +; SI-NEXT: s_lshr_b32 s18, s16, 16 +; SI-NEXT: s_lshr_b32 s19, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 
v14, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v4i64_to_v16f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB61_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB61_3 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB61_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; 
VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB61_4: +; VI-NEXT: s_branch .LBB61_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB61_3 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB61_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: s_branch .LBB61_2 +; +; GFX11-LABEL: bitcast_v4i64_to_v16f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB61_3 +; GFX11-NEXT: .LBB61_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB61_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB61_4: +; GFX11-NEXT: s_branch .LBB61_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_4 -; GCN-NEXT: .LBB31_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: 
.LBB31_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: v_or_b32_e32 v0, v25, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v19, v2 -; GCN-NEXT: v_or_b32_e32 v3, v17, v3 -; GCN-NEXT: v_or_b32_e32 v4, v16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_2 -; GCN-NEXT: .LBB31_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; 
GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v11, v12 -; GCN-NEXT: v_or_b32_e32 v6, v9, v13 -; GCN-NEXT: v_or_b32_e32 
v7, v8, v10 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f16_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_4 +; SI-NEXT: .LBB62_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB62_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v21, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v17, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: 
$vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: .LBB62_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: 
v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v4i64: ; VI: ; %bb.0: @@ -8196,7 +16878,7 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB31_2 +; VI-NEXT: s_cbranch_execz .LBB62_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 0x200 ; VI-NEXT: v_add_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -8223,7 +16905,7 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v9 ; VI-NEXT: v_or_b32_e32 v0, v0, v8 -; VI-NEXT: .LBB31_2: ; %end +; 
VI-NEXT: .LBB62_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8256,7 +16938,7 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-NEXT: s_cbranch_execz .LBB62_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] @@ -8266,7 +16948,7 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB31_2: ; %end +; GFX11-NEXT: .LBB62_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8286,98 +16968,352 @@ end: ret <4 x i64> %phi } +define inreg <4 x i64> @bitcast_v16f16_to_v4i64_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; 
%cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: 
v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB63_2 +; +; VI-LABEL: bitcast_v16f16_to_v4i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB63_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB63_4 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 
s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_add_f16_sdwa v8, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_3: +; VI-NEXT: s_branch .LBB63_2 +; VI-NEXT: .LBB63_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: 
v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB63_4 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_3: +; GFX9-NEXT: s_branch .LBB63_2 +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v4i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX11-NEXT: .LBB63_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 
op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB63_3: +; GFX11-NEXT: s_branch .LBB63_2 +; GFX11-NEXT: .LBB63_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v7 -; GCN-NEXT: v_mov_b32_e32 v22, v6 -; GCN-NEXT: v_mov_b32_e32 v21, v5 -; GCN-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NEXT: v_mov_b32_e32 v19, v3 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v1 -; GCN-NEXT: v_mov_b32_e32 v16, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 
-; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_4 -; GCN-NEXT: .LBB32_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB32_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_2 -; GCN-NEXT: .LBB32_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; GCN-NEXT: 
v_addc_u32_e32 v5, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v23, vcc -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i64_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: 
s_cbranch_execnz .LBB64_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_4 +; SI-NEXT: .LBB64_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB64_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: .LBB64_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v23, vcc +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: 
v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i64_to_v16bf16: ; VI: ; %bb.0: @@ -8427,7 +17363,7 @@ define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-NEXT: s_cbranch_execz .LBB64_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -8439,7 +17375,7 @@ define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB32_2: ; %end +; GFX11-NEXT: .LBB64_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8459,123 +17395,311 @@ end: ret <16 x bfloat> %phi } +define inreg <16 x bfloat> @bitcast_v4i64_to_v16bf16_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_and_b32 s8, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 
s9, s22, 16 +; SI-NEXT: s_and_b32 s10, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s20, 16 +; SI-NEXT: s_and_b32 s14, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s19, 16 +; SI-NEXT: s_and_b32 s24, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s18, 16 +; SI-NEXT: s_and_b32 s26, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s17, 16 +; SI-NEXT: s_and_b32 s28, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_add_u32 s16, s18, 3 +; SI-NEXT: s_addc_u32 s15, s19, 0 +; SI-NEXT: s_add_u32 s13, s20, 3 +; SI-NEXT: s_addc_u32 s11, s21, 0 +; SI-NEXT: s_add_u32 s9, s22, 3 +; SI-NEXT: s_addc_u32 s7, s23, 0 +; SI-NEXT: s_and_b32 s6, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s8, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s10, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s12, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s14, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s24, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s25, s16, 16 +; SI-NEXT: s_and_b32 s26, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s27, s5, 16 +; SI-NEXT: s_and_b32 s28, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s29, s4, 16 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s29 +; SI-NEXT: v_mov_b32_e32 v1, s28 +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: v_mov_b32_e32 v3, s26 +; SI-NEXT: v_mov_b32_e32 v4, s25 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v8, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v12, s9 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v14, s7 +; 
SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v4i64_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB65_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_3 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB65_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_4: +; VI-NEXT: s_branch .LBB65_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_3 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: 
s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB65_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: s_branch .LBB65_2 +; +; GFX11-LABEL: bitcast_v4i64_to_v16bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB65_3 +; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB65_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_4: +; GFX11-NEXT: s_branch .LBB65_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to 
<16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_4 -; GCN-NEXT: .LBB33_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB33_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_alignbit_b32 v0, v0, v26, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v24, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; GCN-NEXT: 
v_alignbit_b32 v4, v4, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_2 -; GCN-NEXT: .LBB33_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: 
v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v13, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16bf16_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; 
SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_4 +; SI-NEXT: .LBB66_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB66_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: .LBB66_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 
v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v4i64: ; VI: ; %bb.0: @@ 
-8584,7 +17708,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -8731,7 +17855,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8742,7 +17866,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: s_cbranch_execz .LBB66_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -8866,7 +17990,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v8, s7 -; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: .LBB66_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8878,7 +18002,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9022,7 +18146,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v11 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v14, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v12, v0 -; GFX11-TRUE16-NEXT: .LBB33_2: ; %end +; GFX11-TRUE16-NEXT: .LBB66_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9034,7 +18158,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v7 @@ -9165,7 +18289,7 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v13, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v12, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB33_2: ; %end +; GFX11-FAKE16-NEXT: .LBB66_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9185,194 +18309,847 @@ end: ret <4 x i64> %phi } -define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i64_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v20, v5 -; GCN-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, 
v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: .LBB34_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: .LBB34_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v4i64_scalar: +; SI: 
; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 
+; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB67_2 ; -; VI-LABEL: bitcast_v4i64_to_v32i8: +; VI-LABEL: bitcast_v16bf16_to_v4i64_scalar: ; VI: ; 
%bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB67_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; 
VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: .LBB34_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 -; VI-NEXT: v_addc_u32_e32 v35, vcc, 0, v35, vcc -; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 -; VI-NEXT: v_addc_u32_e32 v33, vcc, 0, v33, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: s_cbranch_execnz .LBB67_4 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: 
v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; 
VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 
+; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_3: +; VI-NEXT: s_branch .LBB67_2 +; VI-NEXT: .LBB67_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: ; 
%bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_4 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: 
v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; 
GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v1 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: 
v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v9, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_3: +; GFX9-NEXT: s_branch .LBB67_2 +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v4i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; 
GFX11-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX11-NEXT: .LBB67_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s8, s7, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7 +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3 +; GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8 +; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: 
v_bfe_u32 v3, v9, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2 +; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4 +; GFX11-NEXT: 
v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1 +; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB67_3: +; GFX11-NEXT: s_branch .LBB67_2 +; GFX11-NEXT: .LBB67_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + +define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v4i64_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; 
SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: .LBB68_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: 
s_cbranch_execz .LBB68_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: .LBB68_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i64_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; 
implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB68_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: .LBB68_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; VI-NEXT: s_cbranch_execz .LBB68_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 +; VI-NEXT: v_addc_u32_e32 v35, vcc, 0, v35, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v33, vcc, 0, v33, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 @@ -9386,7 +19163,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: .LBB34_4: ; %end +; VI-NEXT: .LBB68_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 @@ -9434,7 +19211,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -9460,9 +19237,9 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 
v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB34_2: ; %Flow +; GFX9-NEXT: .LBB68_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_4 +; GFX9-NEXT: s_cbranch_execz .LBB68_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -9496,7 +19273,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: .LBB34_4: ; %end +; GFX9-NEXT: .LBB68_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -9536,7 +19313,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[32:33] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[26:27] @@ -9554,9 +19331,9 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[18:19] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB34_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB68_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9585,7 +19362,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> 
%a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB34_4: ; %end +; GFX11-TRUE16-NEXT: .LBB68_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -9643,7 +19420,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -9669,9 +19446,9 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB34_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB68_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v36, vcc_lo, v36, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9708,7 +19485,7 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB34_4: ; %end +; GFX11-FAKE16-NEXT: .LBB68_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 @@ 
-9736,228 +19513,856 @@ end: ret <32 x i8> %phi } +define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i64_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 +; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s23, 24 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 8 +; SI-NEXT: s_lshr_b32 s8, s21, 24 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 8 +; SI-NEXT: s_lshr_b32 s15, s17, 24 +; SI-NEXT: s_lshr_b32 s24, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: 
v_alignbit_b32 v19, s21, v0, 24 +; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s23, 24 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 8 +; SI-NEXT: s_lshr_b32 s8, s21, 24 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s19, 8 +; SI-NEXT: s_lshr_b32 s15, s17, 24 +; SI-NEXT: s_lshr_b32 s24, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 8 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s25 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_mov_b32_e32 v13, s14 +; SI-NEXT: v_mov_b32_e32 v14, s13 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v21, s12 +; SI-NEXT: v_mov_b32_e32 v22, s10 +; SI-NEXT: v_mov_b32_e32 v23, s8 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v28, s23 +; SI-NEXT: v_mov_b32_e32 v29, s9 +; SI-NEXT: v_mov_b32_e32 v30, s7 +; SI-NEXT: v_mov_b32_e32 v31, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr14 
+; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v4i64_to_v32i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB69_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s24, s23, 8 +; VI-NEXT: s_lshr_b32 s25, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s21, 8 +; VI-NEXT: s_lshr_b32 s40, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 8 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB69_3 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: 
s_lshr_b64 s[4:5], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s24, s23, 8 +; VI-NEXT: s_lshr_b32 s25, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s21, 8 +; VI-NEXT: s_lshr_b32 s40, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 8 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: .LBB69_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s59 +; VI-NEXT: v_mov_b32_e32 v2, s58 +; VI-NEXT: v_mov_b32_e32 v3, s10 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s57 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s47 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s46 +; VI-NEXT: v_mov_b32_e32 v10, s45 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s44 +; VI-NEXT: v_mov_b32_e32 v14, s43 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s41 +; VI-NEXT: v_mov_b32_e32 v18, s40 +; VI-NEXT: v_mov_b32_e32 v19, s6 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v21, s29 +; VI-NEXT: v_mov_b32_e32 v22, s28 +; VI-NEXT: v_mov_b32_e32 v23, s27 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s26 +; VI-NEXT: v_mov_b32_e32 v26, s25 +; VI-NEXT: v_mov_b32_e32 v27, s4 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v29, s24 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: 
v_mov_b32_e32 v31, s14 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB69_2 +; +; GFX9-LABEL: bitcast_v4i64_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s24, s23, 8 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s29, s21, 8 +; GFX9-NEXT: s_lshr_b32 s40, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s44, s19, 8 +; GFX9-NEXT: s_lshr_b32 s45, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s17, 8 +; GFX9-NEXT: s_lshr_b32 s58, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; 
GFX9-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB69_3 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s24, s23, 8 +; GFX9-NEXT: s_lshr_b32 s25, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s29, s21, 8 +; GFX9-NEXT: s_lshr_b32 s40, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s44, s19, 8 +; GFX9-NEXT: s_lshr_b32 s45, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s57, s17, 8 +; GFX9-NEXT: s_lshr_b32 s58, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: .LBB69_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s59 +; GFX9-NEXT: v_mov_b32_e32 v2, s58 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s57 +; GFX9-NEXT: v_mov_b32_e32 v6, s56 +; GFX9-NEXT: v_mov_b32_e32 v7, s47 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s46 +; GFX9-NEXT: v_mov_b32_e32 v10, s45 +; GFX9-NEXT: v_mov_b32_e32 v11, s8 +; GFX9-NEXT: v_mov_b32_e32 v12, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s44 +; GFX9-NEXT: 
v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v15, s42 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s41 +; GFX9-NEXT: v_mov_b32_e32 v18, s40 +; GFX9-NEXT: v_mov_b32_e32 v19, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s21 +; GFX9-NEXT: v_mov_b32_e32 v21, s29 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s25 +; GFX9-NEXT: v_mov_b32_e32 v27, s4 +; GFX9-NEXT: v_mov_b32_e32 v28, s23 +; GFX9-NEXT: v_mov_b32_e32 v29, s24 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB69_2 +; +; GFX11-TRUE16-LABEL: bitcast_v4i64_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 
s12, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s20, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-TRUE16-NEXT: .LBB69_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s15 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB69_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB69_2 +; +; GFX11-FAKE16-LABEL: bitcast_v4i64_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s14, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s17, 24 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: .LBB69_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s45 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s44 :: v_dual_mov_b32 v3, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s29 :: v_dual_mov_b32 v11, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v13, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s27 :: v_dual_mov_b32 v15, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s17 :: v_dual_mov_b32 v21, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s15 :: v_dual_mov_b32 v27, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s19 :: v_dual_mov_b32 v29, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB69_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: s_branch .LBB69_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v4i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, 
v5 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_4 -; GCN-NEXT: .LBB35_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB35_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v0, v0, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 
16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v12, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: 
$vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_2 -; GCN-NEXT: .LBB35_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: 
v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v0, v37, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v39, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v48, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v49, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v21, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v23, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v25, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x300, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 
-; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i8_to_v4i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_4 +; SI-NEXT: .LBB70_2: ; %end +; SI-NEXT: 
s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB70_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: .LBB70_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v15, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v4i64: ; VI: ; %bb.0: @@ -9990,14 +20395,14 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: s_cbranch_execnz .LBB70_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_4 -; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB70_4 +; VI-NEXT: .LBB70_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB35_3: ; %cmp.false +; VI-NEXT: .LBB70_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -10055,8 +20460,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_2 -; VI-NEXT: .LBB35_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB70_2 +; VI-NEXT: .LBB70_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -10148,14 +20553,14 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: s_cbranch_execnz .LBB70_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_4 -; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB70_4 +; GFX9-NEXT: .LBB70_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB35_3: ; %cmp.false +; GFX9-NEXT: .LBB70_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -10213,8 +20618,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_2 -; GFX9-NEXT: .LBB35_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB70_2 +; GFX9-NEXT: .LBB70_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -10314,14 +20719,14 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-TRUE16-NEXT: .LBB35_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-TRUE16-NEXT: .LBB70_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h @@ -10413,8 +20818,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-TRUE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 @@ -10539,14 +20944,14 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, 
vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-FAKE16-NEXT: .LBB35_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-FAKE16-NEXT: .LBB70_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -10636,8 +21041,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-FAKE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-FAKE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -10745,66 +21150,990 @@ end: ret <4 x i64> %phi } +define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v4i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; SI-NEXT: 
v_lshlrev_b32_e32 v23, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v17 +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; 
SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 
s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v21 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, 
vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v32i8_to_v4i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v4 +; VI-NEXT: v_mov_b32_e32 v20, v2 +; VI-NEXT: v_mov_b32_e32 v19, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 
8 +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz 
.LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v19 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v4i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 
0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v19 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; 
GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v4i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: 
s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; 
GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-TRUE16-NEXT: .LBB71_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; 
GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v11, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB71_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB71_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB71_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v4i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v17, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v15, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v14 +; 
GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v20 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB71_3 +; 
GFX11-FAKE16-NEXT: .LBB71_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v17 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v18 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v8 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v14, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v15 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: .LBB71_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB71_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v7 -; GCN-NEXT: v_mov_b32_e32 v16, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v5 -; GCN-NEXT: v_mov_b32_e32 v18, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v22, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; 
implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v13, v17, v16, 16 -; GCN-NEXT: v_alignbit_b32 v9, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v5, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v1, v23, v22, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v23 -; GCN-NEXT: .LBB36_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_alignbit_b32 v13, v17, v16, 16 -; GCN-NEXT: v_alignbit_b32 v9, v19, v18, 16 -; GCN-NEXT: v_alignbit_b32 v5, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v1, v23, v22, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v23 -; GCN-NEXT: .LBB36_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v22 -; GCN-NEXT: v_mov_b32_e32 v2, v23 -; GCN-NEXT: v_mov_b32_e32 v4, v20 -; GCN-NEXT: v_mov_b32_e32 v6, v21 -; GCN-NEXT: v_mov_b32_e32 v8, v18 -; GCN-NEXT: v_mov_b32_e32 v10, v19 -; GCN-NEXT: v_mov_b32_e32 v12, v16 -; GCN-NEXT: v_mov_b32_e32 v14, v17 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f64_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v7 +; SI-NEXT: v_mov_b32_e32 v16, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: 
v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v13, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v5, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_alignbit_b32 v13, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v5, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: .LBB72_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v22 +; SI-NEXT: v_mov_b32_e32 v2, v23 +; SI-NEXT: v_mov_b32_e32 v4, v20 +; SI-NEXT: v_mov_b32_e32 v6, v21 +; SI-NEXT: v_mov_b32_e32 v8, v18 +; SI-NEXT: v_mov_b32_e32 v10, v19 +; SI-NEXT: v_mov_b32_e32 v12, v16 +; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: s_setpc_b64 
s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16i16: ; VI: ; %bb.0: @@ -10813,13 +22142,13 @@ define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: .LBB72_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -10830,13 +22159,13 @@ define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: .LBB72_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10848,13 +22177,13 @@ define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: .LBB72_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: 
s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -10874,104 +22203,268 @@ end: ret <16 x i16> %phi } +define inreg <16 x i16> @bitcast_v4f64_to_v16i16_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB73_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB73_4 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[22:23], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[20:21], 1.0 +; SI-NEXT: v_alignbit_b32 v13, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v19, v18, 16 +; SI-NEXT: v_alignbit_b32 v5, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: s_branch .LBB73_5 +; SI-NEXT: .LBB73_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB73_2 +; SI-NEXT: .LBB73_4: +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; 
SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: .LBB73_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v22 +; SI-NEXT: v_mov_b32_e32 v2, v23 +; SI-NEXT: v_mov_b32_e32 v4, v20 +; SI-NEXT: v_mov_b32_e32 v6, v21 +; SI-NEXT: v_mov_b32_e32 v8, v18 +; SI-NEXT: v_mov_b32_e32 v10, v19 +; SI-NEXT: v_mov_b32_e32 v12, v16 +; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v16i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB73_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB73_4 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB73_3: +; VI-NEXT: s_branch .LBB73_2 +; VI-NEXT: .LBB73_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB73_4 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB73_3: +; GFX9-NEXT: s_branch .LBB73_2 +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v16i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-NEXT: .LBB73_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB73_3: +; GFX11-NEXT: s_branch .LBB73_2 +; GFX11-NEXT: .LBB73_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { -; GCN-LABEL: 
bitcast_v16i16_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v6 -; GCN-NEXT: v_mov_b32_e32 v19, v4 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_4 -; GCN-NEXT: .LBB37_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB37_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v1, v1, v23 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v16 -; GCN-NEXT: v_or_b32_e32 v3, v3, v21 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; 
GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_2 -; GCN-NEXT: .LBB37_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i16_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_4 +; SI-NEXT: .LBB74_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB74_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; 
implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: .LBB74_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v24, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v21, v4 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v6, v11, v6 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i16_to_v4f64: ; VI: ; %bb.0: @@ -10980,7 +22473,7 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB37_2 +; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 3 ; VI-NEXT: v_add_u16_e32 v8, 3, v7 @@ -11007,7 +22500,7 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v8, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v8, v0 -; VI-NEXT: .LBB37_2: ; %end +; VI-NEXT: .LBB74_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11039,7 +22532,7 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB37_2 +; GFX11-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] @@ -11049,7 +22542,7 @@ define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB37_2: ; %end +; GFX11-NEXT: .LBB74_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11069,102 +22562,335 @@ end: ret <4 x double> %phi } +define inreg <4 x double> @bitcast_v16i16_to_v4f64_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_cbranch_scc0 .LBB75_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 
0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v9 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB75_3 +; SI-NEXT: .LBB75_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 
0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB75_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB75_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB75_2 +; +; VI-LABEL: bitcast_v16i16_to_v4f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB75_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB75_3 +; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; 
VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB75_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB75_4: +; VI-NEXT: s_branch .LBB75_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB75_4 +; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB75_3: +; GFX9-NEXT: s_branch .LBB75_2 +; GFX9-NEXT: .LBB75_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 
v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v4f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX11-NEXT: .LBB75_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB75_3: +; GFX11-NEXT: s_branch .LBB75_2 +; GFX11-NEXT: .LBB75_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + define <16 x 
half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: 
$vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: .LBB38_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: .LBB38_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v19 -; GCN-NEXT: v_mov_b32_e32 v1, v23 -; GCN-NEXT: v_mov_b32_e32 v2, v16 -; GCN-NEXT: v_mov_b32_e32 v3, v22 -; GCN-NEXT: v_mov_b32_e32 v4, v17 -; GCN-NEXT: v_mov_b32_e32 v5, v21 -; GCN-NEXT: v_mov_b32_e32 v6, v18 -; GCN-NEXT: v_mov_b32_e32 v7, v20 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f64_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; 
implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; 
SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v22 +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_mov_b32_e32 v2, v21 +; SI-NEXT: v_mov_b32_e32 v3, v20 +; SI-NEXT: v_mov_b32_e32 v4, v19 +; SI-NEXT: v_mov_b32_e32 v5, v17 +; SI-NEXT: v_mov_b32_e32 v6, v18 +; SI-NEXT: v_mov_b32_e32 v7, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16f16: ; VI: ; %bb.0: @@ -11173,13 +22899,13 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: s_cbranch_execz .LBB76_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: .LBB76_2: ; %end ; VI-NEXT: s_or_b64 exec, 
exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11190,13 +22916,13 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: s_cbranch_execz .LBB76_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: .LBB76_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11208,13 +22934,13 @@ define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: s_cbranch_execz .LBB76_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: .LBB76_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11234,139 +22960,317 @@ end: ret <16 x half> %phi } +define inreg <16 x half> @bitcast_v4f64_to_v16f16_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: 
s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; 
SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB77_2 +; +; VI-LABEL: bitcast_v4f64_to_v16f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB77_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB77_4 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_3: +; VI-NEXT: s_branch .LBB77_2 +; VI-NEXT: .LBB77_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB77_4 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_3: +; GFX9-NEXT: s_branch .LBB77_2 +; GFX9-NEXT: 
.LBB77_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v16f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-NEXT: .LBB77_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB77_3: +; GFX11-NEXT: s_branch .LBB77_2 +; GFX11-NEXT: .LBB77_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_4 -; GCN-NEXT: .LBB39_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB39_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: v_or_b32_e32 v0, v25, v0 -; GCN-NEXT: v_or_b32_e32 v1, v23, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v19, v2 -; GCN-NEXT: v_or_b32_e32 v3, v17, v3 -; GCN-NEXT: v_or_b32_e32 v4, v16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; 
implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_2 -; GCN-NEXT: .LBB39_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, 
v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v11, v12 -; GCN-NEXT: v_or_b32_e32 v6, v9, v13 -; GCN-NEXT: v_or_b32_e32 v7, v8, v10 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f16_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_4 +; SI-NEXT: .LBB78_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB78_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_or_b32_e32 v3, v21, v3 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v5, v17, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: .LBB78_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 
0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; 
SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v4f64: ; VI: ; %bb.0: @@ -11375,7 +23279,7 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB39_2 +; VI-NEXT: s_cbranch_execz .LBB78_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 0x200 ; VI-NEXT: v_add_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -11402,7 +23306,7 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v9 ; VI-NEXT: v_or_b32_e32 v0, v0, v8 -; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: .LBB78_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11435,7 +23339,7 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB39_2 +; GFX11-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] @@ -11445,7 +23349,7 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB39_2: ; %end +; 
GFX11-NEXT: .LBB78_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11465,86 +23369,340 @@ end: ret <4 x double> %phi } +define inreg <4 x double> @bitcast_v16f16_to_v4f64_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_or_b32_e32 v4, v14, v4 +; SI-NEXT: v_or_b32_e32 v5, v12, v5 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB79_2 +; +; VI-LABEL: bitcast_v16f16_to_v4f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB79_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB79_4 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 
+; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_add_f16_sdwa v8, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_3: +; VI-NEXT: s_branch .LBB79_2 +; VI-NEXT: .LBB79_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB79_4 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_3: +; GFX9-NEXT: s_branch .LBB79_2 +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v4f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB79_4 +; GFX11-NEXT: .LBB79_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB79_3: +; GFX11-NEXT: s_branch .LBB79_2 +; GFX11-NEXT: .LBB79_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label 
%cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v2 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 -; GCN-NEXT: 
v_lshlrev_b32_e32 v23, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: .LBB40_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v2 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v0 -; GCN-NEXT: .LBB40_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v23 -; GCN-NEXT: v_mov_b32_e32 v1, v22 -; GCN-NEXT: v_mov_b32_e32 v2, v21 -; GCN-NEXT: v_mov_b32_e32 v3, v20 -; GCN-NEXT: v_mov_b32_e32 v4, v19 -; GCN-NEXT: v_mov_b32_e32 v5, v18 -; GCN-NEXT: v_mov_b32_e32 v6, v17 -; GCN-NEXT: v_mov_b32_e32 v7, v16 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f64_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 
+; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: .LBB80_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: 
v_and_b32_e32 v16, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; SI-NEXT: .LBB80_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v23 +; SI-NEXT: v_mov_b32_e32 v1, v22 +; SI-NEXT: v_mov_b32_e32 v2, v21 +; SI-NEXT: v_mov_b32_e32 v3, v20 +; SI-NEXT: v_mov_b32_e32 v4, v19 +; SI-NEXT: v_mov_b32_e32 v5, v18 +; SI-NEXT: v_mov_b32_e32 v6, v17 +; SI-NEXT: v_mov_b32_e32 v7, v16 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v16bf16: ; VI: ; %bb.0: @@ -11553,13 +23711,13 @@ define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: s_cbranch_execz .LBB80_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: .LBB80_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11570,13 +23728,13 @@ define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: s_cbranch_execz .LBB80_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: .LBB80_2: ; %end ; GFX9-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11588,13 +23746,13 @@ define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: s_cbranch_execz .LBB80_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: .LBB80_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11614,123 +23772,302 @@ end: ret <16 x bfloat> %phi } +define inreg <16 x bfloat> @bitcast_v4f64_to_v16bf16_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB81_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s29, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s23, 16 +; SI-NEXT: s_and_b32 s27, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s22, 16 +; SI-NEXT: s_and_b32 s25, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s21, 16 +; SI-NEXT: s_and_b32 s15, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s20, 16 +; SI-NEXT: s_and_b32 s13, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s19, 16 +; SI-NEXT: s_and_b32 s11, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s18, 16 +; SI-NEXT: s_and_b32 s9, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s17, 16 +; SI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB81_4 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[6:7], 
s[20:21], 1.0 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: s_branch .LBB81_2 +; SI-NEXT: .LBB81_4: +; SI-NEXT: v_mov_b32_e32 v15, s29 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_mov_b32_e32 v12, s26 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
bitcast_v4f64_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB81_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB81_4 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_3: +; VI-NEXT: s_branch .LBB81_2 +; VI-NEXT: .LBB81_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB81_4 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_3: +; GFX9-NEXT: s_branch .LBB81_2 +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v16bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: 
s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB81_4 +; GFX11-NEXT: .LBB81_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB81_3: +; GFX11-NEXT: s_branch .LBB81_2 +; GFX11-NEXT: .LBB81_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, 
v10 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_4 -; GCN-NEXT: .LBB41_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB41_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_alignbit_b32 v0, v0, v26, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v24, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz 
.LBB41_2 -; GCN-NEXT: .LBB41_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v2, 
v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v6, v13, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16bf16_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_4 +; SI-NEXT: .LBB82_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB82_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 
+; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: .LBB82_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 
v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v4f64: ; VI: ; %bb.0: @@ -11739,7 +24076,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -11886,7 +24223,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -11897,7 +24234,7 @@ define <4 x double> 
@bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: s_cbranch_execz .LBB82_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -12021,7 +24358,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v8, s7 -; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -12033,7 +24370,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -12177,7 +24514,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v11 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v14, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v12, v0 -; GFX11-TRUE16-NEXT: .LBB41_2: ; %end +; GFX11-TRUE16-NEXT: .LBB82_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12189,7 +24526,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: 
s_cbranch_execz .LBB41_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v7 @@ -12320,7 +24657,7 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v13, v18, vcc_lo ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v12, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB41_2: ; %end +; GFX11-FAKE16-NEXT: .LBB82_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -12340,133 +24677,786 @@ end: ret <4 x double> %phi } -define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f64_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v35, v5 -; GCN-NEXT: v_mov_b32_e32 v34, v4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; 
GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v7, v6, 24 -; GCN-NEXT: v_alignbit_b32 v26, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v25, v7, v6, 8 -; GCN-NEXT: v_alignbit_b32 v19, v35, v34, 24 -; GCN-NEXT: v_alignbit_b32 v18, v35, v34, 16 -; GCN-NEXT: v_alignbit_b32 v17, v35, v34, 8 -; GCN-NEXT: v_alignbit_b32 v11, v3, v2, 24 -; GCN-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v9, v3, v2, 8 -; GCN-NEXT: v_alignbit_b32 v38, v1, v0, 24 -; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v33, v1, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GCN-NEXT: .LBB42_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_alignbit_b32 v27, v7, v6, 24 -; GCN-NEXT: v_alignbit_b32 v26, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v25, v7, v6, 8 -; GCN-NEXT: v_alignbit_b32 v19, v35, v34, 24 -; GCN-NEXT: v_alignbit_b32 v18, v35, v34, 16 -; GCN-NEXT: v_alignbit_b32 v17, v35, v34, 8 -; GCN-NEXT: v_alignbit_b32 v11, v3, v2, 24 -; GCN-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v9, v3, v2, 8 -; GCN-NEXT: v_alignbit_b32 v38, v1, v0, 24 -; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 
v33, v1, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GCN-NEXT: .LBB42_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NEXT: v_mov_b32_e32 v16, v34 -; GCN-NEXT: v_mov_b32_e32 v20, v35 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v1, v33 -; GCN-NEXT: v_mov_b32_e32 v2, v32 -; GCN-NEXT: v_mov_b32_e32 v3, v38 -; GCN-NEXT: v_mov_b32_e32 v6, v37 -; GCN-NEXT: v_mov_b32_e32 v7, v36 -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: 
s_cbranch_scc0 .LBB83_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v5, 
0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB83_2 ; -; VI-LABEL: bitcast_v4f64_to_v32i8: +; VI-LABEL: bitcast_v16bf16_to_v4f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB83_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: 
s_cbranch_execnz .LBB83_4 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: 
v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; 
VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v2 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; VI-NEXT: v_add_f32_e32 v8, s4, v0 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: 
v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_3: +; VI-NEXT: s_branch .LBB83_2 +; VI-NEXT: .LBB83_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB83_4 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: 
s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v2 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 
0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v1 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v9, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v9, s4, v0 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v9 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v10, v10, v0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x7fff, v10 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_and_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_3: +; GFX9-NEXT: s_branch .LBB83_2 +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v4f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB83_4 +; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s8, s7, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-NEXT: s_and_b32 s8, s6, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_and_b32 s7, s5, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-NEXT: s_and_b32 s5, s4, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; 
GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v8, 0x7fff, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v8, v4 :: v_dual_add_nc_u32 v4, v9, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v11, v6 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_and_b32 s4, s3, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v9, v10 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s3 +; 
GFX11-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v6, v8 +; GFX11-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v10 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v8 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v3, v9, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v4, 0xffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s2 +; GFX11-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v9 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; 
GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_bfe_u32 v11, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_bfe_u32 v16, v14, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v9, v10 :: v_dual_add_nc_u32 v9, v11, v4 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s2 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s1 +; GFX11-NEXT: v_bfe_u32 v11, v13, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v11, v11, v13 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v15, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, 
v15 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v15, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v17, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, v10, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshl_or_b32 v2, v8, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v12, 16, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB83_3: +; GFX11-NEXT: s_branch .LBB83_2 +; GFX11-NEXT: .LBB83_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], 
[ %a3, %cmp.false ] + ret <4 x double> %phi +} + +define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v4f64_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v5 +; SI-NEXT: v_mov_b32_e32 v34, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v7, v6, 24 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v7, v6, 8 +; SI-NEXT: v_alignbit_b32 v19, v35, v34, 24 +; SI-NEXT: v_alignbit_b32 v18, v35, v34, 16 +; SI-NEXT: v_alignbit_b32 v17, v35, v34, 8 +; SI-NEXT: v_alignbit_b32 v11, v3, v2, 24 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 8 +; SI-NEXT: v_alignbit_b32 v38, v1, v0, 24 +; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v33, v1, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; 
SI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; SI-NEXT: .LBB84_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 +; SI-NEXT: v_alignbit_b32 v27, v7, v6, 24 +; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v25, v7, v6, 8 +; SI-NEXT: v_alignbit_b32 v19, v35, v34, 24 +; SI-NEXT: v_alignbit_b32 v18, v35, v34, 16 +; SI-NEXT: v_alignbit_b32 v17, v35, v34, 8 +; SI-NEXT: v_alignbit_b32 v11, v3, v2, 24 +; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 8 +; SI-NEXT: v_alignbit_b32 v38, v1, v0, 24 +; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v33, v1, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; SI-NEXT: .LBB84_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: v_mov_b32_e32 v20, v35 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: 
v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v2, v32 +; SI-NEXT: v_mov_b32_e32 v3, v38 +; SI-NEXT: v_mov_b32_e32 v6, v37 +; SI-NEXT: v_mov_b32_e32 v7, v36 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr23 @@ -12481,7 +25471,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: s_cbranch_execz .LBB84_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -12507,9 +25497,9 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: .LBB42_2: ; %Flow +; VI-NEXT: .LBB84_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_4 +; VI-NEXT: s_cbranch_execz .LBB84_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 @@ -12539,7 +25529,7 @@ 
define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: .LBB42_4: ; %end +; VI-NEXT: .LBB84_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 @@ -12587,7 +25577,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: s_cbranch_execz .LBB84_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -12613,9 +25603,9 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: .LBB84_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: s_cbranch_execz .LBB84_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 @@ -12645,7 +25635,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: .LBB42_4: ; %end +; GFX9-NEXT: .LBB84_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -12685,7 +25675,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 +; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[32:33] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[26:27] @@ -12703,9 +25693,9 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[18:19] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB42_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB84_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 @@ -12729,7 +25719,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB42_4: ; %end +; GFX11-TRUE16-NEXT: .LBB84_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -12787,7 +25777,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -12813,9 +25803,9 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB42_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB84_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 @@ -12847,7 +25837,7 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB42_4: ; %end +; GFX11-FAKE16-NEXT: .LBB84_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 @@ -12875,228 +25865,889 @@ end: ret <32 x i8> %phi } +define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f64_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB85_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_alignbit_b32 v32, s23, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 +; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 +; SI-NEXT: v_alignbit_b32 v33, s21, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 +; SI-NEXT: v_alignbit_b32 v34, s19, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v35, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s25, s23, 24 +; SI-NEXT: s_lshr_b32 s24, s23, 16 +; SI-NEXT: s_lshr_b32 
s15, s23, 8 +; SI-NEXT: s_lshr_b32 s14, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 8 +; SI-NEXT: s_lshr_b32 s11, s19, 24 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 8 +; SI-NEXT: s_lshr_b32 s8, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB85_4 +; SI-NEXT: .LBB85_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[24:25], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[20:21], 1.0 +; SI-NEXT: v_alignbit_b32 v27, v25, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v32, v25, v24, 8 +; SI-NEXT: v_alignbit_b32 v19, v17, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v33, v17, v16, 8 +; SI-NEXT: v_alignbit_b32 v11, v9, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v34, v9, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v1, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v35, v1, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; SI-NEXT: s_branch .LBB85_5 +; SI-NEXT: .LBB85_3: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; 
implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: s_branch .LBB85_2 +; SI-NEXT: .LBB85_4: +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v25, s23 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v23, s14 +; SI-NEXT: v_mov_b32_e32 v22, s13 +; SI-NEXT: v_mov_b32_e32 v21, s12 +; SI-NEXT: v_mov_b32_e32 v31, s25 +; SI-NEXT: v_mov_b32_e32 v30, s24 +; SI-NEXT: v_mov_b32_e32 v29, s15 +; SI-NEXT: .LBB85_5: ; %end +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v12, v9 +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v1, v35 +; SI-NEXT: v_mov_b32_e32 v9, v34 +; SI-NEXT: v_mov_b32_e32 v17, v33 +; SI-NEXT: v_mov_b32_e32 v25, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v32i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB85_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s43, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s41, s23, 8 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s44, s22, 8 
+; VI-NEXT: s_lshr_b32 s40, s21, 24 +; VI-NEXT: s_lshr_b32 s29, s21, 16 +; VI-NEXT: s_lshr_b32 s28, s21, 8 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s46, s20, 8 +; VI-NEXT: s_lshr_b32 s27, s19, 24 +; VI-NEXT: s_lshr_b32 s26, s19, 16 +; VI-NEXT: s_lshr_b32 s25, s19, 8 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s56, s18, 8 +; VI-NEXT: s_lshr_b32 s24, s17, 24 +; VI-NEXT: s_lshr_b32 s15, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_lshr_b32 s58, s16, 8 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB85_4 +; VI-NEXT: .LBB85_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[24:25], s[22:23], 1.0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; 
VI-NEXT: s_branch .LBB85_5 +; VI-NEXT: .LBB85_3: +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: s_branch .LBB85_2 +; VI-NEXT: .LBB85_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v9, s19 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s59 +; VI-NEXT: v_mov_b32_e32 v35, s58 +; VI-NEXT: v_mov_b32_e32 v10, s57 +; VI-NEXT: v_mov_b32_e32 v34, s56 +; VI-NEXT: v_mov_b32_e32 v18, s47 +; VI-NEXT: v_mov_b32_e32 v33, s46 +; VI-NEXT: v_mov_b32_e32 v26, s45 +; VI-NEXT: v_mov_b32_e32 v32, s44 +; VI-NEXT: v_mov_b32_e32 v31, s43 +; VI-NEXT: v_mov_b32_e32 v30, s42 +; VI-NEXT: v_mov_b32_e32 v29, s41 +; VI-NEXT: v_mov_b32_e32 v23, s40 +; VI-NEXT: v_mov_b32_e32 v22, s29 +; VI-NEXT: v_mov_b32_e32 v21, s28 +; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: v_mov_b32_e32 v14, s26 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v11, 
s6 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v27, s10 +; VI-NEXT: .LBB85_5: ; %end +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v12, v9 +; VI-NEXT: v_mov_b32_e32 v20, v17 +; VI-NEXT: v_mov_b32_e32 v28, v25 +; VI-NEXT: v_mov_b32_e32 v1, v35 +; VI-NEXT: v_mov_b32_e32 v9, v34 +; VI-NEXT: v_mov_b32_e32 v17, v33 +; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s43, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s41, s23, 8 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s44, s22, 8 +; GFX9-NEXT: s_lshr_b32 s40, s21, 24 +; GFX9-NEXT: s_lshr_b32 s29, s21, 16 +; GFX9-NEXT: s_lshr_b32 s28, s21, 8 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s46, s20, 8 +; GFX9-NEXT: s_lshr_b32 s27, s19, 24 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: s_lshr_b32 s25, s19, 8 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s56, s18, 8 +; GFX9-NEXT: s_lshr_b32 s24, s17, 24 +; GFX9-NEXT: s_lshr_b32 s15, s17, 16 +; GFX9-NEXT: s_lshr_b32 s14, s17, 8 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_lshr_b32 s58, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB85_4 +; GFX9-NEXT: .LBB85_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], s[22:23], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; 
GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; GFX9-NEXT: s_branch .LBB85_5 +; GFX9-NEXT: .LBB85_3: +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: s_branch .LBB85_2 +; GFX9-NEXT: .LBB85_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: 
v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s59 +; GFX9-NEXT: v_mov_b32_e32 v35, s58 +; GFX9-NEXT: v_mov_b32_e32 v10, s57 +; GFX9-NEXT: v_mov_b32_e32 v34, s56 +; GFX9-NEXT: v_mov_b32_e32 v18, s47 +; GFX9-NEXT: v_mov_b32_e32 v33, s46 +; GFX9-NEXT: v_mov_b32_e32 v26, s45 +; GFX9-NEXT: v_mov_b32_e32 v32, s44 +; GFX9-NEXT: v_mov_b32_e32 v31, s43 +; GFX9-NEXT: v_mov_b32_e32 v30, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s41 +; GFX9-NEXT: v_mov_b32_e32 v23, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s28 +; GFX9-NEXT: v_mov_b32_e32 v15, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v7, s24 +; GFX9-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v27, s10 +; GFX9-NEXT: .LBB85_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-NEXT: v_mov_b32_e32 v28, v25 +; GFX9-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-NEXT: v_mov_b32_e32 v17, v33 +; GFX9-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4f64_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s40, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-TRUE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[27:28], s[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], s[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[19:20] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v23, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB85_5 +; GFX11-TRUE16-NEXT: .LBB85_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB85_2 +; GFX11-TRUE16-NEXT: .LBB85_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s28 +; GFX11-TRUE16-NEXT: .LBB85_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v27.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4f64_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-FAKE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[32:33], s[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[34:35], s[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[36:37], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[38:39], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; 
GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-FAKE16-NEXT: s_branch .LBB85_5 +; GFX11-FAKE16-NEXT: .LBB85_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: s_branch .LBB85_2 +; GFX11-FAKE16-NEXT: .LBB85_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s0 :: v_dual_mov_b32 v33, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s2 :: v_dual_mov_b32 v35, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s16 :: v_dual_mov_b32 v37, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v39, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s46 :: v_dual_mov_b32 v1, s45 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s44 :: v_dual_mov_b32 v9, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s42 :: v_dual_mov_b32 v17, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s40 :: v_dual_mov_b32 v25, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v30, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s6 :: v_dual_mov_b32 v22, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s8 :: v_dual_mov_b32 v14, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s10 :: v_dual_mov_b32 v6, s14 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, s28 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v29, s26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v23, s25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v21, s23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, s15 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s13 +; GFX11-FAKE16-NEXT: .LBB85_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v8, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v4f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB43_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB43_4 -; GCN-NEXT: .LBB43_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB43_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v0, v0, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v12, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_and_b32_e32 v8, 
0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_2 -; GCN-NEXT: .LBB43_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: 
v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v0, v37, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v39, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v48, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v49, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v21, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v16, v23, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v25, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; GCN-NEXT: 
v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v13, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v13, v17, v18 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x300, v20 -; GCN-NEXT: v_or_b32_e32 v15, v19, v21 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i8_to_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB86_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB86_4 +; SI-NEXT: .LBB86_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB86_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: 
$vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_2 +; SI-NEXT: .LBB86_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; 
SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v19, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v17, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v15, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: 
v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v4f64: ; VI: ; %bb.0: @@ -13129,14 +26780,14 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: s_cbranch_execnz .LBB86_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_4 -; VI-NEXT: .LBB43_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB86_4 +; VI-NEXT: .LBB86_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB43_3: ; %cmp.false +; VI-NEXT: .LBB86_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13194,8 +26845,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB43_2 -; VI-NEXT: .LBB43_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB86_2 +; VI-NEXT: .LBB86_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 @@ -13287,14 +26938,14 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: s_cbranch_execnz .LBB86_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_4 -; GFX9-NEXT: .LBB43_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB86_4 +; GFX9-NEXT: .LBB86_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB43_3: ; %cmp.false +; GFX9-NEXT: .LBB86_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13352,8 +27003,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB43_2 -; GFX9-NEXT: .LBB43_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB86_2 +; GFX9-NEXT: .LBB86_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -13453,14 +27104,14 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-TRUE16-NEXT: .LBB43_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-TRUE16-NEXT: .LBB86_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h @@ -13552,8 +27203,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-TRUE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 @@ -13678,14 +27329,14 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-FAKE16-NEXT: .LBB43_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-FAKE16-NEXT: .LBB86_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -13775,8 +27426,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-FAKE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-FAKE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -13884,139 +27535,1064 @@ end: ret <4 x double> %phi } -define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v31, v15 -; GCN-NEXT: v_mov_b32_e32 v30, v14 -; GCN-NEXT: v_mov_b32_e32 v29, v13 -; GCN-NEXT: v_mov_b32_e32 v28, v12 -; GCN-NEXT: v_mov_b32_e32 v27, v11 -; GCN-NEXT: v_mov_b32_e32 v26, v10 -; GCN-NEXT: v_mov_b32_e32 v25, v9 -; GCN-NEXT: v_mov_b32_e32 v24, v8 -; GCN-NEXT: v_mov_b32_e32 v23, v7 -; GCN-NEXT: v_mov_b32_e32 v22, v6 -; GCN-NEXT: v_mov_b32_e32 v21, v5 -; GCN-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NEXT: v_mov_b32_e32 v19, v3 -; GCN-NEXT: v_mov_b32_e32 v18, v2 -; GCN-NEXT: v_mov_b32_e32 v17, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; 
GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB44_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB44_4 -; GCN-NEXT: .LBB44_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB44_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_2 -; GCN-NEXT: .LBB44_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v27 -; GCN-NEXT: 
v_add_i32_e32 v10, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v4f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v17 +; SI-NEXT: s_cbranch_scc0 .LBB87_4 +; 
SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v23, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; 
SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v21 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, 
s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB87_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; SI-NEXT: s_branch .LBB87_2 ; -; VI-LABEL: bitcast_v16i16_to_v16f16: +; VI-LABEL: bitcast_v32i8_to_v4f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB44_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v8, 3 -; VI-NEXT: v_add_u16_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v10, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v11, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v12, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v13, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v14, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v22, v6 +; VI-NEXT: v_mov_b32_e32 v21, v4 +; VI-NEXT: v_mov_b32_e32 v20, v2 +; VI-NEXT: v_mov_b32_e32 v19, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v11, 
8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; VI-NEXT: s_cbranch_scc0 .LBB87_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 
s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB87_3 +; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: 
s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v19 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB87_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB87_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB87_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v4f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-NEXT: v_mov_b32_e32 v21, v4 +; GFX9-NEXT: v_mov_b32_e32 v20, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 
0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB87_3 +; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v19 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: 
s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; 
GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB87_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB87_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB87_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v4f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: 
s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v25, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-TRUE16-NEXT: .LBB87_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; 
GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v21 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB87_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB87_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB87_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v4f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v17, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v15, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; 
GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v20 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 
s11, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-FAKE16-NEXT: .LBB87_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 
+; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v17 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v18 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v14, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v15 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 
v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v20, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: .LBB87_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB87_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB87_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ 
%a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + +define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v16i16_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v31, v15 +; SI-NEXT: v_mov_b32_e32 v30, v14 +; SI-NEXT: v_mov_b32_e32 v29, v13 +; SI-NEXT: v_mov_b32_e32 v28, v12 +; SI-NEXT: v_mov_b32_e32 v27, v11 +; SI-NEXT: v_mov_b32_e32 v26, v10 +; SI-NEXT: v_mov_b32_e32 v25, v9 +; SI-NEXT: v_mov_b32_e32 v24, v8 +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v5 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v18, v2 +; SI-NEXT: v_mov_b32_e32 v17, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB88_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB88_4 +; SI-NEXT: .LBB88_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB88_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 
v5, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: .LBB88_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; 
SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v16f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB88_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v8, 3 +; VI-NEXT: v_add_u16_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v10, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v11, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v12, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v13, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v14, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v15, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v8, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v7, 3, v7 @@ -14035,7 +28611,7 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v11 ; VI-NEXT: v_or_b32_e32 v1, v1, v10 ; VI-NEXT: v_or_b32_e32 v0, v0, v9 -; VI-NEXT: .LBB44_2: ; %end +; VI-NEXT: .LBB88_2: ; %end ; VI-NEXT: 
s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14067,7 +28643,7 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-NEXT: s_cbranch_execz .LBB88_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] @@ -14077,7 +28653,7 @@ define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB44_2: ; %end +; GFX11-NEXT: .LBB88_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -14097,103 +28673,323 @@ end: ret <16 x half> %phi } +define inreg <16 x half> @bitcast_v16i16_to_v16f16_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v16, v1 +; SI-NEXT: v_mov_b32_e32 v17, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB89_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; 
SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: s_cbranch_execnz .LBB89_3 +; SI-NEXT: .LBB89_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v17 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: .LBB89_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB89_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB89_2 +; +; VI-LABEL: 
bitcast_v16i16_to_v16f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB89_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB89_3 +; VI-NEXT: .LBB89_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB89_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: 
v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB89_4: +; VI-NEXT: s_branch .LBB89_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB89_4 +; GFX9-NEXT: .LBB89_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB89_3: +; GFX9-NEXT: s_branch .LBB89_2 +; GFX9-NEXT: .LBB89_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v16f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB89_4 +; GFX11-NEXT: .LBB89_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: 
v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB89_3: +; GFX11-NEXT: s_branch .LBB89_2 +; GFX11-NEXT: .LBB89_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB45_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_or_b32_e32 v14, v14, v16 -; GCN-NEXT: v_or_b32_e32 v10, v10, v17 -; GCN-NEXT: v_or_b32_e32 v6, v6, v18 -; GCN-NEXT: v_or_b32_e32 v2, v2, v19 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: .LBB45_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f16_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB90_2 +; 
SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: 
v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: .LBB90_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v16i16: ; VI: ; %bb.0: @@ -14202,7 +28998,7 @@ define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB45_2 +; VI-NEXT: s_cbranch_execz .LBB90_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 0x200 ; VI-NEXT: v_add_f16_e32 v8, 0x200, v0 @@ -14229,7 +29025,7 @@ define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v11, v2 ; VI-NEXT: v_or_b32_e32 v1, v10, v1 ; VI-NEXT: v_or_b32_e32 v0, v8, v0 -; VI-NEXT: .LBB45_2: ; %end +; VI-NEXT: .LBB90_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14262,7 +29058,7 @@ define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB45_2 +; GFX11-NEXT: s_cbranch_execz .LBB90_2 ; 
GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] @@ -14272,7 +29068,7 @@ define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB45_2: ; %end +; GFX11-NEXT: .LBB90_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -14292,115 +29088,359 @@ end: ret <16 x i16> %phi } +define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: .LBB91_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: 
v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v16 +; SI-NEXT: 
v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB91_4: +; SI-NEXT: s_branch .LBB91_2 +; +; VI-LABEL: bitcast_v16f16_to_v16i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB91_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB91_4 +; VI-NEXT: .LBB91_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_add_f16_e32 v5, s22, v0 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v13, s5 +; VI-NEXT: v_add_f16_e32 v7, s23, v0 +; VI-NEXT: v_add_f16_sdwa v13, v13, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v5, v6 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v7, v7, v13 +; VI-NEXT: v_add_f16_sdwa v13, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 
v5, s4 +; VI-NEXT: v_add_f16_e32 v8, s16, v0 +; VI-NEXT: v_add_f16_sdwa v9, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v10, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v11, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s21, v0 +; VI-NEXT: v_or_b32_e32 v5, v0, v5 +; VI-NEXT: v_or_b32_e32 v4, v4, v13 +; VI-NEXT: v_or_b32_e32 v3, v3, v12 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v0, v8, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB91_3: +; VI-NEXT: s_branch .LBB91_2 +; VI-NEXT: .LBB91_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB91_4 +; GFX9-NEXT: .LBB91_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB91_3: +; GFX9-NEXT: s_branch .LBB91_2 +; GFX9-NEXT: .LBB91_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v16i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX11-NEXT: .LBB91_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB91_3: +; GFX11-NEXT: s_branch .LBB91_2 +; GFX11-NEXT: .LBB91_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 
s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v14 -; GCN-NEXT: v_mov_b32_e32 v22, v12 -; GCN-NEXT: v_mov_b32_e32 v21, v10 -; GCN-NEXT: v_mov_b32_e32 v20, v8 -; GCN-NEXT: v_mov_b32_e32 v19, v6 -; GCN-NEXT: v_mov_b32_e32 v18, v4 -; GCN-NEXT: v_mov_b32_e32 v17, v2 -; GCN-NEXT: v_mov_b32_e32 v24, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_4 -; GCN-NEXT: .LBB46_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB46_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; 
GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB46_2 -; GCN-NEXT: .LBB46_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v0, v15, v0 -; GCN-NEXT: v_or_b32_e32 v2, v13, v2 -; GCN-NEXT: v_or_b32_e32 v4, v11, v4 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_or_b32_e32 v5, v5, v10 -; GCN-NEXT: v_or_b32_e32 v3, v3, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v2, 
vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i16_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v14 +; SI-NEXT: v_mov_b32_e32 v22, v12 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: 
s_cbranch_execnz .LBB92_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_4 +; SI-NEXT: .LBB92_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB92_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_2 +; SI-NEXT: .LBB92_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; 
SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i16_to_v16bf16: ; VI: ; %bb.0: @@ -14409,7 +29449,7 @@ define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: s_cbranch_execz .LBB92_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v8, 3 ; VI-NEXT: v_add_u16_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -14436,7 +29476,7 @@ define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v11 ; VI-NEXT: v_or_b32_e32 v1, v1, v10 ; VI-NEXT: v_or_b32_e32 v0, v0, v9 -; VI-NEXT: .LBB46_2: ; %end +; VI-NEXT: .LBB92_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ 
-14468,7 +29508,7 @@ define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB46_2 +; GFX11-NEXT: s_cbranch_execz .LBB92_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] @@ -14478,7 +29518,7 @@ define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB46_2: ; %end +; GFX11-NEXT: .LBB92_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -14498,146 +29538,400 @@ end: ret <16 x bfloat> %phi } +define inreg <16 x bfloat> @bitcast_v16i16_to_v16bf16_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: s_cbranch_scc0 .LBB93_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_lshl_b32 s9, s19, 16 +; SI-NEXT: s_lshl_b32 s10, s20, 16 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_lshl_b32 s12, s22, 16 +; SI-NEXT: s_lshl_b32 s13, s23, 16 +; SI-NEXT: s_lshl_b32 s14, s24, 16 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_lshl_b32 s40, s26, 16 +; SI-NEXT: s_lshl_b32 s41, s27, 16 +; SI-NEXT: s_lshl_b32 s42, s28, 16 +; SI-NEXT: s_lshl_b32 s43, s29, 16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: s_cbranch_execnz .LBB93_3 +; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: 
s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s26, 0xffff +; SI-NEXT: s_lshl_b32 s6, s27, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s14, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s12, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s10, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s19, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s8, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s11, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s15, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s41, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s5, 16 +; SI-NEXT: s_and_b32 s43, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s4, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; 
SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s40 +; SI-NEXT: v_mov_b32_e32 v11, s41 +; SI-NEXT: v_mov_b32_e32 v12, s42 +; SI-NEXT: v_mov_b32_e32 v13, s43 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_branch .LBB93_2 +; +; VI-LABEL: bitcast_v16i16_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB93_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB93_3 +; VI-NEXT: .LBB93_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: 
s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB93_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB93_4: +; VI-NEXT: s_branch .LBB93_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB93_4 +; GFX9-NEXT: .LBB93_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB93_3: +; GFX9-NEXT: s_branch .LBB93_2 +; GFX9-NEXT: .LBB93_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v16bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB93_4 +; GFX11-NEXT: .LBB93_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB93_3: +; GFX11-NEXT: s_branch .LBB93_2 +; GFX11-NEXT: .LBB93_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x 
i16> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB47_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB47_4 -; GCN-NEXT: .LBB47_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; 
GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB47_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v22 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_2 -; GCN-NEXT: .LBB47_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v21 -; 
GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_alignbit_b32 v0, v14, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v19, v2, 16 -; GCN-NEXT: v_alignbit_b32 v8, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v12, v21, v5, 16 -; GCN-NEXT: v_alignbit_b32 v14, v15, v17, 16 -; GCN-NEXT: v_alignbit_b32 v10, v11, v9, 16 -; GCN-NEXT: v_alignbit_b32 v6, v7, v18, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v13, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, 
v23, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v22, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16bf16_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB94_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB94_4 +; SI-NEXT: .LBB94_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB94_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; SI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_2 +; SI-NEXT: .LBB94_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: 
v_alignbit_b32 v8, v6, v2, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v16i16: ; VI: ; %bb.0: @@ -14646,7 +29940,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -14793,7 +30087,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v10, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v9, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; VI-NEXT: .LBB47_2: ; %end +; VI-NEXT: .LBB94_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14804,7 +30098,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -14928,7 +30222,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v2, v10, s6 ; GFX9-NEXT: v_perm_b32 v1, v1, v9, s6 ; GFX9-NEXT: v_perm_b32 v0, v0, v8, s6 -; GFX9-NEXT: .LBB47_2: ; %end +; GFX9-NEXT: .LBB94_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -14940,7 +30234,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0 @@ -15093,7 +30387,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v10 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v11, 16, v9 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v12, 16, v8 -; GFX11-TRUE16-NEXT: .LBB47_2: ; %end +; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15105,7 +30399,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -15240,7 +30534,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v11, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB47_2: ; %end +; GFX11-FAKE16-NEXT: .LBB94_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -15260,277 +30554,915 @@ end: ret <16 x i16> %phi } -define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i16_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v48, v15 -; GCN-NEXT: v_mov_b32_e32 v32, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: 
v_mov_b32_e32 v49, v11 -; GCN-NEXT: v_mov_b32_e32 v33, v10 -; GCN-NEXT: v_mov_b32_e32 v36, v8 -; GCN-NEXT: v_mov_b32_e32 v50, v7 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v38, v4 -; GCN-NEXT: v_mov_b32_e32 v51, v3 -; GCN-NEXT: v_mov_b32_e32 v35, v2 -; GCN-NEXT: v_mov_b32_e32 v39, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v48 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: 
$vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v51 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v48 -; GCN-NEXT: v_bfe_u32 v7, v51, 8, 8 -; GCN-NEXT: v_bfe_u32 v15, v50, 8, 8 -; GCN-NEXT: v_bfe_u32 v23, v49, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v52 -; GCN-NEXT: v_or_b32_e32 v4, v1, v53 -; GCN-NEXT: v_or_b32_e32 v8, v2, v54 -; GCN-NEXT: v_or_b32_e32 v12, v3, v55 -; GCN-NEXT: v_or_b32_e32 v16, v5, v40 -; GCN-NEXT: v_or_b32_e32 v20, v9, v41 -; GCN-NEXT: v_or_b32_e32 v24, v10, v42 -; GCN-NEXT: v_or_b32_e32 v28, v11, v43 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_bfe_u32 v31, v48, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: 
$vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: .LBB48_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v37 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v35 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_or_b32_e32 v1, v43, v1 -; GCN-NEXT: v_or_b32_e32 v2, v40, v2 -; GCN-NEXT: v_or_b32_e32 v3, v41, v3 -; GCN-NEXT: v_or_b32_e32 v4, v54, v4 -; GCN-NEXT: v_or_b32_e32 v5, v55, v5 -; GCN-NEXT: v_or_b32_e32 v6, v52, v6 -; GCN-NEXT: v_or_b32_e32 v7, v53, v7 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v7 -; GCN-NEXT: 
v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: .LBB48_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, 
s20 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v1 +; SI-NEXT: s_cbranch_scc0 .LBB95_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; SI-NEXT: s_cbranch_execnz .LBB95_3 +; SI-NEXT: .LBB95_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB95_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; 
implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB95_2 ; -; VI-LABEL: bitcast_v16i16_to_v32i8: +; VI-LABEL: bitcast_v16bf16_to_v16i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: 
s_cbranch_scc0 .LBB95_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v5 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_mov_b32_e32 v48, v1 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v16, v4 -; VI-NEXT: v_mov_b32_e32 v49, v5 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: ; implicit-def: $vgpr1 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr7 -; VI-NEXT: .LBB48_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v9, 3 -; VI-NEXT: v_add_u16_sdwa v36, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v32, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v14, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_cbranch_execnz .LBB95_4 +; VI-NEXT: .LBB95_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, 
0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s4, v1 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_lshl_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v6, s5, v1 +; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc +; VI-NEXT: v_add_f32_e32 v7, s5, v1 +; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc +; VI-NEXT: s_lshl_b32 s5, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; VI-NEXT: v_add_f32_e32 v7, s5, v1 +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s5, v1 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 
0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 +; VI-NEXT: v_add_f32_e32 v12, s4, v1 +; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s4, v1 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 +; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 +; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB95_3: +; VI-NEXT: s_branch .LBB95_2 +; VI-NEXT: .LBB95_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 
v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB95_4 +; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: 
v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_and_b32 s5, s22, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s5, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: s_lshl_b32 s5, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s5, v1 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: s_and_b32 s5, s23, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_add_f32_e32 v6, s5, v1 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; 
GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: s_lshl_b32 s5, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v13, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s5, v1 +; GFX9-NEXT: v_bfe_u32 v13, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v13, v13, v7 +; GFX9-NEXT: v_add_u32_e32 v13, 0x7fff, v13 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_mov_b32_e32 v13, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_or_b32 v7, v6, v13, v7 +; GFX9-NEXT: v_and_or_b32 v6, v4, v13, v5 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v14, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 +; GFX9-NEXT: v_bfe_u32 v14, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v5 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v15, vcc +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_or_b32 v5, v4, v13, v5 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 +; GFX9-NEXT: v_bfe_u32 v14, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v4 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v15, vcc +; GFX9-NEXT: v_bfe_u32 v14, v1, 16, 1 +; 
GFX9-NEXT: v_add_u32_e32 v14, v14, v1 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v4, v4, v13, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GFX9-NEXT: v_and_or_b32 v3, v3, v13, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; GFX9-NEXT: v_and_or_b32 v2, v2, v13, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_and_or_b32 v1, v9, v13, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v13, v8 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB95_3: +; GFX9-NEXT: s_branch .LBB95_2 +; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v16i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-NEXT: .LBB95_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s8, s0, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8 +; GFX11-NEXT: s_and_b32 s8, s1, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s8 +; GFX11-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: s_and_b32 s1, s2, 
0xffff0000 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-NEXT: s_and_b32 s1, s5, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v8 :: v_dual_add_nc_u32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v8, v6 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v5, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v4, v7 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s3, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v10, v4 +; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX11-NEXT: 
v_cndmask_b32_e32 v8, v3, v11, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_and_b32 s0, s4, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v12, v5 +; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s4, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v12, v7 +; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: s_lshl_b32 s0, s5, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v14, v10 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v5, v12, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v14, v15, v11 +; GFX11-NEXT: s_and_b32 s0, s6, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v16, v5, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v13, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v7, v17, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s6, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v5 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s7, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v11, v14, 
v18 :: v_dual_add_nc_u32 v14, 0x7fff, v16 +; GFX11-NEXT: v_bfe_u32 v16, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v7 +; GFX11-NEXT: s_and_b32 s0, s7, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v14, v18 :: v_dual_add_nc_u32 v14, v20, v17 +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v20, v22, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_bfe_u32 v18, v21, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v16 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v23, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v24, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v5, 0xffff0000, v11, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 
v17, 16, v19 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v3, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v6, v15, vcc_lo +; GFX11-NEXT: v_and_or_b32 v6, 0xffff0000, v16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-NEXT: v_and_or_b32 v7, 0xffff0000, v7, v17 +; GFX11-NEXT: v_and_or_b32 v4, 0xffff0000, v13, v10 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v8, v12 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v9, v14 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v15 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB95_3: +; GFX11-NEXT: s_branch .LBB95_2 +; GFX11-NEXT: .LBB95_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + +define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v16i16_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: v_mov_b32_e32 v49, v11 +; SI-NEXT: v_mov_b32_e32 v50, v7 +; SI-NEXT: v_mov_b32_e32 v51, v3 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: 
v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v36, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v38, v4 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v50 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v48 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 
v5, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v8, v5, v55 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v12, v5, v54 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v5, v41 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v20, v5, v40 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v24, v5, v43 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v53 +; SI-NEXT: v_or_b32_e32 v4, v1, v52 +; SI-NEXT: v_or_b32_e32 v28, v5, v42 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v48 +; SI-NEXT: v_bfe_u32 v7, v51, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v50, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v49, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v48, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; 
implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: .LBB96_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; 
SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: .LBB96_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr13 +; 
VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB96_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v5 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v16, v4 +; VI-NEXT: v_mov_b32_e32 v49, v5 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v51, v7 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: .LBB96_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB96_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v9, 3 +; VI-NEXT: v_add_u16_sdwa v36, 
v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v32, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v14, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v22, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v18, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -15576,7 +31508,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; VI-NEXT: v_bfe_u32 v39, v36, 8, 8 -; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v50 ; VI-NEXT: v_mov_b32_e32 v1, v38 @@ -15625,7 +31557,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -15651,9 +31583,9 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: .LBB96_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: s_cbranch_execz .LBB96_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] @@ -15687,7 +31619,7 @@ define <32 x i8> 
@bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: .LBB96_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -15727,7 +31659,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[32:33] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[26:27] @@ -15745,9 +31677,9 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[18:19] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB96_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v33, v33, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] @@ -15773,7 +31705,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB48_4: ; %end +; GFX11-TRUE16-NEXT: .LBB96_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -15831,7 +31763,7 @@ define <32 x i8> 
@bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -15857,9 +31789,9 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB96_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v39, v39, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v37, v37, 3 op_sel_hi:[1,0] @@ -15893,7 +31825,7 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 @@ -15921,330 +31853,1084 @@ end: ret <32 x i8> %phi } -define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v16i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v7 -; GCN-NEXT: 
v_lshlrev_b32_e32 v33, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v25 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; 
GCN-NEXT: v_and_b32_e32 v14, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v24 -; GCN-NEXT: v_or_b32_e32 v1, v1, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v4, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v0, v0, v51 -; GCN-NEXT: v_or_b32_e32 v8, v8, v52 -; GCN-NEXT: v_or_b32_e32 v14, v14, v54 -; GCN-NEXT: v_or_b32_e32 v15, v15, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v34, v5 -; GCN-NEXT: v_or_b32_e32 v6, v35, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v9, v36, v9 -; GCN-NEXT: v_or_b32_e32 v10, v37, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v12, v39, v12 -; GCN-NEXT: v_or_b32_e32 v13, v38, v13 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: v_or_b32_e32 v23, v4, v5 -; GCN-NEXT: v_or_b32_e32 v27, v7, v9 -; GCN-NEXT: v_or_b32_e32 v31, v11, v12 -; GCN-NEXT: v_or_b32_e32 v17, v0, v2 -; GCN-NEXT: v_or_b32_e32 v21, v8, v6 -; GCN-NEXT: v_or_b32_e32 v25, v14, v10 -; GCN-NEXT: v_or_b32_e32 v29, v15, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NEXT: v_alignbit_b32 v1, v19, v2, 16 -; GCN-NEXT: v_alignbit_b32 v5, v23, v6, 16 -; GCN-NEXT: v_alignbit_b32 v9, v27, v10, 16 -; GCN-NEXT: v_alignbit_b32 v13, v31, v13, 
16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB49_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v26 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 
v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v5, v53, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v9, v54, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v13, v50, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v8, v52, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v12, v49, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; GCN-NEXT: v_or_b32_e32 v3, v38, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_or_b32_e32 v7, v39, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_or_b32_e32 v11, v37, v11 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_or_b32_e32 v15, v36, v15 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v10, v35, v10 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v14, v34, v14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; 
GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_or_b32_e32 v3, v7, v5 -; GCN-NEXT: v_or_b32_e32 v5, v11, v9 -; GCN-NEXT: v_or_b32_e32 v7, v15, v13 -; GCN-NEXT: v_or_b32_e32 v8, v10, v8 -; GCN-NEXT: v_or_b32_e32 v9, v14, v12 -; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_or_b32_e32 v2, v6, v4 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v31, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v9 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v2 -; GCN-NEXT: v_alignbit_b32 v1, v19, v17, 16 -; GCN-NEXT: v_alignbit_b32 v5, v23, v21, 16 -; GCN-NEXT: v_alignbit_b32 v9, v27, v25, 16 -; GCN-NEXT: v_alignbit_b32 v13, v31, v29, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v31 -; GCN-NEXT: .LBB49_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v17 -; GCN-NEXT: v_mov_b32_e32 v2, v19 -; GCN-NEXT: v_mov_b32_e32 v4, v21 -; GCN-NEXT: v_mov_b32_e32 v6, v23 -; GCN-NEXT: v_mov_b32_e32 v8, v25 -; GCN-NEXT: v_mov_b32_e32 v10, v27 -; GCN-NEXT: v_mov_b32_e32 v12, v29 -; GCN-NEXT: v_mov_b32_e32 v14, v31 -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i16_to_v32i8_scalar: +; SI: ; 
%bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; SI-NEXT: s_cbranch_scc0 .LBB97_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_alignbit_b32 v11, s9, v6, 24 +; SI-NEXT: v_alignbit_b32 v10, s9, v6, 16 +; SI-NEXT: v_alignbit_b32 v9, s9, v6, 8 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_alignbit_b32 v19, s10, v6, 24 +; SI-NEXT: v_alignbit_b32 v18, s10, v6, 16 +; SI-NEXT: v_alignbit_b32 v17, s10, v6, 8 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: v_or_b32_e32 v28, v6, v5 +; SI-NEXT: v_alignbit_b32 v3, s12, v1, 24 +; SI-NEXT: v_alignbit_b32 v2, s12, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, s12, v1, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, s8, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, s8, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, s8, 8 +; SI-NEXT: s_lshr_b32 s44, s12, 8 +; SI-NEXT: s_lshr_b32 s14, s9, 8 +; SI-NEXT: s_lshr_b32 s41, s10, 8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: s_and_b32 s45, s19, 0xffff +; SI-NEXT: s_and_b32 s15, s23, 0xffff +; SI-NEXT: s_and_b32 s42, s27, 0xffff +; SI-NEXT: 
v_and_b32_e32 v30, 0xffff, v4 +; SI-NEXT: s_bfe_u32 s13, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s40, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s43, s27, 0x80008 +; SI-NEXT: v_bfe_u32 v31, v4, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB97_3 +; SI-NEXT: .LBB97_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s11, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_alignbit_b32 v11, s9, v4, 24 +; SI-NEXT: v_alignbit_b32 v10, s9, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, s9, v4, 8 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_alignbit_b32 v3, s12, v1, 24 +; SI-NEXT: v_alignbit_b32 v2, 
s12, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, s12, v1, 8 +; SI-NEXT: v_alignbit_b32 v19, s10, v4, 24 +; SI-NEXT: v_alignbit_b32 v18, s10, v4, 16 +; SI-NEXT: v_alignbit_b32 v17, s10, v4, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v0, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v0, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v0, 8 +; SI-NEXT: s_lshr_b32 s13, s12, 24 +; SI-NEXT: s_lshr_b32 s45, s12, 16 +; SI-NEXT: s_lshr_b32 s44, s12, 8 +; SI-NEXT: s_lshr_b32 s40, s9, 24 +; SI-NEXT: s_lshr_b32 s15, s9, 16 +; SI-NEXT: s_lshr_b32 s14, s9, 8 +; SI-NEXT: s_lshr_b32 s43, s10, 24 +; SI-NEXT: s_lshr_b32 s42, s10, 16 +; SI-NEXT: s_lshr_b32 s41, s10, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: .LBB97_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s44 +; SI-NEXT: v_mov_b32_e32 v6, s45 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v12, s9 +; SI-NEXT: v_mov_b32_e32 v13, s14 +; SI-NEXT: v_mov_b32_e32 v14, s15 +; SI-NEXT: v_mov_b32_e32 v15, s40 +; SI-NEXT: v_mov_b32_e32 v16, s7 +; SI-NEXT: v_mov_b32_e32 v20, s10 +; SI-NEXT: v_mov_b32_e32 v21, s41 +; SI-NEXT: v_mov_b32_e32 v22, s42 +; SI-NEXT: v_mov_b32_e32 v23, s43 +; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB97_4: +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: 
$sgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB97_2 ; -; VI-LABEL: bitcast_v32i8_to_v16i16: +; VI-LABEL: bitcast_v16i16_to_v32i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v2 -; VI-NEXT: v_mov_b32_e32 v31, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; VI-NEXT: v_mov_b32_e32 v32, v6 -; VI-NEXT: v_mov_b32_e32 v34, v4 -; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v2 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB49_3 -; VI-NEXT: ; %bb.1: ; %Flow -; VI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB49_4 -; VI-NEXT: .LBB49_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB97_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s24, s23, 8 +; VI-NEXT: s_lshr_b32 s25, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s21, 8 +; VI-NEXT: s_lshr_b32 s40, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 8 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB97_3 +; VI-NEXT: .LBB97_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, 
s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s24, s23, 8 +; VI-NEXT: s_lshr_b32 s25, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s21, 8 +; VI-NEXT: s_lshr_b32 s40, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s57, s17, 8 +; VI-NEXT: s_lshr_b32 s58, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: .LBB97_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s59 +; VI-NEXT: v_mov_b32_e32 v2, s58 +; VI-NEXT: v_mov_b32_e32 v3, s10 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s57 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s47 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s46 +; VI-NEXT: v_mov_b32_e32 v10, s45 +; VI-NEXT: v_mov_b32_e32 v11, 
s8 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v13, s44 +; VI-NEXT: v_mov_b32_e32 v14, s43 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s41 +; VI-NEXT: v_mov_b32_e32 v18, s40 +; VI-NEXT: v_mov_b32_e32 v19, s6 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v21, s29 +; VI-NEXT: v_mov_b32_e32 v22, s28 +; VI-NEXT: v_mov_b32_e32 v23, s27 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s26 +; VI-NEXT: v_mov_b32_e32 v26, s25 +; VI-NEXT: v_mov_b32_e32 v27, s4 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v29, s24 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v31, s14 ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB49_3: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: .LBB97_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; 
VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB97_2 +; +; GFX9-LABEL: bitcast_v16i16_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s23, 8 +; GFX9-NEXT: s_lshr_b32 s24, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 8 +; GFX9-NEXT: s_lshr_b32 s29, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s45, s19, 8 +; GFX9-NEXT: s_lshr_b32 s44, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 8 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB97_4 +; GFX9-NEXT: .LBB97_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, s21, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; GFX9-NEXT: s_branch .LBB97_5 +; GFX9-NEXT: .LBB97_3: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; 
GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB97_2 +; GFX9-NEXT: .LBB97_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v35, s59 +; GFX9-NEXT: v_mov_b32_e32 v2, s57 +; GFX9-NEXT: v_mov_b32_e32 v5, s58 +; GFX9-NEXT: v_mov_b32_e32 v6, s56 +; GFX9-NEXT: v_mov_b32_e32 v7, s47 +; GFX9-NEXT: v_mov_b32_e32 v34, s46 +; GFX9-NEXT: v_mov_b32_e32 v10, s44 +; GFX9-NEXT: v_mov_b32_e32 v13, s45 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v15, s42 +; GFX9-NEXT: v_mov_b32_e32 v33, s41 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-NEXT: v_mov_b32_e32 v32, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s24 +; GFX9-NEXT: v_mov_b32_e32 v29, s25 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: v_mov_b32_e32 v27, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB97_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-NEXT: v_mov_b32_e32 v28, v25 +; GFX9-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-NEXT: v_mov_b32_e32 v17, v33 +; GFX9-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v16i16_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, 
v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB97_5 +; GFX11-TRUE16-NEXT: .LBB97_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB97_2 +; GFX11-TRUE16-NEXT: .LBB97_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB97_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v16i16_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 
+; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-FAKE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v39, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v35, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v33, s19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v34, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v36, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v38, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-FAKE16-NEXT: s_branch .LBB97_5 +; GFX11-FAKE16-NEXT: 
.LBB97_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: s_branch .LBB97_2 +; GFX11-FAKE16-NEXT: .LBB97_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s0 :: v_dual_mov_b32 v39, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s2 :: v_dual_mov_b32 v37, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s16 :: v_dual_mov_b32 v35, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v33, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v2, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v6, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s42 :: v_dual_mov_b32 v10, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s41 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v18, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v22, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v26, s15 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v21, s25 :: v_dual_mov_b32 v30, s14 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v23, s22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v25, s21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v29, s20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, s13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v27, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v19, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB97_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + +define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v32i8_to_v16i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v34, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v21 +; 
SI-NEXT: v_lshlrev_b32_e32 v51, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v37, v2 +; SI-NEXT: v_or_b32_e32 v2, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_or_b32_e32 v4, v39, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v7, v48, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v6, v5, v7 +; SI-NEXT: 
v_and_b32_e32 v5, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v21, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v9, v9, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v8, v50, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v11, v51, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v10, v9, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v29, v0, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v13, v13, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_or_b32_e32 v12, v53, v9 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v15, v27, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; SI-NEXT: v_or_b32_e32 v14, v13, v15 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v8, v0, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_or_b32_e32 v18, v55, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v18, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: 
$vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: .LBB98_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v55, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v30 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v27, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v14, vcc, 
s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, 
v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v1, v2, v21, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v29, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: .LBB98_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v21 +; SI-NEXT: v_mov_b32_e32 v4, v29 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v16i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v23, 
8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v2 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB98_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB98_4 +; VI-NEXT: .LBB98_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB98_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 ; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr18 @@ -16271,8 +32957,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_2 -; VI-NEXT: .LBB49_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB98_2 +; VI-NEXT: .LBB98_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 
3, v30 ; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 @@ -16364,14 +33050,14 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: s_cbranch_execnz .LBB98_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB49_4 -; GFX9-NEXT: .LBB49_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB98_4 +; GFX9-NEXT: .LBB98_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB49_3: ; %cmp.false +; GFX9-NEXT: .LBB98_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 @@ -16430,8 +33116,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB49_2 -; GFX9-NEXT: .LBB49_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB98_2 +; GFX9-NEXT: .LBB98_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 ; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 @@ -16529,14 +33215,14 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3 ; 
GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-TRUE16-NEXT: .LBB49_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-TRUE16-NEXT: .LBB98_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.h @@ -16602,8 +33288,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-TRUE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v28.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v24.l, 3 @@ -16703,14 +33389,14 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-FAKE16-NEXT: .LBB49_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-FAKE16-NEXT: .LBB98_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, 
v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 @@ -16784,8 +33470,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-FAKE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-FAKE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v28, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v30, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v24, 3 @@ -16877,154 +33563,1079 @@ end: ret <16 x i16> %phi } +define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v16i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v19, v14 +; SI-NEXT: v_mov_b32_e32 v20, v12 +; SI-NEXT: v_readfirstlane_b32 s13, v11 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v13 +; SI-NEXT: s_cbranch_scc0 .LBB99_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 
16 +; SI-NEXT: s_lshl_b32 s8, s19, 24 +; SI-NEXT: s_or_b32 s4, s8, s4 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s12, s29, 8 +; SI-NEXT: s_or_b32 s8, s8, s12 +; SI-NEXT: s_and_b32 s12, s6, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s15, s7, 24 +; SI-NEXT: s_or_b32 s41, s15, s12 +; SI-NEXT: s_and_b32 s12, s26, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s15, s27, 24 +; SI-NEXT: s_or_b32 s12, s15, s12 +; SI-NEXT: s_and_b32 s15, s16, 0xff +; SI-NEXT: s_lshl_b32 s40, s17, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 +; SI-NEXT: s_or_b32 s15, s15, s40 +; SI-NEXT: v_or_b32_e32 v9, v9, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v11, v0, v10 +; SI-NEXT: s_or_b32 s15, s15, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s40, s25, 8 +; SI-NEXT: v_or_b32_e32 v10, v9, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 +; SI-NEXT: s_or_b32 s4, s4, s40 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_or_b32_e32 v12, v3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v16 +; SI-NEXT: s_or_b32 s12, s4, s12 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s40, s9, 8 +; SI-NEXT: v_or_b32_e32 v9, v9, v21 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_or_b32 s4, s4, s40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v15, v7, v13 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v14, v9, v15 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v18, s4, v12 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s40, s13, 8 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_or_b32 s4, s4, s40 +; SI-NEXT: s_or_b32 s8, s8, s41 +; SI-NEXT: 
v_or_b32_e32 v22, v17, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_alignbit_b32 v1, s11, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, s8, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v22, 16 +; SI-NEXT: v_or_b32_e32 v12, s4, v22 +; SI-NEXT: s_lshr_b32 s40, s5, 16 +; SI-NEXT: s_lshr_b32 s41, s41, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_cbranch_execnz .LBB99_3 +; SI-NEXT: .LBB99_2: ; %cmp.true +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, 
s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s12, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: s_add_i32 s15, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_or_b32 s4, s5, 
s4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: s_add_i32 s11, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_alignbit_b32 v1, s11, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_alignbit_b32 v5, s8, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v18, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: s_lshr_b32 s40, s11, 16 +; SI-NEXT: s_lshr_b32 s41, s8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v3, s40 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s41 +; SI-NEXT: v_mov_b32_e32 v8, v18 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB99_4: +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB99_2 +; +; VI-LABEL: bitcast_v32i8_to_v16i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v21, v6 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v22, v2 +; VI-NEXT: v_mov_b32_e32 v19, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; 
VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_cbranch_scc0 .LBB99_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: 
s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB99_3 +; VI-NEXT: .LBB99_2: ; %cmp.true +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v5, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v21 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v20 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: v_or_b32_sdwa v4, v24, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v22 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v19 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v8 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v2 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB99_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB99_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB99_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v16i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v22, v4 +; GFX9-NEXT: v_mov_b32_e32 v21, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; 
GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_or_b32_sdwa v4, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 
v4, v4, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB99_3 +; GFX9-NEXT: .LBB99_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v21 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v10 +; GFX9-NEXT: 
v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v19 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: .LBB99_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB99_4: +; 
GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB99_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: 
s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 
16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v14, 16, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-TRUE16-NEXT: .LBB99_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; 
GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 
+; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v6, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB99_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB99_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB99_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; 
GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; 
GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v7, 16, v23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-FAKE16-NEXT: .LBB99_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 
+; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v16 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v17 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v7, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB99_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB99_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB99_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v16bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB50_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB50_4 -; GCN-NEXT: .LBB50_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB50_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; GCN-NEXT: 
v_lshlrev_b32_e32 v9, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_2 -; GCN-NEXT: .LBB50_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 
0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f16_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: 
v_cvt_f16_f32_e32 v19, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB100_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB100_4 +; SI-NEXT: .LBB100_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB100_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v27 +; SI-NEXT: 
v_lshlrev_b32_e32 v11, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: .LBB100_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f16_to_v16bf16: ; VI: ; %bb.0: @@ -17033,7 +34644,7 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: s_cbranch_execz .LBB100_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; 
VI-NEXT: v_mov_b32_e32 v9, 0x200 ; VI-NEXT: v_add_f16_e32 v8, 0x200, v0 @@ -17060,7 +34671,7 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v11, v2 ; VI-NEXT: v_or_b32_e32 v1, v10, v1 ; VI-NEXT: v_or_b32_e32 v0, v8, v0 -; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: .LBB100_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -17093,7 +34704,7 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-NEXT: s_cbranch_execz .LBB100_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] @@ -17103,7 +34714,7 @@ define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB50_2: ; %end +; GFX11-NEXT: .LBB100_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -17123,170 +34734,439 @@ end: ret <16 x bfloat> %phi } +define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s25 +; SI-NEXT: 
v_cvt_f16_f32_e32 v26, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: .LBB101_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 +; 
SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB101_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; 
implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB101_2 +; +; VI-LABEL: bitcast_v16f16_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB101_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB101_4 +; VI-NEXT: .LBB101_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_add_f16_e32 v5, s22, v0 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v13, s5 +; VI-NEXT: v_add_f16_e32 v7, s23, v0 +; VI-NEXT: v_add_f16_sdwa v13, v13, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v5, v6 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v7, v7, v13 +; VI-NEXT: v_add_f16_sdwa v13, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_add_f16_e32 v8, s16, v0 +; VI-NEXT: v_add_f16_sdwa v9, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v10, 
v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v11, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s21, v0 +; VI-NEXT: v_or_b32_e32 v5, v0, v5 +; VI-NEXT: v_or_b32_e32 v4, v4, v13 +; VI-NEXT: v_or_b32_e32 v3, v3, v12 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v0, v8, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB101_3: +; VI-NEXT: s_branch .LBB101_2 +; VI-NEXT: .LBB101_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB101_4 +; GFX9-NEXT: .LBB101_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 
s[30:31] +; GFX9-NEXT: .LBB101_3: +; GFX9-NEXT: s_branch .LBB101_2 +; GFX9-NEXT: .LBB101_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v16bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX11-NEXT: .LBB101_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB101_3: +; GFX11-NEXT: s_branch .LBB101_2 +; GFX11-NEXT: .LBB101_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <16 x bfloat> + 
br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB51_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB51_4 -; GCN-NEXT: .LBB51_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 
s[30:31] -; GCN-NEXT: .LBB51_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 
-; GCN-NEXT: s_cbranch_execz .LBB51_2 -; GCN-NEXT: .LBB51_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 
16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16bf16_to_v16f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; 
implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB102_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB102_4 +; SI-NEXT: .LBB102_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB102_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: ; implicit-def: 
$vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: .LBB102_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; 
SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16bf16_to_v16f16: ; VI: ; %bb.0: @@ -17295,7 +35175,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: s_cbranch_execz .LBB102_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -17442,7 +35322,7 @@ define <16 x half> 
@bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v10, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v9, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -17453,7 +35333,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: s_cbranch_execz .LBB102_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -17577,7 +35457,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v2, v10, s6 ; GFX9-NEXT: v_perm_b32 v1, v1, v9, s6 ; GFX9-NEXT: v_perm_b32 v0, v0, v8, s6 -; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: .LBB102_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -17589,7 +35469,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v6 @@ -17733,7 +35613,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v13, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v11 -; GFX11-TRUE16-NEXT: .LBB51_2: ; %end +; GFX11-TRUE16-NEXT: .LBB102_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17745,7 +35625,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -17880,7 +35760,7 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v11, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB51_2: ; %end +; GFX11-FAKE16-NEXT: .LBB102_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -17900,221 +35780,910 @@ end: ret <16 x half> %phi } -define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f16_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v17, v14 -; GCN-NEXT: v_mov_b32_e32 v18, v6 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v15 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v48, v17 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB52_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB52_4 -; GCN-NEXT: .LBB52_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB52_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: v_bfe_u32 v15, v14, 8, 8 -; GCN-NEXT: v_bfe_u32 v23, v22, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v33, v0 -; GCN-NEXT: v_or_b32_e32 v4, v32, v1 -; GCN-NEXT: v_or_b32_e32 
v8, v35, v2 -; GCN-NEXT: v_or_b32_e32 v12, v34, v3 -; GCN-NEXT: v_or_b32_e32 v16, v38, v5 -; GCN-NEXT: v_or_b32_e32 v20, v36, v9 -; GCN-NEXT: v_or_b32_e32 v24, v49, v10 -; GCN-NEXT: v_or_b32_e32 v28, v48, v11 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_bfe_u32 v31, v30, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_2 -; GCN-NEXT: .LBB52_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v34 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v13, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: v_bfe_u32 v15, v14, 8, 8 -; GCN-NEXT: v_bfe_u32 v23, v22, 8, 8 -; GCN-NEXT: 
v_or_b32_e32 v24, v1, v0 -; GCN-NEXT: v_or_b32_e32 v28, v2, v12 -; GCN-NEXT: v_or_b32_e32 v16, v4, v3 -; GCN-NEXT: v_or_b32_e32 v20, v5, v17 -; GCN-NEXT: v_or_b32_e32 v8, v8, v18 -; GCN-NEXT: v_or_b32_e32 v12, v9, v19 -; GCN-NEXT: v_or_b32_e32 v0, v11, v10 -; GCN-NEXT: v_or_b32_e32 v4, v13, v21 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_bfe_u32 v31, v30, 8, 8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v30, 
1.0, v0 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v1 +; SI-NEXT: s_cbranch_scc0 .LBB103_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_cbranch_execnz .LBB103_3 +; SI-NEXT: .LBB103_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 
16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, 
v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB103_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB103_2 ; -; VI-LABEL: bitcast_v16f16_to_v32i8: +; VI-LABEL: bitcast_v16bf16_to_v16f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v5 -; VI-NEXT: v_mov_b32_e32 v34, v4 -; VI-NEXT: v_mov_b32_e32 v33, v3 -; VI-NEXT: v_mov_b32_e32 v32, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB103_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB103_4 +; VI-NEXT: .LBB103_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 
0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: 
v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s4, v1 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_lshl_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v6, s5, v1 +; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc +; VI-NEXT: v_add_f32_e32 v7, s5, v1 +; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc +; VI-NEXT: s_lshl_b32 s5, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; VI-NEXT: v_add_f32_e32 v7, s5, v1 +; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s5, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s5, v1 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: 
v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 +; VI-NEXT: v_add_f32_e32 v12, s4, v1 +; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s4, v1 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 +; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 +; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 +; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB103_3: +; VI-NEXT: s_branch .LBB103_2 +; VI-NEXT: .LBB103_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; 
VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB103_4 +; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v1 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, 
v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s5, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: s_lshl_b32 s5, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_add_f32_e32 v6, s5, v1 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: s_and_b32 s5, s23, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s5, v1 +; GFX9-NEXT: 
v_bfe_u32 v12, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v7 +; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: s_lshl_b32 s5, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc +; GFX9-NEXT: v_add_f32_e32 v12, s5, v1 +; GFX9-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v13, v13, v12 +; GFX9-NEXT: v_add_u32_e32 v13, 0x7fff, v13 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; GFX9-NEXT: v_mov_b32_e32 v13, 0xffff +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v6, v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_and_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v5, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v12 +; GFX9-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v5 +; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v14, vcc +; GFX9-NEXT: v_add_f32_e32 v12, s4, v1 +; GFX9-NEXT: v_bfe_u32 v14, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v12 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_and_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v12, s4, v1 +; 
GFX9-NEXT: v_bfe_u32 v14, v12, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v12 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc +; GFX9-NEXT: v_bfe_u32 v14, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v14, v14, v1 +; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v15, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; GFX9-NEXT: v_and_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v10 +; GFX9-NEXT: v_and_b32_sdwa v10, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v0, v16, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB103_3: +; GFX9-NEXT: s_branch .LBB103_2 +; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; 
GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v16f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, s19 +; GFX11-NEXT: s_mov_b32 s6, s18 +; GFX11-NEXT: s_mov_b32 s5, s17 +; GFX11-NEXT: s_mov_b32 s4, s16 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-NEXT: .LBB103_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s8, s0, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s8 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s1, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: s_and_b32 s8, s2, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s4, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: 
v_add_nc_u32_e32 v3, v9, v5 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s3, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_add_nc_u32 v4, v6, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshl_b32 s0, s3, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s4, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v9 :: v_dual_add_nc_u32 v5, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_and_b32 s0, s5, 0xffff0000 +; GFX11-NEXT: 
v_add_nc_u32_e32 v4, v4, v7 +; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshl_b32 s0, s5, 16 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX11-NEXT: v_bfe_u32 v5, v12, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v13, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v12 +; GFX11-NEXT: s_and_b32 s0, s6, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v14, vcc_lo +; GFX11-NEXT: v_bfe_u32 v14, v15, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s6, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v6 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v10 :: v_dual_add_nc_u32 v10, v14, v15 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s7, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v18, v14, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: s_and_b32 s0, s7, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v13, v16, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v15 +; GFX11-NEXT: v_bfe_u32 v20, v17, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v18, v14 +; GFX11-NEXT: 
v_bfe_u32 v13, v12, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v19, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v18, v20, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v10, v10, v16 :: v_dual_add_nc_u32 v15, 0x7fff, v15 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v12 +; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v15, v21, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v16, v16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v16 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v13, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshl_or_b32 v6, v12, 16, v14 +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v7, v13, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; 
GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v3 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v4, v17, 16, v10 +; GFX11-NEXT: v_lshl_or_b32 v3, v9, 16, v12 +; GFX11-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX11-NEXT: v_lshl_or_b32 v1, v8, 16, v13 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v14 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB103_3: +; GFX11-NEXT: s_branch .LBB103_2 +; GFX11-NEXT: .LBB103_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + +define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v16f16_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB104_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB104_4 +; SI-NEXT: .LBB104_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB104_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v8, v36, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v35, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v16, v39, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_or_b32_e32 v20, v38, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v24, v50, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v4, v32, v1 +; SI-NEXT: 
v_or_b32_e32 v28, v49, v5 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_2 +; SI-NEXT: .LBB104_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_or_b32_e32 v24, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_add_f32_e32 v3, 
0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_or_b32_e32 v28, v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v16, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v20, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v8, v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, 
v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f16_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v35, v5 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_mov_b32_e32 v33, v3 +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: ; implicit-def: $vgpr37 @@ -18133,7 +36702,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: s_cbranch_execz .LBB104_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, 
v7 @@ -18151,9 +36720,9 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[32:33] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: .LBB52_2: ; %Flow +; VI-NEXT: .LBB104_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: s_cbranch_execz .LBB104_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v5, 0x200 ; VI-NEXT: v_add_f16_sdwa v14, v33, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -18204,7 +36773,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; VI-NEXT: v_bfe_u32 v37, v36, 8, 8 -; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: .LBB104_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v32 @@ -18252,7 +36821,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: s_cbranch_execz .LBB104_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -18278,9 +36847,9 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB52_2: ; %Flow +; GFX9-NEXT: .LBB104_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: s_cbranch_execz .LBB104_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] @@ -18315,7 +36884,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, 
i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: .LBB52_4: ; %end +; GFX9-NEXT: .LBB104_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -18355,7 +36924,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[32:33] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[26:27] @@ -18373,9 +36942,9 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[18:19] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB52_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB104_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v33, 0x200, v33 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] @@ -18401,7 +36970,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; GFX11-TRUE16-NEXT: .LBB52_4: ; %end +; GFX11-TRUE16-NEXT: .LBB104_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h @@ -18459,7 +37028,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> 
%a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -18485,9 +37054,9 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB104_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v39, 0x200, v39 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v37, 0x200, v37 op_sel_hi:[0,1] @@ -18521,7 +37090,7 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB52_4: ; %end +; GFX11-FAKE16-NEXT: .LBB104_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 @@ -18549,226 +37118,977 @@ end: ret <32 x i8> %phi } +define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f16_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v34, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s21 +; SI-NEXT: 
v_cvt_f16_f32_e32 v36, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v8, v36, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v35, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v16, v39, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_or_b32_e32 v20, v38, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v24, v50, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v4, v32, v1 +; SI-NEXT: v_or_b32_e32 v28, v49, v5 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 
+; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: .LBB105_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_or_b32_e32 v24, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_or_b32_e32 v28, v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v16, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v20, v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v8, v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v12, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB105_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: 
$vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB105_2 +; +; VI-LABEL: bitcast_v16f16_to_v32i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB105_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s40, s23, 24 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s25, s23, 8 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s29, s22, 8 +; VI-NEXT: s_lshr_b32 s41, s21, 24 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s24, s21, 8 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s28, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s15, s19, 8 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s27, s18, 8 +; VI-NEXT: s_lshr_b32 s43, s17, 24 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_lshr_b32 s26, s16, 8 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB105_4 +; VI-NEXT: .LBB105_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_e32 v6, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_add_f16_e32 v2, s4, v1 +; VI-NEXT: 
s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_add_f16_e32 v14, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; VI-NEXT: v_add_f16_e32 v34, s19, v1 +; VI-NEXT: v_add_f16_e32 v10, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v12, v34, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; VI-NEXT: v_add_f16_e32 v8, s18, v1 +; VI-NEXT: v_add_f16_e32 v22, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v11, v8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; VI-NEXT: v_add_f16_e32 v33, s21, v1 +; VI-NEXT: v_add_f16_e32 v18, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v20, v33, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; VI-NEXT: v_add_f16_e32 v16, s20, v1 +; VI-NEXT: v_add_f16_e32 v30, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_f16_e32 v35, s17, v1 +; VI-NEXT: v_or_b32_e32 v19, v16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 +; VI-NEXT: v_add_f16_e32 v32, s23, v1 +; VI-NEXT: v_add_f16_e32 v26, s4, v1 +; VI-NEXT: v_or_b32_e32 v4, v35, v0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, s16, v1 +; VI-NEXT: v_or_b32_e32 v37, v32, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; VI-NEXT: v_add_f16_e32 v24, s22, v1 +; VI-NEXT: v_or_b32_e32 v3, v0, v3 +; VI-NEXT: v_or_b32_e32 v36, v24, v5 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[36:37] +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[19:20] +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v36 +; VI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; VI-NEXT: v_bfe_u32 v15, v14, 
8, 8 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: s_branch .LBB105_5 +; VI-NEXT: .LBB105_3: +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: s_branch .LBB105_2 +; VI-NEXT: .LBB105_4: +; VI-NEXT: v_mov_b32_e32 v2, s59 +; VI-NEXT: v_mov_b32_e32 v6, s58 +; VI-NEXT: v_mov_b32_e32 v10, s57 +; VI-NEXT: v_mov_b32_e32 v14, s56 +; VI-NEXT: v_mov_b32_e32 v18, s47 +; VI-NEXT: v_mov_b32_e32 v22, s46 +; VI-NEXT: v_mov_b32_e32 v26, s45 +; VI-NEXT: v_mov_b32_e32 v30, s44 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v35, s17 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v33, s21 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v32, s23 +; VI-NEXT: v_mov_b32_e32 v31, s40 +; VI-NEXT: v_mov_b32_e32 v23, s41 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v7, s43 +; VI-NEXT: v_mov_b32_e32 v25, s29 +; VI-NEXT: v_mov_b32_e32 v29, s25 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v21, s24 +; VI-NEXT: v_mov_b32_e32 v9, s27 +; VI-NEXT: v_mov_b32_e32 v13, s15 +; VI-NEXT: v_mov_b32_e32 v1, s26 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: 
v_mov_b32_e32 v27, s10 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB105_5: ; %end +; VI-NEXT: v_mov_b32_e32 v4, v35 +; VI-NEXT: v_mov_b32_e32 v12, v34 +; VI-NEXT: v_mov_b32_e32 v20, v33 +; VI-NEXT: v_mov_b32_e32 v28, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s23, 24 +; GFX9-NEXT: s_lshr_b32 s15, s23, 16 +; GFX9-NEXT: s_lshr_b32 s25, s23, 8 +; GFX9-NEXT: s_lshr_b32 s24, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s21, 24 +; GFX9-NEXT: s_lshr_b32 s28, s21, 16 +; GFX9-NEXT: s_lshr_b32 s40, s21, 8 +; GFX9-NEXT: s_lshr_b32 s29, s20, 16 +; GFX9-NEXT: s_lshr_b32 s41, s20, 8 +; GFX9-NEXT: s_lshr_b32 s42, s19, 24 +; GFX9-NEXT: s_lshr_b32 s43, s19, 16 +; GFX9-NEXT: s_lshr_b32 s45, s19, 8 +; GFX9-NEXT: s_lshr_b32 s44, s18, 16 +; GFX9-NEXT: s_lshr_b32 s46, s18, 8 +; GFX9-NEXT: s_lshr_b32 s47, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 8 +; GFX9-NEXT: s_lshr_b32 s57, s16, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB105_4 +; GFX9-NEXT: .LBB105_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v2, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s19, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s18, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, s21, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, s20, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, s23, v2 op_sel_hi:[1,0] 
+; GFX9-NEXT: v_pk_add_f16 v24, s22, v2 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; GFX9-NEXT: s_branch .LBB105_5 +; GFX9-NEXT: .LBB105_3: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: 
; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB105_2 +; GFX9-NEXT: .LBB105_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v35, s59 +; GFX9-NEXT: v_mov_b32_e32 v2, s57 +; GFX9-NEXT: v_mov_b32_e32 v5, s58 +; GFX9-NEXT: v_mov_b32_e32 v6, s56 +; GFX9-NEXT: v_mov_b32_e32 v7, s47 +; GFX9-NEXT: v_mov_b32_e32 v34, s46 +; GFX9-NEXT: v_mov_b32_e32 v10, s44 +; GFX9-NEXT: v_mov_b32_e32 v13, s45 +; GFX9-NEXT: v_mov_b32_e32 v14, s43 +; GFX9-NEXT: v_mov_b32_e32 v15, s42 +; GFX9-NEXT: v_mov_b32_e32 v33, s41 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v21, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v23, s27 +; GFX9-NEXT: v_mov_b32_e32 v32, s26 +; GFX9-NEXT: v_mov_b32_e32 v26, s24 +; GFX9-NEXT: v_mov_b32_e32 v29, s25 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: v_mov_b32_e32 v27, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB105_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-NEXT: v_mov_b32_e32 v20, v17 +; GFX9-NEXT: v_mov_b32_e32 v28, v25 +; GFX9-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-NEXT: v_mov_b32_e32 v17, v33 +; GFX9-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v16f16_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; 
%cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-TRUE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[19:20] +; 
GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB105_5 +; GFX11-TRUE16-NEXT: .LBB105_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB105_2 +; GFX11-TRUE16-NEXT: .LBB105_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB105_5: 
; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v16f16_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; 
GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-FAKE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v39, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v35, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v33, 0x200, s19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v34, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v36, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v38, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-FAKE16-NEXT: s_branch .LBB105_5 +; GFX11-FAKE16-NEXT: .LBB105_3: +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: s_branch .LBB105_2 +; GFX11-FAKE16-NEXT: .LBB105_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s0 :: v_dual_mov_b32 v39, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s2 :: v_dual_mov_b32 v37, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s16 :: v_dual_mov_b32 v35, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s18 :: v_dual_mov_b32 v33, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v2, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v6, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s42 :: v_dual_mov_b32 v10, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s41 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v18, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v22, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v26, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s25 :: 
v_dual_mov_b32 v30, s14 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v23, s22 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v25, s21 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v29, s20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, s13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v27, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v19, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB105_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v16f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v23 -; 
GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v30 -; GCN-NEXT: v_or_b32_e32 v0, v0, v32 -; GCN-NEXT: v_or_b32_e32 v1, v1, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v34 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_or_b32_e32 v4, v4, v36 -; GCN-NEXT: v_or_b32_e32 v5, v5, v37 -; GCN-NEXT: v_or_b32_e32 v6, v6, v38 -; GCN-NEXT: v_or_b32_e32 v7, v7, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; 
GCN-NEXT: v_or_b32_e32 v9, v9, v49 -; GCN-NEXT: v_or_b32_e32 v10, v10, v50 -; GCN-NEXT: v_or_b32_e32 v11, v11, v51 -; GCN-NEXT: v_or_b32_e32 v12, v12, v52 -; GCN-NEXT: v_or_b32_e32 v13, v13, v53 -; GCN-NEXT: v_or_b32_e32 v14, v14, v54 -; GCN-NEXT: v_or_b32_e32 v15, v15, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: 
$vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB53_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_or_b32_e32 v3, v54, v3 -; GCN-NEXT: v_or_b32_e32 v5, v53, v5 -; GCN-NEXT: v_or_b32_e32 v7, v52, v7 -; GCN-NEXT: v_or_b32_e32 v9, v51, v9 -; GCN-NEXT: v_or_b32_e32 v11, v50, v11 -; GCN-NEXT: v_or_b32_e32 v13, v49, v13 -; GCN-NEXT: v_or_b32_e32 v15, v48, v15 -; GCN-NEXT: v_or_b32_e32 v14, v39, v14 -; GCN-NEXT: v_or_b32_e32 v12, v38, v12 
-; GCN-NEXT: v_or_b32_e32 v10, v37, v10 -; GCN-NEXT: v_or_b32_e32 v8, v36, v8 -; GCN-NEXT: v_or_b32_e32 v6, v35, v6 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v0, v32, v0 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x300, v1 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v16 -; GCN-NEXT: .LBB53_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v31 -; GCN-NEXT: v_mov_b32_e32 v2, v23 -; GCN-NEXT: v_mov_b32_e32 v4, v19 -; GCN-NEXT: v_mov_b32_e32 v6, v27 -; GCN-NEXT: v_mov_b32_e32 v8, v17 -; GCN-NEXT: v_mov_b32_e32 v10, v21 -; GCN-NEXT: v_mov_b32_e32 v12, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v29 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i8_to_v16f16: +; SI: ; 
%bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v34, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v29 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v8, v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v18 +; 
SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v8, v8, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v39 +; SI-NEXT: v_or_b32_e32 v5, v5, v48 +; SI-NEXT: v_or_b32_e32 v6, v6, v49 +; SI-NEXT: v_or_b32_e32 v7, v7, v50 +; SI-NEXT: v_or_b32_e32 v8, v8, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; 
SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB106_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: 
v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; 
SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: .LBB106_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v17 +; SI-NEXT: v_mov_b32_e32 v10, v21 +; SI-NEXT: v_mov_b32_e32 v12, v25 +; SI-NEXT: v_mov_b32_e32 v14, v29 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16f16: ; VI: ; %bb.0: @@ -18801,14 +38121,14 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: s_cbranch_execnz .LBB106_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_4 -; VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB106_4 +; VI-NEXT: .LBB106_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB53_3: ; %cmp.false +; VI-NEXT: .LBB106_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -18866,8 +38186,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB53_2 -; VI-NEXT: .LBB53_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB106_2 +; VI-NEXT: .LBB106_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v30 ; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 @@ -18959,14 +38279,14 
@@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: s_cbranch_execnz .LBB106_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_4 -; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB106_4 +; GFX9-NEXT: .LBB106_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB53_3: ; %cmp.false +; GFX9-NEXT: .LBB106_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 @@ -19025,8 +38345,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_2 -; GFX9-NEXT: .LBB53_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB106_2 +; GFX9-NEXT: .LBB106_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 ; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 @@ -19124,14 +38444,14 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-TRUE16-NEXT: .LBB53_2: 
; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-TRUE16-NEXT: .LBB106_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.h @@ -19197,8 +38517,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-TRUE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v28.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v24.l, 3 @@ -19298,14 +38618,14 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-FAKE16-NEXT: .LBB53_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-FAKE16-NEXT: .LBB106_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 @@ -19379,8 +38699,8 @@ define <16 x half> 
@bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-FAKE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-FAKE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v28, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v30, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v24, 3 @@ -19472,258 +38792,1156 @@ end: ret <16 x half> %phi } -define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v16bf16_to_v32i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; 
implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB54_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB54_4 -; GCN-NEXT: .LBB54_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB54_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v53 -; GCN-NEXT: v_alignbit_b32 v0, v0, v33, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v32, 16 -; GCN-NEXT: v_alignbit_b32 v8, v1, v36, 16 -; GCN-NEXT: v_alignbit_b32 v12, v14, v35, 16 -; GCN-NEXT: v_alignbit_b32 v16, v2, v48, 16 -; GCN-NEXT: v_alignbit_b32 v20, v22, v39, 16 -; GCN-NEXT: v_alignbit_b32 v24, v3, v52, 16 -; GCN-NEXT: v_alignbit_b32 v28, v30, v51, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, 
v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_2 -; GCN-NEXT: .LBB54_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v14, 
0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v34 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v16 -; GCN-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v28, v30, v2, 16 -; GCN-NEXT: v_alignbit_b32 v16, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v20, v22, v5, 16 -; GCN-NEXT: v_alignbit_b32 v8, v17, v8, 16 -; GCN-NEXT: v_alignbit_b32 v12, v14, v9, 16 -; GCN-NEXT: v_alignbit_b32 v0, v18, v11, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v13, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; 
GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v31 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v16f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_readfirstlane_b32 s46, v17 +; SI-NEXT: v_readfirstlane_b32 s47, v16 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_readfirstlane_b32 s45, v14 +; SI-NEXT: v_readfirstlane_b32 s42, v13 +; SI-NEXT: v_readfirstlane_b32 s43, v12 +; SI-NEXT: v_readfirstlane_b32 s40, v11 +; SI-NEXT: v_readfirstlane_b32 s41, v10 +; SI-NEXT: v_readfirstlane_b32 s14, v9 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: v_readfirstlane_b32 s13, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: s_cbranch_scc0 .LBB107_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s22, 
0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_cbranch_execnz .LBB107_3 +; SI-NEXT: .LBB107_2: ; %cmp.true +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: 
s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xff +; SI-NEXT: s_lshl_b32 s11, s27, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s11, s24, 0xff +; SI-NEXT: s_lshl_b32 s13, s25, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: s_and_b32 s13, s22, 0xff +; SI-NEXT: s_lshl_b32 s15, s23, 8 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s45, 0xff +; SI-NEXT: s_lshl_b32 s44, s44, 8 +; SI-NEXT: s_and_b32 s43, s43, 0xff +; SI-NEXT: s_lshl_b32 s42, s42, 8 +; SI-NEXT: s_and_b32 s41, s41, 0xff +; SI-NEXT: s_lshl_b32 s40, s40, 8 +; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: s_and_b32 s15, s20, 0xff +; SI-NEXT: s_lshl_b32 s20, s21, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s5, s44, s5 +; SI-NEXT: s_or_b32 s42, s42, s43 +; SI-NEXT: s_or_b32 s40, s40, s41 +; SI-NEXT: s_or_b32 s15, 
s20, s15 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_addk_i32 s42, 0x300 +; SI-NEXT: s_addk_i32 s40, 0x300 +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: .LBB107_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB107_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB107_2 ; -; VI-LABEL: bitcast_v16bf16_to_v32i8: +; VI-LABEL: bitcast_v32i8_to_v16f16_scalar: ; VI: 
; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v21, v6 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v22, v2 +; VI-NEXT: v_mov_b32_e32 v19, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, 
v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB107_3 +; VI-NEXT: .LBB107_2: ; %cmp.true +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v6, 
vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v5, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v21 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v20 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: v_or_b32_sdwa v4, v24, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v22 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v19 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, 
vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v2 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB107_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB107_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB107_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v16f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v22, v4 +; GFX9-NEXT: v_mov_b32_e32 v21, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_or_b32_sdwa v4, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB107_3 +; GFX9-NEXT: .LBB107_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v21 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: 
v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v19 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX9-NEXT: v_and_b32_e32 v4, 
0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: .LBB107_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB107_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB107_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; 
GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; 
GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v14, 16, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-TRUE16-NEXT: .LBB107_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 
3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v6, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB107_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB107_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB107_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, 
v4 :: v_dual_mov_b32 v15, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: 
s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v7, 16, v23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-FAKE16-NEXT: .LBB107_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; 
GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v16 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v17 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, 
s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v7, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB107_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB107_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB107_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label 
%cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + +define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { +; SI-LABEL: bitcast_v16bf16_to_v32i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; 
implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB108_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB108_4 +; SI-NEXT: .LBB108_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB108_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_alignbit_b32 v8, v5, v48, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_alignbit_b32 v16, v5, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_alignbit_b32 v0, v0, v36, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v34, 16 +; SI-NEXT: v_alignbit_b32 v12, v14, v38, 16 +; SI-NEXT: v_alignbit_b32 v20, v22, v50, 16 +; SI-NEXT: v_alignbit_b32 v24, v5, v55, 16 +; SI-NEXT: v_alignbit_b32 v28, v30, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: 
v_lshrrev_b32_e32 v15, 24, v33 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB108_2 +; SI-NEXT: .LBB108_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v28, v30, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_and_b32_e32 v1, 
0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v20, v22, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: 
s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16bf16_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB108_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: .LBB54_2: ; %Flow +; VI-NEXT: .LBB108_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_4 +; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -19894,7 +40112,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: .LBB54_4: ; %end +; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 ; VI-NEXT: v_mov_b32_e32 v8, v34 @@ -19942,7 +40160,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_2 +; GFX9-NEXT: s_cbranch_execz .LBB108_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -19968,9 +40186,9 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: .LBB54_2: ; %Flow +; GFX9-NEXT: .LBB108_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_4 +; GFX9-NEXT: s_cbranch_execz .LBB108_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, 
v2 @@ -20126,7 +40344,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; GFX9-NEXT: .LBB54_4: ; %end +; GFX9-NEXT: .LBB108_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v34 @@ -20179,7 +40397,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[26:27] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[18:19] @@ -20209,9 +40427,9 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v27.h -; GFX11-TRUE16-NEXT: .LBB54_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v3 @@ -20368,7 +40586,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GFX11-TRUE16-NEXT: .LBB54_4: ; %end +; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h @@ -20427,7 
+40645,7 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v33 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 @@ -20453,9 +40671,9 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; GFX11-FAKE16-NEXT: .LBB54_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB108_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v39 @@ -20472,160 +40690,1462 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v1, v9, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v8 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v7, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v5, v4, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, 
v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v9, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v12, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v35 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v11, v6, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_add3_u32 v12, v13, v7, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v12, v13 :: v_dual_add_f32 v12, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_add3_u32 v13, v15, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v14 :: v_dual_add_f32 v14, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v16, v17, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v14, 16, 1 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v15, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v19, v14, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v20, v9, v15 :: v_dual_add_f32 v9, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v16, v18, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v10, v15 :: v_dual_add_f32 v15, 0x40c00000, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_add3_u32 v16, v16, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v16, v18, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v16, v19, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v17, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v19, v16, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v32 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v18, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v20, v13, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v9, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v14, v12, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v10 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-FAKE16-NEXT: .LBB108_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + +define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16bf16_to_v32i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v0 +; SI-NEXT: s_cbranch_scc0 .LBB109_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_alignbit_b32 v8, v5, v39, 
16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_alignbit_b32 v16, v5, v51, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v52 +; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v33, 16 +; SI-NEXT: v_alignbit_b32 v12, v14, v37, 16 +; SI-NEXT: v_alignbit_b32 v20, v22, v49, 16 +; SI-NEXT: v_alignbit_b32 v24, v5, v55, 16 +; SI-NEXT: v_alignbit_b32 v28, v30, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v36 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v48 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v52 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: s_cbranch_execnz .LBB109_3 +; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; 
SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v28, v30, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v20, v22, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; SI-NEXT: 
v_alignbit_b32 v18, v20, v16, 16 +; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 +; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB109_2 +; +; VI-LABEL: bitcast_v16bf16_to_v32i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; 
VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s23, 24 +; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s25, s23, 8 +; VI-NEXT: s_lshr_b32 s24, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: s_lshr_b32 s27, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s40, s21, 8 +; VI-NEXT: s_lshr_b32 s29, s20, 16 +; VI-NEXT: s_lshr_b32 s41, s20, 8 +; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s43, s19, 16 +; VI-NEXT: s_lshr_b32 s45, s19, 8 +; VI-NEXT: s_lshr_b32 s44, s18, 16 +; VI-NEXT: s_lshr_b32 s46, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s17, 24 +; VI-NEXT: s_lshr_b32 s56, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 8 +; VI-NEXT: s_lshr_b32 s57, s16, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: .LBB109_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v2 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v2 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; VI-NEXT: v_add_f32_e32 v0, s4, v2 +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 
0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v2 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v9, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v2 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: 
v_alignbit_b32 v8, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v2 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v17, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v2 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v16, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v2 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 
+; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v2 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v2, s4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v24, v2, v3, 16 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; VI-NEXT: s_branch 
.LBB109_5 +; VI-NEXT: .LBB109_3: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr25 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB109_2 +; VI-NEXT: .LBB109_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v35, s59 +; VI-NEXT: v_mov_b32_e32 v2, s57 +; VI-NEXT: v_mov_b32_e32 v5, s58 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s47 +; VI-NEXT: v_mov_b32_e32 v34, s46 +; VI-NEXT: v_mov_b32_e32 v10, s44 +; VI-NEXT: v_mov_b32_e32 v13, s45 +; VI-NEXT: v_mov_b32_e32 v14, s43 +; VI-NEXT: v_mov_b32_e32 v15, s42 +; VI-NEXT: v_mov_b32_e32 v33, s41 +; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v21, s40 +; VI-NEXT: v_mov_b32_e32 v22, s28 +; VI-NEXT: v_mov_b32_e32 v23, s27 +; VI-NEXT: v_mov_b32_e32 v32, s26 +; VI-NEXT: v_mov_b32_e32 v26, s24 +; VI-NEXT: v_mov_b32_e32 v29, s25 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v31, s14 +; VI-NEXT: v_mov_b32_e32 v27, s10 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; 
VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB109_5: ; %end +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v12, v9 +; VI-NEXT: v_mov_b32_e32 v20, v17 +; VI-NEXT: v_mov_b32_e32 v28, v25 +; VI-NEXT: v_mov_b32_e32 v1, v35 +; VI-NEXT: v_mov_b32_e32 v9, v34 +; VI-NEXT: v_mov_b32_e32 v17, v33 +; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v32i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s42, s23, 24 +; GFX9-NEXT: s_lshr_b32 s59, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s23, 8 +; GFX9-NEXT: s_lshr_b32 s47, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s22, 8 +; GFX9-NEXT: s_lshr_b32 s28, s21, 24 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s41, s21, 8 +; GFX9-NEXT: s_lshr_b32 s44, s20, 16 +; GFX9-NEXT: s_lshr_b32 s43, s20, 8 +; GFX9-NEXT: s_lshr_b32 s24, s19, 24 +; GFX9-NEXT: s_lshr_b32 s57, s19, 16 +; GFX9-NEXT: s_lshr_b32 s27, s19, 8 +; GFX9-NEXT: s_lshr_b32 s40, s18, 16 +; GFX9-NEXT: s_lshr_b32 s29, s18, 8 +; GFX9-NEXT: s_lshr_b32 s14, s17, 24 +; GFX9-NEXT: s_lshr_b32 s56, s17, 16 +; GFX9-NEXT: s_lshr_b32 s15, s17, 8 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 +; GFX9-NEXT: s_lshr_b32 s25, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB109_4 +; GFX9-NEXT: .LBB109_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v6, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v5 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v5 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v14, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v5 +; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v1 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 0x7fff, v8 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v7 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v7 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v10, v22, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; 
GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v9, v9, v7 +; GFX9-NEXT: v_add_u32_e32 v9, 0x7fff, v9 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; GFX9-NEXT: v_add_f32_e32 v9, s4, v5 +; GFX9-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v9 +; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v9, v7, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; GFX9-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v7 +; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; GFX9-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v7 +; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc +; GFX9-NEXT: v_add_f32_e32 v11, s4, v5 +; GFX9-NEXT: v_bfe_u32 v12, v11, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v11 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; GFX9-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v12, v12, v5 
+; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v13, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v12, v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v11 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v9 +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] +; GFX9-NEXT: s_branch .LBB109_5 +; GFX9-NEXT: .LBB109_3: +; GFX9-NEXT: ; implicit-def: $sgpr25 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr24 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; 
implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: s_branch .LBB109_2 +; GFX9-NEXT: .LBB109_4: +; GFX9-NEXT: v_mov_b32_e32 v24, s22 +; GFX9-NEXT: v_mov_b32_e32 v32, s23 +; GFX9-NEXT: v_mov_b32_e32 v30, s59 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v33, s21 +; GFX9-NEXT: v_mov_b32_e32 v22, s58 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v34, s19 +; GFX9-NEXT: v_mov_b32_e32 v14, s57 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v35, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s56 +; GFX9-NEXT: v_mov_b32_e32 v26, s47 +; GFX9-NEXT: v_mov_b32_e32 v25, s46 +; GFX9-NEXT: v_mov_b32_e32 v31, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s45 +; GFX9-NEXT: v_mov_b32_e32 v18, s44 +; GFX9-NEXT: v_mov_b32_e32 v17, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s28 +; GFX9-NEXT: v_mov_b32_e32 v21, s41 +; GFX9-NEXT: v_mov_b32_e32 v10, s40 +; GFX9-NEXT: v_mov_b32_e32 v9, s29 +; GFX9-NEXT: v_mov_b32_e32 v15, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v2, s26 +; GFX9-NEXT: v_mov_b32_e32 v1, s25 +; GFX9-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v27, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB109_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v4, v35 +; GFX9-NEXT: v_mov_b32_e32 v12, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v33 +; GFX9-NEXT: v_mov_b32_e32 v28, v32 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v32i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s20, 0 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s12, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, 
v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v10, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v6, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v10, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v9, v11, vcc_lo +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v12, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v14, 16, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v15, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v16, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, v16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v16, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v16, 16, 1 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, v21, v16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, v24, v18 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v23, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v16, v17, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, v19, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v21, v26, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v15, v24, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v22, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v17, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v30, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v5, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v9, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v1, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[32:33], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-TRUE16-NEXT: s_branch .LBB109_5 +; GFX11-TRUE16-NEXT: .LBB109_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: 
; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB109_2 +; GFX11-TRUE16-NEXT: .LBB109_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v23.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s4 +; GFX11-TRUE16-NEXT: .LBB109_5: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v35.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v32i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s2, 8 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v1, v9, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v37 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v0, vcc_lo -; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v8 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v4, v10, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v4, v11 :: v_dual_add_nc_u32 v12, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v10, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v9, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v7, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v5, v4, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v9, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v12, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v35 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v11, v6, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_add3_u32 v12, v13, v7, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v12, v13 :: v_dual_add_f32 v12, 0x40c00000, v14 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: v_add3_u32 v13, v15, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v7, v3, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v14 :: v_dual_add_f32 v14, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v16, v17, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v15, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v19, v14, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v20, v9, v15 :: v_dual_add_f32 v9, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v33 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v16, v18, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v32 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v10, v15 :: v_dual_add_f32 v15, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v18 -; GFX11-FAKE16-NEXT: v_add3_u32 v16, v16, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v34 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v14, 16, v7 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_nc_u32 v7, v12, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v13, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v15, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v16, v18, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v16, v19, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v13 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v12, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v17, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: v_add3_u32 v21, v21, v17, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v16, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v13, v17 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v32 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v18, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v10, 0x7fff -; 
GFX11-FAKE16-NEXT: v_perm_b32 v10, v20, v13, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v9, v23, vcc_lo -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v14, v12, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v10 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[15:16] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-FAKE16-NEXT: .LBB54_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v38 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v39 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v36 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v37 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v34 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v35 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v24, v32 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v13, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v20, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v18, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s0 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v20, v16 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v19, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v18, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v12, v20, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v22, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v15, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v30, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v5, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, 
v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v7, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-FAKE16-NEXT: s_branch .LBB109_5 +; GFX11-FAKE16-NEXT: .LBB109_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: s_branch .LBB109_2 +; GFX11-FAKE16-NEXT: .LBB109_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v33, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v35, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s46 :: v_dual_mov_b32 v25, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v31, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v29, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v17, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s44 :: v_dual_mov_b32 v21, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v9, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s43 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s42 :: v_dual_mov_b32 v13, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v1, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s25 :: v_dual_mov_b32 v7, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s21 :: v_dual_mov_b32 v5, s14 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v27, s10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v19, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB109_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v35 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v20, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v32 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -20645,256 +42165,257 @@ end: } define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i8_to_v16bf16: -; GCN: ; 
%bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 
v11, 24, v17 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v25 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GCN-NEXT: v_or_b32_e32 v31, v1, v0 -; GCN-NEXT: v_or_b32_e32 v35, v36, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v37, v4 -; GCN-NEXT: v_or_b32_e32 v33, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v38, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v8 -; GCN-NEXT: v_or_b32_e32 v7, v39, v9 -; GCN-NEXT: v_or_b32_e32 v23, v11, v10 -; GCN-NEXT: v_or_b32_e32 v27, v48, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v13 -; GCN-NEXT: v_or_b32_e32 v11, v49, v14 -; GCN-NEXT: v_or_b32_e32 v32, v16, v15 -; GCN-NEXT: v_or_b32_e32 v13, v50, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v18 -; GCN-NEXT: v_or_b32_e32 v15, v51, v20 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; 
GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: .LBB55_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v25 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 
v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v55, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v7, v11, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v54, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v19 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; GCN-NEXT: v_or_b32_e32 v12, v53, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; GCN-NEXT: v_or_b32_e32 v4, v52, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v3 -; GCN-NEXT: v_or_b32_e32 v3, v51, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v7 -; GCN-NEXT: v_or_b32_e32 v7, v50, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v13 -; GCN-NEXT: v_or_b32_e32 v11, v49, v15 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v15, v48, v17 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v14, v39, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v38, v9 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v6, v37, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; 
GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v5 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v15, v13 -; GCN-NEXT: v_or_b32_e32 v10, v14, v12 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_or_b32_e32 v4, v6, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v10 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v2 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: .LBB55_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v31 -; GCN-NEXT: v_mov_b32_e32 v1, v35 -; GCN-NEXT: v_mov_b32_e32 v2, v21 -; GCN-NEXT: v_mov_b32_e32 v4, v33 -; GCN-NEXT: v_mov_b32_e32 v6, v19 -; GCN-NEXT: v_mov_b32_e32 v8, v23 -; GCN-NEXT: v_mov_b32_e32 v9, v27 -; GCN-NEXT: v_mov_b32_e32 v10, v29 -; GCN-NEXT: 
v_mov_b32_e32 v12, v32 -; GCN-NEXT: v_mov_b32_e32 v14, v34 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i8_to_v16bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v36, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v33 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; 
SI-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; SI-NEXT: v_or_b32_e32 v31, v4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v5, v39, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v7, v49, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v17 +; SI-NEXT: v_or_b32_e32 v23, v4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v27, v50, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v11, v21, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v25 +; SI-NEXT: v_or_b32_e32 v32, v4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v13, v52, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v55 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v15, v54, v2 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; 
SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: .LBB110_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; 
SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; 
SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v12 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v28 +; SI-NEXT: .LBB110_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v31 +; SI-NEXT: v_mov_b32_e32 v6, v19 +; SI-NEXT: v_mov_b32_e32 v8, v23 +; SI-NEXT: v_mov_b32_e32 v9, v27 +; SI-NEXT: v_mov_b32_e32 v10, v29 +; SI-NEXT: v_mov_b32_e32 v12, v32 +; SI-NEXT: v_mov_b32_e32 v14, v34 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i8_to_v16bf16: ; VI: ; %bb.0: @@ -20927,14 +42448,14 @@ 
define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: s_cbranch_execnz .LBB110_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB55_4 -; VI-NEXT: .LBB55_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB110_4 +; VI-NEXT: .LBB110_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB55_3: ; %cmp.false +; VI-NEXT: .LBB110_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20992,8 +42513,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB55_2 -; VI-NEXT: .LBB55_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB110_2 +; VI-NEXT: .LBB110_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v30 ; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 @@ -21085,14 +42606,14 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: s_cbranch_execnz .LBB110_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB55_4 -; GFX9-NEXT: .LBB55_2: ; %end 
+; GFX9-NEXT: s_cbranch_execnz .LBB110_4 +; GFX9-NEXT: .LBB110_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB55_3: ; %cmp.false +; GFX9-NEXT: .LBB110_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 @@ -21151,8 +42672,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB55_2 -; GFX9-NEXT: .LBB55_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB110_2 +; GFX9-NEXT: .LBB110_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 ; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 @@ -21250,14 +42771,14 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-TRUE16-NEXT: .LBB55_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-TRUE16-NEXT: .LBB110_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.h @@ -21323,8 +42844,8 @@ define <16 x 
bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-TRUE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v28.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v24.l, 3 @@ -21424,14 +42945,14 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-FAKE16-NEXT: .LBB55_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-FAKE16-NEXT: .LBB110_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 @@ -21505,8 +43026,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-FAKE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-FAKE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v28, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v30, 3 ; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v24, 3 @@ -21597,3 +43118,924 @@ end: %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <16 x bfloat> %phi } + +define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i8_to_v16bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_readfirstlane_b32 s42, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v14 +; SI-NEXT: v_readfirstlane_b32 s40, v7 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v17 +; SI-NEXT: s_cbranch_scc0 .LBB111_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_or_b32 s11, s5, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s25, 24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v3 +; SI-NEXT: s_or_b32 s12, s5, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: v_or_b32_e32 v17, v9, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v4 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_or_b32 s13, s5, s4 
+; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_or_b32_e32 v9, v0, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshl_b32 s14, s4, 16 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: v_or_b32_e32 v19, v1, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v10 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s10, 24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v11 +; SI-NEXT: s_or_b32 s15, s5, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: v_or_b32_e32 v18, v13, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v12 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshl_b32 s44, s4, 16 +; SI-NEXT: v_or_b32_e32 v13, v5, v7 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshl_b32 s45, s4, 16 +; SI-NEXT: v_or_b32_e32 v15, v6, v7 +; SI-NEXT: s_cbranch_execnz .LBB111_4 +; SI-NEXT: .LBB111_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v10 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v12 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: v_or_b32_e32 
v6, s4, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s9, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s10, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s22, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: s_add_i32 s8, s6, 0x3000000 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_add_i32 
s18, s18, 3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s9, s18, 0xff +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s11, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s13, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s5, 16 +; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s4, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; SI-NEXT: s_branch .LBB111_5 +; SI-NEXT: .LBB111_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; 
implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: s_branch .LBB111_2 +; SI-NEXT: .LBB111_4: +; SI-NEXT: v_mov_b32_e32 v10, s44 +; SI-NEXT: v_mov_b32_e32 v14, s45 +; SI-NEXT: .LBB111_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, v17 +; SI-NEXT: v_mov_b32_e32 v11, v19 +; SI-NEXT: v_mov_b32_e32 v12, v18 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v16bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: v_mov_b32_e32 v21, v6 +; VI-NEXT: v_mov_b32_e32 v20, v4 +; VI-NEXT: v_mov_b32_e32 v22, v2 +; VI-NEXT: v_mov_b32_e32 v19, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: s_cbranch_scc0 .LBB111_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, 
s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB111_3 +; VI-NEXT: .LBB111_2: ; %cmp.true +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v10 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v5, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v21 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v20 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: v_or_b32_sdwa v4, v24, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v22 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_addk_i32 s8, 
0x300 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v19 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v2 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB111_3: 
; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB111_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_branch .LBB111_2 +; +; GFX9-LABEL: bitcast_v32i8_to_v16bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v22, v4 +; GFX9-NEXT: v_mov_b32_e32 v21, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_or_b32_sdwa v4, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB111_3 +; GFX9-NEXT: .LBB111_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: 
v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v21 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v19 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: .LBB111_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB111_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_branch .LBB111_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16bf16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v10.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v14, 16, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-TRUE16-NEXT: .LBB111_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; 
GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v6, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0 +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB111_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB111_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-TRUE16-NEXT: s_branch .LBB111_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16bf16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v15, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; 
GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v7, 16, v23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s8 +; 
GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-FAKE16-NEXT: .LBB111_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v16 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v17 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v20, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v13, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v14, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v7, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v5, 16, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB111_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB111_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-FAKE16-NEXT: s_branch .LBB111_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll index 86ac4354e3f17..6cf53d187fcab 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll @@ -1,32 +1,32 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc 
-mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <9 x float> @bitcast_v9i32_to_v9f32(<9 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i32_to_v9f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i32_to_v9f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i32_to_v9f32: ; VI: ; %bb.0: @@ -109,28 +109,167 @@ end: ret <9 x 
float> %phi } +define inreg <9 x float> @bitcast_v9i32_to_v9f32_scalar(<9 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i32_to_v9f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s25, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v9i32_to_v9f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 
v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v9i32_to_v9f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v9i32_to_v9f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i32> %a, splat (i32 3) + %a2 = bitcast <9 x i32> %a1 to <9 x float> + br label %end + +cmp.false: + %a3 = bitcast <9 x i32> %a to <9 x float> + br label %end + +end: + %phi = phi <9 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x float> %phi +} + define <9 x i32> @bitcast_v9f32_to_v9i32(<9 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f32_to_v9i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f32_to_v9i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; 
SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v9i32: ; VI: ; %bb.0: @@ -208,70 +347,217 @@ end: ret <9 x i32> %phi } +define inreg <9 x i32> @bitcast_v9f32_to_v9i32_scalar(<9 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f32_to_v9i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s25, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v9f32_to_v9i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 
+; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v9f32_to_v9i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v9f32_to_v9i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <9 x float> %a1 to <9 x i32> + br label %end + +cmp.false: + %a3 = bitcast <9 x float> %a to <9 x i32> + br label %end + +end: + %phi = phi <9 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i32> %phi +} + define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i32_to_v18i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; 
GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v17, s4, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v17, s4, v16, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; 
GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i32_to_v18i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, 
vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i32_to_v18i16: ; VI: ; %bb.0: @@ -323,7 +609,7 @@ define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 @@ -334,7 +620,7 @@ define <18 x i16> @bitcast_v9i32_to_v18i16(<9 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -354,114 +640,295 @@ end: ret <18 x i16> %phi } +define inreg <18 x i16> @bitcast_v9i32_to_v18i16_scalar(<9 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i32_to_v18i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s25, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: 
v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: s_lshr_b32 s7, s21, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s4, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: s_lshr_b32 s7, s21, 16 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 
+; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v9i32_to_v18i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v9i32_to_v18i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: 
v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v9i32_to_v18i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i32> %a, splat (i32 3) + %a2 = bitcast <9 x i32> %a1 to <18 x i16> + br label %end + +cmp.false: + %a3 = bitcast <9 x i32> %a to <18 x i16> + br label %end + +end: + %phi = phi <18 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i16> %phi +} + define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i16_to_v9i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v8 -; GCN-NEXT: v_mov_b32_e32 v22, v6 -; GCN-NEXT: v_mov_b32_e32 v21, v4 -; 
GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v0, v0, v25 -; GCN-NEXT: v_or_b32_e32 v1, v1, v26 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v18 -; GCN-NEXT: v_or_b32_e32 v3, v3, v24 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; 
GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v0, v25, v0 -; GCN-NEXT: v_or_b32_e32 v1, v26, v1 -; GCN-NEXT: v_or_b32_e32 v2, v18, v2 -; GCN-NEXT: v_or_b32_e32 v3, v24, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; GCN-NEXT: 
v_add_i32_e32 v8, vcc, 0x30000, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i16_to_v9i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: ; 
implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v2, v26, v2 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: v_or_b32_e32 v4, v24, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; 
SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i16_to_v9i32: ; VI: ; %bb.0: @@ -470,7 +937,7 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v10, 3 ; VI-NEXT: v_add_u16_e32 v9, 3, v8 @@ -500,7 +967,7 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v9, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v9, v0 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -533,7 +1000,7 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] @@ -544,7 +1011,7 @@ define <9 x i32> @bitcast_v18i16_to_v9i32(<18 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] 
%cmp = icmp eq i32 %b, 0 @@ -564,125 +1031,376 @@ end: ret <9 x i32> %phi } +define inreg <9 x i32> @bitcast_v18i16_to_v9i32_scalar(<18 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i16_to_v9i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v7, v0, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v8, v0, v11 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; 
SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v18i16_to_v9i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; 
%cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; 
VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v18i16_to_v9i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18i16_to_v9i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: 
v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i16> %a, splat (i16 3) + %a2 = bitcast <18 x i16> %a1 to <9 x i32> + br label %end + +cmp.false: + %a3 = bitcast <18 x i16> %a to <9 x i32> + br label %end + +end: + %phi = phi <9 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i32> %phi +} + define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i32_to_v18f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v8 -; GCN-NEXT: v_mov_b32_e32 v25, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v23, v5 -; GCN-NEXT: v_mov_b32_e32 v22, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NEXT: v_mov_b32_e32 v18, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; 
implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; 
implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i32_to_v18f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v25, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v23, v5 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v1 +; SI-NEXT: v_mov_b32_e32 v18, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i32_to_v18f16: ; VI: ; %bb.0: @@ -734,7 +1452,7 @@ define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 @@ -745,7 +1463,7 @@ define <18 x half> @bitcast_v9i32_to_v18f16(<9 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -765,153 +1483,356 @@ end: ret <18 x half> %phi } +define inreg <18 x half> @bitcast_v9i32_to_v18f16_scalar(<9 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i32_to_v18f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s25, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: 
s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; 
SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v9i32_to_v18f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v9i32_to_v18f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: 
s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v9i32_to_v18f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 
+ %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i32> %a, splat (i32 3) + %a2 = bitcast <9 x i32> %a1 to <18 x half> + br label %end + +cmp.false: + %a3 = bitcast <9 x i32> %a to <18 x half> + br label %end + +end: + %phi = phi <18 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x half> %phi +} + define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f16_to_v9i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v16 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_or_b32_e32 v0, v29, v0 -; GCN-NEXT: v_or_b32_e32 v1, v27, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 
16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GCN-NEXT: v_or_b32_e32 v2, v23, v2 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_or_b32_e32 v4, v19, v4 -; GCN-NEXT: v_or_b32_e32 v5, v18, v5 -; GCN-NEXT: v_or_b32_e32 v6, v12, v6 -; GCN-NEXT: v_or_b32_e32 v7, v10, v7 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, 
v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: 
v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v14, v8 -; GCN-NEXT: v_or_b32_e32 v6, v12, v15 -; GCN-NEXT: v_or_b32_e32 v7, v10, v13 -; GCN-NEXT: v_or_b32_e32 v8, v9, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f16_to_v9i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, 
v10 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v2, v26, v2 +; SI-NEXT: v_or_b32_e32 v3, v24, v3 +; SI-NEXT: v_or_b32_e32 v4, v22, v4 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 
0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] 
+; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v9i32: ; VI: ; %bb.0: @@ -920,7 +1841,7 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 0x200 ; VI-NEXT: v_add_f16_sdwa v10, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -950,7 +1871,7 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v10 ; VI-NEXT: v_or_b32_e32 v0, v0, v9 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -961,7 +1882,7 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] @@ -973,7 +1894,7 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -985,7 +1906,7 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; 
GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] @@ -996,7 +1917,7 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1016,70 +1937,346 @@ end: ret <9 x i32> %phi } +define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f16_to_v9i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_or_b32_e32 v3, v19, v3 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_or_b32_e32 v5, v15, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: 
v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v18f16_to_v9i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v9, v1 +; VI-NEXT: v_mov_b32_e32 v9, s4 +; VI-NEXT: v_add_f16_sdwa v9, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v18f16_to_v9i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18f16_to_v9i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x half> %a, splat (half 0xH0200) + %a2 = bitcast <18 x half> %a1 to <9 x i32> + br label %end + +cmp.false: + %a3 = bitcast <18 x half> %a to <9 x i32> + br label %end + +end: + %phi = phi <9 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i32> %phi +} + define <18 x i16> @bitcast_v9f32_to_v18i16(<9 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f32_to_v18i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; 
GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_4 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB6_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v17, s4, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: .LBB6_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 
16 -; GCN-NEXT: v_alignbit_b32 v17, s4, v16, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f32_to_v18i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_4 +; SI-NEXT: .LBB12_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB12_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: .LBB12_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: 
v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v18i16: ; VI: ; %bb.0: @@ -1157,114 +2354,314 @@ end: ret <18 x i16> %phi } +define inreg <18 x i16> @bitcast_v9f32_to_v18i16_scalar(<9 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f32_to_v18i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s25, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_4 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, 
s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_branch .LBB13_2 +; SI-NEXT: .LBB13_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v9f32_to_v18i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_4 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: 
v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_3: +; VI-NEXT: s_branch .LBB13_2 +; VI-NEXT: .LBB13_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v9f32_to_v18i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_4 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_branch .LBB13_2 +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: 
v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v9f32_to_v18i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: +; GFX11-NEXT: s_branch .LBB13_2 +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <9 x float> %a1 to <18 x i16> + br label %end + +cmp.false: + %a3 = bitcast <9 x float> %a to <18 x i16> + br label %end + +end: + %phi 
= phi <18 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i16> %phi +} + define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i16_to_v9f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v23, v8 -; GCN-NEXT: v_mov_b32_e32 v22, v6 -; GCN-NEXT: v_mov_b32_e32 v21, v4 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v0, v0, v25 -; GCN-NEXT: v_or_b32_e32 v1, v1, v26 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v2, v2, v18 -; GCN-NEXT: v_or_b32_e32 v3, v3, v24 -; GCN-NEXT: v_or_b32_e32 v4, v4, v9 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: 
v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v19 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v0, v25, v0 -; GCN-NEXT: v_or_b32_e32 v1, v26, v1 -; GCN-NEXT: v_or_b32_e32 v2, v18, v2 -; GCN-NEXT: v_or_b32_e32 v3, v24, v3 -; GCN-NEXT: v_or_b32_e32 v4, v9, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; 
GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i16_to_v9f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v8 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 
v8, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v13 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v2, 
v26, v2 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: v_or_b32_e32 v4, v24, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i16_to_v9f32: ; VI: ; %bb.0: @@ -1273,7 +2670,7 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v10, 3 ; VI-NEXT: v_add_u16_e32 v9, 3, v8 @@ -1303,7 +2700,7 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v9, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v9, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1336,7 +2733,7 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] @@ -1347,7 
+2744,7 @@ define <9 x float> @bitcast_v18i16_to_v9f32(<18 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1367,125 +2764,376 @@ end: ret <9 x float> %phi } +define inreg <9 x float> @bitcast_v18i16_to_v9f32_scalar(<18 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i16_to_v9f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v7, v0, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v8, v0, v11 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 
+; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: 
.LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v18i16_to_v9f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, 
s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v18i16_to_v9f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18i16_to_v9f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, 
s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i16> %a, splat (i16 3) + %a2 = bitcast <18 x i16> %a1 to <9 x float> + br label %end + +cmp.false: + %a3 = bitcast <18 x i16> %a to <9 x float> + br label %end + +end: + %phi = phi <9 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x float> %phi +} + define <18 x half> @bitcast_v9f32_to_v18f16(<9 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f32_to_v18f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v8 -; GCN-NEXT: v_mov_b32_e32 v25, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v23, v5 -; GCN-NEXT: v_mov_b32_e32 v22, v4 -; 
GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NEXT: v_mov_b32_e32 v18, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, 
v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f32_to_v18f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v25, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v23, v5 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v1 +; SI-NEXT: v_mov_b32_e32 v18, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, 
v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v18f16: ; VI: ; %bb.0: @@ -1563,153 +3211,379 @@ end: ret <18 x half> %phi } +define inreg <18 x half> @bitcast_v9f32_to_v18f16_scalar(<9 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f32_to_v18f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s25, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: 
v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; 
SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v9f32_to_v18f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_4 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_3: +; VI-NEXT: s_branch .LBB17_2 +; VI-NEXT: .LBB17_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v9f32_to_v18f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: 
s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_3: +; GFX9-NEXT: s_branch .LBB17_2 +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v9f32_to_v18f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_4 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: 
v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: +; GFX11-NEXT: s_branch .LBB17_2 +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <9 x float> %a1 to <18 x half> + br label %end + +cmp.false: + %a3 = bitcast <9 x float> %a to <18 x half> + br label %end + +end: + %phi = phi <18 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x half> %phi +} + define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f16_to_v9f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v14 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v11, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v16 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_or_b32_e32 v0, v29, v0 -; GCN-NEXT: v_or_b32_e32 v1, v27, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GCN-NEXT: v_or_b32_e32 v2, v23, v2 -; GCN-NEXT: v_or_b32_e32 v3, v21, v3 -; GCN-NEXT: v_or_b32_e32 v4, v19, v4 -; GCN-NEXT: v_or_b32_e32 v5, v18, v5 -; GCN-NEXT: v_or_b32_e32 v6, v12, v6 -; GCN-NEXT: v_or_b32_e32 v7, v10, v7 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: 
s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v14, v8 -; GCN-NEXT: v_or_b32_e32 v6, v12, v15 -; GCN-NEXT: v_or_b32_e32 v7, v10, v13 -; GCN-NEXT: v_or_b32_e32 v8, v9, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f16_to_v9f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v2, v26, v2 +; SI-NEXT: v_or_b32_e32 v3, v24, v3 +; SI-NEXT: v_or_b32_e32 v4, v22, v4 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: 
v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v9f32: ; VI: ; %bb.0: @@ -1718,7 +3592,7 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 0x200 ; VI-NEXT: v_add_f16_sdwa v10, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1748,7 +3622,7 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v10 ; VI-NEXT: v_or_b32_e32 v0, v0, v9 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1759,7 +3633,7 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: 
s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] @@ -1771,7 +3645,7 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: .LBB18_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1783,7 +3657,7 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] @@ -1794,7 +3668,7 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1814,134 +3688,410 @@ end: ret <9 x float> %phi } +define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f16_to_v9f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s22 +; 
SI-NEXT: v_cvt_f16_f32_e32 v18, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v2, v21, v2 +; SI-NEXT: v_or_b32_e32 v3, v19, v3 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_or_b32_e32 v5, v15, v5 +; SI-NEXT: v_or_b32_e32 v6, v13, v6 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: 
v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v18f16_to_v9f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 
v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v9, v1 +; VI-NEXT: v_mov_b32_e32 v9, s4 +; VI-NEXT: v_add_f16_sdwa v9, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v18f16_to_v9f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18f16_to_v9f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; 
GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x half> %a, splat (half 0xH0200) + %a2 = bitcast <18 x half> %a1 to <9 x float> + br label %end + +cmp.false: + %a3 = bitcast <18 x half> %a to <9 x float> + br label %end + +end: + %phi = phi <9 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x float> %phi +} + define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i16_to_v18f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v35, v17 -; GCN-NEXT: v_mov_b32_e32 v34, v16 -; GCN-NEXT: v_mov_b32_e32 v33, v15 -; GCN-NEXT: v_mov_b32_e32 v32, v14 -; GCN-NEXT: v_mov_b32_e32 v31, v13 -; GCN-NEXT: v_mov_b32_e32 v30, v12 -; GCN-NEXT: v_mov_b32_e32 v29, v11 -; GCN-NEXT: v_mov_b32_e32 v28, v10 -; GCN-NEXT: v_mov_b32_e32 v27, v9 -; GCN-NEXT: v_mov_b32_e32 v26, v8 -; GCN-NEXT: v_mov_b32_e32 v25, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v23, v5 -; GCN-NEXT: v_mov_b32_e32 v22, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NEXT: v_mov_b32_e32 v36, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; 
implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: 
v_add_i32_e32 v17, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i16_to_v18f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v17 +; SI-NEXT: v_mov_b32_e32 v34, v16 +; SI-NEXT: v_mov_b32_e32 v33, v15 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v31, v13 +; SI-NEXT: v_mov_b32_e32 v30, v12 +; SI-NEXT: v_mov_b32_e32 v29, v11 +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: v_mov_b32_e32 v27, v9 +; SI-NEXT: v_mov_b32_e32 v26, v8 
+; SI-NEXT: v_mov_b32_e32 v25, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v23, v5 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v21, v3 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v19, v1 +; SI-NEXT: v_mov_b32_e32 v36, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 +; SI-NEXT: 
v_cvt_f32_f16_e32 v17, v35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 
v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i16_to_v18f16: ; VI: ; %bb.0: @@ -1950,7 +4100,7 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v9, 3 ; VI-NEXT: v_add_u16_sdwa v10, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1980,7 +4130,7 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v12 ; VI-NEXT: v_or_b32_e32 v1, v1, v11 ; VI-NEXT: v_or_b32_e32 v0, v0, v10 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2013,7 +4163,7 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] @@ -2024,7 +4174,7 @@ define <18 x half> @bitcast_v18i16_to_v18f16(<18 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 
s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2044,113 +4194,363 @@ end: ret <18 x half> %phi } +define inreg <18 x half> @bitcast_v18i16_to_v18f16_scalar(<18 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i16_to_v18f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v20, v1 +; SI-NEXT: v_mov_b32_e32 v19, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v18 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v19 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; 
SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v18i16_to_v18f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: 
s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v18i16_to_v18f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18i16_to_v18f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, 
s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i16> %a, splat (i16 3) + %a2 = bitcast <18 x i16> %a1 to <18 x half> + br label %end + +cmp.false: + %a3 = bitcast <18 x i16> %a to <18 x half> + br label %end + +end: + %phi = phi <18 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x half> %phi +} + define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f16_to_v18i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 
-; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v15 -; GCN-NEXT: v_or_b32_e32 v16, v16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; GCN-NEXT: v_or_b32_e32 v14, v14, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v7 -; GCN-NEXT: v_or_b32_e32 v10, v10, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_or_b32_e32 v6, v6, v19 -; GCN-NEXT: v_or_b32_e32 v2, v2, v18 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f16_to_v18i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, 
v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: .LBB22_2: ; %end 
+; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f16_to_v18i16: ; VI: ; %bb.0: @@ -2159,7 +4559,7 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v10, 0x200 ; VI-NEXT: v_add_f16_e32 v9, 0x200, v0 @@ -2189,7 +4589,7 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v12, v2 ; VI-NEXT: v_or_b32_e32 v1, v11, v1 ; VI-NEXT: v_or_b32_e32 v0, v9, v0 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2200,7 +4600,7 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] @@ -2212,7 +4612,7 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2224,7 +4624,7 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: 
; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] @@ -2235,7 +4635,7 @@ define <18 x i16> @bitcast_v18f16_to_v18i16(<18 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2254,3 +4654,289 @@ end: %phi = phi <18 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <18 x i16> %phi } + +define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f16_to_v18i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: 
v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: 
v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v18f16_to_v18i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s25, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: v_add_f16_e32 v1, s23, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s22, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s21, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v3, s24, v0 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s20, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: v_or_b32_e32 v8, v3, v4 +; VI-NEXT: v_or_b32_e32 v4, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s19, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_add_f16_e32 v1, s18, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_add_f16_sdwa v10, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v9, s16, v0 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s17, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v9, v10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; 
VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v18f16_to_v18i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s25, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18f16_to_v18i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x half> %a, splat (half 0xH0200) + %a2 = bitcast <18 x half> %a1 to <18 x i16> + br label %end + +cmp.false: + %a3 = bitcast <18 x half> %a to <18 
x i16> + br label %end + +end: + %phi = phi <18 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 6e6e62c4b05ad..35ab38c67b1ec 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -1,34 +1,34 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <10 x float> @bitcast_v10i32_to_v10f32(<10 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i32_to_v10f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; 
GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i32_to_v10f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i32_to_v10f32: ; VI: ; %bb.0: @@ -116,29 +116,176 @@ end: ret <10 x float> %phi } +define inreg <10 x float> @bitcast_v10i32_to_v10f32_scalar(<10 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i32_to_v10f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 
3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v10i32_to_v10f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v10i32_to_v10f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: 
s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v10i32_to_v10f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + 
+cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f32_to_v10i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f32_to_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 
exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f32_to_v10i32: ; VI: ; %bb.0: @@ -147,7 +294,7 @@ define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 @@ -159,7 +306,7 @@ define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -170,7 +317,7 @@ define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 ; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 @@ -182,7 +329,7 @@ define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -220,145 +367,235 @@ end: ret <10 x i32> %phi } -define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i32_to_v20f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v29, v9 -; GCN-NEXT: v_mov_b32_e32 v28, v8 -; GCN-NEXT: v_mov_b32_e32 v27, v7 -; GCN-NEXT: v_mov_b32_e32 v26, v6 -; GCN-NEXT: v_mov_b32_e32 
v25, v5 -; GCN-NEXT: v_mov_b32_e32 v24, v4 -; GCN-NEXT: v_mov_b32_e32 v23, v3 -; GCN-NEXT: v_mov_b32_e32 v22, v2 -; GCN-NEXT: v_mov_b32_e32 v21, v1 -; GCN-NEXT: v_mov_b32_e32 v20, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v25 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v8, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: 
v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <10 x i32> @bitcast_v10f32_to_v10i32_scalar(<10 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f32_to_v10i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: 
v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v10i32_to_v20f16: +; VI-LABEL: bitcast_v10f32_to_v10i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v10i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; 
GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v10i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, 
splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <20 x i16> @bitcast_v10i32_to_v20i16(<10 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v10i32_to_v20i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB4_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; 
SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB4_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v10, v20 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 @@ -370,18 +607,18 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10i32_to_v20f16: +; GFX9-LABEL: bitcast_v10i32_to_v20i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; 
GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 ; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 @@ -393,11 +630,11 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10i32_to_v20f16: +; GFX11-LABEL: bitcast_v10i32_to_v20i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -405,7 +642,7 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 ; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 @@ -417,7 +654,7 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -425,249 +662,396 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { cmp.true: %a1 = add <10 x i32> %a, splat (i32 3) - %a2 = bitcast <10 x i32> %a1 to <20 x half> + %a2 = bitcast <10 x i32> %a1 to <20 x i16> br label %end cmp.false: - %a3 = bitcast <10 x i32> %a to <20 x half> + %a3 = bitcast <10 x i32> %a to <20 x i16> br label %end end: - %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <20 x half> %phi + %phi = phi <20 x i16> [ %a2, 
%cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi } -define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f16_to_v10i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v18 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GCN-NEXT: v_or_b32_e32 
v0, v29, v0 -; GCN-NEXT: v_or_b32_e32 v1, v27, v1 -; GCN-NEXT: v_or_b32_e32 v2, v25, v2 -; GCN-NEXT: v_or_b32_e32 v3, v23, v3 -; GCN-NEXT: v_or_b32_e32 v4, v22, v4 -; GCN-NEXT: v_or_b32_e32 v5, v21, v5 -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: v_or_b32_e32 v7, v12, v7 -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, 
v10 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: 
v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v17, v16 -; GCN-NEXT: v_or_b32_e32 v6, v19, v18 -; GCN-NEXT: v_or_b32_e32 v7, v12, v15 -; GCN-NEXT: v_or_b32_e32 v8, v11, v14 -; GCN-NEXT: v_or_b32_e32 v9, v10, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <20 x i16> @bitcast_v10i32_to_v20i16_scalar(<10 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i32_to_v20i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; 
SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB5_2 ; -; VI-LABEL: bitcast_v20f16_to_v10i32: +; VI-LABEL: bitcast_v10i32_to_v20i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: 
s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v10i32_to_v20i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v10i32_to_v20i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: 
s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v20i16_to_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, v8 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 
16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v2, v2, v29 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 +; SI-NEXT: v_or_b32_e32 v4, v4, v27 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 
+; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v2, v29, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v27, v4 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 +; SI-NEXT: v_or_b32_e32 v6, v20, v6 +; SI-NEXT: v_or_b32_e32 v7, v15, v7 +; SI-NEXT: v_or_b32_e32 v8, v13, v8 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
bitcast_v20i16_to_v10i32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v10, 0x200 -; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v11 -; VI-NEXT: v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v11 -; VI-NEXT: v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v11 -; VI-NEXT: v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v11 -; VI-NEXT: v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v11 -; VI-NEXT: v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v11 -; VI-NEXT: v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v11 -; VI-NEXT: v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v11 -; VI-NEXT: v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; 
VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 -; VI-NEXT: v_or_b32_e32 v1, v1, v11 -; VI-NEXT: v_or_b32_e32 v0, v0, v10 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: v_mov_b32_e32 v11, 3 +; VI-NEXT: v_add_u16_e32 v10, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_u16_e32 v10, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v10, v8 +; VI-NEXT: v_add_u16_e32 v10, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v10, v7 +; VI-NEXT: v_add_u16_e32 v10, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v10, v6 +; VI-NEXT: v_add_u16_e32 v10, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v10, v5 +; VI-NEXT: v_add_u16_e32 v10, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v10, v4 +; VI-NEXT: v_add_u16_e32 v10, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v10, v3 +; VI-NEXT: v_add_u16_e32 v10, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v10, v2 +; VI-NEXT: v_add_u16_e32 v10, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_add_u16_e32 v10, 3, v0 +; VI-NEXT: 
v_add_u16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v10, v0 +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v20f16_to_v10i32: +; GFX9-LABEL: bitcast_v20i16_to_v10i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_movk_i32 s6, 0x200 -; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v20f16_to_v10i32: +; GFX11-LABEL: bitcast_v20i16_to_v10i32: 
; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -675,31 +1059,31 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <20 x half> %a, splat (half 0xH0200) - %a2 = bitcast <20 x half> %a1 to <10 x i32> + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <10 x i32> br label %end cmp.false: - 
%a3 = bitcast <20 x half> %a to <10 x i32> + %a3 = bitcast <20 x i16> %a to <10 x i32> br label %end end: @@ -707,561 +1091,9311 @@ end: ret <10 x i32> %phi } -define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i32_to_v40i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v17, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 
16 -; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB4_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v17, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v27, v4, 
v3, 16 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB4_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v38 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 -; GCN-NEXT: v_or_b32_e32 v35, v49, v35 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; 
GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 24, v28 -; GCN-NEXT: v_or_b32_e32 v48, v50, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GCN-NEXT: v_or_b32_e32 v3, v3, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; GCN-NEXT: v_or_b32_e32 v4, v4, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_or_b32_e32 v5, v5, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; GCN-NEXT: v_or_b32_e32 v6, v6, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v7, v7, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v10, v10, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v35 -; GCN-NEXT: v_or_b32_e32 v19, v32, v33 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v25, v38, v39 -; GCN-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v27, v34, v36 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v21, v28, v30 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v15, v22, v17 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v12, v16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v18, v19 -; GCN-NEXT: v_or_b32_e32 v16, v24, v25 -; GCN-NEXT: v_or_b32_e32 v3, v3, v26 -; GCN-NEXT: v_or_b32_e32 v4, v4, v27 -; GCN-NEXT: v_or_b32_e32 v5, v5, v20 -; GCN-NEXT: v_or_b32_e32 v6, v6, v21 -; GCN-NEXT: v_or_b32_e32 v7, v7, v14 -; GCN-NEXT: v_or_b32_e32 v8, v8, v15 -; GCN-NEXT: v_or_b32_e32 v9, v9, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v12 -; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <10 x i32> @bitcast_v20i16_to_v10i32_scalar(<20 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i16_to_v10i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v10, v4 +; SI-NEXT: v_mov_b32_e32 v11, v2 +; 
SI-NEXT: v_mov_b32_e32 v12, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v9, v0, v13 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: 
s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_branch .LBB7_2 ; -; VI-LABEL: bitcast_v10i32_to_v40i8: +; VI-LABEL: bitcast_v20i16_to_v10i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: 
$vgpr48 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr12 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; 
VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v20i16_to_v10i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20i16_to_v10i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; 
GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v10i32_to_v20f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v29, v9 +; SI-NEXT: v_mov_b32_e32 v28, v8 +; SI-NEXT: v_mov_b32_e32 v27, v7 +; SI-NEXT: v_mov_b32_e32 v26, v6 +; SI-NEXT: v_mov_b32_e32 v25, v5 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_mov_b32_e32 v23, v3 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v21, v1 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; 
implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: 
s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v20f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz 
.LBB4_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB4_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 ; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] 
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB4_4: ; %end +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; 
VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 -; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, 
v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10i32_to_v40i8: +; GFX9-LABEL: bitcast_v10i32_to_v20f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: ; implicit-def: $vgpr16 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr14 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr13 -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr12 -; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr22 -; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, 
v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB4_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 ; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 ; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 ; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] ; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; 
GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB4_4: ; %end +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 -; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i32_to_v20f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; 
GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define inreg <20 x half> @bitcast_v10i32_to_v20f16_scalar(<10 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i32_to_v20f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, 
s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; 
implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v10i32_to_v20f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v10i32_to_v20f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s25, 
s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v10i32_to_v20f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; 
GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v20f16_to_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v2, v29, v2 +; SI-NEXT: v_or_b32_e32 v3, v27, v3 +; SI-NEXT: v_or_b32_e32 v4, v25, v4 +; SI-NEXT: v_or_b32_e32 v5, v23, v5 +; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 +; SI-NEXT: v_or_b32_e32 v8, v12, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: 
v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 
v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20f16_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v10, 0x200 +; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v11 +; VI-NEXT: v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v11 +; VI-NEXT: v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v11 +; VI-NEXT: v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v11 +; VI-NEXT: v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v11 +; VI-NEXT: v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v11 +; VI-NEXT: v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v11 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v10i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define inreg <10 x i32> @bitcast_v20f16_to_v10i32_scalar(<20 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f16_to_v10i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s19 +; SI-NEXT: 
v_cvt_f16_f32_e32 v26, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v20, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v16, v6 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 +; SI-NEXT: v_or_b32_e32 v8, v12, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 
v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v20f16_to_v10i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, 
s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v10i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v10i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 
s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v10i32_to_v40i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 
+; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: 
v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: 
v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v32, v32, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, 
v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v40i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: 
$vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB12_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: 
v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i32_to_v40i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; 
implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 
24, v[7:8] +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; 
GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v10i32_to_v40i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] 
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-TRUE16-NEXT: .LBB12_4: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v36, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v31, 8, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b16 v13, 8, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v37, 8, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v48, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v35, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v30, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v38, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v14, v15 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v33, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v25, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v20, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: 
scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i32_to_v40i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 +; SI-NEXT: s_lshr_b32 s6, s25, 24 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 8 +; SI-NEXT: s_lshr_b32 s9, s23, 24 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s15, s19, 24 
+; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s28, s17, 24 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 +; SI-NEXT: s_lshr_b32 s6, s25, 24 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 8 +; SI-NEXT: s_lshr_b32 s9, s23, 24 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s15, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s28, s17, 24 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, 
s16, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s28, 24 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s15, s15, 24 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s15, s5 +; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s13, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v8, 
16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s11, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s10, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v10i32_to_v40i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; 
VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s29, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s43, s23, 8 +; VI-NEXT: s_lshr_b32 s44, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s56, s21, 8 +; VI-NEXT: s_lshr_b32 s57, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 8 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s29, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s43, s23, 8 +; 
VI-NEXT: s_lshr_b32 s44, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s56, s21, 8 +; VI-NEXT: s_lshr_b32 s57, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 8 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s76, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s75, 0xff +; VI-NEXT: s_lshl_b32 s9, s12, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_and_b32 s5, s17, 0xff +; VI-NEXT: s_lshl_b32 s7, s74, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s73, 0xff +; VI-NEXT: s_lshl_b32 s9, s72, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s62, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s19, 0xff +; VI-NEXT: s_lshl_b32 s7, s61, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s60, 0xff +; VI-NEXT: s_lshl_b32 s9, s59, 8 +; VI-NEXT: s_or_b32 s7, 
s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s7, s58, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s57, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s21, 0xff +; VI-NEXT: s_lshl_b32 s7, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s47, 0xff +; VI-NEXT: s_lshl_b32 s8, s46, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s45, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s44, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s23, 0xff +; VI-NEXT: s_lshl_b32 s6, s43, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s42, 0xff +; VI-NEXT: s_lshl_b32 s7, s41, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: 
s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s40, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s29, 0xff +; VI-NEXT: s_lshl_b32 s4, s4, 8 +; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_lshl_b32 s5, s28, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s27, 0xff +; VI-NEXT: s_lshl_b32 s6, s26, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; 
implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v10i32_to_v40i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s28, s25, 8 +; GFX9-NEXT: s_lshr_b32 s29, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s23, 8 +; GFX9-NEXT: s_lshr_b32 s44, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 8 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s19, 8 +; GFX9-NEXT: s_lshr_b32 s62, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 8 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s28, s25, 8 +; GFX9-NEXT: s_lshr_b32 s29, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s23, 8 +; GFX9-NEXT: s_lshr_b32 s44, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 8 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s19, 8 +; GFX9-NEXT: s_lshr_b32 s62, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 8 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_and_b32 s5, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s76, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s75, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s12, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s74, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s73, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s72, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: 
s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s63, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s61, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s60, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s59, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s58, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s56, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s47, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s46, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s45, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s44, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: 
s_or_b32 s5, s5, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s43, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s42, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s41, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s40, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_or_b32 s4, s6, s4 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s28, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s26, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; 
implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v10i32_to_v40i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, 
s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s62 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s61 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s60 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s59 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s58 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s57 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s47 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s9, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s45 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s40 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s28 +; GFX11-TRUE16-NEXT: 
s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s25 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; 
GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s63, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; 
GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s62, 
8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s61, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s60, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s59, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s58, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s57, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s56, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s10, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s47, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s46, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s45, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s44, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s42, 8 +; 
GFX11-FAKE16-NEXT: s_and_b32 s5, s41, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s40, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s4, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s22, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s15, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s14, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; 
GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: 
s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v40i8_to_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v9 +; SI-NEXT: 
v_lshlrev_b32_e32 v54, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v44 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; 
SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v52, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v8, v8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: 
$vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 
s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v55, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v52, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; 
SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v21, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: .LBB14_4: ; %end 
+; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v40i8_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 
v40, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v8 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; 
VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v9, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v35 +; VI-NEXT: v_add_u16_e32 v3, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v12 +; VI-NEXT: v_add_u16_e32 v4, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v16 +; VI-NEXT: v_add_u16_e32 v5, 3, v18 +; VI-NEXT: v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v20 +; VI-NEXT: v_add_u16_e32 v6, 3, v22 +; VI-NEXT: v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v24 +; VI-NEXT: v_add_u16_e32 v7, 3, v26 +; VI-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v28 +; VI-NEXT: v_add_u16_e32 v8, 3, v30 +; VI-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v8, 3, v39 +; VI-NEXT: v_add_u16_e32 v10, 3, v38 +; VI-NEXT: v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v10, v15, v10 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_add_u16_sdwa v10, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v37 +; VI-NEXT: v_add_u16_e32 v12, 3, v36 +; VI-NEXT: v_or_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v40i8_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v8 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false 
+; GFX9-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; 
GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v35 +; GFX9-NEXT: 
v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v20 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v28 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v39 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v9, v15, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v37 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x9 +; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v22.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_4 +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 
v1.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, 
v14.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v15.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v19.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.l, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v15.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v16.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v18.h, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, 
v10.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v11.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v11.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v8 :: v_dual_mov_b32 v34, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-FAKE16-NEXT: s_clause 0x9 +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v36, off, s32 offset:28 
+; GFX11-FAKE16-NEXT: scratch_load_u16 v37, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v38, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v29 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_4 +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB14_3: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v8, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr14 +; GFX11-FAKE16-NEXT: 
; implicit-def: $vgpr16 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: .LBB14_4: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v10, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v12, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v14, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 
v8, v16, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v18, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v53, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v54, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v55, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v64, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v65, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v48, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v49, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v50, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v51, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v52, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v20, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v22, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v24, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v26, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v30, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v39, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v38, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v36, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v25, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v27, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v29, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v13, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v15, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v17, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v19, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; 
GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i8_to_v10i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v4 +; SI-NEXT: v_mov_b32_e32 v28, v2 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, 
v3 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v25 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: s_and_b32 
s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, 
s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; 
SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v40i8_to_v10i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: v_mov_b32_e32 v31, v8 +; VI-NEXT: v_mov_b32_e32 v30, v6 +; VI-NEXT: v_mov_b32_e32 v29, v4 +; VI-NEXT: v_mov_b32_e32 v28, v2 +; VI-NEXT: v_mov_b32_e32 v27, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v32 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; 
VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 
0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v27 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; 
VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v40i8_to_v10i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: v_mov_b32_e32 v31, v8 +; GFX9-NEXT: v_mov_b32_e32 v30, v6 +; GFX9-NEXT: v_mov_b32_e32 v29, v4 +; GFX9-NEXT: v_mov_b32_e32 v28, v2 +; GFX9-NEXT: 
v_mov_b32_e32 v27, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v8, 
v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v27 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; 
GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v32, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v34, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v36, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v34, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: 
s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v29 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v27 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, 
v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v7 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v7, v13, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v26, v6 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v25, v4 :: v_dual_mov_b32 v24, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v0 :: v_dual_lshlrev_b32 v32, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v0, 0xff, v23 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v27 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v32 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v25 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v34, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 
s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v29, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v30, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v23 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v31, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, 
v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v12 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v32, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v25 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v28, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v15, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v17, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v22, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 
v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v10i32_to_v5f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 
v0, vcc, 3, v0 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v5f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i32_to_v5f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i32_to_v5f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo 
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + +define inreg <5 x double> @bitcast_v10i32_to_v5f64_scalar(<10 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i32_to_v5f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: 
v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v10i32_to_v5f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v10i32_to_v5f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; 
GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v10i32_to_v5f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <5 x 
double> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + +define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v5f64_to_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 
v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v10i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define inreg <10 x i32> @bitcast_v5f64_to_v10i32_scalar(<5 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f64_to_v10i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB19_4 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; 
SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_3: +; SI-NEXT: s_branch .LBB19_2 +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v10i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v10i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 
1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v10i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <10 x i32> + br label %end 
+ +cmp.false: + %a3 = bitcast <5 x double> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v10i32_to_v5i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v5i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
bitcast_v10i32_to_v5i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i32_to_v5i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = 
bitcast <10 x i32> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + +define inreg <5 x i64> @bitcast_v10i32_to_v5i64_scalar(<10 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i32_to_v5i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v10i32_to_v5i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: 
v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v10i32_to_v5i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v10i32_to_v5i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; 
GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB21_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: s_branch .LBB21_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + +define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v5i64_to_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5i64_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5i64_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 
s[30:31] +; +; GFX11-LABEL: bitcast_v5i64_to_v10i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define inreg <10 x i32> @bitcast_v5i64_to_v10i32_scalar(<5 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i64_to_v10i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: 
.LBB23_2: ; %cmp.true +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v5i64_to_v10i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v5i64_to_v10i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v5i64_to_v10i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 
v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: s_branch .LBB23_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v10f32_to_v20i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: 
v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB24_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB24_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v10, v20 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v20i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; 
VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v20i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v20i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 
to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define inreg <20 x i16> @bitcast_v10f32_to_v20i16_scalar(<10 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f32_to_v20i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB25_4 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 
s[30:31] +; SI-NEXT: .LBB25_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: .LBB25_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v20i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: +; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: .LBB25_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: 
v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v20i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: +; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v20i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 
s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v20i16_to_v10f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SI-NEXT: v_mov_b32_e32 v25, v8 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v21, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_4 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB26_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_or_b32_e32 v2, v2, v29 +; SI-NEXT: v_or_b32_e32 v3, v3, v28 +; SI-NEXT: v_or_b32_e32 v4, v4, v27 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_or_b32_e32 v7, v7, v15 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; 
SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: .LBB26_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v2, v29, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v27, v4 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 +; SI-NEXT: v_or_b32_e32 v6, v20, v6 +; SI-NEXT: v_or_b32_e32 v7, v15, v7 +; SI-NEXT: v_or_b32_e32 v8, v13, v8 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, 
0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20i16_to_v10f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v11, 3 +; VI-NEXT: v_add_u16_e32 v10, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_u16_e32 v10, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v10, v8 +; VI-NEXT: v_add_u16_e32 v10, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v10, v7 +; VI-NEXT: v_add_u16_e32 v10, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v10, v6 +; VI-NEXT: v_add_u16_e32 v10, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v10, v5 +; VI-NEXT: v_add_u16_e32 v10, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v10, v4 +; VI-NEXT: v_add_u16_e32 v10, 3, v3 +; 
VI-NEXT: v_add_u16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v10, v3 +; VI-NEXT: v_add_u16_e32 v10, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v10, v2 +; VI-NEXT: v_add_u16_e32 v10, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_add_u16_e32 v10, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v10, v0 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20i16_to_v10f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20i16_to_v10f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; 
GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define inreg <10 x float> @bitcast_v20i16_to_v10f32_scalar(<20 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i16_to_v10f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v10, v4 +; SI-NEXT: v_mov_b32_e32 v11, v2 +; SI-NEXT: v_mov_b32_e32 v12, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 
+; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v9, v0, v13 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 
s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v20i16_to_v10f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: 
s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v20i16_to_v10f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_4 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_3: +; GFX9-NEXT: s_branch .LBB27_2 +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20i16_to_v10f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: 
v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v10f32_to_v20f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v29, v9 +; SI-NEXT: v_mov_b32_e32 v28, v8 +; SI-NEXT: v_mov_b32_e32 v27, v7 +; SI-NEXT: v_mov_b32_e32 v26, v6 +; SI-NEXT: v_mov_b32_e32 v25, v5 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_mov_b32_e32 v23, v3 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v21, v1 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: 
$vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB28_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB28_4 +; SI-NEXT: .LBB28_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB28_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 +; 
SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: .LBB28_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v20f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v20f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v20f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 
s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define inreg <20 x half> @bitcast_v10f32_to_v20f16_scalar(<10 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f32_to_v20f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: 
s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s25, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: 
v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v10f32_to_v20f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; 
VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v20f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v20f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: 
s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v20f16_to_v10f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_4 +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB30_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v2, v29, v2 +; SI-NEXT: v_or_b32_e32 v3, v27, v3 +; SI-NEXT: v_or_b32_e32 v4, v25, v4 +; SI-NEXT: v_or_b32_e32 v5, v23, v5 +; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_or_b32_e32 v7, v14, 
v7 +; SI-NEXT: v_or_b32_e32 v8, v12, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; 
SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v21 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; 
SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20f16_to_v10f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB30_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v10, 0x200 +; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v11 +; VI-NEXT: v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v11 +; VI-NEXT: v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v11 +; VI-NEXT: v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v11 +; VI-NEXT: v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v11 +; VI-NEXT: v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v11 +; VI-NEXT: v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v11 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: .LBB30_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v10f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB30_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB30_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v10f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-NEXT: ; %bb.1: ; 
%cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB30_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define inreg <10 x float> @bitcast_v20f16_to_v10f32_scalar(<20 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f16_to_v10f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v29, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v2 +; 
SI-NEXT: v_cvt_f16_f32_e32 v11, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_or_b32_e32 v3, v22, v3 +; SI-NEXT: v_or_b32_e32 v4, v20, v4 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v6, v16, v6 +; SI-NEXT: v_or_b32_e32 v7, v14, v7 +; SI-NEXT: v_or_b32_e32 v8, v12, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; 
SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v20 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: 
v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v20f16_to_v10f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_4 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 
+; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_3: +; VI-NEXT: s_branch .LBB31_2 +; VI-NEXT: .LBB31_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v10f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; 
GFX9-NEXT: s_cbranch_execnz .LBB31_4 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_3: +; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v10f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; 
GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: s_branch .LBB31_2 +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v10f32_to_v40i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; 
implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; 
SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v32, v32, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; 
SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, 
v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v40i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[11:12], 
24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB32_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 
24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB32_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v40i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB32_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB32_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 @@ -1297,7 +10431,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v10i32_to_v40i8: +; GFX11-TRUE16-LABEL: bitcast_v10f32_to_v40i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 @@ -1324,7 +10458,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_2 +; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -1346,27 +10480,23 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB4_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB32_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v9, 1.0, v9 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v7, 1.0, v7 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v3, 1.0, v3 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 
v[14:15], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 @@ -1380,7 +10510,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB4_4: ; %end +; GFX11-TRUE16-NEXT: .LBB32_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1481,7 +10611,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8: +; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 @@ -1518,7 +10648,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -1550,27 +10680,23 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: 
.LBB4_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB4_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v9, 1.0, v9 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v7, 1.0, v7 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v3, 1.0, v3 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v9 @@ -1594,7 +10720,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB4_4: ; %end +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -1696,12 +10822,12 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <10 x i32> %a, splat (i32 3) - %a2 = bitcast <10 x i32> %a1 to <40 x i8> + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <40 x i8> br label %end cmp.false: - %a3 = bitcast <10 x i32> %a to <40 x i8> + %a3 = bitcast <10 x float> %a to <40 x i8> br label %end end: @@ -1709,290 +10835,1508 @@ end: ret <40 x i8> %phi } -define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i8_to_v10i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 
s32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v41 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v40 -; 
GCN-NEXT: v_and_b32_e32 v26, 0xff, v55 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v14, v14, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v18, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v22, v22, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v26, v26, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v13, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v15, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v11, v17, v12 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v13, v19, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v17, v23, v24 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v19, v25, v27 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: 
v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: v_or_b32_e32 v8, v16, v17 -; GCN-NEXT: v_or_b32_e32 v9, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: .LBB5_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: 
v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v49, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v50, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v51, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; 
GCN-NEXT: v_or_b32_e32 v14, v52, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v27, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v22, v29, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v26, v53, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v13, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v15, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_or_b32_e32 v11, v17, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v14 -; GCN-NEXT: v_or_b32_e32 v13, v19, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v22 -; GCN-NEXT: v_or_b32_e32 v17, v23, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x300, v26 -; GCN-NEXT: v_or_b32_e32 v19, v25, v27 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v9, v19, v18 -; GCN-NEXT: 
v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 -; GCN-NEXT: .LBB5_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f32_to_v40i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 +; SI-NEXT: s_lshr_b32 s28, s25, 24 +; SI-NEXT: s_lshr_b32 s29, s25, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 8 +; SI-NEXT: 
s_lshr_b32 s15, s23, 24 +; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s9, s19, 24 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v31, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v34, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 +; SI-NEXT: v_alignbit_b32 v1, v16, v17, 24 +; SI-NEXT: v_alignbit_b32 v2, v16, v17, 16 +; SI-NEXT: v_alignbit_b32 v3, v16, v17, 8 +; SI-NEXT: v_alignbit_b32 v4, v18, v21, 24 +; SI-NEXT: v_alignbit_b32 v5, v18, v21, 16 +; SI-NEXT: v_alignbit_b32 v6, v18, v21, 8 +; SI-NEXT: v_alignbit_b32 v7, v23, v24, 24 +; SI-NEXT: v_alignbit_b32 v8, v23, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v23, v24, 8 +; SI-NEXT: v_alignbit_b32 v10, v28, v29, 24 +; SI-NEXT: v_alignbit_b32 v11, v28, v29, 16 +; SI-NEXT: v_alignbit_b32 v12, v28, v29, 8 +; SI-NEXT: v_alignbit_b32 v13, v31, v34, 24 +; SI-NEXT: v_alignbit_b32 v14, v31, v34, 16 +; SI-NEXT: v_alignbit_b32 v15, v31, v34, 8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v28 +; SI-NEXT: 
v_lshrrev_b32_e32 v36, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v31 +; SI-NEXT: s_branch .LBB33_5 +; SI-NEXT: .LBB33_3: +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v31, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v28, s19 +; SI-NEXT: v_mov_b32_e32 v24, s20 +; SI-NEXT: v_mov_b32_e32 v23, s21 +; SI-NEXT: v_mov_b32_e32 v21, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 +; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: v_mov_b32_e32 v16, s25 +; SI-NEXT: v_mov_b32_e32 v48, s8 +; SI-NEXT: v_mov_b32_e32 v39, s7 +; SI-NEXT: v_mov_b32_e32 v38, s6 +; SI-NEXT: v_mov_b32_e32 v37, s11 +; SI-NEXT: v_mov_b32_e32 v36, s10 +; SI-NEXT: v_mov_b32_e32 v35, s9 +; SI-NEXT: v_mov_b32_e32 v33, s14 +; SI-NEXT: v_mov_b32_e32 v32, s13 
+; SI-NEXT: v_mov_b32_e32 v30, s12 +; SI-NEXT: v_mov_b32_e32 v27, s27 +; SI-NEXT: v_mov_b32_e32 v26, s26 +; SI-NEXT: v_mov_b32_e32 v25, s15 +; SI-NEXT: v_mov_b32_e32 v22, s40 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: .LBB33_5: ; %end +; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v15, v34, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v48 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v38 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v35 +; SI-NEXT: 
v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v27 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 
+; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v40i8_to_v10i32: +; VI-LABEL: bitcast_v10f32_to_v40i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s29, s25, 8 +; VI-NEXT: s_lshr_b32 s28, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 8 +; VI-NEXT: s_lshr_b32 s43, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s57, s21, 8 +; VI-NEXT: s_lshr_b32 s56, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; 
VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s62, s19, 8 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s17, 8 +; VI-NEXT: s_lshr_b32 s74, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v2, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s22, 1.0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; VI-NEXT: v_add_f32_e64 v6, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s20, 1.0 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] +; VI-NEXT: v_add_f32_e64 v8, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s18, 1.0 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_add_f32_e64 v10, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s16, 1.0 +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; 
VI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; VI-NEXT: s_branch .LBB33_5 +; VI-NEXT: .LBB33_3: +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: v_mov_b32_e32 v10, s17 +; VI-NEXT: v_mov_b32_e32 v7, s18 +; VI-NEXT: v_mov_b32_e32 v8, s19 +; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_mov_b32_e32 v6, s21 +; VI-NEXT: v_mov_b32_e32 v3, s22 +; VI-NEXT: v_mov_b32_e32 v4, s23 +; VI-NEXT: v_mov_b32_e32 v1, s24 +; VI-NEXT: v_mov_b32_e32 v2, s25 +; VI-NEXT: v_mov_b32_e32 v39, s76 +; VI-NEXT: v_mov_b32_e32 v48, s74 +; VI-NEXT: v_mov_b32_e32 v38, s75 +; VI-NEXT: v_mov_b32_e32 v36, s73 +; VI-NEXT: v_mov_b32_e32 v37, s72 +; VI-NEXT: 
v_mov_b32_e32 v35, s63 +; VI-NEXT: v_mov_b32_e32 v34, s61 +; VI-NEXT: v_mov_b32_e32 v33, s62 +; VI-NEXT: v_mov_b32_e32 v31, s60 +; VI-NEXT: v_mov_b32_e32 v32, s59 +; VI-NEXT: v_mov_b32_e32 v30, s58 +; VI-NEXT: v_mov_b32_e32 v29, s56 +; VI-NEXT: v_mov_b32_e32 v28, s57 +; VI-NEXT: v_mov_b32_e32 v26, s47 +; VI-NEXT: v_mov_b32_e32 v27, s46 +; VI-NEXT: v_mov_b32_e32 v25, s45 +; VI-NEXT: v_mov_b32_e32 v24, s43 +; VI-NEXT: v_mov_b32_e32 v23, s44 +; VI-NEXT: v_mov_b32_e32 v21, s42 +; VI-NEXT: v_mov_b32_e32 v22, s41 +; VI-NEXT: v_mov_b32_e32 v20, s40 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_mov_b32_e32 v17, s26 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v13, s8 +; VI-NEXT: v_mov_b32_e32 v12, s10 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: .LBB33_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v38 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; VI-NEXT: v_or_b32_sdwa v10, v36, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v10, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v35 +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_lshlrev_b32_e32 v9, 8, v14 +; VI-NEXT: v_or_b32_sdwa v9, v34, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v9, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v33 +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; VI-NEXT: v_or_b32_sdwa v8, v31, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; VI-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v27 +; VI-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v25 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 
v5, 8, v12 +; VI-NEXT: v_or_b32_sdwa v5, v24, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; VI-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v18 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v40i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s29, s25, 8 +; GFX9-NEXT: s_lshr_b32 s28, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 8 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s21, 8 +; GFX9-NEXT: s_lshr_b32 s56, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 8 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s17, 8 +; GFX9-NEXT: s_lshr_b32 s74, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v2, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s22, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX9-NEXT: v_add_f32_e64 v6, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s20, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] +; GFX9-NEXT: v_add_f32_e64 v8, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s18, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_add_f32_e64 v10, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, 
v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: s_branch .LBB33_5 +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; 
GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s18 +; GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s20 +; GFX9-NEXT: v_mov_b32_e32 v6, s21 +; GFX9-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-NEXT: v_mov_b32_e32 v4, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: v_mov_b32_e32 v2, s25 +; GFX9-NEXT: v_mov_b32_e32 v39, s76 +; GFX9-NEXT: v_mov_b32_e32 v48, s74 +; GFX9-NEXT: v_mov_b32_e32 v38, s75 +; GFX9-NEXT: v_mov_b32_e32 v36, s73 +; GFX9-NEXT: v_mov_b32_e32 v37, s72 +; GFX9-NEXT: v_mov_b32_e32 v35, s63 +; GFX9-NEXT: v_mov_b32_e32 v34, s61 +; GFX9-NEXT: v_mov_b32_e32 v33, s62 +; GFX9-NEXT: v_mov_b32_e32 v31, s60 +; GFX9-NEXT: v_mov_b32_e32 v32, s59 +; GFX9-NEXT: v_mov_b32_e32 v30, s58 +; GFX9-NEXT: v_mov_b32_e32 v29, s56 +; GFX9-NEXT: v_mov_b32_e32 v28, s57 +; GFX9-NEXT: v_mov_b32_e32 v26, s47 +; GFX9-NEXT: v_mov_b32_e32 v27, s46 +; GFX9-NEXT: v_mov_b32_e32 v25, s45 +; GFX9-NEXT: v_mov_b32_e32 v24, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s44 +; GFX9-NEXT: v_mov_b32_e32 v21, s42 +; GFX9-NEXT: v_mov_b32_e32 v22, s41 +; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-NEXT: .LBB33_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v10, v36, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v9, v34, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v8, v31, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v7, v29, v7 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v28 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v5, v24, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v10f32_to_v40i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s17, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s19, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s21, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s20, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s18, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s16, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, s0, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v8 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v15 +; GFX11-TRUE16-NEXT: s_branch .LBB33_5 +; GFX11-TRUE16-NEXT: .LBB33_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s47 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s4 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: .LBB33_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v34, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v32, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v29, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 
v11, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v12, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v37, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, 
v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 8 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s17, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s16, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s19, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s21, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s20, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s18, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s0, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 8, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB33_5 +; GFX11-FAKE16-NEXT: .LBB33_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v4, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s20 :: v_dual_mov_b32 v2, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s63 :: v_dual_mov_b32 v39, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s62 :: v_dual_mov_b32 v37, s60 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v35, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s56 :: v_dual_mov_b32 v33, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s47 :: v_dual_mov_b32 v31, s46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v29, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s44 :: v_dual_mov_b32 v27, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v25, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s28 :: v_dual_mov_b32 v23, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s27 :: v_dual_mov_b32 v21, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v15, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s15 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s6 :: 
v_dual_mov_b32 v7, s12 +; GFX11-FAKE16-NEXT: .LBB33_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v34, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v32, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v29, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v9, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v10, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v27 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v10, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v24, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v37, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, 
v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v3, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v4, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v40i8_to_v10f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; 
SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v8 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v44 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: 
; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v52, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; 
SI-NEXT: v_or_b32_e32 v7, v7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v8, v8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; 
SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v55, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v54, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v52, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: 
v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v50, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v21, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 
+; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v40i8_to_v10f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -2045,7 +12389,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v44 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2118,9 +12462,9 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: .LBB5_2: ; %Flow +; VI-NEXT: .LBB34_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_4 +; VI-NEXT: s_cbranch_execz .LBB34_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 @@ -2194,7 +12538,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: .LBB5_4: ; %end +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -2204,7 +12548,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v40i8_to_v10i32: +; GFX9-LABEL: bitcast_v40i8_to_v10f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -2257,7 +12601,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v44 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2330,9 +12674,9 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: .LBB5_2: ; %Flow +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 @@ -2406,7 +12750,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: .LBB5_4: ; %end +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -2416,7 +12760,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10i32: +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10f32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 @@ -2473,15 +12817,15 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB5_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: 
s_cbranch_execnz .LBB5_4 -; GFX11-TRUE16-NEXT: .LBB5_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_4 +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB5_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB34_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h @@ -2594,8 +12938,8 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB5_2 -; GFX11-TRUE16-NEXT: .LBB5_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-TRUE16-NEXT: .LBB34_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 @@ -2710,7 +13054,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10i32: +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10f32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v8 :: v_dual_mov_b32 v34, v6 @@ -2758,15 +13102,15 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB5_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz 
.LBB5_4 -; GFX11-FAKE16-NEXT: .LBB5_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_4 +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB5_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB34_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -2879,8 +13223,8 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB5_2 -; GFX11-FAKE16-NEXT: .LBB5_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-FAKE16-NEXT: .LBB34_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -2948,140 +13292,1744 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v37, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v36, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v25, v7 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v8, v27, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v29, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v13, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v15, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v17, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v19, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, 
%cmp.false ] + ret <10 x float> %phi +} + +define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i8_to_v10f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v4 +; SI-NEXT: v_mov_b32_e32 v28, v2 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v25 +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: 
v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; 
SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; 
SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v40i8_to_v10f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: v_mov_b32_e32 v31, v8 +; VI-NEXT: v_mov_b32_e32 v30, v6 +; VI-NEXT: v_mov_b32_e32 v29, v4 +; VI-NEXT: v_mov_b32_e32 v28, v2 +; VI-NEXT: v_mov_b32_e32 v27, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: 
s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v27 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 
+; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v40i8_to_v10f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: v_mov_b32_e32 v31, v8 +; GFX9-NEXT: v_mov_b32_e32 v30, v6 +; GFX9-NEXT: v_mov_b32_e32 v29, v4 +; GFX9-NEXT: v_mov_b32_e32 v28, v2 +; GFX9-NEXT: v_mov_b32_e32 v27, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, 
s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; 
GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v27 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, 
v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 
v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; 
GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v6, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v32, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v34, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v36, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v34, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; 
GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: 
s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v29 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v27 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v11, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v26, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v4 :: v_dual_mov_b32 v24, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v0 :: v_dual_lshlrev_b32 v32, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; 
GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v27 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v32 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v25 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v34, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; 
GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v29, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v30, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v23 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v31, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v12 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v32, v0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v25 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v25, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v27, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v29, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v11, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v13, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v15, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v17, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v19, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v28, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v15, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v17, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v22, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v10f32_to_v5f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v5f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v5f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: 
v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v5f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + +define inreg <5 x double> @bitcast_v10f32_to_v5f64_scalar(<10 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f32_to_v5f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: 
v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_3: +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v5f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_3: +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: 
v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v5f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v5f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 
s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + +define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v5f64_to_v10f32: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v10f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v10f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_v5f64_to_v10f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define inreg <10 x float> @bitcast_v5f64_to_v10f32_scalar(<5 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f64_to_v10f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB39_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_4 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_3: +; SI-NEXT: s_branch .LBB39_2 +; SI-NEXT: .LBB39_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 
+; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v10f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_4 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_3: +; VI-NEXT: s_branch .LBB39_2 +; VI-NEXT: .LBB39_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v10f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; 
GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v10f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <40 x i8> %a, splat (i8 3) - %a2 = bitcast <40 x i8> %a1 to <10 x i32> + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <10 x float> br label %end cmp.false: - %a3 = bitcast <40 x i8> %a to <10 x i32> + %a3 = bitcast <5 x double> %a to <10 x float> br label %end end: - %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x i32> %phi + %phi = phi <10 x float> [ 
%a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi } -define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i32_to_v5f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v10f32_to_v5i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB40_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v10i32_to_v5f64: +; VI-LABEL: bitcast_v10f32_to_v5i64: 
; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB40_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10i32_to_v5f64: +; GFX9-LABEL: bitcast_v10f32_to_v5i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 
3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB40_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10i32_to_v5f64: +; GFX11-LABEL: bitcast_v10f32_to_v5i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -3089,199 +15037,278 @@ define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB6_2: ; %end +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 
; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <10 x i32> %a, splat (i32 3) - %a2 = bitcast <10 x i32> %a1 to <5 x double> + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <5 x i64> br label %end cmp.false: - %a3 = bitcast <10 x i32> %a to <5 x double> + %a3 = bitcast <10 x float> %a to <5 x i64> br label %end end: - %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <5 x double> %phi + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi } -define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f64_to_v10i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <5 x i64> @bitcast_v10f32_to_v5i64_scalar(<10 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f32_to_v5i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB41_4 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: 
v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_3: +; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v5f64_to_v10i32: +; VI-LABEL: bitcast_v10f32_to_v5i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB7_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; 
VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: +; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: .LBB41_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v5f64_to_v10i32: +; GFX9-LABEL: bitcast_v10f32_to_v5i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB7_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: 
v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: +; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v5f64_to_v10i32: +; GFX11-LABEL: bitcast_v10f32_to_v5i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB7_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; 
GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <5 x double> %a1 to <10 x i32> + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <5 x i64> br label %end cmp.false: - %a3 = bitcast <5 x double> %a to <10 x i32> + %a3 = bitcast <10 x float> %a to <5 x i64> br label %end end: - %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x i32> %phi + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi } -define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i32_to_v5i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 
-; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v5i64_to_v10f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v10i32_to_v5i64: +; VI-LABEL: bitcast_v5i64_to_v10f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: 
v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10i32_to_v5i64: +; GFX9-LABEL: bitcast_v5i64_to_v10f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB42_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10i32_to_v5i64: +; GFX11-LABEL: bitcast_v5i64_to_v10f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -3289,109 +15316,397 @@ define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB42_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <10 x i32> %a, splat (i32 3) - %a2 = bitcast <10 x i32> %a1 to <5 x i64> + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <10 x float> br label %end cmp.false: - %a3 = bitcast <10 x i32> %a to <5 x i64> + %a3 = bitcast <5 x i64> %a to <10 x float> br label %end end: - %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <5 x i64> %phi + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi } -define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i64_to_v10i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <10 x float> @bitcast_v5i64_to_v10f32_scalar(<5 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i64_to_v10f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: s_add_u32 s24, 
s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: s_branch .LBB43_2 ; -; VI-LABEL: bitcast_v5i64_to_v10i32: +; VI-LABEL: bitcast_v5i64_to_v10f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v5i64_to_v10f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-LABEL: bitcast_v5i64_to_v10f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB43_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v20i16_to_v20f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v19 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v37, v17 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v35, v15 +; SI-NEXT: v_mov_b32_e32 v34, v14 +; SI-NEXT: v_mov_b32_e32 v33, v13 +; SI-NEXT: v_mov_b32_e32 v32, v12 +; SI-NEXT: v_mov_b32_e32 v31, v11 +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v29, v9 +; SI-NEXT: v_mov_b32_e32 v28, v8 +; SI-NEXT: v_mov_b32_e32 v27, v7 +; SI-NEXT: v_mov_b32_e32 v26, v6 +; SI-NEXT: v_mov_b32_e32 v25, v5 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_mov_b32_e32 v23, v3 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v21, v1 +; SI-NEXT: v_mov_b32_e32 v48, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: 
$vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; 
implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
bitcast_v20i16_to_v20f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: v_mov_b32_e32 v10, 3 +; VI-NEXT: v_add_u16_sdwa v11, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v12, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v13, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v14, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v15, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v16, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v17, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v18, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v19, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v10, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v9, 3, v9 +; VI-NEXT: 
v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_or_b32_e32 v8, v8, v19 +; VI-NEXT: v_or_b32_e32 v7, v7, v18 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_or_b32_e32 v5, v5, v16 +; VI-NEXT: v_or_b32_e32 v4, v4, v15 +; VI-NEXT: v_or_b32_e32 v3, v3, v14 +; VI-NEXT: v_or_b32_e32 v2, v2, v13 +; VI-NEXT: v_or_b32_e32 v1, v1, v12 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: .LBB44_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v5i64_to_v10i32: +; GFX9-LABEL: bitcast_v20i16_to_v20f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB44_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v5i64_to_v10i32: +; GFX11-LABEL: bitcast_v20i16_to_v20f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -3399,245 +15714,299 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 
v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB44_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <5 x i64> %a, splat (i64 3) - %a2 = bitcast <5 x i64> %a1 to <10 x i32> + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <20 x half> br label %end cmp.false: - %a3 = bitcast <5 x i64> %a to <10 x i32> + %a3 = bitcast <20 x i16> %a to <20 x half> br label %end end: - %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x i32> %phi + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi } -define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f32_to_v20f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v29, v9 -; GCN-NEXT: v_mov_b32_e32 v28, v8 -; GCN-NEXT: v_mov_b32_e32 v27, v7 -; GCN-NEXT: v_mov_b32_e32 v26, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v5 -; GCN-NEXT: v_mov_b32_e32 v24, v4 -; GCN-NEXT: v_mov_b32_e32 v23, v3 -; GCN-NEXT: v_mov_b32_e32 v22, v2 -; GCN-NEXT: v_mov_b32_e32 v21, v1 -; GCN-NEXT: v_mov_b32_e32 v20, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; 
GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: 
$vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 
s[30:31] +define inreg <20 x half> @bitcast_v20i16_to_v20f16_scalar(<20 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i16_to_v20f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v22, v5 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_mov_b32_e32 v24, v1 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v23 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: 
s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB45_2 ; -; VI-LABEL: bitcast_v10f32_to_v20f16: +; VI-LABEL: bitcast_v20i16_to_v20f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; 
VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 
+; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: s_branch .LBB45_2 ; -; GFX9-LABEL: bitcast_v10f32_to_v20f16: +; GFX9-LABEL: bitcast_v20i16_to_v20f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end -; GFX9-NEXT: s_or_b64 exec, 
exec, s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_4 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_3: +; GFX9-NEXT: s_branch .LBB45_2 +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10f32_to_v20f16: +; GFX11-LABEL: bitcast_v20i16_to_v20f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: 
v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label 
%cmp.false cmp.true: - %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <10 x float> %a1 to <20 x half> + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <20 x half> br label %end cmp.false: - %a3 = bitcast <10 x float> %a to <20 x half> + %a3 = bitcast <20 x i16> %a to <20 x half> br label %end end: @@ -3645,220 +16014,177 @@ end: ret <20 x half> %phi } -define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f16_to_v10f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v18 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_4 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB11_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GCN-NEXT: v_or_b32_e32 v0, v29, v0 -; GCN-NEXT: v_or_b32_e32 v1, v27, v1 -; GCN-NEXT: v_or_b32_e32 v2, v25, v2 -; GCN-NEXT: v_or_b32_e32 v3, v23, v3 -; GCN-NEXT: v_or_b32_e32 v4, v22, v4 -; GCN-NEXT: v_or_b32_e32 v5, v21, v5 -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: v_or_b32_e32 v7, v12, v7 -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: .LBB11_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, 
v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, 
v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v17, v16 -; GCN-NEXT: v_or_b32_e32 v6, v19, v18 -; GCN-NEXT: v_or_b32_e32 v7, v12, v15 -; GCN-NEXT: v_or_b32_e32 v8, v11, v14 -; GCN-NEXT: v_or_b32_e32 v9, v10, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v20f16_to_v20i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; 
SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v20f16_to_v10f32: +; VI-LABEL: bitcast_v20f16_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v10, 0x200 -; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v11, 0x200 +; VI-NEXT: v_add_f16_e32 v10, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v11, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 ; VI-NEXT: v_or_b32_e32 v9, v9, v11 -; VI-NEXT: v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: 
v_or_b32_e32 v8, v8, v11 -; VI-NEXT: v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v11 -; VI-NEXT: v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v11 -; VI-NEXT: v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v11 -; VI-NEXT: v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v11 -; VI-NEXT: v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v11 -; VI-NEXT: v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v11 -; VI-NEXT: v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 -; VI-NEXT: v_or_b32_e32 v1, v1, v11 -; VI-NEXT: v_or_b32_e32 v0, v0, v10 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: v_or_b32_e32 v8, v19, v8 +; VI-NEXT: v_or_b32_e32 v7, v18, v7 +; VI-NEXT: v_or_b32_e32 v6, v17, v6 +; VI-NEXT: v_or_b32_e32 v5, v16, v5 +; VI-NEXT: v_or_b32_e32 v4, v15, v4 +; VI-NEXT: v_or_b32_e32 v3, v14, v3 +; VI-NEXT: v_or_b32_e32 v2, v13, v2 +; VI-NEXT: v_or_b32_e32 v1, v12, v1 +; VI-NEXT: v_or_b32_e32 v0, v10, v0 +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: 
bitcast_v20f16_to_v10f32: +; GFX9-LABEL: bitcast_v20f16_to_v20i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] @@ -3871,11 +16197,11 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v20f16_to_v10f32: +; GFX11-LABEL: bitcast_v20f16_to_v20i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -3883,7 +16209,7 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] @@ -3895,7 +16221,7 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB46_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ 
-3903,432 +16229,927 @@ define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { cmp.true: %a1 = fadd <20 x half> %a, splat (half 0xH0200) - %a2 = bitcast <20 x half> %a1 to <10 x float> + %a2 = bitcast <20 x half> %a1 to <20 x i16> br label %end cmp.false: - %a3 = bitcast <20 x half> %a to <10 x float> + %a3 = bitcast <20 x half> %a to <20 x i16> br label %end end: - %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x float> %phi + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi } -define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f32_to_v40i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: 
s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v17, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB12_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; 
GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v17, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB12_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v38 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; 
GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 -; GCN-NEXT: v_or_b32_e32 v35, v49, v35 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 24, v28 -; GCN-NEXT: v_or_b32_e32 v48, v50, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GCN-NEXT: v_or_b32_e32 v3, v3, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; GCN-NEXT: v_or_b32_e32 v4, v4, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_or_b32_e32 v5, v5, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; GCN-NEXT: v_or_b32_e32 v6, v6, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v7, v7, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v25 -; GCN-NEXT: 
v_lshlrev_b32_e32 v17, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v10, v10, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v35 -; GCN-NEXT: v_or_b32_e32 v19, v32, v33 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v25, v38, v39 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v27, v34, v36 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v21, v28, v30 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v15, v22, v17 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v12, v16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v18, v19 -; GCN-NEXT: v_or_b32_e32 v16, v24, v25 -; GCN-NEXT: v_or_b32_e32 v3, v3, v26 -; GCN-NEXT: v_or_b32_e32 v4, v4, v27 -; GCN-NEXT: v_or_b32_e32 v5, v5, v20 -; GCN-NEXT: v_or_b32_e32 v6, v6, v21 -; GCN-NEXT: v_or_b32_e32 v7, v7, v14 -; GCN-NEXT: v_or_b32_e32 v8, v8, v15 -; GCN-NEXT: v_or_b32_e32 v9, v9, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v12 -; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen -; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f16_to_v20i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 
v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: s_branch .LBB47_2 ; -; VI-LABEL: bitcast_v10f32_to_v40i8: +; VI-LABEL: bitcast_v20f16_to_v20i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s5, s25, 16 +; VI-NEXT: v_add_f16_e32 v1, s24, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s23, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, 
v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s22, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s21, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v3, s25, v0 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s20, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: v_or_b32_e32 v9, v3, v4 +; VI-NEXT: v_or_b32_e32 v4, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s19, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_add_f16_e32 v1, s18, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_add_f16_sdwa v11, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v10, s16, v0 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s17, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v10, v11 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; 
VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v20i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; 
GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v20i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 
s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v20i16_to_v40i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 +; SI-NEXT: ; 
implicit-def: $vgpr2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v20 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; 
implicit-def: $vgpr53 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v30, v1, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v31, v1, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v23, v1, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v22, v1, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v21, v1, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v18, v1, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v1, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v10, v1, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v6, v1, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v2, v1, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 +; SI-NEXT: v_alignbit_b32 v39, v31, v30, 24 +; SI-NEXT: v_alignbit_b32 v48, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v52, v31, v30, 8 +; SI-NEXT: v_alignbit_b32 v36, v22, v23, 24 +; SI-NEXT: v_alignbit_b32 v37, v22, v23, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v23, 8 +; SI-NEXT: v_alignbit_b32 v33, v18, v21, 24 +; SI-NEXT: v_alignbit_b32 v34, v18, v21, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v21, 8 +; SI-NEXT: v_alignbit_b32 v28, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v29, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v10, v14, 8 +; SI-NEXT: v_alignbit_b32 v26, v2, v6, 24 +; SI-NEXT: v_alignbit_b32 v27, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v2, v6, 8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v31 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; 
SI-NEXT: v_lshrrev_b32_e32 v50, 8, v2 +; SI-NEXT: v_and_b32_e32 v45, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v42, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v55, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v51, 0xffff, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v46, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v43, v12, 8, 8 +; SI-NEXT: v_bfe_u32 v40, v16, 8, 8 +; SI-NEXT: v_bfe_u32 v53, v20, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 +; 
SI-NEXT: v_add_i32_e32 v21, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v60, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v25, v2 +; SI-NEXT: v_or_b32_e32 v4, v59, v4 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v24, v2 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 +; SI-NEXT: v_alignbit_b32 v39, v31, v30, 24 +; SI-NEXT: v_alignbit_b32 v48, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v52, v31, v30, 8 +; SI-NEXT: v_alignbit_b32 v36, v22, v23, 24 +; SI-NEXT: v_alignbit_b32 v37, v22, v23, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v23, 8 +; SI-NEXT: v_alignbit_b32 v33, v18, v21, 24 +; SI-NEXT: v_alignbit_b32 v34, v18, v21, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v21, 8 +; SI-NEXT: v_alignbit_b32 v28, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v29, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v35, v10, v14, 8 +; SI-NEXT: v_alignbit_b32 v26, v2, v6, 24 +; SI-NEXT: v_alignbit_b32 v27, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v32, v2, v6, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31 +; 
SI-NEXT: v_lshrrev_b32_e32 v47, 8, v31 +; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, 
s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, 
s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20i16_to_v40i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; VI-NEXT: ; implicit-def: $vgpr16 -; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr39 +; 
VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr14 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr12 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v22, 
24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB12_2: ; %Flow +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v52, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v4 +; 
VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v1 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v55, v7 +; VI-NEXT: v_mov_b32_e32 v53, v8 +; VI-NEXT: v_mov_b32_e32 v43, v9 +; VI-NEXT: v_mov_b32_e32 v42, v10 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_mov_b32_e32 v11, 3 +; VI-NEXT: v_add_u16_sdwa v17, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v20, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v18, v8, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v22, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v42, 3, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; VI-NEXT: v_add_u16_e32 v43, 3, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 +; VI-NEXT: v_add_u16_sdwa v19, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v24, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v53, 3, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v18 +; VI-NEXT: 
v_add_u16_e32 v55, 3, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; VI-NEXT: v_or_b32_e32 v10, v42, v10 +; VI-NEXT: v_or_b32_e32 v9, v43, v9 +; VI-NEXT: v_add_u16_sdwa v23, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v26, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v25, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v49, 3, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_add_u16_e32 v50, 3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 +; VI-NEXT: v_or_b32_e32 v8, v53, v8 +; VI-NEXT: v_or_b32_e32 v7, v55, v7 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_u16_e32 v37, 3, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; VI-NEXT: v_add_u16_e32 v38, 3, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; VI-NEXT: v_or_b32_e32 v6, v49, v6 +; VI-NEXT: v_or_b32_e32 v5, v50, v5 ; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_u16_e32 v32, 3, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; VI-NEXT: v_add_u16_e32 v34, 3, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; VI-NEXT: v_or_b32_e32 v4, v37, v4 +; VI-NEXT: v_or_b32_e32 v3, v38, v3 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_or_b32_e32 v2, v32, v2 +; VI-NEXT: v_or_b32_e32 v1, v34, v1 ; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 
v21, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v1 +; VI-NEXT: v_bfe_u32 v29, v17, 8, 8 +; VI-NEXT: v_bfe_u32 v33, v18, 8, 8 +; VI-NEXT: v_bfe_u32 v39, v19, 8, 8 +; VI-NEXT: v_bfe_u32 v52, v21, 8, 8 +; VI-NEXT: v_bfe_u32 v41, v23, 8, 8 +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: 
v_lshlrev_b16_e32 v2, 8, v15 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 -; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, 
v32 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v52 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v33 +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10f32_to_v40i8: +; GFX9-LABEL: bitcast_v20i16_to_v40i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 @@ -4364,7 +17185,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: 
s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -4396,23 +17217,23 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 @@ -4440,7 +17261,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; 
GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -4505,7 +17326,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v10f32_to_v40i8: +; GFX11-TRUE16-LABEL: bitcast_v20i16_to_v40i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 @@ -4532,7 +17353,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -4554,23 +17375,27 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v9, 1.0, v9 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v1, 1.0, v1 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v7, 1.0, v7 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v3, 1.0, v3 -; 
GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 @@ -4584,7 +17409,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB12_4: ; %end +; GFX11-TRUE16-NEXT: .LBB48_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -4685,7 +17510,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8: +; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 @@ -4722,7 +17547,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -4754,23 +17579,27 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v9, 1.0, v9 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v1, 1.0, v1 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v7, 1.0, v7 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v3, 1.0, v3 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 
v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 8, v10 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v9 @@ -4794,7 +17623,7 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB12_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -4872,36 +17701,1354 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i16_to_v40i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s72, v6 +; SI-NEXT: v_readfirstlane_b32 s73, v5 +; SI-NEXT: v_readfirstlane_b32 s62, v2 +; SI-NEXT: v_readfirstlane_b32 s63, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v18, 
16, v4 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_alignbit_b32 v7, s14, v1, 24 +; SI-NEXT: v_alignbit_b32 v12, s14, v1, 16 +; SI-NEXT: v_alignbit_b32 v16, s14, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_alignbit_b32 v8, s12, v1, 24 +; SI-NEXT: v_alignbit_b32 v13, s12, v1, 16 +; SI-NEXT: v_alignbit_b32 v17, s12, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s63, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: v_alignbit_b32 v6, s10, v1, 24 +; SI-NEXT: v_alignbit_b32 v11, s10, v1, 16 +; SI-NEXT: v_alignbit_b32 v15, s10, v1, 8 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_alignbit_b32 v5, s8, v1, 24 +; SI-NEXT: v_alignbit_b32 v9, s8, v1, 16 +; SI-NEXT: v_alignbit_b32 v14, s8, v1, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: s_and_b32 s4, s73, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v18 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: v_alignbit_b32 v2, s6, v1, 24 +; SI-NEXT: v_alignbit_b32 v4, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v10, s6, v1, 8 +; SI-NEXT: s_lshr_b32 s59, s14, 8 +; SI-NEXT: s_lshr_b32 s56, s12, 8 +; SI-NEXT: s_lshr_b32 s45, s10, 8 +; SI-NEXT: 
s_lshr_b32 s42, s8, 8 +; SI-NEXT: s_lshr_b32 s15, s6, 8 +; SI-NEXT: s_and_b32 s60, s19, 0xffff +; SI-NEXT: s_and_b32 s57, s23, 0xffff +; SI-NEXT: s_and_b32 s46, s27, 0xffff +; SI-NEXT: s_and_b32 s43, s62, 0xffff +; SI-NEXT: s_and_b32 s40, s72, 0xffff +; SI-NEXT: s_bfe_u32 s61, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s58, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s47, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s44, s62, 0x80008 +; SI-NEXT: s_bfe_u32 s41, s72, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_and_b32 s4, s73, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s63, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s11, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s13, s4, 0x30000 +; SI-NEXT: s_and_b32 
s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_alignbit_b32 v7, s14, v2, 24 +; SI-NEXT: v_alignbit_b32 v12, s14, v2, 16 +; SI-NEXT: v_alignbit_b32 v16, s14, v2, 8 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_alignbit_b32 v8, s12, v2, 24 +; SI-NEXT: v_alignbit_b32 v13, s12, v2, 16 +; SI-NEXT: v_alignbit_b32 v17, s12, v2, 8 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_alignbit_b32 v6, s10, v2, 24 +; SI-NEXT: v_alignbit_b32 v11, s10, v2, 16 +; SI-NEXT: v_alignbit_b32 v15, s10, v2, 8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_alignbit_b32 v5, s8, v2, 24 +; SI-NEXT: v_alignbit_b32 v9, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v14, s8, v2, 8 +; SI-NEXT: v_alignbit_b32 v2, v3, v1, 24 +; SI-NEXT: v_alignbit_b32 v4, v3, v1, 16 +; SI-NEXT: v_alignbit_b32 v10, v3, v1, 8 +; SI-NEXT: s_lshr_b32 s61, s14, 24 +; SI-NEXT: s_lshr_b32 s60, s14, 16 +; SI-NEXT: s_lshr_b32 s59, s14, 8 +; SI-NEXT: s_lshr_b32 s58, s12, 24 +; SI-NEXT: s_lshr_b32 s57, s12, 16 +; SI-NEXT: s_lshr_b32 s56, s12, 8 +; SI-NEXT: s_lshr_b32 s47, s10, 24 +; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: s_lshr_b32 s45, s10, 8 +; SI-NEXT: s_lshr_b32 s44, s8, 24 +; SI-NEXT: s_lshr_b32 s43, s8, 16 +; SI-NEXT: s_lshr_b32 s42, s8, 8 +; SI-NEXT: s_lshr_b32 s41, s6, 24 +; SI-NEXT: s_lshr_b32 s40, s6, 16 +; SI-NEXT: s_lshr_b32 s15, s6, 8 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s59, 8 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; 
SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s13, s61, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v7, v12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s13, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v13 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s11, s58, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s11, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v15 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s45, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v11 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s46, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s47, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, 
v3 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v14 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v9 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s43, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s44, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v10 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s15, 8 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s40, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s41, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, 
v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v20i16_to_v40i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 
+; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s29, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s43, s23, 8 +; VI-NEXT: s_lshr_b32 s44, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s56, s21, 8 +; VI-NEXT: s_lshr_b32 s57, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 8 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: 
s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s29, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s43, s23, 8 +; VI-NEXT: s_lshr_b32 s44, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s56, s21, 8 +; VI-NEXT: s_lshr_b32 s57, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 8 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; 
VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s76, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s75, 0xff +; VI-NEXT: s_lshl_b32 s9, s12, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_and_b32 s5, s17, 0xff +; VI-NEXT: s_lshl_b32 s7, s74, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s73, 0xff +; VI-NEXT: s_lshl_b32 s9, s72, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s62, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s19, 0xff +; VI-NEXT: s_lshl_b32 s7, s61, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s60, 0xff +; VI-NEXT: s_lshl_b32 s9, s59, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s7, s58, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s57, 0xff +; VI-NEXT: 
s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s21, 0xff +; VI-NEXT: s_lshl_b32 s7, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s47, 0xff +; VI-NEXT: s_lshl_b32 s8, s46, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s45, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s44, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s23, 0xff +; VI-NEXT: s_lshl_b32 s6, s43, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s42, 0xff +; VI-NEXT: s_lshl_b32 s7, s41, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s40, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s29, 0xff +; VI-NEXT: s_lshl_b32 s4, s4, 8 +; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen 
+; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_lshl_b32 s5, s28, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s27, 0xff +; VI-NEXT: s_lshl_b32 s6, s26, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v20i16_to_v40i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; 
GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s29, s25, 8 +; GFX9-NEXT: s_lshr_b32 s28, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 8 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s21, 8 +; GFX9-NEXT: s_lshr_b32 s56, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 8 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s17, 8 +; GFX9-NEXT: s_lshr_b32 s74, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX9-NEXT: v_pk_add_u16 v6, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] +; GFX9-NEXT: v_pk_add_u16 v8, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_pk_add_u16 v10, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] +; GFX9-NEXT: 
v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: s_branch .LBB49_5 +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; 
implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s18 +; GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s20 +; GFX9-NEXT: v_mov_b32_e32 v6, s21 +; GFX9-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-NEXT: v_mov_b32_e32 v4, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: v_mov_b32_e32 v2, s25 +; GFX9-NEXT: v_mov_b32_e32 v39, s76 +; GFX9-NEXT: v_mov_b32_e32 v48, s74 +; GFX9-NEXT: v_mov_b32_e32 v38, s75 +; GFX9-NEXT: v_mov_b32_e32 v36, s73 +; GFX9-NEXT: v_mov_b32_e32 v37, s72 +; GFX9-NEXT: v_mov_b32_e32 v35, s63 +; GFX9-NEXT: v_mov_b32_e32 v34, s61 +; GFX9-NEXT: v_mov_b32_e32 v33, s62 +; GFX9-NEXT: v_mov_b32_e32 v31, s60 +; GFX9-NEXT: v_mov_b32_e32 v32, s59 +; GFX9-NEXT: v_mov_b32_e32 v30, s58 +; GFX9-NEXT: v_mov_b32_e32 v29, s56 +; GFX9-NEXT: v_mov_b32_e32 v28, s57 +; GFX9-NEXT: v_mov_b32_e32 v26, s47 +; GFX9-NEXT: v_mov_b32_e32 v27, s46 +; GFX9-NEXT: v_mov_b32_e32 v25, s45 +; GFX9-NEXT: v_mov_b32_e32 v24, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s44 +; GFX9-NEXT: v_mov_b32_e32 v21, s42 +; GFX9-NEXT: v_mov_b32_e32 v22, s41 +; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: .LBB49_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa 
v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v10, v36, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v9, v34, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v8, v31, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v7, 
v29, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v28 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v5, v24, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, 
v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v20i16_to_v40i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v26, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v13 +; GFX11-TRUE16-NEXT: s_branch .LBB49_5 +; GFX11-TRUE16-NEXT: .LBB49_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s47 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s23 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s4 +; GFX11-TRUE16-NEXT: .LBB49_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v32, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v29, v15 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v30, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 
v19, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v36, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 24 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 24, v2 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 8, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB49_5 +; GFX11-FAKE16-NEXT: .LBB49_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v4, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s20 :: v_dual_mov_b32 v2, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s63 :: v_dual_mov_b32 v39, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s62 :: v_dual_mov_b32 v37, s60 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v35, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s56 :: v_dual_mov_b32 v33, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s47 :: v_dual_mov_b32 v31, s46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v29, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s44 :: v_dual_mov_b32 v27, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v25, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s28 :: v_dual_mov_b32 v23, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s27 :: v_dual_mov_b32 v21, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v7, s12 
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s15 :: v_dual_mov_b32 v15, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: .LBB49_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v34, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v32, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v29, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v15, v9, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v10, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v10, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v24, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, v37, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v3, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v4, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <10 x float> %a1 to <40 x i8> + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <40 x i8> br label %end cmp.false: - %a3 = bitcast <10 x float> %a to <40 x i8> + %a3 
= bitcast <20 x i16> %a to <40 x i8> br label %end end: @@ -4909,383 +19056,460 @@ end: ret <40 x i8> %phi } -define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i8_to_v10f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v41 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v40 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v55 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v14, v14, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v18, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: 
v_or_b32_e32 v22, v22, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v26, v26, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v13, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v15, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v11, v17, v12 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v13, v19, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v17, v23, v24 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v19, v25, v27 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: v_or_b32_e32 v8, v16, v17 -; GCN-NEXT: v_or_b32_e32 v9, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; 
implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: .LBB13_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v54 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 
0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v49, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v50, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v51, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v14, v52, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v27, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v22, v29, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v26, v53, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v13, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v15, v9 -; GCN-NEXT: 
v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_or_b32_e32 v11, v17, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v14 -; GCN-NEXT: v_or_b32_e32 v13, v19, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v22 -; GCN-NEXT: v_or_b32_e32 v17, v23, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x300, v26 -; GCN-NEXT: v_or_b32_e32 v19, v25, v27 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v9, v19, v18 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 -; GCN-NEXT: .LBB13_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <20 x i16> 
@bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v40i8_to_v20i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v31, v14 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v38, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v5 
+; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v19 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v29 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v10 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v14 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; 
%cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v35, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v2, v49, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v6, v50, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v32, v4, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v25, v0, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v7, v7, v40 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 +; SI-NEXT: v_or_b32_e32 v4, v52, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v11, v53, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v10, v7, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v12, v12, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 +; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v15, v47, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v14, v12, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v42 +; SI-NEXT: 
v_and_b32_e32 v17, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v34, v0, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v16, v16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_or_b32_e32 v12, v46, v12 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v19, v56, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_or_b32_e32 v18, v16, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v12, v0, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_alignbit_b32 v1, v35, v2, 16 +; SI-NEXT: v_alignbit_b32 v5, v32, v4, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v7, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 
+; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v59, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v55 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_and_b32_e32 
v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v46, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v41, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v53, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v21, 
vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v50, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v1, v35, v25, 16 +; SI-NEXT: v_alignbit_b32 v5, v32, v21, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v34, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v25 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v21 +; SI-NEXT: v_mov_b32_e32 v6, v32 +; SI-NEXT: v_mov_b32_e32 v8, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v40i8_to_v10f32: +; VI-LABEL: bitcast_v40i8_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, v8 -; VI-NEXT: v_mov_b32_e32 v34, v6 -; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, v10 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v38, v4 ; VI-NEXT: v_mov_b32_e32 v32, v2 -; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: v_mov_b32_e32 v36, v0 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v23 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: 
buffer_load_ushort v53, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v31, v14 +; VI-NEXT: v_mov_b32_e32 v37, v12 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v8 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v10 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v36, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v37, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v31, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa 
v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v54, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v55, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr12 -; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr20 @@ -5294,210 +19518,222 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; 
implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: .LBB13_2: ; %Flow +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u16_e32 v0, 3, v31 -; VI-NEXT: v_add_u16_e32 v1, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v42, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v9, 0x300 -; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 -; VI-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_u16_e32 v1, 3, v33 -; VI-NEXT: v_add_u16_e32 v2, 3, v34 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 -; VI-NEXT: v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: v_add_u16_e32 v2, 3, v35 -; VI-NEXT: v_add_u16_e32 v3, 3, v10 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v3, 3, v12 -; VI-NEXT: v_add_u16_e32 v4, 3, v14 -; VI-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 -; VI-NEXT: v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u16_e32 v4, 3, v16 -; VI-NEXT: v_add_u16_e32 v5, 3, v18 -; VI-NEXT: v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v4, 0x300, 
v4 -; VI-NEXT: v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_add_u16_e32 v5, 3, v20 -; VI-NEXT: v_add_u16_e32 v6, 3, v22 -; VI-NEXT: v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 -; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_add_u16_e32 v6, 3, v24 -; VI-NEXT: v_add_u16_e32 v7, 3, v26 -; VI-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 -; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u16_e32 v7, 3, v28 -; VI-NEXT: v_add_u16_e32 v8, 3, v30 -; VI-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 -; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v0, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v1, 0x300 +; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v0, 3, v54 +; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v0, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v8, 3, v39 -; VI-NEXT: v_add_u16_e32 v10, 3, v38 -; VI-NEXT: v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v10, v15, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 -; VI-NEXT: v_add_u16_sdwa v10, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v37 -; VI-NEXT: v_add_u16_e32 v12, 3, v36 -; VI-NEXT: v_or_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v51 +; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v7, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v28 +; VI-NEXT: v_or_b32_sdwa v12, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v26 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v24 +; VI-NEXT: v_or_b32_sdwa v13, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
VI-NEXT: v_add_u16_e32 v0, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v20 +; VI-NEXT: v_or_b32_sdwa v14, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v16 +; VI-NEXT: v_or_b32_sdwa v15, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v37 +; VI-NEXT: v_or_b32_sdwa v16, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v34 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v33 +; VI-NEXT: v_or_b32_sdwa v17, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v18, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v38 +; VI-NEXT: v_or_b32_sdwa v19, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v32 +; VI-NEXT: 
v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v1, 3, v36 +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v19 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 +; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 +; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 +; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 -; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v18 +; VI-NEXT: v_or_b32_e32 v2, v17, v2 +; VI-NEXT: v_or_b32_e32 v3, v16, v3 +; VI-NEXT: v_or_b32_e32 v4, v15, v4 +; VI-NEXT: v_or_b32_e32 v5, v14, v5 +; VI-NEXT: v_or_b32_e32 v6, v13, v6 +; VI-NEXT: v_or_b32_e32 v7, v12, v7 +; VI-NEXT: v_or_b32_e32 v8, v11, v8 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: .LBB13_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v40i8_to_v10f32: +; GFX9-LABEL: bitcast_v40i8_to_v20i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v35, v8 -; GFX9-NEXT: v_mov_b32_e32 v34, v6 -; GFX9-NEXT: v_mov_b32_e32 v33, v4 -; GFX9-NEXT: v_mov_b32_e32 v32, v2 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v31, v10 +; 
GFX9-NEXT: v_mov_b32_e32 v32, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v6 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v13 -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v15 -; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v17 -; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v23 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v37, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v11 +; GFX9-NEXT: 
v_lshlrev_b16_e32 v41, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v23 ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v8 -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v10 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v34, v40 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v32, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 +; GFX9-NEXT: v_or_b32_sdwa v3, v34, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v37, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v27 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: v_or_b32_sdwa v8, v55, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v53, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 +; GFX9-NEXT: v_or_b32_sdwa v9, v42, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v54, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr10 -; GFX9-NEXT: ; implicit-def: $vgpr12 -; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr20 @@ -5506,502 +19742,444 @@ define <10 x float> 
@bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr13 -; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: .LBB13_2: ; %Flow +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: 
v_add_u16_e32 v0, 3, v31 -; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v53 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 
v14, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v36 +; GFX9-NEXT: v_add_u16_e32 v19, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v19, v39, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 
-; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_add_u16_e32 v1, 3, v33 -; GFX9-NEXT: v_add_u16_e32 v2, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX9-NEXT: v_add_u16_e32 v2, 3, v35 -; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 -; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 -; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 -; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_add_u16_e32 v4, 3, v16 -; GFX9-NEXT: v_add_u16_e32 v5, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 -; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX9-NEXT: v_add_u16_e32 v5, 3, v20 -; GFX9-NEXT: v_add_u16_e32 v6, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 -; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX9-NEXT: v_add_u16_e32 v6, 3, v24 -; GFX9-NEXT: v_add_u16_e32 v7, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 -; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-NEXT: v_add_u16_e32 v7, 3, v28 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 -; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v39 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v9, v15, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 -; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: 
v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v37 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 -; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: .LBB13_4: ; %end +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v19, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v18, v1, s6 +; GFX9-NEXT: v_perm_b32 v2, v17, v2, s6 +; GFX9-NEXT: v_perm_b32 v3, v16, v3, s6 +; GFX9-NEXT: v_perm_b32 v4, v15, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v14, v5, s6 +; GFX9-NEXT: v_perm_b32 v6, v13, v6, s6 +; GFX9-NEXT: v_perm_b32 v7, v12, v7, s6 +; GFX9-NEXT: v_perm_b32 v8, v11, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10f32: +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v20i16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x9 -; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v14.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v22.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v37 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-TRUE16-NEXT: .LBB13_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB13_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v17.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v2.l, v2.l, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v25.h 
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v35.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-TRUE16-NEXT: .LBB13_4: ; %cmp.true -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v35.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v34.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v35.h, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v23.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v28.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 
v1.l, v33.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v29.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v22.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v20.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v29.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v26.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v27.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v25.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v25.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v21.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v19.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v17.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v19.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v15.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v15.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v16.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v18.h, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h 
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v27.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v16.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v18.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v2.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v5.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v6.l, v11.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v11.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v11.l ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10f32: +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v20i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v8 :: v_dual_mov_b32 v34, v6 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, v10 :: v_dual_mov_b32 v34, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v35, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, 
v0 ; GFX11-FAKE16-NEXT: s_clause 0x9 ; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:36 ; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_u16 v36, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_u16 v37, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_u16 v38, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:4 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v19 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v14 :: v_dual_mov_b32 v32, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v49, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v19 ; 
GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v29 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v4 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v6 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v70, 8, v8 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v71, 8, v10 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-FAKE16-NEXT: .LBB13_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB13_3: ; %cmp.false -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GFX11-FAKE16-NEXT: .LBB50_3: ; %cmp.false +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v16 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v49 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v6, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v49 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v20 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v22 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v24 @@ -6009,43 +20187,33 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v28 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v30 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v66 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v27 
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v71 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v8, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v10, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v12, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v14, v13, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 @@ -6054,234 +20222,1455 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr17 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-FAKE16-NEXT: .LBB13_4: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v34, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v35, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v10, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v12, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v14, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v16, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v18, 3 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: .LBB50_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v28, 3 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v67, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v29, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v27, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v69, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v30, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v20, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v26, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v24, 3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v22, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v11, v8, 0x5040100 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v53, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v54, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v25, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v64, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, v18, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v16, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v19, v4 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v31, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v54, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v53, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v34, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v36, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v17, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v52, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, v37, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v38, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, v33, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v55, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v65, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v48, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v49, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v50, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v51, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v52, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x300, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v51, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v48, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v49, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v50, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v39, v19 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, 0x300, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v3, 0x300, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v4, 0x300, v4 
-; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v6, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v8, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, v20, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, v22, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, v24, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, v26, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, v28, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, v30, 3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, v39, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, v38, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, v37, 3 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, v36, 3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v25, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v27, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v29, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v11, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v13, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v15, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v17, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v19, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v5, 0x300, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v6, 0x300, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v7, 0x300, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v8, 0x300, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v9, 0x300, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v10, 0x300, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v11, 0x300, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v12, 0x300, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v13, 0x300, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u16 v14, 0x300, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v20, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u16 v21, 0x300, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v17, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v18, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v19, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v20, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v21, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v14, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v13, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v12, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v10, v9, 0x5040100 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp 
eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i8_to_v20i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_readfirstlane_b32 s14, v19 +; SI-NEXT: v_readfirstlane_b32 s40, v18 +; SI-NEXT: v_readfirstlane_b32 s12, v11 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v21 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s10, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s10, s5 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s10, s19, 24 +; SI-NEXT: s_or_b32 s4, s10, s4 +; SI-NEXT: s_and_b32 s10, s28, 0xff +; SI-NEXT: s_lshl_b32 s15, s29, 8 +; SI-NEXT: s_or_b32 s10, s10, s15 +; SI-NEXT: s_and_b32 s15, s6, 0xff +; 
SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s41, s7, 24 +; SI-NEXT: s_or_b32 s43, s41, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s41, s27, 24 +; SI-NEXT: s_or_b32 s15, s41, s15 +; SI-NEXT: s_and_b32 s41, s16, 0xff +; SI-NEXT: s_lshl_b32 s42, s17, 8 +; SI-NEXT: s_or_b32 s41, s41, s42 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 +; SI-NEXT: s_or_b32 s41, s41, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s42, s25, 8 +; SI-NEXT: v_or_b32_e32 v9, v9, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v11, v0, v10 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_or_b32_e32 v10, v9, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v16 +; SI-NEXT: s_or_b32 s15, s4, s15 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s42, s8, 8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v13, v13, v27 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: v_or_b32_e32 v15, v3, v9 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v19, v7, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v25, v13, v19 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v23, s4, v15 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s42, s12, 8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v17, v17, v30 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: v_or_b32_e32 v21, v28, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v32, v29, v18 +; 
SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v18, v17, v32 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v26, s4, v21 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s42, s14, 8 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: s_or_b32 s10, s10, s43 +; SI-NEXT: v_or_b32_e32 v33, v31, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_alignbit_b32 v1, s11, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, s10, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v15, 16 +; SI-NEXT: v_alignbit_b32 v13, v25, v21, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v33, 16 +; SI-NEXT: v_or_b32_e32 v21, s4, v33 +; SI-NEXT: s_lshr_b32 s42, s5, 16 +; SI-NEXT: s_lshr_b32 s43, s43, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: 
v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s15, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, 
s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: s_add_i32 s41, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: s_add_i32 s11, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v0, s41 +; SI-NEXT: v_alignbit_b32 v1, s11, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s15 +; SI-NEXT: v_alignbit_b32 v5, s10, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v25, v26, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v21, 16 +; SI-NEXT: s_lshr_b32 s42, s11, 16 +; SI-NEXT: s_lshr_b32 s43, s10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: .LBB51_3: ; %end +; 
SI-NEXT: v_mov_b32_e32 v0, s41 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v3, s42 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, v23 +; SI-NEXT: v_mov_b32_e32 v12, v26 +; SI-NEXT: v_mov_b32_e32 v14, v25 +; SI-NEXT: v_mov_b32_e32 v16, v21 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v40i8_to_v20i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: v_mov_b32_e32 v31, v14 +; VI-NEXT: v_mov_b32_e32 v27, v12 +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v29, v8 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v30, v4 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: 
v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v28, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_or_b32_sdwa v0, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v27, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v7, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v27 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v6, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v29 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v5, v38, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: v_or_b32_sdwa v4, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v3, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v26, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: 
v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v2 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v40i8_to_v20i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v30, v10 +; GFX9-NEXT: v_mov_b32_e32 v27, v8 +; GFX9-NEXT: v_mov_b32_e32 v28, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_mov_b32_e32 v31, v2 +; GFX9-NEXT: v_mov_b32_e32 v32, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, s7, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v27 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v31 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v3, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: 
v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v3, v26, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v13, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v13, s4, v13 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v4, v12, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v11, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v9, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v20i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, 
s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v23 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v36 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v7, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v12, 16, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 
8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v38 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v32 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v17 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v12, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v11, 16, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v9, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v20i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v9 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v2, 0xffff, s10 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v24 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v6, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v6, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v9, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v12, 16, v13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v27 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: 
s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v35, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v33, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v23 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v18 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v25 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v34, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v19, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v37, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v32, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v22, v4 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 
v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v31, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v10, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v11, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label 
%cmp.false cmp.true: %a1 = add <40 x i8> %a, splat (i8 3) - %a2 = bitcast <40 x i8> %a1 to <10 x float> + %a2 = bitcast <40 x i8> %a1 to <20 x i16> br label %end cmp.false: - %a3 = bitcast <40 x i8> %a to <10 x float> + %a3 = bitcast <40 x i8> %a to <20 x i16> br label %end end: - %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x float> %phi + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi } -define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f32_to_v5f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v20i16_to_v5f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v12 +; SI-NEXT: v_mov_b32_e32 v26, v10 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 
16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_4 +; SI-NEXT: .LBB52_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB52_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v31 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v29 +; SI-NEXT: v_or_b32_e32 v8, v8, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr36 
+; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: .LBB52_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v33, v3 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v5, v31, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v29, v7 +; SI-NEXT: v_or_b32_e32 v8, v20, v8 +; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: 
v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v10f32_to_v5f64: +; VI-LABEL: bitcast_v20i16_to_v5f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: v_mov_b32_e32 v11, 3 +; VI-NEXT: v_add_u16_e32 v10, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_u16_e32 v10, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v10, v8 +; VI-NEXT: v_add_u16_e32 v10, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v10, v7 +; VI-NEXT: v_add_u16_e32 v10, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v10, v6 +; VI-NEXT: v_add_u16_e32 v10, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v10, v5 +; VI-NEXT: v_add_u16_e32 v10, 3, v4 +; 
VI-NEXT: v_add_u16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v10, v4 +; VI-NEXT: v_add_u16_e32 v10, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v10, v3 +; VI-NEXT: v_add_u16_e32 v10, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v10, v2 +; VI-NEXT: v_add_u16_e32 v10, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_add_u16_e32 v10, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v10, v0 +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10f32_to_v5f64: +; GFX9-LABEL: bitcast_v20i16_to_v5f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] 
+; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10f32_to_v5f64: +; GFX11-LABEL: bitcast_v20i16_to_v5f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -6289,25 +21678,31 @@ define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: 
s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <10 x float> %a1 to <5 x double> + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <5 x double> br label %end cmp.false: - %a3 = bitcast <10 x float> %a to <5 x double> + %a3 = bitcast <20 x i16> %a to <5 x double> br label %end end: @@ -6315,62 +21710,390 @@ end: ret <5 x double> %phi } -define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f64_to_v10f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i16_to_v5f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: v_mov_b32_e32 v18, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, 
s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 +; SI-NEXT: v_or_b32_e32 v9, v0, v19 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: 
s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB53_2 ; -; VI-LABEL: bitcast_v5f64_to_v10f32: +; VI-LABEL: bitcast_v20i16_to_v5f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; 
VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: 
bitcast_v20i16_to_v5f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: +; GFX9-NEXT: s_branch .LBB53_2 +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20i16_to_v5f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, 
exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: +; GFX11-NEXT: s_branch .LBB53_2 +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + +define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v5f64_to_v20i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v9 +; SI-NEXT: v_mov_b32_e32 v27, v8 +; SI-NEXT: v_mov_b32_e32 v26, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; 
SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v13, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v9, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v5, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_alignbit_b32 v17, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v13, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v9, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v5, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; 
SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v4, v21 +; SI-NEXT: v_mov_b32_e32 v6, v22 +; SI-NEXT: v_mov_b32_e32 v8, v23 +; SI-NEXT: v_mov_b32_e32 v10, v24 +; SI-NEXT: v_mov_b32_e32 v12, v25 +; SI-NEXT: v_mov_b32_e32 v14, v26 +; SI-NEXT: v_mov_b32_e32 v16, v27 +; SI-NEXT: v_mov_b32_e32 v18, v28 +; SI-NEXT: v_mov_b32_e32 v1, v20 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v5f64_to_v10f32: +; GFX9-LABEL: bitcast_v5f64_to_v20i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB15_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v5f64_to_v10f32: +; GFX11-LABEL: bitcast_v5f64_to_v20i16: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -6378,14 +22101,14 @@ define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB15_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6393,89 +22116,407 @@ define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) { cmp.true: %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <5 x double> %a1 to <10 x float> + %a2 = bitcast <5 x double> %a1 to <20 x i16> br label %end cmp.false: - %a3 = bitcast <5 x double> %a to <10 x float> + %a3 = bitcast <5 x double> %a to <20 x i16> br label %end end: - %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x float> %phi + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi } -define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f32_to_v5i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: 
v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f64_to_v20i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v20, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v21, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v22, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v23, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v24, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB55_4 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[22:23], 1.0 +; SI-NEXT: v_alignbit_b32 v20, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v21, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v23, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: s_branch .LBB55_5 +; SI-NEXT: .LBB55_3: +; SI-NEXT: ; 
implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: .LBB55_5: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v14, v13 +; SI-NEXT: v_mov_b32_e32 v18, v17 +; SI-NEXT: v_mov_b32_e32 v1, v24 +; SI-NEXT: v_mov_b32_e32 v5, v23 +; SI-NEXT: v_mov_b32_e32 v9, v22 +; SI-NEXT: v_mov_b32_e32 v13, v21 +; SI-NEXT: v_mov_b32_e32 v17, v20 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v10f32_to_v5i64: +; VI-LABEL: bitcast_v5f64_to_v20i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 
+; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v20i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v20i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 
s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v20i16_to_v5i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v12 +; SI-NEXT: v_mov_b32_e32 v26, v10 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: 
v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB56_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB56_4 +; SI-NEXT: .LBB56_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB56_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v31 +; SI-NEXT: v_or_b32_e32 v6, v6, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v29 +; SI-NEXT: v_or_b32_e32 v8, v8, v20 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; 
SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: .LBB56_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v33, v3 +; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v5, v31, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v29, v7 +; SI-NEXT: v_or_b32_e32 v8, v20, v8 +; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 
v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20i16_to_v5i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: v_mov_b32_e32 v11, 3 +; VI-NEXT: v_add_u16_e32 v10, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_u16_e32 v10, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v10, v8 +; VI-NEXT: v_add_u16_e32 v10, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v10, v7 +; VI-NEXT: v_add_u16_e32 v10, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v10, v6 +; VI-NEXT: 
v_add_u16_e32 v10, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v10, v5 +; VI-NEXT: v_add_u16_e32 v10, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v10, v4 +; VI-NEXT: v_add_u16_e32 v10, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v10, v3 +; VI-NEXT: v_add_u16_e32 v10, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v10, v2 +; VI-NEXT: v_add_u16_e32 v10, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_add_u16_e32 v10, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v10, v0 +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v10f32_to_v5i64: +; GFX9-LABEL: bitcast_v20i16_to_v5i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: 
v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v10f32_to_v5i64: +; GFX11-LABEL: bitcast_v20i16_to_v5i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -6483,25 +22524,31 @@ define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 
op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB56_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <10 x float> %a1 to <5 x i64> + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <5 x i64> br label %end cmp.false: - %a3 = bitcast <10 x float> %a to <5 x i64> + %a3 = bitcast <20 x i16> %a to <5 x i64> br label %end end: @@ -6509,38 +22556,358 @@ end: ret <5 x i64> %phi } -define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i64_to_v10f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i16_to_v5i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mov_b32_e32 v16, v4 +; SI-NEXT: v_mov_b32_e32 v17, v2 +; SI-NEXT: 
v_mov_b32_e32 v18, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 +; SI-NEXT: v_or_b32_e32 v9, v0, v19 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 
s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB57_2 ; -; VI-LABEL: bitcast_v5i64_to_v10f32: +; VI-LABEL: bitcast_v20i16_to_v5i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; 
VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; 
VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v20i16_to_v5i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_v20i16_to_v5i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_3: +; GFX11-NEXT: s_branch .LBB57_2 +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i16> %a, splat (i16 3) + %a2 = bitcast <20 x i16> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <20 x i16> %a to <5 x i64> + br label %end + +end: + 
%phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + +define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v5i64_to_v20i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v20, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB58_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, 
v12 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v20, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB58_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v10, v20 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5i64_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc @@ -6552,18 +22919,18 @@ define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v5i64_to_v10f32: +; GFX9-LABEL: bitcast_v5i64_to_v20i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: 
v_add_co_u32_e32 v8, vcc, 3, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc @@ -6575,419 +22942,619 @@ define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v5i64_to_v10f32: +; GFX11-LABEL: bitcast_v5i64_to_v20i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB17_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB58_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB58_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <20 x i16> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <20 x i16> + br label %end + +end: + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi +} + +define inreg <20 x i16> @bitcast_v5i64_to_v20i16_scalar(<5 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i64_to_v20i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: 
.LBB59_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v5i64_to_v20i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v5i64_to_v20i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_3 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB59_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; 
GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: s_branch .LBB59_2 +; +; GFX11-LABEL: bitcast_v5i64_to_v20i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_3 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB59_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: s_branch .LBB59_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: %a1 = add <5 x i64> %a, splat (i64 3) - %a2 = bitcast <5 x i64> %a1 to <10 x float> + %a2 = bitcast <5 x i64> %a1 to <20 x i16> br label %end cmp.false: - %a3 = bitcast <5 x i64> %a to <10 x float> + %a3 = bitcast <5 x i64> %a to <20 x i16> br label %end end: - %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x float> %phi + %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i16> %phi } define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { -; 
GCN-LABEL: bitcast_v20f16_to_v40i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v3 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v7 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v11 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v19 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: 
$vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_bfe_u32 v14, v21, 8, 8 -; GCN-NEXT: v_bfe_u32 v11, v4, 8, 8 -; GCN-NEXT: v_bfe_u32 v8, v3, 8, 8 -; GCN-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GCN-NEXT: v_or_b32_e32 v29, v50, v6 -; GCN-NEXT: v_or_b32_e32 v27, v49, v7 -; GCN-NEXT: v_or_b32_e32 v20, v52, v9 -; GCN-NEXT: v_or_b32_e32 v17, v51, v10 -; GCN-NEXT: v_or_b32_e32 v13, v54, v12 -; GCN-NEXT: 
v_or_b32_e32 v12, v53, v15 -; GCN-NEXT: v_or_b32_e32 v10, v42, v16 -; GCN-NEXT: v_or_b32_e32 v9, v40, v18 -; GCN-NEXT: v_or_b32_e32 v7, v45, v19 -; GCN-NEXT: v_or_b32_e32 v6, v44, v22 -; GCN-NEXT: v_alignbit_b32 v34, v27, v29, 24 -; GCN-NEXT: v_alignbit_b32 v36, v27, v29, 16 -; GCN-NEXT: v_alignbit_b32 v38, v27, v29, 8 -; GCN-NEXT: v_alignbit_b32 v32, v17, v20, 24 -; GCN-NEXT: v_alignbit_b32 v33, v17, v20, 16 -; GCN-NEXT: v_alignbit_b32 v35, v17, v20, 8 -; GCN-NEXT: v_alignbit_b32 v26, v12, v13, 24 -; GCN-NEXT: v_alignbit_b32 v28, v12, v13, 16 -; GCN-NEXT: v_alignbit_b32 v31, v12, v13, 8 -; GCN-NEXT: v_alignbit_b32 v19, v9, v10, 24 -; GCN-NEXT: v_alignbit_b32 v24, v9, v10, 16 -; GCN-NEXT: v_alignbit_b32 v25, v9, v10, 8 -; GCN-NEXT: v_alignbit_b32 v15, v6, v7, 24 -; GCN-NEXT: v_alignbit_b32 v16, v6, v7, 16 -; GCN-NEXT: v_alignbit_b32 v18, v6, v7, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 8, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 8, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 8, v6 -; GCN-NEXT: v_bfe_u32 v22, v1, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v44 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v8, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v49 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v24, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v21 -; GCN-NEXT: v_bfe_u32 v14, v21, 8, 8 -; GCN-NEXT: v_bfe_u32 v11, v4, 8, 8 -; GCN-NEXT: v_bfe_u32 v8, v3, 8, 8 -; GCN-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GCN-NEXT: v_or_b32_e32 v7, v6, v13 -; GCN-NEXT: v_or_b32_e32 v6, v22, v17 -; GCN-NEXT: v_or_b32_e32 v10, v9, v20 -; GCN-NEXT: v_or_b32_e32 v9, v23, v25 -; GCN-NEXT: v_or_b32_e32 v13, v12, v26 -; GCN-NEXT: v_or_b32_e32 v12, v24, v27 -; GCN-NEXT: v_or_b32_e32 v20, v15, v28 -; GCN-NEXT: v_or_b32_e32 v17, v16, v29 -; GCN-NEXT: v_or_b32_e32 v29, v18, v30 -; GCN-NEXT: v_or_b32_e32 v27, v19, v31 -; GCN-NEXT: v_alignbit_b32 v34, v27, v29, 24 -; GCN-NEXT: v_alignbit_b32 v36, v27, v29, 16 -; GCN-NEXT: v_alignbit_b32 v38, v27, v29, 8 -; GCN-NEXT: v_alignbit_b32 v32, v17, v20, 24 -; GCN-NEXT: v_alignbit_b32 v33, v17, v20, 16 -; GCN-NEXT: v_alignbit_b32 v35, v17, v20, 8 -; GCN-NEXT: v_alignbit_b32 v26, v12, v13, 24 -; GCN-NEXT: v_alignbit_b32 v28, v12, v13, 16 -; GCN-NEXT: v_alignbit_b32 v31, v12, v13, 8 -; GCN-NEXT: v_alignbit_b32 v19, v9, v10, 24 -; GCN-NEXT: v_alignbit_b32 v24, v9, v10, 16 -; GCN-NEXT: v_alignbit_b32 v25, v9, v10, 8 -; GCN-NEXT: v_alignbit_b32 v15, v6, v7, 24 -; GCN-NEXT: v_alignbit_b32 v16, v6, v7, 16 -; GCN-NEXT: v_alignbit_b32 v18, v6, v7, 8 -; 
GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 8, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 8, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 8, v6 -; GCN-NEXT: v_bfe_u32 v22, v1, 8, 8 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v38 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v39 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 8, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 28, v0 -; GCN-NEXT: 
v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 36, v0 -; GCN-NEXT: v_or_b32_e32 v29, v29, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v35, v50, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v17, v17, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v13, v13, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v10, v10, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v7, v7, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v6, v6, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v23, v34, v36 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v21, v49, v21 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v35 -; GCN-NEXT: v_or_b32_e32 v29, v32, v33 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v4, v11, v4 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v13, v26, v28 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v3, v8, v3 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v19, v24 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v15, v16 -; GCN-NEXT: 
v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v1, v22, v1 -; GCN-NEXT: v_or_b32_e32 v15, v18, v23 -; GCN-NEXT: v_or_b32_e32 v16, v25, v21 -; GCN-NEXT: v_or_b32_e32 v18, v27, v29 -; GCN-NEXT: v_or_b32_e32 v4, v17, v4 -; GCN-NEXT: v_or_b32_e32 v11, v11, v13 -; GCN-NEXT: v_or_b32_e32 v3, v12, v3 -; GCN-NEXT: v_or_b32_e32 v8, v8, v10 -; GCN-NEXT: v_or_b32_e32 v2, v9, v2 -; GCN-NEXT: v_or_b32_e32 v5, v5, v7 -; GCN-NEXT: v_or_b32_e32 v1, v6, v1 -; GCN-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f16_to_v40i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v41, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v11 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v47, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr9 +; 
SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_or_b32_e32 v24, v50, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_or_b32_e32 v19, v49, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; SI-NEXT: v_or_b32_e32 v12, v53, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_or_b32_e32 v11, v52, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v41 +; SI-NEXT: v_or_b32_e32 v9, v40, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_or_b32_e32 v10, v55, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 +; SI-NEXT: v_or_b32_e32 v7, v43, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_or_b32_e32 v8, v42, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v6, v46, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_or_b32_e32 v5, v45, v5 +; SI-NEXT: v_alignbit_b32 v26, v19, v24, 24 +; SI-NEXT: v_alignbit_b32 v30, v19, v24, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v24, 8 +; SI-NEXT: v_alignbit_b32 v25, v11, v12, 24 +; SI-NEXT: v_alignbit_b32 v27, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v12, 8 +; SI-NEXT: v_alignbit_b32 v18, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 
v21, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v23, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v13, v5, v6, 24 +; SI-NEXT: v_alignbit_b32 v14, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v17, v5, v6, 8 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_bfe_u32 v48, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v38, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v36, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v29, v1, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: .LBB60_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v6, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: 
v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v12, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_or_b32_e32 
v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v24, v13, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_or_b32_e32 v19, v15, v13 +; SI-NEXT: v_alignbit_b32 v26, v19, v24, 24 +; SI-NEXT: v_alignbit_b32 v30, v19, v24, 16 +; SI-NEXT: v_alignbit_b32 v32, v19, v24, 8 +; SI-NEXT: v_alignbit_b32 v25, v11, v12, 24 +; SI-NEXT: v_alignbit_b32 v27, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v12, 8 +; SI-NEXT: v_alignbit_b32 v18, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v23, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v13, v5, v6, 24 +; SI-NEXT: v_alignbit_b32 v14, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v17, v5, v6, 8 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 +; SI-NEXT: v_bfe_u32 v48, v22, 8, 8 +; SI-NEXT: v_bfe_u32 v38, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v36, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v29, v1, 8, 8 +; SI-NEXT: .LBB60_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v24, v24, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, 
v26 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v26, v26, v30 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v39 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v19, v19, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v48 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_or_b32_e32 v19, v19, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v19, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v31 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v25 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v19, v22, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v37 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v38 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v4, v12, v4 +; SI-NEXT: v_or_b32_e32 v4, v11, v4 +; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v4, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v28 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; 
SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v36 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v3, v9, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v15 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v13 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; 
SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v40i8: ; VI: ; %bb.0: @@ -7025,7 +23592,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: s_cbranch_execz .LBB60_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -7047,9 +23614,9 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v2 ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; 
VI-NEXT: .LBB18_2: ; %Flow +; VI-NEXT: .LBB60_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_4 +; VI-NEXT: s_cbranch_execz .LBB60_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v11, 0x200 ; VI-NEXT: v_add_f16_sdwa v23, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7112,7 +23679,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v35, v19, 8, 8 ; VI-NEXT: v_bfe_u32 v38, v21, 8, 8 ; VI-NEXT: v_bfe_u32 v48, v23, 8, 8 -; VI-NEXT: .LBB18_4: ; %end +; VI-NEXT: .LBB60_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -7222,7 +23789,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cbranch_execz .LBB60_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -7254,9 +23821,9 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB18_2: ; %Flow +; GFX9-NEXT: .LBB60_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_4 +; GFX9-NEXT: s_cbranch_execz .LBB60_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] @@ -7299,7 +23866,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB18_4: ; %end +; GFX9-NEXT: .LBB60_4: ; %end ; GFX9-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -7391,7 +23958,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB60_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -7413,9 +23980,9 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB18_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB60_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB60_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] @@ -7447,7 +24014,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB18_4: ; %end +; GFX11-TRUE16-NEXT: .LBB60_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -7585,7 +24152,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 +; 
GFX11-FAKE16-NEXT: s_cbranch_execz .LBB60_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -7617,9 +24184,9 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB18_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB60_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB60_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] @@ -7661,7 +24228,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB18_4: ; %end +; GFX11-FAKE16-NEXT: .LBB60_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -7776,310 +24343,1649 @@ end: ret <40 x i8> %phi } +define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f16_to_v40i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v16, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s26 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v43, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v47, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_or_b32_e32 v28, v15, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v24, v12, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v14, v33, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_or_b32_e32 v13, v20, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: v_or_b32_e32 v7, v53, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_or_b32_e32 v11, v50, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v5, v44, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_or_b32_e32 v6, v41, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v46, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: 
v_or_b32_e32 v3, v45, v3 +; SI-NEXT: v_alignbit_b32 v30, v24, v28, 24 +; SI-NEXT: v_alignbit_b32 v35, v24, v28, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v28, 8 +; SI-NEXT: v_alignbit_b32 v29, v13, v14, 24 +; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v36, v13, v14, 8 +; SI-NEXT: v_alignbit_b32 v23, v11, v7, 24 +; SI-NEXT: v_alignbit_b32 v26, v11, v7, 16 +; SI-NEXT: v_alignbit_b32 v32, v11, v7, 8 +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v27, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v17, v3, v4, 24 +; SI-NEXT: v_alignbit_b32 v18, v3, v4, 16 +; SI-NEXT: v_alignbit_b32 v22, v3, v4, 8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; SI-NEXT: v_bfe_u32 v42, v10, 8, 8 +; SI-NEXT: v_bfe_u32 v55, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v51, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v48, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v1, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v4, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v6, 
0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v50 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 +; SI-NEXT: v_or_b32_e32 v7, v13, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: 
v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_or_b32_e32 v28, v15, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v10 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_or_b32_e32 v24, v12, v15 +; SI-NEXT: v_alignbit_b32 v30, v24, v28, 24 +; SI-NEXT: v_alignbit_b32 v35, v24, v28, 16 +; SI-NEXT: v_alignbit_b32 v37, v24, v28, 8 +; SI-NEXT: v_alignbit_b32 v29, v13, v14, 24 +; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v36, v13, v14, 8 +; SI-NEXT: v_alignbit_b32 v23, v11, v7, 24 +; SI-NEXT: v_alignbit_b32 v26, v11, v7, 16 +; SI-NEXT: v_alignbit_b32 v32, v11, v7, 8 +; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v27, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v17, v3, v4, 24 +; SI-NEXT: v_alignbit_b32 v18, v3, v4, 16 +; SI-NEXT: v_alignbit_b32 v22, v3, v4, 8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; SI-NEXT: v_bfe_u32 v42, v10, 8, 8 +; SI-NEXT: v_bfe_u32 v55, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v51, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v48, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v1, 8, 8 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: v_and_b32_e32 v12, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v37 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v30 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 
8, v40 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v42 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v15, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v36 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v29 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v55 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v23 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v49 +; SI-NEXT: 
v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v51 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v27 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v38 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v48 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; 
implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v20f16_to_v40i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB61_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s41, s25, 24 +; VI-NEXT: s_lshr_b32 s59, s25, 16 +; VI-NEXT: s_lshr_b32 s26, s25, 8 +; VI-NEXT: s_lshr_b32 s60, s24, 16 +; VI-NEXT: s_lshr_b32 s27, s24, 8 +; VI-NEXT: s_lshr_b32 s43, s23, 24 +; VI-NEXT: s_lshr_b32 s61, s23, 16 +; VI-NEXT: s_lshr_b32 s28, s23, 8 +; VI-NEXT: s_lshr_b32 s62, s22, 16 +; VI-NEXT: s_lshr_b32 s29, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s63, s21, 16 +; VI-NEXT: s_lshr_b32 s40, s21, 8 +; VI-NEXT: s_lshr_b32 s72, s20, 16 +; VI-NEXT: s_lshr_b32 s42, s20, 8 +; VI-NEXT: s_lshr_b32 s57, s19, 24 +; VI-NEXT: s_lshr_b32 s73, s19, 16 +; VI-NEXT: s_lshr_b32 s44, s19, 8 +; VI-NEXT: s_lshr_b32 s74, s18, 16 +; VI-NEXT: s_lshr_b32 s45, s18, 8 +; VI-NEXT: s_lshr_b32 s58, s17, 24 +; VI-NEXT: s_lshr_b32 s75, s17, 16 +; VI-NEXT: s_lshr_b32 s47, s17, 8 +; VI-NEXT: s_lshr_b32 s76, s16, 16 +; VI-NEXT: s_lshr_b32 s56, s16, 8 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB61_4 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_e32 v8, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_add_f16_e32 v17, s17, v1 +; VI-NEXT: v_add_f16_e32 v12, s4, 
v1 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v39, v17, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; VI-NEXT: v_add_f16_e32 v22, s16, v1 +; VI-NEXT: v_add_f16_e32 v9, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v38, v22, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; VI-NEXT: v_add_f16_e32 v18, s19, v1 +; VI-NEXT: v_add_f16_e32 v13, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v36, v18, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_add_f16_e32 v23, s18, v1 +; VI-NEXT: v_add_f16_e32 v10, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v35, v23, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; VI-NEXT: v_add_f16_e32 v19, s21, v1 +; VI-NEXT: v_add_f16_e32 v14, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v33, v19, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; VI-NEXT: v_add_f16_e32 v24, s20, v1 +; VI-NEXT: v_add_f16_e32 v11, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v32, v24, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; VI-NEXT: v_add_f16_e32 v20, s23, v1 +; VI-NEXT: v_add_f16_e32 v15, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v30, v20, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; VI-NEXT: v_add_f16_e32 v25, s22, v1 +; VI-NEXT: v_add_f16_e32 v7, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v29, v25, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; VI-NEXT: v_add_f16_e32 v21, s25, v1 +; VI-NEXT: v_add_f16_e32 v16, s4, v1 +; VI-NEXT: v_or_b32_e32 v49, v21, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; VI-NEXT: v_add_f16_e32 v26, s24, v1 +; VI-NEXT: v_or_b32_e32 v48, v26, v2 +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[48:49] +; VI-NEXT: v_lshrrev_b64 v[2:3], 24, v[29:30] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[35:36] +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 
v28, 8, v48 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v38 +; VI-NEXT: v_bfe_u32 v6, v7, 8, 8 +; VI-NEXT: v_bfe_u32 v29, v11, 8, 8 +; VI-NEXT: v_bfe_u32 v32, v10, 8, 8 +; VI-NEXT: v_bfe_u32 v35, v9, 8, 8 +; VI-NEXT: v_bfe_u32 v38, v8, 8, 8 +; VI-NEXT: s_branch .LBB61_5 +; VI-NEXT: .LBB61_3: +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: s_branch .LBB61_2 +; VI-NEXT: .LBB61_4: +; VI-NEXT: v_mov_b32_e32 v12, s76 +; VI-NEXT: v_mov_b32_e32 v8, s75 +; VI-NEXT: v_mov_b32_e32 v13, s74 +; VI-NEXT: v_mov_b32_e32 v9, s73 +; VI-NEXT: v_mov_b32_e32 v14, s72 +; VI-NEXT: v_mov_b32_e32 v10, s63 +; VI-NEXT: v_mov_b32_e32 v15, s62 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: 
v_mov_b32_e32 v16, s60 +; VI-NEXT: v_mov_b32_e32 v7, s59 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v17, s17 +; VI-NEXT: v_mov_b32_e32 v23, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v24, s20 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v20, s23 +; VI-NEXT: v_mov_b32_e32 v26, s24 +; VI-NEXT: v_mov_b32_e32 v21, s25 +; VI-NEXT: v_mov_b32_e32 v38, s58 +; VI-NEXT: v_mov_b32_e32 v35, s57 +; VI-NEXT: v_mov_b32_e32 v32, s46 +; VI-NEXT: v_mov_b32_e32 v29, s43 +; VI-NEXT: v_mov_b32_e32 v6, s41 +; VI-NEXT: v_mov_b32_e32 v48, s56 +; VI-NEXT: v_mov_b32_e32 v39, s47 +; VI-NEXT: v_mov_b32_e32 v37, s45 +; VI-NEXT: v_mov_b32_e32 v36, s44 +; VI-NEXT: v_mov_b32_e32 v34, s42 +; VI-NEXT: v_mov_b32_e32 v33, s40 +; VI-NEXT: v_mov_b32_e32 v31, s29 +; VI-NEXT: v_mov_b32_e32 v30, s28 +; VI-NEXT: v_mov_b32_e32 v28, s27 +; VI-NEXT: v_mov_b32_e32 v27, s26 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v3, s8 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: .LBB61_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v22, v22, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v12, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v38 +; VI-NEXT: v_or_b32_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_add_u32_e32 v8, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v5, v23, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v35 +; VI-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v34 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v4, v24, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v32 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 20, 
v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v31 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v40i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s29, s25, 8 +; GFX9-NEXT: s_lshr_b32 s28, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 8 +; GFX9-NEXT: s_lshr_b32 s43, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s57, s21, 8 +; GFX9-NEXT: s_lshr_b32 s56, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s62, s19, 8 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s17, 8 +; GFX9-NEXT: s_lshr_b32 s74, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB61_4 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v1, 0x200 +; GFX9-NEXT: v_pk_add_f16 v10, s17, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s16, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s19, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s18, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s21, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s20, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s23, v1 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s22, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s25, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s24, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: s_branch .LBB61_5 +; GFX9-NEXT: .LBB61_3: +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; 
GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: s_branch .LBB61_2 +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: v_mov_b32_e32 v7, s18 +; GFX9-NEXT: v_mov_b32_e32 v8, s19 +; GFX9-NEXT: v_mov_b32_e32 v5, s20 +; GFX9-NEXT: v_mov_b32_e32 v6, s21 +; GFX9-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-NEXT: v_mov_b32_e32 v4, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: v_mov_b32_e32 v2, s25 +; GFX9-NEXT: v_mov_b32_e32 v39, s76 +; GFX9-NEXT: v_mov_b32_e32 v48, s74 +; GFX9-NEXT: v_mov_b32_e32 v38, s75 +; GFX9-NEXT: v_mov_b32_e32 v36, s73 +; GFX9-NEXT: v_mov_b32_e32 v37, s72 +; GFX9-NEXT: v_mov_b32_e32 v35, s63 +; GFX9-NEXT: v_mov_b32_e32 v34, s61 +; GFX9-NEXT: v_mov_b32_e32 v33, s62 +; GFX9-NEXT: v_mov_b32_e32 v31, s60 +; GFX9-NEXT: v_mov_b32_e32 v32, s59 +; GFX9-NEXT: v_mov_b32_e32 v30, s58 +; GFX9-NEXT: v_mov_b32_e32 v29, s56 +; GFX9-NEXT: v_mov_b32_e32 v28, s57 +; GFX9-NEXT: v_mov_b32_e32 v26, s47 +; GFX9-NEXT: v_mov_b32_e32 v27, s46 +; GFX9-NEXT: v_mov_b32_e32 v25, s45 +; GFX9-NEXT: v_mov_b32_e32 v24, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s44 +; GFX9-NEXT: v_mov_b32_e32 v21, s42 +; GFX9-NEXT: v_mov_b32_e32 v22, s41 +; GFX9-NEXT: v_mov_b32_e32 v20, s40 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 +; GFX9-NEXT: 
v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: .LBB61_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v10, v36, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v9, v34, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v8, v31, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v7, v29, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v28 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v5, v24, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v20f16_to_v40i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB61_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 8 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB61_4 +; GFX11-TRUE16-NEXT: .LBB61_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[9:10], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, 
v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v13 +; GFX11-TRUE16-NEXT: s_branch .LBB61_5 +; GFX11-TRUE16-NEXT: .LBB61_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB61_2 +; GFX11-TRUE16-NEXT: .LBB61_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s47 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, 
s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s4 +; GFX11-TRUE16-NEXT: .LBB61_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v32, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v29, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v30, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v36, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v20f16_to_v40i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB61_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, 
s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB61_4 +; GFX11-FAKE16-NEXT: .LBB61_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s20 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 8, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB61_5 +; GFX11-FAKE16-NEXT: .LBB61_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB61_2 +; GFX11-FAKE16-NEXT: .LBB61_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v4, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s20 :: v_dual_mov_b32 v2, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s63 :: v_dual_mov_b32 v39, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s62 :: v_dual_mov_b32 v37, s60 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v35, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s56 :: v_dual_mov_b32 v33, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s47 :: v_dual_mov_b32 v31, s46 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 
v30, s45 :: v_dual_mov_b32 v29, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s44 :: v_dual_mov_b32 v27, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v25, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s28 :: v_dual_mov_b32 v23, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s27 :: v_dual_mov_b32 v21, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v7, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s15 :: v_dual_mov_b32 v15, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: .LBB61_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v34, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v32, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v29, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v30 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v9, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v10, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v10, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v24, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v36, v37, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v19, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v3, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v4, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + define <20 
x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i8_to_v20f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 -; GCN-NEXT: 
v_lshlrev_b32_e32 v39, 8, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 
0xff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v56 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v57 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v58 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v59 -; GCN-NEXT: v_or_b32_e32 v0, v0, v36 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_or_b32_e32 v5, v5, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_or_b32_e32 v7, v7, v51 -; GCN-NEXT: v_or_b32_e32 v8, v8, v52 -; GCN-NEXT: v_or_b32_e32 v9, v9, v53 -; GCN-NEXT: v_or_b32_e32 v10, v10, v54 -; GCN-NEXT: v_or_b32_e32 v11, v11, v55 -; GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: v_or_b32_e32 v13, v13, v41 -; GCN-NEXT: v_or_b32_e32 v14, v14, v42 -; GCN-NEXT: v_or_b32_e32 v15, v15, v43 -; GCN-NEXT: v_or_b32_e32 v16, v16, v44 -; GCN-NEXT: v_or_b32_e32 v17, v17, v45 -; GCN-NEXT: v_or_b32_e32 v18, v18, v46 -; GCN-NEXT: v_or_b32_e32 v19, v19, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v34, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: .LBB19_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v59 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v28 -; 
GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v47, v1 -; GCN-NEXT: v_or_b32_e32 v3, v46, v3 -; GCN-NEXT: v_or_b32_e32 v5, v45, v5 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v9, v43, v9 -; GCN-NEXT: v_or_b32_e32 v11, v42, v11 -; GCN-NEXT: v_or_b32_e32 v13, v41, v13 -; GCN-NEXT: v_or_b32_e32 v15, v40, v15 -; GCN-NEXT: v_or_b32_e32 v17, v55, v17 -; GCN-NEXT: v_or_b32_e32 v19, v54, v19 -; GCN-NEXT: v_or_b32_e32 v18, v53, v18 -; GCN-NEXT: v_or_b32_e32 v16, v52, v16 -; GCN-NEXT: v_or_b32_e32 v14, v51, v14 -; GCN-NEXT: v_or_b32_e32 v12, v50, v12 -; GCN-NEXT: v_or_b32_e32 
v10, v49, v10 -; GCN-NEXT: v_or_b32_e32 v8, v48, v8 -; GCN-NEXT: v_or_b32_e32 v6, v39, v6 -; GCN-NEXT: v_or_b32_e32 v4, v38, v4 -; GCN-NEXT: v_or_b32_e32 v2, v37, v2 -; GCN-NEXT: v_or_b32_e32 v0, v36, v0 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x300, v1 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v20 -; GCN-NEXT: .LBB19_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v31 -; GCN-NEXT: 
v_mov_b32_e32 v2, v23 -; GCN-NEXT: v_mov_b32_e32 v4, v35 -; GCN-NEXT: v_mov_b32_e32 v6, v27 -; GCN-NEXT: v_mov_b32_e32 v8, v33 -; GCN-NEXT: v_mov_b32_e32 v10, v21 -; GCN-NEXT: v_mov_b32_e32 v12, v25 -; GCN-NEXT: v_mov_b32_e32 v14, v29 -; GCN-NEXT: v_mov_b32_e32 v16, v32 -; GCN-NEXT: v_mov_b32_e32 v18, v34 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i8_to_v20f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v31, v2 +; SI-NEXT: v_mov_b32_e32 v35, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v25 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 
+; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v33 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v6, v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v6, v6, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: 
v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v6, v6, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v6, v6, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v6, v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 +; SI-NEXT: v_or_b32_e32 v6, v6, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v38 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; 
implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: .LBB62_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; 
SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 
+; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: .LBB62_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], 
s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v6, v27 +; SI-NEXT: v_mov_b32_e32 v8, v33 +; SI-NEXT: v_mov_b32_e32 v10, v21 +; SI-NEXT: v_mov_b32_e32 v12, v25 +; SI-NEXT: v_mov_b32_e32 v14, v29 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: v_mov_b32_e32 v18, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i8_to_v20f16: ; VI: ; %bb.0: @@ -8140,7 +26046,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB62_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v36, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -8213,9 +26119,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: .LBB19_2: ; %Flow +; VI-NEXT: .LBB62_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_4 +; VI-NEXT: s_cbranch_execz .LBB62_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v55 ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -8291,7 +26197,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) 
{ ; VI-NEXT: v_or_b32_e32 v7, v12, v7 ; VI-NEXT: v_or_b32_e32 v8, v11, v8 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: .LBB19_4: ; %end +; VI-NEXT: .LBB62_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -8363,7 +26269,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB62_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v36, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -8437,9 +26343,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: .LBB19_2: ; %Flow +; GFX9-NEXT: .LBB62_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: s_cbranch_execz .LBB62_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 @@ -8514,7 +26420,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v7, v12, v7, s6 ; GFX9-NEXT: v_perm_b32 v8, v11, v8, s6 ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: .LBB19_4: ; %end +; GFX9-NEXT: .LBB62_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ 
-8590,15 +26496,15 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v37 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB62_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_4 -; GFX11-TRUE16-NEXT: .LBB19_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB62_4 +; GFX11-TRUE16-NEXT: .LBB62_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB19_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB62_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v21.h @@ -8681,8 +26587,8 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-TRUE16-NEXT: .LBB19_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2 +; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3 @@ -8817,15 +26723,15 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB62_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz 
.LBB19_4 -; GFX11-FAKE16-NEXT: .LBB19_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB62_4 +; GFX11-FAKE16-NEXT: .LBB62_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB19_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB62_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v35 @@ -8918,8 +26824,8 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-FAKE16-NEXT: .LBB19_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB62_2 +; GFX11-FAKE16-NEXT: .LBB62_4: ; %cmp.true ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v68, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v66, 3 @@ -9034,167 +26940,1259 @@ end: ret <20 x half> %phi } +define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i8_to_v20f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_readfirstlane_b32 s62, v25 +; SI-NEXT: v_readfirstlane_b32 s63, v24 +; SI-NEXT: v_readfirstlane_b32 s60, v23 +; SI-NEXT: v_readfirstlane_b32 s61, v22 +; SI-NEXT: v_readfirstlane_b32 s58, v21 +; SI-NEXT: v_readfirstlane_b32 s59, v20 +; SI-NEXT: v_readfirstlane_b32 s56, v19 +; SI-NEXT: v_readfirstlane_b32 s57, v18 +; SI-NEXT: v_readfirstlane_b32 s46, v17 +; SI-NEXT: v_readfirstlane_b32 s47, v16 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_readfirstlane_b32 s45, v14 +; SI-NEXT: v_readfirstlane_b32 s42, v13 +; SI-NEXT: v_readfirstlane_b32 s43, v12 +; SI-NEXT: v_readfirstlane_b32 s15, v11 +; SI-NEXT: 
v_readfirstlane_b32 s41, v10 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s40, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_readfirstlane_b32 s13, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v0 +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; 
SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s15, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s61, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: s_and_b32 s11, s40, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_or_b32 s6, s6, s11 +; SI-NEXT: s_and_b32 s11, s13, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: 
s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s28, 0xff +; SI-NEXT: s_lshl_b32 s11, s29, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s11, s26, 0xff +; SI-NEXT: s_lshl_b32 s13, s27, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_add_i32 s59, s59, 3 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: s_add_i32 s47, s47, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: s_and_b32 s13, s24, 0xff +; SI-NEXT: s_lshl_b32 s14, s25, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s61, 0xff +; SI-NEXT: s_lshl_b32 s60, s60, 8 +; SI-NEXT: s_and_b32 s59, s59, 0xff +; SI-NEXT: s_lshl_b32 s58, s58, 8 +; SI-NEXT: s_and_b32 s57, s57, 0xff +; SI-NEXT: s_lshl_b32 s56, s56, 8 +; SI-NEXT: s_and_b32 s47, s47, 0xff +; SI-NEXT: s_lshl_b32 s46, s46, 8 +; SI-NEXT: s_and_b32 s45, s45, 0xff +; SI-NEXT: s_lshl_b32 s44, s44, 8 +; SI-NEXT: s_and_b32 s43, s43, 0xff +; SI-NEXT: s_lshl_b32 s42, s42, 8 +; SI-NEXT: s_and_b32 s41, s41, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 8 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s22, 0xff +; SI-NEXT: s_lshl_b32 s22, s23, 8 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s5, s60, s5 +; SI-NEXT: s_or_b32 s58, s58, s59 +; SI-NEXT: s_or_b32 s56, s56, s57 +; SI-NEXT: s_or_b32 s46, s46, s47 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: 
s_or_b32 s42, s42, s43 +; SI-NEXT: s_or_b32 s15, s15, s41 +; SI-NEXT: s_or_b32 s14, s22, s14 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_addk_i32 s58, 0x300 +; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_addk_i32 s46, 0x300 +; SI-NEXT: s_addk_i32 s44, 0x300 +; SI-NEXT: s_addk_i32 s42, 0x300 +; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 
+; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB63_2 +; +; VI-LABEL: bitcast_v40i8_to_v20f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: v_mov_b32_e32 v31, v14 +; VI-NEXT: v_mov_b32_e32 v27, v12 +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v29, v8 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v30, v4 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; VI-NEXT: s_cbranch_scc0 .LBB63_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: 
s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v28, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_or_b32_sdwa v0, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v27, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB63_3 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v7, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v27 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v6, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; 
VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v29 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v5, v38, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: v_or_b32_sdwa v4, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v3, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v25, 
v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v26, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v2 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB63_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB63_2 +; +; GFX9-LABEL: bitcast_v40i8_to_v20f16_scalar: +; GFX9: ; %bb.0: 
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v30, v10 +; GFX9-NEXT: v_mov_b32_e32 v27, v8 +; GFX9-NEXT: v_mov_b32_e32 v28, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_mov_b32_e32 v31, v2 +; GFX9-NEXT: v_mov_b32_e32 v32, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; 
GFX9-NEXT: v_or_b32_sdwa v0, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, s7, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 
+; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB63_3 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v27 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v31 +; GFX9-NEXT: s_or_b32 s6, 
s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v3, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v3, v26, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v13, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v13, s4, v13 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v4, v12, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v11, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v9, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: .LBB63_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB63_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v20f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v27.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v23 +; GFX11-TRUE16-NEXT: s_and_b32 s11, 
s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v3, 16, v2 +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v7, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v12, 16, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB63_3 +; GFX11-TRUE16-NEXT: .LBB63_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 
+; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v38 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v32 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v17 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v12, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v11, 16, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v3, 16, v2 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v9, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: .LBB63_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB63_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB63_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v20f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v14 :: v_dual_mov_b32 v28, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v10 :: v_dual_mov_b32 v26, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v4 :: v_dual_mov_b32 v29, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; 
GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v2, 0xffff, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v24 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v31 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v18 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v9, v6, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v6, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v9, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v12, 16, v13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB63_3 +; GFX11-FAKE16-NEXT: .LBB63_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v27 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v35, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v33, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v23 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v18 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v25 +; 
GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v34, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v19, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v37, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v32, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v22, v4 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v31, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v10, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v6, 16, v7 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v11, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB63_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB63_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB63_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f16_to_v5f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v16, v18 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_4 -; GCN-NEXT: .LBB20_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB20_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_or_b32_e32 v0, v35, v0 -; GCN-NEXT: v_or_b32_e32 v1, v33, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; GCN-NEXT: v_or_b32_e32 v2, v29, v2 -; GCN-NEXT: v_or_b32_e32 v3, v27, v3 -; GCN-NEXT: v_or_b32_e32 v4, v25, v4 -; GCN-NEXT: v_or_b32_e32 v5, v23, v5 -; GCN-NEXT: v_or_b32_e32 v6, v21, v6 -; GCN-NEXT: v_or_b32_e32 v7, v20, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v16, v9 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; 
GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: .LBB20_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: 
v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v16, v17 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f16_to_v5f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; 
SI-NEXT: v_cvt_f16_f32_e32 v31, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_4 +; SI-NEXT: .LBB64_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB64_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_or_b32_e32 v4, v29, v4 +; SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v8, v21, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; 
implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: .LBB64_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; 
SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v5f64: ; VI: ; %bb.0: 
@@ -9203,7 +28201,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB64_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v10, 0x200 ; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9236,7 +28234,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v11 ; VI-NEXT: v_or_b32_e32 v0, v0, v10 -; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: .LBB64_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9247,7 +28245,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB64_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] @@ -9260,7 +28258,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: .LBB64_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9272,7 +28270,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: s_cbranch_execz .LBB64_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: 
v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] @@ -9284,7 +28282,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: .LBB64_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9304,122 +28302,434 @@ end: ret <5 x double> %phi } +define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f16_to_v5f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: 
v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v26, v4 +; SI-NEXT: v_or_b32_e32 v5, v24, v5 +; SI-NEXT: v_or_b32_e32 v6, v22, v6 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_or_b32_e32 v8, v18, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; 
SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; 
SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v20f16_to_v5f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB65_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_4 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_3: +; VI-NEXT: s_branch .LBB65_2 +; VI-NEXT: .LBB65_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v5f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_4 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_3: +; GFX9-NEXT: s_branch .LBB65_2 +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v5f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB65_4 +; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: 
v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_3: +; GFX11-NEXT: s_branch .LBB65_2 +; GFX11-NEXT: .LBB65_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f64_to_v20f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: 
$vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; 
%bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v24 -; GCN-NEXT: v_mov_b32_e32 v1, v29 -; GCN-NEXT: v_mov_b32_e32 v2, v20 -; GCN-NEXT: v_mov_b32_e32 v3, v28 -; GCN-NEXT: v_mov_b32_e32 v4, v21 -; GCN-NEXT: v_mov_b32_e32 v5, v27 -; GCN-NEXT: v_mov_b32_e32 v6, v22 -; GCN-NEXT: v_mov_b32_e32 v7, v26 -; GCN-NEXT: v_mov_b32_e32 v8, v23 -; GCN-NEXT: v_mov_b32_e32 v9, v25 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5f64_to_v20f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v4 +; SI-NEXT: 
v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: .LBB66_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: .LBB66_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v28 +; SI-NEXT: v_mov_b32_e32 v1, v29 
+; SI-NEXT: v_mov_b32_e32 v2, v27 +; SI-NEXT: v_mov_b32_e32 v3, v26 +; SI-NEXT: v_mov_b32_e32 v4, v25 +; SI-NEXT: v_mov_b32_e32 v5, v23 +; SI-NEXT: v_mov_b32_e32 v6, v24 +; SI-NEXT: v_mov_b32_e32 v7, v21 +; SI-NEXT: v_mov_b32_e32 v8, v22 +; SI-NEXT: v_mov_b32_e32 v9, v20 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v20f16: ; VI: ; %bb.0: @@ -9428,14 +28738,14 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9446,14 +28756,14 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB66_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: .LBB66_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9465,14 +28775,14 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; 
GFX11-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: .LBB66_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9492,167 +28802,385 @@ end: ret <20 x half> %phi } +define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f64_to_v20f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; 
%cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; 
implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB67_2 +; +; VI-LABEL: bitcast_v5f64_to_v20f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB67_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_4 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_3: +; VI-NEXT: s_branch .LBB67_2 +; VI-NEXT: .LBB67_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v20f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_4 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 
s[30:31] +; GFX9-NEXT: .LBB67_3: +; GFX9-NEXT: s_branch .LBB67_2 +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v20f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX11-NEXT: .LBB67_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB67_3: +; GFX11-NEXT: s_branch .LBB67_2 +; GFX11-NEXT: .LBB67_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 
v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f16_to_v5i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v18 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: 
s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_or_b32_e32 v0, v35, v0 -; GCN-NEXT: v_or_b32_e32 v1, v33, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; GCN-NEXT: v_or_b32_e32 v2, v29, v2 -; GCN-NEXT: v_or_b32_e32 v3, v27, v3 -; GCN-NEXT: v_or_b32_e32 v4, v25, v4 -; GCN-NEXT: v_or_b32_e32 v5, v23, v5 -; GCN-NEXT: v_or_b32_e32 v6, v21, v6 -; GCN-NEXT: v_or_b32_e32 v7, v20, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v16, v9 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: 
v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, 
v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v16, v17 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f16_to_v5i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB68_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB68_4 +; SI-NEXT: .LBB68_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB68_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_or_b32_e32 v4, v29, v4 +; SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v8, v21, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; 
implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: .LBB68_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, 
v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v25 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f16_to_v5i64: ; VI: ; %bb.0: @@ -9661,7 +29189,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB68_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v10, 0x200 ; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9694,7 +29222,7 @@ define <5 x i64> 
@bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v11 ; VI-NEXT: v_or_b32_e32 v0, v0, v10 -; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: .LBB68_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9705,7 +29233,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] @@ -9718,7 +29246,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: .LBB68_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9730,7 +29258,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] @@ -9742,7 +29270,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: .LBB68_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ 
-9762,136 +29290,448 @@ end: ret <5 x i64> %phi } +define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f16_to_v5i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v26, v4 +; SI-NEXT: v_or_b32_e32 v5, v24, v5 +; SI-NEXT: v_or_b32_e32 v6, v22, v6 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_or_b32_e32 v8, v18, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: s_cbranch_execnz 
.LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v20f16_to_v5i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB69_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB69_4 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 
s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s17, v0 +; VI-NEXT: 
s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_3: +; VI-NEXT: s_branch .LBB69_2 +; VI-NEXT: .LBB69_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v5i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB69_4 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_3: +; GFX9-NEXT: s_branch .LBB69_2 +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: 
v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v5i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB69_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB69_4 +; GFX11-NEXT: .LBB69_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB69_3: +; GFX11-NEXT: s_branch .LBB69_2 +; GFX11-NEXT: .LBB69_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 
:: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i64_to_v20f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v22, v9 -; GCN-NEXT: v_mov_b32_e32 v21, v8 -; GCN-NEXT: v_mov_b32_e32 v24, v7 -; GCN-NEXT: v_mov_b32_e32 v23, v6 -; GCN-NEXT: v_mov_b32_e32 v26, v5 -; GCN-NEXT: v_mov_b32_e32 v25, v4 -; GCN-NEXT: v_mov_b32_e32 v28, v3 -; GCN-NEXT: v_mov_b32_e32 v27, v2 -; GCN-NEXT: v_mov_b32_e32 v29, v1 -; GCN-NEXT: v_mov_b32_e32 v20, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; 
implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr21 -; 
GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v29, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v22, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5i64_to_v20f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v21, v9 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v25, v5 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_mov_b32_e32 v27, v3 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_4 +; SI-NEXT: .LBB70_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB70_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: .LBB70_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v21, vcc +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 
v19, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i64_to_v20f16: ; VI: ; %bb.0: @@ -9900,7 +29740,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB70_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc @@ -9912,7 +29752,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB70_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9923,7 +29763,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB70_2 ; GFX9-NEXT: ; %bb.1: ; 
%cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc @@ -9935,7 +29775,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB70_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9947,7 +29787,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9962,7 +29802,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB23_2: ; %end +; GFX11-NEXT: .LBB70_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9982,305 +29822,528 @@ end: ret <20 x half> %phi } +define inreg <20 x half> @bitcast_v5i64_to_v20f16_scalar(<5 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i64_to_v20f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_add_u32 s8, s18, 3 +; SI-NEXT: s_addc_u32 s9, s19, 0 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_lshr_b32 s11, s9, 16 +; SI-NEXT: s_add_u32 s12, s20, 3 +; SI-NEXT: s_addc_u32 s13, s21, 0 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: s_lshr_b32 s15, s13, 16 +; SI-NEXT: s_add_u32 s16, s22, 3 +; SI-NEXT: s_addc_u32 s17, s23, 0 +; SI-NEXT: s_lshr_b32 s18, s16, 16 +; SI-NEXT: s_lshr_b32 s19, s17, 16 +; SI-NEXT: s_add_u32 s20, s24, 3 +; SI-NEXT: s_addc_u32 s21, s25, 0 +; SI-NEXT: s_lshr_b32 s22, s20, 16 +; SI-NEXT: s_lshr_b32 s23, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; 
SI-NEXT: v_cvt_f32_f16_e32 v19, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v5i64_to_v20f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, 
s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v5i64_to_v20f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-LABEL: bitcast_v5i64_to_v20f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-NEXT: .LBB71_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: 
s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB71_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB71_4: +; GFX11-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i8_to_v5f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; 
GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v0 -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v47 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v46 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v45 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v44 -; GCN-NEXT: v_or_b32_e32 v0, v0, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v10, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v12, v12, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v16, v16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v20, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; 
GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v50, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v17, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v11, v19, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v13, v21, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v15, v23, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v17, v25, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v27, v22 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: v_or_b32_e32 v8, v16, v17 -; GCN-NEXT: v_or_b32_e32 v9, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: 
$vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 
v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v53, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v54, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v55, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v40, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v12, v41, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v16, v42, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v43, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v50, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v17, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_or_b32_e32 v11, v19, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v13, v21, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_or_b32_e32 v15, v23, v15 -; 
GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v17, v25, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v20 -; GCN-NEXT: v_or_b32_e32 v19, v27, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v9, v19, v18 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i8_to_v5f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v45, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v3, 
0xff, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v43, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v40 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v53, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v27 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v21, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: 
v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB72_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v45, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v43, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v54, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, 
v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v53, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v27, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v8, v23, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v21, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_or_b32_e32 v9, v19, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: 
v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: .LBB72_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i8_to_v5f64: ; VI: ; %bb.0: @@ -10342,7 +30405,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -10415,9 +30478,9 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; 
implicit-def: $vgpr17 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB72_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB72_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 @@ -10491,7 +30554,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB72_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -10565,7 +30628,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -10638,9 +30701,9 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB72_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB72_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 @@ 
-10714,7 +30777,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB72_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -10785,15 +30848,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_4 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_4 +; GFX11-TRUE16-NEXT: .LBB72_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB24_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB72_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l @@ -10906,8 +30969,8 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-TRUE16-NEXT: .LBB24_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2 +; GFX11-TRUE16-NEXT: .LBB72_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 @@ -11071,15 +31134,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB72_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_4 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB72_4 +; GFX11-FAKE16-NEXT: .LBB72_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB24_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB72_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -11192,8 +31255,8 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB72_2 +; GFX11-FAKE16-NEXT: .LBB72_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -11324,227 +31387,1354 @@ end: ret <5 x double> %phi } +define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i8_to_v5f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_mov_b32_e32 v34, v14 +; SI-NEXT: v_mov_b32_e32 v33, v12 
+; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v4 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v25 +; SI-NEXT: s_cbranch_scc0 .LBB73_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 
0xffff +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v0, v0, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB73_3 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: 
s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 
0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 
0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB73_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB73_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB73_2 +; +; VI-LABEL: bitcast_v40i8_to_v5f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: v_mov_b32_e32 v34, v14 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v31, v8 +; VI-NEXT: v_mov_b32_e32 v30, v6 +; VI-NEXT: v_mov_b32_e32 v29, v4 +; VI-NEXT: v_mov_b32_e32 v27, v2 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: s_cbranch_scc0 .LBB73_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 
s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_or_b32_sdwa v0, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v36 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB73_3 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, 
s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v28 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v27 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, 
v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB73_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; 
VI-NEXT: .LBB73_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB73_2 +; +; GFX9-LABEL: bitcast_v40i8_to_v5f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v14 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v32, v10 +; GFX9-NEXT: v_mov_b32_e32 v31, v8 +; GFX9-NEXT: v_mov_b32_e32 v30, v6 +; GFX9-NEXT: v_mov_b32_e32 v29, v4 +; GFX9-NEXT: v_mov_b32_e32 v27, v2 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, 
s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s7, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB73_3 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 
s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v28 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v34 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB73_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB73_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v5f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB73_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; 
GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; 
GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB73_3 +; GFX11-TRUE16-NEXT: .LBB73_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: 
s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v37 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v35 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v33 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v0, v6, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB73_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB73_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB73_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v5f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v6 :: v_dual_mov_b32 v24, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v2 :: v_dual_mov_b32 v26, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB73_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: 
s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v27 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v37 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v30 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 
s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v24 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB73_3 +; GFX11-FAKE16-NEXT: .LBB73_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: 
s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v23 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v34, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v35, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v31, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v37, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v36, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v24 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v33, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v22, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v17, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v19, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v32, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v21, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-FAKE16-NEXT: .LBB73_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB73_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB73_2 + 
%cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f64_to_v40i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; 
GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v16, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v17, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v18, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v19, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v30, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v16, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v17, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v18, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v19, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 24 -; GCN-NEXT: 
v_alignbit_b32 v30, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v38 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 
v19, 8, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; GCN-NEXT: v_or_b32_e32 v49, v49, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 8, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_or_b32_e32 v48, v51, v48 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 24, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; GCN-NEXT: v_or_b32_e32 v4, v4, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_or_b32_e32 v5, v5, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 -; GCN-NEXT: v_or_b32_e32 v6, v6, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v10, v10, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v22, v32, v35 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v25, v33, 
v39 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v30, v34, v36 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v18, v26, v27 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v15, v23, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v12, v20, v13 -; GCN-NEXT: v_or_b32_e32 v13, v21, v22 -; GCN-NEXT: v_or_b32_e32 v16, v24, v25 -; GCN-NEXT: v_or_b32_e32 v3, v3, v29 -; GCN-NEXT: v_or_b32_e32 v4, v4, v30 -; GCN-NEXT: v_or_b32_e32 v5, v5, v17 -; GCN-NEXT: v_or_b32_e32 v6, v6, v18 -; GCN-NEXT: v_or_b32_e32 v7, v7, v14 -; GCN-NEXT: v_or_b32_e32 v8, v8, v15 -; GCN-NEXT: v_or_b32_e32 v9, v9, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v12 -; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5f64_to_v40i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; 
implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: 
v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB74_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; 
SI-NEXT: .LBB74_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v32, v32, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v40i8: ; VI: ; %bb.0: @@ -11582,7 +32772,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -11614,9 +32804,9 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB74_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB74_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; VI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 @@ -11653,7 +32843,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; 
VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB74_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -11763,7 +32953,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB74_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -11795,9 +32985,9 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB74_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB74_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; GFX9-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 @@ -11834,7 +33024,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB74_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -11926,7 +33116,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; 
%cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -11948,9 +33138,9 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB25_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB74_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 @@ -11977,7 +33167,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB25_4: ; %end +; GFX11-TRUE16-NEXT: .LBB74_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -12115,7 +33305,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -12147,9 +33337,9 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB25_2: ; %Flow +; 
GFX11-FAKE16-NEXT: .LBB74_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB74_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 @@ -12186,7 +33376,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB25_4: ; %end +; GFX11-FAKE16-NEXT: .LBB74_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -12249,40 +33439,1234 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v11 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v17 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v33, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v25, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v24 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v20, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v17 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v33, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v25, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v20, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v18, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define 
inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f64_to_v40i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB75_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s24 +; SI-NEXT: v_alignbit_b32 v2, s25, v1, 24 +; SI-NEXT: v_alignbit_b32 v11, s25, v1, 16 +; SI-NEXT: v_alignbit_b32 v12, s25, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s22 +; SI-NEXT: v_alignbit_b32 v4, s23, v1, 24 +; SI-NEXT: v_alignbit_b32 v13, s23, v1, 16 +; SI-NEXT: v_alignbit_b32 v14, s23, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s20 +; SI-NEXT: v_alignbit_b32 v6, s21, v1, 24 +; SI-NEXT: v_alignbit_b32 v15, s21, v1, 16 +; SI-NEXT: v_alignbit_b32 v16, s21, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s18 +; SI-NEXT: v_alignbit_b32 v8, s19, v1, 24 +; SI-NEXT: v_alignbit_b32 v10, s19, v1, 16 +; SI-NEXT: v_alignbit_b32 v17, s19, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: v_alignbit_b32 v18, s17, v1, 24 +; SI-NEXT: v_alignbit_b32 v19, s17, v1, 16 +; SI-NEXT: v_alignbit_b32 v20, s17, v1, 8 +; SI-NEXT: s_lshr_b32 s6, s25, 24 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 8 +; SI-NEXT: s_lshr_b32 s9, s23, 24 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s15, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s28, s17, 24 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB75_4 +; SI-NEXT: .LBB75_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[22:23], 1.0 +; SI-NEXT: v_readfirstlane_b32 s25, v2 +; SI-NEXT: 
v_readfirstlane_b32 s23, v4 +; SI-NEXT: v_readfirstlane_b32 s21, v6 +; SI-NEXT: v_readfirstlane_b32 s19, v8 +; SI-NEXT: v_readfirstlane_b32 s17, v10 +; SI-NEXT: v_alignbit_b32 v2, s25, v1, 24 +; SI-NEXT: v_alignbit_b32 v11, s25, v1, 16 +; SI-NEXT: v_alignbit_b32 v12, s25, v1, 8 +; SI-NEXT: v_alignbit_b32 v4, s23, v3, 24 +; SI-NEXT: v_alignbit_b32 v13, s23, v3, 16 +; SI-NEXT: v_alignbit_b32 v14, s23, v3, 8 +; SI-NEXT: v_alignbit_b32 v6, s21, v5, 24 +; SI-NEXT: v_alignbit_b32 v15, s21, v5, 16 +; SI-NEXT: v_alignbit_b32 v16, s21, v5, 8 +; SI-NEXT: v_alignbit_b32 v8, s19, v7, 24 +; SI-NEXT: s_lshr_b32 s6, s25, 24 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 8 +; SI-NEXT: s_lshr_b32 s9, s23, 24 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s15, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s28, s17, 24 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v7, 16 +; SI-NEXT: v_alignbit_b32 v17, s19, v7, 8 +; SI-NEXT: v_alignbit_b32 v18, s17, v9, 24 +; SI-NEXT: v_alignbit_b32 v19, s17, v9, 16 +; SI-NEXT: v_alignbit_b32 v20, s17, v9, 8 +; SI-NEXT: s_branch .LBB75_5 +; SI-NEXT: .LBB75_3: +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; 
SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB75_2 +; SI-NEXT: .LBB75_4: +; SI-NEXT: v_mov_b32_e32 v1, s24 +; SI-NEXT: v_mov_b32_e32 v3, s22 +; SI-NEXT: v_mov_b32_e32 v5, s20 +; SI-NEXT: v_mov_b32_e32 v7, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: .LBB75_5: ; %end +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s29, 0xff +; SI-NEXT: v_or_b32_e32 v9, v9, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s28, 24 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v9, v9, v18 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v18, s4 +; SI-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v10 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_lshl_b32 
s5, s5, 16 +; SI-NEXT: s_lshl_b32 s15, s15, 24 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s15, s5 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v16 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v15 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s13, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v14 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s11, 8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v13 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s10, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff 
+; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v11 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v40i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB75_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s60, s24, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 8 +; VI-NEXT: s_lshr_b32 s29, s23, 24 +; VI-NEXT: s_lshr_b32 s40, s23, 16 +; VI-NEXT: s_lshr_b32 s41, s23, 8 +; VI-NEXT: s_lshr_b32 s62, s22, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 8 +; VI-NEXT: s_lshr_b32 s42, s21, 24 +; VI-NEXT: s_lshr_b32 s43, s21, 16 +; 
VI-NEXT: s_lshr_b32 s44, s21, 8 +; VI-NEXT: s_lshr_b32 s72, s20, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 8 +; VI-NEXT: s_lshr_b32 s45, s19, 24 +; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: s_lshr_b32 s47, s19, 8 +; VI-NEXT: s_lshr_b32 s74, s18, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 8 +; VI-NEXT: s_lshr_b32 s56, s17, 24 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 8 +; VI-NEXT: s_lshr_b32 s76, s16, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 8 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB75_4 +; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[3:4], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[7:8], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] +; VI-NEXT: v_readfirstlane_b32 s17, v10 +; VI-NEXT: v_readfirstlane_b32 s19, v8 +; VI-NEXT: v_readfirstlane_b32 s21, v6 +; VI-NEXT: v_readfirstlane_b32 s23, v4 +; VI-NEXT: v_readfirstlane_b32 s25, v2 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 +; VI-NEXT: s_lshr_b32 s29, s23, 24 +; VI-NEXT: s_lshr_b32 s40, s23, 16 +; VI-NEXT: s_lshr_b32 s41, s23, 8 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v3 +; VI-NEXT: s_lshr_b32 s42, s21, 24 +; VI-NEXT: s_lshr_b32 s43, s21, 16 +; VI-NEXT: s_lshr_b32 s44, s21, 8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v5 +; VI-NEXT: 
s_lshr_b32 s45, s19, 24 +; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: s_lshr_b32 s47, s19, 8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v7 +; VI-NEXT: s_lshr_b32 s56, s17, 24 +; VI-NEXT: s_lshr_b32 s57, s17, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v9 +; VI-NEXT: s_branch .LBB75_5 +; VI-NEXT: .LBB75_3: +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: s_branch .LBB75_2 +; VI-NEXT: .LBB75_4: +; VI-NEXT: v_mov_b32_e32 v9, s16 +; VI-NEXT: v_mov_b32_e32 v7, s18 +; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_mov_b32_e32 v3, s22 +; VI-NEXT: v_mov_b32_e32 v1, s24 +; VI-NEXT: v_mov_b32_e32 v25, s76 +; VI-NEXT: v_mov_b32_e32 v26, s75 +; VI-NEXT: v_mov_b32_e32 v23, s74 +; VI-NEXT: v_mov_b32_e32 v24, s73 +; VI-NEXT: v_mov_b32_e32 v21, s72 +; VI-NEXT: v_mov_b32_e32 v22, s63 +; VI-NEXT: v_mov_b32_e32 v19, s62 +; VI-NEXT: v_mov_b32_e32 v20, s61 
+; VI-NEXT: v_mov_b32_e32 v17, s60 +; VI-NEXT: v_mov_b32_e32 v18, s59 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v13, s8 +; VI-NEXT: v_mov_b32_e32 v12, s10 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: .LBB75_5: ; %end +; VI-NEXT: s_and_b32 s4, s17, 0xff +; VI-NEXT: s_lshl_b32 s5, s58, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s57, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v15 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v25, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s19, 0xff +; VI-NEXT: s_lshl_b32 s5, s47, 8 +; VI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s46, 0xff +; VI-NEXT: s_lshl_b32 s6, s45, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v23, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v4, s4 +; 
VI-NEXT: s_and_b32 s4, s21, 0xff +; VI-NEXT: s_lshl_b32 s5, s44, 8 +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s43, 0xff +; VI-NEXT: s_lshl_b32 s6, s42, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v13 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s23, 0xff +; VI-NEXT: s_lshl_b32 s5, s41, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s40, 0xff +; VI-NEXT: s_lshl_b32 s6, s29, 8 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 24, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_lshl_b32 s5, s28, 8 +; VI-NEXT: buffer_store_dword 
v3, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s27, 0xff +; VI-NEXT: s_lshl_b32 s6, s26, 8 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v40i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s28, s25, 8 +; GFX9-NEXT: s_lshr_b32 s60, s24, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 8 +; GFX9-NEXT: s_lshr_b32 s29, s23, 24 +; GFX9-NEXT: s_lshr_b32 s40, s23, 16 +; GFX9-NEXT: s_lshr_b32 s41, s23, 8 +; GFX9-NEXT: s_lshr_b32 s62, s22, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 8 +; GFX9-NEXT: s_lshr_b32 s42, s21, 24 +; GFX9-NEXT: s_lshr_b32 s43, s21, 16 +; GFX9-NEXT: s_lshr_b32 s44, s21, 8 +; GFX9-NEXT: s_lshr_b32 s72, s20, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 8 +; GFX9-NEXT: s_lshr_b32 s45, s19, 24 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: s_lshr_b32 s47, s19, 8 +; GFX9-NEXT: s_lshr_b32 s74, s18, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 8 +; GFX9-NEXT: s_lshr_b32 s56, s17, 24 +; GFX9-NEXT: s_lshr_b32 s57, 
s17, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 8 +; GFX9-NEXT: s_lshr_b32 s76, s16, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB75_4 +; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[7:8], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] +; GFX9-NEXT: v_readfirstlane_b32 s17, v10 +; GFX9-NEXT: v_readfirstlane_b32 s19, v8 +; GFX9-NEXT: v_readfirstlane_b32 s21, v6 +; GFX9-NEXT: v_readfirstlane_b32 s23, v4 +; GFX9-NEXT: v_readfirstlane_b32 s25, v2 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s28, s25, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v1 +; GFX9-NEXT: s_lshr_b32 s29, s23, 24 +; GFX9-NEXT: s_lshr_b32 s40, s23, 16 +; GFX9-NEXT: s_lshr_b32 s41, s23, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v3 +; GFX9-NEXT: s_lshr_b32 s42, s21, 24 +; GFX9-NEXT: s_lshr_b32 s43, s21, 16 +; GFX9-NEXT: s_lshr_b32 s44, s21, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v5 +; GFX9-NEXT: s_lshr_b32 s45, s19, 24 +; GFX9-NEXT: s_lshr_b32 s46, s19, 16 +; GFX9-NEXT: s_lshr_b32 s47, s19, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v7 +; GFX9-NEXT: s_lshr_b32 s56, s17, 24 +; GFX9-NEXT: s_lshr_b32 s57, s17, 16 +; 
GFX9-NEXT: s_lshr_b32 s58, s17, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v9 +; GFX9-NEXT: s_branch .LBB75_5 +; GFX9-NEXT: .LBB75_3: +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: s_branch .LBB75_2 +; GFX9-NEXT: .LBB75_4: +; GFX9-NEXT: v_mov_b32_e32 v9, s16 +; GFX9-NEXT: v_mov_b32_e32 v7, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s22 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: v_mov_b32_e32 v25, s76 +; GFX9-NEXT: v_mov_b32_e32 v26, s75 +; GFX9-NEXT: v_mov_b32_e32 v23, s74 +; GFX9-NEXT: v_mov_b32_e32 v24, s73 +; GFX9-NEXT: v_mov_b32_e32 v21, s72 +; GFX9-NEXT: v_mov_b32_e32 v22, s63 +; GFX9-NEXT: v_mov_b32_e32 v19, s62 +; GFX9-NEXT: v_mov_b32_e32 v20, s61 +; GFX9-NEXT: v_mov_b32_e32 v17, s60 +; GFX9-NEXT: v_mov_b32_e32 v18, s59 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; 
GFX9-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-NEXT: .LBB75_5: ; %end +; GFX9-NEXT: s_and_b32 s4, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s58, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s56, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v15 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s47, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s46, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s45, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v24 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s44, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s43, 
0xff +; GFX9-NEXT: s_lshl_b32 s6, s42, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v13 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s41, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v20 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s40, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s28, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s26, 8 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v5f64_to_v40i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s43, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s45 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX11-TRUE16-NEXT: .LBB75_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], s[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[1:2], s[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v11 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s17, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s21, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[10:11] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[4:5] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[12:13] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-TRUE16-NEXT: s_branch .LBB75_5 +; GFX11-TRUE16-NEXT: .LBB75_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB75_2 +; GFX11-TRUE16-NEXT: .LBB75_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s47 +; GFX11-TRUE16-NEXT: .LBB75_5: ; %end +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, 
s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s41 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s29 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v16 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v19, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_and_b32 v10, 0xff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; 
GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s23 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s24 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, 
s4, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[12:15], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[4:7], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v5f64_to_v40i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s45, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s45 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX11-FAKE16-NEXT: .LBB75_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], s[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[1:2], s[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v11 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s17, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s21, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[10:11] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[4:5] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[12:13] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 8, v12 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, 
s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 8 +; GFX11-FAKE16-NEXT: s_branch .LBB75_5 +; GFX11-FAKE16-NEXT: .LBB75_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB75_2 +; GFX11-FAKE16-NEXT: .LBB75_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v7, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v1, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v15, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v5, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v19, s63 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s12 :: v_dual_mov_b32 v17, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s62 :: v_dual_mov_b32 v13, s59 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s60 :: v_dual_mov_b32 v9, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s58 :: v_dual_mov_b32 v11, s56 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s47 :: v_dual_mov_b32 v6, s46 +; GFX11-FAKE16-NEXT: .LBB75_5: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s44, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s42, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s41, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s40, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v15, v19, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v13, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s26, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v15, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s23, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v15 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s28, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v9, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xff ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: 
s_lshl_b32 s2, s25, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v11 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s19, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s24, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s22, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s15, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s14, 8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v4, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, s1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_store_b128 
v0, v[1:4], off -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[12:15], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[1:2], off offset:32 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -12302,304 +34686,310 @@ end: } define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i8_to_v5i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 
s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 -; GCN-NEXT: 
v_and_b32_e32 v9, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v47 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v46 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v45 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v44 -; GCN-NEXT: v_or_b32_e32 v0, v0, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v4, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v6, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v8, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v10, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v12, v12, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v16, v16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v20, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v7, v50, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v9, v17, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v11, v19, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v13, v21, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v15, v23, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v17, v25, 
v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v27, v22 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: v_or_b32_e32 v3, v6, v7 -; GCN-NEXT: v_or_b32_e32 v4, v8, v9 -; GCN-NEXT: v_or_b32_e32 v5, v10, v11 -; GCN-NEXT: v_or_b32_e32 v6, v12, v13 -; GCN-NEXT: v_or_b32_e32 v7, v14, v15 -; GCN-NEXT: v_or_b32_e32 v8, v16, v17 -; GCN-NEXT: v_or_b32_e32 v9, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v44 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v53, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v6, v54, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v8, v55, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v10, v40, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v12, v41, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v16, v42, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v20, v43, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_or_b32_e32 v7, v50, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_or_b32_e32 v9, v17, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_or_b32_e32 v11, v19, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_or_b32_e32 v13, v21, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_or_b32_e32 v15, v23, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_or_b32_e32 v17, v25, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v20 -; GCN-NEXT: v_or_b32_e32 v19, v27, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; 
GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v9, v19, v18 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i8_to_v5i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, 
off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v43, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 +; SI-NEXT: 
v_lshlrev_b32_e32 v27, 8, v29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v45, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v43, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v42 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v40 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v5, v5, v54 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v53, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v27 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v21, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_or_b32_e32 v9, v9, v19 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; 
implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v45, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 +; SI-NEXT: 
v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v2, v44, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v43, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v5, v54, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v53, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v7, 
0xff, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v27, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v8, v23, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v21, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_or_b32_e32 v9, v19, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v17, v10 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 
offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i8_to_v5i64: ; VI: ; %bb.0: @@ -12661,7 +35051,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB76_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12734,9 +35124,9 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: .LBB76_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: s_cbranch_execz .LBB76_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v31 ; VI-NEXT: v_add_u16_e32 v1, 3, v32 @@ -12810,7 +35200,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 ; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: .LBB76_4: ; 
%end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -12884,7 +35274,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB76_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12957,9 +35347,9 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: .LBB76_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: s_cbranch_execz .LBB76_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 @@ -13033,7 +35423,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: .LBB76_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -13104,15 +35494,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_4 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_4 +; GFX11-TRUE16-NEXT: .LBB76_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB76_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l @@ -13225,8 +35615,8 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_2 +; GFX11-TRUE16-NEXT: .LBB76_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 @@ -13390,15 +35780,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB76_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_4 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz 
.LBB76_4 +; GFX11-FAKE16-NEXT: .LBB76_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB26_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB76_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 @@ -13511,8 +35901,8 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB76_2 +; GFX11-FAKE16-NEXT: .LBB76_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -13643,232 +36033,1359 @@ end: ret <5 x i64> %phi } +define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i8_to_v5i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: v_mov_b32_e32 v34, v14 +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v4 +; SI-NEXT: v_mov_b32_e32 v27, v2 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v49, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v19 
+; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v25 +; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v0, v0, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, 
s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; 
SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB77_2 +; +; VI-LABEL: bitcast_v40i8_to_v5i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: v_mov_b32_e32 v34, v14 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v31, v8 +; VI-NEXT: v_mov_b32_e32 v30, v6 +; VI-NEXT: v_mov_b32_e32 v29, v4 +; VI-NEXT: v_mov_b32_e32 v27, v2 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: s_cbranch_scc0 .LBB77_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 
8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_or_b32_sdwa v0, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB77_3 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v28 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: 
v_or_b32_sdwa v0, v51, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v27 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB77_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB77_2 +; +; GFX9-LABEL: bitcast_v40i8_to_v5i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v14 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v32, v10 +; GFX9-NEXT: v_mov_b32_e32 v31, v8 +; GFX9-NEXT: v_mov_b32_e32 v30, v6 +; GFX9-NEXT: v_mov_b32_e32 v29, v4 +; GFX9-NEXT: 
v_mov_b32_e32 v27, v2 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 
v3, s7, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB77_3 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v28 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v34 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: .LBB77_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB77_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v5i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB77_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 
s8, s9, s10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v10, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB77_3 +; GFX11-TRUE16-NEXT: .LBB77_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff 
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v37 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v35 
+; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v33 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB77_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB77_4: +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB77_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v5i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v6 :: v_dual_mov_b32 v24, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v2 :: v_dual_mov_b32 v26, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB77_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; 
GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v27 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v37 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v30 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v24 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v16 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB77_3 +; GFX11-FAKE16-NEXT: .LBB77_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; 
GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v23 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s24, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s25, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v34, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v35, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v31, v6 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v37, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v36, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v24 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v33, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v5, v22, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v17, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v19, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v32, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v21, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-FAKE16-NEXT: .LBB77_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB77_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB77_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i64_to_v40i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; 
GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v16, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GCN-NEXT: 
v_lshrrev_b32_e32 v18, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v16, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GCN-NEXT: 
v_lshrrev_b32_e32 v22, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v38 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 -; GCN-NEXT: v_or_b32_e32 v35, v49, v35 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 24, v28 -; GCN-NEXT: v_or_b32_e32 v48, v50, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-NEXT: v_and_b32_e32 v15, 
0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GCN-NEXT: v_or_b32_e32 v3, v3, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; GCN-NEXT: v_or_b32_e32 v4, v4, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GCN-NEXT: v_or_b32_e32 v5, v5, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v17 -; GCN-NEXT: v_or_b32_e32 v6, v6, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v7, v7, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v10, v10, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v35 -; GCN-NEXT: v_or_b32_e32 v19, v32, v33 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v25, v38, v39 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v27, v34, v36 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v21, v28, v30 -; GCN-NEXT: 
v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v15, v22, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v12, v17, v13 -; GCN-NEXT: v_or_b32_e32 v13, v18, v19 -; GCN-NEXT: v_or_b32_e32 v16, v24, v25 -; GCN-NEXT: v_or_b32_e32 v3, v3, v26 -; GCN-NEXT: v_or_b32_e32 v4, v4, v27 -; GCN-NEXT: v_or_b32_e32 v5, v5, v20 -; GCN-NEXT: v_or_b32_e32 v6, v6, v21 -; GCN-NEXT: v_or_b32_e32 v7, v7, v14 -; GCN-NEXT: v_or_b32_e32 v8, v8, v15 -; GCN-NEXT: v_or_b32_e32 v9, v9, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v12 -; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5i64_to_v40i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 
+; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 
8, v2 +; SI-NEXT: .LBB78_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: .LBB78_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: 
v_lshlrev_b32_e32 v35, 8, v35 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v32, v32, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v32 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v32, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, 
v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i64_to_v40i8: ; VI: ; %bb.0: @@ -13906,7 +37423,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB78_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -13938,9 +37455,9 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB78_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB78_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc @@ -13982,7 +37499,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: 
.LBB78_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -14092,7 +37609,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB78_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -14124,9 +37641,9 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB78_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB78_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 3, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc @@ -14168,7 +37685,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB78_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 @@ -14260,7 +37777,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ 
-14282,9 +37799,9 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB27_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB78_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, v3, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -14319,7 +37836,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-TRUE16-NEXT: .LBB27_4: ; %end +; GFX11-TRUE16-NEXT: .LBB78_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -14457,7 +37974,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] @@ -14489,9 +38006,9 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB27_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB78_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; 
GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB78_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, v3, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -14536,7 +38053,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 -; GFX11-FAKE16-NEXT: .LBB27_4: ; %end +; GFX11-FAKE16-NEXT: .LBB78_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -14651,24 +38168,1207 @@ end: ret <40 x i8> %phi } +define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i64_to_v40i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 +; SI-NEXT: s_lshr_b32 s6, s25, 24 +; SI-NEXT: s_lshr_b32 s7, s25, 
16 +; SI-NEXT: s_lshr_b32 s8, s25, 8 +; SI-NEXT: s_lshr_b32 s9, s23, 24 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s15, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s28, s17, 24 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: v_mov_b32_e32 v3, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 +; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 +; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 +; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 +; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 +; SI-NEXT: s_lshr_b32 s6, s25, 24 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 8 +; SI-NEXT: s_lshr_b32 s9, s23, 24 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 8 +; SI-NEXT: s_lshr_b32 s12, s21, 24 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: 
s_lshr_b32 s14, s21, 8 +; SI-NEXT: s_lshr_b32 s15, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s28, s17, 24 +; SI-NEXT: s_lshr_b32 s29, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s28, 24 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s15, s15, 24 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s15, s5 +; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s20, 
0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s13, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s11, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s10, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: 
v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; 
implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB79_2 +; +; VI-LABEL: bitcast_v5i64_to_v40i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB79_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s29, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s43, s23, 8 +; VI-NEXT: s_lshr_b32 s44, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s56, s21, 8 +; VI-NEXT: s_lshr_b32 s57, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 8 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB79_3 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; VI-NEXT: s_lshr_b64 
s[12:13], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s27, s25, 16 +; VI-NEXT: s_lshr_b32 s28, s25, 8 +; VI-NEXT: s_lshr_b32 s29, s24, 16 +; VI-NEXT: s_lshr_b32 s40, s24, 8 +; VI-NEXT: s_lshr_b32 s41, s23, 24 +; VI-NEXT: s_lshr_b32 s42, s23, 16 +; VI-NEXT: s_lshr_b32 s43, s23, 8 +; VI-NEXT: s_lshr_b32 s44, s22, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 8 +; VI-NEXT: s_lshr_b32 s46, s21, 24 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: s_lshr_b32 s56, s21, 8 +; VI-NEXT: s_lshr_b32 s57, s20, 16 +; VI-NEXT: s_lshr_b32 s58, s20, 8 +; VI-NEXT: s_lshr_b32 s59, s19, 24 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s19, 8 +; VI-NEXT: s_lshr_b32 s62, s18, 16 +; VI-NEXT: s_lshr_b32 s63, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s17, 24 +; VI-NEXT: s_lshr_b32 s73, s17, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s76, s16, 8 +; VI-NEXT: .LBB79_3: ; %end +; VI-NEXT: s_and_b32 s5, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, s76, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s75, 0xff +; VI-NEXT: s_lshl_b32 s9, s12, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_and_b32 s5, s17, 0xff +; VI-NEXT: s_lshl_b32 s7, s74, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s73, 0xff +; VI-NEXT: s_lshl_b32 s9, s72, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s62, 0xff +; VI-NEXT: s_lshl_b32 s9, s10, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; 
VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s19, 0xff +; VI-NEXT: s_lshl_b32 s7, s61, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s60, 0xff +; VI-NEXT: s_lshl_b32 s9, s59, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s7, s58, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s57, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s21, 0xff +; VI-NEXT: s_lshl_b32 s7, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s47, 0xff +; VI-NEXT: s_lshl_b32 s8, s46, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s45, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s7, s44, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s23, 0xff +; VI-NEXT: s_lshl_b32 s6, s43, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s42, 0xff 
+; VI-NEXT: s_lshl_b32 s7, s41, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s24, 0xff +; VI-NEXT: s_lshl_b32 s6, s40, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s29, 0xff +; VI-NEXT: s_lshl_b32 s4, s4, 8 +; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_lshl_b32 s5, s28, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s27, 0xff +; VI-NEXT: s_lshl_b32 s6, s26, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_4: +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: 
$sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: s_branch .LBB79_2 +; +; GFX9-LABEL: bitcast_v5i64_to_v40i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s28, s25, 8 +; GFX9-NEXT: s_lshr_b32 s29, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s23, 8 +; GFX9-NEXT: s_lshr_b32 s44, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 8 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s19, 8 +; GFX9-NEXT: s_lshr_b32 s62, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 8 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB79_3 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: 
s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s27, s25, 16 +; GFX9-NEXT: s_lshr_b32 s28, s25, 8 +; GFX9-NEXT: s_lshr_b32 s29, s24, 16 +; GFX9-NEXT: s_lshr_b32 s40, s24, 8 +; GFX9-NEXT: s_lshr_b32 s41, s23, 24 +; GFX9-NEXT: s_lshr_b32 s42, s23, 16 +; GFX9-NEXT: s_lshr_b32 s43, s23, 8 +; GFX9-NEXT: s_lshr_b32 s44, s22, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 8 +; GFX9-NEXT: s_lshr_b32 s46, s21, 24 +; GFX9-NEXT: s_lshr_b32 s47, s21, 16 +; GFX9-NEXT: s_lshr_b32 s56, s21, 8 +; GFX9-NEXT: s_lshr_b32 s57, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s59, s19, 24 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s19, 8 +; GFX9-NEXT: s_lshr_b32 s62, s18, 16 +; GFX9-NEXT: s_lshr_b32 s63, s18, 8 +; GFX9-NEXT: s_lshr_b32 s72, s17, 24 +; GFX9-NEXT: s_lshr_b32 s73, s17, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 8 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_lshr_b32 s76, s16, 8 +; GFX9-NEXT: .LBB79_3: ; %end +; GFX9-NEXT: s_and_b32 s5, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s76, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s75, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s12, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s74, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; 
GFX9-NEXT: s_and_b32 s7, s73, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s72, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s63, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s61, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s60, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s59, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s58, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s56, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s47, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s46, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s45, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s44, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s43, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s42, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s41, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s40, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_or_b32 s4, s6, s4 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s28, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s26, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr12 
+; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: s_branch .LBB79_2 +; +; GFX11-TRUE16-LABEL: bitcast_v5i64_to_v40i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 
s43, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB79_3 +; GFX11-TRUE16-NEXT: .LBB79_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s19, 8 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-TRUE16-NEXT: .LBB79_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s62 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s61 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s60 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s59 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s58 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s57 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 
s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s47 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s45 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s40 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; 
GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s25 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff 
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB79_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: 
; implicit-def: $sgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB79_2 +; +; GFX11-FAKE16-LABEL: bitcast_v5i64_to_v40i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s63, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-FAKE16-NEXT: 
s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s63 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB79_3 +; GFX11-FAKE16-NEXT: .LBB79_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s28, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s29, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s1, 
24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s0, 8 +; GFX11-FAKE16-NEXT: .LBB79_3: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s62, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s61, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s60, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s59, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s58, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s57, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s56, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s10, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s47, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s46, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s45, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s1, s44, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s42, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s41, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s40, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s29, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s27, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s24, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s4, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s3, s22, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s15, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s14, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB79_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr26 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB79_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v5f64_to_v5i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: .LBB28_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5f64_to_v5i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 
+; SI-NEXT: .LBB80_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v5i64: ; VI: ; %bb.0: @@ -14677,14 +39377,14 @@ define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB80_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB80_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14695,14 +39395,14 @@ define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB80_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB80_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -14714,14 +39414,14 @@ define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: s_cbranch_execz .LBB80_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; 
GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: .LBB80_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -14741,29 +39441,184 @@ end: ret <5 x i64> %phi } +define inreg <5 x i64> @bitcast_v5f64_to_v5i64_scalar(<5 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5f64_to_v5i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB81_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB81_4 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_3: +; SI-NEXT: s_branch .LBB81_2 +; SI-NEXT: .LBB81_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v5i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB81_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB81_4 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 
v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_3: +; VI-NEXT: s_branch .LBB81_2 +; VI-NEXT: .LBB81_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v5i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB81_4 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_3: +; GFX9-NEXT: s_branch .LBB81_2 +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; 
GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v5i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB81_4 +; GFX11-NEXT: .LBB81_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB81_3: +; GFX11-NEXT: s_branch .LBB81_2 +; GFX11-NEXT: .LBB81_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v5i64_to_v5f64: -; GCN: ; %bb.0: 
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v5i64_to_v5f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: .LBB82_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5i64_to_v5f64: ; VI: ; %bb.0: @@ -14772,7 +39627,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -14784,7 +39639,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -14795,7 +39650,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB82_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -14807,7 +39662,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -14819,7 +39674,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -14834,7 +39689,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v8, 
vcc_lo, v8, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: .LBB82_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -14853,3 +39708,149 @@ end: %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <5 x double> %phi } + +define inreg <5 x double> @bitcast_v5i64_to_v5f64_scalar(<5 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v5i64_to_v5f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s26, 0 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: s_branch .LBB83_2 +; +; VI-LABEL: bitcast_v5i64_to_v5f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s26, 0 +; VI-NEXT: s_cbranch_scc0 .LBB83_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB83_3 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: 
s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: .LBB83_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_4: +; VI-NEXT: s_branch .LBB83_2 +; +; GFX9-LABEL: bitcast_v5i64_to_v5f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s26, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB83_3 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: .LBB83_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: s_branch .LBB83_2 +; +; GFX11-LABEL: bitcast_v5i64_to_v5f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; 
GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB83_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB83_3 +; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: .LBB83_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB83_4: +; GFX11-NEXT: s_branch .LBB83_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 7f8b733038f1e..7b756bce857bc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -1,25 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck 
-check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define float @bitcast_i32_to_f32(i32 %a, i32 %b) { -; GCN-LABEL: bitcast_i32_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i32_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_f32: ; VI: ; %bb.0: @@ -77,20 +76,100 @@ end: ret float %phi } +define inreg float @bitcast_i32_to_f32_scalar(i32 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i32_to_f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; 
%end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_i32_to_f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_i32_to_f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_i32_to_f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to float + br label %end + +cmp.false: + %a3 = bitcast i32 %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true 
], [ %a3, %cmp.false ] + ret float %phi +} + define i32 @bitcast_f32_to_i32(float %a, i32 %b) { -; GCN-LABEL: bitcast_f32_to_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f32_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_i32: ; VI: ; %bb.0: @@ -148,30 +227,113 @@ end: ret i32 %phi } +define inreg i32 @bitcast_f32_to_i32_scalar(float inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f32_to_i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: 
v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast float %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + define <2 x i16> @bitcast_i32_to_v2i16(i32 %a, i32 %b) { -; GCN-LABEL: bitcast_i32_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: 
s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i32_to_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v2i16: ; VI: ; %bb.0: @@ -229,37 +391,122 @@ end: ret <2 x i16> %phi } +define inreg <2 x i16> @bitcast_i32_to_v2i16_scalar(i32 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i32_to_v2i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 
+; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_i32_to_v2i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_i32_to_v2i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_i32_to_v2i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 
to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + define i32 @bitcast_v2i16_to_i32(<2 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i16_to_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i16_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; 
SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_i32: ; VI: ; %bb.0: @@ -320,37 +567,130 @@ end: ret i32 %phi } +define inreg i32 @bitcast_v2i16_to_i32_scalar(<2 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i16_to_i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v2i16_to_i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: 
s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v2i16_to_i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + define <2 x half> @bitcast_i32_to_v2f16(i32 %a, i32 %b) { -; GCN-LABEL: bitcast_i32_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz 
.LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i32_to_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v2f16: ; VI: ; %bb.0: @@ -408,41 +748,129 @@ end: ret <2 x half> %phi } +define 
inreg <2 x half> @bitcast_i32_to_v2f16_scalar(i32 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i32_to_v2f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_i32_to_v2f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_i32_to_v2f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_i32_to_v2f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; 
%Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f16_to_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f16_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_i32: ; VI: ; %bb.0: @@ -504,35 +932,134 @@ end: ret i32 %phi } +define inreg i32 @bitcast_v2f16_to_i32_scalar(<2 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f16_to_i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; 
%cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v2f16_to_i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; 
GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + define <2 x bfloat> @bitcast_i32_to_v2bf16(i32 %a, i32 %b) { -; GCN-LABEL: bitcast_i32_to_v2bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_4 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB6_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: .LBB6_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i32_to_v2bf16: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_4 +; SI-NEXT: .LBB12_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB12_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: .LBB12_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v2bf16: ; VI: ; %bb.0: @@ -590,39 +1117,127 @@ end: ret <2 x bfloat> %phi } +define inreg <2 x bfloat> @bitcast_i32_to_v2bf16_scalar(i32 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i32_to_v2bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_i32_to_v2bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_i32_to_v2bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_i32_to_v2bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB13_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: +; GFX11-NEXT: .LBB13_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v2bf16_to_i32: -; GCN: ; 
%bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2bf16_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: 
$vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_i32: ; VI: ; %bb.0: @@ -631,7 +1246,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -651,7 +1266,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -662,7 +1277,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -681,7 +1296,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 -; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: .LBB14_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: 
s_setpc_b64 s[30:31] ; @@ -693,7 +1308,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -714,7 +1329,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -726,7 +1341,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -746,7 +1361,7 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -766,47 +1381,198 @@ end: ret i32 %phi } -define <1 x i32> @bitcast_i32_to_v1i32(i32 %a, i32 %b) { -; GCN-LABEL: 
bitcast_i32_to_v1i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_i32_to_v1i32: +define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2bf16_to_i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v2bf16_to_i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_3 +; VI-NEXT: ; %bb.1: ; %cmp.false 
+; VI-NEXT: s_cbranch_execnz .LBB15_4 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_3: +; VI-NEXT: s_branch .LBB15_2 +; VI-NEXT: .LBB15_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_i32_to_v1i32: +; GFX9-LABEL: bitcast_v2bf16_to_i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 
0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + +define <1 x i32> @bitcast_i32_to_v1i32(i32 %a, i32 %b) { +; SI-LABEL: bitcast_i32_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i32_to_v1i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end 
+; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i32_to_v1i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; ; GFX11-LABEL: bitcast_i32_to_v1i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -837,20 +1603,100 @@ end: ret <1 x i32> %phi } +define inreg <1 x i32> @bitcast_i32_to_v1i32_scalar(i32 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i32_to_v1i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_i32_to_v1i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_i32_to_v1i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; 
GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_i32_to_v1i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB17_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: +; GFX11-NEXT: .LBB17_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + define i32 @bitcast_v1i32_to_i32(<1 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v1i32_to_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v1i32_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_i32: ; VI: ; %bb.0: @@ -908,36 +1754,117 @@ end: ret i32 %phi } +define inreg i32 @bitcast_v1i32_to_i32_scalar(<1 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v1i32_to_i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v1i32_to_i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v1i32_to_i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-LABEL: bitcast_v1i32_to_i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 
0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB19_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: .LBB19_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) { -; GCN-LABEL: bitcast_i32_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; 
GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i32_to_v4i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i32_to_v4i8: ; VI: ; %bb.0: @@ -948,20 +1875,20 @@ define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB10_3 +; VI-NEXT: s_cbranch_execnz .LBB20_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB10_4 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB20_4 +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB10_3: ; %cmp.false +; VI-NEXT: .LBB20_3: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz 
.LBB10_2 -; VI-NEXT: .LBB10_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: .LBB20_4: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -978,20 +1905,20 @@ define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB10_3 +; GFX9-NEXT: s_cbranch_execnz .LBB20_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB10_4 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB20_4 +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB10_3: ; %cmp.false +; GFX9-NEXT: .LBB20_3: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 -; GFX9-NEXT: .LBB10_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: .LBB20_4: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -1036,20 +1963,20 @@ define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_4 -; GFX11-FAKE16-NEXT: .LBB10_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_4 +; GFX11-FAKE16-NEXT: .LBB20_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: 
s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB10_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB20_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_2 -; GFX11-FAKE16-NEXT: .LBB10_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: .LBB20_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -1074,52 +2001,209 @@ end: ret <4 x i8> %phi } +define inreg <4 x i8> @bitcast_i32_to_v4i8_scalar(i32 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i32_to_v4i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s6, s16, 24 +; SI-NEXT: s_lshr_b32 s7, s16, 16 +; SI-NEXT: s_lshr_b32 s8, s16, 8 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b32 s7, s16, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 24 +; SI-NEXT: s_lshr_b32 s8, s16, 8 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_i32_to_v4i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s7, s16, 16 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_cbranch_execnz 
.LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s7, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_i32_to_v4i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s6, s16, 24 +; GFX9-NEXT: s_lshr_b32 s7, s16, 16 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s7, s16, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 24 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-TRUE16-LABEL: bitcast_i32_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: 
s_cbranch_vccnz .LBB21_3 +; GFX11-TRUE16-NEXT: .LBB21_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-TRUE16-NEXT: .LBB21_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB21_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr1_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB21_2 +; +; GFX11-FAKE16-LABEL: bitcast_i32_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-FAKE16-NEXT: .LBB21_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: .LBB21_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB21_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr1 +; GFX11-FAKE16-NEXT: s_branch .LBB21_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i8_to_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_4 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB11_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: .LBB11_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i8_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_4 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB22_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: .LBB22_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_i32: ; VI: ; %bb.0: @@ 
-1131,14 +2215,14 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: s_cbranch_execnz .LBB22_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB11_4 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB22_4 +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB11_3: ; %cmp.false +; VI-NEXT: .LBB22_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1147,8 +2231,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: .LBB11_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: .LBB22_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v5 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1170,14 +2254,14 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: s_cbranch_execnz .LBB22_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB11_4 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB22_4 +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: 
s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB11_3: ; %cmp.false +; GFX9-NEXT: .LBB22_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1186,8 +2270,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: .LBB11_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: .LBB22_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1209,14 +2293,14 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-TRUE16-NEXT: .LBB11_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_4 +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB11_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 @@ -1232,8 +2316,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr1_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-TRUE16-NEXT: .LBB11_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v2.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1263,14 +2347,14 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-FAKE16-NEXT: .LBB11_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_4 +; GFX11-FAKE16-NEXT: .LBB22_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB11_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 @@ -1286,8 +2370,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-FAKE16-NEXT: .LBB11_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: .LBB22_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v5, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v2, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1323,30 +2407,200 @@ end: ret i32 %phi } +define inreg i32 
@bitcast_v4i8_to_i32_scalar(<4 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i8_to_i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s6, s4, 0x3000000 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v4i8_to_i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: 
s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_add_i32 s6, s4, 0x3000000 +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v4i8_to_i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v4i8_to_i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; 
GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_branch .LBB23_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + define <2 x i16> @bitcast_f32_to_v2i16(float %a, i32 %b) { -; GCN-LABEL: bitcast_f32_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB12_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB12_4 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB12_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: .LBB12_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f32_to_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB24_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB24_4 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB24_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: .LBB24_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2i16: ; VI: ; %bb.0: @@ -1404,37 +2658,124 @@ end: ret <2 x i16> %phi } +define inreg <2 x i16> @bitcast_f32_to_v2i16_scalar(float inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f32_to_v2i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB25_4 +; SI-NEXT: .LBB25_2: ; %cmp.true +; 
SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: .LBB25_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v2i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: +; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: .LBB25_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v2i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: +; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_v2i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br 
i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast float %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + define float @bitcast_v2i16_to_f32(<2 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i16_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_4 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB13_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: .LBB13_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i16_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: 
s_cbranch_execnz .LBB26_4 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB26_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: .LBB26_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_f32: ; VI: ; %bb.0: @@ -1495,37 +2836,130 @@ end: ret float %phi } +define inreg float @bitcast_v2i16_to_f32_scalar(<2 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i16_to_f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v2i16_to_f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; 
VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v2i16_to_f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_4 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_3: +; GFX9-NEXT: s_branch .LBB27_2 +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + define <2 x half> @bitcast_f32_to_v2f16(float %a, i32 %b) { -; GCN-LABEL: bitcast_f32_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: 
$vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB14_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB14_4 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB14_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: .LBB14_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f32_to_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB28_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB28_4 +; SI-NEXT: .LBB28_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB28_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: .LBB28_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: 
s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2f16: ; VI: ; %bb.0: @@ -1583,41 +3017,130 @@ end: ret <2 x half> %phi } +define inreg <2 x half> @bitcast_f32_to_v2f16_scalar(float inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f32_to_v2f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_f32_to_v2f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v2f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_v2f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast float %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f16_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_4 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB15_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: .LBB15_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, 
v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f16_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_4 +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB30_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_f32: ; VI: ; %bb.0: @@ -1679,35 +3202,134 @@ end: ret float %phi } +define inreg float @bitcast_v2f16_to_f32_scalar(<2 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f16_to_f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, s16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v2f16_to_f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_4 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_3: +; VI-NEXT: s_branch .LBB31_2 +; VI-NEXT: .LBB31_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_4 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_3: +; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: .LBB31_4: +; 
GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: s_branch .LBB31_2 +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + define <2 x bfloat> @bitcast_f32_to_v2bf16(float %a, i32 %b) { -; GCN-LABEL: bitcast_f32_to_v2bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_4 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB16_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: .LBB16_4: ; 
%cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f32_to_v2bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_4 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB32_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: .LBB32_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v2bf16: ; VI: ; %bb.0: @@ -1765,39 +3387,129 @@ end: ret <2 x bfloat> %phi } +define inreg <2 x bfloat> @bitcast_f32_to_v2bf16_scalar(float inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f32_to_v2bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: 
.LBB33_3: +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v2bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v2bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_v2bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 
1.000000e+00 + %a2 = bitcast float %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast float %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v2bf16_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_4 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB17_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: .LBB17_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2bf16_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_4 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB34_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: .LBB34_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_f32: ; VI: ; %bb.0: @@ -1806,7 +3518,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -1826,7 +3538,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1837,7 +3549,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; 
GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -1856,7 +3568,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1868,7 +3580,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -1889,7 +3601,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1901,7 +3613,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -1921,7 +3633,7 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1941,20 +3653,171 @@ end: ret float %phi } +define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2bf16_to_f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v2bf16_to_f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; 
VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_f32_scalar: +; GFX11: ; 
%bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 
to float + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + define <1 x i32> @bitcast_f32_to_v1i32(float %a, i32 %b) { -; GCN-LABEL: bitcast_f32_to_v1i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f32_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v1i32: ; VI: ; %bb.0: @@ -2012,20 +3875,102 @@ end: ret <1 x i32> %phi } +define inreg <1 x i32> @bitcast_f32_to_v1i32_scalar(float inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f32_to_v1i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_3: +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v1i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_3: +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v1i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_v1i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast float %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + define float @bitcast_v1i32_to_f32(<1 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v1i32_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v1i32_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_f32: ; VI: ; %bb.0: @@ -2083,36 +4028,117 @@ end: ret float %phi } +define inreg float @bitcast_v1i32_to_f32_scalar(<1 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v1i32_to_f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v1i32_to_f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch 
.LBB39_2 +; +; GFX9-LABEL: bitcast_v1i32_to_f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v1i32_to_f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) { -; GCN-LABEL: bitcast_f32_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB20_4 -; GCN-NEXT: .LBB20_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB20_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: .LBB20_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f32_to_v4i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_4 +; SI-NEXT: .LBB40_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB40_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: .LBB40_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f32_to_v4i8: ; VI: ; %bb.0: @@ -2123,20 +4149,20 @@ define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB20_3 +; VI-NEXT: s_cbranch_execnz .LBB40_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB20_4 -; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB40_4 +; VI-NEXT: .LBB40_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB20_3: ; %cmp.false +; VI-NEXT: .LBB40_3: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 -; VI-NEXT: .LBB20_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: .LBB40_4: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -2153,20 +4179,20 @@ define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB20_3 +; GFX9-NEXT: s_cbranch_execnz .LBB40_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB20_4 -; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB40_4 +; GFX9-NEXT: .LBB40_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB20_3: ; %cmp.false +; GFX9-NEXT: .LBB40_3: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 -; GFX9-NEXT: .LBB20_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: .LBB40_4: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 
24, v0 @@ -2211,20 +4237,20 @@ define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_4 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_4 +; GFX11-FAKE16-NEXT: .LBB40_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB20_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB40_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-FAKE16-NEXT: .LBB40_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -2249,52 +4275,213 @@ end: ret <4 x i8> %phi } +define inreg <4 x i8> @bitcast_f32_to_v4i8_scalar(float inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f32_to_v4i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s7, s16, 24 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_lshr_b32 s8, s16, 8 +; SI-NEXT: s_cbranch_execnz .LBB41_4 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 
+; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_3: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v4i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s7, s16, 24 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: .LBB41_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v4i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s7, s16, 24 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; 
implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_f32_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_f32_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 
s3, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast float %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i8_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 
s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i8_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_f32: ; VI: ; %bb.0: @@ -2306,14 +4493,14 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: s_cbranch_execnz .LBB42_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB21_4 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB42_4 +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB21_3: ; %cmp.false +; VI-NEXT: .LBB42_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2322,8 +4509,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 -; VI-NEXT: .LBB21_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: .LBB42_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v5 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2345,14 +4532,14 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: s_cbranch_execnz .LBB42_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB21_4 -; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB42_4 +; GFX9-NEXT: .LBB42_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB21_3: ; %cmp.false +; GFX9-NEXT: .LBB42_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2361,8 +4548,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 -; GFX9-NEXT: .LBB21_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: .LBB42_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2384,14 +4571,14 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_4 -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_4 +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB21_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 @@ -2407,8 +4594,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-TRUE16-NEXT: .LBB21_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v2.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2438,14 +4625,14 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_4 -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_4 +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 
s[30:31] -; GFX11-FAKE16-NEXT: .LBB21_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB42_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 @@ -2461,8 +4648,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-FAKE16-NEXT: .LBB21_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: .LBB42_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v5, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v2, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2498,38 +4685,208 @@ end: ret float %phi } +define inreg float @bitcast_v4i8_to_f32_scalar(<4 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i8_to_f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s6, s4, 0x3000000 +; 
SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v4i8_to_f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_add_i32 s6, s4, 0x3000000 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v4i8_to_f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; 
GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-LABEL: bitcast_v4i8_to_f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: 
.LBB43_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + define <2 x half> @bitcast_v2i16_to_v2f16(<2 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i16_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i16_to_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_v2f16: ; VI: ; %bb.0: @@ -2590,29 +4947,120 @@ end: ret <2 x half> %phi } +define inreg <2 x half> @bitcast_v2i16_to_v2f16_scalar(<2 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i16_to_v2f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v2i16_to_v2f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 
+; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v2i16_to_v2f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_4 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_3: +; GFX9-NEXT: s_branch .LBB45_2 +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_v2f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> 
%phi +} + define <2 x i16> @bitcast_v2f16_to_v2i16(<2 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f16_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f16_to_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v2i16: ; VI: ; %bb.0: @@ -2674,23 +5122,118 @@ end: ret <2 x i16> %phi } +define inreg <2 x i16> @bitcast_v2f16_to_v2i16_scalar(<2 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v2f16_to_v2i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v2f16_to_v2i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_e32 v1, s16, v0 +; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_v2i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: 
s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_v2i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i16_to_v2bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: .LBB24_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i16_to_v2bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_v2bf16: ; VI: ; %bb.0: @@ -2721,18 +5264,111 @@ define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) { ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2i16_to_v2bf16: +; GFX11-LABEL: bitcast_v2i16_to_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + +define inreg <2 x bfloat> @bitcast_v2i16_to_v2bf16_scalar(<2 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i16_to_v2bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: 
s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: s_lshl_b32 s4, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s16, 16 +; SI-NEXT: s_add_i32 s6, s5, 0x30000 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v2i16_to_v2bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v2i16_to_v2bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_v2bf16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; 
GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-NEXT: .LBB49_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: +; GFX11-NEXT: s_branch .LBB49_2 +; GFX11-NEXT: .LBB49_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2752,39 +5388,39 @@ end: } define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v2bf16_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; 
GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2bf16_to_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_v2i16: ; VI: ; %bb.0: @@ -2793,7 +5429,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -2813,7 +5449,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; 
VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2824,7 +5460,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -2843,7 +5479,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2855,7 +5491,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2878,7 +5514,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2890,7 +5526,7 @@ define <2 x i16> 
@bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -2910,7 +5546,7 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2930,37 +5566,187 @@ end: ret <2 x i16> %phi } +define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2bf16_to_v2i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v2bf16_to_v2i16_scalar: +; VI: ; 
%bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB51_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB51_4 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_3: +; VI-NEXT: s_branch .LBB51_2 +; VI-NEXT: .LBB51_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_v2i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB51_4 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: 
v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_3: +; GFX9-NEXT: s_branch .LBB51_2 +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_v2i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 
v3, v5, vcc_lo +; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_3: +; GFX11-NEXT: s_branch .LBB51_2 +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + define <1 x i32> @bitcast_v2i16_to_v1i32(<2 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i16_to_v1i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB26_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB26_4 -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB26_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: .LBB26_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i16_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v0 
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_4 +; SI-NEXT: .LBB52_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB52_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: .LBB52_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_v1i32: ; VI: ; %bb.0: @@ -3021,30 +5807,123 @@ end: ret <1 x i32> %phi } +define inreg <1 x i32> @bitcast_v2i16_to_v1i32_scalar(<2 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i16_to_v1i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v2i16_to_v1i32_scalar: +; VI: ; 
%bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v2i16_to_v1i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: +; GFX9-NEXT: s_branch .LBB53_2 +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_v1i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: +; GFX11-NEXT: s_branch .LBB53_2 +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to <1 x i32> + br label %end 
+ +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + define <2 x i16> @bitcast_v1i32_to_v2i16(<1 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v1i32_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_4 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB27_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: .LBB27_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v1i32_to_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_4 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB54_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: .LBB54_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_v2i16: ; VI: ; %bb.0: @@ -3102,48 
+5981,133 @@ end: ret <2 x i16> %phi } +define inreg <2 x i16> @bitcast_v1i32_to_v2i16_scalar(<1 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v1i32_to_v2i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v1i32_to_v2i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v1i32_to_v2i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-LABEL: bitcast_v1i32_to_v2i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 
vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB55_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: .LBB55_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i16_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB28_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB28_4 -; GCN-NEXT: .LBB28_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB28_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: v_bfe_u32 v3, v4, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: .LBB28_4: ; %cmp.true -; GCN-NEXT: 
v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: v_bfe_u32 v3, v3, 8, 8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i16_to_v4i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB56_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB56_4 +; SI-NEXT: .LBB56_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB56_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_bfe_u32 v3, v4, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: .LBB56_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_bfe_u32 v3, v3, 8, 8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i16_to_v4i8: ; VI: ; %bb.0: @@ -3156,21 +6120,21 @@ 
define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB28_3 +; VI-NEXT: s_cbranch_execnz .LBB56_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB28_4 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB56_4 +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB28_3: ; %cmp.false +; VI-NEXT: .LBB56_3: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 ; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: ; implicit-def: $vgpr4 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 -; VI-NEXT: .LBB28_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB56_2 +; VI-NEXT: .LBB56_4: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v1, 3 ; VI-NEXT: v_add_u16_sdwa v2, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v4 @@ -3190,20 +6154,20 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB28_3 +; GFX9-NEXT: s_cbranch_execnz .LBB56_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB28_4 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB56_4 +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB28_3: ; %cmp.false +; GFX9-NEXT: .LBB56_3: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 -; GFX9-NEXT: 
.LBB28_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB56_2 +; GFX9-NEXT: .LBB56_4: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -3248,20 +6212,20 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_4 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_4 +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB28_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB56_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 -; GFX11-FAKE16-NEXT: .LBB28_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 +; GFX11-FAKE16-NEXT: .LBB56_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -3286,55 +6250,228 @@ end: ret <4 x i8> %phi } +define inreg <4 x i8> @bitcast_v2i16_to_v4i8_scalar(<2 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i16_to_v4i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: 
s_or_b32 s6, s4, s5 +; SI-NEXT: s_lshr_b32 s7, s6, 8 +; SI-NEXT: s_and_b32 s8, s17, 0xffff +; SI-NEXT: s_bfe_u32 s9, s17, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_lshr_b32 s7, s6, 8 +; SI-NEXT: s_and_b32 s8, s17, 0xffff +; SI-NEXT: s_bfe_u32 s9, s17, 0x80008 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v2i16_to_v4i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s8, s16, 24 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s9, s16, 8 +; VI-NEXT: s_mov_b32 s7, s16 +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s5, s16, 16 +; VI-NEXT: s_add_i32 s7, s16, 3 +; VI-NEXT: s_add_i32 s6, s5, 3 +; VI-NEXT: s_and_b32 s4, s7, 0xffff +; VI-NEXT: s_lshl_b32 s5, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshr_b32 s9, s4, 8 +; VI-NEXT: s_bfe_u32 s8, s6, 0x80008 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: 
bitcast_v2i16_to_v4i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s7, s16, 24 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v2i16_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2i16_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], 
[ %a3, %cmp.false ] + ret <4 x i8> %phi +} + define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i8_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_4 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB29_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: .LBB29_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v4i8_to_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_4 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB58_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: .LBB58_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v2i16: ; VI: ; %bb.0: @@ -3346,14 +6483,14 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: s_cbranch_execnz .LBB58_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB29_4 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB58_4 +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB29_3: ; %cmp.false +; VI-NEXT: .LBB58_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3362,8 +6499,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 -; VI-NEXT: .LBB29_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB58_2 +; VI-NEXT: .LBB58_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v5 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3385,14 +6522,14 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: s_cbranch_execnz .LBB58_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB29_4 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB58_4 +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB29_3: ; %cmp.false +; GFX9-NEXT: .LBB58_3: ; %cmp.false ; GFX9-NEXT: 
v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3401,8 +6538,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 -; GFX9-NEXT: .LBB29_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB58_2 +; GFX9-NEXT: .LBB58_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3424,14 +6561,14 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_4 -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_4 +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB29_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 @@ -3447,8 +6584,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-TRUE16-NEXT: .LBB29_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 +; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v2.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -3478,14 +6615,14 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_4 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_4 +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB29_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 @@ -3501,8 +6638,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 -; GFX11-FAKE16-NEXT: .LBB29_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 +; GFX11-FAKE16-NEXT: .LBB58_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v5, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v2, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -3538,42 +6675,216 @@ end: ret <2 x i16> %phi } +define inreg <2 x i16> @bitcast_v4i8_to_v2i16_scalar(<4 x i8> inreg %a, i32 inreg %b) 
{ +; SI-LABEL: bitcast_v4i8_to_v2i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s19, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_or_b32 s6, s4, s6 +; SI-NEXT: s_and_b32 s7, s5, 0xffff +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s19, 8 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_or_b32 s6, s4, s6 +; SI-NEXT: s_and_b32 s7, s5, 0xffff +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v4i8_to_v2i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: 
s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_add_i32 s6, s4, 0x3000000 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v4i8_to_v2i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_cbranch_execnz .LBB59_3 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: .LBB59_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB59_2 +; +; GFX11-LABEL: bitcast_v4i8_to_v2i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_3 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: .LBB59_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_branch .LBB59_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f16_to_v2bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v2, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_4 -; GCN-NEXT: .LBB30_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB30_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_2 -; GCN-NEXT: .LBB30_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f16_to_v2bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_4 +; SI-NEXT: .LBB60_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB60_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; 
implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: .LBB60_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v2bf16: ; VI: ; %bb.0: @@ -3587,36 +6898,136 @@ define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v1, 0x200, v0 ; VI-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; 
%bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + +define inreg <2 x bfloat> @bitcast_v2f16_to_v2bf16_scalar(<2 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f16_to_v2bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s17 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v2f16_to_v2bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB61_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB61_4 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_e32 v1, s16, v0 +; VI-NEXT: v_add_f16_sdwa v0, v2, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB61_3: +; VI-NEXT: s_branch .LBB61_2 +; VI-NEXT: .LBB61_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v2f16_to_v2bf16: +; GFX9-LABEL: bitcast_v2f16_to_v2bf16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_movk_i32 s6, 0x200 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB61_4 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB61_3: +; GFX9-NEXT: s_branch .LBB61_2 +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2f16_to_v2bf16: +; GFX11-LABEL: bitcast_v2f16_to_v2bf16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB61_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, 
exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB61_4 +; GFX11-NEXT: .LBB61_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB61_3: +; GFX11-NEXT: s_branch .LBB61_2 +; GFX11-NEXT: .LBB61_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3636,43 +7047,43 @@ end: } define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v2bf16_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_4 -; GCN-NEXT: .LBB31_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB31_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_2 -; GCN-NEXT: .LBB31_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v2bf16_to_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_4 +; SI-NEXT: .LBB62_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB62_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: .LBB62_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_v2f16: ; VI: ; %bb.0: @@ -3681,7 +7092,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB31_2 +; VI-NEXT: s_cbranch_execz .LBB62_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -3701,7 +7112,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 
v0, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; VI-NEXT: .LBB31_2: ; %end +; VI-NEXT: .LBB62_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3712,7 +7123,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB31_2 +; GFX9-NEXT: s_cbranch_execz .LBB62_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -3731,7 +7142,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 -; GFX9-NEXT: .LBB31_2: ; %end +; GFX9-NEXT: .LBB62_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3743,7 +7154,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -3764,7 +7175,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 -; GFX11-TRUE16-NEXT: .LBB31_2: ; %end +; GFX11-TRUE16-NEXT: .LBB62_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3776,7 +7187,7 @@ define <2 x 
half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB62_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -3796,7 +7207,7 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB31_2: ; %end +; GFX11-FAKE16-NEXT: .LBB62_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3816,41 +7227,198 @@ end: ret <2 x half> %phi } +define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2bf16_to_v2f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s17 +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; 
SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB63_2 +; +; VI-LABEL: bitcast_v2bf16_to_v2f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB63_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB63_4 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_3: +; VI-NEXT: s_branch .LBB63_2 +; VI-NEXT: .LBB63_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_v2f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB63_4 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_3: +; GFX9-NEXT: s_branch .LBB63_2 +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_v2f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX11-NEXT: .LBB63_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 
0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB63_3: +; GFX11-NEXT: s_branch .LBB63_2 +; GFX11-NEXT: .LBB63_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f16_to_v1i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_4 -; GCN-NEXT: .LBB32_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB32_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: 
s_cbranch_execz .LBB32_2 -; GCN-NEXT: .LBB32_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f16_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_4 +; SI-NEXT: .LBB64_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB64_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: .LBB64_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v1i32: ; VI: ; %bb.0: @@ -3912,37 +7480,136 @@ end: ret <1 x i32> %phi } +define inreg <1 x i32> @bitcast_v2f16_to_v1i32_scalar(<2 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f16_to_v1i32_scalar: +; SI: ; %bb.0: 
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v2f16_to_v1i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB65_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_4 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_3: +; VI-NEXT: s_branch .LBB65_2 +; VI-NEXT: .LBB65_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_v1i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_4 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; 
GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_3: +; GFX9-NEXT: s_branch .LBB65_2 +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_v1i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB65_4 +; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_3: +; GFX11-NEXT: s_branch .LBB65_2 +; GFX11-NEXT: .LBB65_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + define <2 x half> @bitcast_v1i32_to_v2f16(<1 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v1i32_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_4 -; GCN-NEXT: .LBB33_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB33_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_2 -; GCN-NEXT: .LBB33_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v1i32_to_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_4 +; SI-NEXT: .LBB66_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB66_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: .LBB66_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_v2f16: ; VI: ; %bb.0: @@ -4000,46 +7667,135 @@ end: ret <2 x half> %phi } +define inreg <2 x half> @bitcast_v1i32_to_v2f16_scalar(<1 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v1i32_to_v2f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB67_2 +; +; VI-LABEL: bitcast_v1i32_to_v2f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB67_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_3 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB67_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_4: +; VI-NEXT: s_branch .LBB67_2 +; +; GFX9-LABEL: bitcast_v1i32_to_v2f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_3 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB67_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: s_branch .LBB67_2 +; +; GFX11-LABEL: bitcast_v1i32_to_v2f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB67_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB67_3: +; GFX11-NEXT: .LBB67_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f16_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB34_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB34_4 -; GCN-NEXT: .LBB34_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB34_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_2 -; GCN-NEXT: .LBB34_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GCN-NEXT: s_or_b64 exec, exec, 
s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f16_to_v4i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB68_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB68_4 +; SI-NEXT: .LBB68_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB68_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: .LBB68_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f16_to_v4i8: ; VI: ; %bb.0: @@ -4050,19 +7806,19 @@ define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB34_3 +; VI-NEXT: s_cbranch_execnz .LBB68_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB34_4 -; 
VI-NEXT: .LBB34_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB68_4 +; VI-NEXT: .LBB68_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB34_3: ; %cmp.false +; VI-NEXT: .LBB68_3: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_2 -; VI-NEXT: .LBB34_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB68_2 +; VI-NEXT: .LBB68_4: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v1, 0x200 ; VI-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 @@ -4082,20 +7838,20 @@ define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB34_3 +; GFX9-NEXT: s_cbranch_execnz .LBB68_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB34_4 -; GFX9-NEXT: .LBB34_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB68_4 +; GFX9-NEXT: .LBB68_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB34_3: ; %cmp.false +; GFX9-NEXT: .LBB68_3: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_2 -; GFX9-NEXT: .LBB34_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB68_2 +; GFX9-NEXT: .LBB68_4: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4141,20 +7897,20 @@ define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB68_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_4 -; GFX11-FAKE16-NEXT: .LBB34_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB68_4 +; GFX11-FAKE16-NEXT: .LBB68_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB34_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB68_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 -; GFX11-FAKE16-NEXT: .LBB34_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_2 +; GFX11-FAKE16-NEXT: .LBB68_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4179,51 +7935,221 @@ end: ret <4 x i8> %phi } +define inreg <4 x i8> @bitcast_v2f16_to_v4i8_scalar(<2 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f16_to_v4i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; 
SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v2f16_to_v4i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB69_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s8, s16, 16 +; VI-NEXT: s_lshr_b32 s7, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB69_4 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_add_f16_e32 v2, s4, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_3: +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB69_2 +; VI-NEXT: .LBB69_4: +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_v4i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s7, s16, 24 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB69_4 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_3: +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: s_branch .LBB69_2 +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v2f16_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_4 +; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB69_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB69_2 +; GFX11-TRUE16-NEXT: .LBB69_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2f16_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_4 +; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB69_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB69_2 +; GFX11-FAKE16-NEXT: .LBB69_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i8_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: 
$vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_4 -; GCN-NEXT: .LBB35_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB35_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_2 -; GCN-NEXT: .LBB35_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v3, v0 -; GCN-NEXT: v_or_b32_e32 v1, v4, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i8_to_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_4 +; SI-NEXT: .LBB70_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, 
s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB70_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: .LBB70_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v2f16: ; VI: ; %bb.0: @@ -4235,14 +8161,14 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: s_cbranch_execnz .LBB70_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_4 -; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB70_4 +; VI-NEXT: .LBB70_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB35_3: ; %cmp.false +; VI-NEXT: .LBB70_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD @@ -4251,8 +8177,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_2 -; VI-NEXT: .LBB35_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB70_2 +; VI-NEXT: .LBB70_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v5 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4274,14 +8200,14 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: s_cbranch_execnz .LBB70_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_4 -; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB70_4 +; GFX9-NEXT: .LBB70_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB35_3: ; %cmp.false +; GFX9-NEXT: .LBB70_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4290,8 +8216,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_2 -; GFX9-NEXT: .LBB35_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB70_2 +; GFX9-NEXT: .LBB70_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4313,14 +8239,14 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-TRUE16-NEXT: .LBB35_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-TRUE16-NEXT: .LBB70_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 @@ -4336,8 +8262,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-TRUE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v2.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4367,14 +8293,14 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; 
GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-FAKE16-NEXT: .LBB35_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-FAKE16-NEXT: .LBB70_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 @@ -4390,8 +8316,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-FAKE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-FAKE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v5, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v2, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4427,39 +8353,207 @@ end: ret <2 x half> %phi } +define inreg <2 x half> @bitcast_v4i8_to_v2f16_scalar(<4 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i8_to_v2f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s16, 0xff +; 
SI-NEXT: s_lshl_b32 s6, s17, 8 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v4i8_to_v2f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_add_i32 s6, s4, 0x3000000 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v4i8_to_v2f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: 
s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-LABEL: bitcast_v4i8_to_v2f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-NEXT: .LBB71_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; 
GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: .LBB71_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB71_4: +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v2bf16_to_v1i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB36_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB36_4 -; GCN-NEXT: .LBB36_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB36_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_2 -; GCN-NEXT: .LBB36_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2bf16_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB72_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB72_4 +; SI-NEXT: .LBB72_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB72_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_2 +; SI-NEXT: .LBB72_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_v1i32: ; VI: ; %bb.0: @@ -4468,7 +8562,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -4488,7 +8582,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; 
VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: .LBB72_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4499,7 +8593,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -4518,7 +8612,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 -; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: .LBB72_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4530,7 +8624,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -4551,7 +8645,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 -; GFX11-TRUE16-NEXT: .LBB36_2: ; %end +; GFX11-TRUE16-NEXT: .LBB72_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4563,7 +8657,7 @@ define <1 x i32> 
@bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -4583,7 +8677,7 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB36_2: ; %end +; GFX11-FAKE16-NEXT: .LBB72_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4603,35 +8697,187 @@ end: ret <1 x i32> %phi } +define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2bf16_to_v1i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: s_cbranch_scc0 .LBB73_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: s_cbranch_execnz .LBB73_3 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: .LBB73_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB73_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB73_2 +; +; VI-LABEL: bitcast_v2bf16_to_v1i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB73_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB73_4 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB73_3: +; VI-NEXT: s_branch .LBB73_2 +; VI-NEXT: .LBB73_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_v1i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB73_4 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; 
GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB73_3: +; GFX9-NEXT: s_branch .LBB73_2 +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_v1i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-NEXT: .LBB73_2: ; %cmp.true +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v5 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB73_3: +; GFX11-NEXT: s_branch .LBB73_2 +; GFX11-NEXT: .LBB73_4: +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + define <2 x bfloat> @bitcast_v1i32_to_v2bf16(<1 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v1i32_to_v2bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_4 -; GCN-NEXT: .LBB37_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB37_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_2 -; GCN-NEXT: .LBB37_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: 
s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v1i32_to_v2bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_4 +; SI-NEXT: .LBB74_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB74_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: .LBB74_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v1i32_to_v2bf16: ; VI: ; %bb.0: @@ -4689,46 +8935,134 @@ end: ret <2 x bfloat> %phi } +define inreg <2 x bfloat> @bitcast_v1i32_to_v2bf16_scalar(<1 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v1i32_to_v2bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB75_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB75_3 +; SI-NEXT: .LBB75_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s16, 16 +; SI-NEXT: .LBB75_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB75_4: +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: 
s_branch .LBB75_2 +; +; VI-LABEL: bitcast_v1i32_to_v2bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB75_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB75_3 +; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB75_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB75_4: +; VI-NEXT: s_branch .LBB75_2 +; +; GFX9-LABEL: bitcast_v1i32_to_v2bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB75_3 +; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB75_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB75_4: +; GFX9-NEXT: s_branch .LBB75_2 +; +; GFX11-LABEL: bitcast_v1i32_to_v2bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_vccz .LBB75_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB75_3: +; GFX11-NEXT: .LBB75_4: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] 
+ ret <2 x bfloat> %phi +} + define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v2bf16_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB38_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB38_4 -; GCN-NEXT: .LBB38_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB38_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GCN-NEXT: v_alignbit_b32 v0, v2, v5, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_2 -; GCN-NEXT: .LBB38_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2bf16_to_v4i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: 
$vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB76_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB76_4 +; SI-NEXT: .LBB76_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB76_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: .LBB76_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2bf16_to_v4i8: ; VI: ; %bb.0: @@ -4739,20 +9073,20 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB38_3 +; VI-NEXT: s_cbranch_execnz .LBB76_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB38_4 -; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB76_4 +; VI-NEXT: .LBB76_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB38_3: ; %cmp.false +; VI-NEXT: .LBB76_3: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] -; VI-NEXT: s_cbranch_execz .LBB38_2 -; VI-NEXT: .LBB38_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB76_2 +; VI-NEXT: .LBB76_4: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -4786,20 +9120,20 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB38_3 +; GFX9-NEXT: s_cbranch_execnz .LBB76_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB38_4 -; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB76_4 +; GFX9-NEXT: .LBB76_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB38_3: ; %cmp.false +; GFX9-NEXT: .LBB76_3: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB38_2 -; GFX9-NEXT: .LBB38_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB76_2 +; GFX9-NEXT: .LBB76_4: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -4845,7 +9179,7 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4868,7 +9202,7 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: 
v_bfi_b32 v1, 0xffff, v1, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX11-TRUE16-NEXT: .LBB38_4: ; %end +; GFX11-TRUE16-NEXT: .LBB76_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h @@ -4884,20 +9218,20 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB76_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_4 -; GFX11-FAKE16-NEXT: .LBB38_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB76_4 +; GFX11-FAKE16-NEXT: .LBB76_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB38_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB76_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_2 -; GFX11-FAKE16-NEXT: .LBB38_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB76_2 +; GFX11-FAKE16-NEXT: .LBB76_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4939,51 +9273,298 @@ end: ret <4 x i8> %phi } +define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2bf16_to_v4i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 
0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB77_2 +; +; VI-LABEL: bitcast_v2bf16_to_v4i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB77_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s7, s16, 24 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB77_4 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_3: +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: s_branch .LBB77_2 +; VI-NEXT: .LBB77_4: +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_v4i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s6, s16, 24 +; GFX9-NEXT: s_lshr_b32 s8, s16, 16 +; GFX9-NEXT: s_lshr_b32 s7, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB77_4 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_3: +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB77_2 +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-TRUE16-NEXT: .LBB77_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB77_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB77_2 +; GFX11-TRUE16-NEXT: .LBB77_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-FAKE16-NEXT: .LBB77_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 
0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB77_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: s_branch .LBB77_2 +; GFX11-FAKE16-NEXT: .LBB77_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label 
%cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i8_to_v2bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_4 -; GCN-NEXT: .LBB39_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB39_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_2 -; GCN-NEXT: .LBB39_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v3, v0 -; GCN-NEXT: v_or_b32_e32 v0, v4, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: 
v_add_i32_e32 v1, vcc, 0x3000000, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i8_to_v2bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_4 +; SI-NEXT: .LBB78_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB78_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: .LBB78_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v2bf16: ; VI: ; %bb.0: @@ -4995,14 +9576,14 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: s_cbranch_execnz .LBB78_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB39_4 -; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB78_4 +; VI-NEXT: .LBB78_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB39_3: ; %cmp.false +; VI-NEXT: .LBB78_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5011,8 +9592,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB39_2 -; VI-NEXT: .LBB39_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB78_2 +; VI-NEXT: .LBB78_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v5 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5034,14 +9615,14 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: s_cbranch_execnz .LBB78_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB39_4 -; GFX9-NEXT: .LBB39_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB78_4 +; GFX9-NEXT: .LBB78_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB39_3: ; %cmp.false +; GFX9-NEXT: .LBB78_3: ; %cmp.false ; 
GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5050,8 +9631,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB39_2 -; GFX9-NEXT: .LBB39_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB78_2 +; GFX9-NEXT: .LBB78_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5073,14 +9654,14 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB78_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_4 -; GFX11-TRUE16-NEXT: .LBB39_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB78_4 +; GFX11-TRUE16-NEXT: .LBB78_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB39_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 @@ -5096,8 +9677,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 
s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB39_2 -; GFX11-TRUE16-NEXT: .LBB39_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2 +; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v2.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5127,14 +9708,14 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB78_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_4 -; GFX11-FAKE16-NEXT: .LBB39_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB78_4 +; GFX11-FAKE16-NEXT: .LBB78_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB39_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB78_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 @@ -5150,8 +9731,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB39_2 -; GFX11-FAKE16-NEXT: .LBB39_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB78_2 +; GFX11-FAKE16-NEXT: .LBB78_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v5, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v2, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5187,36 +9768,206 @@ end: ret <2 x bfloat> %phi } +define inreg <2 x bfloat> @bitcast_v4i8_to_v2bf16_scalar(<4 x i8> 
inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i8_to_v2bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshl_b32 s4, s19, 24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s6, s5, 0x3000000 +; SI-NEXT: s_add_i32 s7, s4, 0x3000000 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: s_branch .LBB79_2 +; +; VI-LABEL: bitcast_v4i8_to_v2bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB79_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_cbranch_execnz .LBB79_3 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; 
VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_add_i32 s6, s4, 0x3000000 +; VI-NEXT: .LBB79_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_4: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB79_2 +; +; GFX9-LABEL: bitcast_v4i8_to_v2bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_cbranch_execnz .LBB79_3 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: .LBB79_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB79_2 +; +; GFX11-LABEL: bitcast_v4i8_to_v2bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 
.LBB79_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB79_3 +; GFX11-NEXT: .LBB79_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: .LBB79_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB79_4: +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_branch .LBB79_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v1i32_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: 
$vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB40_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB40_4 -; GCN-NEXT: .LBB40_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB40_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_2 -; GCN-NEXT: .LBB40_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v1i32_to_v4i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB80_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB80_4 +; SI-NEXT: .LBB80_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB80_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_2 +; SI-NEXT: .LBB80_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: 
bitcast_v1i32_to_v4i8: ; VI: ; %bb.0: @@ -5227,20 +9978,20 @@ define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB40_3 +; VI-NEXT: s_cbranch_execnz .LBB80_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB40_4 -; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB80_4 +; VI-NEXT: .LBB80_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB40_3: ; %cmp.false +; VI-NEXT: .LBB80_3: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB40_2 -; VI-NEXT: .LBB40_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB80_2 +; VI-NEXT: .LBB80_4: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -5257,20 +10008,20 @@ define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB40_3 +; GFX9-NEXT: s_cbranch_execnz .LBB80_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB40_4 -; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB80_4 +; GFX9-NEXT: .LBB80_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB40_3: ; %cmp.false +; GFX9-NEXT: .LBB80_3: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB40_2 -; GFX9-NEXT: .LBB40_4: ; 
%cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB80_2 +; GFX9-NEXT: .LBB80_4: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -5315,20 +10066,20 @@ define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB80_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_4 -; GFX11-FAKE16-NEXT: .LBB40_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB80_4 +; GFX11-FAKE16-NEXT: .LBB80_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB40_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB80_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 -; GFX11-FAKE16-NEXT: .LBB40_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB80_2 +; GFX11-FAKE16-NEXT: .LBB80_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -5353,52 +10104,209 @@ end: ret <4 x i8> %phi } +define inreg <4 x i8> @bitcast_v1i32_to_v4i8_scalar(<1 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v1i32_to_v4i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s17, 0 +; SI-NEXT: s_cbranch_scc0 .LBB81_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s6, s16, 24 +; SI-NEXT: s_lshr_b32 s7, s16, 16 +; SI-NEXT: s_lshr_b32 s8, s16, 8 +; SI-NEXT: 
s_cbranch_execnz .LBB81_3 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b32 s7, s16, 16 +; SI-NEXT: s_lshr_b32 s6, s16, 24 +; SI-NEXT: s_lshr_b32 s8, s16, 8 +; SI-NEXT: .LBB81_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_4: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB81_2 +; +; VI-LABEL: bitcast_v1i32_to_v4i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s17, 0 +; VI-NEXT: s_cbranch_scc0 .LBB81_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s7, s16, 16 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB81_3 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s7, s16, 16 +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: .LBB81_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_4: +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB81_2 +; +; GFX9-LABEL: bitcast_v1i32_to_v4i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s6, s16, 24 +; GFX9-NEXT: s_lshr_b32 s7, s16, 16 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB81_3 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s7, s16, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 24 +; GFX9-NEXT: s_lshr_b32 s8, s16, 8 +; 
GFX9-NEXT: .LBB81_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr7 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB81_2 +; +; GFX11-TRUE16-LABEL: bitcast_v1i32_to_v4i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB81_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB81_3 +; GFX11-TRUE16-NEXT: .LBB81_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-TRUE16-NEXT: .LBB81_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB81_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr1_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB81_2 +; +; GFX11-FAKE16-LABEL: bitcast_v1i32_to_v4i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB81_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB81_3 +; GFX11-FAKE16-NEXT: .LBB81_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: .LBB81_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB81_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr1 +; GFX11-FAKE16-NEXT: s_branch .LBB81_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i8_to_v1i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_4 -; GCN-NEXT: .LBB41_2: ; %end -; GCN-NEXT: 
s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB41_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_2 -; GCN-NEXT: .LBB41_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i8_to_v1i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_4 +; SI-NEXT: .LBB82_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB82_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; 
SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: .LBB82_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i8_to_v1i32: ; VI: ; %bb.0: @@ -5410,14 +10318,14 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: s_cbranch_execnz .LBB82_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB41_4 -; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB82_4 +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB41_3: ; %cmp.false +; VI-NEXT: .LBB82_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5426,8 +10334,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr2 
; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB41_2 -; VI-NEXT: .LBB41_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB82_2 +; VI-NEXT: .LBB82_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v5 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5449,14 +10357,14 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: s_cbranch_execnz .LBB82_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB41_4 -; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB82_4 +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB41_3: ; %cmp.false +; GFX9-NEXT: .LBB82_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5465,8 +10373,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB41_2 -; GFX9-NEXT: .LBB41_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB82_2 +; GFX9-NEXT: .LBB82_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5488,14 +10396,14 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB82_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_4 -; GFX11-TRUE16-NEXT: .LBB41_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB82_4 +; GFX11-TRUE16-NEXT: .LBB82_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB41_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 @@ -5511,8 +10419,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB41_2 -; GFX11-TRUE16-NEXT: .LBB41_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 +; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v2.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5542,14 +10450,14 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v4 ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB82_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_4 -; GFX11-FAKE16-NEXT: .LBB41_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB82_4 +; GFX11-FAKE16-NEXT: 
.LBB82_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB41_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB82_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 @@ -5565,8 +10473,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB41_2 -; GFX11-FAKE16-NEXT: .LBB41_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB82_2 +; GFX11-FAKE16-NEXT: .LBB82_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v5, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v2, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5601,3 +10509,173 @@ end: %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <1 x i32> %phi } + +define inreg <1 x i32> @bitcast_v4i8_to_v1i32_scalar(<4 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i8_to_v1i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 
+; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s6, s4, 0x3000000 +; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB83_2 +; +; VI-LABEL: bitcast_v4i8_to_v1i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s20, 0 +; VI-NEXT: s_cbranch_scc0 .LBB83_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_cbranch_execnz .LBB83_3 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_add_i32 s6, s4, 0x3000000 +; VI-NEXT: .LBB83_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_4: +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB83_2 +; +; GFX9-LABEL: bitcast_v4i8_to_v1i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; 
GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_cbranch_execnz .LBB83_3 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: .LBB83_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB83_2 +; +; GFX11-LABEL: bitcast_v4i8_to_v1i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB83_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB83_3 +; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 
s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: .LBB83_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB83_4: +; GFX11-NEXT: ; implicit-def: $sgpr5 +; GFX11-NEXT: s_branch .LBB83_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll index 2a7315c055a54..6fc9a35cd9ee6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll @@ -1,34 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <11 x float> @bitcast_v11i32_to_v11f32(<11 x i32> %a, 
i32 %b) { -; GCN-LABEL: bitcast_v11i32_to_v11f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11i32_to_v11f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i32_to_v11f32: ; VI: ; %bb.0: @@ -119,30 +120,185 @@ end: ret <11 x float> %phi } +define inreg <11 x float> 
@bitcast_v11i32_to_v11f32_scalar(<11 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i32_to_v11f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s27, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v11i32_to_v11f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 
v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v11i32_to_v11f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v11i32_to_v11f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s22, 
s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i32> %a, splat (i32 3) + %a2 = bitcast <11 x i32> %a1 to <11 x float> + br label %end + +cmp.false: + %a3 = bitcast <11 x i32> %a to <11 x float> + br label %end + +end: + %phi = phi <11 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x float> %phi +} + define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f32_to_v11i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 
v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11f32_to_v11i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v11i32: ; VI: ; %bb.0: @@ -151,7 +307,7 @@ define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -164,7 +320,7 @@ define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -175,7 +331,7 @@ define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) { ; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -188,7 +344,7 @@ define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -227,80 +383,242 @@ end: ret <11 x i32> %phi } +define inreg <11 x i32> @bitcast_v11f32_to_v11i32_scalar(<11 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f32_to_v11i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s27, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 
+; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11f32_to_v11i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11f32_to_v11i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: 
v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v11f32_to_v11i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; 
GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <11 x float> %a1 to <11 x i32> + br label %end + +cmp.false: + %a3 = bitcast <11 x float> %a to <11 x i32> + br label %end + +end: + %phi = phi <11 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i32> %phi +} + define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i32_to_v22i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, s4, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: 
v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v21, s4, v20, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11i32_to_v22i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; 
SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v21, s4, v20, 
16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i32_to_v22i16: ; VI: ; %bb.0: @@ -309,7 +627,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 @@ -322,7 +640,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -333,7 +651,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 ; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 @@ -346,7 +664,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -358,7 +676,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 ; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 @@ -371,7 +689,7 @@ define <22 x i16> @bitcast_v11i32_to_v22i16(<11 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -391,133 +709,338 @@ end: ret <22 x i16> %phi } +define inreg <22 x i16> @bitcast_v11i32_to_v22i16_scalar(<11 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i32_to_v22i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s27, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; 
SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s4, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: s_lshr_b32 s7, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s6 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v11i32_to_v22i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; 
VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v11i32_to_v22i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: 
v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v11i32_to_v22i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i32> %a, splat (i32 3) + %a2 = bitcast <11 x i32> %a1 to <22 x i16> + br label %end + +cmp.false: + %a3 = bitcast <11 x i32> %a to <22 x i16> + br label %end + +end: + %phi = phi <22 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i16> %phi +} + define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i16_to_v11i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v10 -; GCN-NEXT: v_mov_b32_e32 v27, v8 -; GCN-NEXT: v_mov_b32_e32 v26, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v4 -; GCN-NEXT: v_mov_b32_e32 v24, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v1, v1, v29 -; GCN-NEXT: v_or_b32_e32 v2, v2, v30 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 
-; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: 
v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: v_or_b32_e32 v1, v29, v1 -; GCN-NEXT: v_or_b32_e32 v2, v30, v2 -; GCN-NEXT: v_or_b32_e32 v3, v31, v3 -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i16_to_v11i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: v_mov_b32_e32 v27, v8 +; SI-NEXT: v_mov_b32_e32 v26, v6 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v15 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v31, v4 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: 
s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i16_to_v11i32: ; VI: ; %bb.0: @@ -526,7 +1049,7 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 3 ; VI-NEXT: v_add_u16_e32 v11, 3, v10 @@ -562,7 +1085,7 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v11, v0 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -573,7 +1096,7 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] @@ -586,7 +1109,7 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -598,7 +1121,7 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; 
GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] @@ -611,7 +1134,7 @@ define <11 x i32> @bitcast_v22i16_to_v11i32(<22 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -631,147 +1154,433 @@ end: ret <11 x i32> %phi } +define inreg <11 x i32> @bitcast_v22i16_to_v11i32_scalar(<22 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i16_to_v11i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v11, v6 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; 
SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v8, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v10, v0, v15 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 
0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v22i16_to_v11i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: 
s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v22i16_to_v11i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; 
GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22i16_to_v11i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; 
GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i16> %a, splat (i16 3) + %a2 = bitcast <22 x i16> %a1 to <11 x i32> + br label %end + +cmp.false: + %a3 = bitcast <22 x i16> %a to <11 x i32> + br label %end + +end: + %phi = phi <11 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i32> %phi +} + define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i32_to_v22f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v32, v10 -; GCN-NEXT: v_mov_b32_e32 v31, v9 -; GCN-NEXT: v_mov_b32_e32 v30, v8 -; GCN-NEXT: v_mov_b32_e32 v29, v7 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v27, v5 -; GCN-NEXT: v_mov_b32_e32 v26, v4 -; GCN-NEXT: v_mov_b32_e32 v25, v3 -; GCN-NEXT: v_mov_b32_e32 v24, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v22, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: 
$vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, 
v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 
16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11i32_to_v22f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v31, v9 +; SI-NEXT: v_mov_b32_e32 v30, v8 +; SI-NEXT: v_mov_b32_e32 v29, v7 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v27, v5 +; SI-NEXT: v_mov_b32_e32 v26, v4 +; SI-NEXT: v_mov_b32_e32 v25, v3 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: 
$vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: 
$vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i32_to_v22f16: ; VI: ; %bb.0: @@ -780,7 +1589,7 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 @@ -793,7 +1602,7 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -804,7 +1613,7 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 ; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 @@ -817,7 +1626,7 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -829,7 +1638,7 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz 
.LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 ; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 @@ -842,7 +1651,7 @@ define <22 x half> @bitcast_v11i32_to_v22f16(<11 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -862,181 +1671,413 @@ end: ret <22 x half> %phi } +define inreg <22 x half> @bitcast_v11i32_to_v22f16_scalar(<11 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i32_to_v22f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s27, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: 
v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s5, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; 
SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v11i32_to_v22f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v11i32_to_v22f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: 
s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v11i32_to_v22f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: 
v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i32> %a, splat (i32 3) + %a2 = bitcast <11 x i32> %a1 to <22 x half> + br label %end + +cmp.false: + %a3 = bitcast <11 x i32> %a to <22 x half> + br label %end + +end: + %phi = phi <22 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x half> %phi +} + define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f16_to_v11i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v20 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v14 -; GCN-NEXT: v_or_b32_e32 v0, v32, v0 -; GCN-NEXT: v_or_b32_e32 v1, v30, v1 -; GCN-NEXT: v_or_b32_e32 v2, v28, v2 -; GCN-NEXT: v_or_b32_e32 v3, v26, v3 -; GCN-NEXT: v_or_b32_e32 v4, v24, v4 -; GCN-NEXT: v_or_b32_e32 v5, v23, v5 -; GCN-NEXT: v_or_b32_e32 v6, v22, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v13, v8 -; GCN-NEXT: v_or_b32_e32 v9, v12, v9 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; 
implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 
-; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v18, v10 -; GCN-NEXT: v_or_b32_e32 v6, v20, v19 -; GCN-NEXT: v_or_b32_e32 v7, v15, v21 -; GCN-NEXT: v_or_b32_e32 v8, v13, v17 -; GCN-NEXT: v_or_b32_e32 v9, v12, v16 -; GCN-NEXT: v_or_b32_e32 v10, v11, v14 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f16_to_v11i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v38, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_or_b32_e32 v4, v29, v4 +; 
SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, 
v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; 
SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f16_to_v11i32: ; VI: ; %bb.0: @@ -1045,7 +2086,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v11, 0x200 ; VI-NEXT: v_add_f16_sdwa v12, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1081,7 +2122,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v12 ; VI-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1092,7 +2133,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 
op_sel_hi:[1,0] @@ -1106,7 +2147,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1118,7 +2159,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] @@ -1131,7 +2172,7 @@ define <11 x i32> @bitcast_v22f16_to_v11i32(<22 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1151,80 +2192,399 @@ end: ret <11 x i32> %phi } +define inreg <11 x i32> @bitcast_v22f16_to_v11i32_scalar(<22 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f16_to_v11i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 +; SI-NEXT: 
v_cvt_f16_f32_e32 v20, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_or_b32_e32 v2, v27, v2 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: v_or_b32_e32 v4, v23, v4 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v6, v19, v6 +; SI-NEXT: v_or_b32_e32 v7, v17, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: 
v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 
v10, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v22f16_to_v11i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: 
v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s17, 
v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v11, v1 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: v_add_f16_sdwa v11, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v22f16_to_v11i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22f16_to_v11i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] + 
%cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x half> %a, splat (half 0xH0200) + %a2 = bitcast <22 x half> %a1 to <11 x i32> + br label %end + +cmp.false: + %a3 = bitcast <22 x half> %a to <11 x i32> + br label %end + +end: + %phi = phi <11 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i32> %phi +} + define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f32_to_v22i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_4 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB6_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, s4, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; 
GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: .LBB6_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v21, s4, v20, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11f32_to_v22i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; 
SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_4 +; SI-NEXT: .LBB12_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB12_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: .LBB12_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v22i16: ; VI: ; %bb.0: @@ -1233,7 +2593,7 @@ define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -1246,7 +2606,7 @@ define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: .LBB12_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1257,7 +2617,7 @@ define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -1270,7 +2630,7 @@ define <22 x i16> @bitcast_v11f32_to_v22i16(<11 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: .LBB12_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1309,133 +2669,351 @@ end: ret <22 x i16> %phi } +define inreg <22 x i16> @bitcast_v11f32_to_v22i16_scalar(<11 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f32_to_v22i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
s_cmp_lg_u32 s27, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_4 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; 
implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_branch .LBB13_2 +; SI-NEXT: .LBB13_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11f32_to_v22i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_4 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_3: +; VI-NEXT: s_branch .LBB13_2 +; VI-NEXT: .LBB13_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, 
s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11f32_to_v22i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_4 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_branch .LBB13_2 +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v11f32_to_v22i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; 
GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: +; GFX11-NEXT: s_branch .LBB13_2 +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <11 x float> %a1 to <22 x i16> + br label %end + +cmp.false: + %a3 = bitcast <11 x float> %a to <22 x i16> + br label %end + +end: + %phi = phi <22 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i16> %phi +} + define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i16_to_v11f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v10 -; GCN-NEXT: v_mov_b32_e32 v27, v8 -; 
GCN-NEXT: v_mov_b32_e32 v26, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v4 -; GCN-NEXT: v_mov_b32_e32 v24, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v0, v0, v22 -; GCN-NEXT: v_or_b32_e32 v1, v1, v29 -; GCN-NEXT: v_or_b32_e32 v2, v2, v30 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: v_or_b32_e32 v5, v5, v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; 
GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v23 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v22, v0 -; GCN-NEXT: 
v_or_b32_e32 v1, v29, v1 -; GCN-NEXT: v_or_b32_e32 v2, v30, v2 -; GCN-NEXT: v_or_b32_e32 v3, v31, v3 -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i16_to_v11f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v10 +; SI-NEXT: v_mov_b32_e32 v27, v8 +; SI-NEXT: v_mov_b32_e32 v26, v6 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v23, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; 
%bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v15 +; SI-NEXT: v_or_b32_e32 v9, v9, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: 
.LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v31, v4 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i16_to_v11f32: ; VI: ; %bb.0: @@ 
-1444,7 +3022,7 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 3 ; VI-NEXT: v_add_u16_e32 v11, 3, v10 @@ -1480,7 +3058,7 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v11, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1491,7 +3069,7 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] @@ -1504,7 +3082,7 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: .LBB14_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1516,7 +3094,7 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 
v10, v10, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] @@ -1529,7 +3107,7 @@ define <11 x float> @bitcast_v22i16_to_v11f32(<22 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1549,147 +3127,433 @@ end: ret <11 x float> %phi } +define inreg <11 x float> @bitcast_v22i16_to_v11f32_scalar(<22 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i16_to_v11f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v11, v6 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, 
s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v8, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v10, v0, v15 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: 
v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v22i16_to_v11f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 
0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v22i16_to_v11f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: 
s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22i16_to_v11f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 
op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i16> %a, splat (i16 3) + %a2 = bitcast <22 x i16> %a1 to <11 x float> + br label %end + +cmp.false: + %a3 = bitcast <22 x i16> %a to <11 x float> + br label %end + +end: + %phi = phi <11 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x float> %phi +} + define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f32_to_v22f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v32, v10 -; GCN-NEXT: v_mov_b32_e32 v31, v9 -; GCN-NEXT: v_mov_b32_e32 v30, v8 -; GCN-NEXT: v_mov_b32_e32 v29, v7 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v27, v5 -; GCN-NEXT: v_mov_b32_e32 v26, v4 -; GCN-NEXT: v_mov_b32_e32 v25, v3 -; GCN-NEXT: v_mov_b32_e32 v24, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v22, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; 
implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v1 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v17, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11f32_to_v22f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v31, v9 +; SI-NEXT: v_mov_b32_e32 v30, v8 +; SI-NEXT: v_mov_b32_e32 v29, v7 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v27, v5 +; SI-NEXT: v_mov_b32_e32 v26, v4 +; SI-NEXT: v_mov_b32_e32 v25, v3 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; 
SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; 
SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; 
SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v22f16: ; VI: ; %bb.0: @@ -1698,7 +3562,7 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -1711,7 +3575,7 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: .LBB16_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1722,7 +3586,7 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 ; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 @@ -1735,7 +3599,7 @@ define <22 x half> @bitcast_v11f32_to_v22f16(<11 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: .LBB16_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1774,181 +3638,431 @@ end: ret <22 x half> %phi } +define inreg <22 x half> @bitcast_v11f32_to_v22f16_scalar(<11 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f32_to_v22f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s27, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; 
%cmp.false +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s25, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: 
v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v11f32_to_v22f16_scalar: +; VI: ; 
%bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_4 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_3: +; VI-NEXT: s_branch .LBB17_2 +; VI-NEXT: .LBB17_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11f32_to_v22f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_4 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; 
GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_3: +; GFX9-NEXT: s_branch .LBB17_2 +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v11f32_to_v22f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_4 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: +; GFX11-NEXT: s_branch .LBB17_2 +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: 
v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <11 x float> %a1 to <22 x half> + br label %end + +cmp.false: + %a3 = bitcast <11 x float> %a to <22 x half> + br label %end + +end: + %phi = phi <22 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x half> %phi +} + define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f16_to_v11f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v20 -; 
GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v14 -; GCN-NEXT: v_or_b32_e32 v0, v32, v0 -; GCN-NEXT: v_or_b32_e32 v1, v30, v1 -; GCN-NEXT: v_or_b32_e32 v2, v28, v2 -; GCN-NEXT: v_or_b32_e32 v3, v26, v3 -; GCN-NEXT: v_or_b32_e32 v4, v24, v4 -; GCN-NEXT: v_or_b32_e32 v5, v23, v5 -; GCN-NEXT: v_or_b32_e32 v6, v22, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v13, v8 -; GCN-NEXT: v_or_b32_e32 v9, v12, v9 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 
-; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 
v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v18, v10 -; GCN-NEXT: v_or_b32_e32 v6, v20, v19 -; GCN-NEXT: v_or_b32_e32 v7, v15, v21 -; GCN-NEXT: v_or_b32_e32 v8, v13, v17 -; GCN-NEXT: v_or_b32_e32 v9, v12, v16 -; GCN-NEXT: v_or_b32_e32 v10, v11, v14 -; GCN-NEXT: s_or_b64 exec, 
exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f16_to_v11f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: 
v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_or_b32_e32 v4, v29, v4 +; SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v6, v25, v6 +; SI-NEXT: v_or_b32_e32 v7, v23, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: 
v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 
v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f16_to_v11f32: ; VI: ; %bb.0: @@ -1957,7 +4071,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v11, 0x200 ; VI-NEXT: v_add_f16_sdwa v12, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1993,7 +4107,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v12 ; VI-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2004,7 +4118,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; 
GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] @@ -2018,7 +4132,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: .LBB18_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2030,7 +4144,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] @@ -2043,7 +4157,7 @@ define <11 x float> @bitcast_v22f16_to_v11f32(<22 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2063,158 +4177,477 @@ end: ret <11 x float> %phi } +define inreg <11 x float> @bitcast_v22f16_to_v11f32_scalar(<22 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f16_to_v11f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s20 +; SI-NEXT: 
v_cvt_f16_f32_e32 v26, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_or_b32_e32 v2, v27, v2 +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: v_or_b32_e32 v4, v23, v4 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: v_or_b32_e32 v6, v19, v6 +; SI-NEXT: v_or_b32_e32 v7, v17, v7 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: 
v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: 
v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v22f16_to_v11f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: 
s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 
+; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v11, v1 +; VI-NEXT: v_mov_b32_e32 v11, s4 +; VI-NEXT: v_add_f16_sdwa v11, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v22f16_to_v11f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 
s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22f16_to_v11f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, 
s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x half> %a, splat (half 0xH0200) + %a2 = bitcast <22 x half> %a1 to <11 x float> + br label %end + +cmp.false: + %a3 = bitcast <22 x half> %a to <11 x float> + br label %end + +end: + %phi = phi <11 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x float> %phi +} + define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i16_to_v22f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v51, v21 -; GCN-NEXT: v_mov_b32_e32 v50, v20 -; GCN-NEXT: v_mov_b32_e32 v49, v19 -; GCN-NEXT: v_mov_b32_e32 v48, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v17 -; GCN-NEXT: v_mov_b32_e32 v38, v16 -; GCN-NEXT: v_mov_b32_e32 v37, v15 -; GCN-NEXT: v_mov_b32_e32 v36, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v13 -; GCN-NEXT: v_mov_b32_e32 v34, v12 -; GCN-NEXT: v_mov_b32_e32 v33, v11 -; GCN-NEXT: v_mov_b32_e32 v32, v10 -; GCN-NEXT: v_mov_b32_e32 v31, v9 -; GCN-NEXT: v_mov_b32_e32 v30, v8 -; GCN-NEXT: v_mov_b32_e32 v29, v7 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v27, v5 -; GCN-NEXT: v_mov_b32_e32 v26, v4 -; GCN-NEXT: v_mov_b32_e32 v25, v3 -; GCN-NEXT: v_mov_b32_e32 v24, v2 -; GCN-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NEXT: v_mov_b32_e32 v52, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; 
GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; 
GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i16_to_v22f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v21 +; SI-NEXT: v_mov_b32_e32 v50, v20 +; SI-NEXT: v_mov_b32_e32 v49, v19 +; SI-NEXT: v_mov_b32_e32 v48, v18 +; SI-NEXT: v_mov_b32_e32 v39, v17 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v37, v15 +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v35, v13 +; SI-NEXT: v_mov_b32_e32 v34, v12 +; SI-NEXT: v_mov_b32_e32 v33, v11 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v31, v9 +; SI-NEXT: v_mov_b32_e32 v30, v8 +; SI-NEXT: v_mov_b32_e32 v29, v7 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v27, v5 +; SI-NEXT: v_mov_b32_e32 v26, v4 +; SI-NEXT: v_mov_b32_e32 v25, v3 +; SI-NEXT: v_mov_b32_e32 v24, v2 +; SI-NEXT: v_mov_b32_e32 v23, v1 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; 
implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: 
$vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i16_to_v22f16: ; VI: ; %bb.0: @@ -2223,7 +4656,7 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v11, 3 ; VI-NEXT: v_add_u16_sdwa v19, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -2259,7 +4692,7 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v14 ; VI-NEXT: v_or_b32_e32 v1, v1, v13 ; VI-NEXT: v_or_b32_e32 v0, v0, v12 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2270,7 +4703,7 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] @@ -2283,7 +4716,7 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2295,7 +4728,7 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, 
exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] @@ -2308,7 +4741,7 @@ define <22 x half> @bitcast_v22i16_to_v22f16(<22 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2328,134 +4761,420 @@ end: ret <22 x half> %phi } +define inreg <22 x half> @bitcast_v22i16_to_v22f16_scalar(<22 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i16_to_v22f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mov_b32_e32 v26, v7 +; SI-NEXT: v_mov_b32_e32 v25, v6 +; SI-NEXT: v_mov_b32_e32 v24, v5 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_mov_b32_e32 v22, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v28, v1 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 
v14, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v27 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: 
v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v22i16_to_v22f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s20, 
s24, 0xffff0000 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v22i16_to_v22f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; 
%bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22i16_to_v22f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 
op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i16> %a, splat (i16 3) + %a2 = bitcast <22 x i16> %a1 to <22 x half> + br label %end + +cmp.false: + %a3 = bitcast <22 x i16> %a to <22 x half> + br label %end + +end: + %phi = phi <22 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x half> %phi +} + define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f16_to_v22i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, 
v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v6, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f16_to_v22i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: 
v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v10, v10, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: 
bitcast_v22f16_to_v22i16: ; VI: ; %bb.0: @@ -2464,7 +5183,7 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 0x200 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 @@ -2500,7 +5219,7 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v14, v2 ; VI-NEXT: v_or_b32_e32 v1, v13, v1 ; VI-NEXT: v_or_b32_e32 v0, v11, v0 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2511,7 +5230,7 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] @@ -2525,7 +5244,7 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2537,7 +5256,7 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; 
GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] @@ -2550,7 +5269,7 @@ define <22 x i16> @bitcast_v22f16_to_v22i16(<22 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2569,3 +5288,327 @@ end: %phi = phi <22 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <22 x i16> %phi } + +define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f16_to_v22i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v7 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: s_and_b64 
s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v19 +; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v10, v10, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: 
s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v22f16_to_v22i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s27, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s5, s25, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s5, s26, 16 +; VI-NEXT: v_add_f16_e32 v1, s25, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s24, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s23, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s22, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s21, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v3, s26, v0 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s20, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: v_or_b32_e32 v10, v3, v4 +; VI-NEXT: v_or_b32_e32 v4, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s19, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_add_f16_e32 v1, s18, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_add_f16_sdwa v12, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v11, s16, v0 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s17, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v11, v12 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v22f16_to_v22i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: 
s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22f16_to_v22i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s23, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; 
GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x half> %a, splat (half 0xH0200) + %a2 = bitcast <22 x half> %a1 to <22 x i16> + br label %end + +cmp.false: + %a3 = bitcast <22 x half> %a to <22 x i16> + br label %end + +end: + %phi = phi <22 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll index 264e2b2bf0122..c9860dbb7d72c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll @@ -1,35 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <12 x float> @bitcast_v12i32_to_v12f32(<12 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i32_to_v12f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: 
v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i32_to_v12f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v12f32: ; VI: ; %bb.0: @@ -123,31 +124,193 @@ end: ret <12 x float> %phi } +define inreg <12 x float> @bitcast_v12i32_to_v12f32_scalar(<12 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i32_to_v12f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 
s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v12i32_to_v12f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v12i32_to_v12f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v12i32_to_v12f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 
+; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <12 x float> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f32_to_v12i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f32_to_v12i32: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v12i32: ; VI: ; %bb.0: @@ -156,7 +319,7 @@ define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -170,7 +333,7 @@ define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -181,7 +344,7 @@ define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 
v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -195,7 +358,7 @@ define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -234,31 +397,200 @@ end: ret <12 x i32> %phi } +define inreg <12 x i32> @bitcast_v12f32_to_v12i32_scalar(<12 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f32_to_v12i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f32_to_v12i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 
+; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v12i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; 
GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v12i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, 
s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i32_to_v6f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i32_to_v6f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, 
vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v6f64: ; VI: ; %bb.0: @@ -267,7 +599,7 @@ define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 @@ -281,7 +613,7 @@ define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -292,7 +624,7 @@ define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 ; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 @@ -306,7 +638,7 @@ define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 
; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -318,7 +650,7 @@ define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 ; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 @@ -332,7 +664,7 @@ define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -352,25 +684,187 @@ end: ret <6 x double> %phi } +define inreg <6 x double> @bitcast_v12i32_to_v6f64_scalar(<12 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i32_to_v6f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: 
v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v12i32_to_v6f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v12i32_to_v6f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: 
s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v12i32_to_v6f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, 
s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} + define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f64_to_v12i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f64_to_v12i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: 
bitcast_v6f64_to_v12i32: ; VI: ; %bb.0: @@ -379,7 +873,7 @@ define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -387,7 +881,7 @@ define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -398,7 +892,7 @@ define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -406,7 +900,7 @@ define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -418,7 +912,7 @@ define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 
1.0 @@ -426,7 +920,7 @@ define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -446,31 +940,176 @@ end: ret <12 x i32> %phi } +define inreg <12 x i32> @bitcast_v6f64_to_v12i32_scalar(<6 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f64_to_v12i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_4 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_3: +; SI-NEXT: s_branch .LBB7_2 +; SI-NEXT: .LBB7_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f64_to_v12i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_4 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], 
s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_3: +; VI-NEXT: s_branch .LBB7_2 +; VI-NEXT: .LBB7_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f64_to_v12i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v12i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i32_to_v6i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: 
s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i32_to_v6i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v6i64: ; VI: ; %bb.0: @@ -479,7 +1118,7 @@ define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; 
VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 @@ -493,7 +1132,7 @@ define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -504,7 +1143,7 @@ define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 ; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 @@ -518,7 +1157,7 @@ define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -530,7 +1169,7 @@ define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 ; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 @@ -544,7 +1183,7 @@ define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -564,31 +1203,193 @@ end: ret 
<6 x i64> %phi } +define inreg <6 x i64> @bitcast_v12i32_to_v6i64_scalar(<12 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i32_to_v6i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v12i32_to_v6i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: 
s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v12i32_to_v6i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v12i32_to_v6i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; 
GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i64_to_v12i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true 
-; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i64_to_v12i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v12i32: ; VI: ; %bb.0: @@ -597,7 +1398,7 @@ define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: 
; %cmp.true ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc @@ -611,7 +1412,7 @@ define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -622,7 +1423,7 @@ define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc @@ -636,7 +1437,7 @@ define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -648,7 +1449,7 @@ define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -665,7 +1466,7 @@ define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: 
.LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -685,82 +1486,244 @@ end: ret <12 x i32> %phi } +define inreg <12 x i32> @bitcast_v6i64_to_v12i32_scalar(<6 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i64_to_v12i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v6i64_to_v12i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: 
s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v6i64_to_v12i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] 
+; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v6i64_to_v12i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_3 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB11_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: s_branch .LBB11_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 3) + %a2 = bitcast <6 x i64> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i32_to_v24i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: 
v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i32_to_v24i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: 
v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v12, v24 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v24i16: ; VI: ; %bb.0: @@ -769,7 +1732,7 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; 
VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 @@ -783,7 +1746,7 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: .LBB12_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -794,7 +1757,7 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 ; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 @@ -808,7 +1771,7 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: .LBB12_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -820,7 +1783,7 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 ; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 @@ -834,7 +1797,7 @@ define <24 x i16> @bitcast_v12i32_to_v24i16(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 
v0, 3, v0 -; GFX11-NEXT: .LBB6_2: ; %end +; GFX11-NEXT: .LBB12_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -854,142 +1817,358 @@ end: ret <24 x i16> %phi } +define inreg <24 x i16> @bitcast_v12i32_to_v24i16_scalar(<12 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i32_to_v24i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 
v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s8 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s7 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v23, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v12i32_to_v24i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; 
VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v12i32_to_v24i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] 
+; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v12i32_to_v24i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <24 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <24 x i16> + br label %end + +end: + %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i16> %phi +} + define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i16_to_v12i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_mov_b32_e32 v30, v10 -; GCN-NEXT: v_mov_b32_e32 v29, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v27, v4 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v25, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v35 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v2, v2, v24 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; 
GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v25 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_or_b32_e32 v2, v24, v2 -; GCN-NEXT: v_or_b32_e32 v3, v31, v3 -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i16_to_v12i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; SI-NEXT: 
v_lshlrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: 
$vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_or_b32_e32 v6, v32, v6 +; SI-NEXT: v_or_b32_e32 v7, v31, v7 
+; SI-NEXT: v_or_b32_e32 v8, v24, v8 +; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_or_b32_e32 v10, v15, v10 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v12i32: ; VI: ; %bb.0: @@ -998,7 +2177,7 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 3 ; VI-NEXT: v_add_u16_e32 v12, 3, v11 @@ -1037,7 +2216,7 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v12, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v12, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1048,7 +2227,7 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; 
GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -1062,7 +2241,7 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: .LBB14_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1074,7 +2253,7 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -1088,7 +2267,7 @@ define <12 x i32> @bitcast_v24i16_to_v12i32(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1108,158 +2287,461 @@ end: ret <12 x i32> %phi } +define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i16_to_v12i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v12, v8 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; SI-NEXT: 
v_lshlrev_b32_e32 v17, 16, v9 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v10, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 +; SI-NEXT: v_or_b32_e32 v11, v0, v17 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 
v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v24i16_to_v12i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, 
s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v24i16_to_v12i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, 
s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24i16_to_v12i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, 
s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i16> %a, splat (i16 3) + %a2 = bitcast <24 x i16> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <24 x i16> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i32_to_v24f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v35, v11 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v9 -; GCN-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NEXT: v_mov_b32_e32 v31, v7 -; GCN-NEXT: v_mov_b32_e32 v30, v6 -; GCN-NEXT: v_mov_b32_e32 v29, v5 -; GCN-NEXT: v_mov_b32_e32 v28, v4 -; GCN-NEXT: v_mov_b32_e32 v27, v3 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v25, v1 -; GCN-NEXT: v_mov_b32_e32 v24, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; 
implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 
16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i32_to_v24f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v11 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v31, v7 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v5 +; SI-NEXT: v_mov_b32_e32 v28, v4 +; SI-NEXT: v_mov_b32_e32 v27, v3 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v25, v1 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; 
SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: ; implicit-def: 
$vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i32_to_v24f16: ; VI: ; %bb.0: @@ -1268,7 +2750,7 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 @@ -1282,7 +2764,7 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: .LBB16_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1293,7 +2775,7 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 ; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 @@ -1307,7 +2789,7 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB8_2: ; %end +; 
GFX9-NEXT: .LBB16_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1319,7 +2801,7 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 ; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 @@ -1333,7 +2815,7 @@ define <24 x half> @bitcast_v12i32_to_v24f16(<12 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: .LBB16_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1353,195 +2835,441 @@ end: ret <24 x half> %phi } +define inreg <24 x half> @bitcast_v12i32_to_v24f16_scalar(<12 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i32_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 
v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 
v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v12i32_to_v24f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; 
VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v12i32_to_v24f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v12i32_to_v24f16_scalar: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <24 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <24 x half> + br label %end + +end: + %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x half> %phi +} + define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f16_to_v12i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v0 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v38, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v22 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; GCN-NEXT: v_or_b32_e32 v0, v39, v0 -; GCN-NEXT: v_or_b32_e32 v1, v37, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v15 -; GCN-NEXT: v_or_b32_e32 v2, v32, v2 -; GCN-NEXT: 
v_or_b32_e32 v3, v30, v3 -; GCN-NEXT: v_or_b32_e32 v4, v28, v4 -; GCN-NEXT: v_or_b32_e32 v5, v26, v5 -; GCN-NEXT: v_or_b32_e32 v6, v25, v6 -; GCN-NEXT: v_or_b32_e32 v7, v24, v7 -; GCN-NEXT: v_or_b32_e32 v8, v16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v14, v9 -; GCN-NEXT: v_or_b32_e32 v10, v13, v10 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v20, v18 -; GCN-NEXT: v_or_b32_e32 v8, v16, v21 -; GCN-NEXT: v_or_b32_e32 v9, v14, v19 -; GCN-NEXT: v_or_b32_e32 v10, v13, v17 -; GCN-NEXT: v_or_b32_e32 v11, v12, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f16_to_v12i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: 
v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v33, v4 +; SI-NEXT: v_or_b32_e32 v5, v31, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v7, v27, v7 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: 
$vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; 
SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: 
v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v12i32: ; VI: ; %bb.0: @@ -1550,7 +3278,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 0x200 ; VI-NEXT: v_add_f16_sdwa v13, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1589,7 +3317,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v13 ; VI-NEXT: v_or_b32_e32 v0, v0, v12 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1600,7 +3328,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz 
.LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] @@ -1615,7 +3343,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: .LBB18_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1627,7 +3355,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] @@ -1641,7 +3369,7 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1661,31 +3389,371 @@ end: ret <12 x i32> %phi } +define inreg <12 x i32> @bitcast_v24f16_to_v12i32_scalar(<24 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f16_to_v12i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 +; SI-NEXT: 
v_cvt_f16_f32_e32 v26, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v24, v4 +; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v6, v22, v6 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_or_b32_e32 v8, v18, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: 
v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: 
v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v24f16_to_v12i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: 
s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; 
VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v12, v1 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_add_f16_sdwa v12, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v12 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v24f16_to_v12i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; 
GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24f16_to_v12i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, 
s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x half> %a, splat (half 0xH0200) + %a2 = bitcast <24 x half> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <24 x half> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f32_to_v6f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: 
v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f32_to_v6f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v6f64: ; VI: ; %bb.0: @@ -1694,7 +3762,7 @@ define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -1708,7 +3776,7 @@ define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: 
v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1719,7 +3787,7 @@ define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -1733,7 +3801,7 @@ define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1772,25 +3840,208 @@ end: ret <6 x double> %phi } +define inreg <6 x double> @bitcast_v12f32_to_v6f64_scalar(<12 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f32_to_v6f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_4 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_3: +; SI-NEXT: s_branch .LBB21_2 +; SI-NEXT: .LBB21_4: +; SI-NEXT: 
v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f32_to_v6f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_4 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_3: +; VI-NEXT: s_branch .LBB21_2 +; VI-NEXT: .LBB21_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v6f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v6f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; 
GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} + define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f64_to_v12f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f64_to_v12f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v12f32: ; VI: ; %bb.0: @@ -1799,7 +4050,7 @@ define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -1807,7 +4058,7 @@ define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1818,7 +4069,7 @@ 
define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -1826,7 +4077,7 @@ define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1838,7 +4089,7 @@ define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -1846,7 +4097,7 @@ define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1866,31 +4117,176 @@ end: ret <12 x float> %phi } +define inreg <12 x float> @bitcast_v6f64_to_v12f32_scalar(<6 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f64_to_v12f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB23_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz 
.LBB23_4 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_3: +; SI-NEXT: s_branch .LBB23_2 +; SI-NEXT: .LBB23_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f64_to_v12f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
bitcast_v6f64_to_v12f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v12f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; 
GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <12 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f32_to_v6i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f32_to_v6i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v6i64: ; VI: ; %bb.0: @@ -1899,7 +4295,7 @@ define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -1913,7 +4309,7 @@ define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1924,7 +4320,7 @@ define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; 
GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -1938,7 +4334,7 @@ define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1977,31 +4373,214 @@ end: ret <6 x i64> %phi } +define inreg <6 x i64> @bitcast_v12f32_to_v6i64_scalar(<12 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f32_to_v6i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_4 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_3: +; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: .LBB25_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
bitcast_v12f32_to_v6i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: +; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: .LBB25_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v6i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; 
GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: +; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v6i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 
1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i64_to_v12f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i64_to_v12f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v12f32: ; VI: ; %bb.0: @@ -2010,7 +4589,7 @@ define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc @@ -2024,7 +4603,7 @@ define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2035,7 +4614,7 @@ define <12 x float> 
@bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc @@ -2049,7 +4628,7 @@ define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2061,7 +4640,7 @@ define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -2078,7 +4657,7 @@ define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2098,82 +4677,244 @@ end: ret <12 x float> %phi } +define inreg <12 x float> @bitcast_v6i64_to_v12f32_scalar(<6 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i64_to_v12f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 
+; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v6i64_to_v12f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: 
v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v6i64_to_v12f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v6i64_to_v12f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s22, 
s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB27_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: s_branch .LBB27_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 3) + %a2 = bitcast <6 x i64> %a1 to <12 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f32_to_v24i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; 
implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: 
v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f32_to_v24i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 
v3, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v12, v24 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v24i16: ; VI: ; %bb.0: @@ -2182,7 +4923,7 @@ define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -2196,7 +4937,7 @@ define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: .LBB28_2: 
; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2207,7 +4948,7 @@ define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -2221,7 +4962,7 @@ define <24 x i16> @bitcast_v12f32_to_v24i16(<12 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: .LBB28_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2260,142 +5001,369 @@ end: ret <24 x i16> %phi } +define inreg <24 x i16> @bitcast_v12f32_to_v24i16_scalar(<12 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f32_to_v24i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s11, s27, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB29_4 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, 
s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 
v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f32_to_v24i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v24i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; 
GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v24i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 
+; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <24 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <24 x i16> + br label %end + +end: + %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i16> %phi +} + define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i16_to_v12f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v30, v10 -; GCN-NEXT: v_mov_b32_e32 v29, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v27, v4 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v25, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 
16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_4 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB15_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v35 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v2, v2, v24 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v7, v7, v15 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; 
implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: .LBB15_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v25 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v35, v1 -; GCN-NEXT: v_or_b32_e32 v2, v24, v2 -; GCN-NEXT: v_or_b32_e32 v3, v31, v3 -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: v_or_b32_e32 v6, v13, v6 -; 
GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i16_to_v12f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v25, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: 
s_cbranch_execnz .LBB30_4 +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB30_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v34 +; SI-NEXT: v_or_b32_e32 v5, v5, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_or_b32_e32 v7, v7, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v24 +; SI-NEXT: v_or_b32_e32 v9, v9, v17 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 +; SI-NEXT: v_or_b32_e32 v2, v36, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_or_b32_e32 v6, v32, v6 +; SI-NEXT: v_or_b32_e32 v7, v31, v7 +; SI-NEXT: v_or_b32_e32 v8, v24, v8 +; SI-NEXT: v_or_b32_e32 v9, v17, v9 +; SI-NEXT: v_or_b32_e32 v10, v15, v10 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 
v9, vcc, 0x30000, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v12f32: ; VI: ; %bb.0: @@ -2404,7 +5372,7 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 3 ; VI-NEXT: v_add_u16_e32 v12, 3, v11 @@ -2443,7 +5411,7 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v12, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v12, v0 -; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: .LBB30_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2454,7 +5422,7 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -2468,7 +5436,7 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_2: ; %end +; GFX9-NEXT: .LBB30_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2480,7 +5448,7 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: 
s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -2494,7 +5462,7 @@ define <12 x float> @bitcast_v24i16_to_v12f32(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB15_2: ; %end +; GFX11-NEXT: .LBB30_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2514,158 +5482,461 @@ end: ret <12 x float> %phi } +define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i16_to_v12f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v12, v8 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: v_mov_b32_e32 v16, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; 
SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v10, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 +; SI-NEXT: v_or_b32_e32 v11, v0, v17 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v20, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: 
s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v12 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v24i16_to_v12f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: 
s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, 
s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v24i16_to_v12f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_4 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_3: +; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 
+; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24i16_to_v12f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: s_branch .LBB31_2 +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i16> %a, splat (i16 3) + %a2 = bitcast <24 x i16> %a1 to 
<12 x float> + br label %end + +cmp.false: + %a3 = bitcast <24 x i16> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f32_to_v24f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v35, v11 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v9 -; GCN-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NEXT: v_mov_b32_e32 v31, v7 -; GCN-NEXT: v_mov_b32_e32 v30, v6 -; GCN-NEXT: v_mov_b32_e32 v29, v5 -; GCN-NEXT: v_mov_b32_e32 v28, v4 -; GCN-NEXT: v_mov_b32_e32 v27, v3 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v25, v1 -; GCN-NEXT: v_mov_b32_e32 v24, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_4 -; GCN-NEXT: .LBB16_2: 
; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB16_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; 
GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: .LBB16_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v32 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v33 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v34 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f32_to_v24f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v11 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v31, v7 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v5 +; SI-NEXT: v_mov_b32_e32 v28, v4 +; SI-NEXT: v_mov_b32_e32 v27, v3 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v25, v1 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_4 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB32_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 
16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: .LBB32_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_add_f32_e32 
v3, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f32_to_v24f16: ; VI: ; %bb.0: @@ -2674,7 +5945,7 @@ define <24 x 
half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -2688,7 +5959,7 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: .LBB32_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2699,7 +5970,7 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 ; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -2713,7 +5984,7 @@ define <24 x half> @bitcast_v12f32_to_v24f16(<12 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: .LBB32_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2752,195 +6023,457 @@ end: ret <24 x half> %phi } +define inreg <24 x half> @bitcast_v12f32_to_v24f16_scalar(<12 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f32_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 
+; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s27, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 +; SI-NEXT: v_cvt_f32_f16_e32 
v10, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: 
; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v12f32_to_v24f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v24f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, 
s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v24f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 
v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <24 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <24 x half> + br label %end + +end: + %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x half> %phi +} + define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f16_to_v12f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v12 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v29, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v22 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_4 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB17_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; GCN-NEXT: v_or_b32_e32 v0, v39, v0 -; GCN-NEXT: v_or_b32_e32 v1, v37, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v15 -; GCN-NEXT: v_or_b32_e32 v2, v32, v2 -; GCN-NEXT: v_or_b32_e32 v3, v30, v3 -; GCN-NEXT: v_or_b32_e32 v4, v28, v4 -; GCN-NEXT: v_or_b32_e32 v5, v26, v5 -; GCN-NEXT: v_or_b32_e32 v6, v25, v6 -; GCN-NEXT: v_or_b32_e32 v7, v24, v7 -; GCN-NEXT: v_or_b32_e32 v8, v16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v14, v9 -; GCN-NEXT: v_or_b32_e32 v10, v13, v10 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: 
$vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: .LBB17_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v24 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v21, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 
v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v20, v18 -; GCN-NEXT: v_or_b32_e32 v8, v16, v21 -; GCN-NEXT: v_or_b32_e32 v9, v14, v19 -; GCN-NEXT: v_or_b32_e32 v10, v13, v17 -; GCN-NEXT: v_or_b32_e32 v11, v12, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f16_to_v12f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v22 +; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_4 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB34_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v33, v4 +; SI-NEXT: v_or_b32_e32 v5, v31, v5 +; SI-NEXT: v_or_b32_e32 v6, v29, v6 +; SI-NEXT: v_or_b32_e32 v7, v27, v7 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: 
$vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: .LBB34_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; 
SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v25 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 
v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v12f32: ; VI: ; %bb.0: @@ -2949,7 +6482,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 0x200 ; VI-NEXT: v_add_f16_sdwa v13, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -2988,7 +6521,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v13 ; VI-NEXT: v_or_b32_e32 v0, v0, v12 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2999,7 +6532,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] @@ -3014,7 +6547,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3026,7 +6559,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] @@ -3040,7 +6573,7 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: .LBB34_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3060,25 +6593,365 @@ end: ret <12 x float> %phi } +define inreg <12 x float> @bitcast_v24f16_to_v12f32_scalar(<24 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f16_to_v12f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v8 +; 
SI-NEXT: v_cvt_f16_f32_e32 v27, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_or_b32_e32 v4, v24, v4 +; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v6, v22, v6 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_or_b32_e32 v8, v18, v8 +; SI-NEXT: v_or_b32_e32 v9, v16, v9 +; SI-NEXT: v_or_b32_e32 v10, v14, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_add_f32_e32 v1, 
0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v24 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; 
SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v24f16_to_v12f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 
v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; 
VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v12, v1 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_add_f16_sdwa v12, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v12 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v24f16_to_v12f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24f16_to_v12f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 
v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x half> %a, splat (half 0xH0200) + %a2 = bitcast <24 x half> %a1 to <12 x float> + br label %end + +cmp.false: + %a3 = bitcast <24 x half> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f64_to_v6i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f64_to_v6i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v6i64: ; VI: ; %bb.0: @@ -3087,7 +6960,7 @@ define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: s_cbranch_execz .LBB36_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -3095,7 +6968,7 @@ define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3106,7 +6979,7 @@ define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -3114,7 +6987,7 @@ define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: .LBB18_2: ; 
%end +; GFX9-NEXT: .LBB36_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3126,7 +6999,7 @@ define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -3134,7 +7007,7 @@ define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: .LBB36_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3154,31 +7027,190 @@ end: ret <6 x i64> %phi } +define inreg <6 x i64> @bitcast_v6f64_to_v6i64_scalar(<6 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f64_to_v6i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_3: +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: 
v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f64_to_v6i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_3: +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f64_to_v6i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 
v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_3: +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v6i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: 
v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i64_to_v6f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i64_to_v6f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v6f64: ; VI: ; %bb.0: @@ -3187,7 +7219,7 @@ define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -3201,7 +7233,7 @@ define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3212,7 +7244,7 @@ define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; 
GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -3226,7 +7258,7 @@ define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3238,7 +7270,7 @@ define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3255,7 +7287,7 @@ define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3275,86 +7307,247 @@ end: ret <6 x double> %phi } +define inreg <6 x double> @bitcast_v6i64_to_v6f64_scalar(<6 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i64_to_v6f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 
s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v6i64_to_v6f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: 
bitcast_v6i64_to_v6f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v6i64_to_v6f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_3 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 
+; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: .LBB39_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: s_branch .LBB39_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 3) + %a2 = bitcast <6 x i64> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} + define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f64_to_v24i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v34, v11 -; GCN-NEXT: v_mov_b32_e32 v33, v10 -; GCN-NEXT: v_mov_b32_e32 v32, v9 -; GCN-NEXT: v_mov_b32_e32 v31, v8 -; GCN-NEXT: v_mov_b32_e32 v30, v7 -; GCN-NEXT: v_mov_b32_e32 v29, v6 -; GCN-NEXT: v_mov_b32_e32 v28, v5 -; GCN-NEXT: v_mov_b32_e32 v27, v4 -; GCN-NEXT: v_mov_b32_e32 v26, v3 -; GCN-NEXT: v_mov_b32_e32 v25, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: 
$vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v34, v33, 16 -; GCN-NEXT: v_alignbit_b32 v17, v32, v31, 16 -; GCN-NEXT: v_alignbit_b32 v13, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v9, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v5, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; GCN-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 -; GCN-NEXT: v_alignbit_b32 v21, v34, v33, 16 -; GCN-NEXT: v_alignbit_b32 v17, v32, v31, 16 -; GCN-NEXT: v_alignbit_b32 v13, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v9, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v5, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v25 -; GCN-NEXT: v_mov_b32_e32 v6, v26 -; GCN-NEXT: v_mov_b32_e32 v8, v27 -; GCN-NEXT: v_mov_b32_e32 v10, v28 -; GCN-NEXT: v_mov_b32_e32 v12, v29 -; GCN-NEXT: v_mov_b32_e32 v14, v30 -; GCN-NEXT: 
v_mov_b32_e32 v16, v31 -; GCN-NEXT: v_mov_b32_e32 v18, v32 -; GCN-NEXT: v_mov_b32_e32 v20, v33 -; GCN-NEXT: v_mov_b32_e32 v22, v34 -; GCN-NEXT: v_mov_b32_e32 v1, v24 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f64_to_v24i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v30, v7 +; SI-NEXT: v_mov_b32_e32 v29, v6 +; SI-NEXT: v_mov_b32_e32 v28, v5 +; SI-NEXT: v_mov_b32_e32 v27, v4 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v17, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v13, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v9, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v5, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: 
v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_alignbit_b32 v21, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v17, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v13, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v9, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v5, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v4, v25 +; SI-NEXT: v_mov_b32_e32 v6, v26 +; SI-NEXT: v_mov_b32_e32 v8, v27 +; SI-NEXT: v_mov_b32_e32 v10, v28 +; SI-NEXT: v_mov_b32_e32 v12, v29 +; SI-NEXT: v_mov_b32_e32 v14, v30 +; SI-NEXT: v_mov_b32_e32 v16, v31 +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: v_mov_b32_e32 v20, v33 +; SI-NEXT: v_mov_b32_e32 v22, v34 +; SI-NEXT: v_mov_b32_e32 v1, v24 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24i16: ; VI: ; %bb.0: @@ -3363,7 +7556,7 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -3371,7 +7564,7 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: .LBB40_2: ; %end ; 
VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3382,7 +7575,7 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -3390,7 +7583,7 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: .LBB40_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3402,7 +7595,7 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -3410,7 +7603,7 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: .LBB40_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3430,144 +7623,360 @@ end: ret <24 x i16> %phi } +define inreg <24 x i16> @bitcast_v6f64_to_v24i16_scalar(<6 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f64_to_v24i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: 
s_cbranch_scc0 .LBB41_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v24, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v25, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v27, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v28, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v29, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s11, s27, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_4 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[24:25], 1.0 +; SI-NEXT: v_alignbit_b32 v24, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v25, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v27, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v28, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v29, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: s_branch .LBB41_5 +; SI-NEXT: .LBB41_3: +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: 
$sgpr10 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: s_branch .LBB41_2 +; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v21, s27 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: .LBB41_5: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v14, v13 +; SI-NEXT: v_mov_b32_e32 v18, v17 +; SI-NEXT: v_mov_b32_e32 v22, v21 +; SI-NEXT: v_mov_b32_e32 v1, v29 +; SI-NEXT: v_mov_b32_e32 v5, v28 +; SI-NEXT: v_mov_b32_e32 v9, v27 +; SI-NEXT: v_mov_b32_e32 v13, v26 +; SI-NEXT: v_mov_b32_e32 v17, v25 +; SI-NEXT: v_mov_b32_e32 v21, v24 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f64_to_v24i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: +; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: .LBB41_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; 
VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f64_to_v24i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: +; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v24i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: 
s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <24 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <24 x i16> + br label %end + +end: + %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i16> %phi +} + define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i16_to_v6f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v32, v14 -; GCN-NEXT: v_mov_b32_e32 v31, v12 -; GCN-NEXT: v_mov_b32_e32 v30, v10 -; GCN-NEXT: v_mov_b32_e32 v29, v8 -; GCN-NEXT: 
v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v4 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v27, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v38 -; GCN-NEXT: v_or_b32_e32 v1, v1, v39 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v2, v2, v24 -; GCN-NEXT: v_or_b32_e32 v3, v3, v33 -; GCN-NEXT: v_or_b32_e32 v4, v4, v34 -; GCN-NEXT: v_or_b32_e32 v5, v5, v35 -; GCN-NEXT: v_or_b32_e32 v6, v6, v36 -; 
GCN-NEXT: v_or_b32_e32 v7, v7, v37 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v27 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 
0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_or_b32_e32 v2, v24, v2 -; GCN-NEXT: v_or_b32_e32 v3, v33, v3 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_or_b32_e32 v5, v35, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v37, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i16_to_v6f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v31, v12 +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 
16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v34 +; SI-NEXT: v_or_b32_e32 v8, v8, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v17 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; 
implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 
v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v34, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v19, v10 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v6f64: ; VI: ; %bb.0: @@ -3576,7 +7985,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 3 ; VI-NEXT: v_add_u16_e32 v12, 3, v11 @@ -3615,7 +8024,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v12, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v12, v0 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3626,7 +8035,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: 
v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -3640,7 +8049,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: .LBB42_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3652,7 +8061,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -3666,7 +8075,7 @@ define <6 x double> @bitcast_v24i16_to_v6f64(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: .LBB42_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3686,142 +8095,451 @@ end: ret <6 x double> %phi } +define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i16_to_v6f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; 
SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v10, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v24 +; SI-NEXT: v_or_b32_e32 v11, v0, v21 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: 
s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; 
SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v24i16_to_v6f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: 
s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v24i16_to_v6f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 
3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24i16_to_v6f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; 
GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i16> %a, splat (i16 3) + %a2 = bitcast <24 x i16> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <24 x i16> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} + define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f64_to_v24f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; 
implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; 
implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, 
exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v29 -; GCN-NEXT: v_mov_b32_e32 v1, v35 -; GCN-NEXT: v_mov_b32_e32 v2, v24 -; GCN-NEXT: v_mov_b32_e32 v3, v34 -; GCN-NEXT: v_mov_b32_e32 v4, v25 -; GCN-NEXT: v_mov_b32_e32 v5, v33 -; GCN-NEXT: v_mov_b32_e32 v6, v26 -; GCN-NEXT: v_mov_b32_e32 v7, v32 -; GCN-NEXT: v_mov_b32_e32 v8, v27 -; GCN-NEXT: v_mov_b32_e32 v9, v31 -; GCN-NEXT: v_mov_b32_e32 v10, v28 -; GCN-NEXT: v_mov_b32_e32 v11, v30 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f64_to_v24f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 
16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: 
v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v1, v35 +; SI-NEXT: v_mov_b32_e32 v2, v33 +; SI-NEXT: v_mov_b32_e32 v3, v32 +; SI-NEXT: v_mov_b32_e32 v4, v31 +; SI-NEXT: v_mov_b32_e32 v5, v29 +; SI-NEXT: v_mov_b32_e32 v6, v30 +; SI-NEXT: v_mov_b32_e32 v7, v27 +; SI-NEXT: v_mov_b32_e32 v8, v28 +; SI-NEXT: v_mov_b32_e32 v9, v24 +; SI-NEXT: v_mov_b32_e32 v10, v26 +; SI-NEXT: v_mov_b32_e32 v11, v25 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24f16: ; VI: ; %bb.0: @@ -3830,7 +8548,7 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; 
VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -3838,7 +8556,7 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: .LBB44_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3849,7 +8567,7 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -3857,28 +8575,266 @@ define <24 x half> @bitcast_v6f64_to_v24f16(<6 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: .LBB44_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v6f64_to_v24f16: +; GFX11-LABEL: bitcast_v6f64_to_v24f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: 
.LBB44_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <24 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <24 x half> + br label %end + +end: + %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x half> %phi +} + +define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f64_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: 
v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; 
implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v6f64_to_v24f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_4 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_3: +; VI-NEXT: s_branch .LBB45_2 +; VI-NEXT: .LBB45_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: 
v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f64_to_v24f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_4 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_3: +; GFX9-NEXT: s_branch .LBB45_2 +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v24f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; 
GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB22_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -3898,194 +8854,194 @@ end: } define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f16_to_v6f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 
v48, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v22 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v48 -; GCN-NEXT: v_or_b32_e32 v0, v49, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; GCN-NEXT: v_or_b32_e32 v2, v34, v2 -; 
GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v30, v4 -; GCN-NEXT: v_or_b32_e32 v5, v28, v5 -; GCN-NEXT: v_or_b32_e32 v6, v27, v6 -; GCN-NEXT: v_or_b32_e32 v7, v25, v7 -; GCN-NEXT: v_or_b32_e32 v8, v24, v8 -; GCN-NEXT: v_or_b32_e32 v9, v18, v9 -; GCN-NEXT: v_or_b32_e32 v10, v17, v10 -; GCN-NEXT: v_or_b32_e32 v11, v16, v11 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, 
v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 
v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v18, v20 -; GCN-NEXT: v_or_b32_e32 v10, v17, v21 -; GCN-NEXT: v_or_b32_e32 v11, v16, v19 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f16_to_v6f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 +; SI-NEXT: 
v_cvt_f16_f32_e32 v32, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_or_b32_e32 v6, v31, v6 +; SI-NEXT: v_or_b32_e32 v7, v29, v7 +; SI-NEXT: v_or_b32_e32 v8, v27, v8 +; SI-NEXT: v_or_b32_e32 v9, v25, v9 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: ; implicit-def: $vgpr52 +; 
SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: 
v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v31 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: 
v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v6f64: ; VI: ; %bb.0: @@ -4094,7 +9050,7 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 0x200 ; VI-NEXT: v_add_f16_sdwa v13, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4133,7 +9089,7 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v13 ; VI-NEXT: v_or_b32_e32 v0, v0, v12 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4144,7 +9100,7 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: 
s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] @@ -4159,7 +9115,7 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4171,7 +9127,7 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] @@ -4185,7 +9141,7 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end +; GFX11-NEXT: .LBB46_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4205,82 +9161,432 @@ end: ret <6 x double> %phi } +define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f16_to_v6f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v39, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s20 +; SI-NEXT: v_cvt_f16_f32_e32 
v33, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v28, v4 +; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v24, v7 +; SI-NEXT: v_or_b32_e32 v8, v22, v8 +; SI-NEXT: v_or_b32_e32 v9, v20, v9 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; 
SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: 
v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v24f16_to_v6f64_scalar: +; VI: ; 
%bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 
+; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v12, v1 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_add_f16_sdwa v12, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v12 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v24f16_to_v6f64_scalar: +; GFX9: ; %bb.0: 
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24f16_to_v6f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; 
GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x half> %a, splat (half 0xH0200) + %a2 = bitcast <24 x half> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <24 x half> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} + define <24 x i16> 
@bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i64_to_v24i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; 
GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v24, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v12, v24 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i64_to_v24i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v24, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 
+; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: 
v_mov_b32_e32 v12, v24 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v24i16: ; VI: ; %bb.0: @@ -4289,7 +9595,7 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc @@ -4303,7 +9609,7 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: .LBB48_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4314,7 +9620,7 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc @@ -4328,7 +9634,7 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: .LBB48_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4340,7 +9646,7 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: s_cbranch_execz .LBB48_2 ; 
GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -4357,7 +9663,7 @@ define <24 x i16> @bitcast_v6i64_to_v24i16(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: .LBB48_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4377,144 +9683,360 @@ end: ret <24 x i16> %phi } +define inreg <24 x i16> @bitcast_v6i64_to_v24i16_scalar(<6 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i64_to_v24i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, 
s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: s_lshr_b32 s7, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s8 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s7 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v23, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v6i64_to_v24i16_scalar: +; 
VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v6i64_to_v24i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-LABEL: bitcast_v6i64_to_v24i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-NEXT: .LBB49_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB49_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_4: +; GFX11-NEXT: s_branch .LBB49_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 
3) + %a2 = bitcast <6 x i64> %a1 to <24 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <24 x i16> + br label %end + +end: + %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i16> %phi +} + define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i16_to_v6i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v32, v14 -; GCN-NEXT: v_mov_b32_e32 v31, v12 -; GCN-NEXT: v_mov_b32_e32 v30, v10 -; GCN-NEXT: v_mov_b32_e32 v29, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v6 -; GCN-NEXT: v_mov_b32_e32 v25, v4 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v27, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v38 -; GCN-NEXT: v_or_b32_e32 v1, v1, v39 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; GCN-NEXT: 
v_and_b32_e32 v3, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v2, v2, v24 -; GCN-NEXT: v_or_b32_e32 v3, v3, v33 -; GCN-NEXT: v_or_b32_e32 v4, v4, v34 -; GCN-NEXT: v_or_b32_e32 v5, v5, v35 -; GCN-NEXT: v_or_b32_e32 v6, v6, v36 -; GCN-NEXT: v_or_b32_e32 v7, v7, v37 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v27 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 -; GCN-NEXT: 
v_add_i32_e32 v4, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_or_b32_e32 v2, v24, v2 -; GCN-NEXT: v_or_b32_e32 v3, v33, v3 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_or_b32_e32 v5, v35, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v37, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i16_to_v6i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v31, v12 +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v27, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v39 +; SI-NEXT: v_or_b32_e32 
v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v37 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v34 +; SI-NEXT: v_or_b32_e32 v8, v8, v33 +; SI-NEXT: v_or_b32_e32 v9, v9, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_or_b32_e32 v11, v11, v17 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 
0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v34, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v19, v10 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v6i64: ; VI: ; %bb.0: @@ -4523,7 +10045,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 3 ; VI-NEXT: v_add_u16_e32 v12, 3, v11 @@ -4562,7 +10084,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v12, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v13 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v12, v0 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4573,7 +10095,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -4587,7 +10109,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4599,7 +10121,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -4613,7 +10135,7 @@ define <6 x i64> @bitcast_v24i16_to_v6i64(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end +; GFX11-NEXT: .LBB50_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4633,158 +10155,467 @@ end: ret <6 x i64> %phi } +define inreg 
<6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i16_to_v6i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v17, v6 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v19, v2 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v10, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v24 +; SI-NEXT: v_or_b32_e32 v11, v0, v21 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; 
SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 
s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v24i16_to_v6i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, 
s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v24i16_to_v6i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: 
s_cbranch_execnz .LBB51_4 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_3: +; GFX9-NEXT: s_branch .LBB51_2 +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24i16_to_v6i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v11, 
s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_3: +; GFX11-NEXT: s_branch .LBB51_2 +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i16> %a, splat (i16 3) + %a2 = bitcast <24 x i16> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <24 x i16> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i64_to_v24f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v11 -; GCN-NEXT: v_mov_b32_e32 v25, v10 -; GCN-NEXT: v_mov_b32_e32 v28, v9 -; GCN-NEXT: v_mov_b32_e32 v27, v8 -; GCN-NEXT: 
v_mov_b32_e32 v30, v7 -; GCN-NEXT: v_mov_b32_e32 v29, v6 -; GCN-NEXT: v_mov_b32_e32 v32, v5 -; GCN-NEXT: v_mov_b32_e32 v31, v4 -; GCN-NEXT: v_mov_b32_e32 v34, v3 -; GCN-NEXT: v_mov_b32_e32 v33, v2 -; GCN-NEXT: v_mov_b32_e32 v35, v1 -; GCN-NEXT: v_mov_b32_e32 v24, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB26_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB26_4 -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB26_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 
16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: .LBB26_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v35, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v34, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v31 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v32, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v30, vcc -; GCN-NEXT: 
v_add_i32_e32 v8, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v26, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i64_to_v24f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v27, v9 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v29, v7 +; SI-NEXT: 
v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v31, v5 +; SI-NEXT: v_mov_b32_e32 v30, v4 +; SI-NEXT: v_mov_b32_e32 v33, v3 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v35, v1 +; SI-NEXT: v_mov_b32_e32 v34, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_4 +; SI-NEXT: .LBB52_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB52_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: .LBB52_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v35, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v33, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v31, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v24 +; SI-NEXT: 
v_addc_u32_e32 v20, vcc, 0, v25, vcc +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i64_to_v24f16: ; VI: ; %bb.0: @@ -4793,7 +10624,7 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, 
vcc @@ -4807,7 +10638,7 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4818,7 +10649,7 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc @@ -4832,7 +10663,7 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4844,7 +10675,7 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -4861,7 +10692,7 @@ define <24 x half> @bitcast_v6i64_to_v24f16(<6 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4881,195 +10712,441 @@ end: ret <24 x half> %phi } +define inreg <24 x half> @bitcast_v6i64_to_v24f16_scalar(<6 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i64_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_add_u32 
s8, s18, 3 +; SI-NEXT: s_addc_u32 s9, s19, 0 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_lshr_b32 s11, s9, 16 +; SI-NEXT: s_add_u32 s12, s20, 3 +; SI-NEXT: s_addc_u32 s13, s21, 0 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: s_lshr_b32 s15, s13, 16 +; SI-NEXT: s_add_u32 s16, s22, 3 +; SI-NEXT: s_addc_u32 s17, s23, 0 +; SI-NEXT: s_lshr_b32 s18, s16, 16 +; SI-NEXT: s_lshr_b32 s19, s17, 16 +; SI-NEXT: s_add_u32 s20, s24, 3 +; SI-NEXT: s_addc_u32 s21, s25, 0 +; SI-NEXT: s_lshr_b32 s22, s20, 16 +; SI-NEXT: s_lshr_b32 s23, s21, 16 +; SI-NEXT: s_add_u32 s24, s26, 3 +; SI-NEXT: s_addc_u32 s25, s27, 0 +; SI-NEXT: s_lshr_b32 s26, s24, 16 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 
+; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v6i64_to_v24f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v6i64_to_v24f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; 
GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-LABEL: bitcast_v6i64_to_v24f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB53_3: ; %end +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 3) + %a2 = bitcast <6 x i64> %a1 to <24 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <24 x half> + br label %end + +end: + %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x half> %phi +} + define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f16_to_v6i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v23 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v16, v22 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_4 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB27_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v48 -; GCN-NEXT: v_or_b32_e32 v0, v49, v0 -; GCN-NEXT: v_or_b32_e32 v1, v39, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; GCN-NEXT: v_or_b32_e32 v2, v34, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v30, v4 -; GCN-NEXT: v_or_b32_e32 v5, v28, v5 -; GCN-NEXT: v_or_b32_e32 v6, v27, v6 -; GCN-NEXT: v_or_b32_e32 v7, v25, v7 -; GCN-NEXT: v_or_b32_e32 v8, v24, v8 -; GCN-NEXT: v_or_b32_e32 v9, v18, v9 -; GCN-NEXT: v_or_b32_e32 v10, v17, v10 -; GCN-NEXT: v_or_b32_e32 v11, v16, v11 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; 
implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: .LBB27_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; 
GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: 
v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v18, v20 -; GCN-NEXT: v_or_b32_e32 v10, v17, v21 -; GCN-NEXT: v_or_b32_e32 v11, v16, v19 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f16_to_v6i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_4 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB54_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_or_b32_e32 v6, v31, v6 +; SI-NEXT: v_or_b32_e32 v7, v29, v7 +; SI-NEXT: v_or_b32_e32 v8, v27, v8 +; SI-NEXT: v_or_b32_e32 v9, v25, v9 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; 
SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: .LBB54_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v31 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v25 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 
v11, v13, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v6i64: ; VI: ; %bb.0: @@ -5078,7 +11155,7 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 0x200 ; VI-NEXT: v_add_f16_sdwa v13, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5117,7 +11194,7 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v13 ; VI-NEXT: v_or_b32_e32 v0, v0, v12 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5128,7 +11205,7 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] @@ -5143,7 +11220,7 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5155,7 +11232,7 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; 
GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] @@ -5169,7 +11246,7 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5189,170 +11266,520 @@ end: ret <6 x i64> %phi } +define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f16_to_v6i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v39, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; 
SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v28, v4 +; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v24, v7 +; SI-NEXT: v_or_b32_e32 v8, v22, v8 +; SI-NEXT: v_or_b32_e32 v9, v20, v9 +; SI-NEXT: v_or_b32_e32 v10, v18, v10 +; SI-NEXT: v_or_b32_e32 v11, v16, v11 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v22 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: 
v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v24f16_to_v6i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; 
VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 
v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v12, v1 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_add_f16_sdwa v12, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v12 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v24f16_to_v6i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 
v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24f16_to_v6i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: 
v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x half> %a, splat (half 0xH0200) + %a2 = bitcast <24 x half> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <24 x half> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i16_to_v24f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v55, v23 -; GCN-NEXT: v_mov_b32_e32 v54, v22 -; GCN-NEXT: v_mov_b32_e32 v53, v21 -; GCN-NEXT: v_mov_b32_e32 v52, v20 -; GCN-NEXT: v_mov_b32_e32 v51, v19 -; GCN-NEXT: v_mov_b32_e32 v50, v18 -; GCN-NEXT: v_mov_b32_e32 v49, v17 -; GCN-NEXT: v_mov_b32_e32 v48, v16 -; GCN-NEXT: v_mov_b32_e32 v39, v15 -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v13 -; GCN-NEXT: v_mov_b32_e32 v36, v12 -; GCN-NEXT: v_mov_b32_e32 v35, v11 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v9 -; GCN-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NEXT: 
v_mov_b32_e32 v31, v7 -; GCN-NEXT: v_mov_b32_e32 v30, v6 -; GCN-NEXT: v_mov_b32_e32 v29, v5 -; GCN-NEXT: v_mov_b32_e32 v28, v4 -; GCN-NEXT: v_mov_b32_e32 v27, v3 -; GCN-NEXT: v_mov_b32_e32 v26, v2 -; GCN-NEXT: v_mov_b32_e32 v25, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v40, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v38 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v15, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, 
v34 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i16_to_v24f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v23 +; SI-NEXT: v_mov_b32_e32 v54, v22 +; SI-NEXT: v_mov_b32_e32 v53, v21 +; SI-NEXT: v_mov_b32_e32 v52, v20 +; SI-NEXT: v_mov_b32_e32 v51, v19 +; SI-NEXT: v_mov_b32_e32 v50, v18 +; SI-NEXT: 
v_mov_b32_e32 v49, v17 +; SI-NEXT: v_mov_b32_e32 v48, v16 +; SI-NEXT: v_mov_b32_e32 v39, v15 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v13 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v35, v11 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v31, v7 +; SI-NEXT: v_mov_b32_e32 v30, v6 +; SI-NEXT: v_mov_b32_e32 v29, v5 +; SI-NEXT: v_mov_b32_e32 v28, v4 +; SI-NEXT: v_mov_b32_e32 v27, v3 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v25, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 +; 
SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v48 +; SI-NEXT: 
v_add_i32_e32 v15, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i16_to_v24f16: ; VI: ; %bb.0: @@ -5361,7 +11788,7 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v12, 3 ; VI-NEXT: v_add_u16_sdwa v19, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5400,7 +11827,7 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v15 ; VI-NEXT: v_or_b32_e32 v1, v1, v14 ; VI-NEXT: v_or_b32_e32 v0, v0, v13 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5411,7 +11838,7 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -5425,7 +11852,7 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5437,7 +11864,7 @@ define <24 x half> @bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] @@ -5451,7 +11878,7 @@ define <24 x half> 
@bitcast_v24i16_to_v24f16(<24 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: .LBB56_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5471,145 +11898,449 @@ end: ret <24 x half> %phi } +define inreg <24 x half> @bitcast_v24i16_to_v24f16_scalar(<24 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i16_to_v24f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v30, v9 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v27, v6 +; SI-NEXT: v_mov_b32_e32 v26, v5 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_mov_b32_e32 v24, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 +; SI-NEXT: 
v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v31 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: .LBB57_3: ; %end 
+; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v24i16_to_v24f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; 
VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s27, s26, 0x30000 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; 
VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v24i16_to_v24f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24i16_to_v24f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: 
s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_3: +; GFX11-NEXT: s_branch .LBB57_2 +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i16> %a, splat (i16 3) + %a2 = bitcast <24 x i16> %a1 to <24 x half> + br label %end + +cmp.false: + %a3 = bitcast <24 x i16> %a to <24 x half> + br label %end + +end: + %phi = phi <24 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x half> %phi +} + define <24 x i16> @bitcast_v24f16_to_v24i16(<24 
x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f16_to_v24i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, 
v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v6, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: 
v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f16_to_v24i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; 
SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: 
v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_or_b32_e32 v18, v18, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f16_to_v24i16: ; VI: ; %bb.0: @@ -5618,7 +12349,7 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 @@ -5657,7 +12388,7 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v15, v2 ; VI-NEXT: v_or_b32_e32 v1, v14, v1 ; VI-NEXT: v_or_b32_e32 v0, v12, v0 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 
exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5668,7 +12399,7 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] @@ -5683,7 +12414,7 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5695,7 +12426,7 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] @@ -5709,7 +12440,7 @@ define <24 x i16> @bitcast_v24f16_to_v24i16(<24 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: .LBB58_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5728,3 +12459,347 @@ end: %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <24 x i16> %phi } + +define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v24f16_to_v24i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v9 +; SI-NEXT: v_mov_b32_e32 v12, v8 +; SI-NEXT: v_mov_b32_e32 v13, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: 
v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_or_b32_e32 v18, v18, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 
v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v24f16_to_v24i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: 
s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s5, s26, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s5, s27, 16 +; VI-NEXT: v_add_f16_e32 v1, s26, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_lshr_b32 s5, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s25, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s24, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s23, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s22, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s21, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v3, s27, v0 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s20, v0 
+; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: v_or_b32_e32 v11, v3, v4 +; VI-NEXT: v_or_b32_e32 v4, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s19, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_add_f16_e32 v1, s18, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_add_f16_sdwa v13, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v12, s16, v0 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s17, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v12, v13 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v24f16_to_v24i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24f16_to_v24i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; 
GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x half> %a, splat (half 0xH0200) + %a2 = bitcast <24 x half> %a1 to <24 x i16> + br label %end + +cmp.false: + %a3 = bitcast <24 x half> %a to <24 x i16> + br label %end + +end: + %phi = phi <24 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll index 4b2b9560e5927..eaf314d4b65dc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll @@ -1,37 +1,38 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <14 x float> @bitcast_v14i32_to_v14f32(<14 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i32_to_v14f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: 
v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i32_to_v14f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i32_to_v14f32: ; VI: ; %bb.0: @@ -131,33 +132,213 @@ end: ret <14 x float> %phi } +define inreg <14 x float> @bitcast_v14i32_to_v14f32_scalar(<14 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i32_to_v14f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s29, s29, 3 +; 
SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v14i32_to_v14f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 
v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v14i32_to_v14f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v14i32_to_v14f32_scalar: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i32> %a, splat (i32 3) + %a2 = bitcast <14 x i32> %a1 to <14 x float> + br label %end + +cmp.false: + %a3 = bitcast <14 x i32> %a to <14 x float> + br label %end + +end: + %phi = phi <14 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x float> %phi +} + define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f32_to_v14i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f32_to_v14i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v14i32: ; VI: ; %bb.0: @@ -166,7 +347,7 @@ define <14 x i32> 
@bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -182,7 +363,7 @@ define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -193,7 +374,7 @@ define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -209,7 +390,7 @@ define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -221,7 +402,7 @@ define <14 x i32> @bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 ; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 @@ -230,7 +411,7 @@ define <14 x i32> 
@bitcast_v14f32_to_v14i32(<14 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -250,33 +431,227 @@ end: ret <14 x i32> %phi } +define inreg <14 x i32> @bitcast_v14f32_to_v14i32_scalar(<14 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f32_to_v14i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: 
v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f32_to_v14i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f32_to_v14i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; 
%bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v12, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f32_to_v14i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, 
s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <14 x float> %a1 to <14 x i32> + br label %end + +cmp.false: + %a3 = bitcast <14 x float> %a to <14 x i32> + br label %end + +end: + %phi = phi <14 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i32> %phi +} + define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i32_to_v7i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; 
GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i32_to_v7i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i32_to_v7i64: ; VI: ; %bb.0: @@ -285,7 +660,7 @@ define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 @@ -301,7 +676,7 @@ define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -312,7 +687,7 @@ define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 ; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 @@ -328,7 +703,7 @@ define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -340,7 +715,7 @@ define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 ; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 @@ -356,7 +731,7 @@ define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -376,33 +751,213 @@ end: ret <7 x i64> %phi } +define inreg <7 x i64> @bitcast_v14i32_to_v7i64_scalar(<14 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i32_to_v7i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v14i32_to_v7i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s29, s29, 3 +; 
VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v14i32_to_v7i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; 
GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v14i32_to_v7i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: 
.LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i32> %a, splat (i32 3) + %a2 = bitcast <14 x i32> %a1 to <7 x i64> + br label %end + +cmp.false: + %a3 = bitcast <14 x i32> %a to <7 x i64> + br label %end + +end: + %phi = phi <7 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i64> %phi +} + define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i64_to_v14i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i64_to_v14i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 
0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v14i32: ; VI: ; %bb.0: @@ -411,7 +966,7 @@ define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc @@ -427,7 +982,7 @@ define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -438,7 +993,7 @@ define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc @@ -454,7 +1009,7 @@ define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -466,7 +1021,7 @@ define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -486,7 +1041,7 @@ define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -506,33 +1061,213 @@ end: ret <14 x i32> %phi } +define inreg <14 x i32> @bitcast_v7i64_to_v14i32_scalar(<7 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i64_to_v14i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; 
SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v7i64_to_v14i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: 
v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v7i64_to_v14i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v7i64_to_v14i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: 
s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i64> %a, splat (i64 3) + %a2 = bitcast <7 x i64> %a1 to <14 x i32> + br label %end + +cmp.false: + %a3 = bitcast <7 x i64> %a to <14 x i32> + br label %end + +end: + %phi = phi <14 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i32> %phi +} + define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i32_to_v7f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 
-; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i32_to_v7f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i32_to_v7f64: ; VI: ; %bb.0: @@ -541,7 +1276,7 @@ define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; 
VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 @@ -557,7 +1292,7 @@ define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -568,7 +1303,7 @@ define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 ; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 @@ -584,7 +1319,7 @@ define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -596,7 +1331,7 @@ define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 ; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 @@ -612,7 +1347,7 @@ define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp 
eq i32 %b, 0 @@ -632,26 +1367,206 @@ end: ret <7 x double> %phi } +define inreg <7 x double> @bitcast_v14i32_to_v7f64_scalar(<14 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i32_to_v7f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v14i32_to_v7f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 
+; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v14i32_to_v7f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; 
GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v14i32_to_v7f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br 
i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i32> %a, splat (i32 3) + %a2 = bitcast <14 x i32> %a1 to <7 x double> + br label %end + +cmp.false: + %a3 = bitcast <14 x i32> %a to <7 x double> + br label %end + +end: + %phi = phi <7 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x double> %phi +} + define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f64_to_v14i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f64_to_v14i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v14i32: ; VI: ; %bb.0: @@ -660,7 
+1575,7 @@ define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -669,7 +1584,7 @@ define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -680,7 +1595,7 @@ define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -689,7 +1604,7 @@ define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -701,7 +1616,7 @@ define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -710,7 +1625,7 @@ 
define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -730,92 +1645,258 @@ end: ret <14 x i32> %phi } +define inreg <14 x i32> @bitcast_v7f64_to_v14i32_scalar(<7 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f64_to_v14i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB11_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_4 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_3: +; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f64_to_v14i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f64_to_v14i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; 
GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f64_to_v14i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: 
v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <7 x double> %a1 to <14 x i32> + br label %end + +cmp.false: + %a3 = bitcast <7 x double> %a to <14 x i32> + br label %end + +end: + %phi = phi <14 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i32> %phi +} + define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i32_to_v28i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v13 -; GCN-NEXT: v_mov_b32_e32 v24, v12 -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: 
v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v14, v28 -; 
GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i32_to_v28i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; 
SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i32_to_v28i16: ; VI: ; %bb.0: @@ -824,7 +1905,7 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 @@ -840,7 +1921,7 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; 
VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: .LBB12_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -851,7 +1932,7 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 ; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 @@ -867,7 +1948,7 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: .LBB12_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -879,7 +1960,7 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 ; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 @@ -895,7 +1976,7 @@ define <28 x i16> @bitcast_v14i32_to_v28i16(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB6_2: ; %end +; GFX11-NEXT: .LBB12_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -915,183 +1996,426 @@ end: ret <28 x i16> %phi } -define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i16_to_v14i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v36, v14 -; 
GCN-NEXT: v_mov_b32_e32 v35, v12 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v32, v6 -; GCN-NEXT: v_mov_b32_e32 v31, v4 -; GCN-NEXT: v_mov_b32_e32 v30, v2 -; GCN-NEXT: v_mov_b32_e32 v29, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 
v13, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v28 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_or_b32_e32 v5, v5, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_or_b32_e32 v7, v7, v51 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v29 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 
3, v34 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v0, v28, v0 -; GCN-NEXT: v_or_b32_e32 v1, v37, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_or_b32_e32 v3, v39, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_or_b32_e32 v6, v50, v6 -; GCN-NEXT: v_or_b32_e32 v7, v51, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 
v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <28 x i16> @bitcast_v14i32_to_v28i16_scalar(<14 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i32_to_v28i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s27, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; 
SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s27, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s12 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s8 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: v_mov_b32_e32 v27, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr6 +; 
SI-NEXT: s_branch .LBB13_2 ; -; VI-LABEL: bitcast_v28i16_to_v14i32: +; VI-LABEL: bitcast_v14i32_to_v28i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v15, 3 -; VI-NEXT: v_add_u16_e32 v14, 3, v13 -; VI-NEXT: v_add_u16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v14, v13 -; VI-NEXT: v_add_u16_e32 v14, 3, v12 -; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v14, v12 -; VI-NEXT: v_add_u16_e32 v14, 3, v11 -; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v14, v11 -; VI-NEXT: v_add_u16_e32 v14, 3, v10 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: 
v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v14i32_to_v28i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v14i32_to_v28i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 
0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i32> %a, splat (i32 3) + %a2 = bitcast <14 x i32> %a1 to <28 x i16> + br label %end + +cmp.false: + %a3 = bitcast <14 x i32> %a to <28 x i16> + br label %end + +end: + %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i16> %phi +} + +define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v28i16_to_v14i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 
v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; 
SI-NEXT: v_or_b32_e32 v2, v2, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; 
SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v28, v10 +; SI-NEXT: v_or_b32_e32 v11, v21, v11 +; SI-NEXT: v_or_b32_e32 v12, v19, v12 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; 
+; VI-LABEL: bitcast_v28i16_to_v14i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v15, 3 +; VI-NEXT: v_add_u16_e32 v14, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_u16_e32 v14, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_add_u16_e32 v14, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v14, v11 +; VI-NEXT: v_add_u16_e32 v14, 3, v10 ; VI-NEXT: v_add_u16_sdwa v10, v10, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v10, v14, v10 ; VI-NEXT: v_add_u16_e32 v14, 3, v9 @@ -1124,7 +2448,7 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v14, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v14, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1135,7 +2459,7 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 
op_sel_hi:[1,0] @@ -1151,7 +2475,7 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: .LBB14_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1163,7 +2487,7 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -1179,7 +2503,7 @@ define <14 x i32> @bitcast_v28i16_to_v14i32(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1199,180 +2523,523 @@ end: ret <14 x i32> %phi } +define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i16_to_v14i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v16, v12 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 +; SI-NEXT: 
v_lshlrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v12, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: v_or_b32_e32 v13, v0, v23 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 
+; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 
s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v28i16_to_v14i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: 
s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: 
v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v28i16_to_v14i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; 
GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28i16_to_v14i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: 
v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i16> %a, splat (i16 3) + %a2 = bitcast <28 x i16> %a1 to <14 x i32> + br label %end + +cmp.false: + %a3 = bitcast <28 x i16> %a to <14 x i32> + br label %end + +end: + %phi = phi <14 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i32> %phi +} + define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i32_to_v28f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v29, v13 -; GCN-NEXT: v_mov_b32_e32 v30, v12 -; GCN-NEXT: v_mov_b32_e32 v31, v11 -; GCN-NEXT: v_mov_b32_e32 v32, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v9 -; GCN-NEXT: v_mov_b32_e32 v34, v8 -; GCN-NEXT: v_mov_b32_e32 v35, v7 -; GCN-NEXT: v_mov_b32_e32 v36, v6 -; GCN-NEXT: v_mov_b32_e32 v37, v5 -; GCN-NEXT: v_mov_b32_e32 v38, v4 -; GCN-NEXT: v_mov_b32_e32 v39, v3 -; GCN-NEXT: v_mov_b32_e32 v48, v2 -; GCN-NEXT: v_mov_b32_e32 v49, v1 -; GCN-NEXT: v_mov_b32_e32 v28, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: 
$vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v11 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: 
v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i32_to_v28f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: v_mov_b32_e32 v29, v12 +; SI-NEXT: v_mov_b32_e32 v30, v11 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: 
$vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v16, 
vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: 
bitcast_v14i32_to_v28f16: ; VI: ; %bb.0: @@ -1381,7 +3048,7 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 @@ -1397,7 +3064,7 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: .LBB16_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1408,7 +3075,7 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 ; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 @@ -1424,7 +3091,7 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: .LBB16_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1436,7 +3103,7 @@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 ; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 @@ -1452,7 +3119,7 
@@ define <28 x half> @bitcast_v14i32_to_v28f16(<14 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: .LBB16_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1472,223 +3139,505 @@ end: ret <28 x half> %phi } +define inreg <28 x half> @bitcast_v14i32_to_v28f16_scalar(<14 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i32_to_v28f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; 
SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s18, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: s_lshr_b32 s9, s21, 16 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: s_lshr_b32 s11, s23, 16 +; SI-NEXT: s_lshr_b32 s12, s24, 16 +; SI-NEXT: s_lshr_b32 s13, s25, 16 +; SI-NEXT: s_lshr_b32 s14, s26, 16 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 +; SI-NEXT: v_cvt_f32_f16_e32 
v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v14i32_to_v28f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: 
s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v14i32_to_v28f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; 
GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v14i32_to_v28f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: 
s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i32> %a, splat (i32 3) + %a2 = bitcast <14 x i32> %a1 to <28 x half> + br label %end + +cmp.false: + %a3 = bitcast <14 x i32> %a to <28 x half> + br label %end + +end: + %phi = phi <28 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x half> %phi +} + define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f16_to_v14i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v26 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], 
vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_or_b32_e32 v1, v49, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_or_b32_e32 v4, v35, v4 -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_or_b32_e32 v7, v30, v7 -; GCN-NEXT: v_or_b32_e32 v8, v29, v8 -; GCN-NEXT: v_or_b32_e32 v9, v28, v9 -; GCN-NEXT: v_or_b32_e32 v10, v19, v10 -; GCN-NEXT: v_or_b32_e32 v11, v18, v11 -; GCN-NEXT: v_or_b32_e32 v12, v17, v12 -; GCN-NEXT: v_or_b32_e32 v13, v16, v13 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: 
$vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: 
v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; 
GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v24, v22 -; GCN-NEXT: v_or_b32_e32 v9, v26, v25 -; GCN-NEXT: v_or_b32_e32 v10, v19, v27 -; GCN-NEXT: v_or_b32_e32 v11, v18, v23 -; GCN-NEXT: v_or_b32_e32 v12, v17, v21 -; GCN-NEXT: v_or_b32_e32 v13, v16, v20 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f16_to_v14i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, 
v7 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 
+; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v31, v9 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: 
v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: 
s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v14i32: ; VI: ; %bb.0: @@ -1697,7 +3646,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v14, 0x200 ; VI-NEXT: v_add_f16_sdwa v15, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1742,7 +3691,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v15 ; VI-NEXT: v_or_b32_e32 v0, v0, v14 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1753,7 +3702,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] @@ -1770,7 +3719,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: .LBB18_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: 
s_setpc_b64 s[30:31] ; @@ -1782,7 +3731,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] @@ -1798,7 +3747,7 @@ define <14 x i32> @bitcast_v28f16_to_v14i32(<28 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1818,33 +3767,423 @@ end: ret <14 x i32> %phi } +define inreg <14 x i32> @bitcast_v28f16_to_v14i32_scalar(<28 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f16_to_v14i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 +; SI-NEXT: 
v_cvt_f16_f32_e32 v39, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v22, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 
+; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v28f16_to_v14i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s29, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s29, v0 +; VI-NEXT: s_lshr_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v13, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s28, v0 +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_or_b32_e32 v12, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: 
v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v14, 
v1 +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28f16_to_v14i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s29, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s28, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, 
s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28f16_to_v14i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; 
GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x half> %a, splat (half 0xH0200) + %a2 = bitcast <28 x half> %a1 to <14 x i32> + br label %end + +cmp.false: + %a3 = bitcast <28 x half> %a to <14 x i32> + br label %end + +end: + %phi = phi <14 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i32> %phi +} + define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f32_to_v7i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 
v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f32_to_v7i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v7i64: ; VI: ; %bb.0: @@ -1853,7 +4192,7 @@ define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -1869,7 +4208,7 @@ define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; 
VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1880,7 +4219,7 @@ define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -1896,7 +4235,7 @@ define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1908,7 +4247,7 @@ define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 ; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 @@ -1917,7 +4256,7 @@ define <7 x i64> @bitcast_v14f32_to_v7i64(<14 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1937,33 +4276,227 @@ end: ret <7 x i64> %phi } +define inreg <7 x i64> @bitcast_v14f32_to_v7i64_scalar(<14 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v14f32_to_v7i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB21_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_4 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_3: +; SI-NEXT: s_branch .LBB21_2 +; SI-NEXT: .LBB21_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f32_to_v7i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB21_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_4 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; VI-NEXT: 
v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_3: +; VI-NEXT: s_branch .LBB21_2 +; VI-NEXT: .LBB21_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f32_to_v7i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v12, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 
+; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f32_to_v7i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; 
GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <14 x float> %a1 to <7 x i64> + br label %end + +cmp.false: + %a3 = bitcast <14 x float> %a to <7 x i64> + br label %end + +end: + %phi = phi <7 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i64> %phi +} + define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i64_to_v14f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, 
vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i64_to_v14f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v14f32: ; VI: ; %bb.0: @@ -1972,7 +4505,7 @@ define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc @@ -1988,7 +4521,7 @@ define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; 
VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1999,7 +4532,7 @@ define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc @@ -2015,7 +4548,7 @@ define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2027,7 +4560,7 @@ define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -2047,7 +4580,7 @@ define <14 x float> @bitcast_v7i64_to_v14f32(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2067,33 +4600,213 @@ end: ret <14 x float> %phi } +define inreg <14 x float> @bitcast_v7i64_to_v14f32_scalar(<7 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i64_to_v14f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v7i64_to_v14f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: 
s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v7i64_to_v14f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: 
v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v7i64_to_v14f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: s_branch .LBB23_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i64> %a, splat (i64 3) + %a2 = bitcast <7 x i64> %a1 to <14 x float> + br label %end + +cmp.false: + %a3 
= bitcast <7 x i64> %a to <14 x float> + br label %end + +end: + %phi = phi <14 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x float> %phi +} + define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f32_to_v7f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f32_to_v7f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: 
v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v7f64: ; VI: ; %bb.0: @@ -2102,7 +4815,7 @@ define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -2118,7 +4831,7 @@ define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2129,7 +4842,7 @@ define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -2145,7 +4858,7 @@ define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2157,7 +4870,7 @@ define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 ; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 @@ -2166,7 +4879,7 @@ define <7 x double> @bitcast_v14f32_to_v7f64(<14 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end +; GFX11-NEXT: .LBB24_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2186,26 +4899,220 @@ end: ret <7 x double> %phi } +define inreg <7 x double> @bitcast_v14f32_to_v7f64_scalar(<14 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f32_to_v7f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB25_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_4 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_3: +; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: .LBB25_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; 
SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f32_to_v7f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB25_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: +; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: .LBB25_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: 
v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f32_to_v7f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v12, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: +; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f32_to_v7f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: 
s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <14 x float> %a1 to <7 x double> + br label %end + +cmp.false: + %a3 = bitcast <14 x float> %a to <7 x double> + br label %end + +end: + %phi = phi <7 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x double> %phi +} + define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, 
i32 %b) { -; GCN-LABEL: bitcast_v7f64_to_v14f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f64_to_v14f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v14f32: ; VI: ; %bb.0: @@ -2214,7 +5121,7 @@ define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[10:11], 
v[10:11], 1.0 @@ -2223,7 +5130,7 @@ define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2234,7 +5141,7 @@ define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -2243,7 +5150,7 @@ define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2255,7 +5162,7 @@ define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -2264,7 +5171,7 @@ define <14 x float> @bitcast_v7f64_to_v14f32(<7 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ 
-2284,92 +5191,258 @@ end: ret <14 x float> %phi } +define inreg <14 x float> @bitcast_v7f64_to_v14f32_scalar(<7 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f64_to_v14f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB27_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_4 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_3: +; SI-NEXT: s_branch .LBB27_2 +; SI-NEXT: .LBB27_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f64_to_v14f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB27_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_4 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 
1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_3: +; VI-NEXT: s_branch .LBB27_2 +; VI-NEXT: .LBB27_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f64_to_v14f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_4 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_3: +; GFX9-NEXT: s_branch .LBB27_2 +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: 
v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f64_to_v14f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <7 x double> %a1 to <14 x float> + br label %end + +cmp.false: + 
%a3 = bitcast <7 x double> %a to <14 x float> + br label %end + +end: + %phi = phi <14 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x float> %phi +} + define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f32_to_v28i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v13 -; GCN-NEXT: v_mov_b32_e32 v24, v12 -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 
16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v14, v28 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f32_to_v28i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; 
SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, 
v16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v28i16: ; VI: ; %bb.0: @@ -2378,7 +5451,7 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -2394,7 +5467,7 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: .LBB28_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2405,7 +5478,7 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: 
s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -2421,7 +5494,7 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: .LBB28_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2433,7 +5506,7 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 ; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 @@ -2442,7 +5515,7 @@ define <28 x i16> @bitcast_v14f32_to_v28i16(<14 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB14_2: ; %end +; GFX11-NEXT: .LBB28_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2462,162 +5535,410 @@ end: ret <28 x i16> %phi } +define inreg <28 x i16> @bitcast_v14f32_to_v28i16_scalar(<14 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f32_to_v28i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: 
v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s11, s27, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB29_4 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: 
; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v27, s12 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f32_to_v28i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; 
VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f32_to_v28i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v12, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, 
s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f32_to_v28i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, 
s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <14 x float> %a1 to <28 x i16> + br label %end + +cmp.false: + %a3 = bitcast <14 x float> %a to <28 x i16> + br label %end + +end: + %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i16> %phi +} + define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i16_to_v14f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v36, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v12 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v32, v6 -; GCN-NEXT: v_mov_b32_e32 v31, v4 -; GCN-NEXT: v_mov_b32_e32 v30, v2 -; GCN-NEXT: v_mov_b32_e32 v29, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_4 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB15_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v28 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_or_b32_e32 v5, v5, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_or_b32_e32 v7, v7, v51 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; 
GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: .LBB15_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v29 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v0, v28, v0 -; GCN-NEXT: v_or_b32_e32 v1, v37, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; 
GCN-NEXT: v_or_b32_e32 v3, v39, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_or_b32_e32 v6, v50, v6 -; GCN-NEXT: v_or_b32_e32 v7, v51, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i16_to_v14f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: 
v_lshlrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_4 +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB30_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: 
$vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 
0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v28, v10 +; SI-NEXT: v_or_b32_e32 v11, v21, v11 +; SI-NEXT: v_or_b32_e32 v12, v19, v12 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i16_to_v14f32: ; VI: ; %bb.0: @@ -2626,7 +5947,7 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v15, 3 ; VI-NEXT: v_add_u16_e32 v14, 3, v13 @@ -2671,7 +5992,7 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v14, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v14, v0 -; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: .LBB30_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2682,7 +6003,7 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -2698,36 +6019,379 @@ define <14 x float> @bitcast_v28i16_to_v14f32(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_2: ; %end +; GFX9-NEXT: .LBB30_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v28i16_to_v14f32: +; GFX11-LABEL: bitcast_v28i16_to_v14f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v14 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: 
v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB30_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i16> %a, splat (i16 3) + %a2 = bitcast <28 x i16> %a1 to <14 x float> + br label %end + +cmp.false: + %a3 = bitcast <28 x i16> %a to <14 x float> + br label %end + +end: + %phi = phi <14 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x float> %phi +} + +define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i16_to_v14f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v16, v12 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v29 
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v12, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: v_or_b32_e32 v13, v0, v23 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, 
s18, 3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v28i16_to_v14f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: 
s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v28i16_to_v14f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_4 +; GFX9-NEXT: .LBB31_2: ; 
%cmp.true +; GFX9-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_3: +; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28i16_to_v14f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v14 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 
v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB15_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; 
GFX11-NEXT: s_branch .LBB31_2 +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -2747,179 +6411,179 @@ end: } define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f32_to_v28f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v29, v13 -; GCN-NEXT: v_mov_b32_e32 v30, v12 -; GCN-NEXT: v_mov_b32_e32 v31, v11 -; GCN-NEXT: v_mov_b32_e32 v32, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v9 -; GCN-NEXT: v_mov_b32_e32 v34, v8 -; GCN-NEXT: v_mov_b32_e32 v35, v7 -; GCN-NEXT: v_mov_b32_e32 v36, v6 -; GCN-NEXT: v_mov_b32_e32 v37, v5 -; GCN-NEXT: v_mov_b32_e32 v38, v4 -; GCN-NEXT: v_mov_b32_e32 v39, v3 -; GCN-NEXT: v_mov_b32_e32 v48, v2 -; GCN-NEXT: v_mov_b32_e32 v49, v1 -; GCN-NEXT: v_mov_b32_e32 v28, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; 
implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_4 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB16_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v27, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: .LBB16_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v49 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v48 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v39 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v38 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v37 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v36 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v35 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v34 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v33 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v32 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v31 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; 
GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14f32_to_v28f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: v_mov_b32_e32 v29, v12 +; SI-NEXT: v_mov_b32_e32 v30, v11 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: v_mov_b32_e32 v32, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: 
v_mov_b32_e32 v37, v4 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v49, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_4 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB32_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: .LBB32_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v39 +; SI-NEXT: 
v_add_f32_e32 v6, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: 
v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v28f16: ; VI: ; %bb.0: @@ -2928,7 +6592,7 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -2944,7 +6608,7 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: .LBB32_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2955,7 +6619,7 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 ; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -2971,7 +6635,7 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: .LBB32_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2983,7 +6647,7 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: 
s_cbranch_execz .LBB16_2 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 ; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 @@ -2992,7 +6656,7 @@ define <28 x half> @bitcast_v14f32_to_v28f16(<14 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: .LBB32_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3012,223 +6676,516 @@ end: ret <28 x half> %phi } +define inreg <28 x half> @bitcast_v14f32_to_v28f16_scalar(<14 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f32_to_v28f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 
v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; 
implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v14f32_to_v28f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v13, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v12, s28, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v10, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v6, s22, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f32_to_v28f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: 
s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v12, s28, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v10, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v6, s22, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f32_to_v28f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: 
v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <14 x float> %a1 to <28 x half> + br label %end + +cmp.false: + %a3 = bitcast <14 x float> %a to <28 x half> + br label %end + +end: + %phi = phi <28 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x half> %phi +} + define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f16_to_v14f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, 
v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v26 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_or_b32_e32 v1, v49, v1 -; GCN-NEXT: v_or_b32_e32 
v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_or_b32_e32 v4, v35, v4 -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_or_b32_e32 v7, v30, v7 -; GCN-NEXT: v_or_b32_e32 v8, v29, v8 -; GCN-NEXT: v_or_b32_e32 v9, v28, v9 -; GCN-NEXT: v_or_b32_e32 v10, v19, v10 -; GCN-NEXT: v_or_b32_e32 v11, v18, v11 -; GCN-NEXT: v_or_b32_e32 v12, v17, v12 -; GCN-NEXT: v_or_b32_e32 v13, v16, v13 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v9, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: 
v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 
v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v24, v22 -; GCN-NEXT: v_or_b32_e32 v9, v26, v25 -; GCN-NEXT: v_or_b32_e32 v10, v19, v27 -; GCN-NEXT: v_or_b32_e32 v11, v18, v23 -; GCN-NEXT: v_or_b32_e32 v12, v17, v21 -; GCN-NEXT: v_or_b32_e32 v13, v16, v20 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f16_to_v14f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 +; SI-NEXT: v_cvt_f16_f32_e32 
v16, v26 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v31, v9 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; 
SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, 
v49 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, 
v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v14f32: ; VI: ; %bb.0: @@ -3237,7 +7194,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v14, 0x200 ; VI-NEXT: v_add_f16_sdwa v15, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD @@ -3282,7 +7239,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v15 ; VI-NEXT: v_or_b32_e32 v0, v0, v14 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3293,7 +7250,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] @@ -3310,7 +7267,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3322,7 +7279,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] @@ -3338,7 +7295,7 @@ define <14 x float> @bitcast_v28f16_to_v14f32(<28 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: .LBB34_2: ; %end ; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3358,33 +7315,423 @@ end: ret <14 x float> %phi } +define inreg <14 x float> @bitcast_v28f16_to_v14f32_scalar(<28 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f16_to_v14f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 
v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v22, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 
v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: 
v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v28f16_to_v14f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s29, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; 
VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s29, v0 +; VI-NEXT: s_lshr_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v13, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s28, v0 +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_or_b32_e32 v12, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: 
v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v14, v1 +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; 
VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28f16_to_v14f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s29, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s28, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 
s[30:31] +; +; GFX11-LABEL: bitcast_v28f16_to_v14f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 
+; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x half> %a, splat (half 0xH0200) + %a2 = bitcast <28 x half> %a1 to <14 x float> + br label %end + +cmp.false: + %a3 = bitcast <28 x half> %a to <14 x float> + br label %end + +end: + %phi = phi <14 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x float> %phi +} + define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i64_to_v7f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i64_to_v7f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v7f64: ; VI: ; %bb.0: @@ -3393,7 +7740,7 @@ define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: s_cbranch_execz .LBB36_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -3409,7 +7756,7 @@ define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3420,7 +7767,7 @@ define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -3436,7 +7783,7 @@ define <7 x double> 
@bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: .LBB36_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3448,7 +7795,7 @@ define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3468,7 +7815,7 @@ define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: .LBB36_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3488,26 +7835,205 @@ end: ret <7 x double> %phi } +define inreg <7 x double> @bitcast_v7i64_to_v7f64_scalar(<7 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i64_to_v7f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: 
s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v7i64_to_v7f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: 
v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v7i64_to_v7f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v7i64_to_v7f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 
vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_3 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: .LBB37_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: s_branch .LBB37_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i64> %a, splat (i64 3) + %a2 = bitcast <7 x i64> %a1 to <7 x double> + br label %end + +cmp.false: + %a3 = bitcast <7 x i64> %a to <7 x double> + br label %end + +end: + %phi = phi <7 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x double> %phi +} + define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f64_to_v7i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: 
v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f64_to_v7i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v7i64: ; VI: ; %bb.0: @@ -3516,7 +8042,7 @@ define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -3525,7 +8051,7 @@ define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3536,7 +8062,7 @@ define 
<7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -3545,7 +8071,7 @@ define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3557,7 +8083,7 @@ define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -3566,7 +8092,7 @@ define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3586,92 +8112,258 @@ end: ret <7 x i64> %phi } +define inreg <7 x i64> @bitcast_v7f64_to_v7i64_scalar(<7 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f64_to_v7i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB39_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; 
SI-NEXT: s_cbranch_execnz .LBB39_4 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_3: +; SI-NEXT: s_branch .LBB39_2 +; SI-NEXT: .LBB39_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: v_mov_b32_e32 v14, s30 +; SI-NEXT: v_mov_b32_e32 v15, s31 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f64_to_v7i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB39_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_4 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_3: +; VI-NEXT: s_branch .LBB39_2 +; VI-NEXT: .LBB39_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; 
VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f64_to_v7i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f64_to_v7i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: 
s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <7 x double> %a1 to <7 x i64> + br label %end + +cmp.false: + %a3 = bitcast <7 x double> %a to <7 x i64> + br label %end + +end: + %phi = phi <7 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i64> %phi +} + define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i64_to_v28i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v26, v13 -; GCN-NEXT: v_mov_b32_e32 v24, v12 -; GCN-NEXT: 
v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NEXT: v_mov_b32_e32 v28, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; 
GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; GCN-NEXT: v_alignbit_b32 v13, v28, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v14, v28 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i64_to_v28i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v16, v8 +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: 
$vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 
+; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v28, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v28i16: ; VI: ; %bb.0: @@ -3680,7 +8372,7 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc @@ -3696,7 +8388,7 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: .LBB40_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3707,7 +8399,7 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc @@ -3723,7 +8415,7 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { 
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: .LBB40_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3735,7 +8427,7 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3755,7 +8447,7 @@ define <28 x i16> @bitcast_v7i64_to_v28i16(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: .LBB40_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3775,162 +8467,405 @@ end: ret <28 x i16> %phi } +define inreg <28 x i16> @bitcast_v7i64_to_v28i16_scalar(<7 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i64_to_v28i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; 
SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s27, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: s_lshr_b32 s7, s27, 16 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s12 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: 
v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s8 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: v_mov_b32_e32 v27, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v7i64_to_v28i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 
v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v7i64_to_v28i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-LABEL: 
bitcast_v7i64_to_v28i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB41_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i64> %a, splat (i64 3) + %a2 = bitcast <7 x i64> %a1 to <28 x i16> + br label %end + +cmp.false: + %a3 = bitcast <7 x i64> %a to <28 x i16> + br label %end + +end: + %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i16> %phi +} + define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i16_to_v7i64: -; GCN: ; %bb.0: -; 
GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v36, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v12 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v32, v6 -; GCN-NEXT: v_mov_b32_e32 v31, v4 -; GCN-NEXT: v_mov_b32_e32 v30, v2 -; GCN-NEXT: v_mov_b32_e32 v29, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: 
v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v28 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: v_or_b32_e32 v5, v5, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_or_b32_e32 v7, v7, v51 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v29 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; GCN-NEXT: 
v_add_i32_e32 v3, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v0, v28, v0 -; GCN-NEXT: v_or_b32_e32 v1, v37, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_or_b32_e32 v3, v39, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_or_b32_e32 v6, v50, v6 -; GCN-NEXT: v_or_b32_e32 v7, v51, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; 
GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i16_to_v7i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v28, v10 +; SI-NEXT: v_or_b32_e32 v11, v21, v11 +; SI-NEXT: v_or_b32_e32 v12, v19, v12 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: 
v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i16_to_v7i64: ; VI: ; %bb.0: @@ -3939,7 +8874,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v15, 3 ; VI-NEXT: v_add_u16_e32 v14, 3, v13 @@ -3984,7 +8919,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v14, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v14, v0 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3995,7 +8930,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -4011,7 +8946,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: 
v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: .LBB42_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4023,7 +8958,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -4039,7 +8974,7 @@ define <7 x i64> @bitcast_v28i16_to_v7i64(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: .LBB42_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4059,180 +8994,523 @@ end: ret <7 x i64> %phi } +define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i16_to_v7i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v16, v12 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: 
s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v12, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: v_or_b32_e32 v13, v0, v23 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v13, 
vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v28i16_to_v7i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 
0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: s_branch .LBB43_2 +; +; 
GFX9-LABEL: bitcast_v28i16_to_v7i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28i16_to_v7i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 
+; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i16> %a, splat (i16 3) + %a2 = bitcast <28 x i16> %a1 to <7 x i64> + br label %end 
+ +cmp.false: + %a3 = bitcast <28 x i16> %a to <7 x i64> + br label %end + +end: + %phi = phi <7 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i64> %phi +} + define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v7i64_to_v28f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v30, v13 -; GCN-NEXT: v_mov_b32_e32 v29, v12 -; GCN-NEXT: v_mov_b32_e32 v32, v11 -; GCN-NEXT: v_mov_b32_e32 v31, v10 -; GCN-NEXT: v_mov_b32_e32 v34, v9 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v36, v7 -; GCN-NEXT: v_mov_b32_e32 v35, v6 -; GCN-NEXT: v_mov_b32_e32 v38, v5 -; GCN-NEXT: v_mov_b32_e32 v37, v4 -; GCN-NEXT: v_mov_b32_e32 v48, v3 -; GCN-NEXT: v_mov_b32_e32 v39, v2 -; GCN-NEXT: v_mov_b32_e32 v49, v1 -; GCN-NEXT: v_mov_b32_e32 v28, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; 
GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v0, v28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v49, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v39 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v48, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v37 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v38, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v36, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v33 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v34, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v31 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v32, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v30, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 
-; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7i64_to_v28f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v29, v13 +; SI-NEXT: v_mov_b32_e32 v28, v12 +; SI-NEXT: v_mov_b32_e32 v31, v11 +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v33, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v37, v5 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v49, v1 +; SI-NEXT: v_mov_b32_e32 v48, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: 
$vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v49, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v38 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v39, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v36 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v37, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v34 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v35, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v33, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v31, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v28 +; SI-NEXT: 
v_addc_u32_e32 v24, vcc, 0, v29, vcc +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7i64_to_v28f16: ; VI: ; %bb.0: @@ -4241,7 +9519,7 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc @@ -4257,7 +9535,7 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: .LBB44_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4268,7 +9546,7 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc @@ -4284,7 +9562,7 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: .LBB44_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4296,7 +9574,7 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -4316,7 +9594,7 @@ define <28 x half> @bitcast_v7i64_to_v28f16(<7 x i64> %a, i32 %b) { ; GFX11-NEXT: 
v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: .LBB44_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4336,223 +9614,505 @@ end: ret <28 x half> %phi } +define inreg <28 x half> @bitcast_v7i64_to_v28f16_scalar(<7 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7i64_to_v28f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, 
s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_add_u32 s8, s18, 3 +; SI-NEXT: s_addc_u32 s9, s19, 0 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_lshr_b32 s11, s9, 16 +; SI-NEXT: s_add_u32 s12, s20, 3 +; SI-NEXT: s_addc_u32 s13, s21, 0 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: s_lshr_b32 s15, s13, 16 +; SI-NEXT: s_add_u32 s16, s22, 3 +; SI-NEXT: s_addc_u32 s17, s23, 0 +; SI-NEXT: s_lshr_b32 s18, s16, 16 +; SI-NEXT: s_lshr_b32 s19, s17, 16 +; SI-NEXT: s_add_u32 s20, s24, 3 +; SI-NEXT: s_addc_u32 s21, s25, 0 +; SI-NEXT: s_lshr_b32 s22, s20, 16 +; SI-NEXT: s_lshr_b32 s23, s21, 16 +; SI-NEXT: s_add_u32 s24, s26, 3 +; SI-NEXT: s_addc_u32 s25, s27, 0 +; SI-NEXT: s_lshr_b32 s26, s24, 16 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s40, s28, 16 +; SI-NEXT: s_lshr_b32 s41, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s27 +; SI-NEXT: 
v_cvt_f32_f16_e32 v21, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v7i64_to_v28f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 
+; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v7i64_to_v28f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: 
v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-LABEL: bitcast_v7i64_to_v28f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB45_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + 
br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i64> %a, splat (i64 3) + %a2 = bitcast <7 x i64> %a1 to <28 x half> + br label %end + +cmp.false: + %a3 = bitcast <7 x i64> %a to <28 x half> + br label %end + +end: + %phi = phi <28 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x half> %phi +} + define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f16_to_v7i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v26 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; 
GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_or_b32_e32 v1, v49, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_or_b32_e32 v4, v35, v4 -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_or_b32_e32 v7, v30, v7 -; GCN-NEXT: v_or_b32_e32 v8, v29, v8 -; GCN-NEXT: v_or_b32_e32 v9, v28, v9 -; GCN-NEXT: v_or_b32_e32 v10, v19, v10 -; GCN-NEXT: v_or_b32_e32 v11, v18, v11 -; GCN-NEXT: v_or_b32_e32 v12, v17, v12 -; GCN-NEXT: v_or_b32_e32 v13, v16, v13 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr31 -; 
GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: 
v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; 
GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v24, v22 -; GCN-NEXT: v_or_b32_e32 v9, v26, v25 -; GCN-NEXT: v_or_b32_e32 v10, v19, v27 -; GCN-NEXT: v_or_b32_e32 v11, v18, v23 -; GCN-NEXT: v_or_b32_e32 v12, v17, v21 -; GCN-NEXT: v_or_b32_e32 v13, v16, v20 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f16_to_v7i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 
+; SI-NEXT: v_cvt_f16_f32_e32 v50, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; 
SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v31, v9 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: 
v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 
v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v7i64: ; VI: ; %bb.0: @@ -4561,7 +10121,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v14, 0x200 ; VI-NEXT: v_add_f16_sdwa v15, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4606,7 +10166,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v15 ; VI-NEXT: v_or_b32_e32 v0, v0, v14 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4617,7 +10177,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] @@ -4634,7 +10194,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4646,7 +10206,7 @@ 
define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] @@ -4662,7 +10222,7 @@ define <7 x i64> @bitcast_v28f16_to_v7i64(<28 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB23_2: ; %end +; GFX11-NEXT: .LBB46_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4682,97 +10242,487 @@ end: ret <7 x i64> %phi } +define inreg <7 x i64> @bitcast_v28f16_to_v7i64_scalar(<28 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f16_to_v7i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 +; SI-NEXT: 
v_cvt_f16_f32_e32 v37, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v22, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, 
v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v28f16_to_v7i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s29, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s29, v0 +; VI-NEXT: s_lshr_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v13, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s28, v0 +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_or_b32_e32 v12, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: 
v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v14, v1 +; VI-NEXT: v_mov_b32_e32 v14, s4 
+; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28f16_to_v7i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s29, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s28, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28f16_to_v7i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 
op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x half> %a, splat (half 0xH0200) + %a2 = bitcast <28 x half> %a1 to <7 x i64> + br label %end + +cmp.false: + %a3 = bitcast <28 x half> %a to <7 x i64> + br label %end + +end: + %phi = phi <7 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i64> %phi +} + define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v7f64_to_v28i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v49, v13 -; GCN-NEXT: v_mov_b32_e32 v48, v12 -; GCN-NEXT: v_mov_b32_e32 v38, v11 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v36, v9 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v7 -; GCN-NEXT: v_mov_b32_e32 v33, v6 -; GCN-NEXT: v_mov_b32_e32 v32, v5 -; GCN-NEXT: v_mov_b32_e32 v31, v4 -; GCN-NEXT: v_mov_b32_e32 v30, v3 -; GCN-NEXT: v_mov_b32_e32 v29, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: 
$vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v49, v48, 16 -; GCN-NEXT: v_alignbit_b32 v21, v38, v37, 16 -; GCN-NEXT: v_alignbit_b32 v17, v36, v35, 16 -; GCN-NEXT: v_alignbit_b32 v13, v34, v33, 16 -; GCN-NEXT: v_alignbit_b32 v9, v32, v31, 16 -; GCN-NEXT: v_alignbit_b32 v5, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v28, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; GCN-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 -; GCN-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 -; GCN-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 -; GCN-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 -; GCN-NEXT: v_alignbit_b32 v25, v49, v48, 16 -; GCN-NEXT: v_alignbit_b32 v21, v38, v37, 16 -; GCN-NEXT: v_alignbit_b32 v17, v36, v35, 16 -; GCN-NEXT: v_alignbit_b32 v13, v34, v33, 16 -; GCN-NEXT: v_alignbit_b32 v9, v32, v31, 16 -; GCN-NEXT: v_alignbit_b32 v5, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v28, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 
16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v29 -; GCN-NEXT: v_mov_b32_e32 v6, v30 -; GCN-NEXT: v_mov_b32_e32 v8, v31 -; GCN-NEXT: v_mov_b32_e32 v10, v32 -; GCN-NEXT: v_mov_b32_e32 v12, v33 -; GCN-NEXT: v_mov_b32_e32 v14, v34 -; GCN-NEXT: v_mov_b32_e32 v16, v35 -; GCN-NEXT: v_mov_b32_e32 v18, v36 -; GCN-NEXT: v_mov_b32_e32 v20, v37 -; GCN-NEXT: v_mov_b32_e32 v22, v38 -; GCN-NEXT: v_mov_b32_e32 v24, v48 -; GCN-NEXT: v_mov_b32_e32 v26, v49 -; GCN-NEXT: v_mov_b32_e32 v1, v28 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f64_to_v28i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v38, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v36, v9 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v32, v5 +; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v30, v3 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: 
s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v49, v48, 16 +; SI-NEXT: v_alignbit_b32 v21, v38, v37, 16 +; SI-NEXT: v_alignbit_b32 v17, v36, v35, 16 +; SI-NEXT: v_alignbit_b32 v13, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v9, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v5, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 +; SI-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 +; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 +; SI-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 +; SI-NEXT: v_alignbit_b32 v25, v49, v48, 16 +; SI-NEXT: v_alignbit_b32 v21, v38, v37, 16 +; SI-NEXT: v_alignbit_b32 v17, v36, v35, 16 +; SI-NEXT: v_alignbit_b32 v13, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v9, v32, v31, 16 +; SI-NEXT: v_alignbit_b32 v5, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v4, v29 +; SI-NEXT: v_mov_b32_e32 v6, v30 +; SI-NEXT: v_mov_b32_e32 v8, v31 +; SI-NEXT: v_mov_b32_e32 v10, v32 +; SI-NEXT: 
v_mov_b32_e32 v12, v33 +; SI-NEXT: v_mov_b32_e32 v14, v34 +; SI-NEXT: v_mov_b32_e32 v16, v35 +; SI-NEXT: v_mov_b32_e32 v18, v36 +; SI-NEXT: v_mov_b32_e32 v20, v37 +; SI-NEXT: v_mov_b32_e32 v22, v38 +; SI-NEXT: v_mov_b32_e32 v24, v48 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v1, v28 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v28i16: ; VI: ; %bb.0: @@ -4781,7 +10731,7 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -4790,7 +10740,7 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: .LBB48_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4801,7 +10751,7 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -4810,7 +10760,7 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: .LBB48_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4822,7 +10772,7 @@ define <28 x i16> 
@bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -4831,7 +10781,7 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: .LBB48_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4851,162 +10801,397 @@ end: ret <28 x i16> %phi } +define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f64_to_v28i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB49_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s28 +; SI-NEXT: v_alignbit_b32 v28, s29, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s26 +; SI-NEXT: v_alignbit_b32 v29, s27, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_alignbit_b32 v30, s25, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_alignbit_b32 v31, s23, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: v_alignbit_b32 v32, s21, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v33, s19, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v34, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s11, s27, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s9, s23, 16 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: s_lshr_b32 s7, s19, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; 
SI-NEXT: s_cbranch_execnz .LBB49_4 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[24:25], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[26:27], 1.0 +; SI-NEXT: v_alignbit_b32 v28, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v29, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v31, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v32, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v33, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: s_branch .LBB49_5 +; SI-NEXT: .LBB49_3: +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: .LBB49_4: +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v21, s27 +; SI-NEXT: v_mov_b32_e32 v25, s29 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: 
v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v27, s12 +; SI-NEXT: .LBB49_5: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v14, v13 +; SI-NEXT: v_mov_b32_e32 v18, v17 +; SI-NEXT: v_mov_b32_e32 v22, v21 +; SI-NEXT: v_mov_b32_e32 v26, v25 +; SI-NEXT: v_mov_b32_e32 v1, v34 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v9, v32 +; SI-NEXT: v_mov_b32_e32 v13, v31 +; SI-NEXT: v_mov_b32_e32 v17, v30 +; SI-NEXT: v_mov_b32_e32 v21, v29 +; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f64_to_v28i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_3: +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 
v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f64_to_v28i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f64_to_v28i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB49_3 +; 
GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-NEXT: .LBB49_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: +; GFX11-NEXT: s_branch .LBB49_2 +; GFX11-NEXT: .LBB49_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <7 x double> %a1 to <28 x i16> + br label %end + +cmp.false: + %a3 = bitcast <7 x double> %a to <28 x i16> + br label %end + +end: + %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i16> %phi +} + define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i16_to_v7f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v36, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v12 -; GCN-NEXT: v_mov_b32_e32 v34, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v32, v6 -; GCN-NEXT: v_mov_b32_e32 v31, v4 -; GCN-NEXT: v_mov_b32_e32 v30, v2 -; GCN-NEXT: v_mov_b32_e32 
v29, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v0, v0, v28 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_or_b32_e32 v4, v4, v48 -; GCN-NEXT: 
v_or_b32_e32 v5, v5, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_or_b32_e32 v7, v7, v51 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v29 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, 
v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v0, v28, v0 -; GCN-NEXT: v_or_b32_e32 v1, v37, v1 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_or_b32_e32 v3, v39, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v49, v5 -; GCN-NEXT: v_or_b32_e32 v6, v50, v6 -; GCN-NEXT: v_or_b32_e32 v7, v51, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v28i16_to_v7f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v36, v14 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_mov_b32_e32 v31, v4 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v29, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: 
v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v51 +; SI-NEXT: v_or_b32_e32 v4, v4, v50 +; SI-NEXT: v_or_b32_e32 v5, v5, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v48 +; SI-NEXT: v_or_b32_e32 v7, v7, v39 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 +; SI-NEXT: 
v_add_i32_e32 v5, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v5, v49, v5 +; SI-NEXT: v_or_b32_e32 v6, v48, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v37, v9 +; SI-NEXT: v_or_b32_e32 v10, v28, v10 +; SI-NEXT: v_or_b32_e32 v11, v21, v11 +; SI-NEXT: v_or_b32_e32 v12, v19, v12 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: 
v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i16_to_v7f64: ; VI: ; %bb.0: @@ -5015,7 +11200,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v15, 3 ; VI-NEXT: v_add_u16_e32 v14, 3, v13 @@ -5060,7 +11245,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v14, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v14, v0 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5071,7 +11256,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -5087,7 +11272,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5099,7 +11284,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; 
GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -5115,7 +11300,7 @@ define <7 x double> @bitcast_v28i16_to_v7f64(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB25_2: ; %end +; GFX11-NEXT: .LBB50_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5135,162 +11320,505 @@ end: ret <7 x double> %phi } +define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i16_to_v7f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v16, v12 +; SI-NEXT: v_mov_b32_e32 v17, v10 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v22, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v13 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: 
s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v12, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: v_or_b32_e32 v13, v0, v23 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v26, v0 +; 
SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v24, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; 
SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v28i16_to_v7f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, 
s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v28i16_to_v7f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX9-NEXT: ; 
%bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB51_4 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_3: +; GFX9-NEXT: s_branch .LBB51_2 +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28i16_to_v7f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: 
s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_3: +; GFX11-NEXT: s_branch .LBB51_2 +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i16> %a, splat (i16 3) + %a2 = bitcast <28 x i16> %a1 to <7 x double> + br label %end + +cmp.false: + %a3 = bitcast <28 x i16> %a to <7 x double> + br label %end + +end: + %phi = phi <7 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x double> %phi +} + define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) 
{ -; GCN-LABEL: bitcast_v7f64_to_v28f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; 
GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v34 -; GCN-NEXT: v_mov_b32_e32 v1, v49 -; GCN-NEXT: v_mov_b32_e32 v2, v28 -; GCN-NEXT: v_mov_b32_e32 v3, v48 -; GCN-NEXT: v_mov_b32_e32 v4, v29 -; GCN-NEXT: v_mov_b32_e32 v5, v39 -; GCN-NEXT: v_mov_b32_e32 v6, v30 -; GCN-NEXT: v_mov_b32_e32 v7, v38 -; GCN-NEXT: v_mov_b32_e32 v8, v31 -; GCN-NEXT: v_mov_b32_e32 v9, v37 -; GCN-NEXT: v_mov_b32_e32 v10, v32 -; GCN-NEXT: v_mov_b32_e32 v11, v36 -; 
GCN-NEXT: v_mov_b32_e32 v12, v33 -; GCN-NEXT: v_mov_b32_e32 v13, v35 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v7f64_to_v28f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v14 +; SI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 
+; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v48 +; SI-NEXT: v_mov_b32_e32 v1, v49 +; SI-NEXT: v_mov_b32_e32 v2, v39 +; SI-NEXT: v_mov_b32_e32 v3, v38 +; SI-NEXT: v_mov_b32_e32 v4, v37 +; SI-NEXT: v_mov_b32_e32 v5, v35 +; SI-NEXT: v_mov_b32_e32 v6, v36 +; SI-NEXT: v_mov_b32_e32 v7, v33 +; SI-NEXT: v_mov_b32_e32 v8, v34 +; SI-NEXT: v_mov_b32_e32 v9, v30 +; SI-NEXT: v_mov_b32_e32 v10, v32 +; SI-NEXT: v_mov_b32_e32 v11, v28 +; SI-NEXT: v_mov_b32_e32 v12, v31 +; SI-NEXT: 
v_mov_b32_e32 v13, v29 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v28f16: ; VI: ; %bb.0: @@ -5299,7 +11827,7 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -5308,7 +11836,7 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5319,7 +11847,7 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -5328,7 +11856,7 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5340,7 +11868,7 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: 
; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -5349,7 +11877,7 @@ define <28 x half> @bitcast_v7f64_to_v28f16(<7 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5369,223 +11897,488 @@ end: ret <28 x half> %phi } +define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v7f64_to_v28f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: 
v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; 
SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v7f64_to_v28f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB53_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_4 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: 
v_add_f64 v[12:13], s[28:29], 1.0 +; VI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_3: +; VI-NEXT: s_branch .LBB53_2 +; VI-NEXT: .LBB53_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f64_to_v28f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: +; GFX9-NEXT: s_branch .LBB53_2 +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, 
s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f64_to_v28f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: +; GFX11-NEXT: s_branch .LBB53_2 +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label 
%cmp.false + +cmp.true: + %a1 = fadd <7 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <7 x double> %a1 to <28 x half> + br label %end + +cmp.false: + %a3 = bitcast <7 x double> %a to <28 x half> + br label %end + +end: + %phi = phi <28 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x half> %phi +} + define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f16_to_v7f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v26 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: 
s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v51, v0 -; GCN-NEXT: v_or_b32_e32 v1, v49, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v37, v3 -; GCN-NEXT: v_or_b32_e32 v4, v35, v4 -; GCN-NEXT: v_or_b32_e32 v5, v33, v5 -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_or_b32_e32 v7, v30, v7 -; GCN-NEXT: v_or_b32_e32 v8, v29, v8 -; GCN-NEXT: v_or_b32_e32 v9, v28, v9 -; GCN-NEXT: v_or_b32_e32 v10, v19, v10 -; GCN-NEXT: v_or_b32_e32 v11, v18, v11 -; GCN-NEXT: v_or_b32_e32 v12, v17, v12 -; GCN-NEXT: v_or_b32_e32 v13, v16, v13 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; 
implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 
0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_or_b32_e32 v3, v7, v6 -; GCN-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NEXT: v_or_b32_e32 v5, v11, v10 -; GCN-NEXT: v_or_b32_e32 v6, v13, v12 -; GCN-NEXT: v_or_b32_e32 v7, v15, v14 -; GCN-NEXT: v_or_b32_e32 v8, v24, v22 -; GCN-NEXT: v_or_b32_e32 v9, v26, v25 -; GCN-NEXT: v_or_b32_e32 v10, v19, v27 -; GCN-NEXT: v_or_b32_e32 v11, v18, v23 -; GCN-NEXT: v_or_b32_e32 v12, v17, v21 -; GCN-NEXT: v_or_b32_e32 v13, v16, v20 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f16_to_v7f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 +; 
SI-NEXT: v_cvt_f16_f32_e32 v50, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v26 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v51, v3 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v37, v6 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 +; 
SI-NEXT: v_or_b32_e32 v8, v33, v8 +; SI-NEXT: v_or_b32_e32 v9, v31, v9 +; SI-NEXT: v_or_b32_e32 v10, v29, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: 
v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v33 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 
v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v30 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v29 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v7f64: ; VI: ; %bb.0: @@ -5594,7 +12387,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v14, 0x200 ; VI-NEXT: v_add_f16_sdwa v15, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5639,7 +12432,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v15 ; VI-NEXT: v_or_b32_e32 v0, v0, v14 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5650,7 +12443,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] @@ -5667,7 +12460,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5679,7 
+12472,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] @@ -5695,7 +12488,7 @@ define <7 x double> @bitcast_v28f16_to_v7f64(<28 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5715,211 +12508,601 @@ end: ret <7 x double> %phi } +define inreg <7 x double> @bitcast_v28f16_to_v7f64_scalar(<28 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f16_to_v7f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 +; 
SI-NEXT: v_cvt_f16_f32_e32 v37, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: v_or_b32_e32 v10, v22, v10 +; SI-NEXT: v_or_b32_e32 v11, v20, v11 +; SI-NEXT: v_or_b32_e32 v12, v18, v12 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 
v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v29 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; 
SI-NEXT: v_cvt_f32_f16_e32 v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, 
v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v28f16_to_v7f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s29, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s29, v0 +; VI-NEXT: s_lshr_b32 s4, s28, 16 +; VI-NEXT: v_or_b32_e32 v13, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s28, v0 +; VI-NEXT: s_lshr_b32 s4, s27, 16 +; VI-NEXT: v_or_b32_e32 v12, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s27, v0 +; VI-NEXT: s_lshr_b32 s4, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s26, v0 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s25, v0 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v2, v1 +; VI-NEXT: 
v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s24, v0 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s23, v0 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s22, v0 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s21, v0 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s20, v0 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s19, v0 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v14, v1 +; VI-NEXT: v_mov_b32_e32 v14, s4 
+; VI-NEXT: v_add_f16_sdwa v14, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28f16_to_v7f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s29, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s28, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28f16_to_v7f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 
op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x half> %a, splat (half 0xH0200) + %a2 = bitcast <28 x half> %a1 to <7 x double> + br label %end + +cmp.false: + %a3 = bitcast <28 x half> %a to <7 x double> + br label %end + +end: + %phi = phi <7 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x double> %phi +} + define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i16_to_v28f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_mov_b32_e32 v47, v27 -; GCN-NEXT: v_mov_b32_e32 v46, v26 -; GCN-NEXT: v_mov_b32_e32 v45, v25 -; GCN-NEXT: v_mov_b32_e32 v44, v24 -; GCN-NEXT: v_mov_b32_e32 v43, v23 -; GCN-NEXT: v_mov_b32_e32 v42, v22 -; GCN-NEXT: v_mov_b32_e32 v41, v21 -; GCN-NEXT: v_mov_b32_e32 v40, v20 -; GCN-NEXT: v_mov_b32_e32 v55, v19 -; GCN-NEXT: v_mov_b32_e32 v54, v18 -; GCN-NEXT: v_mov_b32_e32 v53, v17 -; GCN-NEXT: v_mov_b32_e32 v52, v16 -; GCN-NEXT: v_mov_b32_e32 v51, v15 -; GCN-NEXT: v_mov_b32_e32 v50, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v13 -; GCN-NEXT: v_mov_b32_e32 v48, v12 -; GCN-NEXT: v_mov_b32_e32 v39, v11 -; GCN-NEXT: v_mov_b32_e32 v38, v10 -; GCN-NEXT: v_mov_b32_e32 v37, v9 -; GCN-NEXT: v_mov_b32_e32 v36, v8 -; GCN-NEXT: v_mov_b32_e32 v35, v7 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v5 -; GCN-NEXT: v_mov_b32_e32 v32, v4 -; GCN-NEXT: v_mov_b32_e32 v31, v3 -; GCN-NEXT: v_mov_b32_e32 v30, v2 -; GCN-NEXT: v_mov_b32_e32 v29, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v56, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; 
implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; 
GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, 
v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v28i16_to_v28f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v27 +; SI-NEXT: v_mov_b32_e32 v46, v26 +; SI-NEXT: v_mov_b32_e32 v45, v25 +; SI-NEXT: v_mov_b32_e32 v44, v24 +; SI-NEXT: v_mov_b32_e32 v43, v23 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: v_mov_b32_e32 v41, v21 +; SI-NEXT: v_mov_b32_e32 v40, v20 +; SI-NEXT: v_mov_b32_e32 v55, v19 +; SI-NEXT: v_mov_b32_e32 v54, v18 +; SI-NEXT: v_mov_b32_e32 v53, v17 +; SI-NEXT: v_mov_b32_e32 v52, v16 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_mov_b32_e32 v50, v14 +; SI-NEXT: v_mov_b32_e32 v49, v13 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v38, v10 +; SI-NEXT: v_mov_b32_e32 v37, v9 +; SI-NEXT: v_mov_b32_e32 v36, v8 +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v5 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v31, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; 
SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_cvt_f32_f16_e32 
v23, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v12, 
vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i16_to_v28f16: ; VI: ; %bb.0: @@ -5928,7 +13111,7 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v14, 3 ; VI-NEXT: v_add_u16_sdwa v19, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5973,7 +13156,7 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v17 ; VI-NEXT: v_or_b32_e32 v1, v1, v16 ; VI-NEXT: v_or_b32_e32 v0, v0, v15 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5984,7 +13167,7 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -6000,7 +13183,7 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: 
v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6012,7 +13195,7 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] @@ -6028,7 +13211,7 @@ define <28 x half> @bitcast_v28i16_to_v28f16(<28 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: .LBB56_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6048,166 +13231,508 @@ end: ret <28 x half> %phi } +define inreg <28 x half> @bitcast_v28i16_to_v28f16_scalar(<28 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i16_to_v28f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v35, v10 +; SI-NEXT: v_mov_b32_e32 v34, v9 +; SI-NEXT: v_mov_b32_e32 v33, v8 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_mov_b32_e32 v31, v6 +; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: v_mov_b32_e32 v29, v4 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_mov_b32_e32 v39, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v38 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v39 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 
s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; 
implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v28i16_to_v28f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s10, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s19, 3 +; VI-NEXT: s_and_b32 s12, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s20, 3 +; VI-NEXT: s_and_b32 s14, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s21, 3 +; VI-NEXT: s_and_b32 s16, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s22, 3 +; VI-NEXT: s_and_b32 s18, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s23, 3 +; VI-NEXT: s_and_b32 s20, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s24, 3 +; VI-NEXT: s_and_b32 s22, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s25, 3 +; VI-NEXT: s_and_b32 s24, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s26, 3 +; VI-NEXT: s_and_b32 s26, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s40, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s41, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: 
s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s29, s41, s29 +; VI-NEXT: s_or_b32 s28, s40, s28 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s26, 0x30000 +; VI-NEXT: s_add_i32 s26, s24, 0x30000 +; VI-NEXT: s_add_i32 s25, s22, 0x30000 +; VI-NEXT: s_add_i32 s24, s20, 0x30000 +; VI-NEXT: s_add_i32 s23, s18, 0x30000 +; VI-NEXT: s_add_i32 s22, s16, 0x30000 +; VI-NEXT: s_add_i32 s21, s14, 0x30000 +; VI-NEXT: s_add_i32 s20, s12, 0x30000 +; VI-NEXT: s_add_i32 s19, s10, 0x30000 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: 
bitcast_v28i16_to_v28f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28i16_to_v28f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; 
GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_3: +; GFX11-NEXT: s_branch .LBB57_2 +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i16> %a, splat (i16 3) + %a2 = bitcast <28 x i16> %a1 to <28 x half> + br label %end + 
+cmp.false: + %a3 = bitcast <28 x i16> %a to <28 x half> + br label %end + +end: + %phi = phi <28 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x half> %phi +} + define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f16_to_v28i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 
16, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v6, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: 
v_lshlrev_b32_e32 v28, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; 
GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_or_b32_e32 v24, v24, v25 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f16_to_v28i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: 
v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_or_b32_e32 v18, v18, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; 
SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f16_to_v28i16: ; VI: ; %bb.0: @@ -6216,7 +13741,7 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v15, 0x200 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v13 @@ -6261,7 +13786,7 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v17, v2 ; VI-NEXT: v_or_b32_e32 v1, v16, v1 ; VI-NEXT: v_or_b32_e32 v0, v14, v0 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] 
; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6272,7 +13797,7 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] @@ -6289,7 +13814,7 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6301,7 +13826,7 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] @@ -6317,7 +13842,7 @@ define <28 x i16> @bitcast_v28f16_to_v28i16(<28 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: .LBB58_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6336,3 +13861,380 @@ end: %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <28 x i16> %phi } + +define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v28f16_to_v28i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, 
v11 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; 
SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_or_b32_e32 v18, v18, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v14, v14, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: 
v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v28f16_to_v28i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s5, s28, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s5, s29, 16 +; VI-NEXT: v_add_f16_e32 v1, s28, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_lshr_b32 s5, s27, 16 +; VI-NEXT: v_or_b32_e32 v12, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s27, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s26, 16 +; VI-NEXT: v_or_b32_e32 v11, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s26, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s25, 16 +; VI-NEXT: v_or_b32_e32 v10, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s25, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
+; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: v_or_b32_e32 v9, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s24, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s23, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s22, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s21, 16 +; VI-NEXT: v_or_b32_e32 v6, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s21, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s20, 16 +; VI-NEXT: v_or_b32_e32 v5, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v3, s29, v0 +; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s20, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s19, 16 +; VI-NEXT: v_or_b32_e32 v13, v3, v4 +; VI-NEXT: v_or_b32_e32 v4, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v1, s19, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s5, s18, 16 +; VI-NEXT: v_or_b32_e32 v3, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_add_f16_e32 v1, s18, v0 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: 
v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_add_f16_sdwa v15, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_e32 v14, s16, v0 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s17, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v14, v15 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28f16_to_v28i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s29, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s28, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s27, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s26, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s25, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s24, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v5, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28f16_to_v28i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; 
GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x half> %a, splat (half 0xH0200) + %a2 = bitcast <28 x half> %a1 to <28 x i16> + br label %end + +cmp.false: + %a3 = bitcast <28 x half> %a to <28 x i16> + br label %end + +end: + %phi = phi <28 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i16> %phi +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index b52128024fbc3..6ec9c1177c180 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -1,58 +1,58 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v3bf16_to_v3f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB0_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB0_4 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB0_3: ; %cmp.false -; GCN-NEXT: 
v_lshrrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: .LBB0_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3bf16_to_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB0_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB0_4 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB0_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: .LBB0_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3bf16_to_v3f16: ; VI: ; %bb.0: @@ -238,50 +238,306 @@ end: ret <3 x half> %phi } +define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3bf16_to_v3f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v0, 
0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v3bf16_to_v3f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_4 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_3: +; VI-NEXT: s_branch 
.LBB1_2 +; VI-NEXT: .LBB1_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3bf16_to_v3f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fc0 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_3: +; GFX9-NEXT: s_branch .LBB1_2 +; GFX9-NEXT: 
.LBB1_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v3bf16_to_v3f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_4 +; GFX11-TRUE16-NEXT: .LBB1_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v5, v8 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB1_3: +; GFX11-TRUE16-NEXT: s_branch .LBB1_2 +; GFX11-TRUE16-NEXT: .LBB1_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v3bf16_to_v3f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB1_4 +; GFX11-FAKE16-NEXT: .LBB1_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v5, v8 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB1_3: +; GFX11-FAKE16-NEXT: s_branch .LBB1_2 +; GFX11-FAKE16-NEXT: .LBB1_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 
s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <3 x bfloat> %a1 to <3 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x bfloat> %a to <3 x half> + br label %end + +end: + %phi = phi <3 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x half> %phi +} + define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f16_to_v3bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB1_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB1_4 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB1_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: .LBB1_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f16_to_v3bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB2_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB2_4 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB2_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: .LBB2_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f16_to_v3bf16: ; VI: ; %bb.0: @@ -347,49 +603,162 @@ end: ret <3 x bfloat> %phi } +define inreg <3 x bfloat> @bitcast_v3f16_to_v3bf16_scalar(<3 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v3f16_to_v3bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s18 +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v3f16_to_v3bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_e32 v2, s16, v0 +; VI-NEXT: v_add_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_or_b32_e32 v1, 0x7e000000, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; 
VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f16_to_v3bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f16_to_v3bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x half> %a, splat (half 0xH0200) + %a2 = bitcast <3 x half> %a1 to <3 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <3 x half> %a to <3 x bfloat> + br label %end + +end: + %phi = phi <3 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x bfloat> %phi +} + define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v3bf16_to_v3i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3bf16_to_v3i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3bf16_to_v3i16: ; VI: ; %bb.0: @@ -398,7 +767,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -429,7 +798,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: 
s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -440,7 +809,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -468,7 +837,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s6 ; GFX9-NEXT: s_movk_i32 s6, 0x7fc0 ; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -480,7 +849,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -516,7 +885,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1 -; GFX11-TRUE16-NEXT: .LBB2_2: ; %end +; GFX11-TRUE16-NEXT: .LBB4_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -528,7 +897,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz 
.LBB2_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -558,7 +927,7 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, 0x7fc0, v1, 16 -; GFX11-FAKE16-NEXT: .LBB2_2: ; %end +; GFX11-FAKE16-NEXT: .LBB4_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -578,39 +947,284 @@ end: ret <3 x i16> %phi } +define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3bf16_to_v3i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: 
; implicit-def: $vgpr2 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v3bf16_to_v3i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_4 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_3: +; VI-NEXT: s_branch .LBB5_2 +; VI-NEXT: .LBB5_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3bf16_to_v3i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX9-NEXT: 
; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v0, v2, v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: s_movk_i32 s4, 0x7fc0 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_3: +; GFX9-NEXT: s_branch .LBB5_2 +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v3bf16_to_v3i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-TRUE16-NEXT: 
; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB5_4 +; GFX11-TRUE16-NEXT: .LBB5_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v2, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB5_3: +; GFX11-TRUE16-NEXT: s_branch .LBB5_2 +; GFX11-TRUE16-NEXT: .LBB5_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v3bf16_to_v3i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB5_4 +; GFX11-FAKE16-NEXT: .LBB5_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s0, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v2 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v2, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB5_3: +; GFX11-FAKE16-NEXT: s_branch .LBB5_2 +; GFX11-FAKE16-NEXT: .LBB5_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <3 x bfloat> %a1 to <3 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x bfloat> %a to <3 x i16> + br label %end + +end: + %phi = phi <3 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i16> %phi +} + define <3 x bfloat> @bitcast_v3i16_to_v3bf16(<3 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i16_to_v3bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; 
%Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i16_to_v3bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: s_or_b64 exec, 
exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i16_to_v3bf16: ; VI: ; %bb.0: @@ -674,34 +1288,140 @@ end: ret <3 x bfloat> %phi } +define inreg <3 x bfloat> @bitcast_v3i16_to_v3bf16_scalar(<3 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i16_to_v3bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_lshl_b32 s5, s18, 16 +; SI-NEXT: s_and_b32 s7, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s4, 16 +; SI-NEXT: s_add_i32 s8, s5, 0x30000 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v3i16_to_v3bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: 
bitcast_v3i16_to_v3bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_4 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_branch .LBB7_2 +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3i16_to_v3bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_4 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: s_branch .LBB7_2 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i16> %a, splat (i16 3) + %a2 = bitcast <3 x i16> %a1 to <3 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <3 x i16> %a to <3 x bfloat> + br label %end + +end: + %phi = phi <3 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x bfloat> %phi +} + define <3 x i16> @bitcast_v3f16_to_v3i16(<3 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f16_to_v3i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f16_to_v3i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f16_to_v3i16: ; VI: ; %bb.0: @@ -767,44 +1487,151 @@ end: ret <3 x i16> %phi } +define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg 
%a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f16_to_v3i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v3f16_to_v3i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_4 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_e32 v2, s16, v0 +; VI-NEXT: v_add_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_or_b32_e32 v1, 0x7e000000, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_3: +; VI-NEXT: s_branch .LBB9_2 +; VI-NEXT: .LBB9_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f16_to_v3i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_4 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_3: +; GFX9-NEXT: s_branch .LBB9_2 +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f16_to_v3i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_4 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: s_branch .LBB9_2 +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x half> %a, splat (half 0xH0200) + %a2 = bitcast <3 x half> %a1 to <3 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x half> %a to <3 x i16> + br label %end + +end: + %phi = phi <3 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i16> %phi +} + define <3 x half> @bitcast_v3i16_to_v3f16(<3 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i16_to_v3f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; 
implicit-def: $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i16_to_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: 
$vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i16_to_v3f16: ; VI: ; %bb.0: @@ -867,3 +1694,103 @@ end: %phi = phi <3 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <3 x half> %phi } + +define inreg <3 x half> @bitcast_v3i16_to_v3f16_scalar(<3 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i16_to_v3f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v3i16_to_v3f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s17, 3 +; 
VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v3i16_to_v3f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3i16_to_v3f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i16> %a, splat (i16 3) + %a2 = bitcast <3 x i16> %a1 to <3 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x i16> %a to <3 x half> + br label %end + +end: + %phi = phi <3 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x half> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 5f21bdc09a15d..65fde2fd5e190 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -1,40 +1,40 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <16 x float> @bitcast_v16i32_to_v16f32(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, 
vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i32_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v16f32: ; VI: ; %bb.0: @@ -140,35 +140,233 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v16i32_to_v16f32_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; 
SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v16i32_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], 
vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: 
v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v16i32_to_v16f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; 
GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v16i32: ; VI: ; %bb.0: @@ -177,7 +375,7 @@ define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -195,7 +393,7 @@ define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -206,7 +404,7 @@ define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: 
s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -224,7 +422,7 @@ define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -236,7 +434,7 @@ define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -246,7 +444,7 @@ define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -266,35 +464,237 @@ end: ret <16 x i32> %phi } +define inreg <16 x i32> @bitcast_v16f32_to_v16i32_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: 
v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v16f32_to_v16i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: 
s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v16f32_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; 
GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v16f32_to_v16i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; 
GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, 
v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i32_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v8i64: ; VI: ; %bb.0: @@ -303,7 +703,7 @@ define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 @@ -321,7 +721,7 @@ define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; 
VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -332,7 +732,7 @@ define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -350,7 +750,7 @@ define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -362,7 +762,7 @@ define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 @@ -380,7 +780,7 @@ define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -400,35 +800,233 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v16i32_to_v8i64_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: 
v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v16i32_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; 
VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: 
s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v16i32_to_v8i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: 
v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v16i32: ; VI: ; %bb.0: @@ -437,7 +1035,7 @@ define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -455,7 +1053,7 @@ define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, 
v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -466,7 +1064,7 @@ define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -484,7 +1082,7 @@ define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -496,7 +1094,7 @@ define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -518,7 +1116,7 @@ define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -538,35 +1136,233 @@ end: ret <16 x i32> %phi } +define inreg <16 x i32> @bitcast_v8i64_to_v16i32_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v8i64_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v8i64_to_v16i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: 
v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v8i64_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 
+; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v8i64_to_v16i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 
s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; 
GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i32_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v8f64: ; VI: ; %bb.0: @@ -575,7 +1371,7 @@ define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 @@ -593,7 +1389,7 @@ define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -604,7 +1400,7 @@ define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -622,7 +1418,7 @@ define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -634,7 +1430,7 @@ define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 @@ -652,7 +1448,7 @@ define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -672,27 +1468,225 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v16i32_to_v8f64_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v16i32_to_v8f64_scalar: +; VI: 
; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; 
GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v16i32_to_v8f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, 
s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 
1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v16i32: ; VI: ; %bb.0: @@ -701,7 +1695,7 @@ define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -711,7 +1705,7 @@ define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -722,7 +1716,7 @@ define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { ; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -732,7 +1726,7 @@ define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -744,7 +1738,7 @@ define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -754,7 +1748,7 @@ define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -774,102 +1768,272 @@ end: ret <16 x i32> %phi } +define inreg <16 x i32> @bitcast_v8f64_to_v16i32_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: 
v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v8f64_to_v16i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], 
v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v8f64_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v8f64_to_v16i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], s[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v30, v15 -; GCN-NEXT: v_mov_b32_e32 v28, v14 -; GCN-NEXT: v_mov_b32_e32 v26, v13 -; GCN-NEXT: v_mov_b32_e32 v24, v12 -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: 
v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v16, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i32_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: 
v_mov_b32_e32 v24, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; 
%cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v32i16: ; VI: ; %bb.0: @@ -878,7 +2042,7 @@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 @@ -896,7 +2060,7 
@@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: .LBB12_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -907,7 +2071,7 @@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -925,7 +2089,7 @@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: .LBB12_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -937,7 +2101,7 @@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 @@ -955,7 +2119,7 @@ define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB6_2: ; %end +; GFX11-NEXT: .LBB12_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -975,183 +2139,434 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x 
i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: 
v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v16i32_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, 
s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: 
v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v16i32_to_v32i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 
s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, 
v11 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; GCN-NEXT: v_or_b32_e32 v0, v0, v54 -; GCN-NEXT: v_or_b32_e32 v1, v1, v55 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v48 -; GCN-NEXT: v_or_b32_e32 v4, v4, v49 -; GCN-NEXT: v_or_b32_e32 v5, v5, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v51 -; 
GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_or_b32_e32 v15, v15, v53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; 
GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_or_b32_e32 v4, v49, v4 -; GCN-NEXT: v_or_b32_e32 v5, v50, v5 -; GCN-NEXT: v_or_b32_e32 v6, v51, v6 -; GCN-NEXT: v_or_b32_e32 v7, v52, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_or_b32_e32 v15, v53, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 
-; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v55 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; 
SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: 
v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v48, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v16i32: ; VI: ; %bb.0: @@ -1160,7 +2575,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v16, 3, v15 @@ -1211,7 +2626,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,7 +2637,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -1240,7 +2655,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: .LBB14_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1252,7 +2667,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: s_cbranch_execz 
.LBB14_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -1270,7 +2685,7 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: .LBB14_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1290,248 +2705,605 @@ end: ret <16 x i32> %phi } -define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v15 -; GCN-NEXT: v_mov_b32_e32 v34, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v13 -; GCN-NEXT: v_mov_b32_e32 v36, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v11 -; GCN-NEXT: v_mov_b32_e32 v38, v10 -; GCN-NEXT: v_mov_b32_e32 v39, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v49, v7 -; GCN-NEXT: v_mov_b32_e32 v50, v6 -; GCN-NEXT: v_mov_b32_e32 v51, v5 -; GCN-NEXT: v_mov_b32_e32 v52, v4 -; GCN-NEXT: v_mov_b32_e32 v53, v3 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v51 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v52 -; GCN-NEXT: s_waitcnt expcnt(3) -; 
GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v53 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; 
implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v26, v14 +; SI-NEXT: 
v_mov_b32_e32 v25, v12 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; 
SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v17 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 
+; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v16i32_to_v32f16: +; VI-LABEL: bitcast_v32i16_to_v16i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; 
VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 
s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-LABEL: bitcast_v32i16_to_v16i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, 
s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + +define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v16i32_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: 
; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, 
v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; 
SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: 
v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i32_to_v32f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB16_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1542,7 +3314,7 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -1560,7 +3332,7 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: .LBB16_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1572,7 +3344,7 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 @@ -1590,7 +3362,7 @@ define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: .LBB16_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1610,269 +3382,583 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v16i32_to_v32f16_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, 
s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: s_lshr_b32 s9, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: s_lshr_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: s_lshr_b32 s13, s23, 16 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: s_lshr_b32 s15, s25, 16 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_lshr_b32 s45, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, 
s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v16i32_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, 
s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: 
; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v16i32_to_v32f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; 
GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GCN-NEXT: v_or_b32_e32 v0, v44, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: 
v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: v_or_b32_e32 v3, v50, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v34, v7 -; GCN-NEXT: v_or_b32_e32 v8, v33, v8 -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: v_or_b32_e32 v10, v31, v10 -; GCN-NEXT: v_or_b32_e32 v11, v21, v11 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: 
.LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; 
GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v26, v24 -; GCN-NEXT: v_or_b32_e32 v10, v28, v27 -; GCN-NEXT: v_or_b32_e32 v11, v21, v29 -; GCN-NEXT: v_or_b32_e32 v12, v19, v25 -; GCN-NEXT: v_or_b32_e32 v13, v18, v23 -; GCN-NEXT: v_or_b32_e32 v14, v17, v22 -; GCN-NEXT: v_or_b32_e32 v15, v16, v20 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload 
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: 
v_cvt_f16_f32_e32 v38, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: 
v_or_b32_e32 v8, v37, v8 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v31, v11 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, 
v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: 
v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; 
SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v16i32: ; VI: ; %bb.0: @@ -1881,7 +3967,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; 
VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 0x200 ; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1932,7 +4018,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v17 ; VI-NEXT: v_or_b32_e32 v0, v0, v16 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1943,7 +4029,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -1962,7 +4048,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: .LBB18_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1974,7 +4060,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -1992,7 +4078,7 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 
0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: .LBB18_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2012,170 +4098,570 @@ end: ret <16 x i32> %phi } +define inreg <16 x i32> @bitcast_v32f16_to_v16i32_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; 
%cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 +; SI-NEXT: v_or_b32_e32 v10, v27, v10 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, 
v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: 
v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, 
v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v32f16_to_v16i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 
v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v32f16_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; 
GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-LABEL: bitcast_v32f16_to_v16i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; 
GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v55, v15 -; GCN-NEXT: v_mov_b32_e32 v54, v14 -; GCN-NEXT: v_mov_b32_e32 v53, v13 -; GCN-NEXT: v_mov_b32_e32 v52, v12 -; GCN-NEXT: v_mov_b32_e32 v51, v11 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v49, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: 
v_mov_b32_e32 v39, v7 -; GCN-NEXT: v_mov_b32_e32 v38, v6 -; GCN-NEXT: v_mov_b32_e32 v37, v5 -; GCN-NEXT: v_mov_b32_e32 v36, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v3 -; GCN-NEXT: v_mov_b32_e32 v34, v2 -; GCN-NEXT: v_mov_b32_e32 v33, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; GCN-NEXT: v_and_b32_e32 v29, 
0xffff0000, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: 
s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v55 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i32_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v53, v13 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v51, v11 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v49, v9 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v39, v7 +; SI-NEXT: v_mov_b32_e32 v38, v6 +; SI-NEXT: v_mov_b32_e32 v37, v5 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v35, v3 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; 
implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: 
$vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: 
v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v32bf16: ; VI: ; %bb.0: @@ -2184,7 +4670,7 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 @@ -2202,7 +4688,7 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2213,7 +4699,7 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; 
GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -2231,7 +4717,7 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2243,7 +4729,7 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 ; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 @@ -2261,7 +4747,7 @@ define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2281,237 +4767,551 @@ end: ret <32 x bfloat> %phi } +define inreg <32 x bfloat> @bitcast_v16i32_to_v32bf16_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s78, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s79, v1 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s79, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s78, 16 +; SI-NEXT: s_and_b32 s10, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s12, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 
s13, s28, 16 +; SI-NEXT: s_and_b32 s14, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s27, 16 +; SI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s26, 16 +; SI-NEXT: s_and_b32 s42, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s25, 16 +; SI-NEXT: s_and_b32 s44, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s24, 16 +; SI-NEXT: s_and_b32 s46, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s23, 16 +; SI-NEXT: s_and_b32 s56, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s22, 16 +; SI-NEXT: s_and_b32 s58, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s21, 16 +; SI-NEXT: s_and_b32 s60, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s20, 16 +; SI-NEXT: s_and_b32 s62, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s19, 16 +; SI-NEXT: s_and_b32 s72, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s18, 16 +; SI-NEXT: s_and_b32 s74, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_and_b32 s76, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s78, s78, 3 +; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_and_b32 s6, s79, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s78, 16 +; SI-NEXT: s_and_b32 s10, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s12, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_and_b32 s14, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s27, 16 +; SI-NEXT: s_and_b32 s40, s26, 
0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s26, 16 +; SI-NEXT: s_and_b32 s42, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s25, 16 +; SI-NEXT: s_and_b32 s44, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s24, 16 +; SI-NEXT: s_and_b32 s46, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s23, 16 +; SI-NEXT: s_and_b32 s56, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s22, 16 +; SI-NEXT: s_and_b32 s58, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s21, 16 +; SI-NEXT: s_and_b32 s60, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s20, 16 +; SI-NEXT: s_and_b32 s62, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s19, 16 +; SI-NEXT: s_and_b32 s72, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s18, 16 +; SI-NEXT: s_and_b32 s74, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_and_b32 s76, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s16, 16 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s77 +; SI-NEXT: v_mov_b32_e32 v1, s76 +; SI-NEXT: v_mov_b32_e32 v2, s75 +; SI-NEXT: v_mov_b32_e32 v3, s74 +; SI-NEXT: v_mov_b32_e32 v4, s73 +; SI-NEXT: v_mov_b32_e32 v5, s72 +; SI-NEXT: v_mov_b32_e32 v6, s63 +; SI-NEXT: v_mov_b32_e32 v7, s62 +; SI-NEXT: v_mov_b32_e32 v8, s61 +; SI-NEXT: v_mov_b32_e32 v9, s60 +; SI-NEXT: v_mov_b32_e32 v10, s59 +; SI-NEXT: v_mov_b32_e32 v11, s58 +; SI-NEXT: v_mov_b32_e32 v12, s57 +; SI-NEXT: v_mov_b32_e32 v13, s56 +; SI-NEXT: v_mov_b32_e32 v14, s47 +; SI-NEXT: v_mov_b32_e32 v15, s46 +; SI-NEXT: v_mov_b32_e32 v16, s45 +; SI-NEXT: v_mov_b32_e32 v17, s44 +; SI-NEXT: v_mov_b32_e32 v18, s43 +; SI-NEXT: v_mov_b32_e32 v19, s42 +; SI-NEXT: v_mov_b32_e32 v20, s41 +; SI-NEXT: v_mov_b32_e32 v21, s40 +; SI-NEXT: v_mov_b32_e32 v22, s15 +; SI-NEXT: v_mov_b32_e32 v23, s14 +; SI-NEXT: v_mov_b32_e32 v24, s13 +; SI-NEXT: v_mov_b32_e32 v25, s12 +; SI-NEXT: v_mov_b32_e32 v26, s11 +; SI-NEXT: v_mov_b32_e32 v27, s10 +; SI-NEXT: v_mov_b32_e32 v28, s9 +; SI-NEXT: v_mov_b32_e32 v29, s8 +; SI-NEXT: v_mov_b32_e32 v30, s7 +; SI-NEXT: v_mov_b32_e32 v31, s6 +; SI-NEXT: s_setpc_b64 
s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v16i32_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; 
VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v32bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: 
v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v16i32_to_v32bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB21_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: 
v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: s_branch .LBB21_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v4 -; GCN-NEXT: 
v_mul_f32_e32 v40, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v46 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; GCN-NEXT: v_alignbit_b32 v0, v0, v45, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v38 -; GCN-NEXT: 
v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v2, v51, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v49, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v39, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v37, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v36, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v34, 16 -; GCN-NEXT: v_alignbit_b32 v8, v8, v33, 16 -; GCN-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v31, 16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v12, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v14, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; 
implicit-def: $vgpr16 -; GCN-NEXT: .LBB11_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, 
v16 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; 
GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v10, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v11, v29, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v25, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v23, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v22, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v20, v16, 16 -; GCN-NEXT: .LBB11_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 
1.0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; 
implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB22_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 
0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: 
v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB22_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v16i32: ; VI: ; %bb.0: @@ -2520,7 +5320,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -2811,7 +5611,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2822,7 +5622,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -3066,7 +5866,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v16, s7 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3078,7 +5878,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz 
.LBB11_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -3352,7 +6152,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v22, v25, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v20, v0 -; GFX11-TRUE16-NEXT: .LBB11_2: ; %end +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3364,7 +6164,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v15 @@ -3642,7 +6442,7 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v21, v26, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB11_2: ; %end +; GFX11-FAKE16-NEXT: .LBB22_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3662,413 +6462,1657 @@ end: ret <16 x i32> %phi } +define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 +; SI-NEXT: 
v_mul_f32_e64 v35, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_alignbit_b32 v0, 
v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 
16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v32bf16_to_v16i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; 
VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 
0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: 
s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; 
VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 
vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc +; VI-NEXT: 
v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_branch .LBB23_5 +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: 
+; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB23_5: ; %end +; VI-NEXT: v_readlane_b32 s31, v19, 1 +; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: 
v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; 
GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v11, 
v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: 
s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 
0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: s_branch .LBB23_5 +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; 
GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB23_5: ; %end +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v16i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s2, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s3, s25, 16 +; 
GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s24, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, 
v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshl_b32 s1, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; 
GFX11-NEXT: s_lshl_b32 s1, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7 +; GFX11-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: 
v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4 +; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s1, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-NEXT: v_add_f32_e64 v7, 
0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_lshl_b32 s1, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo +; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16 +; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16 +; GFX11-NEXT: 
v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s1, s15, 16 +; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-NEXT: 
v_bfe_u32 v20, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17 +; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; 
GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21 +; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: s_branch .LBB23_2 +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v16i32_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 
4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; 
implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v24, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 
24 -; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB12_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; 
GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v24, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, 
v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB12_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v60 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_or_b32_e32 v60, v1, v18 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v17 -; GCN-NEXT: v_or_b32_e32 v17, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v58 -; GCN-NEXT: v_or_b32_e32 v58, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v46 -; GCN-NEXT: v_or_b32_e32 v46, v1, v2 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v44 -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 24, v61 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v59 -; GCN-NEXT: v_or_b32_e32 v44, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v40 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_or_b32_e32 v7, v2, 
v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 24, v56 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v47 -; GCN-NEXT: v_or_b32_e32 v54, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v49 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v48 -; GCN-NEXT: v_or_b32_e32 v9, v4, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v55 -; GCN-NEXT: v_or_b32_e32 v48, v5, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v34 -; GCN-NEXT: v_or_b32_e32 v34, v6, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v50 -; GCN-NEXT: v_or_b32_e32 v11, v8, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v30 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v28 -; GCN-NEXT: v_or_b32_e32 v13, v10, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v38 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v35 -; GCN-NEXT: v_or_b32_e32 v28, v12, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v21 -; GCN-NEXT: 
v_or_b32_e32 v21, v14, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v31 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v29 -; GCN-NEXT: v_or_b32_e32 v29, v15, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v31, v19, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v45 -; GCN-NEXT: v_or_b32_e32 v27, v61, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; GCN-NEXT: v_or_b32_e32 v40, v40, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 24, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v57 -; GCN-NEXT: v_or_b32_e32 v56, v56, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: 
v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v46 -; GCN-NEXT: v_or_b32_e32 v49, v49, v47 -; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v44 -; GCN-NEXT: v_or_b32_e32 v51, v51, v43 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v36, v36, v55 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v37, v37, v53 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v30, v30, v50 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v32, v32, v39 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GCN-NEXT: v_or_b32_e32 v24, v24, v35 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v25, v25, v38 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v26, v26, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v33, v33, v45 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v23, v41, v23 -; GCN-NEXT: v_or_b32_e32 v31, v57, v31 -; GCN-NEXT: v_or_b32_e32 v17, v17, v27 -; GCN-NEXT: v_or_b32_e32 v27, v58, v40 -; GCN-NEXT: v_or_b32_e32 v35, v59, v56 -; GCN-NEXT: v_or_b32_e32 v38, v46, v49 -; GCN-NEXT: v_or_b32_e32 v39, v44, v51 -; GCN-NEXT: v_or_b32_e32 v7, v7, v36 -; GCN-NEXT: v_or_b32_e32 v36, v54, v37 -; GCN-NEXT: v_or_b32_e32 v9, v9, v30 -; GCN-NEXT: v_or_b32_e32 v30, v48, v32 -; GCN-NEXT: v_or_b32_e32 v24, v34, v24 -; GCN-NEXT: v_or_b32_e32 v11, v11, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v20 -; GCN-NEXT: v_or_b32_e32 v20, v28, v26 -; GCN-NEXT: v_or_b32_e32 v21, v21, v33 -; GCN-NEXT: v_or_b32_e32 v23, v29, v23 -; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 
offen -; GCN-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16i32_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: 
$vgpr62 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: 
v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v29, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v41, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB24_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v29, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v41, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; 
SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB24_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v46 +; SI-NEXT: v_or_b32_e32 v46, v46, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; 
SI-NEXT: v_lshlrev_b32_e32 v17, 24, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v45 +; SI-NEXT: v_and_b32_e32 v1, 
0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 
v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, 
s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i32_to_v64i8: ; VI: ; %bb.0: @@ -4142,7 +8186,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -4195,9 +8239,9 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: .LBB12_2: ; %Flow +; VI-NEXT: .LBB24_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_4 +; VI-NEXT: s_cbranch_execz .LBB24_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 @@ -4265,7 +8309,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: .LBB24_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 @@ -4473,7 
+8517,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -4526,9 +8570,9 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: .LBB24_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: s_cbranch_execz .LBB24_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 @@ -4596,7 +8640,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: .LBB24_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -4756,7 +8800,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -4790,9 +8834,9 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 
%b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 @@ -4842,7 +8886,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB12_4: ; %end +; GFX11-TRUE16-NEXT: .LBB24_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5054,7 +9098,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -5104,9 +9148,9 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 @@ -5172,7 +9216,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB12_4: ; %end +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -5342,606 +9386,2577 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16i32_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s27, v9, 8 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 +; SI-NEXT: v_alignbit_b32 v13, s25, v9, 24 +; SI-NEXT: v_alignbit_b32 v15, s25, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s25, v9, 8 +; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 +; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 +; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 +; SI-NEXT: 
v_alignbit_b32 v16, s21, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s21, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s19, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s19, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s19, v21, 8 +; SI-NEXT: v_alignbit_b32 v23, s17, v22, 24 +; SI-NEXT: v_alignbit_b32 v24, s17, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, s17, v22, 8 +; SI-NEXT: s_lshr_b32 s8, s6, 24 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 8 +; SI-NEXT: s_lshr_b32 s11, s29, 24 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s29, 8 +; SI-NEXT: s_lshr_b32 s14, s27, 24 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 8 +; SI-NEXT: s_lshr_b32 s41, s25, 24 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 8 +; SI-NEXT: s_lshr_b32 s44, s23, 24 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: s_lshr_b32 s47, s21, 24 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s59, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s61, s17, 24 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 +; 
SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s27, v9, 8 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 +; SI-NEXT: v_alignbit_b32 v13, s25, v9, 24 +; SI-NEXT: v_alignbit_b32 v15, s25, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s25, v9, 8 +; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 +; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 +; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 +; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s21, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s19, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s19, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s19, v21, 8 +; SI-NEXT: v_alignbit_b32 v23, s17, v22, 24 +; SI-NEXT: v_alignbit_b32 v24, s17, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, s17, v22, 8 +; SI-NEXT: s_lshr_b32 s8, s6, 24 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 8 +; SI-NEXT: s_lshr_b32 s11, s29, 24 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s29, 8 +; SI-NEXT: s_lshr_b32 s14, s27, 24 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 8 +; SI-NEXT: s_lshr_b32 s41, s25, 24 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 8 +; SI-NEXT: s_lshr_b32 s44, s23, 24 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: s_lshr_b32 s47, s21, 24 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s59, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s61, s17, 24 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; 
SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s63, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s62, 0xff +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s61, 24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v21, s4, v21 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s59, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s58, 24 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v18, s4, v18 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; 
SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s56, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s47, 24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s45, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s44, 24 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s43, 8 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 
0xff, v15 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s42, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s41, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v10 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s15, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s12, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s11, s11, 24 +; SI-NEXT: 
v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s11, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s9, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s8, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; 
SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v16i32_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v4, s30, 0 +; VI-NEXT: v_writelane_b32 v4, s31, 1 +; VI-NEXT: v_writelane_b32 v4, s34, 2 +; VI-NEXT: v_writelane_b32 v4, s35, 3 +; VI-NEXT: v_writelane_b32 v4, s36, 4 +; VI-NEXT: v_writelane_b32 v4, s37, 5 +; VI-NEXT: v_writelane_b32 v4, s38, 6 +; VI-NEXT: v_writelane_b32 v4, s39, 7 +; VI-NEXT: v_writelane_b32 v4, s48, 8 +; VI-NEXT: v_writelane_b32 v4, 
s49, 9 +; VI-NEXT: v_writelane_b32 v4, s50, 10 +; VI-NEXT: v_writelane_b32 v4, s51, 11 +; VI-NEXT: v_writelane_b32 v4, s52, 12 +; VI-NEXT: v_writelane_b32 v4, s53, 13 +; VI-NEXT: v_writelane_b32 v4, s54, 14 +; VI-NEXT: v_writelane_b32 v4, s55, 15 +; VI-NEXT: v_writelane_b32 v4, s64, 16 +; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s29, 8 +; VI-NEXT: s_lshr_b32 s72, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 8 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 8 +; VI-NEXT: s_lshr_b32 s90, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s34, s23, 8 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s39, s21, 8 +; VI-NEXT: s_lshr_b32 s48, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s65, s17, 8 +; VI-NEXT: 
s_lshr_b32 s66, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s5, s5, 3 +; VI-NEXT: s_add_i32 s4, s4, 3 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s29, 8 +; VI-NEXT: s_lshr_b32 s72, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 8 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; 
VI-NEXT: s_lshr_b32 s89, s25, 8 +; VI-NEXT: s_lshr_b32 s90, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s34, s23, 8 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s39, s21, 8 +; VI-NEXT: s_lshr_b32 s48, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s65, s17, 8 +; VI-NEXT: s_lshr_b32 s66, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: s_and_b32 s7, s16, 0xff +; VI-NEXT: s_lshl_b32 s9, s67, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s66, 0xff +; VI-NEXT: s_lshl_b32 s11, s44, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_and_b32 s7, s17, 0xff +; VI-NEXT: s_lshl_b32 s9, s65, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s64, 0xff +; VI-NEXT: s_lshl_b32 s11, s55, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s54, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s53, 0xff +; VI-NEXT: s_lshl_b32 s11, s42, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: 
v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s19, 0xff +; VI-NEXT: s_lshl_b32 s9, s52, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s51, 0xff +; VI-NEXT: s_lshl_b32 s11, s50, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s49, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s48, 0xff +; VI-NEXT: s_lshl_b32 s11, s40, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s21, 0xff +; VI-NEXT: s_lshl_b32 s9, s39, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s38, 0xff +; VI-NEXT: s_lshl_b32 s11, s37, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s9, s36, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s35, 0xff +; VI-NEXT: s_lshl_b32 s11, s14, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s23, 0xff +; VI-NEXT: s_lshl_b32 s9, s34, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s31, 0xff +; VI-NEXT: s_lshl_b32 s11, s30, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: 
s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_lshl_b32 s9, s91, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s90, 0xff +; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s25, 0xff +; VI-NEXT: s_lshl_b32 s9, s89, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s88, 0xff +; VI-NEXT: s_lshl_b32 s11, s79, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s9, s78, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s77, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: s_lshl_b32 s9, s76, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s75, 0xff +; VI-NEXT: s_lshl_b32 s10, s74, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 
s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s9, s73, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s72, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s29, 0xff +; VI-NEXT: s_lshl_b32 s8, s63, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s62, 0xff +; VI-NEXT: s_lshl_b32 s9, s61, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s7, s60, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s59, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: s_lshl_b32 s5, s58, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s57, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s67, v4, 19 +; VI-NEXT: v_readlane_b32 s66, v4, 18 +; VI-NEXT: v_readlane_b32 s65, v4, 17 +; VI-NEXT: v_readlane_b32 s64, v4, 16 +; VI-NEXT: 
v_readlane_b32 s55, v4, 15 +; VI-NEXT: v_readlane_b32 s54, v4, 14 +; VI-NEXT: v_readlane_b32 s53, v4, 13 +; VI-NEXT: v_readlane_b32 s52, v4, 12 +; VI-NEXT: v_readlane_b32 s51, v4, 11 +; VI-NEXT: v_readlane_b32 s50, v4, 10 +; VI-NEXT: v_readlane_b32 s49, v4, 9 +; VI-NEXT: v_readlane_b32 s48, v4, 8 +; VI-NEXT: v_readlane_b32 s39, v4, 7 +; VI-NEXT: v_readlane_b32 s38, v4, 6 +; VI-NEXT: v_readlane_b32 s37, v4, 5 +; VI-NEXT: v_readlane_b32 s36, v4, 4 +; VI-NEXT: v_readlane_b32 s35, v4, 3 +; VI-NEXT: v_readlane_b32 s34, v4, 2 +; VI-NEXT: v_readlane_b32 s31, v4, 1 +; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; 
VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v16i32_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v4, s30, 0 +; GFX9-NEXT: v_writelane_b32 v4, s31, 1 +; GFX9-NEXT: v_writelane_b32 v4, s34, 2 +; GFX9-NEXT: v_writelane_b32 v4, s35, 3 +; GFX9-NEXT: v_writelane_b32 v4, s36, 4 +; GFX9-NEXT: v_writelane_b32 v4, s37, 5 +; GFX9-NEXT: v_writelane_b32 v4, s38, 6 +; GFX9-NEXT: v_writelane_b32 v4, s39, 7 +; GFX9-NEXT: v_writelane_b32 v4, s48, 8 +; GFX9-NEXT: v_writelane_b32 v4, s49, 9 +; GFX9-NEXT: v_writelane_b32 v4, s50, 10 +; GFX9-NEXT: v_writelane_b32 v4, s51, 11 +; GFX9-NEXT: v_writelane_b32 v4, s52, 12 +; GFX9-NEXT: v_writelane_b32 v4, s53, 13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v4, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: v_writelane_b32 v4, s55, 15 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s58, s5, 8 +; GFX9-NEXT: s_lshr_b32 s59, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 
24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s29, 8 +; GFX9-NEXT: s_lshr_b32 s72, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 8 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s89, s25, 8 +; GFX9-NEXT: s_lshr_b32 s90, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s94, s23, 8 +; GFX9-NEXT: s_lshr_b32 s95, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s35, s21, 8 +; GFX9-NEXT: s_lshr_b32 s36, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s48, s19, 8 +; GFX9-NEXT: s_lshr_b32 s49, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s53, s17, 8 +; GFX9-NEXT: s_lshr_b32 s54, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; 
GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s5, s5, 3 +; GFX9-NEXT: s_add_i32 s4, s4, 3 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s58, s5, 8 +; GFX9-NEXT: s_lshr_b32 s59, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s29, 8 +; GFX9-NEXT: s_lshr_b32 s72, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 8 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s89, s25, 8 +; GFX9-NEXT: s_lshr_b32 s90, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s94, s23, 8 +; GFX9-NEXT: s_lshr_b32 s95, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s35, s21, 8 +; GFX9-NEXT: s_lshr_b32 s36, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s48, s19, 8 +; GFX9-NEXT: s_lshr_b32 s49, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, 
s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s53, s17, 8 +; GFX9-NEXT: s_lshr_b32 s54, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: s_and_b32 s7, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s55, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s44, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s53, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s51, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s50, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s42, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s48, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s38, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s37, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, 
s36, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s40, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s35, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s34, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s31, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s30, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s95, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s14, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s94, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s93, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s92, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s91, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s90, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s12, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s89, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s79, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s78, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s76, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s75, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s74, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s73, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s72, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s63, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s61, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; 
GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s60, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s7 +; GFX9-NEXT: s_and_b32 s7, s59, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s58, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s56, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_readlane_b32 s55, v4, 15 +; GFX9-NEXT: v_readlane_b32 s54, v4, 14 +; GFX9-NEXT: v_readlane_b32 s53, v4, 13 +; GFX9-NEXT: v_readlane_b32 s52, v4, 12 +; GFX9-NEXT: v_readlane_b32 s51, v4, 11 +; GFX9-NEXT: v_readlane_b32 s50, v4, 10 +; GFX9-NEXT: v_readlane_b32 s49, v4, 9 +; GFX9-NEXT: v_readlane_b32 s48, v4, 8 +; GFX9-NEXT: v_readlane_b32 s39, v4, 7 +; GFX9-NEXT: v_readlane_b32 s38, v4, 6 +; GFX9-NEXT: v_readlane_b32 s37, v4, 5 +; GFX9-NEXT: v_readlane_b32 s36, v4, 4 +; GFX9-NEXT: v_readlane_b32 s35, v4, 3 +; GFX9-NEXT: v_readlane_b32 s34, v4, 2 +; GFX9-NEXT: v_readlane_b32 s31, v4, 1 +; GFX9-NEXT: v_readlane_b32 s30, v4, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: ; 
implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-TRUE16-LABEL: bitcast_v16i32_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v17, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s49, 9 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 8 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-TRUE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], 
s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 8 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-TRUE16-NEXT: .LBB25_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s49 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s48 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s40 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s39 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s38 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s37 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s36 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s35 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s31 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s30 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; 
GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s95 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s94 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s93 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s92 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s91 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s90 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s89 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: 
s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s88 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s79 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s78 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s75 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s73 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s2, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s72 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s63 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s62 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s61 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s60 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s59 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s58 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s57 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s47 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s45 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s42 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v17, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v17, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v17, 
7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v17, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v17, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v17, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v17, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v17, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v17, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v17, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v17, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB25_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB25_2 +; +; GFX11-FAKE16-LABEL: bitcast_v16i32_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v17, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s48, 8 +; GFX11-FAKE16-NEXT: 
s_cbranch_scc0 .LBB25_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; 
GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-FAKE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 8 +; GFX11-FAKE16-NEXT: .LBB25_3: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s48, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s39, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s40, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s38, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s37, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s36, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; 
GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s35, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s34, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s28, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s31, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s30, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, vcc_hi, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s95, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s94, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s93, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s92, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s91, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; 
GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s90, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s89, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s88, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s79, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s78, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s77, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s76, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s10, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s75, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s74, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s73, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s72, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s63, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; 
GFX11-FAKE16-NEXT: s_and_b32 s5, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s62, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s61, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s60, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s59, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s58, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s6, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s57, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s56, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s47, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s46, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s45, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s27, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s44, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s42, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; 
GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v17, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v17, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v17, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v17, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v17, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v17, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v17, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v17, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v17, 0 +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v17, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB25_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: 
; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: s_branch .LBB25_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, 
%cmp.false ] + ret <64 x i8> %phi +} + define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v34, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v36, v6 -; GCN-NEXT: v_mov_b32_e32 v32, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: 
v_lshlrev_b32_e32 v42, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v58 -; 
GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 24, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 24, v46 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_or_b32_e32 v0, v0, v42 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 
v2, 0xff, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v40 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_or_b32_e32 v3, v3, v55 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v37 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v54 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v53 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v49 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v48 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v39 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v62 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v61 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v33, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: 
v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v28, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GCN-NEXT: v_or_b32_e32 v27, v35, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v29, v36, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v37 -; GCN-NEXT: v_or_b32_e32 v31, v38, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v18, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v25, v59, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v30, v57, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v32, v58, v8 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v9 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v8, v10 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v8, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v8, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v8, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v19, v46, v19 -; GCN-NEXT: v_or_b32_e32 v0, v0, v7 -; GCN-NEXT: v_or_b32_e32 v1, v1, v6 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_or_b32_e32 v4, v17, v21 -; GCN-NEXT: v_or_b32_e32 v5, v20, v22 
-; GCN-NEXT: v_or_b32_e32 v6, v23, v24 -; GCN-NEXT: v_or_b32_e32 v7, v26, v28 -; GCN-NEXT: v_or_b32_e32 v8, v27, v25 -; GCN-NEXT: v_or_b32_e32 v9, v29, v30 -; GCN-NEXT: v_or_b32_e32 v10, v31, v32 -; GCN-NEXT: v_or_b32_e32 v11, v33, v34 -; GCN-NEXT: v_or_b32_e32 v12, v35, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v16 -; GCN-NEXT: v_or_b32_e32 v15, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; 
kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; kill: killed $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: .LBB13_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v40, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 
v3, v55, v3 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v51 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; 
GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v17, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v29, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v27, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v17, v44, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; GCN-NEXT: v_or_b32_e32 v20, v45, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GCN-NEXT: v_or_b32_e32 v23, v56, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v31 -; GCN-NEXT: 
v_lshlrev_b32_e32 v28, 16, v32 -; GCN-NEXT: v_or_b32_e32 v29, v43, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_or_b32_e32 v16, v59, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: 
v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_or_b32_e32 v22, v58, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x300, v19 -; GCN-NEXT: v_or_b32_e32 v30, v46, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v31, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v10, v9 -; GCN-NEXT: v_or_b32_e32 v6, v12, v11 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, 
v16, v15 -; GCN-NEXT: v_or_b32_e32 v9, v18, v17 -; GCN-NEXT: v_or_b32_e32 v10, v22, v20 -; GCN-NEXT: v_or_b32_e32 v11, v24, v23 -; GCN-NEXT: v_or_b32_e32 v12, v26, v25 -; GCN-NEXT: v_or_b32_e32 v13, v28, v27 -; GCN-NEXT: v_or_b32_e32 v14, v21, v29 -; GCN-NEXT: v_or_b32_e32 v15, v30, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 -; GCN-NEXT: .LBB13_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 
offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v16i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 
offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, 
off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v50 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v54 +; SI-NEXT: s_waitcnt vmcnt(10) 
+; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v56 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 
v2, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 
+; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, 
v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v59 +; SI-NEXT: v_and_b32_e32 v9, 
0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; 
implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: .LBB26_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; 
SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; 
SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; 
SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v30, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 
0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v40, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v19 +; 
SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v21, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 +; SI-NEXT: .LBB26_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v16i32: ; VI: ; %bb.0: @@ -6055,7 +12070,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -6219,9 +12234,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: .LBB13_2: ; %Flow +; VI-NEXT: .LBB26_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_4 +; VI-NEXT: s_cbranch_execz .LBB26_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -6370,7 +12385,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: .LBB13_4: ; %end +; VI-NEXT: .LBB26_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -6515,7 +12530,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; 
%cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -6679,9 +12694,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: .LBB13_2: ; %Flow +; GFX9-NEXT: .LBB26_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_4 +; GFX9-NEXT: s_cbranch_execz .LBB26_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -6830,7 +12845,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: .LBB13_4: ; %end +; GFX9-NEXT: .LBB26_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -6957,15 +12972,15 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-TRUE16-NEXT: .LBB13_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h @@ -7147,8 +13162,8 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-TRUE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 @@ -7427,15 +13442,15 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-FAKE16-NEXT: .LBB13_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v37 @@ -7614,8 +13629,8 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-FAKE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -7812,35 +13827,2147 @@ end: ret <16 x i32> %phi } +define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v16i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, 
s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 
v47, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v5, 
v2, v3 +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, 
v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: 
s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 
0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 
v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v64i8_to_v16i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_mov_b32_e32 v61, v28 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; 
VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; VI-NEXT: s_waitcnt 
vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; 
VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: 
s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; 
VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v16i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_mov_b32_e32 v61, v28 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; GFX9-NEXT: 
buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; GFX9-NEXT: 
s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, 
v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
+; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s5, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; 
GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, 
s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: 
v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v34, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; 
GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v96, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v98, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-TRUE16-NEXT: .LBB27_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; 
GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; 
GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v7, v7, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v21, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v21, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB27_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB27_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB27_2 +; +; GFX11-FAKE16-LABEL: 
bitcast_v64i8_to_v16i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21 
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: 
s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v85 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v69 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v39 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-FAKE16-NEXT: .LBB27_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; 
GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v32 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v38 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 
0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v84, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v85, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v82, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v68, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v69, v11 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v80, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v65, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v67, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v64, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v30 +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v27, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v29, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v54, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v55, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v19, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v21, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v23, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB27_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB27_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB27_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 
1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB28_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v8i64: ; VI: ; %bb.0: @@ -7849,7 +15976,7 @@ define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 
; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -7867,7 +15994,7 @@ define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: .LBB28_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7878,7 +16005,7 @@ define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -7896,7 +16023,7 @@ define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: .LBB28_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7908,7 +16035,7 @@ define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -7918,7 +16045,7 @@ define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB14_2: ; %end +; GFX11-NEXT: .LBB28_2: ; 
%end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7938,35 +16065,237 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v16f32_to_v8i64_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v16f32_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v16f32_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: 
v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-LABEL: bitcast_v16f32_to_v8i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, 
s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; 
GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 
s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v16f32: ; VI: ; %bb.0: @@ -7975,7 +16304,7 @@ define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -7993,7 +16322,7 @@ define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: .LBB30_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8004,7 +16333,7 @@ define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -8022,7 +16351,7 @@ define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB15_2: ; %end +; GFX9-NEXT: .LBB30_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8034,7 +16363,7 @@ define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; 
GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -8056,7 +16385,7 @@ define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB15_2: ; %end +; GFX11-NEXT: .LBB30_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8076,35 +16405,233 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v8i64_to_v16f32_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: 
v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v8i64_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, 
vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v8i64_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB31_3: 
; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-LABEL: bitcast_v8i64_to_v16f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB31_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> 
%a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: 
v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v8f64: ; VI: ; %bb.0: @@ -8113,7 +16640,7 @@ define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -8131,7 +16658,7 @@ define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: .LBB32_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8142,7 +16669,7 @@ define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -8160,7 +16687,7 @@ define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: .LBB32_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8172,7 +16699,7 @@ 
define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -8182,7 +16709,7 @@ define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: .LBB32_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8202,27 +16729,229 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v16f32_to_v8f64_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: 
.LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v16f32_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; 
VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v16f32_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 
+; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-LABEL: bitcast_v16f32_to_v8f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 
+; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 
v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v16f32: ; VI: ; %bb.0: @@ -8231,7 +16960,7 @@ define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -8241,7 +16970,7 @@ define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8252,7 +16981,7 @@ define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -8262,7 +16991,7 @@ define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: .LBB34_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8274,7 +17003,7 @@ define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: 
s_cbranch_execz .LBB17_2 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -8284,7 +17013,7 @@ define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: .LBB34_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8304,102 +17033,272 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v8f64_to_v16f32_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], 
v[0:1], 1.0 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v8f64_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v8f64_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 
+; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-LABEL: bitcast_v8f64_to_v16f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], s[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; 
GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v30, v15 -; GCN-NEXT: v_mov_b32_e32 v28, v14 -; GCN-NEXT: v_mov_b32_e32 v26, v13 -; GCN-NEXT: v_mov_b32_e32 v24, v12 -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; 
implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; 
GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v16, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; 
SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB36_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 
v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB36_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32i16: ; VI: ; %bb.0: @@ -8408,7 +17307,7 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: s_cbranch_execz .LBB36_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -8426,7 +17325,7 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8437,7 +17336,7 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -8455,7 +17354,7 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: .LBB36_2: ; %end ; 
GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8467,7 +17366,7 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -8477,7 +17376,7 @@ define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: .LBB36_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8497,183 +17396,438 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: s_cbranch_scc0 
.LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, 
v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v16f32_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: 
v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v16f32_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; 
GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v16f32_to_v32i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: 
v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; 
GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_4 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB19_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; GCN-NEXT: v_or_b32_e32 v0, v0, v54 -; GCN-NEXT: v_or_b32_e32 v1, v1, v55 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v48 -; GCN-NEXT: v_or_b32_e32 v4, v4, v49 -; GCN-NEXT: v_or_b32_e32 v5, v5, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v51 -; GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_or_b32_e32 v15, v15, v53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; 
GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: .LBB19_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; 
GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_or_b32_e32 v4, v49, v4 -; GCN-NEXT: v_or_b32_e32 v5, v50, v5 -; GCN-NEXT: v_or_b32_e32 v6, v51, v6 -; GCN-NEXT: v_or_b32_e32 v7, v52, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_or_b32_e32 v15, v53, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], 
s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; 
SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v55 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; 
SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB38_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v8, v50, 
v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v48, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; SI-NEXT: .LBB38_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v16f32: ; VI: ; %bb.0: @@ -8682,7 +17836,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v16, 3, v15 @@ -8733,7 +17887,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v17 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8744,7 +17898,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -8762,7 +17916,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8774,7 +17928,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -8792,7 +17946,7 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8812,221 +17966,578 @@ end: ret <16 x float> 
%phi } +define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v26, v14 +; SI-NEXT: v_mov_b32_e32 v25, v12 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: s_or_b32 s8, s8, s9 +; 
SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v17 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: 
s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v32i16_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 
3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: 
v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB39_3: 
; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v32i16_to_v16f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: 
v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v15 -; GCN-NEXT: v_mov_b32_e32 v34, v14 -; GCN-NEXT: v_mov_b32_e32 v35, v13 -; GCN-NEXT: v_mov_b32_e32 v36, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v11 -; GCN-NEXT: v_mov_b32_e32 v38, v10 -; GCN-NEXT: v_mov_b32_e32 v39, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v49, v7 -; GCN-NEXT: v_mov_b32_e32 v50, v6 -; GCN-NEXT: v_mov_b32_e32 v51, v5 -; GCN-NEXT: v_mov_b32_e32 v52, v4 -; GCN-NEXT: v_mov_b32_e32 v53, v3 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; 
implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v51 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v52 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v53 -; GCN-NEXT: 
s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 
-; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v32 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v55 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v54 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v53 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v52 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v51 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v50 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v49 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v48 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v39 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v38 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v37 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v36 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v35 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v34 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, 
v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v15 +; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_mov_b32_e32 v34, v13 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v36, v11 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v39, v8 +; SI-NEXT: v_mov_b32_e32 
v48, v7 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v5 +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v3 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB40_4 +; SI-NEXT: .LBB40_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB40_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 
16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: 
; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: .LBB40_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: 
v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32f16: ; VI: ; %bb.0: @@ -9035,7 +18546,7 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -9053,7 +18564,7 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: .LBB40_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9064,7 +18575,7 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -9082,7 +18593,7 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: .LBB40_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9094,7 +18605,7 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -9104,7 +18615,7 @@ define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: .LBB40_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9124,269 +18635,587 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v16f32_to_v32f16_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; 
SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; 
SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v30, s7, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v16f32_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 
v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v16f32_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, 
s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-LABEL: bitcast_v16f32_to_v32f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: 
v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 
-; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GCN-NEXT: v_or_b32_e32 v0, v44, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: v_or_b32_e32 v3, v50, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v34, v7 -; GCN-NEXT: v_or_b32_e32 v8, v33, v8 -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: v_or_b32_e32 v10, v31, v10 -; GCN-NEXT: v_or_b32_e32 v11, v21, v11 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; 
implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v26, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v26, v24 -; GCN-NEXT: v_or_b32_e32 v10, v28, v27 -; GCN-NEXT: v_or_b32_e32 v11, v21, v29 -; GCN-NEXT: v_or_b32_e32 v12, v19, v25 -; GCN-NEXT: v_or_b32_e32 
v13, v18, v23 -; GCN-NEXT: v_or_b32_e32 v14, v17, v22 -; GCN-NEXT: v_or_b32_e32 v15, v16, v20 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 
v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: 
v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v31, v11 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: 
$vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 
v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, 
v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 
offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v16f32: ; VI: ; %bb.0: @@ -9395,7 +19224,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 0x200 ; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9446,7 +19275,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v17 ; VI-NEXT: v_or_b32_e32 v0, v0, v16 -; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9457,7 +19286,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -9476,7 +19305,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: .LBB42_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9488,7 +19317,7 @@ define <16 x 
float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -9506,7 +19335,7 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: .LBB42_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9526,170 +19355,570 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v32f16_to_v16f32_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 +; SI-NEXT: v_or_b32_e32 v10, v27, v10 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: 
v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_or_b32_e32 
v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v32f16_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; 
VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, 
v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v32f16_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-LABEL: bitcast_v32f16_to_v16f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; 
%Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> 
%a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v55, v15 -; GCN-NEXT: v_mov_b32_e32 v54, v14 -; GCN-NEXT: v_mov_b32_e32 v53, v13 -; GCN-NEXT: v_mov_b32_e32 v52, v12 -; GCN-NEXT: v_mov_b32_e32 v51, v11 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v49, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v39, v7 -; GCN-NEXT: v_mov_b32_e32 v38, v6 -; GCN-NEXT: v_mov_b32_e32 v37, v5 -; GCN-NEXT: v_mov_b32_e32 v36, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v3 -; GCN-NEXT: v_mov_b32_e32 v34, v2 -; GCN-NEXT: v_mov_b32_e32 v33, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; 
GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; 
GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v32 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v33 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v34 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v35 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v36 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v37 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v38 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v39 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v48 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v49 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v50 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v51 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v52 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v53 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v54 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v55 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GCN-NEXT: v_and_b32_e32 v17, 
0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v53, v13 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v51, v11 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v49, v9 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v39, v7 +; SI-NEXT: v_mov_b32_e32 v38, v6 +; SI-NEXT: v_mov_b32_e32 v37, v5 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v35, v3 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 
+; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_and_b32_e32 v11, 
0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v55 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, 
v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v32bf16: ; VI: ; %bb.0: @@ -9698,7 +19927,7 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -9716,7 +19945,7 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: 
.LBB22_2: ; %end +; VI-NEXT: .LBB44_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9727,7 +19956,7 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -9745,7 +19974,7 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: .LBB44_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9757,7 +19986,7 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 ; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 @@ -9767,7 +19996,7 @@ define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: .LBB44_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9787,237 +20016,537 @@ end: ret <32 x bfloat> %phi } +define inreg <32 x bfloat> @bitcast_v16f32_to_v32bf16_scalar(<16 
x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v35, s17 +; SI-NEXT: v_mov_b32_e32 v36, s18 +; SI-NEXT: v_mov_b32_e32 v37, s19 +; SI-NEXT: v_mov_b32_e32 v38, s20 +; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v50, s24 +; SI-NEXT: v_mov_b32_e32 v51, s25 +; SI-NEXT: v_mov_b32_e32 v52, s26 +; SI-NEXT: v_mov_b32_e32 v53, s27 +; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v55, s29 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v53 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v34 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v34 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v33, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v33 +; SI-NEXT: v_mov_b32_e32 v1, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v16f32_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; 
VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v16f32_to_v32bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; 
GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-LABEL: bitcast_v16f32_to_v32bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s26, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s25, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s24, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s23, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s21, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s20, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s19, 1.0 +; GFX11-NEXT: 
v_add_f32_e64 v6, s18, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s16, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s15, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s14, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s13, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s12, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, 
off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v46 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; 
GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; GCN-NEXT: v_alignbit_b32 v0, v0, v45, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v2, v51, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v49, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v39, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v37, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v36, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v34, 16 -; GCN-NEXT: v_alignbit_b32 v8, v8, v33, 16 -; GCN-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v31, 16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v12, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v14, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 
-; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v33 -; GCN-NEXT: 
v_and_b32_e32 v15, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: 
v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v10, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v11, v29, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v25, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v23, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v22, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v20, v16, 16 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 
offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 +; SI-NEXT: 
v_mul_f32_e32 v34, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 +; SI-NEXT: v_alignbit_b32 
v11, v11, v32, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: 
v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: 
v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v16f32: ; VI: ; %bb.0: @@ -10026,7 +20555,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -10317,7 +20846,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -10328,7 +20857,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_add_f32_e32 v16, 
0x40c00000, v16 @@ -10572,7 +21101,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v16, s7 -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10584,7 +21113,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -10858,7 +21387,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v22, v25, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v20, v0 -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10870,7 +21399,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v15 @@ -11148,7 +21677,7 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v21, v26, vcc_lo ; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11168,413 +21697,1657 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 
v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 
0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch 
.LBB47_2 +; +; VI-LABEL: bitcast_v32bf16_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v4, v4 +; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: 
v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; 
VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, 
v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_branch .LBB47_5 +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB47_5: ; %end +; VI-NEXT: v_readlane_b32 s31, v19, 1 +; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; 
GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: 
v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 
1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: 
v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: 
v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_add_f32_e32 v2, 
s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: s_branch .LBB47_5 +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB47_5: ; %end +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v16f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-NEXT: .LBB47_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-NEXT: 
s_and_b32 s0, s26, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s2, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s3, s25, 16 +; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s24, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: 
v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshl_b32 s1, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s1, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; 
GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7 +; GFX11-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4 +; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s1, s18, 16 +; GFX11-NEXT: 
v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_lshl_b32 s1, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo +; 
GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16 +; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s1, s15, 16 +; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17 +; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21 +; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB47_3: +; GFX11-NEXT: s_branch .LBB47_2 +; GFX11-NEXT: .LBB47_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x 
float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v16f32_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; 
GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded 
Spill -; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v24, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 
v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v24, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v49, v6, 
v5, 24 -; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v60 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_or_b32_e32 v60, v1, v18 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v17 -; GCN-NEXT: v_or_b32_e32 v17, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v58 -; GCN-NEXT: v_or_b32_e32 v58, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v63 -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v46 -; GCN-NEXT: v_or_b32_e32 v46, v1, v2 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v44 -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 24, v61 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v59 -; GCN-NEXT: v_or_b32_e32 v44, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v40 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_or_b32_e32 v7, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 24, v56 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v47 -; GCN-NEXT: v_or_b32_e32 v54, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v49 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v48 -; GCN-NEXT: v_or_b32_e32 v9, v4, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v55 -; GCN-NEXT: v_or_b32_e32 v48, v5, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v34 -; GCN-NEXT: v_or_b32_e32 v34, v6, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v50 -; GCN-NEXT: v_or_b32_e32 v11, v8, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 
-; GCN-NEXT: v_and_b32_e32 v50, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v30 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v28 -; GCN-NEXT: v_or_b32_e32 v13, v10, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v38 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v35 -; GCN-NEXT: v_or_b32_e32 v28, v12, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v21 -; GCN-NEXT: v_or_b32_e32 v21, v14, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v31 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v29 -; GCN-NEXT: v_or_b32_e32 v29, v15, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v31, v19, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v45 -; GCN-NEXT: v_or_b32_e32 v27, v61, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; GCN-NEXT: v_or_b32_e32 v40, v40, v19 -; GCN-NEXT: v_add_i32_e32 
v19, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 24, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v57 -; GCN-NEXT: v_or_b32_e32 v56, v56, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v46 -; GCN-NEXT: v_or_b32_e32 v49, v49, v47 -; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v44 -; GCN-NEXT: v_or_b32_e32 v51, v51, v43 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v36, v36, v55 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v37, v37, v53 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v30, v30, v50 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v32, v32, v39 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GCN-NEXT: v_or_b32_e32 v24, v24, v35 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v25, v25, v38 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v26, v26, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v33, v33, v45 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v23, v41, v23 -; GCN-NEXT: v_or_b32_e32 v31, v57, v31 -; GCN-NEXT: 
v_or_b32_e32 v17, v17, v27 -; GCN-NEXT: v_or_b32_e32 v27, v58, v40 -; GCN-NEXT: v_or_b32_e32 v35, v59, v56 -; GCN-NEXT: v_or_b32_e32 v38, v46, v49 -; GCN-NEXT: v_or_b32_e32 v39, v44, v51 -; GCN-NEXT: v_or_b32_e32 v7, v7, v36 -; GCN-NEXT: v_or_b32_e32 v36, v54, v37 -; GCN-NEXT: v_or_b32_e32 v9, v9, v30 -; GCN-NEXT: v_or_b32_e32 v30, v48, v32 -; GCN-NEXT: v_or_b32_e32 v24, v34, v24 -; GCN-NEXT: v_or_b32_e32 v11, v11, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v20 -; GCN-NEXT: v_or_b32_e32 v20, v28, v26 -; GCN-NEXT: v_or_b32_e32 v21, v21, v33 -; GCN-NEXT: v_or_b32_e32 v23, v29, v23 -; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v16f32_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 
+; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v29, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v41, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, 
v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; 
SI-NEXT: v_alignbit_b32 v29, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v41, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: 
s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v46 +; SI-NEXT: v_or_b32_e32 v46, v46, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; 
SI-NEXT: v_and_b32_e32 v2, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, 
v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 
v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f32_to_v64i8: ; VI: ; %bb.0: @@ -11648,7 +23421,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -11701,9 +23474,9 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; 
VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -11771,7 +23544,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 @@ -11979,7 +23752,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -12032,9 +23805,9 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -12102,7 +23875,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: 
v_lshlrev_b16_e32 v24, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -12262,7 +24035,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -12296,9 +24069,9 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v15, 1.0, v15 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v7, 1.0, v7 @@ -12340,7 +24113,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB24_4: ; %end +; GFX11-TRUE16-NEXT: .LBB48_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -12552,7 +24325,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: 
s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -12602,9 +24375,9 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v15, 1.0, v15 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v7, 1.0, v7 @@ -12662,7 +24435,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -12832,606 +24605,2717 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v16f32_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: v_mov_b32_e32 v25, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_mov_b32_e32 v14, s21 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v9, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; 
SI-NEXT: v_mov_b32_e32 v7, s25 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v5, s27 +; SI-NEXT: v_mov_b32_e32 v4, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v3, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v10, v2, v1, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v13, v2, v1, 8 +; SI-NEXT: v_alignbit_b32 v16, v3, v4, 24 +; SI-NEXT: v_alignbit_b32 v17, v3, v4, 16 +; SI-NEXT: v_alignbit_b32 v18, v3, v4, 8 +; SI-NEXT: v_alignbit_b32 v21, 
v5, v6, 24 +; SI-NEXT: v_alignbit_b32 v22, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v23, v5, v6, 8 +; SI-NEXT: v_alignbit_b32 v29, v7, v8, 24 +; SI-NEXT: v_alignbit_b32 v30, v7, v8, 16 +; SI-NEXT: v_alignbit_b32 v31, v7, v8, 8 +; SI-NEXT: v_alignbit_b32 v35, v9, v11, 24 +; SI-NEXT: v_alignbit_b32 v36, v9, v11, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v11, 8 +; SI-NEXT: v_alignbit_b32 v49, v14, v15, 24 +; SI-NEXT: v_alignbit_b32 v50, v14, v15, 16 +; SI-NEXT: v_alignbit_b32 v52, v14, v15, 8 +; SI-NEXT: v_alignbit_b32 v55, v19, v20, 24 +; SI-NEXT: v_alignbit_b32 v41, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v43, v19, v20, 8 +; SI-NEXT: v_alignbit_b32 v46, v25, v28, 24 +; SI-NEXT: v_alignbit_b32 v56, v25, v28, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v25, v28, 8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v5 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v14 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v19 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v25 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25 +; 
SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_alignbit_b32 v10, v2, v1, 24 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v13, v2, v1, 8 +; SI-NEXT: v_alignbit_b32 v16, v3, v4, 24 +; SI-NEXT: v_alignbit_b32 v17, v3, v4, 16 +; SI-NEXT: v_alignbit_b32 v18, v3, v4, 8 +; SI-NEXT: v_alignbit_b32 v21, v5, v6, 24 +; SI-NEXT: v_alignbit_b32 v22, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v23, v5, v6, 8 +; SI-NEXT: v_alignbit_b32 v29, v7, v8, 24 +; SI-NEXT: v_alignbit_b32 v30, v7, v8, 16 +; SI-NEXT: v_alignbit_b32 v31, v7, v8, 8 +; SI-NEXT: v_alignbit_b32 v35, v9, v11, 24 +; SI-NEXT: v_alignbit_b32 v36, v9, v11, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v11, 8 +; SI-NEXT: v_alignbit_b32 v49, v14, v15, 24 +; SI-NEXT: v_alignbit_b32 v50, v14, v15, 16 +; SI-NEXT: v_alignbit_b32 v52, v14, v15, 8 +; SI-NEXT: v_alignbit_b32 v55, v19, v20, 24 +; SI-NEXT: v_alignbit_b32 v41, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v43, v19, v20, 8 +; SI-NEXT: v_alignbit_b32 v46, v25, v28, 24 +; SI-NEXT: v_alignbit_b32 v56, v25, v28, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v25, v28, 8 +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; SI-NEXT: 
v_lshrrev_b32_e32 v32, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v5 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v14 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v19 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v25 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v28, v28, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v46 +; SI-NEXT: v_or_b32_e32 v46, v46, v56 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v28, v28, v46 +; SI-NEXT: v_or_b32_e32 v10, v25, v10 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v63 +; SI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v62 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v25, v28, v25 +; SI-NEXT: v_or_b32_e32 v10, v10, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 +; 
SI-NEXT: buffer_store_dword v10, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v20, v25, v20 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v10, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v61 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v59 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v52 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v49 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v15, v19, v15 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v57 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v45 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v10, v10, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 
20, v0 +; SI-NEXT: buffer_store_dword v10, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v35 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v44 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v40 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v54 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v51 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 +; SI-NEXT: buffer_store_dword 
v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v23 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v34 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 
offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 
offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; 
implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v16f32_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 
; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s59, s5, 8 +; VI-NEXT: s_lshr_b32 s58, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s72, s29, 8 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s27, 8 +; VI-NEXT: s_lshr_b32 s76, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 8 +; VI-NEXT: s_lshr_b32 s89, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s35, s23, 8 +; VI-NEXT: s_lshr_b32 s34, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s48, s21, 8 +; VI-NEXT: s_lshr_b32 s39, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: 
s_lshr_b32 s53, s19, 8 +; VI-NEXT: s_lshr_b32 s52, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s66, s17, 8 +; VI-NEXT: s_lshr_b32 s65, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v6, s27, 1.0 +; VI-NEXT: v_add_f32_e64 v5, s26, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s5, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s4, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s25, 1.0 +; VI-NEXT: v_add_f32_e64 v7, s24, 1.0 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[5:6] +; VI-NEXT: v_add_f32_e64 v10, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v9, s22, 1.0 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_add_f32_e64 v12, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v11, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s29, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s28, 1.0 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[9:10] +; VI-NEXT: v_add_f32_e64 v16, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v15, s18, 1.0 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] +; VI-NEXT: v_add_f32_e64 v18, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v17, s16, 1.0 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte 
Folded Spill +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v17 +; VI-NEXT: s_branch .LBB49_5 +; VI-NEXT: .LBB49_3: +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; 
implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v20, s44 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v20, s42 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v11, s20 
+; VI-NEXT: v_mov_b32_e32 v12, s21 +; VI-NEXT: v_mov_b32_e32 v9, s22 +; VI-NEXT: v_mov_b32_e32 v10, s23 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v8, s25 +; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: v_mov_b32_e32 v6, s27 +; VI-NEXT: v_mov_b32_e32 v3, s28 +; VI-NEXT: v_mov_b32_e32 v4, s29 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v19, s67 +; VI-NEXT: v_mov_b32_e32 v62, s65 +; VI-NEXT: v_mov_b32_e32 v13, s66 +; VI-NEXT: v_mov_b32_e32 v60, s64 +; VI-NEXT: v_mov_b32_e32 v61, s55 +; VI-NEXT: v_mov_b32_e32 v58, s54 +; VI-NEXT: v_mov_b32_e32 v59, s52 +; VI-NEXT: v_mov_b32_e32 v57, s53 +; VI-NEXT: v_mov_b32_e32 v47, s51 +; VI-NEXT: v_mov_b32_e32 v56, s50 +; VI-NEXT: v_mov_b32_e32 v46, s49 +; VI-NEXT: v_mov_b32_e32 v45, s39 +; VI-NEXT: v_mov_b32_e32 v44, s48 +; VI-NEXT: v_mov_b32_e32 v42, s38 +; VI-NEXT: v_mov_b32_e32 v43, s37 +; VI-NEXT: v_mov_b32_e32 v41, s36 +; VI-NEXT: v_mov_b32_e32 v40, s34 +; VI-NEXT: v_mov_b32_e32 v55, s35 +; VI-NEXT: v_mov_b32_e32 v53, s31 +; VI-NEXT: v_mov_b32_e32 v54, s30 +; VI-NEXT: v_mov_b32_e32 v52, s91 +; VI-NEXT: v_mov_b32_e32 v51, s89 +; VI-NEXT: v_mov_b32_e32 v50, s90 +; VI-NEXT: v_mov_b32_e32 v48, s88 +; VI-NEXT: v_mov_b32_e32 v49, s79 +; VI-NEXT: v_mov_b32_e32 v39, s78 +; VI-NEXT: v_mov_b32_e32 v38, s76 +; VI-NEXT: v_mov_b32_e32 v37, s77 +; VI-NEXT: v_mov_b32_e32 v35, s75 +; VI-NEXT: v_mov_b32_e32 v36, s74 +; VI-NEXT: v_mov_b32_e32 v34, s73 +; VI-NEXT: v_mov_b32_e32 v33, s63 +; VI-NEXT: v_mov_b32_e32 v32, s72 +; VI-NEXT: v_mov_b32_e32 v30, s62 +; VI-NEXT: v_mov_b32_e32 v31, s61 +; VI-NEXT: v_mov_b32_e32 v29, s60 +; VI-NEXT: v_mov_b32_e32 v28, s58 +; VI-NEXT: v_mov_b32_e32 v27, s59 +; VI-NEXT: v_mov_b32_e32 v14, s57 +; VI-NEXT: v_mov_b32_e32 v26, s56 +; VI-NEXT: v_mov_b32_e32 v22, s12 +; VI-NEXT: v_mov_b32_e32 v23, s10 +; VI-NEXT: v_mov_b32_e32 v24, s8 +; VI-NEXT: v_mov_b32_e32 v25, s6 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v20, s40 +; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: .LBB49_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_or_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v25 +; VI-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v62, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v61 +; VI-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v17, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v58 +; VI-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v57 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v56 +; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v46 +; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v23 +; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v13, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v44 +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v43 +; VI-NEXT: v_or_b32_sdwa v12, v42, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v12, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v41 +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v22 +; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v11, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v55 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v54 +; VI-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v10, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v52 +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v21 +; VI-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v9, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v50 +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; VI-NEXT: v_or_b32_sdwa v8, v48, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v39 +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v20 +; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; VI-NEXT: v_or_b32_sdwa v6, v35, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; VI-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f32_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v40, 
off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s59, s5, 8 +; GFX9-NEXT: s_lshr_b32 s58, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s72, s29, 8 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s27, 8 +; GFX9-NEXT: s_lshr_b32 s76, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 
8 +; GFX9-NEXT: s_lshr_b32 s89, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s95, s23, 8 +; GFX9-NEXT: s_lshr_b32 s94, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s36, s21, 8 +; GFX9-NEXT: s_lshr_b32 s35, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s49, s19, 8 +; GFX9-NEXT: s_lshr_b32 s48, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s54, s17, 8 +; GFX9-NEXT: s_lshr_b32 s53, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v6, s27, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s26, 1.0 +; GFX9-NEXT: v_add_f32_e64 v2, s5, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s4, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s25, 1.0 +; GFX9-NEXT: v_add_f32_e64 v7, s24, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] +; GFX9-NEXT: v_add_f32_e64 v10, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v9, s22, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] +; GFX9-NEXT: v_add_f32_e64 v12, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v11, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v4, s29, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s28, 1.0 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte 
Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] +; GFX9-NEXT: v_add_f32_e64 v16, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v15, s18, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] +; GFX9-NEXT: v_add_f32_e64 v20, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v19, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 
v45, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: s_branch .LBB49_5 +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: 
$sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v21, s44 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s42 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v11, s20 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s22 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_mov_b32_e32 v7, s24 +; GFX9-NEXT: v_mov_b32_e32 v8, s25 +; GFX9-NEXT: v_mov_b32_e32 v5, s26 +; GFX9-NEXT: v_mov_b32_e32 v6, s27 +; GFX9-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-NEXT: v_mov_b32_e32 v4, s29 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v17, s55 +; GFX9-NEXT: v_mov_b32_e32 v62, s53 +; GFX9-NEXT: v_mov_b32_e32 v13, s54 +; GFX9-NEXT: v_mov_b32_e32 v60, s52 +; GFX9-NEXT: v_mov_b32_e32 v61, s51 +; GFX9-NEXT: v_mov_b32_e32 v58, s50 +; GFX9-NEXT: v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v57, s49 +; GFX9-NEXT: v_mov_b32_e32 v47, s39 +; GFX9-NEXT: v_mov_b32_e32 v56, s38 +; GFX9-NEXT: v_mov_b32_e32 v46, s37 +; GFX9-NEXT: v_mov_b32_e32 v45, s35 +; GFX9-NEXT: v_mov_b32_e32 v44, s36 +; GFX9-NEXT: v_mov_b32_e32 v42, s34 +; GFX9-NEXT: v_mov_b32_e32 v43, s31 +; GFX9-NEXT: v_mov_b32_e32 v41, s30 +; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: v_mov_b32_e32 v55, s95 +; GFX9-NEXT: v_mov_b32_e32 v53, s93 
+; GFX9-NEXT: v_mov_b32_e32 v54, s92 +; GFX9-NEXT: v_mov_b32_e32 v52, s91 +; GFX9-NEXT: v_mov_b32_e32 v51, s89 +; GFX9-NEXT: v_mov_b32_e32 v50, s90 +; GFX9-NEXT: v_mov_b32_e32 v48, s88 +; GFX9-NEXT: v_mov_b32_e32 v49, s79 +; GFX9-NEXT: v_mov_b32_e32 v39, s78 +; GFX9-NEXT: v_mov_b32_e32 v38, s76 +; GFX9-NEXT: v_mov_b32_e32 v37, s77 +; GFX9-NEXT: v_mov_b32_e32 v35, s75 +; GFX9-NEXT: v_mov_b32_e32 v36, s74 +; GFX9-NEXT: v_mov_b32_e32 v34, s73 +; GFX9-NEXT: v_mov_b32_e32 v33, s63 +; GFX9-NEXT: v_mov_b32_e32 v32, s72 +; GFX9-NEXT: v_mov_b32_e32 v30, s62 +; GFX9-NEXT: v_mov_b32_e32 v31, s61 +; GFX9-NEXT: v_mov_b32_e32 v29, s60 +; GFX9-NEXT: v_mov_b32_e32 v28, s58 +; GFX9-NEXT: v_mov_b32_e32 v27, s59 +; GFX9-NEXT: v_mov_b32_e32 v14, s57 +; GFX9-NEXT: v_mov_b32_e32 v18, s56 +; GFX9-NEXT: v_mov_b32_e32 v23, s12 +; GFX9-NEXT: v_mov_b32_e32 v24, s10 +; GFX9-NEXT: v_mov_b32_e32 v25, s8 +; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: .LBB49_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v19, v62, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v57 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v12, v42, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX9-NEXT: v_or_b32_sdwa v8, v48, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: 
buffer_store_dword v7, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v6, v35, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: 
s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v16f32_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 10 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v18, s17, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, s16, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v22, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v21, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v25, s0, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, s19, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, s21, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s23, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s25, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s27, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s26, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s24, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s22, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, s20, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, s18, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 
v[15:16], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v21 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v83, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: s_branch .LBB49_5 +; GFX11-TRUE16-NEXT: .LBB49_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, 
s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s95 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s93 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s91 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s89 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s79 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s77 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s75 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s73 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s47 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s4 +; GFX11-TRUE16-NEXT: .LBB49_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v87, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xff, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v85, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v82, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v22, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, v80, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v69, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v67, 
v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v21, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v22, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v17, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v18, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v23, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, 
v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v21, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v23, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v38, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v10, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v7, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v18, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v18, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v6 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[66:69], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v40, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 3 +; 
GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v16f32_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s42, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s23, 8 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s19, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s18, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, s17, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, s16, 1.0 +; 
GFX11-FAKE16-NEXT: v_add_f32_e64 v20, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, s0, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s21, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s23, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s25, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s27, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s26, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s24, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s22, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s20, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v23 +; GFX11-FAKE16-NEXT: s_branch .LBB49_5 +; GFX11-FAKE16-NEXT: .LBB49_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s0 :: v_dual_mov_b32 v24, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v20, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s16 :: v_dual_mov_b32 v16, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s18 :: v_dual_mov_b32 v14, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s22 :: v_dual_mov_b32 v6, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s24 :: v_dual_mov_b32 
v4, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s26 :: v_dual_mov_b32 v2, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s49 :: v_dual_mov_b32 v87, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s48 :: v_dual_mov_b32 v85, s38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s37 :: v_dual_mov_b32 v83, s36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s34 :: v_dual_mov_b32 v81, s35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s31 :: v_dual_mov_b32 v71, s30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, vcc_hi :: v_dual_mov_b32 v69, s94 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s95 :: v_dual_mov_b32 v67, s93 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s92 :: v_dual_mov_b32 v65, s91 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s89 :: v_dual_mov_b32 v55, s90 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s88 :: v_dual_mov_b32 v53, s79 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s78 :: v_dual_mov_b32 v51, s76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s77 :: v_dual_mov_b32 v49, s75 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s74 :: v_dual_mov_b32 v39, s73 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s63 :: v_dual_mov_b32 v37, s72 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s62 :: v_dual_mov_b32 v35, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s60 :: v_dual_mov_b32 v33, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s59 :: v_dual_mov_b32 v31, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s56 :: v_dual_mov_b32 v29, s47 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v7, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s46 :: v_dual_mov_b32 v11, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s44 :: v_dual_mov_b32 v17, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v21, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s10 :: v_dual_mov_b32 v26, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s6 :: v_dual_mov_b32 v28, s4 +; GFX11-FAKE16-NEXT: .LBB49_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v87, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, v23, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v85, v84 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v87, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v69, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v67, 
v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, v24, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, v20, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v15, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v16, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v28, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v14, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v5, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v17, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v7, 8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v8 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[82:85], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 +; 
GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v16f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v34, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v36, v6 -; GCN-NEXT: v_mov_b32_e32 v32, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 -; GCN-NEXT: 
buffer_load_dword v51, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 24, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 24, v46 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_or_b32_e32 v0, v0, v42 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v40 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_or_b32_e32 v3, v3, v55 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v37 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v54 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v53 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v49 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v48 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v39 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v62 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v61 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 
v19, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v33, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v28, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GCN-NEXT: v_or_b32_e32 v27, v35, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v29, v36, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v37 -; GCN-NEXT: v_or_b32_e32 v31, v38, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v18, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v25, v59, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v30, v57, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v32, v58, v8 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v9 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v8, v10 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v8, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v8, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v8, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v19, v46, v19 -; GCN-NEXT: v_or_b32_e32 v0, v0, v7 -; GCN-NEXT: v_or_b32_e32 v1, v1, v6 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_or_b32_e32 v4, v17, v21 -; GCN-NEXT: v_or_b32_e32 v5, v20, v22 -; GCN-NEXT: v_or_b32_e32 v6, v23, v24 -; GCN-NEXT: v_or_b32_e32 v7, v26, v28 -; GCN-NEXT: v_or_b32_e32 v8, v27, v25 -; GCN-NEXT: v_or_b32_e32 v9, v29, v30 -; GCN-NEXT: v_or_b32_e32 v10, v31, v32 -; GCN-NEXT: v_or_b32_e32 v11, v33, v34 -; GCN-NEXT: v_or_b32_e32 v12, v35, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v16 -; GCN-NEXT: v_or_b32_e32 v15, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; 
implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; kill: killed $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: 
$vgpr46 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v40, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v55, v3 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; 
GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v51 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v17, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v29, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v27, v16 -; GCN-NEXT: 
v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v17, v44, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; GCN-NEXT: v_or_b32_e32 v20, v45, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GCN-NEXT: v_or_b32_e32 v23, v56, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 -; GCN-NEXT: v_or_b32_e32 v29, v43, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_or_b32_e32 v16, v59, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_or_b32_e32 v22, v58, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x300, v19 -; GCN-NEXT: v_or_b32_e32 v30, v46, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: 
v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v31, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v10, v9 -; GCN-NEXT: v_or_b32_e32 v6, v12, v11 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v16, v15 -; GCN-NEXT: v_or_b32_e32 v9, v18, v17 -; GCN-NEXT: v_or_b32_e32 v10, v22, v20 -; GCN-NEXT: v_or_b32_e32 v11, v24, v23 -; GCN-NEXT: v_or_b32_e32 v12, v26, v25 -; GCN-NEXT: v_or_b32_e32 v13, v28, v27 -; GCN-NEXT: v_or_b32_e32 v14, v21, v29 -; GCN-NEXT: v_or_b32_e32 v15, v30, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v16f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: 
v_lshlrev_b32_e32 v63, 24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v50 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v56 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v56, off, 
s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 
v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: 
v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 
0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v59 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; 
SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, 
vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; 
SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; 
SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v30, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v40, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v13, 
0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v21, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v16f32: ; VI: ; %bb.0: @@ -13545,7 +27429,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -13709,9 +27593,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -13860,7 +27744,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: 
.LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -14005,7 +27889,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -14169,9 +28053,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -14320,7 +28204,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 
4-byte Folded Reload @@ -14447,15 +28331,15 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h @@ -14637,8 +28521,8 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-TRUE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 @@ -14917,15 +28801,15 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 
-; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v37 @@ -15104,8 +28988,8 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-FAKE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -15302,35 +29186,2147 @@ end: ret <16 x float> %phi } +define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v16f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, 
off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 
offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; 
SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded 
Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: 
s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; 
SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_mov_b32_e32 
v46, v45 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v64i8_to_v16f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 
v37, v30 +; VI-NEXT: v_mov_b32_e32 v61, v28 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff 
+; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: 
v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: 
s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, 
v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 
v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v16f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 
offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_mov_b32_e32 v61, v28 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; 
GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s5, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, 
v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 +; 
GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: 
v_mov_b32_e32 v28, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v82.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; 
GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v20 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v96, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v98, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; 
GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: 
s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, 
v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 
0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v13, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v21, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v3 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v16f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v85 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v69 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 
v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz 
.LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; 
GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v32 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v38 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v84, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v85, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v82, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v68, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v69, v11 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: 
s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v80, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v65, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v67, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v64, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 
v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v27, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v29, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v54, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v55, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v19, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v21, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v23, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, 
v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + define <8 x double> @bitcast_v8i64_to_v8f64(<8 
x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, 
v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: .LBB52_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v8f64: ; VI: ; %bb.0: @@ -15339,7 +31335,7 @@ define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -15357,7 +31353,7 @@ define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -15368,7 +31364,7 @@ define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -15386,7 +31382,7 @@ define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 
; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -15398,7 +31394,7 @@ define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -15420,7 +31416,7 @@ define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -15440,27 +31436,224 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v8i64_to_v8f64_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; 
%cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v8i64_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: 
v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v8i64_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, 
vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-LABEL: bitcast_v8i64_to_v8f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: .LBB53_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: 
v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: 
v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v8i64: ; VI: ; %bb.0: @@ -15469,7 +31662,7 @@ define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -15479,7 +31672,7 @@ define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -15490,7 +31683,7 @@ define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -15500,7 +31693,7 @@ define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -15512,7 +31705,7 @@ define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -15522,7 +31715,7 @@ define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -15542,102 +31735,272 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v8f64_to_v8i64_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], 
v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v8f64_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v8f64_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 
v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-LABEL: bitcast_v8f64_to_v8i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], s[26:27], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: 
s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v30, v15 -; GCN-NEXT: v_mov_b32_e32 v28, v14 -; GCN-NEXT: v_mov_b32_e32 v26, v13 -; GCN-NEXT: v_mov_b32_e32 v24, v12 -; GCN-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NEXT: v_mov_b32_e32 v20, v10 -; GCN-NEXT: v_mov_b32_e32 v18, v9 -; GCN-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NEXT: v_mov_b32_e32 v14, v7 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: v_mov_b32_e32 v10, v5 -; GCN-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NEXT: v_mov_b32_e32 v6, v3 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; 
implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: 
v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v16, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v12 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_mov_b32_e32 v18, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v10, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; 
implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: 
v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v32i16: ; VI: ; %bb.0: @@ -15646,7 +32009,7 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -15664,7 +32027,7 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -15675,7 +32038,7 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ 
-15693,7 +32056,7 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -15705,7 +32068,7 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -15727,7 +32090,7 @@ define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: .LBB56_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -15747,183 +32110,434 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_mov_b32_e32 v30, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: 
v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; SI-NEXT: 
v_alignbit_b32 v9, v10, v8, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v8i64_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; 
VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v8i64_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_3 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: 
v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB57_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: s_branch .LBB57_2 +; +; GFX11-LABEL: bitcast_v8i64_to_v32i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_3 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB57_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; 
GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: s_branch .LBB57_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: 
v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_4 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB29_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; GCN-NEXT: v_or_b32_e32 v0, v0, v54 -; GCN-NEXT: v_or_b32_e32 v1, v1, v55 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v48 -; GCN-NEXT: v_or_b32_e32 v4, v4, v49 -; GCN-NEXT: v_or_b32_e32 v5, v5, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v51 -; GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: 
v_or_b32_e32 v15, v15, v53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: .LBB29_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: 
v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_or_b32_e32 v4, v49, v4 -; GCN-NEXT: v_or_b32_e32 v5, v50, v5 -; GCN-NEXT: v_or_b32_e32 v6, v51, v6 -; GCN-NEXT: v_or_b32_e32 v7, v52, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_or_b32_e32 v15, v53, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 
-; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; 
SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v55 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_or_b32_e32 v5, v5, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; 
SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB58_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; 
SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v48, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; SI-NEXT: .LBB58_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v8i64: ; VI: ; %bb.0: @@ -15932,7 +32546,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz 
.LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v16, 3, v15 @@ -15983,7 +32597,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -15994,7 +32608,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -16012,7 +32626,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -16024,7 +32638,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -16042,7 +32656,7 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: 
v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: .LBB58_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -16062,221 +32676,578 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v26, v14 +; SI-NEXT: v_mov_b32_e32 v25, v12 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; 
SI-NEXT: v_or_b32_e32 v11, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v17 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 
v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: 
v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v32i16_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: 
s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: 
v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_3 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 
v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB59_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: s_branch .LBB59_2 +; +; GFX11-LABEL: bitcast_v32i16_to_v8i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 
:: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v15 -; GCN-NEXT: v_mov_b32_e32 v33, v14 -; GCN-NEXT: v_mov_b32_e32 v36, v13 -; GCN-NEXT: v_mov_b32_e32 v35, v12 -; GCN-NEXT: v_mov_b32_e32 v38, v11 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v48, v9 -; GCN-NEXT: v_mov_b32_e32 v39, v8 -; GCN-NEXT: v_mov_b32_e32 v50, v7 -; GCN-NEXT: v_mov_b32_e32 v49, v6 -; GCN-NEXT: v_mov_b32_e32 v52, v5 -; GCN-NEXT: v_mov_b32_e32 v51, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v3 -; GCN-NEXT: 
v_mov_b32_e32 v53, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 
v41, 16, v52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v51 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v53 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr39 -; 
GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: .LBB30_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v55, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v54, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v52, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v50, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v48, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v38, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v36, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v34, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, 
v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: .LBB30_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 
v33, v15 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v35, v13 +; SI-NEXT: v_mov_b32_e32 v34, v12 +; SI-NEXT: v_mov_b32_e32 v37, v11 +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v39, v9 +; SI-NEXT: v_mov_b32_e32 v38, v8 +; SI-NEXT: v_mov_b32_e32 v49, v7 +; SI-NEXT: v_mov_b32_e32 v48, v6 +; SI-NEXT: v_mov_b32_e32 v51, v5 +; SI-NEXT: v_mov_b32_e32 v50, v4 +; SI-NEXT: v_mov_b32_e32 v53, v3 +; SI-NEXT: v_mov_b32_e32 v52, v2 +; SI-NEXT: v_mov_b32_e32 v55, v1 +; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_4 +; SI-NEXT: .LBB60_2: ; %end +; SI-NEXT: s_or_b64 
exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB60_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v54 +; SI-NEXT: ; 
implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: .LBB60_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v55, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v52 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v53, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v50 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v51, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v48 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v49, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v38 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v39, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v36 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v37, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v34 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v35, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v33, vcc +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, 
v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v32f16: ; VI: ; %bb.0: @@ -16285,7 +33256,7 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB30_2 +; VI-NEXT: s_cbranch_execz .LBB60_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -16303,7 +33274,7 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, 
vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB30_2: ; %end +; VI-NEXT: .LBB60_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -16314,7 +33285,7 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB30_2 +; GFX9-NEXT: s_cbranch_execz .LBB60_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -16332,7 +33303,7 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB30_2: ; %end +; GFX9-NEXT: .LBB60_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -16344,7 +33315,7 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-NEXT: s_cbranch_execz .LBB60_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -16366,7 +33337,7 @@ define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB30_2: ; %end +; GFX11-NEXT: .LBB60_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -16386,269 +33357,583 @@ end: ret <32 x half> %phi } +define 
inreg <32 x half> @bitcast_v8i64_to_v32f16_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: 
v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s8, s4, 16 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_add_u32 s10, s18, 3 +; SI-NEXT: s_addc_u32 s11, s19, 0 +; SI-NEXT: s_lshr_b32 s12, s10, 16 +; SI-NEXT: s_lshr_b32 s13, s11, 16 +; SI-NEXT: s_add_u32 s14, s20, 3 +; SI-NEXT: s_addc_u32 s15, s21, 0 +; SI-NEXT: s_lshr_b32 s16, s14, 16 +; SI-NEXT: s_lshr_b32 s17, s15, 16 +; SI-NEXT: s_add_u32 s18, s22, 3 +; SI-NEXT: s_addc_u32 s19, s23, 0 +; SI-NEXT: s_lshr_b32 s20, s18, 16 +; SI-NEXT: s_lshr_b32 s21, s19, 16 +; SI-NEXT: s_add_u32 s22, s24, 3 +; SI-NEXT: s_addc_u32 s23, s25, 0 +; SI-NEXT: s_lshr_b32 s24, s22, 16 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: s_lshr_b32 s41, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: s_lshr_b32 s43, s29, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_lshr_b32 s45, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: 
v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v8i64_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB61_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB61_3 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB61_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB61_4: +; VI-NEXT: s_branch .LBB61_2 +; +; GFX9-LABEL: bitcast_v8i64_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, 
s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB61_3 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB61_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: s_branch .LBB61_2 +; +; GFX11-LABEL: bitcast_v8i64_to_v32f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB61_3 +; GFX11-NEXT: .LBB61_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: 
s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB61_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB61_4: +; GFX11-NEXT: s_branch .LBB61_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 
offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 -; GCN-NEXT: 
; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GCN-NEXT: v_or_b32_e32 v0, v44, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: v_or_b32_e32 v3, v50, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v34, v7 -; GCN-NEXT: v_or_b32_e32 v8, v33, v8 -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: v_or_b32_e32 v10, v31, v10 -; GCN-NEXT: v_or_b32_e32 v11, v21, v11 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: 
$vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB31_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v12, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: 
v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 
v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v26, v24 -; GCN-NEXT: v_or_b32_e32 v10, v28, v27 -; GCN-NEXT: v_or_b32_e32 v11, v21, v29 -; GCN-NEXT: v_or_b32_e32 v12, v19, v25 -; GCN-NEXT: v_or_b32_e32 v13, v18, v23 -; GCN-NEXT: v_or_b32_e32 v14, v17, v22 -; GCN-NEXT: v_or_b32_e32 v15, v16, v20 -; GCN-NEXT: .LBB31_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, 
v54 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v31, v11 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; 
implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB62_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 
v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: 
v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB62_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v8i64: ; VI: ; %bb.0: @@ -16657,7 +33942,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB31_2 +; VI-NEXT: s_cbranch_execz .LBB62_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 0x200 ; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -16708,7 +33993,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v17 ; VI-NEXT: v_or_b32_e32 v0, v0, v16 -; VI-NEXT: .LBB31_2: ; %end +; VI-NEXT: .LBB62_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -16719,7 +34004,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB31_2 +; GFX9-NEXT: s_cbranch_execz .LBB62_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -16738,7 +34023,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, 
s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB31_2: ; %end +; GFX9-NEXT: .LBB62_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -16750,7 +34035,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-NEXT: s_cbranch_execz .LBB62_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -16768,7 +34053,7 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB31_2: ; %end +; GFX11-NEXT: .LBB62_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -16788,170 +34073,570 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v32f16_to_v8i64_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; 
SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 +; SI-NEXT: v_or_b32_e32 v10, v27, v10 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, 
v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: 
v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: 
v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB63_2 +; +; VI-LABEL: bitcast_v32f16_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; 
VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB63_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB63_3 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: 
v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB63_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_4: +; VI-NEXT: s_branch .LBB63_2 +; +; GFX9-LABEL: bitcast_v32f16_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 
v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB63_3 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB63_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: s_branch .LBB63_2 +; +; GFX11-LABEL: bitcast_v32f16_to_v8i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, 
s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX11-NEXT: .LBB63_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB63_3: +; GFX11-NEXT: s_branch .LBB63_2 +; GFX11-NEXT: .LBB63_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + 
+cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v55, v15 -; GCN-NEXT: v_mov_b32_e32 v54, v14 -; GCN-NEXT: v_mov_b32_e32 v53, v13 -; GCN-NEXT: v_mov_b32_e32 v52, v12 -; GCN-NEXT: v_mov_b32_e32 v51, v11 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v49, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v39, v7 -; GCN-NEXT: v_mov_b32_e32 v38, v6 -; GCN-NEXT: v_mov_b32_e32 v37, v5 -; GCN-NEXT: v_mov_b32_e32 v36, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v3 -; GCN-NEXT: v_mov_b32_e32 v34, v2 -; GCN-NEXT: v_mov_b32_e32 v33, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; 
GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_4 -; GCN-NEXT: .LBB32_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB32_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_2 -; GCN-NEXT: .LBB32_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v35, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v36 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v37, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v38 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v39, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v49, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v51, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v52 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v53, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v54 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v55, vcc -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; 
GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v53, v13 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v51, v11 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v49, v9 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v39, v7 +; SI-NEXT: v_mov_b32_e32 v38, v6 +; SI-NEXT: v_mov_b32_e32 v37, v5 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v35, v3 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: 
$vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_4 +; SI-NEXT: .LBB64_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB64_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 +; 
SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: .LBB64_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v35, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v36 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v37, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v39, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v49, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v51, vcc +; SI-NEXT: 
v_add_i32_e32 v12, vcc, 3, v52 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v53, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v55, vcc +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v32bf16: ; VI: ; %bb.0: @@ -16960,7 +34645,7 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB32_2 +; VI-NEXT: 
s_cbranch_execz .LBB64_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc @@ -16978,7 +34663,7 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB32_2: ; %end +; VI-NEXT: .LBB64_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -16989,7 +34674,7 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB32_2 +; GFX9-NEXT: s_cbranch_execz .LBB64_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc @@ -17007,7 +34692,7 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB32_2: ; %end +; GFX9-NEXT: .LBB64_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -17019,7 +34704,7 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-NEXT: s_cbranch_execz .LBB64_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -17041,7 +34726,7 @@ define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 
v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB32_2: ; %end +; GFX11-NEXT: .LBB64_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -17061,237 +34746,551 @@ end: ret <32 x bfloat> %phi } +define inreg <32 x bfloat> @bitcast_v8i64_to_v32bf16_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s78, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s79, v1 +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s79, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s79, 16 +; SI-NEXT: s_and_b32 s8, s78, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s78, 16 +; SI-NEXT: s_and_b32 s10, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_and_b32 s12, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s28, 16 +; SI-NEXT: s_and_b32 s14, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s27, 16 +; SI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s26, 16 +; SI-NEXT: s_and_b32 s42, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s25, 16 +; SI-NEXT: s_and_b32 s44, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s24, 16 +; SI-NEXT: s_and_b32 s46, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s23, 16 +; SI-NEXT: s_and_b32 s56, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s22, 16 +; SI-NEXT: s_and_b32 s58, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s21, 16 +; SI-NEXT: s_and_b32 s60, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s20, 16 +; SI-NEXT: s_and_b32 s62, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s19, 16 +; SI-NEXT: s_and_b32 s72, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s18, 16 +; SI-NEXT: s_and_b32 s74, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_and_b32 s76, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s16, 16 +; 
SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_add_u32 s16, s18, 3 +; SI-NEXT: s_addc_u32 s17, s19, 0 +; SI-NEXT: s_add_u32 s18, s20, 3 +; SI-NEXT: s_addc_u32 s19, s21, 0 +; SI-NEXT: s_add_u32 s20, s22, 3 +; SI-NEXT: s_addc_u32 s21, s23, 0 +; SI-NEXT: s_add_u32 s22, s24, 3 +; SI-NEXT: s_addc_u32 s23, s25, 0 +; SI-NEXT: s_add_u32 s24, s26, 3 +; SI-NEXT: s_addc_u32 s15, s27, 0 +; SI-NEXT: s_add_u32 s13, s28, 3 +; SI-NEXT: s_addc_u32 s11, s29, 0 +; SI-NEXT: s_add_u32 s9, s78, 3 +; SI-NEXT: s_addc_u32 s7, s79, 0 +; SI-NEXT: s_and_b32 s6, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s8, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s10, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s12, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s14, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s40, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s24, 16 +; SI-NEXT: s_and_b32 s42, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s23, 16 +; SI-NEXT: s_and_b32 s44, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s22, 16 +; SI-NEXT: s_and_b32 s46, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s21, 16 +; SI-NEXT: s_and_b32 s56, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s20, 16 +; SI-NEXT: s_and_b32 s58, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s19, 16 +; SI-NEXT: s_and_b32 s60, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s18, 16 +; SI-NEXT: s_and_b32 s62, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s17, 16 +; SI-NEXT: s_and_b32 s72, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s16, 16 +; SI-NEXT: s_and_b32 s74, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s5, 16 +; SI-NEXT: s_and_b32 s76, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s4, 16 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s77 +; SI-NEXT: v_mov_b32_e32 v1, s76 +; SI-NEXT: v_mov_b32_e32 v2, s75 
+; SI-NEXT: v_mov_b32_e32 v3, s74 +; SI-NEXT: v_mov_b32_e32 v4, s73 +; SI-NEXT: v_mov_b32_e32 v5, s72 +; SI-NEXT: v_mov_b32_e32 v6, s63 +; SI-NEXT: v_mov_b32_e32 v7, s62 +; SI-NEXT: v_mov_b32_e32 v8, s61 +; SI-NEXT: v_mov_b32_e32 v9, s60 +; SI-NEXT: v_mov_b32_e32 v10, s59 +; SI-NEXT: v_mov_b32_e32 v11, s58 +; SI-NEXT: v_mov_b32_e32 v12, s57 +; SI-NEXT: v_mov_b32_e32 v13, s56 +; SI-NEXT: v_mov_b32_e32 v14, s47 +; SI-NEXT: v_mov_b32_e32 v15, s46 +; SI-NEXT: v_mov_b32_e32 v16, s45 +; SI-NEXT: v_mov_b32_e32 v17, s44 +; SI-NEXT: v_mov_b32_e32 v18, s43 +; SI-NEXT: v_mov_b32_e32 v19, s42 +; SI-NEXT: v_mov_b32_e32 v20, s41 +; SI-NEXT: v_mov_b32_e32 v21, s40 +; SI-NEXT: v_mov_b32_e32 v22, s15 +; SI-NEXT: v_mov_b32_e32 v23, s14 +; SI-NEXT: v_mov_b32_e32 v24, s13 +; SI-NEXT: v_mov_b32_e32 v25, s12 +; SI-NEXT: v_mov_b32_e32 v26, s11 +; SI-NEXT: v_mov_b32_e32 v27, s10 +; SI-NEXT: v_mov_b32_e32 v28, s9 +; SI-NEXT: v_mov_b32_e32 v29, s8 +; SI-NEXT: v_mov_b32_e32 v30, s7 +; SI-NEXT: v_mov_b32_e32 v31, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; 
SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v8i64_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB65_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_3 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB65_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_4: +; VI-NEXT: s_branch .LBB65_2 +; +; 
GFX9-LABEL: bitcast_v8i64_to_v32bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_3 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB65_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: s_branch .LBB65_2 +; +; GFX11-LABEL: bitcast_v8i64_to_v32bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB65_3 +; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB65_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_4: +; GFX11-NEXT: s_branch .LBB65_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, 
i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, 
v25 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v46 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; GCN-NEXT: v_alignbit_b32 v0, v0, v45, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v2, v51, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v49, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v39, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v37, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v36, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v34, 16 -; GCN-NEXT: v_alignbit_b32 v8, v8, v33, 16 -; GCN-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v31, 16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v12, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; GCN-NEXT: 
v_alignbit_b32 v14, v14, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB33_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, 
v51 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; 
GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v10, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v11, v29, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v25, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v23, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v22, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v20, v16, 16 -; GCN-NEXT: .LBB33_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v8i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, 
v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB66_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: 
v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB66_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v8i64: ; VI: ; %bb.0: @@ -17300,7 +35299,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -17591,7 +35590,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; 
VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -17602,7 +35601,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: s_cbranch_execz .LBB66_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -17846,7 +35845,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v16, s7 -; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: .LBB66_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -17858,7 +35857,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -18132,7 +36131,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v22, v25, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v20, v0 -; GFX11-TRUE16-NEXT: .LBB33_2: ; %end +; GFX11-TRUE16-NEXT: .LBB66_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18144,7 +36143,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v15 @@ -18422,7 +36421,7 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v21, v26, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB33_2: ; %end +; GFX11-FAKE16-NEXT: .LBB66_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -18442,413 +36441,1657 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 +; SI-NEXT: 
v_mul_f32_e64 v55, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; 
SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: 
v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: 
v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB67_2 +; +; VI-LABEL: bitcast_v32bf16_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: s_cbranch_scc0 .LBB67_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_4 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 
v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, 
vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, 
v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, 
v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_branch .LBB67_5 +; VI-NEXT: .LBB67_3: +; VI-NEXT: s_branch .LBB67_2 +; VI-NEXT: .LBB67_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB67_5: ; %end +; VI-NEXT: v_readlane_b32 s31, v19, 1 +; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] 
+; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_4 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff +; 
GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, 
v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, 
s25, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; 
GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: 
s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: s_branch .LBB67_5 +; GFX9-NEXT: .LBB67_3: +; GFX9-NEXT: s_branch .LBB67_2 +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB67_5: ; %end +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v8i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 
+; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX11-NEXT: .LBB67_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s2, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s3, s25, 16 +; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s24, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshl_b32 s1, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v7, v8, 
16, 1 +; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s1, s21, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; 
GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7 +; GFX11-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4 +; GFX11-NEXT: 
v_lshl_or_b32 v9, v1, 16, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s1, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_lshl_b32 s1, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 
v17, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo +; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16 +; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s1, s15, 16 +; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16 +; GFX11-NEXT: 
v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17 +; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 
16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, v18, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21 +; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB67_3: +; GFX11-NEXT: s_branch .LBB67_2 +; GFX11-NEXT: .LBB67_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, 
s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i64_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr22 -; 
GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v23, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v50, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 -; 
GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB34_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 
; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v23, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v50, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB34_4: 
; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v60 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_or_b32_e32 v60, v1, v18 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v17 -; GCN-NEXT: v_or_b32_e32 v17, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v58 -; GCN-NEXT: v_or_b32_e32 v58, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v46 -; GCN-NEXT: v_or_b32_e32 v46, v1, v2 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v44 -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 24, v61 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v59 -; GCN-NEXT: v_or_b32_e32 v44, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v40 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 -; GCN-NEXT: v_or_b32_e32 v7, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 24, v56 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v47 -; GCN-NEXT: v_or_b32_e32 v54, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v49 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v48 -; GCN-NEXT: v_or_b32_e32 v9, v4, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v55 -; GCN-NEXT: v_or_b32_e32 v48, v5, 
v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v34 -; GCN-NEXT: v_or_b32_e32 v34, v6, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v51 -; GCN-NEXT: v_or_b32_e32 v11, v8, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v30 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v28 -; GCN-NEXT: v_or_b32_e32 v13, v10, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v38 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v35 -; GCN-NEXT: v_or_b32_e32 v28, v12, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v21 -; GCN-NEXT: v_or_b32_e32 v21, v14, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v32 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v29 -; GCN-NEXT: v_or_b32_e32 v29, v15, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v32, v19, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; GCN-NEXT: 
v_lshlrev_b32_e32 v18, 16, v45 -; GCN-NEXT: v_or_b32_e32 v27, v61, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; GCN-NEXT: v_or_b32_e32 v40, v40, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 24, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v57 -; GCN-NEXT: v_or_b32_e32 v56, v56, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v46 -; GCN-NEXT: v_or_b32_e32 v49, v49, v47 -; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v44 -; GCN-NEXT: v_or_b32_e32 v50, v50, v43 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v36, v36, v55 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v37, v37, v53 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v31, v31, v39 -; GCN-NEXT: 
v_and_b32_e32 v34, 0xffff, v34 -; GCN-NEXT: v_or_b32_e32 v23, v23, v35 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v25, v25, v38 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v20, v20, v52 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v26, v26, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v33, v33, v45 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v24, v41, v24 -; GCN-NEXT: v_or_b32_e32 v32, v57, v32 -; GCN-NEXT: v_or_b32_e32 v17, v17, v27 -; GCN-NEXT: v_or_b32_e32 v27, v58, v40 -; GCN-NEXT: v_or_b32_e32 v35, v59, v56 -; GCN-NEXT: v_or_b32_e32 v38, v46, v49 -; GCN-NEXT: v_or_b32_e32 v39, v44, v50 -; GCN-NEXT: v_or_b32_e32 v7, v7, v36 -; GCN-NEXT: v_or_b32_e32 v36, v54, v37 -; GCN-NEXT: v_or_b32_e32 v9, v9, v30 -; GCN-NEXT: v_or_b32_e32 v30, v48, v31 -; GCN-NEXT: v_or_b32_e32 v23, v34, v23 -; GCN-NEXT: v_or_b32_e32 v11, v11, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v20 -; GCN-NEXT: v_or_b32_e32 v20, v28, v26 -; GCN-NEXT: v_or_b32_e32 v21, v21, v33 -; GCN-NEXT: v_or_b32_e32 v24, v29, v24 -; GCN-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v19, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v24, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i64_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; 
SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v40, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; 
SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB68_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 
+; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v40, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: 
v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB68_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v46 +; SI-NEXT: v_or_b32_e32 v46, v46, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, 
v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 
v2, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i64_to_v64i8: ; VI: ; %bb.0: @@ -18922,7 +38165,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: s_cbranch_execz .LBB68_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -18975,9 +38218,9 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: .LBB34_2: ; %Flow +; VI-NEXT: .LBB68_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB34_4 +; VI-NEXT: s_cbranch_execz .LBB68_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc @@ -19045,7 +38288,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: .LBB34_4: ; %end +; VI-NEXT: .LBB68_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 @@ -19253,7 +38496,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -19306,9 +38549,9 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB34_2: ; %Flow +; GFX9-NEXT: .LBB68_2: ; %Flow ; 
GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB34_4 +; GFX9-NEXT: s_cbranch_execz .LBB68_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 3, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc @@ -19376,7 +38619,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB34_4: ; %end +; GFX9-NEXT: .LBB68_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -19536,7 +38779,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -19570,9 +38813,9 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB34_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB68_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v1, vcc_lo, v1, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -19627,7 +38870,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB34_4: ; %end +; GFX11-TRUE16-NEXT: .LBB68_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -19839,7 +39082,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -19889,9 +39132,9 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB34_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB68_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v1, vcc_lo, v1, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -19962,7 +39205,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB34_4: ; %end +; GFX11-FAKE16-NEXT: .LBB68_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -20132,606 +39375,2577 @@ end: ret <64 x i8> %phi 
} +define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i64_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v9, 8 +; SI-NEXT: v_alignbit_b32 v13, s25, v10, 24 +; SI-NEXT: v_alignbit_b32 v15, s25, v10, 16 +; SI-NEXT: v_alignbit_b32 v10, s25, v10, 8 +; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 +; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 +; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 +; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s21, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s19, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s19, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s19, v21, 8 +; SI-NEXT: v_alignbit_b32 v23, s17, v22, 24 +; SI-NEXT: v_alignbit_b32 v24, s17, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, s17, v22, 8 +; SI-NEXT: s_lshr_b32 s8, s6, 24 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 8 +; SI-NEXT: s_lshr_b32 s11, s29, 24 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s29, 8 +; SI-NEXT: s_lshr_b32 s14, 
s27, 24 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 8 +; SI-NEXT: s_lshr_b32 s41, s25, 24 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 8 +; SI-NEXT: s_lshr_b32 s44, s23, 24 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: s_lshr_b32 s47, s21, 24 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s59, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s61, s17, 24 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 +; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 +; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 +; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 +; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 +; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v9, 8 +; SI-NEXT: v_alignbit_b32 v13, s25, v10, 24 +; SI-NEXT: v_alignbit_b32 v15, s25, v10, 16 +; SI-NEXT: 
v_alignbit_b32 v10, s25, v10, 8 +; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 +; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 +; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 +; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 +; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 +; SI-NEXT: v_alignbit_b32 v18, s21, v18, 8 +; SI-NEXT: v_alignbit_b32 v19, s19, v21, 24 +; SI-NEXT: v_alignbit_b32 v20, s19, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s19, v21, 8 +; SI-NEXT: v_alignbit_b32 v23, s17, v22, 24 +; SI-NEXT: v_alignbit_b32 v24, s17, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, s17, v22, 8 +; SI-NEXT: s_lshr_b32 s8, s6, 24 +; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 8 +; SI-NEXT: s_lshr_b32 s11, s29, 24 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s29, 8 +; SI-NEXT: s_lshr_b32 s14, s27, 24 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 8 +; SI-NEXT: s_lshr_b32 s41, s25, 24 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 8 +; SI-NEXT: s_lshr_b32 s44, s23, 24 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: s_lshr_b32 s47, s21, 24 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s59, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s61, s17, 24 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; SI-NEXT: v_or_b32_e32 v22, s4, v22 +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s5, s63, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s62, 0xff +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s61, 24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v23, 
v23, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v21, s4, v21 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s59, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s58, 24 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v20, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v18, s4, v18 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s56, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s47, 24 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen 
+; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s45, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s44, 24 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s43, 8 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xff, v15 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s42, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s41, 24 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, 
s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s15, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s12, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s11, s11, 24 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s11, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: 
s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s9, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s8, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; 
implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v8i64_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v4, s30, 0 +; VI-NEXT: v_writelane_b32 v4, s31, 1 +; VI-NEXT: v_writelane_b32 v4, s34, 2 +; VI-NEXT: v_writelane_b32 v4, s35, 3 +; VI-NEXT: v_writelane_b32 v4, s36, 4 +; VI-NEXT: v_writelane_b32 v4, s37, 5 +; VI-NEXT: v_writelane_b32 v4, s38, 6 +; VI-NEXT: v_writelane_b32 v4, s39, 7 +; VI-NEXT: v_writelane_b32 v4, s48, 8 +; VI-NEXT: v_writelane_b32 v4, s49, 9 +; VI-NEXT: v_writelane_b32 v4, s50, 10 +; VI-NEXT: v_writelane_b32 v4, s51, 11 +; VI-NEXT: v_writelane_b32 v4, s52, 12 +; VI-NEXT: v_writelane_b32 v4, s53, 13 +; VI-NEXT: v_writelane_b32 v4, s54, 14 +; VI-NEXT: v_writelane_b32 v4, s55, 15 +; VI-NEXT: v_writelane_b32 v4, s64, 16 +; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v4, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: s_cbranch_scc0 .LBB69_4 +; VI-NEXT: ; %bb.1: ; 
%cmp.false +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s29, 8 +; VI-NEXT: s_lshr_b32 s72, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 8 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 8 +; VI-NEXT: s_lshr_b32 s90, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s34, s23, 8 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s39, s21, 8 +; VI-NEXT: s_lshr_b32 s48, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s65, s17, 8 +; VI-NEXT: s_lshr_b32 s66, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB69_3 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 
0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s4, s4, 3 +; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s29, 8 +; VI-NEXT: s_lshr_b32 s72, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 8 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 8 +; VI-NEXT: s_lshr_b32 s90, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s34, s23, 8 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s39, s21, 8 +; VI-NEXT: s_lshr_b32 s48, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: 
s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s65, s17, 8 +; VI-NEXT: s_lshr_b32 s66, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: .LBB69_3: ; %end +; VI-NEXT: s_and_b32 s7, s16, 0xff +; VI-NEXT: s_lshl_b32 s9, s67, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s66, 0xff +; VI-NEXT: s_lshl_b32 s11, s44, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_and_b32 s7, s17, 0xff +; VI-NEXT: s_lshl_b32 s9, s65, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s64, 0xff +; VI-NEXT: s_lshl_b32 s11, s55, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s54, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s53, 0xff +; VI-NEXT: s_lshl_b32 s11, s42, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s19, 0xff +; VI-NEXT: s_lshl_b32 s9, s52, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s51, 0xff +; VI-NEXT: s_lshl_b32 s11, s50, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s49, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s48, 0xff +; VI-NEXT: 
s_lshl_b32 s11, s40, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s21, 0xff +; VI-NEXT: s_lshl_b32 s9, s39, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s38, 0xff +; VI-NEXT: s_lshl_b32 s11, s37, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s9, s36, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s35, 0xff +; VI-NEXT: s_lshl_b32 s11, s14, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s23, 0xff +; VI-NEXT: s_lshl_b32 s9, s34, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s31, 0xff +; VI-NEXT: s_lshl_b32 s11, s30, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_lshl_b32 s9, s91, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s90, 0xff +; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, 
s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s25, 0xff +; VI-NEXT: s_lshl_b32 s9, s89, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s88, 0xff +; VI-NEXT: s_lshl_b32 s11, s79, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s9, s78, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s77, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: s_lshl_b32 s9, s76, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s75, 0xff +; VI-NEXT: s_lshl_b32 s10, s74, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s9, s73, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s72, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s29, 0xff +; VI-NEXT: s_lshl_b32 s8, s63, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s62, 0xff +; VI-NEXT: s_lshl_b32 s9, s61, 8 +; VI-NEXT: s_or_b32 s8, 
s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s7, s60, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s59, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: s_lshl_b32 s5, s58, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s57, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s67, v4, 19 +; VI-NEXT: v_readlane_b32 s66, v4, 18 +; VI-NEXT: v_readlane_b32 s65, v4, 17 +; VI-NEXT: v_readlane_b32 s64, v4, 16 +; VI-NEXT: v_readlane_b32 s55, v4, 15 +; VI-NEXT: v_readlane_b32 s54, v4, 14 +; VI-NEXT: v_readlane_b32 s53, v4, 13 +; VI-NEXT: v_readlane_b32 s52, v4, 12 +; VI-NEXT: v_readlane_b32 s51, v4, 11 +; VI-NEXT: v_readlane_b32 s50, v4, 10 +; VI-NEXT: v_readlane_b32 s49, v4, 9 +; VI-NEXT: v_readlane_b32 s48, v4, 8 +; VI-NEXT: v_readlane_b32 s39, v4, 7 +; VI-NEXT: v_readlane_b32 s38, v4, 6 +; VI-NEXT: v_readlane_b32 s37, v4, 5 +; VI-NEXT: v_readlane_b32 s36, v4, 4 +; VI-NEXT: v_readlane_b32 s35, v4, 3 +; VI-NEXT: v_readlane_b32 s34, v4, 2 +; VI-NEXT: v_readlane_b32 s31, v4, 1 +; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_4: +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: s_branch .LBB69_2 +; +; GFX9-LABEL: 
bitcast_v8i64_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v4, s30, 0 +; GFX9-NEXT: v_writelane_b32 v4, s31, 1 +; GFX9-NEXT: v_writelane_b32 v4, s34, 2 +; GFX9-NEXT: v_writelane_b32 v4, s35, 3 +; GFX9-NEXT: v_writelane_b32 v4, s36, 4 +; GFX9-NEXT: v_writelane_b32 v4, s37, 5 +; GFX9-NEXT: v_writelane_b32 v4, s38, 6 +; GFX9-NEXT: v_writelane_b32 v4, s39, 7 +; GFX9-NEXT: v_writelane_b32 v4, s48, 8 +; GFX9-NEXT: v_writelane_b32 v4, s49, 9 +; GFX9-NEXT: v_writelane_b32 v4, s50, 10 +; GFX9-NEXT: v_writelane_b32 v4, s51, 11 +; GFX9-NEXT: v_writelane_b32 v4, s52, 12 +; GFX9-NEXT: v_writelane_b32 v4, s53, 13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v4, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: v_writelane_b32 v4, s55, 15 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s58, s5, 8 +; GFX9-NEXT: s_lshr_b32 s59, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s29, 8 +; GFX9-NEXT: s_lshr_b32 s72, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 8 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s89, s25, 8 +; GFX9-NEXT: s_lshr_b32 s90, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: 
s_lshr_b32 s94, s23, 8 +; GFX9-NEXT: s_lshr_b32 s95, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s35, s21, 8 +; GFX9-NEXT: s_lshr_b32 s36, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s48, s19, 8 +; GFX9-NEXT: s_lshr_b32 s49, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s53, s17, 8 +; GFX9-NEXT: s_lshr_b32 s54, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB69_3 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s4, s4, 3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; GFX9-NEXT: 
s_lshr_b64 s[42:43], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s58, s5, 8 +; GFX9-NEXT: s_lshr_b32 s59, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s29, 8 +; GFX9-NEXT: s_lshr_b32 s72, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 8 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s89, s25, 8 +; GFX9-NEXT: s_lshr_b32 s90, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s94, s23, 8 +; GFX9-NEXT: s_lshr_b32 s95, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s35, s21, 8 +; GFX9-NEXT: s_lshr_b32 s36, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s48, s19, 8 +; GFX9-NEXT: s_lshr_b32 s49, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s53, s17, 8 +; GFX9-NEXT: s_lshr_b32 s54, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: .LBB69_3: ; %end +; GFX9-NEXT: s_and_b32 s7, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s55, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s44, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 
s7, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s53, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s51, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s50, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s42, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s48, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s38, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s37, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s36, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s40, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s35, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s34, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s31, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, 
s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s30, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s95, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s14, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s94, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s93, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s92, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s91, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s90, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s12, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s89, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s79, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s78, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s77, 0xff +; GFX9-NEXT: s_lshl_b32 
s10, s10, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s76, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s75, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s74, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s73, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s72, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s63, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s61, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s60, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s7 +; GFX9-NEXT: s_and_b32 s7, s59, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 
0xff +; GFX9-NEXT: s_lshl_b32 s5, s58, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s56, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_readlane_b32 s55, v4, 15 +; GFX9-NEXT: v_readlane_b32 s54, v4, 14 +; GFX9-NEXT: v_readlane_b32 s53, v4, 13 +; GFX9-NEXT: v_readlane_b32 s52, v4, 12 +; GFX9-NEXT: v_readlane_b32 s51, v4, 11 +; GFX9-NEXT: v_readlane_b32 s50, v4, 10 +; GFX9-NEXT: v_readlane_b32 s49, v4, 9 +; GFX9-NEXT: v_readlane_b32 s48, v4, 8 +; GFX9-NEXT: v_readlane_b32 s39, v4, 7 +; GFX9-NEXT: v_readlane_b32 s38, v4, 6 +; GFX9-NEXT: v_readlane_b32 s37, v4, 5 +; GFX9-NEXT: v_readlane_b32 s36, v4, 4 +; GFX9-NEXT: v_readlane_b32 s35, v4, 3 +; GFX9-NEXT: v_readlane_b32 s34, v4, 2 +; GFX9-NEXT: v_readlane_b32 s31, v4, 1 +; GFX9-NEXT: v_readlane_b32 s30, v4, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_4: +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: 
$sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: s_branch .LBB69_2 +; +; GFX11-TRUE16-LABEL: bitcast_v8i64_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v17, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s38, 6 +; GFX11-TRUE16-NEXT: 
v_writelane_b32 v17, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s49, 9 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s0, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 
8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-TRUE16-NEXT: .LBB69_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s49 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s48 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s40 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; 
GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s39 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s38 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s37 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s36 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s35 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s31 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s30 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s95 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s2, s94 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s93 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s92 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s91 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s90 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s89 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s88 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s79 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s78 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 
0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s75 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s73 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s72 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s63 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s62 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s61 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s60 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 
0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s59 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s58 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s57 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s47 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s45 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; 
GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s42 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v17, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v17, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v17, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v17, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v17, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v17, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v17, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v17, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v17, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v17, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v17, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; 
GFX11-TRUE16-NEXT: .LBB69_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB69_2 +; +; GFX11-FAKE16-LABEL: bitcast_v8i64_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v17, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s48, 8 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 
s61, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 
+; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 24 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s79, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 8 +; GFX11-FAKE16-NEXT: .LBB69_3: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s48, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s39, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s40, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s38, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s37, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s36, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s35, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s34, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s28, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: 
s_lshl_b32 s7, s31, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s30, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, vcc_hi, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s95, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s94, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s93, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s92, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s91, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s90, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s89, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s88, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s79, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s78, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: 
s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s77, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s76, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s10, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s75, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s74, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s73, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s72, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s63, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s62, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s61, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s60, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s59, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s58, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s6, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s57, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s56, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s47, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s46, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s45, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s27, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s44, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s42, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, 
v[9:12], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v17, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v17, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v17, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v17, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v17, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v17, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v17, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v17, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v17, 0 +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v17, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB69_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: s_branch .LBB69_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 
s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v34, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v36, v6 -; GCN-NEXT: v_mov_b32_e32 v32, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: 
buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v13 -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 24, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 24, v46 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_or_b32_e32 v0, v0, v42 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v40 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_or_b32_e32 v3, v3, v55 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v37 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 
v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v54 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v53 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v49 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v48 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v39 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v62 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v61 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v33, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v28, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GCN-NEXT: v_or_b32_e32 v27, v35, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v29, v36, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v37 -; GCN-NEXT: v_or_b32_e32 v31, v38, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v32 -; GCN-NEXT: 
v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v18, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 
v28, v32, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v25, v59, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v30, v57, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v32, v58, v8 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v9 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v8, v10 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v8, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v8, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v8, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v19, v46, v19 -; GCN-NEXT: v_or_b32_e32 v0, v0, v7 -; GCN-NEXT: v_or_b32_e32 v1, v1, v6 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_or_b32_e32 v4, v17, v21 -; GCN-NEXT: v_or_b32_e32 v5, v20, v22 -; GCN-NEXT: v_or_b32_e32 v6, v23, v24 -; GCN-NEXT: v_or_b32_e32 v7, v26, v28 -; GCN-NEXT: v_or_b32_e32 v8, v27, v25 -; GCN-NEXT: v_or_b32_e32 v9, v29, v30 -; GCN-NEXT: v_or_b32_e32 v10, v31, v32 -; GCN-NEXT: v_or_b32_e32 v11, v33, v34 -; GCN-NEXT: v_or_b32_e32 v12, v35, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v16 -; GCN-NEXT: v_or_b32_e32 v15, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr37 -; 
GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; 
implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; kill: killed $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: .LBB35_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v40, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v55, v3 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: 
v_add_i32_e32 v16, vcc, 3, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v51 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 
16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v17, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v29, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v27, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v17, v44, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; GCN-NEXT: v_or_b32_e32 v20, v45, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GCN-NEXT: v_or_b32_e32 v23, v56, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 -; GCN-NEXT: v_or_b32_e32 v29, v43, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: 
v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_or_b32_e32 v16, v59, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_or_b32_e32 v22, v58, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x300, v19 -; GCN-NEXT: v_or_b32_e32 v30, v46, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v31, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v10, v9 -; GCN-NEXT: v_or_b32_e32 v6, v12, v11 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v16, v15 -; GCN-NEXT: v_or_b32_e32 v9, v18, v17 -; GCN-NEXT: v_or_b32_e32 v10, v22, v20 -; GCN-NEXT: v_or_b32_e32 v11, v24, v23 -; GCN-NEXT: v_or_b32_e32 v12, v26, v25 -; GCN-NEXT: v_or_b32_e32 v13, v28, v27 -; GCN-NEXT: v_or_b32_e32 v14, v21, v29 -; GCN-NEXT: v_or_b32_e32 v15, v30, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: 
v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 -; GCN-NEXT: .LBB35_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v8i64: +; SI: ; %bb.0: +; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 
offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 
8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v50 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v56 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 +; SI-NEXT: 
s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded 
Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 
offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: 
$vgpr17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v59 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; 
SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; 
SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: .LBB70_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 
3, v61 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; 
SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: 
v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v30, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v58 +; SI-NEXT: 
v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v40, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v21, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 +; SI-NEXT: .LBB70_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 
offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v8i64: ; VI: ; %bb.0: @@ -20845,7 +42059,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_2 +; VI-NEXT: s_cbranch_execz .LBB70_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -21009,9 +42223,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: 
$vgpr25 ; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: .LBB35_2: ; %Flow +; VI-NEXT: .LBB70_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_4 +; VI-NEXT: s_cbranch_execz .LBB70_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -21160,7 +42374,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: .LBB35_4: ; %end +; VI-NEXT: .LBB70_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -21305,7 +42519,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_2 +; GFX9-NEXT: s_cbranch_execz .LBB70_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -21469,9 +42683,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: .LBB35_2: ; %Flow +; GFX9-NEXT: .LBB70_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_4 +; GFX9-NEXT: s_cbranch_execz .LBB70_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -21620,7 +42834,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: .LBB35_4: ; %end +; GFX9-NEXT: .LBB70_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -21747,15 +42961,15 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-TRUE16-NEXT: .LBB35_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-TRUE16-NEXT: .LBB70_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h @@ -21937,8 +43151,8 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-TRUE16-NEXT: 
.LBB35_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 @@ -22217,15 +43431,15 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-FAKE16-NEXT: .LBB35_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-FAKE16-NEXT: .LBB70_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v37 @@ -22404,8 +43618,8 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-FAKE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-FAKE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -22602,108 +43816,2220 @@ end: ret <8 x i64> %phi } +define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v8i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 +; SI-NEXT: 
s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 
0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: 
v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; 
SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; 
SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, 
s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; 
SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: 
v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: bitcast_v64i8_to_v8i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, 
off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_mov_b32_e32 v61, v28 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; 
VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, 
s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: 
v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 +; VI-NEXT: v_add_u32_e32 v0, 
vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; 
VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: 
v_mov_b32_e32 v48, v8 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB71_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v8i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill 
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_mov_b32_e32 v61, v28 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; 
GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, 
v44 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 
v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff 
+; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, 
v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s5, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: 
s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, 
v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: 
v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v53, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; 
GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v11, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v96, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v98, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-TRUE16-NEXT: .LBB71_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; 
GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v21, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB71_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB71_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB71_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v8i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v64, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 
8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v85 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v69 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v80 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v67 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v96, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-FAKE16-NEXT: .LBB71_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, 
s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v32 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v38 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v84, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v85, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v82, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 
0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v68, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v69, v11 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v80, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v65, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v7, v66, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v67, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v64, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v27, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v29, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v54, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v55, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v19, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v21, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v23, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB71_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB71_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v55, v15 -; GCN-NEXT: v_mov_b32_e32 v54, v14 -; GCN-NEXT: v_mov_b32_e32 v53, v13 -; GCN-NEXT: v_mov_b32_e32 v52, v12 -; GCN-NEXT: v_mov_b32_e32 v51, v11 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v49, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v38, v7 -; GCN-NEXT: v_mov_b32_e32 v37, v6 -; GCN-NEXT: v_mov_b32_e32 v36, v5 -; GCN-NEXT: v_mov_b32_e32 v35, v4 -; GCN-NEXT: v_mov_b32_e32 v34, v3 -; GCN-NEXT: v_mov_b32_e32 v33, v2 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16 -; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16 -; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16 -; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16 -; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16 -; GCN-NEXT: v_alignbit_b32 v9, v36, v35, 16 -; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16 -; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: .LBB36_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 -; GCN-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 -; GCN-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 -; GCN-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 -; GCN-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 -; GCN-NEXT: v_add_f64 v[52:53], v[52:53], 1.0 -; GCN-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 -; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16 -; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16 -; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16 -; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16 -; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16 -; GCN-NEXT: v_alignbit_b32 v9, v36, v35, 16 -; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16 -; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; 
GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: .LBB36_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v33 -; GCN-NEXT: v_mov_b32_e32 v6, v34 -; GCN-NEXT: v_mov_b32_e32 v8, v35 -; GCN-NEXT: v_mov_b32_e32 v10, v36 -; GCN-NEXT: v_mov_b32_e32 v12, v37 -; GCN-NEXT: v_mov_b32_e32 v14, v38 -; GCN-NEXT: v_mov_b32_e32 v16, v48 -; GCN-NEXT: v_mov_b32_e32 v18, v49 -; GCN-NEXT: v_mov_b32_e32 v20, v50 -; GCN-NEXT: v_mov_b32_e32 v22, v51 -; GCN-NEXT: v_mov_b32_e32 v24, v52 -; GCN-NEXT: v_mov_b32_e32 v26, v53 -; GCN-NEXT: v_mov_b32_e32 v28, v54 -; GCN-NEXT: v_mov_b32_e32 v30, v55 -; GCN-NEXT: v_mov_b32_e32 v1, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v53, v13 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v51, v11 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v49, v9 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v36, v5 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v55, v54, 16 +; SI-NEXT: v_alignbit_b32 v25, v53, v52, 16 +; SI-NEXT: v_alignbit_b32 v21, v51, v50, 16 +; SI-NEXT: v_alignbit_b32 v17, v49, v48, 16 +; SI-NEXT: v_alignbit_b32 v13, v38, v37, 16 +; SI-NEXT: v_alignbit_b32 v9, v36, v35, 16 +; SI-NEXT: v_alignbit_b32 v5, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB72_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB72_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 +; SI-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 +; SI-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 +; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 +; SI-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 +; SI-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 +; SI-NEXT: v_add_f64 v[52:53], v[52:53], 1.0 +; SI-NEXT: v_alignbit_b32 v29, v55, v54, 16 +; SI-NEXT: v_alignbit_b32 v25, v53, v52, 16 +; SI-NEXT: v_alignbit_b32 v21, v51, v50, 16 +; SI-NEXT: v_alignbit_b32 v17, v49, v48, 16 +; SI-NEXT: v_alignbit_b32 v13, v38, v37, 16 +; SI-NEXT: v_alignbit_b32 v9, v36, v35, 16 +; SI-NEXT: v_alignbit_b32 v5, v34, v33, 16 +; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: 
.LBB72_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v4, v33 +; SI-NEXT: v_mov_b32_e32 v6, v34 +; SI-NEXT: v_mov_b32_e32 v8, v35 +; SI-NEXT: v_mov_b32_e32 v10, v36 +; SI-NEXT: v_mov_b32_e32 v12, v37 +; SI-NEXT: v_mov_b32_e32 v14, v38 +; SI-NEXT: v_mov_b32_e32 v16, v48 +; SI-NEXT: v_mov_b32_e32 v18, v49 +; SI-NEXT: v_mov_b32_e32 v20, v50 +; SI-NEXT: v_mov_b32_e32 v22, v51 +; SI-NEXT: v_mov_b32_e32 v24, v52 +; SI-NEXT: v_mov_b32_e32 v26, v53 +; SI-NEXT: v_mov_b32_e32 v28, v54 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v1, v32 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32i16: ; VI: ; %bb.0: @@ -22712,7 +46038,7 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -22722,7 +46048,7 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: .LBB72_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -22733,7 +46059,7 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -22743,7 +46069,7 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; 
GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: .LBB72_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -22755,7 +46081,7 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -22765,7 +46091,7 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: .LBB72_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -22785,183 +46111,421 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s19 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s27 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v25, s29 +; 
SI-NEXT: s_cbranch_scc0 .LBB73_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v33, v32, 16 +; SI-NEXT: v_alignbit_b32 v48, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v39, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: s_cbranch_execnz .LBB73_3 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_alignbit_b32 v29, v33, v32, 16 +; SI-NEXT: v_alignbit_b32 v48, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v39, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v38, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v36, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: .LBB73_3: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: 
v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v14, v13 +; SI-NEXT: v_mov_b32_e32 v18, v17 +; SI-NEXT: v_mov_b32_e32 v22, v21 +; SI-NEXT: v_mov_b32_e32 v26, v25 +; SI-NEXT: v_mov_b32_e32 v28, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v1, v34 +; SI-NEXT: v_mov_b32_e32 v5, v35 +; SI-NEXT: v_mov_b32_e32 v9, v36 +; SI-NEXT: v_mov_b32_e32 v13, v37 +; SI-NEXT: v_mov_b32_e32 v17, v38 +; SI-NEXT: v_mov_b32_e32 v21, v39 +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB73_4: +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB73_2 +; +; VI-LABEL: bitcast_v8f64_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB73_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: 
s_cbranch_execnz .LBB73_3 +; VI-NEXT: .LBB73_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB73_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB73_4: +; VI-NEXT: s_branch .LBB73_2 +; +; GFX9-LABEL: bitcast_v8f64_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB73_3 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB73_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: s_branch .LBB73_2 +; +; GFX11-LABEL: 
bitcast_v8f64_to_v32i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-NEXT: .LBB73_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], s[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB73_3: +; GFX11-NEXT: s_branch .LBB73_2 +; GFX11-NEXT: .LBB73_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> 
%a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_4 -; GCN-NEXT: .LBB37_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB37_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; GCN-NEXT: v_or_b32_e32 v0, v0, v54 -; GCN-NEXT: v_or_b32_e32 v1, v1, v55 -; GCN-NEXT: v_and_b32_e32 
v2, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v48 -; GCN-NEXT: v_or_b32_e32 v4, v4, v49 -; GCN-NEXT: v_or_b32_e32 v5, v5, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v51 -; GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_or_b32_e32 v15, v15, v53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; 
implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_2 -; GCN-NEXT: .LBB37_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, 
v48, v3 -; GCN-NEXT: v_or_b32_e32 v4, v49, v4 -; GCN-NEXT: v_or_b32_e32 v5, v50, v5 -; GCN-NEXT: v_or_b32_e32 v6, v51, v6 -; GCN-NEXT: v_or_b32_e32 v7, v52, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_or_b32_e32 v15, v53, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v37, v12 +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: 
v_mov_b32_e32 v33, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v55 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; 
SI-NEXT: v_or_b32_e32 v5, v5, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v51 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: v_or_b32_e32 v9, v9, v49 +; SI-NEXT: v_or_b32_e32 v10, v10, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v23 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 +; SI-NEXT: v_or_b32_e32 v14, v14, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB74_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v6, 
vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_or_b32_e32 v6, v52, v6 +; SI-NEXT: v_or_b32_e32 v7, v51, v7 +; SI-NEXT: v_or_b32_e32 v8, v50, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v48, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, 
v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; SI-NEXT: .LBB74_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v8f64: ; VI: ; %bb.0: @@ -22970,7 +46534,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB37_2 +; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v16, 3, v15 @@ -23021,7 +46585,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB37_2: ; %end +; VI-NEXT: .LBB74_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23032,7 +46596,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB37_2 +; GFX9-NEXT: s_cbranch_execz .LBB74_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; 
GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -23050,7 +46614,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB37_2: ; %end +; GFX9-NEXT: .LBB74_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -23062,7 +46626,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB37_2 +; GFX11-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -23080,7 +46644,7 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB37_2: ; %end +; GFX11-NEXT: .LBB74_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -23100,193 +46664,558 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v26, v14 +; SI-NEXT: v_mov_b32_e32 v25, v12 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v24, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; 
SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_cbranch_scc0 .LBB75_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v7, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v32 +; SI-NEXT: v_or_b32_e32 v15, v0, v17 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 
v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB75_3 +; SI-NEXT: .LBB75_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v25 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: 
s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB75_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB75_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB75_2 +; +; VI-LABEL: bitcast_v32i16_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_cbranch_scc0 .LBB75_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB75_3 +; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s7, 3 +; VI-NEXT: s_and_b32 s4, s7, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, 
s4, s5 +; VI-NEXT: s_add_i32 s5, s6, 3 +; VI-NEXT: s_add_i32 s7, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s6, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s29, 3 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s28, 3 +; VI-NEXT: s_add_i32 s29, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s27, 3 +; VI-NEXT: s_add_i32 s28, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s26, 3 +; VI-NEXT: s_add_i32 s27, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s25, 3 +; VI-NEXT: s_add_i32 s26, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s24, 3 +; VI-NEXT: s_add_i32 s25, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s23, 3 +; VI-NEXT: s_add_i32 s24, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s22, 3 +; VI-NEXT: s_add_i32 s23, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s21, 3 +; VI-NEXT: s_add_i32 s22, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s20, 3 +; VI-NEXT: s_add_i32 s21, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: 
s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s19, 3 +; VI-NEXT: s_add_i32 s20, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s19, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB75_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB75_4: +; VI-NEXT: s_branch .LBB75_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: 
v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB75_3 +; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB75_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB75_4: +; GFX9-NEXT: s_branch .LBB75_2 +; +; GFX11-LABEL: bitcast_v32i16_to_v8f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX11-NEXT: .LBB75_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; 
GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB75_3: +; GFX11-NEXT: s_branch .LBB75_2 +; GFX11-NEXT: .LBB75_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v32f16: -; GCN: ; %bb.0: 
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; 
GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: 
$vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: .LBB38_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: .LBB38_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v39 -; GCN-NEXT: v_mov_b32_e32 v1, v55 -; GCN-NEXT: v_mov_b32_e32 v2, v32 -; GCN-NEXT: v_mov_b32_e32 v3, v54 -; GCN-NEXT: v_mov_b32_e32 v4, v33 -; GCN-NEXT: v_mov_b32_e32 v5, v53 -; GCN-NEXT: v_mov_b32_e32 v6, v34 -; GCN-NEXT: v_mov_b32_e32 v7, v52 -; GCN-NEXT: v_mov_b32_e32 v8, v35 -; GCN-NEXT: v_mov_b32_e32 v9, v51 -; GCN-NEXT: v_mov_b32_e32 v10, v36 -; GCN-NEXT: v_mov_b32_e32 v11, v50 -; GCN-NEXT: v_mov_b32_e32 v12, v37 -; GCN-NEXT: v_mov_b32_e32 v13, v49 -; GCN-NEXT: v_mov_b32_e32 v14, v38 -; GCN-NEXT: v_mov_b32_e32 v15, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: 
$vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: 
v_cvt_f32_f16_e32 v39, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: .LBB76_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: .LBB76_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v54 +; SI-NEXT: v_mov_b32_e32 v1, v55 +; SI-NEXT: v_mov_b32_e32 v2, v53 +; SI-NEXT: v_mov_b32_e32 v3, v52 +; SI-NEXT: v_mov_b32_e32 v4, v51 +; SI-NEXT: v_mov_b32_e32 v5, v49 +; SI-NEXT: v_mov_b32_e32 v6, v50 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_mov_b32_e32 v9, v36 +; 
SI-NEXT: v_mov_b32_e32 v10, v38 +; SI-NEXT: v_mov_b32_e32 v11, v34 +; SI-NEXT: v_mov_b32_e32 v12, v37 +; SI-NEXT: v_mov_b32_e32 v13, v32 +; SI-NEXT: v_mov_b32_e32 v14, v35 +; SI-NEXT: v_mov_b32_e32 v15, v33 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32f16: ; VI: ; %bb.0: @@ -23295,7 +47224,7 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: s_cbranch_execz .LBB76_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -23305,7 +47234,7 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: .LBB76_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23316,7 +47245,7 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: s_cbranch_execz .LBB76_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -23326,7 +47255,7 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: .LBB76_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -23338,7 +47267,7 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, 
s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: s_cbranch_execz .LBB76_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -23348,7 +47277,7 @@ define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: .LBB76_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -23368,269 +47297,555 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v8f64_to_v32f16_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s6 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 +; SI-NEXT: s_lshr_b32 s6, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s6 +; SI-NEXT: s_lshr_b32 s6, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 +; SI-NEXT: s_lshr_b32 s6, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s6 +; SI-NEXT: s_lshr_b32 s6, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s6 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s6 +; SI-NEXT: s_lshr_b32 s6, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s6 +; SI-NEXT: s_lshr_b32 s6, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; SI-NEXT: s_lshr_b32 s6, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s6 +; SI-NEXT: s_lshr_b32 s6, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s6 +; 
SI-NEXT: s_lshr_b32 s6, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: s_lshr_b32 s6, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s6 +; SI-NEXT: s_lshr_b32 s6, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: s_lshr_b32 s6, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[2:3], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; 
SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; 
SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB77_2 +; +; VI-LABEL: bitcast_v8f64_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB77_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB77_3 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB77_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_4: +; VI-NEXT: s_branch .LBB77_2 +; +; GFX9-LABEL: bitcast_v8f64_to_v32f16_scalar: 
+; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB77_3 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB77_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: s_branch .LBB77_2 +; +; GFX11-LABEL: bitcast_v8f64_to_v32f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-NEXT: .LBB77_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], s[26:27], 1.0 +; GFX11-NEXT: 
v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB77_3: +; GFX11-NEXT: s_branch .LBB77_2 +; GFX11-NEXT: .LBB77_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 
-; GCN-NEXT: s_cbranch_execz .LBB39_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GCN-NEXT: v_or_b32_e32 v0, v44, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: v_or_b32_e32 v3, v50, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v34, v7 -; GCN-NEXT: v_or_b32_e32 v8, v33, v8 -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: v_or_b32_e32 v10, v31, v10 -; GCN-NEXT: v_or_b32_e32 v11, v21, v11 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; 
implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB39_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v26, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v26, v24 -; GCN-NEXT: v_or_b32_e32 v10, v28, v27 -; GCN-NEXT: v_or_b32_e32 v11, v21, v29 -; GCN-NEXT: v_or_b32_e32 v12, v19, v25 -; GCN-NEXT: v_or_b32_e32 
v13, v18, v23 -; GCN-NEXT: v_or_b32_e32 v14, v17, v22 -; GCN-NEXT: v_or_b32_e32 v15, v16, v20 -; GCN-NEXT: .LBB39_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, 
v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: 
v_lshlrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_or_b32_e32 v5, v51, v5 +; SI-NEXT: v_or_b32_e32 v6, v49, v6 +; SI-NEXT: v_or_b32_e32 v7, v39, v7 +; SI-NEXT: v_or_b32_e32 v8, v37, v8 +; SI-NEXT: v_or_b32_e32 v9, v35, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v31, v11 +; SI-NEXT: v_or_b32_e32 v12, v22, v12 +; SI-NEXT: v_or_b32_e32 v13, v20, v13 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: 
$vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB78_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 
v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v35 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v23 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, 
v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v19 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB78_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 
offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v8f64: ; VI: ; %bb.0: @@ -23639,7 +47854,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB39_2 +; VI-NEXT: s_cbranch_execz .LBB78_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 0x200 ; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -23690,7 +47905,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v17 ; VI-NEXT: v_or_b32_e32 v0, v0, v16 -; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: .LBB78_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23701,7 +47916,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB39_2 +; GFX9-NEXT: s_cbranch_execz .LBB78_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -23720,7 +47935,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB39_2: ; %end +; GFX9-NEXT: .LBB78_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -23732,7 +47947,7 @@ define <8 x 
double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB39_2 +; GFX11-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -23750,7 +47965,7 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB39_2: ; %end +; GFX11-NEXT: .LBB78_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -23770,150 +47985,550 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v32f16_to_v8f64_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v40, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v33, v7 +; SI-NEXT: v_or_b32_e32 v8, v31, v8 +; SI-NEXT: v_or_b32_e32 v9, v29, v9 +; SI-NEXT: v_or_b32_e32 v10, v27, v10 +; SI-NEXT: v_or_b32_e32 v11, v25, v11 +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v14, v19, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: 
v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v35 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v30 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: v_or_b32_e32 
v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB79_2 +; +; VI-LABEL: bitcast_v32f16_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; 
VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB79_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB79_3 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, 
v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB79_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_4: +; VI-NEXT: s_branch .LBB79_2 +; +; GFX9-LABEL: bitcast_v32f16_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB79_3 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB79_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: s_branch .LBB79_2 +; +; GFX11-LABEL: bitcast_v32f16_to_v8f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX11-NEXT: ; %bb.1: ; 
%Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB79_4 +; GFX11-NEXT: .LBB79_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB79_3: +; GFX11-NEXT: s_branch .LBB79_2 +; GFX11-NEXT: .LBB79_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> 
%a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; GCN-NEXT: 
v_and_b32_e32 v25, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v1 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: .LBB40_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_and_b32_e32 v31, 
0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v1 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v0 -; GCN-NEXT: .LBB40_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v55 -; GCN-NEXT: v_mov_b32_e32 v1, v54 -; GCN-NEXT: v_mov_b32_e32 v2, v53 -; GCN-NEXT: v_mov_b32_e32 v3, v52 -; GCN-NEXT: v_mov_b32_e32 v4, v51 -; GCN-NEXT: v_mov_b32_e32 v5, v50 -; GCN-NEXT: v_mov_b32_e32 v6, v49 -; GCN-NEXT: v_mov_b32_e32 v7, v48 -; GCN-NEXT: v_mov_b32_e32 v8, v39 -; GCN-NEXT: v_mov_b32_e32 v9, v38 -; GCN-NEXT: v_mov_b32_e32 v10, v37 -; GCN-NEXT: v_mov_b32_e32 v11, v36 -; GCN-NEXT: v_mov_b32_e32 v12, v35 -; GCN-NEXT: v_mov_b32_e32 v13, v34 -; GCN-NEXT: 
v_mov_b32_e32 v14, v33 -; GCN-NEXT: v_mov_b32_e32 v15, v32 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 
+; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: .LBB80_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: 
v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v6 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v1 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; SI-NEXT: .LBB80_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v55 +; SI-NEXT: v_mov_b32_e32 v1, v54 +; SI-NEXT: v_mov_b32_e32 v2, v53 +; SI-NEXT: v_mov_b32_e32 v3, v52 +; SI-NEXT: v_mov_b32_e32 v4, v51 +; SI-NEXT: v_mov_b32_e32 v5, v50 +; SI-NEXT: v_mov_b32_e32 v6, v49 +; SI-NEXT: v_mov_b32_e32 v7, v48 +; SI-NEXT: v_mov_b32_e32 v8, v39 +; SI-NEXT: v_mov_b32_e32 v9, v38 +; SI-NEXT: v_mov_b32_e32 v10, v37 +; SI-NEXT: v_mov_b32_e32 v11, v36 +; SI-NEXT: v_mov_b32_e32 v12, v35 +; SI-NEXT: v_mov_b32_e32 v13, v34 +; SI-NEXT: v_mov_b32_e32 v14, v33 +; SI-NEXT: v_mov_b32_e32 v15, v32 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32bf16: ; VI: ; %bb.0: @@ -23922,7 +48537,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; VI-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: s_cbranch_execz .LBB80_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -23932,7 +48547,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: .LBB80_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23943,7 +48558,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: s_cbranch_execz .LBB80_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -23953,7 +48568,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: .LBB80_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -23965,7 +48580,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: s_cbranch_execz .LBB80_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -23975,7 +48590,7 @@ define <32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x 
double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: .LBB80_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -23995,237 +48610,505 @@ end: ret <32 x bfloat> %phi } +define inreg <32 x bfloat> @bitcast_v8f64_to_v32bf16_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v54, s16 +; SI-NEXT: v_mov_b32_e32 v55, s17 +; SI-NEXT: v_mov_b32_e32 v52, s18 +; SI-NEXT: v_mov_b32_e32 v53, s19 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v51, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v38, s24 +; SI-NEXT: v_mov_b32_e32 v39, s25 +; SI-NEXT: v_mov_b32_e32 v36, s26 +; SI-NEXT: v_mov_b32_e32 v37, s27 +; SI-NEXT: v_mov_b32_e32 v34, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v35, s29 +; SI-NEXT: s_cbranch_scc0 .LBB81_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v35 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v34 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v34 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v39 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v49 +; 
SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v51 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: s_cbranch_execnz .LBB81_3 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[52:53], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[50:51], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[48:49], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[38:39], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[36:37], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[34:35], 1.0 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: 
v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v54 +; SI-NEXT: .LBB81_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v33 +; SI-NEXT: v_mov_b32_e32 v1, v32 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_4: +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB81_2 +; +; VI-LABEL: bitcast_v8f64_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; 
VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB81_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB81_3 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB81_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_4: +; VI-NEXT: s_branch .LBB81_2 +; +; GFX9-LABEL: bitcast_v8f64_to_v32bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 
s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB81_3 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB81_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: s_branch .LBB81_2 +; +; GFX11-LABEL: bitcast_v8f64_to_v32bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB81_4 +; GFX11-NEXT: .LBB81_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], s[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[12:13], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB81_3: +; GFX11-NEXT: s_branch .LBB81_2 +; GFX11-NEXT: .LBB81_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 
v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v7 -; 
GCN-NEXT: v_mul_f32_e32 v49, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v46 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v30 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; GCN-NEXT: v_alignbit_b32 v0, v0, v45, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 -; 
GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v2, v51, 16 -; GCN-NEXT: v_alignbit_b32 v3, v3, v49, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v39, 16 -; GCN-NEXT: v_alignbit_b32 v5, v5, v37, 16 -; GCN-NEXT: v_alignbit_b32 v6, v6, v36, 16 -; GCN-NEXT: v_alignbit_b32 v7, v7, v34, 16 -; GCN-NEXT: v_alignbit_b32 v8, v8, v33, 16 -; GCN-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v31, 16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v12, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v14, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: 
.LBB41_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 
v20, 0xffff0000, v20 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 
-; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v26, v24, 16 -; GCN-NEXT: v_alignbit_b32 v10, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v11, v29, v21, 16 -; GCN-NEXT: v_alignbit_b32 v12, v25, v19, 16 -; GCN-NEXT: v_alignbit_b32 v13, v23, v18, 16 -; GCN-NEXT: v_alignbit_b32 v14, v22, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v20, v16, 16 -; GCN-NEXT: .LBB41_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v56 +; SI-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: 
$vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB82_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, 
v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 
0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB82_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v8f64: ; VI: ; %bb.0: @@ -24234,7 +49117,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -24525,7 +49408,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -24536,7 +49419,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: s_cbranch_execz .LBB82_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -24780,7 +49663,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc ; GFX9-NEXT: v_perm_b32 v0, v0, v16, s7 -; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -24792,7 +49675,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz 
.LBB82_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -25066,7 +49949,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v22, v25, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v20, v0 -; GFX11-TRUE16-NEXT: .LBB41_2: ; %end +; GFX11-TRUE16-NEXT: .LBB82_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -25078,7 +49961,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v15 @@ -25356,7 +50239,7 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v21, v26, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB41_2: ; %end +; GFX11-FAKE16-NEXT: .LBB82_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -25376,425 +50259,1649 @@ end: ret <8 x double> %phi } +define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 +; SI-NEXT: 
v_mul_f32_e32 v32, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, 
v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 +; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 +; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 +; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 +; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 +; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 +; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_and_b32_e32 v12, 
0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB83_2 +; +; VI-LABEL: bitcast_v32bf16_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v19, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_writelane_b32 v19, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: s_cbranch_scc0 .LBB83_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB83_4 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 
+; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 
0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: 
v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 
+; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; VI-NEXT: v_add_f32_e32 v16, s4, v0 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: s_branch .LBB83_5 +; VI-NEXT: .LBB83_3: +; VI-NEXT: s_branch .LBB83_2 +; VI-NEXT: .LBB83_4: +; VI-NEXT: v_mov_b32_e32 
v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB83_5: ; %end +; VI-NEXT: v_readlane_b32 s31, v19, 1 +; VI-NEXT: v_readlane_b32 s30, v19, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB83_4 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s4, s31, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: 
v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s4, s30, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, 
vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 
s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, 
v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v4, 
v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; 
GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: s_branch .LBB83_5 +; GFX9-NEXT: .LBB83_3: +; GFX9-NEXT: s_branch .LBB83_2 +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; 
GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB83_5: ; %end +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v8f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB83_4 +; GFX11-NEXT: .LBB83_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s1, s27, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s2, s26, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_and_b32 s1, s25, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s3, s25, 16 +; GFX11-NEXT: s_and_b32 s0, s24, 
0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s24, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v2, v9, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s1, s23, 16 +; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: 
v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_lshl_b32 s1, s22, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-NEXT: s_lshl_b32 s1, s21, 
16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s20, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v10, v0, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v7 +; GFX11-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; 
GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v4 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v16, v4, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v5, v16, v4 +; GFX11-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v6, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s1, s18, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX11-NEXT: s_and_b32 s1, s17, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v6, v4 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-NEXT: 
v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v17, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, v19, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: s_lshl_b32 s1, s16, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc_lo +; GFX11-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-NEXT: v_bfe_u32 v6, v16, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v17, v18 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v6, v16 +; GFX11-NEXT: v_lshl_or_b32 v6, v2, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v19 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v16 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; 
GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: s_lshl_b32 s1, s15, 16 +; GFX11-NEXT: s_and_b32 s0, s14, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v16, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v3, v17, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v17 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v18, v18, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v18 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v16, v17, v18 :: v_dual_add_nc_u32 v17, v19, v4 +; GFX11-NEXT: v_add_f32_e64 v18, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s13, 16 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_add_f32_e64 v21, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_lshl_b32 s0, s12, 16 +; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1 +; 
GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v21 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v17, v19, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v17, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v19, v21, 16, 1 +; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_add_f32_e64 v22, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v23, v17, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v19, v19, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, v20, v18 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v17 +; GFX11-NEXT: v_bfe_u32 v24, v22, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v23, v23, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x7fff, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v24, v24, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v26, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v24 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v23, v27, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v21, v24, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v4 +; GFX11-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v1, 
v18, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v2, v16, 16, v21 +; GFX11-NEXT: v_lshl_or_b32 v0, v20, 16, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB83_3: +; GFX11-NEXT: s_branch .LBB83_2 +; GFX11-NEXT: .LBB83_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v8f64_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr51 -; 
GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 8 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v21, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 
v30, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v27, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v32, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v39, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v40, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v52, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v42, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v54, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v55, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB42_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_4 -; GCN-NEXT: ; 
%bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 8 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 8 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 24 -; GCN-NEXT: v_alignbit_b32 v21, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 8 -; GCN-NEXT: v_alignbit_b32 v27, v10, v9, 24 -; GCN-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v10, v9, 8 -; GCN-NEXT: v_alignbit_b32 v32, v8, v7, 24 -; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v8, v7, 8 -; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 24 -; GCN-NEXT: v_alignbit_b32 v39, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v40, v6, v5, 8 -; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 24 -; GCN-NEXT: v_alignbit_b32 v52, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v42, v4, v3, 8 -; GCN-NEXT: v_alignbit_b32 v54, v2, v1, 24 -; GCN-NEXT: v_alignbit_b32 v55, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v45, 
v2, v1, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v16 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 24, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 24, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 24, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; GCN-NEXT: .LBB42_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v45 -; GCN-NEXT: v_or_b32_e32 v45, 
v1, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GCN-NEXT: v_or_b32_e32 v18, v2, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GCN-NEXT: v_or_b32_e32 v42, v3, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v63, v4, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GCN-NEXT: v_or_b32_e32 v40, v5, v1 -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 24, v54 -; GCN-NEXT: v_and_b32_e32 v62, 0xff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 24, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v61 -; GCN-NEXT: v_or_b32_e32 v14, v6, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v61, 0xff, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v48 -; GCN-NEXT: v_or_b32_e32 v15, v7, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v59 -; GCN-NEXT: v_or_b32_e32 v16, v8, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v34 -; GCN-NEXT: v_or_b32_e32 v17, v9, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v56 -; GCN-NEXT: v_or_b32_e32 v34, v10, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v30 -; GCN-NEXT: v_or_b32_e32 v30, v11, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v44 -; GCN-NEXT: v_or_b32_e32 v11, v12, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v41, 0xff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v27 -; 
GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v24 -; GCN-NEXT: v_or_b32_e32 v13, v13, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v50 -; GCN-NEXT: v_or_b32_e32 v24, v19, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v22 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GCN-NEXT: v_or_b32_e32 v21, v20, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v55 -; GCN-NEXT: v_or_b32_e32 v36, v54, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v62 -; GCN-NEXT: v_or_b32_e32 v31, v60, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v61 -; GCN-NEXT: v_or_b32_e32 v56, v25, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 56, v0 -; 
GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v58 -; GCN-NEXT: v_or_b32_e32 v57, v57, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v40 -; GCN-NEXT: v_or_b32_e32 v37, v37, v52 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v39, v39, v47 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v32, v32, v46 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v33, v33, v43 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v27, v27, v41 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GCN-NEXT: v_or_b32_e32 v48, v48, v53 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v49, v49, v50 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v22, v22, v51 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v35, v35, v44 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v29, v29, v54 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v38, v38, v55 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, 
v23 -; GCN-NEXT: v_or_b32_e32 v26, v59, v26 -; GCN-NEXT: v_or_b32_e32 v36, v45, v36 -; GCN-NEXT: v_or_b32_e32 v18, v18, v31 -; GCN-NEXT: v_or_b32_e32 v31, v42, v56 -; GCN-NEXT: v_or_b32_e32 v50, v58, v57 -; GCN-NEXT: v_or_b32_e32 v37, v40, v37 -; GCN-NEXT: v_or_b32_e32 v14, v14, v39 -; GCN-NEXT: v_or_b32_e32 v15, v15, v32 -; GCN-NEXT: v_or_b32_e32 v16, v16, v33 -; GCN-NEXT: v_or_b32_e32 v17, v17, v27 -; GCN-NEXT: v_or_b32_e32 v27, v34, v48 -; GCN-NEXT: v_or_b32_e32 v30, v30, v49 -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: v_or_b32_e32 v13, v13, v35 -; GCN-NEXT: v_or_b32_e32 v22, v24, v29 -; GCN-NEXT: v_or_b32_e32 v21, v21, v38 -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8f64_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; 
implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v40, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v47, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v57, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 
v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB84_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v13, 8 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; SI-NEXT: v_alignbit_b32 v24, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 24 +; SI-NEXT: v_alignbit_b32 v30, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 8 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 24 +; 
SI-NEXT: v_alignbit_b32 v36, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v40, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v47, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v57, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v6 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; SI-NEXT: .LBB84_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_lshlrev_b32_e32 v57, 
16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v47 +; SI-NEXT: v_or_b32_e32 v47, v47, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: 
v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v64i8: ; VI: ; %bb.0: @@ -25868,7 +51975,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: s_cbranch_execz .LBB84_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -25921,9 +52028,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: .LBB42_2: ; %Flow +; VI-NEXT: .LBB84_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB42_4 +; VI-NEXT: s_cbranch_execz .LBB84_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: 
v_add_f64 v[13:14], v[13:14], 1.0 @@ -25983,7 +52090,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: .LBB42_4: ; %end +; VI-NEXT: .LBB84_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 @@ -26191,7 +52298,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: s_cbranch_execz .LBB84_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -26244,9 +52351,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: .LBB84_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: s_cbranch_execz .LBB84_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 @@ -26306,7 +52413,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB42_4: ; %end +; GFX9-NEXT: .LBB84_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -26466,7 +52573,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x 
double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -26500,9 +52607,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB42_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB84_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 @@ -26544,7 +52651,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB42_4: ; %end +; GFX11-TRUE16-NEXT: .LBB84_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -26756,7 +52863,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 
v[18:19], 24, v[13:14] @@ -26806,9 +52913,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB42_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB84_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 @@ -26866,7 +52973,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB42_4: ; %end +; GFX11-FAKE16-NEXT: .LBB84_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -27036,606 +53143,2593 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8f64_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: s_cbranch_scc0 .LBB85_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_alignbit_b32 v2, s5, v1, 24 +; SI-NEXT: v_alignbit_b32 v17, s5, v1, 16 +; SI-NEXT: v_alignbit_b32 v18, s5, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s28 +; SI-NEXT: v_alignbit_b32 v20, s29, v1, 24 +; SI-NEXT: v_alignbit_b32 v4, s29, v1, 16 +; SI-NEXT: v_alignbit_b32 v19, s29, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s26 +; SI-NEXT: v_alignbit_b32 v6, s27, 
v1, 24 +; SI-NEXT: v_alignbit_b32 v21, s27, v1, 16 +; SI-NEXT: v_alignbit_b32 v22, s27, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s24 +; SI-NEXT: v_alignbit_b32 v8, s25, v1, 24 +; SI-NEXT: v_alignbit_b32 v23, s25, v1, 16 +; SI-NEXT: v_alignbit_b32 v24, s25, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s22 +; SI-NEXT: v_alignbit_b32 v10, s23, v1, 24 +; SI-NEXT: v_alignbit_b32 v25, s23, v1, 16 +; SI-NEXT: v_alignbit_b32 v26, s23, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s20 +; SI-NEXT: v_alignbit_b32 v12, s21, v1, 24 +; SI-NEXT: v_alignbit_b32 v14, s21, v1, 16 +; SI-NEXT: v_alignbit_b32 v16, s21, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s18 +; SI-NEXT: v_alignbit_b32 v27, s19, v1, 24 +; SI-NEXT: v_alignbit_b32 v28, s19, v1, 16 +; SI-NEXT: v_alignbit_b32 v29, s19, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: v_alignbit_b32 v30, s17, v1, 24 +; SI-NEXT: v_alignbit_b32 v31, s17, v1, 16 +; SI-NEXT: v_alignbit_b32 v32, s17, v1, 8 +; SI-NEXT: s_lshr_b32 s8, s5, 24 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s10, s5, 8 +; SI-NEXT: s_lshr_b32 s11, s29, 24 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s29, 8 +; SI-NEXT: s_lshr_b32 s14, s27, 24 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 8 +; SI-NEXT: s_lshr_b32 s41, s25, 24 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 8 +; SI-NEXT: s_lshr_b32 s44, s23, 24 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: s_lshr_b32 s47, s21, 24 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s59, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s61, s17, 24 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB85_4 +; SI-NEXT: .LBB85_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[15:16], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[13:14], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[20:21], 1.0 +; SI-NEXT: 
v_add_f64 v[9:10], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_readfirstlane_b32 s29, v4 +; SI-NEXT: v_readfirstlane_b32 s27, v6 +; SI-NEXT: v_readfirstlane_b32 s25, v8 +; SI-NEXT: v_readfirstlane_b32 s23, v10 +; SI-NEXT: v_readfirstlane_b32 s21, v12 +; SI-NEXT: v_readfirstlane_b32 s19, v14 +; SI-NEXT: v_readfirstlane_b32 s17, v16 +; SI-NEXT: v_alignbit_b32 v2, s5, v1, 24 +; SI-NEXT: v_alignbit_b32 v17, s5, v1, 16 +; SI-NEXT: v_alignbit_b32 v18, s5, v1, 8 +; SI-NEXT: v_alignbit_b32 v20, s29, v3, 24 +; SI-NEXT: v_alignbit_b32 v4, s29, v3, 16 +; SI-NEXT: v_alignbit_b32 v19, s29, v3, 8 +; SI-NEXT: v_alignbit_b32 v6, s27, v5, 24 +; SI-NEXT: v_alignbit_b32 v21, s27, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, s27, v5, 8 +; SI-NEXT: v_alignbit_b32 v8, s25, v7, 24 +; SI-NEXT: v_alignbit_b32 v23, s25, v7, 16 +; SI-NEXT: v_alignbit_b32 v24, s25, v7, 8 +; SI-NEXT: v_alignbit_b32 v10, s23, v9, 24 +; SI-NEXT: v_alignbit_b32 v25, s23, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, s23, v9, 8 +; SI-NEXT: v_alignbit_b32 v12, s21, v11, 24 +; SI-NEXT: s_lshr_b32 s8, s5, 24 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s10, s5, 8 +; SI-NEXT: s_lshr_b32 s11, s29, 24 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s29, 8 +; SI-NEXT: s_lshr_b32 s14, s27, 24 +; SI-NEXT: s_lshr_b32 s15, s27, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 8 +; SI-NEXT: s_lshr_b32 s41, s25, 24 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 8 +; SI-NEXT: s_lshr_b32 s44, s23, 24 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: s_lshr_b32 s47, s21, 24 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s59, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s61, s17, 
24 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: v_alignbit_b32 v14, s21, v11, 16 +; SI-NEXT: v_alignbit_b32 v16, s21, v11, 8 +; SI-NEXT: v_alignbit_b32 v27, s19, v13, 24 +; SI-NEXT: v_alignbit_b32 v28, s19, v13, 16 +; SI-NEXT: v_alignbit_b32 v29, s19, v13, 8 +; SI-NEXT: v_alignbit_b32 v30, s17, v15, 24 +; SI-NEXT: v_alignbit_b32 v31, s17, v15, 16 +; SI-NEXT: v_alignbit_b32 v32, s17, v15, 8 +; SI-NEXT: s_branch .LBB85_5 +; SI-NEXT: .LBB85_3: +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; 
SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_branch .LBB85_2 +; SI-NEXT: .LBB85_4: +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: .LBB85_5: ; %end +; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_lshl_b32 s6, s63, 8 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s62, 0xff +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v30 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s61, 24 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v15, v15, v30 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: buffer_store_dword v30, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v29 +; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_lshl_b32 s6, s60, 8 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v28 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s59, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s58, 24 +; SI-NEXT: v_or_b32_e32 v15, v27, v15 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, 
v13 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: buffer_store_dword v15, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v16 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s6, s57, 8 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s56, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s47, 24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v26 +; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s6, s46, 8 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v25 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s45, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s44, 24 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; 
SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v24 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s6, s43, 8 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v23 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s42, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s41, 24 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v22 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s6, s40, 8 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v21 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s15, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s14, 24 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s6, s13, 8 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s12, 0xff +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v20 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s11, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; SI-NEXT: s_and_b32 s4, s5, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v17 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s9, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s8, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f64_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v40, s30, 0 +; VI-NEXT: v_writelane_b32 v40, s31, 1 +; VI-NEXT: v_writelane_b32 v40, s34, 2 +; VI-NEXT: v_writelane_b32 v40, s35, 3 +; VI-NEXT: v_writelane_b32 v40, s36, 4 +; VI-NEXT: v_writelane_b32 v40, s37, 5 +; VI-NEXT: v_writelane_b32 v40, s38, 6 +; VI-NEXT: v_writelane_b32 v40, s39, 7 +; VI-NEXT: v_writelane_b32 v40, s48, 8 +; VI-NEXT: v_writelane_b32 v40, s49, 9 +; VI-NEXT: v_writelane_b32 v40, s50, 10 +; VI-NEXT: v_writelane_b32 v40, s51, 11 +; VI-NEXT: v_writelane_b32 v40, s52, 12 +; VI-NEXT: v_writelane_b32 v40, s53, 13 +; VI-NEXT: v_writelane_b32 v40, s54, 14 +; VI-NEXT: v_writelane_b32 v40, s55, 15 +; VI-NEXT: v_writelane_b32 v40, s64, 16 +; VI-NEXT: v_writelane_b32 v40, s65, 17 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v40, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_writelane_b32 v40, s67, 19 +; VI-NEXT: s_cbranch_scc0 .LBB85_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s37, s4, 16 +; VI-NEXT: s_lshr_b32 s36, s4, 8 +; VI-NEXT: s_lshr_b32 s59, s29, 24 +; VI-NEXT: s_lshr_b32 s60, s29, 16 +; VI-NEXT: s_lshr_b32 s61, s29, 8 +; VI-NEXT: s_lshr_b32 s39, s28, 16 +; VI-NEXT: s_lshr_b32 s38, s28, 8 +; VI-NEXT: s_lshr_b32 s62, s27, 24 +; VI-NEXT: s_lshr_b32 s63, s27, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 8 +; VI-NEXT: s_lshr_b32 s49, s26, 16 +; VI-NEXT: s_lshr_b32 s48, s26, 8 +; VI-NEXT: s_lshr_b32 s73, s25, 24 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s25, 8 +; VI-NEXT: 
s_lshr_b32 s51, s24, 16 +; VI-NEXT: s_lshr_b32 s50, s24, 8 +; VI-NEXT: s_lshr_b32 s76, s23, 24 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s23, 8 +; VI-NEXT: s_lshr_b32 s53, s22, 16 +; VI-NEXT: s_lshr_b32 s52, s22, 8 +; VI-NEXT: s_lshr_b32 s79, s21, 24 +; VI-NEXT: s_lshr_b32 s88, s21, 16 +; VI-NEXT: s_lshr_b32 s89, s21, 8 +; VI-NEXT: s_lshr_b32 s55, s20, 16 +; VI-NEXT: s_lshr_b32 s54, s20, 8 +; VI-NEXT: s_lshr_b32 s90, s19, 24 +; VI-NEXT: s_lshr_b32 s91, s19, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 8 +; VI-NEXT: s_lshr_b32 s65, s18, 16 +; VI-NEXT: s_lshr_b32 s64, s18, 8 +; VI-NEXT: s_lshr_b32 s31, s17, 24 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s17, 8 +; VI-NEXT: s_lshr_b32 s67, s16, 16 +; VI-NEXT: s_lshr_b32 s66, s16, 8 +; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB85_4 +; VI-NEXT: .LBB85_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; VI-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 +; VI-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 +; VI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[11:12], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[15:16], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[9:10], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s19, v10 +; VI-NEXT: 
v_readfirstlane_b32 s21, v16 +; VI-NEXT: v_readfirstlane_b32 s23, v12 +; VI-NEXT: v_readfirstlane_b32 s25, v8 +; VI-NEXT: v_readfirstlane_b32 s27, v6 +; VI-NEXT: v_readfirstlane_b32 s29, v4 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v1 +; VI-NEXT: s_lshr_b32 s59, s29, 24 +; VI-NEXT: s_lshr_b32 s60, s29, 16 +; VI-NEXT: s_lshr_b32 s61, s29, 8 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; VI-NEXT: s_lshr_b32 s62, s27, 24 +; VI-NEXT: s_lshr_b32 s63, s27, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: s_lshr_b32 s73, s25, 24 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s25, 8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; VI-NEXT: s_lshr_b32 s76, s23, 24 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s23, 8 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v11 +; VI-NEXT: s_lshr_b32 s79, s21, 24 +; VI-NEXT: s_lshr_b32 s88, s21, 16 +; VI-NEXT: s_lshr_b32 s89, s21, 8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v15 +; VI-NEXT: s_lshr_b32 s90, s19, 24 +; VI-NEXT: s_lshr_b32 s91, s19, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 8 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; VI-NEXT: s_lshr_b32 s31, s17, 24 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s17, 8 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v13 +; VI-NEXT: s_branch .LBB85_5 +; VI-NEXT: .LBB85_3: +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr35 +; 
VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: s_branch .LBB85_2 +; VI-NEXT: .LBB85_4: +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v48, s67 +; VI-NEXT: v_mov_b32_e32 v49, s66 +; VI-NEXT: v_mov_b32_e32 v38, s65 +; VI-NEXT: v_mov_b32_e32 v39, s64 +; VI-NEXT: v_mov_b32_e32 v36, s55 +; VI-NEXT: v_mov_b32_e32 v37, s54 +; VI-NEXT: v_mov_b32_e32 v34, s53 +; VI-NEXT: v_mov_b32_e32 v35, s52 +; VI-NEXT: 
v_mov_b32_e32 v32, s51 +; VI-NEXT: v_mov_b32_e32 v33, s50 +; VI-NEXT: v_mov_b32_e32 v30, s49 +; VI-NEXT: v_mov_b32_e32 v31, s48 +; VI-NEXT: v_mov_b32_e32 v28, s39 +; VI-NEXT: v_mov_b32_e32 v29, s38 +; VI-NEXT: v_mov_b32_e32 v26, s37 +; VI-NEXT: v_mov_b32_e32 v27, s36 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v7, s24 +; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: v_mov_b32_e32 v3, s28 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v22, s10 +; VI-NEXT: v_mov_b32_e32 v21, s12 +; VI-NEXT: v_mov_b32_e32 v20, s14 +; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v18, s42 +; VI-NEXT: v_mov_b32_e32 v17, s44 +; VI-NEXT: .LBB85_5: ; %end +; VI-NEXT: s_and_b32 s4, s17, 0xff +; VI-NEXT: s_lshl_b32 s6, s35, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s34, 0xff +; VI-NEXT: s_lshl_b32 s7, s31, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v24 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v48, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s19, 0xff +; VI-NEXT: s_lshl_b32 s6, s30, 8 +; VI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s91, 0xff +; VI-NEXT: s_lshl_b32 s7, s90, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s21, 0xff +; VI-NEXT: s_lshl_b32 s6, s89, 8 +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s88, 0xff +; VI-NEXT: s_lshl_b32 s7, s79, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s23, 0xff +; VI-NEXT: s_lshl_b32 s6, s78, 8 +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s77, 0xff +; VI-NEXT: s_lshl_b32 s7, s76, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 24, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s25, 0xff +; VI-NEXT: s_lshl_b32 s6, s75, 8 +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s74, 0xff +; VI-NEXT: s_lshl_b32 s7, s73, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v20 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s27, 0xff +; VI-NEXT: s_lshl_b32 s6, s72, 8 +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s63, 0xff +; VI-NEXT: s_lshl_b32 s7, s62, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v19 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: 
v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_and_b32 s4, s29, 0xff +; VI-NEXT: s_lshl_b32 s6, s61, 8 +; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s60, 0xff +; VI-NEXT: s_lshl_b32 s7, s59, 8 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: s_lshl_b32 s5, s58, 8 +; VI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s57, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; 
VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s67, v40, 19 +; VI-NEXT: v_readlane_b32 s66, v40, 18 +; VI-NEXT: v_readlane_b32 s65, v40, 17 +; VI-NEXT: v_readlane_b32 s64, v40, 16 +; VI-NEXT: v_readlane_b32 s55, v40, 15 +; VI-NEXT: v_readlane_b32 s54, v40, 14 +; VI-NEXT: v_readlane_b32 s53, v40, 13 +; VI-NEXT: v_readlane_b32 s52, v40, 12 +; VI-NEXT: v_readlane_b32 s51, v40, 11 +; VI-NEXT: v_readlane_b32 s50, v40, 10 +; VI-NEXT: v_readlane_b32 s49, v40, 9 +; VI-NEXT: v_readlane_b32 s48, v40, 8 +; VI-NEXT: v_readlane_b32 s39, v40, 7 +; VI-NEXT: v_readlane_b32 s38, v40, 6 +; VI-NEXT: v_readlane_b32 s37, v40, 5 +; VI-NEXT: v_readlane_b32 s36, v40, 4 +; VI-NEXT: v_readlane_b32 s35, v40, 3 +; VI-NEXT: v_readlane_b32 s34, v40, 2 +; VI-NEXT: v_readlane_b32 s31, v40, 1 +; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f64_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s35, 3 +; GFX9-NEXT: v_writelane_b32 v40, s36, 4 +; GFX9-NEXT: v_writelane_b32 v40, s37, 5 +; GFX9-NEXT: v_writelane_b32 v40, s38, 6 +; GFX9-NEXT: v_writelane_b32 v40, s39, 7 +; GFX9-NEXT: v_writelane_b32 v40, s48, 8 +; GFX9-NEXT: v_writelane_b32 v40, s49, 9 +; GFX9-NEXT: v_writelane_b32 v40, s50, 10 +; GFX9-NEXT: v_writelane_b32 v40, s51, 
11 +; GFX9-NEXT: v_writelane_b32 v40, s52, 12 +; GFX9-NEXT: v_writelane_b32 v40, s53, 13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v40, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: v_writelane_b32 v40, s55, 15 +; GFX9-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s58, s5, 8 +; GFX9-NEXT: s_lshr_b32 s31, s4, 16 +; GFX9-NEXT: s_lshr_b32 s30, s4, 8 +; GFX9-NEXT: s_lshr_b32 s59, s29, 24 +; GFX9-NEXT: s_lshr_b32 s60, s29, 16 +; GFX9-NEXT: s_lshr_b32 s61, s29, 8 +; GFX9-NEXT: s_lshr_b32 s35, s28, 16 +; GFX9-NEXT: s_lshr_b32 s34, s28, 8 +; GFX9-NEXT: s_lshr_b32 s62, s27, 24 +; GFX9-NEXT: s_lshr_b32 s63, s27, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 8 +; GFX9-NEXT: s_lshr_b32 s37, s26, 16 +; GFX9-NEXT: s_lshr_b32 s36, s26, 8 +; GFX9-NEXT: s_lshr_b32 s73, s25, 24 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s25, 8 +; GFX9-NEXT: s_lshr_b32 s39, s24, 16 +; GFX9-NEXT: s_lshr_b32 s38, s24, 8 +; GFX9-NEXT: s_lshr_b32 s76, s23, 24 +; GFX9-NEXT: s_lshr_b32 s77, s23, 16 +; GFX9-NEXT: s_lshr_b32 s78, s23, 8 +; GFX9-NEXT: s_lshr_b32 s49, s22, 16 +; GFX9-NEXT: s_lshr_b32 s48, s22, 8 +; GFX9-NEXT: s_lshr_b32 s79, s21, 24 +; GFX9-NEXT: s_lshr_b32 s88, s21, 16 +; GFX9-NEXT: s_lshr_b32 s89, s21, 8 +; GFX9-NEXT: s_lshr_b32 s51, s20, 16 +; GFX9-NEXT: s_lshr_b32 s50, s20, 8 +; GFX9-NEXT: s_lshr_b32 s90, s19, 24 +; GFX9-NEXT: s_lshr_b32 s91, s19, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 8 +; GFX9-NEXT: s_lshr_b32 s53, s18, 16 +; GFX9-NEXT: s_lshr_b32 s52, s18, 8 +; GFX9-NEXT: s_lshr_b32 s93, s17, 24 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s17, 8 +; GFX9-NEXT: s_lshr_b32 s55, s16, 16 +; GFX9-NEXT: s_lshr_b32 s54, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], 
s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB85_4 +; GFX9-NEXT: .LBB85_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] +; GFX9-NEXT: v_readfirstlane_b32 s17, v14 +; GFX9-NEXT: v_readfirstlane_b32 s19, v12 +; GFX9-NEXT: v_readfirstlane_b32 s21, v16 +; GFX9-NEXT: v_readfirstlane_b32 s23, v10 +; GFX9-NEXT: v_readfirstlane_b32 s25, v8 +; GFX9-NEXT: v_readfirstlane_b32 s27, v6 +; GFX9-NEXT: v_readfirstlane_b32 s29, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s58, s5, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1 +; GFX9-NEXT: s_lshr_b32 s59, s29, 24 +; GFX9-NEXT: s_lshr_b32 s60, s29, 16 +; GFX9-NEXT: s_lshr_b32 s61, s29, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX9-NEXT: s_lshr_b32 s62, s27, 24 +; GFX9-NEXT: s_lshr_b32 s63, s27, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 8 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: s_lshr_b32 s73, s25, 24 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s25, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: s_lshr_b32 s76, s23, 24 +; GFX9-NEXT: s_lshr_b32 s77, s23, 16 +; GFX9-NEXT: s_lshr_b32 s78, s23, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v9 +; GFX9-NEXT: s_lshr_b32 s79, s21, 24 +; GFX9-NEXT: s_lshr_b32 s88, s21, 16 +; GFX9-NEXT: s_lshr_b32 s89, s21, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v15 +; GFX9-NEXT: s_lshr_b32 s90, s19, 24 +; GFX9-NEXT: s_lshr_b32 s91, s19, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: s_lshr_b32 s93, s17, 24 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s17, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v13 +; GFX9-NEXT: s_branch .LBB85_5 +; GFX9-NEXT: .LBB85_3: +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; 
implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: s_branch .LBB85_2 +; GFX9-NEXT: .LBB85_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-NEXT: v_mov_b32_e32 v39, s55 +; GFX9-NEXT: v_mov_b32_e32 v49, s54 +; GFX9-NEXT: v_mov_b32_e32 v37, s53 +; GFX9-NEXT: v_mov_b32_e32 v48, s52 +; GFX9-NEXT: v_mov_b32_e32 v36, s51 +; GFX9-NEXT: v_mov_b32_e32 v38, s50 +; GFX9-NEXT: v_mov_b32_e32 v34, s49 +; GFX9-NEXT: v_mov_b32_e32 v35, s48 +; GFX9-NEXT: v_mov_b32_e32 v32, s39 +; GFX9-NEXT: v_mov_b32_e32 v33, s38 +; GFX9-NEXT: v_mov_b32_e32 v30, s37 +; GFX9-NEXT: v_mov_b32_e32 v31, s36 +; GFX9-NEXT: v_mov_b32_e32 v28, s35 +; GFX9-NEXT: v_mov_b32_e32 v29, s34 +; GFX9-NEXT: v_mov_b32_e32 v26, s31 +; GFX9-NEXT: v_mov_b32_e32 v27, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v9, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s24 +; GFX9-NEXT: v_mov_b32_e32 v5, s26 +; GFX9-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v24, s6 +; GFX9-NEXT: v_mov_b32_e32 v23, s8 +; GFX9-NEXT: 
v_mov_b32_e32 v22, s10 +; GFX9-NEXT: v_mov_b32_e32 v21, s12 +; GFX9-NEXT: v_mov_b32_e32 v20, s14 +; GFX9-NEXT: v_mov_b32_e32 v19, s40 +; GFX9-NEXT: v_mov_b32_e32 v18, s42 +; GFX9-NEXT: v_mov_b32_e32 v17, s44 +; GFX9-NEXT: .LBB85_5: ; %end +; GFX9-NEXT: s_and_b32 s4, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s95, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s94, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s93, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v39, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s92, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s91, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s90, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v48 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v23 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s21, 0xff 
+; GFX9-NEXT: s_lshl_b32 s6, s89, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s79, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v38 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s78, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s76, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s75, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s74, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s73, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 
offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s72, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s63, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s62, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v19 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s61, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s60, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s59, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 
v3, 8, v18 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s58, 8 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s56, 8 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_readlane_b32 s55, v40, 15 +; GFX9-NEXT: v_readlane_b32 s54, v40, 14 +; GFX9-NEXT: v_readlane_b32 s53, v40, 13 +; GFX9-NEXT: v_readlane_b32 s52, v40, 12 +; GFX9-NEXT: v_readlane_b32 s51, v40, 11 +; GFX9-NEXT: v_readlane_b32 s50, v40, 10 +; GFX9-NEXT: v_readlane_b32 s49, v40, 9 +; GFX9-NEXT: v_readlane_b32 s48, v40, 8 +; GFX9-NEXT: v_readlane_b32 s39, v40, 7 +; GFX9-NEXT: v_readlane_b32 s38, v40, 6 +; GFX9-NEXT: v_readlane_b32 s37, v40, 5 +; GFX9-NEXT: v_readlane_b32 s36, v40, 4 +; GFX9-NEXT: v_readlane_b32 s35, v40, 3 +; GFX9-NEXT: 
v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v8f64_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v33, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v33, s50, 10 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, 
s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-TRUE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], s[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], s[18:19], 1.0 +; 
GFX11-TRUE16-NEXT: v_add_f64 v[9:10], s[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], s[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], s[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[1:2], s[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v25 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v21 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s17, v17 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v15 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s21, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s23, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s25, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s27, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[20:21] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[2:3], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[4:5] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[14:15] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[16:17] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 
+; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s1, 8 +; GFX11-TRUE16-NEXT: s_branch .LBB85_5 +; GFX11-TRUE16-NEXT: .LBB85_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB85_2 +; GFX11-TRUE16-NEXT: .LBB85_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s26 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v27.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s95 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s93 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s91 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s92 +; GFX11-TRUE16-NEXT: .LBB85_5: ; %end +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s89 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s88 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s79 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s78 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v20, v20, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v26 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s76 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s77 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v20, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v22 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v25, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v24, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s74 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v24, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v27, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s75 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: 
s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v18 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s73 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s62 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s72 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v14, v18 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v12 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v19, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s61 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s60 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s59 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s56 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s58 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, 
s1, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s57 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v10 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s47 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s46 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s45 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v7, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v1, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s42 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v20 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s1 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[24:27], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[16:19], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v33, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v33, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v33, 8 +; 
GFX11-TRUE16-NEXT: v_readlane_b32 s39, v33, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v33, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v33, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v33, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v33, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v33, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v33, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v33, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v8f64_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v33, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s90, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v33, s49, 9 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s25, 8 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s94, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB85_4 +; 
GFX11-FAKE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], s[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], s[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], s[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[1:2], s[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[18:19] +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v25 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v23 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s17, v19 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v15 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s21, v11 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s23, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s25, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s27, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[22:23] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[6:7], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[20:21], 24, v[14:15] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[24:25] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v18 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v29, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s1, 8 +; GFX11-FAKE16-NEXT: s_branch .LBB85_5 +; GFX11-FAKE16-NEXT: .LBB85_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: s_branch .LBB85_2 +; GFX11-FAKE16-NEXT: .LBB85_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v3, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v1, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v27, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v5, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v31, s49 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v29, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s4 :: v_dual_mov_b32 v23, s37 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s8 :: v_dual_mov_b32 v25, s36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s10 :: v_dual_mov_b32 v19, s35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s12 :: v_dual_mov_b32 v21, s34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s31 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v17, s30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s48 :: v_dual_mov_b32 v11, vcc_hi +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s38 :: v_dual_mov_b32 v13, s95 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s94 :: v_dual_mov_b32 v2, s92 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s93 :: v_dual_mov_b32 v4, s91 +; GFX11-FAKE16-NEXT: .LBB85_5: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s89, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s79, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s88, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s78, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s77, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s76, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v31, v28 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v25, s1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s75, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s73, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v22, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v31, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s62, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v23 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v23, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s74, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s72, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s19, 0xff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; 
GFX11-FAKE16-NEXT: s_and_b32 s2, s63, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v17 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s0 :: v_dual_lshlrev_b32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v17, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s61, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s59, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s60, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s58, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s57, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s56, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v8, v11 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v26 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s1 :: v_dual_and_b32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v11, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s47, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s46, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s45, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v14, v21 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v1, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s44, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s3, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s42, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v18, v19 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s0 
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s1 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[22:25], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v33, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v33, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v33, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v33, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v33, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v33, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v33, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v33, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v33, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v33, 0 +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v34, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v33, v8 -; GCN-NEXT: v_mov_b32_e32 v36, v6 -; GCN-NEXT: v_mov_b32_e32 v32, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, 
s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v9 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; 
GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 24, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 -; 
GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 24, v46 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GCN-NEXT: v_or_b32_e32 v0, v0, v42 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_or_b32_e32 v2, v2, v40 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GCN-NEXT: v_or_b32_e32 v3, v3, v55 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v37 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: 
v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v54 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v53 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v52 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v49 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v48 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v39 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v62 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v61 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v33, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v28, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GCN-NEXT: v_or_b32_e32 v27, v35, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v29, v36, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v37 -; GCN-NEXT: v_or_b32_e32 v31, v38, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_or_b32_e32 v11, v11, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v18, v18, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v25, v59, v25 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v30, v57, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v32, v58, v8 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v9 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v8, v10 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v11 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v8, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v8, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v8, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v19, v46, v19 -; GCN-NEXT: v_or_b32_e32 v0, v0, v7 -; GCN-NEXT: v_or_b32_e32 v1, v1, v6 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_or_b32_e32 v4, v17, v21 -; GCN-NEXT: v_or_b32_e32 v5, v20, v22 -; GCN-NEXT: v_or_b32_e32 v6, v23, v24 -; GCN-NEXT: v_or_b32_e32 v7, v26, v28 -; GCN-NEXT: v_or_b32_e32 v8, v27, v25 -; GCN-NEXT: v_or_b32_e32 v9, v29, v30 -; GCN-NEXT: v_or_b32_e32 v10, v31, v32 -; GCN-NEXT: v_or_b32_e32 v11, v33, v34 -; GCN-NEXT: v_or_b32_e32 v12, v35, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v16 -; GCN-NEXT: v_or_b32_e32 v15, v18, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; 
implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; kill: killed $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; kill: killed $vgpr27 -; GCN-NEXT: ; 
implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; kill: killed $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: .LBB43_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v2, v40, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v55, v3 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, 
vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v50 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v60 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v51 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 -; 
GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v17, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v29, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v27, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GCN-NEXT: v_or_b32_e32 v17, v44, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; GCN-NEXT: v_or_b32_e32 v20, v45, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GCN-NEXT: v_or_b32_e32 v23, v56, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 -; GCN-NEXT: v_or_b32_e32 v29, v43, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v36 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v32, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 -; GCN-NEXT: v_or_b32_e32 v16, v59, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: v_or_b32_e32 v22, v58, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x300, v19 -; GCN-NEXT: v_or_b32_e32 v30, v46, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v31, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v10, v9 -; GCN-NEXT: v_or_b32_e32 v6, v12, v11 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v16, v15 -; GCN-NEXT: v_or_b32_e32 v9, v18, v17 -; GCN-NEXT: v_or_b32_e32 v10, v22, v20 -; GCN-NEXT: v_or_b32_e32 v11, v24, v23 -; GCN-NEXT: v_or_b32_e32 v12, v26, v25 -; GCN-NEXT: v_or_b32_e32 v13, v28, v27 -; GCN-NEXT: v_or_b32_e32 v14, v21, v29 -; GCN-NEXT: v_or_b32_e32 v15, v30, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, 
vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 -; GCN-NEXT: .LBB43_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 
s[30:31] +; SI-LABEL: bitcast_v64i8_to_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; SI-NEXT: 
v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v50 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v56 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v16, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_or_b32_e32 v0, v0, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 
offset:220 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v22 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v8, v8, v62 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v9, v9, v59 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: 
v_and_b32_e32 v11, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v11, v11, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v50 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; 
SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: .LBB86_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v37, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v60, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; 
SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v42, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v53, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v48, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: 
v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v18, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v16, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v17, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v16, v52, v16 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v22, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v30, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 
v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v12, v40, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v50, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v14, v25, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v15, v21, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 +; SI-NEXT: .LBB86_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v8f64: ; VI: ; %bb.0: @@ -27749,7 +55843,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB43_2 +; VI-NEXT: s_cbranch_execz .LBB86_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -27913,9 +56007,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x 
i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: .LBB43_2: ; %Flow +; VI-NEXT: .LBB86_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB43_4 +; VI-NEXT: s_cbranch_execz .LBB86_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -28064,7 +56158,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: .LBB43_4: ; %end +; VI-NEXT: .LBB86_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -28209,7 +56303,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB43_2 +; GFX9-NEXT: s_cbranch_execz .LBB86_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -28373,9 +56467,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: .LBB43_2: ; %Flow +; GFX9-NEXT: .LBB86_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB43_4 
+; GFX9-NEXT: s_cbranch_execz .LBB86_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -28524,7 +56618,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: .LBB43_4: ; %end +; GFX9-NEXT: .LBB86_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -28651,15 +56745,15 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-TRUE16-NEXT: .LBB43_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-TRUE16-NEXT: .LBB86_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h @@ -28841,8 +56935,8 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-TRUE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 @@ -29121,15 +57215,15 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-FAKE16-NEXT: .LBB43_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-FAKE16-NEXT: .LBB86_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v37 @@ -29308,8 +57402,8 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-FAKE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-FAKE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v31, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v32, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v33, 3 @@ -29506,254 +57600,2364 @@ end: ret <8 x double> %phi } +define inreg <8 x double> 
@bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v8f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: 
s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 +; SI-NEXT: s_waitcnt 
vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: s_cbranch_scc0 .LBB87_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_mov_b32_e32 v29, v8 +; 
SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_or_b32_e32 v9, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; 
SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_or_b32_e32 v13, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, 
s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: 
v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; 
SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: 
v_add_i32_e32 v1, vcc, 3, v50 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB87_4: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v43, v6 
+; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v44, v10 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v23, v21 +; SI-NEXT: v_mov_b32_e32 v21, v19 +; SI-NEXT: v_mov_b32_e32 v19, v17 +; SI-NEXT: v_mov_b32_e32 v17, v13 +; SI-NEXT: v_mov_b32_e32 v48, v51 +; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; SI-NEXT: s_branch .LBB87_2 +; +; VI-LABEL: bitcast_v64i8_to_v8f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 
offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_mov_b32_e32 v61, v28 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v51, 
off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; VI-NEXT: s_cbranch_scc0 .LBB87_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: 
v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, 
s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_cbranch_execnz .LBB87_3 +; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa 
v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, 
v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: .LBB87_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB87_4: +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: 
v_mov_b32_e32 v34, v39 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v29, v33 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v48, v8 +; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_mov_b32_e32 v16, v18 +; VI-NEXT: v_mov_b32_e32 v18, v20 +; VI-NEXT: v_mov_b32_e32 v20, v22 +; VI-NEXT: v_mov_b32_e32 v22, v24 +; VI-NEXT: v_mov_b32_e32 v24, v26 +; VI-NEXT: v_mov_b32_e32 v26, v61 +; VI-NEXT: v_mov_b32_e32 v30, v37 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_mov_b32_e32 v40, v3 +; VI-NEXT: v_mov_b32_e32 v63, v59 +; VI-NEXT: v_mov_b32_e32 v36, v58 +; VI-NEXT: v_mov_b32_e32 v58, v57 +; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_mov_b32_e32 v59, v56 +; VI-NEXT: v_mov_b32_e32 v56, v47 +; VI-NEXT: v_mov_b32_e32 v47, v46 +; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_mov_b32_e32 v45, v25 +; VI-NEXT: v_mov_b32_e32 v61, v23 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v23, v21 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: v_mov_b32_e32 v19, v17 +; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_mov_b32_e32 v37, v27 +; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB87_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v8f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, 
off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_mov_b32_e32 v61, v28 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 +; 
GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 +; GFX9-NEXT: 
s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 
v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, 
s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB87_3 +; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 
+; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s5, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; 
GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 +; GFX9-NEXT: 
v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 
0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: .LBB87_3: ; %end +; GFX9-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB87_4: +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v39 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v29, v33 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v8 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_mov_b32_e32 v16, v18 +; GFX9-NEXT: v_mov_b32_e32 v18, v20 +; GFX9-NEXT: v_mov_b32_e32 v20, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v24 +; GFX9-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-NEXT: v_mov_b32_e32 v26, v61 +; 
GFX9-NEXT: v_mov_b32_e32 v30, v37 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_mov_b32_e32 v40, v3 +; GFX9-NEXT: v_mov_b32_e32 v63, v59 +; GFX9-NEXT: v_mov_b32_e32 v36, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_mov_b32_e32 v59, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v47 +; GFX9-NEXT: v_mov_b32_e32 v47, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v25 +; GFX9-NEXT: v_mov_b32_e32 v61, v23 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_mov_b32_e32 v37, v27 +; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB87_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 
0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v25 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v96, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v98, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-TRUE16-NEXT: .LBB87_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, 
s2 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 +; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v20 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v21, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 
v11, 0x300, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v1, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: .LBB87_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB87_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB87_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v8f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v39, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, 
v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; 
GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v85 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v69 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v80 +; GFX11-FAKE16-NEXT: 
v_or_b32_e32 v3, v3, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v96, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v86 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-FAKE16-NEXT: .LBB87_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff 
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8 +; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff +; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v32 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v38 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v84, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v85, v2 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v82, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v68, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v69, v11 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v34 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v80, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v65, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v67, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v64, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v49 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v48 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v27, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v29, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v54, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v55, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v19, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v21, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v23, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB87_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB87_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB87_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded 
Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_mov_b32_e32 v62, v30 -; GCN-NEXT: v_mov_b32_e32 v61, v29 -; GCN-NEXT: v_mov_b32_e32 v60, v28 -; GCN-NEXT: v_mov_b32_e32 v59, v27 -; GCN-NEXT: v_mov_b32_e32 v58, v26 -; GCN-NEXT: v_mov_b32_e32 v57, v25 -; GCN-NEXT: v_mov_b32_e32 v56, v24 -; GCN-NEXT: v_mov_b32_e32 v47, v23 -; GCN-NEXT: v_mov_b32_e32 v46, v22 -; GCN-NEXT: v_mov_b32_e32 v45, v21 -; GCN-NEXT: v_mov_b32_e32 v44, v20 -; GCN-NEXT: v_mov_b32_e32 v43, v19 -; GCN-NEXT: v_mov_b32_e32 v42, v18 -; GCN-NEXT: v_mov_b32_e32 v41, v17 -; GCN-NEXT: v_mov_b32_e32 v40, v16 -; GCN-NEXT: v_mov_b32_e32 v55, v15 -; GCN-NEXT: v_mov_b32_e32 v54, v14 -; GCN-NEXT: v_mov_b32_e32 v53, v13 -; GCN-NEXT: v_mov_b32_e32 v52, v12 -; GCN-NEXT: v_mov_b32_e32 v51, v11 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v49, v9 -; GCN-NEXT: v_mov_b32_e32 v48, v8 -; GCN-NEXT: v_mov_b32_e32 v39, v7 -; GCN-NEXT: v_mov_b32_e32 v38, v6 -; GCN-NEXT: v_mov_b32_e32 v37, v5 -; GCN-NEXT: v_mov_b32_e32 v36, v4 -; GCN-NEXT: v_mov_b32_e32 v35, v3 -; GCN-NEXT: v_mov_b32_e32 v34, v2 -; GCN-NEXT: v_mov_b32_e32 v33, v1 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; 
GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v47 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v24, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v62 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB44_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v58 -; GCN-NEXT: 
v_add_i32_e32 v25, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: .LBB44_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v62, v30 +; SI-NEXT: v_mov_b32_e32 v61, v29 +; SI-NEXT: v_mov_b32_e32 v60, v28 +; SI-NEXT: v_mov_b32_e32 v59, v27 +; SI-NEXT: v_mov_b32_e32 v58, v26 +; SI-NEXT: v_mov_b32_e32 v57, v25 +; SI-NEXT: v_mov_b32_e32 v56, v24 +; SI-NEXT: v_mov_b32_e32 v47, v23 +; SI-NEXT: v_mov_b32_e32 v46, v22 +; SI-NEXT: v_mov_b32_e32 v45, v21 +; SI-NEXT: v_mov_b32_e32 v44, v20 +; SI-NEXT: v_mov_b32_e32 v43, v19 +; 
SI-NEXT: v_mov_b32_e32 v42, v18 +; SI-NEXT: v_mov_b32_e32 v41, v17 +; SI-NEXT: v_mov_b32_e32 v40, v16 +; SI-NEXT: v_mov_b32_e32 v55, v15 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v53, v13 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v51, v11 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v49, v9 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v39, v7 +; SI-NEXT: v_mov_b32_e32 v38, v6 +; SI-NEXT: v_mov_b32_e32 v37, v5 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v35, v3 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: 
v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; 
implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB88_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: .LBB88_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v32f16: ; VI: ; %bb.0: @@ -29762,7 +59966,7 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: s_cbranch_execz .LBB88_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 3 ; VI-NEXT: v_add_u16_sdwa v19, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -29813,7 +60017,7 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v19 ; VI-NEXT: v_or_b32_e32 v1, v1, v18 ; VI-NEXT: v_or_b32_e32 v0, v0, v17 -; VI-NEXT: .LBB44_2: ; %end +; VI-NEXT: .LBB88_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -29824,7 +60028,7 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-NEXT: s_cbranch_execz .LBB88_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 
op_sel_hi:[1,0] @@ -29842,7 +60046,7 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB44_2: ; %end +; GFX9-NEXT: .LBB88_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -29854,7 +60058,7 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-NEXT: s_cbranch_execz .LBB88_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -29872,7 +60076,7 @@ define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB44_2: ; %end +; GFX11-NEXT: .LBB88_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -29892,191 +60096,577 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v32i16_to_v32f16_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v17 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v48, v11 +; 
SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_mov_b32_e32 v37, v8 +; SI-NEXT: v_mov_b32_e32 v36, v7 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v34, v5 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v32, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB89_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v54 +; SI-NEXT: s_cbranch_execnz .LBB89_3 +; SI-NEXT: .LBB89_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v51 +; 
SI-NEXT: v_add_i32_e32 v27, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v55 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, 
v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: .LBB89_3: ; %end +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB89_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB89_2 +; +; VI-LABEL: bitcast_v32i16_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec 
+; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_cbranch_scc0 .LBB89_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB89_3 +; VI-NEXT: .LBB89_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s17, 3 +; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s18, 3 +; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s19, 3 +; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s20, 3 +; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s21, 3 +; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s22, 3 +; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s23, 3 +; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s24, 3 +; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: 
s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s7, s45, s7 +; VI-NEXT: s_or_b32 s6, s44, s6 +; VI-NEXT: s_or_b32 s29, s43, s29 +; VI-NEXT: s_or_b32 s28, s42, s28 +; VI-NEXT: s_or_b32 s27, s41, s27 +; VI-NEXT: s_or_b32 s26, s40, s26 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s24, 0x30000 +; VI-NEXT: s_add_i32 s24, s22, 0x30000 +; VI-NEXT: s_add_i32 s23, s20, 0x30000 +; VI-NEXT: s_add_i32 s22, s18, 0x30000 +; VI-NEXT: s_add_i32 s21, s16, 0x30000 +; VI-NEXT: s_add_i32 s20, s14, 0x30000 +; VI-NEXT: s_add_i32 s19, s12, 0x30000 +; VI-NEXT: s_add_i32 s18, s10, 0x30000 +; VI-NEXT: s_add_i32 s17, s8, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB89_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB89_4: +; VI-NEXT: s_branch .LBB89_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB89_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB89_3 +; GFX9-NEXT: .LBB89_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB89_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB89_4: +; GFX9-NEXT: s_branch .LBB89_2 +; +; GFX11-LABEL: bitcast_v32i16_to_v32f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; 
GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB89_4 +; GFX11-NEXT: .LBB89_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB89_3: +; GFX11-NEXT: s_branch .LBB89_2 +; GFX11-NEXT: .LBB89_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + 
+cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; 
GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB45_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; GCN-NEXT: v_or_b32_e32 v30, v30, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; GCN-NEXT: v_or_b32_e32 v18, v18, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_or_b32_e32 v14, v14, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; 
GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GCN-NEXT: v_or_b32_e32 v10, v10, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_or_b32_e32 v6, v6, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, 
v21 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_or_b32_e32 v8, v8, v9 -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_or_b32_e32 v20, v20, v21 -; GCN-NEXT: v_or_b32_e32 v24, v24, v25 -; GCN-NEXT: v_or_b32_e32 v28, v28, v29 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; GCN-NEXT: .LBB45_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; 
SI-LABEL: bitcast_v32f16_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB90_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: 
v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; SI-NEXT: 
v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_or_b32_e32 v26, v26, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_or_b32_e32 v14, v14, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, 
v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v10, v10, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: .LBB90_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v32i16: ; VI: ; %bb.0: @@ -30085,7 +60675,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB45_2 +; VI-NEXT: s_cbranch_execz .LBB90_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 @@ -30136,7 +60726,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, 
i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v19, v2 ; VI-NEXT: v_or_b32_e32 v1, v18, v1 ; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB45_2: ; %end +; VI-NEXT: .LBB90_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -30147,7 +60737,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB45_2 +; GFX9-NEXT: s_cbranch_execz .LBB90_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -30166,7 +60756,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB45_2: ; %end +; GFX9-NEXT: .LBB90_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -30178,7 +60768,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB45_2 +; GFX11-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -30196,7 +60786,7 @@ define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB45_2: ; %end +; GFX11-NEXT: .LBB90_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -30216,206 +60806,584 @@ end: 
ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: .LBB91_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: 
v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: 
v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_or_b32_e32 v26, v26, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v22, v22, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_or_b32_e32 v18, v18, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_or_b32_e32 v14, v14, v32 +; SI-NEXT: 
v_lshlrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v10, v10, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB91_4: +; SI-NEXT: s_branch .LBB91_2 +; +; VI-LABEL: bitcast_v32f16_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, 
s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB91_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB91_3 +; VI-NEXT: .LBB91_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 +; VI-NEXT: v_add_f16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v19, v15 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v14 +; VI-NEXT: v_add_f16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v19, v14 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v13 +; VI-NEXT: v_add_f16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v19, v13 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v12 +; VI-NEXT: v_add_f16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v19, v12 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 +; VI-NEXT: v_add_f16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v19, v11 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 +; VI-NEXT: v_add_f16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v19, v10 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v19, v9 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v8, 
v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v19, v8 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v19, v7 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v19, v6 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v19, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v19, v4 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v17, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_or_b32_e32 v2, v19, v2 +; VI-NEXT: v_or_b32_e32 v1, v18, v1 +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: .LBB91_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB91_4: +; VI-NEXT: s_branch .LBB91_2 +; +; GFX9-LABEL: bitcast_v32f16_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB91_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB91_3 +; GFX9-NEXT: .LBB91_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB91_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB91_4: +; GFX9-NEXT: s_branch .LBB91_2 +; +; GFX11-LABEL: bitcast_v32f16_to_v32i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: 
s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX11-NEXT: .LBB91_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB91_3: +; GFX11-NEXT: s_branch .LBB91_2 +; GFX11-NEXT: .LBB91_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat 
(half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v55, v30 -; GCN-NEXT: v_mov_b32_e32 v54, v28 -; GCN-NEXT: v_mov_b32_e32 v53, v26 -; GCN-NEXT: v_mov_b32_e32 v52, v24 -; GCN-NEXT: v_mov_b32_e32 v51, v22 -; GCN-NEXT: v_mov_b32_e32 v50, v20 -; GCN-NEXT: v_mov_b32_e32 v49, v18 -; GCN-NEXT: v_mov_b32_e32 v48, v16 -; GCN-NEXT: v_mov_b32_e32 v39, v14 -; GCN-NEXT: v_mov_b32_e32 v38, v12 -; GCN-NEXT: v_mov_b32_e32 v37, v10 -; GCN-NEXT: v_mov_b32_e32 v36, v8 -; GCN-NEXT: v_mov_b32_e32 v35, v6 -; GCN-NEXT: v_mov_b32_e32 v34, v4 -; GCN-NEXT: v_mov_b32_e32 v33, v2 -; GCN-NEXT: v_mov_b32_e32 v32, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; 
GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_4 -; GCN-NEXT: .LBB46_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB46_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: 
$vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB46_2 -; GCN-NEXT: .LBB46_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v32 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v0, v31, v0 -; GCN-NEXT: v_or_b32_e32 v2, v29, v2 -; GCN-NEXT: v_or_b32_e32 v4, v27, v4 -; GCN-NEXT: v_or_b32_e32 v6, v25, v6 -; GCN-NEXT: v_or_b32_e32 v8, v23, v8 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: 
v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_or_b32_e32 v13, v13, v18 -; GCN-NEXT: v_or_b32_e32 v11, v11, v20 -; GCN-NEXT: v_or_b32_e32 v9, v9, v22 -; GCN-NEXT: v_or_b32_e32 v7, v7, v24 -; GCN-NEXT: v_or_b32_e32 v5, v5, v26 -; GCN-NEXT: v_or_b32_e32 v3, v3, v28 -; GCN-NEXT: v_or_b32_e32 v1, v1, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 -; 
GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v55, v28 +; SI-NEXT: v_mov_b32_e32 v54, v26 +; SI-NEXT: v_mov_b32_e32 v53, v24 +; SI-NEXT: v_mov_b32_e32 v52, v22 +; SI-NEXT: v_mov_b32_e32 v51, v20 +; SI-NEXT: v_mov_b32_e32 v50, v18 +; SI-NEXT: v_mov_b32_e32 v48, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v36, v8 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v34, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: ; 
implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_4 +; SI-NEXT: .LBB92_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB92_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; 
implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_2 +; SI-NEXT: .LBB92_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 
v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v13, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: 
v_and_b32_e32 v21, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v32bf16: ; VI: ; %bb.0: @@ -30424,7 +61392,7 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: s_cbranch_execz .LBB92_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v16, 3 ; VI-NEXT: v_add_u16_sdwa v19, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -30475,7 +61443,7 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v19 ; VI-NEXT: v_or_b32_e32 v1, v1, v18 ; VI-NEXT: v_or_b32_e32 v0, v0, v17 -; VI-NEXT: .LBB46_2: ; %end +; VI-NEXT: .LBB92_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -30486,7 +61454,7 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB46_2 +; GFX9-NEXT: s_cbranch_execz .LBB92_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -30504,7 +61472,7 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x 
i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB46_2: ; %end +; GFX9-NEXT: .LBB92_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -30516,7 +61484,7 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB46_2 +; GFX11-NEXT: s_cbranch_execz .LBB92_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -30534,7 +61502,7 @@ define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB46_2: ; %end +; GFX11-NEXT: .LBB92_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -30554,299 +61522,710 @@ end: ret <32 x bfloat> %phi } +define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v20, v17 +; SI-NEXT: v_mov_b32_e32 v33, v16 +; SI-NEXT: v_mov_b32_e32 v16, v15 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v13 +; SI-NEXT: 
v_lshlrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v20 +; SI-NEXT: s_cbranch_scc0 .LBB93_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s14, s16, 16 +; SI-NEXT: s_lshl_b32 s15, s17, 16 +; SI-NEXT: s_lshl_b32 s40, s18, 16 +; SI-NEXT: s_lshl_b32 s41, s19, 16 +; SI-NEXT: s_lshl_b32 s42, s20, 16 +; SI-NEXT: s_lshl_b32 s43, s21, 16 +; SI-NEXT: s_lshl_b32 s6, s22, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_lshl_b32 s8, s24, 16 +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_lshl_b32 s10, s26, 16 +; SI-NEXT: s_lshl_b32 s11, s27, 16 +; SI-NEXT: s_lshl_b32 s12, s28, 16 +; SI-NEXT: s_lshl_b32 s13, s29, 16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 +; SI-NEXT: s_cbranch_execnz .LBB93_3 +; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s26, 0xffff +; SI-NEXT: s_lshl_b32 s6, s27, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xffff +; SI-NEXT: s_lshl_b32 s7, s25, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s8, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s22, 0xffff +; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s20, 0xffff +; SI-NEXT: s_lshl_b32 s9, s21, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, 
v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s10, s19, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s16, 0xffff +; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: v_or_b32_e32 v3, v29, v3 +; SI-NEXT: v_or_b32_e32 v5, v27, v5 +; SI-NEXT: v_or_b32_e32 v7, v25, v7 +; SI-NEXT: v_or_b32_e32 v8, v23, v8 +; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_or_b32_e32 v4, v19, v4 +; SI-NEXT: v_or_b32_e32 v2, v17, v2 +; SI-NEXT: v_or_b32_e32 v0, v15, v0 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_and_b32 s15, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s10, 16 +; SI-NEXT: s_and_b32 s41, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s9, 
16 +; SI-NEXT: s_and_b32 s43, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s7, 16 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s11, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s5, 16 +; SI-NEXT: s_and_b32 s13, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s4, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v6 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 +; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: v_mov_b32_e32 v2, s40 +; SI-NEXT: v_mov_b32_e32 v3, s41 +; SI-NEXT: v_mov_b32_e32 v4, s42 +; SI-NEXT: v_mov_b32_e32 v5, s43 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v9, s9 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s11 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: 
; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_branch .LBB93_2 +; +; VI-LABEL: bitcast_v32i16_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v1 +; VI-NEXT: s_cbranch_scc0 .LBB93_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB93_3 +; VI-NEXT: .LBB93_2: ; %cmp.true +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s17, 3 +; VI-NEXT: s_and_b32 s10, s18, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s18, 3 +; VI-NEXT: s_and_b32 s12, s19, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s19, 3 +; VI-NEXT: s_and_b32 s14, s20, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s20, 3 +; VI-NEXT: s_and_b32 s16, s21, 0xffff0000 +; VI-NEXT: s_add_i32 s17, s21, 3 +; VI-NEXT: s_and_b32 s18, s22, 0xffff0000 +; VI-NEXT: s_add_i32 s19, s22, 3 +; VI-NEXT: s_and_b32 s20, s23, 0xffff0000 +; VI-NEXT: s_add_i32 s21, s23, 3 +; VI-NEXT: s_and_b32 s22, s24, 0xffff0000 +; VI-NEXT: s_add_i32 s23, s24, 3 +; VI-NEXT: s_and_b32 s24, s25, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_and_b32 s40, s26, 0xffff0000 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_and_b32 s41, s27, 0xffff0000 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_and_b32 s42, s28, 0xffff0000 +; VI-NEXT: s_add_i32 s28, 
s28, 3 +; VI-NEXT: s_and_b32 s43, s29, 0xffff0000 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_and_b32 s44, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_and_b32 s45, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s29, s29, 0xffff +; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_and_b32 s25, s25, 0xffff +; VI-NEXT: s_and_b32 s23, s23, 0xffff +; VI-NEXT: s_and_b32 s21, s21, 0xffff +; VI-NEXT: s_and_b32 s19, s19, 0xffff +; VI-NEXT: s_and_b32 s17, s17, 0xffff +; VI-NEXT: s_and_b32 s15, s15, 0xffff +; VI-NEXT: s_and_b32 s13, s13, 0xffff +; VI-NEXT: s_and_b32 s11, s11, 0xffff +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s7, s45, s7 +; VI-NEXT: s_or_b32 s6, s44, s6 +; VI-NEXT: s_or_b32 s29, s43, s29 +; VI-NEXT: s_or_b32 s28, s42, s28 +; VI-NEXT: s_or_b32 s27, s41, s27 +; VI-NEXT: s_or_b32 s26, s40, s26 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_add_i32 s28, s28, 0x30000 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_add_i32 s25, s24, 0x30000 +; VI-NEXT: s_add_i32 s24, s22, 0x30000 +; VI-NEXT: s_add_i32 s23, s20, 0x30000 +; VI-NEXT: s_add_i32 s22, s18, 0x30000 +; VI-NEXT: s_add_i32 s21, s16, 0x30000 +; VI-NEXT: s_add_i32 s20, s14, 0x30000 +; VI-NEXT: s_add_i32 s19, s12, 0x30000 +; VI-NEXT: s_add_i32 s18, s10, 0x30000 +; VI-NEXT: s_add_i32 s17, s8, 0x30000 +; 
VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB93_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_mov_b32_e32 v15, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB93_4: +; VI-NEXT: s_branch .LBB93_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v32bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB93_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB93_3 +; GFX9-NEXT: .LBB93_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB93_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB93_4: +; GFX9-NEXT: s_branch .LBB93_2 +; +; GFX11-LABEL: bitcast_v32i16_to_v32bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB93_4 +; GFX11-NEXT: .LBB93_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, s15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, 
s13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB93_3: +; GFX11-NEXT: s_branch .LBB93_2 +; GFX11-NEXT: .LBB93_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill 
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 
v41, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v31 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v63 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v61 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v59 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v58 -; GCN-NEXT: 
v_lshrrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; 
GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB47_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v56 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v46 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v42 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 
0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v19 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 -; GCN-NEXT: v_lshrrev_b32_e32 
v19, 16, v20 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v22 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v28 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_alignbit_b32 v0, v30, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v36, v2, 16 -; GCN-NEXT: v_alignbit_b32 v8, v37, v32, 16 -; GCN-NEXT: v_alignbit_b32 v12, v38, v5, 16 -; GCN-NEXT: v_alignbit_b32 v16, v39, v33, 16 -; GCN-NEXT: v_alignbit_b32 v20, v48, v9, 16 -; GCN-NEXT: v_alignbit_b32 v24, v49, v10, 16 -; GCN-NEXT: v_alignbit_b32 v28, v50, v13, 16 -; GCN-NEXT: v_alignbit_b32 v30, v31, v14, 16 -; GCN-NEXT: v_alignbit_b32 v26, v27, v17, 16 -; GCN-NEXT: v_alignbit_b32 v22, v23, v18, 16 -; GCN-NEXT: v_alignbit_b32 v18, v19, v21, 16 -; GCN-NEXT: v_alignbit_b32 v14, v15, v34, 16 -; GCN-NEXT: v_alignbit_b32 v10, v11, v25, 16 -; GCN-NEXT: v_alignbit_b32 v6, v7, v35, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v29, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v41, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v40, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v55, 16 -; GCN-NEXT: v_alignbit_b32 v17, v18, v54, 16 -; GCN-NEXT: v_alignbit_b32 v21, v22, v53, 16 -; GCN-NEXT: v_alignbit_b32 v25, v26, v52, 16 -; GCN-NEXT: v_alignbit_b32 v29, v30, v51, 16 -; GCN-NEXT: .LBB47_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v30 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; 
implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, 
v38 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: .LBB94_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 +; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_alignbit_b32 v16, v9, v2, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_alignbit_b32 v20, v10, v2, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_alignbit_b32 v24, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, 
v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_alignbit_b32 v28, v13, v2, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v30, v31, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; 
SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: .LBB94_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v32i16: ; VI: ; %bb.0: @@ -30855,7 +62234,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -31146,7 +62525,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: .LBB47_2: ; %end +; VI-NEXT: .LBB94_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -31157,7 +62536,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -31401,7 +62780,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v2, v18, s6 ; GFX9-NEXT: v_perm_b32 v1, v1, v17, s6 ; GFX9-NEXT: v_perm_b32 v0, v0, v16, s6 -; 
GFX9-NEXT: .LBB47_2: ; %end +; GFX9-NEXT: .LBB94_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -31413,7 +62792,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v0 @@ -31709,7 +63088,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v18, 16, v20 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v3, 16, v19 -; GFX11-TRUE16-NEXT: .LBB47_2: ; %end +; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -31721,7 +63100,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v0 @@ -31971,7 +63350,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v29, v32, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v27, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB47_2: ; %end +; GFX11-FAKE16-NEXT: .LBB94_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 
%b, 0 @@ -31991,810 +63370,2086 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 
v39, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 +; SI-NEXT: s_cbranch_scc0 .LBB95_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, 
v45 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v54 +; SI-NEXT: s_cbranch_execnz .LBB95_3 +; SI-NEXT: .LBB95_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 +; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_alignbit_b32 v16, v9, v2, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v9, 
0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_alignbit_b32 v20, v10, v2, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_alignbit_b32 v24, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_alignbit_b32 v28, v13, v2, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v30, v31, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v50 +; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34 +; SI-NEXT: v_alignbit_b32 v18, v19, 
v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 +; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB95_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; 
SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB95_2 +; +; VI-LABEL: bitcast_v32bf16_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: s_cbranch_scc0 .LBB95_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB95_4 +; VI-NEXT: .LBB95_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_lshl_b32 s5, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s5, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_lshl_b32 s5, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s5, v1 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s5, v1 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 
0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: 
v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: 
v_cndmask_b32_e32 v3, v4, v16, vcc +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s4, v1 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 +; VI-NEXT: v_add_f32_e32 v16, s4, v1 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v16, 
v17, v18, vcc +; VI-NEXT: v_add_f32_e32 v17, s4, v1 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 +; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; VI-NEXT: s_branch .LBB95_5 +; VI-NEXT: .LBB95_3: +; VI-NEXT: s_branch .LBB95_2 +; VI-NEXT: .LBB95_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB95_5: ; %end +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: 
buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB95_4 +; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: s_and_b32 s5, s30, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s5, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s5, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s5, s31, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s5, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s5, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s5, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_and_or_b32 v14, v1, 
v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_and_or_b32 v15, v3, v16, v4 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v13, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v12, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: 
v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v11, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v10, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v9, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; 
GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v8, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v7, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, 
v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v6, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v5, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v4, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v3, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v2, v1, v16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_and_or_b32 v1, v1, v16, 
v17 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v17, v16, v0 +; GFX9-NEXT: s_branch .LBB95_5 +; GFX9-NEXT: .LBB95_3: +; GFX9-NEXT: s_branch .LBB95_2 +; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB95_5: ; %end +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v32i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; 
GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-NEXT: .LBB95_2: ; %cmp.true +; GFX11-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-NEXT: s_lshl_b32 s1, s12, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s1, s13, 16 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-NEXT: s_and_b32 s2, s14, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-NEXT: s_lshl_b32 s1, s27, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v2, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v9, v5 +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; 
GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v3 +; GFX11-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v7 +; GFX11-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v6 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_lshl_b32 s0, s15, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v5, v6, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v8, v4 +; GFX11-NEXT: v_bfe_u32 v7, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v10 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v18, v6, v8 :: v_dual_add_nc_u32 v7, v9, v5 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v10 +; GFX11-NEXT: v_bfe_u32 v8, v11, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v9, 
0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v8 +; GFX11-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v7, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v6 +; GFX11-NEXT: v_bfe_u32 v9, v12, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v12 +; GFX11-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_bfe_u32 v9, v11, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v7, v8, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v8, v10 :: v_dual_add_nc_u32 v8, 0x7fff, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v12, v7 +; GFX11-NEXT: v_bfe_u32 v12, v13, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s20, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v12, v13 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v14, v9, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-NEXT: v_dual_cndmask_b32 v7, v10, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v9 +; GFX11-NEXT: v_bfe_u32 v14, v15, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v13, v14, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v10, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v12 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_lshl_b32 s0, s21, 16 +; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v23, 0x40c00000, s0 +; GFX11-NEXT: v_dual_cndmask_b32 v24, v11, v12 :: v_dual_add_nc_u32 v9, 0x7fff, v13 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v12, v14, v10 +; GFX11-NEXT: v_bfe_u32 v13, v23, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo +; 
GFX11-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s22, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v23 +; GFX11-NEXT: v_add_f32_e64 v25, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v13 +; GFX11-NEXT: v_bfe_u32 v13, v25, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v12, v14, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v12, v15, v11 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_bfe_u32 v28, v14, 16, 1 +; GFX11-NEXT: s_lshl_b32 s0, s23, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v10, v15, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v13 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v25 +; GFX11-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v12, v27, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v12, v28, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_add_f32_e64 v27, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v14 +; GFX11-NEXT: v_bfe_u32 v29, v15, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v13 :: v_dual_add_nc_u32 v12, 0x7fff, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: s_lshl_b32 s0, s24, 16 +; GFX11-NEXT: v_bfe_u32 v13, v27, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-NEXT: s_and_b32 s0, s25, 0xffff0000 +; GFX11-NEXT: v_dual_cndmask_b32 v25, v12, v28 :: v_dual_add_nc_u32 v12, v29, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v13, v13, v27 +; GFX11-NEXT: 
v_add_f32_e64 v28, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v15 +; GFX11-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x7fff, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v27 +; GFX11-NEXT: v_bfe_u32 v32, v28, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v12, v29 :: v_dual_add_nc_u32 v15, v30, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_or_b32_e32 v30, 0x400000, v14 +; GFX11-NEXT: s_lshl_b32 s0, s25, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_add_f32_e64 v29, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v13, v31, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v13, v32, v28 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v28 +; GFX11-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v14, v15, v30 :: v_dual_add_nc_u32 v13, 0x7fff, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_add_f32_e64 v33, 0x40c00000, s0 +; GFX11-NEXT: s_lshl_b32 s0, s26, 16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, v32, v29 +; GFX11-NEXT: v_add_f32_e64 v30, 0x40c00000, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v31, vcc_lo +; GFX11-NEXT: v_add_f32_e64 v31, 0x40c00000, s1 +; GFX11-NEXT: s_and_b32 s0, s27, 0xffff0000 +; GFX11-NEXT: v_bfe_u32 v28, v33, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v32, 0x40c00000, s0 +; GFX11-NEXT: v_bfe_u32 v34, v30, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v34, v34, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v35, v35, v31 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v30 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v37, v37, 
v32 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v28, v28, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v35, v39, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v34, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v37, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_and_or_b32 v7, 0xffff0000, v7, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v15, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_and_or_b32 v15, 0xffff0000, v32, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v28, v38, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_and_or_b32 v12, 0xffff0000, v27, v31 +; GFX11-NEXT: v_and_or_b32 v11, 0xffff0000, v25, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX11-NEXT: v_and_or_b32 v14, 0xffff0000, v28, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-NEXT: v_and_or_b32 v9, 0xffff0000, v26, v23 +; GFX11-NEXT: v_and_or_b32 v8, 0xffff0000, v24, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX11-NEXT: v_and_or_b32 v6, 0xffff0000, v5, v27 +; GFX11-NEXT: v_and_or_b32 v5, 
0xffff0000, v21, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX11-NEXT: v_and_or_b32 v13, 0xffff0000, v13, v29 +; GFX11-NEXT: v_and_or_b32 v10, 0xffff0000, v10, v33 +; GFX11-NEXT: v_and_or_b32 v4, 0xffff0000, v19, v20 +; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v18, v21 +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v17, v22 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v16, v23 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v24 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB95_3: +; GFX11-NEXT: s_branch .LBB95_2 +; GFX11-NEXT: .LBB95_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v32i16_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, 
v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v49 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; 
implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; kill: killed $vgpr53 -; 
GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v4 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v8 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v12 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v16 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v49 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_bfe_u32 v4, v4, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, 
s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v4, v8, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_bfe_u32 v44, v12, 8, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v4, v16, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v4, v20, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v4, v24, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v4, v28, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v4, v49, 8, 8 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v52, v1, v46 -; GCN-NEXT: v_or_b32_e32 v51, v2, v47 -; GCN-NEXT: v_or_b32_e32 v48, v3, v57 -; GCN-NEXT: v_or_b32_e32 v37, v5, v59 -; GCN-NEXT: v_or_b32_e32 v32, v6, v60 -; GCN-NEXT: v_or_b32_e32 v6, v7, v61 -; GCN-NEXT: v_or_b32_e32 v58, v9, v62 -; GCN-NEXT: v_or_b32_e32 v56, v11, v63 -; GCN-NEXT: v_or_b32_e32 v42, v13, v10 -; GCN-NEXT: v_or_b32_e32 v54, v15, v14 -; GCN-NEXT: v_or_b32_e32 v50, v17, v22 -; GCN-NEXT: v_or_b32_e32 v39, v18, v33 -; GCN-NEXT: v_or_b32_e32 v36, v19, v26 -; GCN-NEXT: v_or_b32_e32 v31, v21, v34 -; GCN-NEXT: v_or_b32_e32 v18, v23, v30 -; GCN-NEXT: v_or_b32_e32 v2, v25, v35 -; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_alignbit_b32 v1, v51, v52, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v48, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v48, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v48, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v55, v6, v32, 24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v6, v32, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v6, v32, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v41, v56, v58, 24 -; GCN-NEXT: v_alignbit_b32 v43, v56, v58, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v56, v58, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v50, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v50, 16 -; GCN-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v45, v39, v50, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: .LBB48_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v49, v40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v29 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v25 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: 
v_or_b32_e32 v2, v30, v2 -; GCN-NEXT: v_or_b32_e32 v4, v35, v4 -; GCN-NEXT: v_or_b32_e32 v6, v26, v6 -; GCN-NEXT: v_or_b32_e32 v8, v34, v8 -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: v_or_b32_e32 v16, v33, v16 -; GCN-NEXT: v_or_b32_e32 v10, v10, v17 -; GCN-NEXT: v_or_b32_e32 v14, v14, v18 -; GCN-NEXT: v_or_b32_e32 v13, v62, v13 -; GCN-NEXT: v_or_b32_e32 v15, v63, v15 -; GCN-NEXT: v_or_b32_e32 v9, v60, v9 -; GCN-NEXT: v_or_b32_e32 v11, v61, v11 -; GCN-NEXT: v_or_b32_e32 v5, v57, v5 -; GCN-NEXT: v_or_b32_e32 v7, v59, v7 -; GCN-NEXT: v_or_b32_e32 v1, v46, v1 -; GCN-NEXT: v_or_b32_e32 v3, v47, v3 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v36, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v31, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v39, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v42, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v58, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v56, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v3 -; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v4, v37, v48, 24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v37, v48, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; 
GCN-NEXT: v_alignbit_b32 v8, v37, v48, 8 -; GCN-NEXT: v_alignbit_b32 v55, v6, v32, 24 -; GCN-NEXT: v_alignbit_b32 v20, v6, v32, 16 -; GCN-NEXT: v_alignbit_b32 v12, v6, v32, 8 -; GCN-NEXT: v_alignbit_b32 v41, v56, v58, 24 -; GCN-NEXT: v_alignbit_b32 v43, v56, v58, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v56, v58, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v50, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v39, v50, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v45, v39, v50, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_alignbit_b32 v1, v2, v18, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v51 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v24, 24, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v37 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v39 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: .LBB48_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v52 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v5, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v8 -; GCN-NEXT: 
v_or_b32_e32 v8, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v12 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v1 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v28 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v58 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: v_or_b32_e32 v14, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v24 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v56 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GCN-NEXT: v_or_b32_e32 v16, v4, v6 -; 
GCN-NEXT: v_add_i32_e32 v4, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v55 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v42 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GCN-NEXT: v_or_b32_e32 v17, v6, v9 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v44 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v54 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GCN-NEXT: v_or_b32_e32 v21, v9, v11 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v41 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v45 -; GCN-NEXT: v_or_b32_e32 v24, v11, v15 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v55, 0xff, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v39 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GCN-NEXT: v_or_b32_e32 v22, v15, v19 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v19 -; GCN-NEXT: v_and_b32_e32 
v19, 0xff, v36 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_or_b32_e32 v36, v19, v25 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v45, 0xff, v49 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v25 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v31 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 -; GCN-NEXT: v_or_b32_e32 v31, v25, v31 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v50 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v50 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v50 -; GCN-NEXT: v_or_b32_e32 v50, v18, v50 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v53 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 24, v53 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v54 -; GCN-NEXT: v_or_b32_e32 v54, v2, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_and_b32_e32 v56, 0xff, v40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v33, v33, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v41 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v41, 24, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v43, v27, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v44, 24, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GCN-NEXT: v_or_b32_e32 v29, v29, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v59, 0xff, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v60, 24, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v46 -; GCN-NEXT: 
v_lshlrev_b32_e32 v46, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v59 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v20, v20, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v23, v23, v38 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v30, v30, v48 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v35, v37, v51 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v37, v39, v55 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v38, v49, v42 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v39, v52, v45 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v48, v53, v46 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v49, v40, v47 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v51, v41, v56 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v52, v44, v57 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v54, v60, v58 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: v_or_b32_e32 v8, v8, v29 -; GCN-NEXT: v_or_b32_e32 v10, v10, v34 -; GCN-NEXT: v_or_b32_e32 v12, v12, v20 -; GCN-NEXT: v_or_b32_e32 v13, v13, v23 -; GCN-NEXT: v_or_b32_e32 v14, v14, v30 -; GCN-NEXT: v_or_b32_e32 v16, v16, v35 -; GCN-NEXT: v_or_b32_e32 v17, v17, v37 -; GCN-NEXT: v_or_b32_e32 v20, v21, v38 -; GCN-NEXT: v_or_b32_e32 v21, v24, v39 -; GCN-NEXT: v_or_b32_e32 v22, v22, v48 -; GCN-NEXT: v_or_b32_e32 v23, v36, v49 -; GCN-NEXT: v_or_b32_e32 v24, v31, v51 -; GCN-NEXT: v_or_b32_e32 v29, v50, v52 -; GCN-NEXT: 
v_or_b32_e32 v30, v53, v54 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32i16_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed 
$vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v30 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: 
$vgpr18 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v56, v1, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v47, v1, v27 +; SI-NEXT: v_alignbit_b32 v1, v47, v56, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v56, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v56, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v50, v1, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v38, v1, v36 +; SI-NEXT: v_alignbit_b32 v1, v38, v50, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v50, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v50, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v34, v1, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v33, v1, v39 +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v32, v1, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v31, v1, v49 +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v30, v1, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v26, v1, v52 +; SI-NEXT: v_alignbit_b32 v1, v26, v30, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v26, v30, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v26, v30, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 
v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v22, v1, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v18, v1, v54 +; SI-NEXT: v_alignbit_b32 v1, v18, v22, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v14, v1, v41 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24 +; SI-NEXT: v_alignbit_b32 v61, v18, v22, 16 +; SI-NEXT: v_bfe_u32 v62, v44, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v10, v1, v40 +; SI-NEXT: v_alignbit_b32 v1, v10, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v6, v1, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v2, v1, v42 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v16, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v24, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v28, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: v_alignbit_b32 v57, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v58, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24 +; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: 
.LBB96_2: ; %Flow +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v1 +; SI-NEXT: v_alignbit_b32 v1, v47, v56, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v47, v56, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24 +; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8 +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v55, v4 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v53, v4 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v52, v4 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v49, v4 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v39, v4 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v37, v4 +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v4 +; SI-NEXT: 
v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v50, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v50, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v38, v50, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v31, v32, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_alignbit_b32 v4, v47, v56, 24 +; SI-NEXT: v_alignbit_b32 v24, v31, v32, 24 +; SI-NEXT: v_alignbit_b32 v28, v31, v32, 16 +; SI-NEXT: v_alignbit_b32 v12, v26, v30, 24 +; SI-NEXT: v_alignbit_b32 v16, v26, v30, 16 +; SI-NEXT: v_alignbit_b32 v44, v26, v30, 8 +; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24 +; SI-NEXT: v_alignbit_b32 v61, v18, v22, 16 +; SI-NEXT: v_alignbit_b32 v20, v18, v22, 8 +; SI-NEXT: v_alignbit_b32 v57, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v58, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v8, v10, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: .LBB96_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 
24, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], 
s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i16_to_v64i8: ; VI: ; %bb.0: @@ -32901,7 +65556,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: s_cbranch_execz .LBB96_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -32979,9 +65634,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v18, 3 ; VI-NEXT: v_add_u16_sdwa v26, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -33099,7 +65754,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v18 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 @@ -33337,7 +65992,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 
x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -33390,9 +66045,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: .LBB96_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: s_cbranch_execz .LBB96_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] @@ -33460,7 +66115,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: .LBB96_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -33620,7 +66275,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -33654,9 +66309,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB96_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] @@ -33706,7 +66361,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB48_4: ; %end +; GFX11-TRUE16-NEXT: .LBB96_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -33918,7 +66573,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -33968,9 +66623,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB96_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-FAKE16-NEXT: 
; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] @@ -34036,7 +66691,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -34206,780 +66861,3032 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32i16_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v37, s30, 0 +; SI-NEXT: v_writelane_b32 v37, s31, 1 +; SI-NEXT: v_writelane_b32 v37, s34, 2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v37, s35, 3 +; SI-NEXT: v_readfirstlane_b32 s34, v18 +; SI-NEXT: v_readfirstlane_b32 s35, v17 +; SI-NEXT: v_readfirstlane_b32 s30, v14 +; SI-NEXT: v_readfirstlane_b32 s31, v13 +; SI-NEXT: v_readfirstlane_b32 s94, v10 +; SI-NEXT: v_readfirstlane_b32 s95, v9 +; SI-NEXT: v_readfirstlane_b32 s92, v6 +; SI-NEXT: v_readfirstlane_b32 s93, v5 +; SI-NEXT: v_readfirstlane_b32 s90, v2 +; SI-NEXT: v_readfirstlane_b32 s91, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: s_cbranch_scc0 .LBB97_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: 
s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s15, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: v_mov_b32_e32 v1, s40 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_alignbit_b32 v18, s41, v1, 24 +; SI-NEXT: v_alignbit_b32 v25, s41, v1, 16 +; SI-NEXT: v_alignbit_b32 v30, s41, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_alignbit_b32 v19, s15, v1, 24 +; SI-NEXT: v_alignbit_b32 v26, s15, v1, 16 +; SI-NEXT: v_alignbit_b32 v31, s15, v1, 8 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s91, 0xffff +; SI-NEXT: s_lshl_b32 s5, s90, 16 +; SI-NEXT: v_alignbit_b32 v17, s13, v1, 24 +; SI-NEXT: v_alignbit_b32 v23, s13, v1, 16 +; SI-NEXT: v_alignbit_b32 v29, s13, v1, 8 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_alignbit_b32 v16, s11, v1, 24 +; SI-NEXT: v_alignbit_b32 v20, s11, v1, 16 +; SI-NEXT: v_alignbit_b32 v27, s11, v1, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: s_and_b32 s4, s93, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: v_or_b32_e32 v5, v1, v33 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: s_and_b32 s4, s95, 0xffff +; SI-NEXT: s_lshl_b32 s5, s94, 16 +; SI-NEXT: v_or_b32_e32 v4, v1, v34 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: s_and_b32 s4, s31, 0xffff +; SI-NEXT: s_lshl_b32 s5, s30, 16 +; SI-NEXT: v_or_b32_e32 v2, v1, v35 +; SI-NEXT: s_or_b32 
s7, s4, s5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: s_and_b32 s4, s35, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: v_alignbit_b32 v9, s9, v5, 24 +; SI-NEXT: v_alignbit_b32 v12, s9, v5, 16 +; SI-NEXT: v_alignbit_b32 v21, s9, v5, 8 +; SI-NEXT: v_alignbit_b32 v6, s8, v4, 24 +; SI-NEXT: v_alignbit_b32 v8, s8, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, s8, v4, 8 +; SI-NEXT: v_alignbit_b32 v24, s7, v2, 24 +; SI-NEXT: v_alignbit_b32 v28, s7, v2, 16 +; SI-NEXT: v_alignbit_b32 v32, s7, v2, 8 +; SI-NEXT: v_alignbit_b32 v10, s6, v1, 24 +; SI-NEXT: v_alignbit_b32 v14, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v22, s6, v1, 8 +; SI-NEXT: s_lshr_b32 s78, s41, 8 +; SI-NEXT: s_lshr_b32 s75, s15, 8 +; SI-NEXT: s_lshr_b32 s72, s13, 8 +; SI-NEXT: s_lshr_b32 s61, s11, 8 +; SI-NEXT: s_lshr_b32 s58, s9, 8 +; SI-NEXT: s_lshr_b32 s47, s8, 8 +; SI-NEXT: s_lshr_b32 s45, s7, 8 +; SI-NEXT: s_lshr_b32 s42, s6, 8 +; SI-NEXT: s_and_b32 s88, s19, 0xffff +; SI-NEXT: s_and_b32 s77, s23, 0xffff +; SI-NEXT: s_and_b32 s74, s27, 0xffff +; SI-NEXT: s_and_b32 s63, s90, 0xffff +; SI-NEXT: s_and_b32 s60, s92, 0xffff +; SI-NEXT: s_and_b32 s57, s94, 0xffff +; SI-NEXT: s_and_b32 s46, s30, 0xffff +; SI-NEXT: s_and_b32 s43, s34, 0xffff +; SI-NEXT: s_bfe_u32 s89, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s79, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s76, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s73, s90, 0x80008 +; SI-NEXT: s_bfe_u32 s62, s92, 0x80008 +; SI-NEXT: s_bfe_u32 s59, s94, 0x80008 +; SI-NEXT: s_bfe_u32 s56, s30, 0x80008 +; SI-NEXT: s_bfe_u32 s44, s34, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB97_3 +; SI-NEXT: .LBB97_2: ; %cmp.true +; SI-NEXT: s_add_i32 s35, s35, 3 +; SI-NEXT: s_and_b32 s4, s35, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s31, 0xffff +; SI-NEXT: s_lshl_b32 s5, s30, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 
+; SI-NEXT: s_add_i32 s95, s95, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s95, 0xffff +; SI-NEXT: s_lshl_b32 s5, s94, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s93, s93, 3 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s93, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s9, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s91, s91, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s91, 0xffff +; SI-NEXT: s_lshl_b32 s5, s90, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s11, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s12, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s13, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s14, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s15, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s40, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_add_i32 s41, s4, 0x30000 +; SI-NEXT: 
v_mov_b32_e32 v6, s40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_alignbit_b32 v18, s41, v6, 24 +; SI-NEXT: v_alignbit_b32 v25, s41, v6, 16 +; SI-NEXT: v_alignbit_b32 v30, s41, v6, 8 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v3, v33, v3 +; SI-NEXT: v_alignbit_b32 v19, s15, v6, 24 +; SI-NEXT: v_alignbit_b32 v26, s15, v6, 16 +; SI-NEXT: v_alignbit_b32 v31, s15, v6, 8 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: v_mov_b32_e32 v10, s7 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v3 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_alignbit_b32 v17, s13, v6, 24 +; SI-NEXT: v_alignbit_b32 v23, s13, v6, 16 +; SI-NEXT: v_alignbit_b32 v29, s13, v6, 8 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_alignbit_b32 v16, s11, v6, 24 +; SI-NEXT: v_alignbit_b32 v20, s11, v6, 16 +; SI-NEXT: v_alignbit_b32 v27, s11, v6, 8 +; SI-NEXT: v_alignbit_b32 v9, v3, v5, 24 +; SI-NEXT: v_alignbit_b32 v12, v3, v5, 16 +; SI-NEXT: v_alignbit_b32 v21, v3, v5, 8 +; SI-NEXT: v_alignbit_b32 v6, v7, v4, 24 +; SI-NEXT: v_alignbit_b32 v8, v7, v4, 16 +; SI-NEXT: v_alignbit_b32 v13, v7, v4, 8 +; SI-NEXT: v_alignbit_b32 v24, v10, v2, 24 +; SI-NEXT: v_alignbit_b32 v28, v10, v2, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v2, 8 +; SI-NEXT: v_alignbit_b32 v10, v15, v1, 24 +; SI-NEXT: v_alignbit_b32 v14, v15, v1, 16 +; SI-NEXT: v_alignbit_b32 v22, v15, v1, 8 +; SI-NEXT: s_lshr_b32 s89, s41, 24 +; SI-NEXT: s_lshr_b32 s88, s41, 16 +; SI-NEXT: s_lshr_b32 s78, s41, 8 +; SI-NEXT: s_lshr_b32 s79, s15, 24 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_lshr_b32 
s75, s15, 8 +; SI-NEXT: s_lshr_b32 s76, s13, 24 +; SI-NEXT: s_lshr_b32 s74, s13, 16 +; SI-NEXT: s_lshr_b32 s72, s13, 8 +; SI-NEXT: s_lshr_b32 s73, s11, 24 +; SI-NEXT: s_lshr_b32 s63, s11, 16 +; SI-NEXT: s_lshr_b32 s61, s11, 8 +; SI-NEXT: s_lshr_b32 s62, s9, 24 +; SI-NEXT: s_lshr_b32 s60, s9, 16 +; SI-NEXT: s_lshr_b32 s58, s9, 8 +; SI-NEXT: s_lshr_b32 s59, s8, 24 +; SI-NEXT: s_lshr_b32 s57, s8, 16 +; SI-NEXT: s_lshr_b32 s47, s8, 8 +; SI-NEXT: s_lshr_b32 s56, s7, 24 +; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: s_lshr_b32 s45, s7, 8 +; SI-NEXT: s_lshr_b32 s44, s6, 24 +; SI-NEXT: s_lshr_b32 s43, s6, 16 +; SI-NEXT: s_lshr_b32 s42, s6, 8 +; SI-NEXT: .LBB97_3: ; %end +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v30 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v25 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v18 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s89, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v31 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s77, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v19 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; 
SI-NEXT: s_lshl_b32 s14, s79, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v23 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s74, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v17 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s12, s76, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v20 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s63, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v16 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s10, s73, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: 
v_or_b32_e32 v7, v11, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s10, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v21 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v12 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v9 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s9, s62, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v13 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s59, 24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; 
SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s45, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s46, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s56, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s43, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v10 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s44, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, 
s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s35, v37, 3 +; SI-NEXT: v_readlane_b32 s34, v37, 2 +; SI-NEXT: v_readlane_b32 s31, v37, 1 +; SI-NEXT: v_readlane_b32 s30, v37, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB97_4: +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr58 +; 
SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_branch .LBB97_2 +; +; VI-LABEL: bitcast_v32i16_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v4, s30, 0 +; VI-NEXT: v_writelane_b32 v4, s31, 1 +; VI-NEXT: v_writelane_b32 v4, s34, 2 +; VI-NEXT: v_writelane_b32 v4, s35, 3 +; VI-NEXT: v_writelane_b32 v4, s36, 4 +; VI-NEXT: v_writelane_b32 v4, s37, 5 +; VI-NEXT: v_writelane_b32 v4, s38, 6 +; VI-NEXT: v_writelane_b32 v4, s39, 7 +; VI-NEXT: v_writelane_b32 v4, s48, 8 +; VI-NEXT: v_writelane_b32 v4, s49, 9 +; VI-NEXT: v_writelane_b32 v4, s50, 10 +; VI-NEXT: v_writelane_b32 v4, s51, 11 +; VI-NEXT: v_writelane_b32 v4, s52, 12 +; VI-NEXT: v_writelane_b32 v4, s53, 13 +; VI-NEXT: v_writelane_b32 v4, s54, 14 +; VI-NEXT: v_writelane_b32 v4, s55, 15 +; VI-NEXT: v_writelane_b32 v4, s64, 16 +; VI-NEXT: v_writelane_b32 v4, s65, 17 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 
v4, s66, 18 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: s_cbranch_scc0 .LBB97_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s29, 8 +; VI-NEXT: s_lshr_b32 s72, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 8 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s89, s25, 8 +; VI-NEXT: s_lshr_b32 s90, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s34, s23, 8 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s39, s21, 8 +; VI-NEXT: s_lshr_b32 s48, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s65, s17, 8 +; VI-NEXT: s_lshr_b32 s66, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 
s[44:45], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB97_3 +; VI-NEXT: .LBB97_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s16, 3 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s19, 3 +; VI-NEXT: s_add_i32 s16, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s18, 3 +; VI-NEXT: s_add_i32 s19, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s21, 3 +; VI-NEXT: s_add_i32 s18, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s20, 3 +; VI-NEXT: s_add_i32 s21, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s23, 3 +; VI-NEXT: s_add_i32 s20, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s22, 3 +; VI-NEXT: s_add_i32 s23, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s25, 3 +; VI-NEXT: s_add_i32 s22, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s24, 3 +; VI-NEXT: s_add_i32 s25, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s27, 3 +; VI-NEXT: s_add_i32 s24, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 
+; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s26, 3 +; VI-NEXT: s_add_i32 s27, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s29, 3 +; VI-NEXT: s_add_i32 s26, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s7, s28, 3 +; VI-NEXT: s_add_i32 s29, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s28, s6, 0x30000 +; VI-NEXT: s_and_b32 s6, s5, 0xffff0000 +; VI-NEXT: s_add_i32 s5, s5, 3 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s4, 0xffff0000 +; VI-NEXT: s_add_i32 s4, s4, 3 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s29, 8 +; VI-NEXT: s_lshr_b32 s72, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 8 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: 
s_lshr_b32 s89, s25, 8 +; VI-NEXT: s_lshr_b32 s90, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s34, s23, 8 +; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s39, s21, 8 +; VI-NEXT: s_lshr_b32 s48, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s52, s19, 8 +; VI-NEXT: s_lshr_b32 s53, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s65, s17, 8 +; VI-NEXT: s_lshr_b32 s66, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: .LBB97_3: ; %end +; VI-NEXT: s_and_b32 s7, s16, 0xff +; VI-NEXT: s_lshl_b32 s9, s67, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s66, 0xff +; VI-NEXT: s_lshl_b32 s11, s44, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_and_b32 s7, s17, 0xff +; VI-NEXT: s_lshl_b32 s9, s65, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s64, 0xff +; VI-NEXT: s_lshl_b32 s11, s55, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s54, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s53, 0xff +; VI-NEXT: s_lshl_b32 s11, s42, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: 
v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s19, 0xff +; VI-NEXT: s_lshl_b32 s9, s52, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s51, 0xff +; VI-NEXT: s_lshl_b32 s11, s50, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s49, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s48, 0xff +; VI-NEXT: s_lshl_b32 s11, s40, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s21, 0xff +; VI-NEXT: s_lshl_b32 s9, s39, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s38, 0xff +; VI-NEXT: s_lshl_b32 s11, s37, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s9, s36, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s35, 0xff +; VI-NEXT: s_lshl_b32 s11, s14, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s23, 0xff +; VI-NEXT: s_lshl_b32 s9, s34, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s31, 0xff +; VI-NEXT: s_lshl_b32 s11, s30, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: 
s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_lshl_b32 s9, s91, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s90, 0xff +; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s25, 0xff +; VI-NEXT: s_lshl_b32 s9, s89, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s88, 0xff +; VI-NEXT: s_lshl_b32 s11, s79, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s9, s78, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s77, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: s_lshl_b32 s9, s76, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s75, 0xff +; VI-NEXT: s_lshl_b32 s10, s74, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 
s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s9, s73, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s72, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s29, 0xff +; VI-NEXT: s_lshl_b32 s8, s63, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, s62, 0xff +; VI-NEXT: s_lshl_b32 s9, s61, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s7, s60, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s59, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: s_lshl_b32 s5, s58, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s57, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s67, v4, 19 +; VI-NEXT: v_readlane_b32 s66, v4, 18 +; VI-NEXT: v_readlane_b32 s65, v4, 17 +; VI-NEXT: v_readlane_b32 s64, v4, 16 +; VI-NEXT: 
v_readlane_b32 s55, v4, 15 +; VI-NEXT: v_readlane_b32 s54, v4, 14 +; VI-NEXT: v_readlane_b32 s53, v4, 13 +; VI-NEXT: v_readlane_b32 s52, v4, 12 +; VI-NEXT: v_readlane_b32 s51, v4, 11 +; VI-NEXT: v_readlane_b32 s50, v4, 10 +; VI-NEXT: v_readlane_b32 s49, v4, 9 +; VI-NEXT: v_readlane_b32 s48, v4, 8 +; VI-NEXT: v_readlane_b32 s39, v4, 7 +; VI-NEXT: v_readlane_b32 s38, v4, 6 +; VI-NEXT: v_readlane_b32 s37, v4, 5 +; VI-NEXT: v_readlane_b32 s36, v4, 4 +; VI-NEXT: v_readlane_b32 s35, v4, 3 +; VI-NEXT: v_readlane_b32 s34, v4, 2 +; VI-NEXT: v_readlane_b32 s31, v4, 1 +; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB97_4: +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; 
VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: s_branch .LBB97_2 +; +; GFX9-LABEL: bitcast_v32i16_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s59, s5, 8 +; GFX9-NEXT: s_lshr_b32 s58, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s72, s29, 8 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s27, 8 +; GFX9-NEXT: s_lshr_b32 s76, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 8 +; GFX9-NEXT: s_lshr_b32 s89, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s95, s23, 8 +; GFX9-NEXT: s_lshr_b32 s94, s22, 
16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s36, s21, 8 +; GFX9-NEXT: s_lshr_b32 s35, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s49, s19, 8 +; GFX9-NEXT: s_lshr_b32 s48, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s54, s17, 8 +; GFX9-NEXT: s_lshr_b32 s53, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB97_4 +; GFX9-NEXT: .LBB97_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v6, s27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] +; GFX9-NEXT: v_pk_add_u16 v10, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] +; GFX9-NEXT: v_pk_add_u16 v12, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword 
v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] +; GFX9-NEXT: v_pk_add_u16 v16, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] +; GFX9-NEXT: v_pk_add_u16 v20, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: s_branch .LBB97_5 +; GFX9-NEXT: .LBB97_3: +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: 
; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: s_branch .LBB97_2 +; GFX9-NEXT: .LBB97_4: +; GFX9-NEXT: v_mov_b32_e32 v21, s44 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s42 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v11, s20 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s22 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_mov_b32_e32 v7, s24 +; GFX9-NEXT: v_mov_b32_e32 v8, s25 +; GFX9-NEXT: v_mov_b32_e32 v5, s26 +; GFX9-NEXT: v_mov_b32_e32 v6, s27 +; GFX9-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-NEXT: v_mov_b32_e32 v4, s29 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v17, s55 +; GFX9-NEXT: v_mov_b32_e32 v62, s53 +; GFX9-NEXT: v_mov_b32_e32 v13, s54 +; GFX9-NEXT: v_mov_b32_e32 v60, s52 +; GFX9-NEXT: v_mov_b32_e32 v61, s51 +; GFX9-NEXT: v_mov_b32_e32 v58, s50 +; GFX9-NEXT: v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v57, s49 +; GFX9-NEXT: v_mov_b32_e32 v47, s39 +; GFX9-NEXT: v_mov_b32_e32 v56, s38 +; GFX9-NEXT: v_mov_b32_e32 v46, s37 +; GFX9-NEXT: v_mov_b32_e32 v45, s35 +; GFX9-NEXT: v_mov_b32_e32 v44, s36 +; GFX9-NEXT: v_mov_b32_e32 v42, s34 +; GFX9-NEXT: v_mov_b32_e32 v43, s31 +; GFX9-NEXT: v_mov_b32_e32 v41, s30 +; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: v_mov_b32_e32 v55, s95 +; GFX9-NEXT: v_mov_b32_e32 v53, s93 +; GFX9-NEXT: 
v_mov_b32_e32 v54, s92 +; GFX9-NEXT: v_mov_b32_e32 v52, s91 +; GFX9-NEXT: v_mov_b32_e32 v51, s89 +; GFX9-NEXT: v_mov_b32_e32 v50, s90 +; GFX9-NEXT: v_mov_b32_e32 v48, s88 +; GFX9-NEXT: v_mov_b32_e32 v49, s79 +; GFX9-NEXT: v_mov_b32_e32 v39, s78 +; GFX9-NEXT: v_mov_b32_e32 v38, s76 +; GFX9-NEXT: v_mov_b32_e32 v37, s77 +; GFX9-NEXT: v_mov_b32_e32 v35, s75 +; GFX9-NEXT: v_mov_b32_e32 v36, s74 +; GFX9-NEXT: v_mov_b32_e32 v34, s73 +; GFX9-NEXT: v_mov_b32_e32 v33, s63 +; GFX9-NEXT: v_mov_b32_e32 v32, s72 +; GFX9-NEXT: v_mov_b32_e32 v30, s62 +; GFX9-NEXT: v_mov_b32_e32 v31, s61 +; GFX9-NEXT: v_mov_b32_e32 v29, s60 +; GFX9-NEXT: v_mov_b32_e32 v28, s58 +; GFX9-NEXT: v_mov_b32_e32 v27, s59 +; GFX9-NEXT: v_mov_b32_e32 v14, s57 +; GFX9-NEXT: v_mov_b32_e32 v18, s56 +; GFX9-NEXT: v_mov_b32_e32 v23, s12 +; GFX9-NEXT: v_mov_b32_e32 v24, s10 +; GFX9-NEXT: v_mov_b32_e32 v25, s8 +; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: .LBB97_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v19, v62, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v57 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v12, v42, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX9-NEXT: v_or_b32_sdwa v8, v48, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, 
s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v6, v35, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; 
GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v32i16_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-TRUE16-NEXT: 
v_writelane_b32 v40, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 10 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s35, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[17:18] +; GFX11-TRUE16-NEXT: 
v_lshrrev_b64 v[28:29], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v22 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: s_branch .LBB97_5 +; GFX11-TRUE16-NEXT: .LBB97_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB97_2 +; GFX11-TRUE16-NEXT: .LBB97_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s39 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s95 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s93 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s91 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s89 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s79 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s77 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s75 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s73 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s47 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s8 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v28.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s4 +; GFX11-TRUE16-NEXT: .LBB97_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v87, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xff, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v85, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v82, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v22, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, v80, v71 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v69, v27 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v67, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v21, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v22, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v17, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v18, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v23, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v48 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v21, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v23, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v38, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v10, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v7, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v18, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 
0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v6 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[66:69], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v40, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 7 +; 
GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v32i16_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s42, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s60, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-FAKE16-NEXT: .LBB97_2: ; %cmp.true +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v35, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v23 +; GFX11-FAKE16-NEXT: s_branch .LBB97_5 +; GFX11-FAKE16-NEXT: .LBB97_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: s_branch .LBB97_2 +; GFX11-FAKE16-NEXT: .LBB97_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s0 :: v_dual_mov_b32 v24, s1 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v20, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s16 :: v_dual_mov_b32 v16, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s18 :: v_dual_mov_b32 v14, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s22 :: v_dual_mov_b32 v6, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s24 :: v_dual_mov_b32 v4, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s26 :: v_dual_mov_b32 v2, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s49 :: v_dual_mov_b32 v87, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s48 :: v_dual_mov_b32 v85, s38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s37 :: v_dual_mov_b32 v83, s36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s34 :: v_dual_mov_b32 v81, s35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s31 :: v_dual_mov_b32 v71, s30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, vcc_hi :: v_dual_mov_b32 v69, s94 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s95 :: v_dual_mov_b32 v67, s93 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s92 :: v_dual_mov_b32 v65, s91 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s89 :: v_dual_mov_b32 v55, s90 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s88 :: v_dual_mov_b32 v53, s79 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s78 :: v_dual_mov_b32 v51, s76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s77 :: v_dual_mov_b32 v49, s75 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s74 :: v_dual_mov_b32 v39, s73 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s63 :: v_dual_mov_b32 v37, s72 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s62 :: v_dual_mov_b32 v35, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s60 :: v_dual_mov_b32 v33, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s59 :: v_dual_mov_b32 v31, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s56 :: v_dual_mov_b32 v29, s47 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v7, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s46 :: v_dual_mov_b32 v11, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s44 :: 
v_dual_mov_b32 v17, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v21, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s10 :: v_dual_mov_b32 v26, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s6 :: v_dual_mov_b32 v28, s4 +; GFX11-FAKE16-NEXT: .LBB97_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v87, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, v23, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v85, v84 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v87, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v69, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v67, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, v24, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, v20, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v15, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v16, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, 
v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v28, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v14, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v5, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v17, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v8 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[82:85], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-FAKE16-NEXT: 
v_readlane_b32 s49, v40, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v32i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], 
s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 
offset:44 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v21 
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v23 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v19 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v29 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v27 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill 
-; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v33 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v57, 8, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr33 -; 
GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v12 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v20 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v28 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v56 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v46 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v47 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v62 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v61 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v41 -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v40 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v58 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v59 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v60 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 16, v20 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v21, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v23 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v25, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v27 -; GCN-NEXT: v_or_b32_e32 v23, v28, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 -; GCN-NEXT: v_or_b32_e32 v24, v24, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v33, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v34, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v35, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v32 -; GCN-NEXT: v_or_b32_e32 v15, v15, v44 -; GCN-NEXT: v_or_b32_e32 v16, v16, v45 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v40, v32, v11 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v11, v8 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v11, v9 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v11, v6 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v41, v11, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v7, v0 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v7, v2 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v17 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v11, v18 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v11, v19 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v19, v21 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v19, v22 -; GCN-NEXT: 
v_and_b32_e32 v19, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v25 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v23, v26 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v27 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v28 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v30 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v33, v1, v10 -; GCN-NEXT: v_or_b32_e32 v35, v3, v8 -; GCN-NEXT: v_or_b32_e32 v37, v4, v6 -; GCN-NEXT: v_or_b32_e32 v39, v5, v0 -; GCN-NEXT: v_or_b32_e32 v49, v7, v17 -; GCN-NEXT: v_or_b32_e32 v51, v11, v20 -; GCN-NEXT: v_or_b32_e32 v53, v19, v22 -; GCN-NEXT: v_or_b32_e32 v55, v23, v24 -; GCN-NEXT: v_or_b32_e32 v32, v27, v40 -; GCN-NEXT: v_or_b32_e32 v34, v28, v9 -; GCN-NEXT: v_or_b32_e32 v36, v29, v41 -; GCN-NEXT: v_or_b32_e32 v38, v12, v2 -; GCN-NEXT: v_or_b32_e32 v48, v13, v18 -; GCN-NEXT: v_or_b32_e32 v50, v14, v21 -; GCN-NEXT: v_or_b32_e32 v52, v15, v25 -; GCN-NEXT: v_or_b32_e32 v54, v16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; 
GCN-NEXT: v_alignbit_b32 v1, v33, v40, 16 -; GCN-NEXT: v_alignbit_b32 v5, v35, v9, 16 -; GCN-NEXT: v_alignbit_b32 v9, v37, v41, 16 -; GCN-NEXT: v_alignbit_b32 v13, v39, v2, 16 -; GCN-NEXT: v_alignbit_b32 v17, v49, v18, 16 -; GCN-NEXT: v_alignbit_b32 v21, v51, v21, 16 -; GCN-NEXT: v_alignbit_b32 v25, v53, v25, 16 -; GCN-NEXT: v_alignbit_b32 v29, v55, v26, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; kill: killed $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: 
killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; kill: killed $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: .LBB49_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; 
GCN-NEXT: s_cbranch_execz .LBB49_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v45, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v24 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v57, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v44, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v41 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v58 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v46 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v30 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 
v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v26, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v27, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; GCN-NEXT: buffer_load_dword v29, 
off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v32, v2 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v32, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v32, v11 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: 
v_add_i32_e32 v23, vcc, s7, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s7, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v32, v30 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: 
v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v1, v31, v1 -; GCN-NEXT: v_or_b32_e32 v3, v8, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v0, v0, v7 -; GCN-NEXT: v_or_b32_e32 v4, v6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v11, v9 -; GCN-NEXT: v_or_b32_e32 v6, v15, v13 -; GCN-NEXT: v_or_b32_e32 v7, v17, v16 -; GCN-NEXT: v_or_b32_e32 v8, v21, v19 -; GCN-NEXT: v_or_b32_e32 v9, v24, v23 -; GCN-NEXT: v_or_b32_e32 v11, v18, v25 -; GCN-NEXT: v_or_b32_e32 v13, v22, v20 -; GCN-NEXT: v_or_b32_e32 v10, v10, v26 -; GCN-NEXT: v_or_b32_e32 v12, v14, v12 -; GCN-NEXT: v_or_b32_e32 v14, v28, v27 -; GCN-NEXT: v_or_b32_e32 v15, v30, v29 -; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v55, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v53, vcc, s6, v0 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v49, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v38, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v39, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v36, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v34, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v35, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v33, vcc, s6, v15 -; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 16 -; GCN-NEXT: v_alignbit_b32 v5, v35, v34, 16 -; GCN-NEXT: v_alignbit_b32 v9, v37, v36, 16 -; GCN-NEXT: v_alignbit_b32 v13, v39, v38, 16 -; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16 -; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16 -; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16 -; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v39 -; 
GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; GCN-NEXT: .LBB49_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v32 -; GCN-NEXT: v_mov_b32_e32 v2, v33 -; GCN-NEXT: v_mov_b32_e32 v4, v34 -; GCN-NEXT: v_mov_b32_e32 v6, v35 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mov_b32_e32 v8, v36 -; GCN-NEXT: v_mov_b32_e32 v10, v37 -; GCN-NEXT: v_mov_b32_e32 v12, v38 -; GCN-NEXT: v_mov_b32_e32 v14, v39 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mov_b32_e32 v16, v48 -; GCN-NEXT: v_mov_b32_e32 v18, v49 -; GCN-NEXT: v_mov_b32_e32 v20, v50 -; GCN-NEXT: v_mov_b32_e32 v22, v51 -; GCN-NEXT: v_mov_b32_e32 v24, v52 -; GCN-NEXT: v_mov_b32_e32 v26, v53 -; GCN-NEXT: v_mov_b32_e32 v28, v54 -; GCN-NEXT: v_mov_b32_e32 v30, v55 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v32i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 +; SI-NEXT: 
buffer_load_dword v37, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v17 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v34 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v25 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39 
+; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v48 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v49 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v50 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v38 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; 
implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v11, v9, v7 +; 
SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v7, v9 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v7, v1 +; SI-NEXT: v_or_b32_e32 v49, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v48, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v15, v13, v9 +; SI-NEXT: v_alignbit_b32 v9, v49, v15, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v19, v13, v9 +; SI-NEXT: v_or_b32_e32 v50, v8, v19 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 
offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v9, v50, v0, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v16, v8 +; SI-NEXT: v_or_b32_e32 v51, v6, v8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v22, v16 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v22, v22, v60 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v14, v6 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v52, v14, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v18, v14 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v13, v51, v6, 16 +; SI-NEXT: v_alignbit_b32 v17, v52, v14, 16 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v18, v18, v26 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v10, 
v10, v20 +; SI-NEXT: v_or_b32_e32 v53, v18, v10 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v42, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v24, v58, v23 +; SI-NEXT: v_or_b32_e32 v55, v22, v24 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v44 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v54, v18, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v56, v18 +; SI-NEXT: v_alignbit_b32 v21, v53, v2, 16 +; SI-NEXT: v_alignbit_b32 v25, v54, v18, 16 +; SI-NEXT: v_alignbit_b32 v29, v55, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v12, v22 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v37, v12, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: 
v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v35, v11, v15 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v33, v11, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v32, v0, v6 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v34, v0, v14 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v36, v0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_and_b32_e32 
v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v38, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v39, v0, v5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed 
$vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB98_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v56, v3 +; SI-NEXT: v_alignbit_b32 v29, v55, v39, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 
+; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v38, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v42, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v25, v54, v38, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v54 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v36, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v10, v2 +; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v53, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v18, v2 +; SI-NEXT: v_alignbit_b32 v21, v53, v36, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v22, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v52, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v17, v52, v34, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v16, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v51, vcc, s7, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v13, v51, v32, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 
v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v50, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v9, v50, v33, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v49, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v49 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v37, vcc, s7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v48, vcc, s7, v0 +; SI-NEXT: v_alignbit_b32 v1, v48, v37, 16 +; SI-NEXT: v_alignbit_b32 v0, v49, v35, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: .LBB98_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v37 +; SI-NEXT: v_mov_b32_e32 v2, v48 +; SI-NEXT: v_mov_b32_e32 v4, v35 +; SI-NEXT: v_mov_b32_e32 v6, v49 +; SI-NEXT: v_mov_b32_e32 v8, v33 +; SI-NEXT: v_mov_b32_e32 v10, v50 +; SI-NEXT: v_mov_b32_e32 v12, v32 +; SI-NEXT: v_mov_b32_e32 v14, v51 +; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: v_mov_b32_e32 v18, v52 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v24, v38 +; SI-NEXT: v_mov_b32_e32 v26, v54 +; SI-NEXT: v_mov_b32_e32 v28, 
v39 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v32i16: ; VI: ; %bb.0: @@ -35093,7 +70000,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_2 +; VI-NEXT: s_cbranch_execz .LBB98_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -35256,9 +70163,9 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: .LBB49_2: ; %Flow +; VI-NEXT: .LBB98_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_4 +; VI-NEXT: s_cbranch_execz .LBB98_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v38 @@ -35413,7 +70320,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v23 ; VI-NEXT: v_or_b32_e32 v3, v3, v19 -; VI-NEXT: .LBB49_4: ; %end +; VI-NEXT: .LBB98_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -35557,7 +70464,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; 
GFX9-NEXT: s_cbranch_execz .LBB49_2 +; GFX9-NEXT: s_cbranch_execz .LBB98_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -35722,9 +70629,9 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: .LBB49_2: ; %Flow +; GFX9-NEXT: .LBB98_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB49_4 +; GFX9-NEXT: s_cbranch_execz .LBB98_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 @@ -35877,7 +70784,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 -; GFX9-NEXT: .LBB49_4: ; %end +; GFX9-NEXT: .LBB98_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -36008,15 +70915,15 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-TRUE16-NEXT: .LBB49_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-TRUE16-NEXT: .LBB98_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.l @@ -36147,8 +71054,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-TRUE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 @@ -36378,15 +71285,15 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-FAKE16-NEXT: .LBB49_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-FAKE16-NEXT: .LBB98_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 @@ -36533,8 +71440,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr118 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-FAKE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-FAKE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 @@ -36704,315 +71611,2354 @@ end: ret <32 x i16> %phi } +define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v32i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 +; SI-NEXT: v_readfirstlane_b32 s15, v27 +; SI-NEXT: v_readfirstlane_b32 s40, v26 +; SI-NEXT: v_readfirstlane_b32 s12, v19 +; SI-NEXT: v_readfirstlane_b32 s13, v18 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 
24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s44, v31 +; SI-NEXT: v_readfirstlane_b32 s45, v32 +; SI-NEXT: v_readfirstlane_b32 s42, v33 +; SI-NEXT: v_readfirstlane_b32 s43, v34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v35 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v36 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v37 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v38 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v48 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v59 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v61 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v62 +; SI-NEXT: s_cbranch_scc0 .LBB99_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s14, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s14, s19, 24 +; SI-NEXT: s_or_b32 s4, s14, s4 +; SI-NEXT: s_and_b32 s14, s28, 0xff +; SI-NEXT: s_lshl_b32 s46, s29, 8 +; SI-NEXT: s_or_b32 s14, s14, s46 +; SI-NEXT: s_and_b32 s46, s6, 0xff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_lshl_b32 s47, s7, 24 +; SI-NEXT: s_or_b32 s57, s47, s46 +; SI-NEXT: s_and_b32 s46, s26, 0xff +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_lshl_b32 s47, s27, 24 +; SI-NEXT: s_or_b32 s46, s47, s46 +; SI-NEXT: s_and_b32 s47, 
s16, 0xff +; SI-NEXT: s_lshl_b32 s56, s17, 8 +; SI-NEXT: s_or_b32 s47, s47, s56 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 +; SI-NEXT: s_or_b32 s47, s47, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s56, s25, 8 +; SI-NEXT: v_or_b32_e32 v9, v9, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v11, v2, v10 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, s46 +; SI-NEXT: v_or_b32_e32 v10, v9, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v16 +; SI-NEXT: s_or_b32 s46, s4, s46 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s56, s8, 8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v13, v13, v49 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: v_or_b32_e32 v15, v3, v9 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v19, v7, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v36, v13, v19 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v35, s4, v15 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s56, s10, 8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v17, v17, v53 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: v_or_b32_e32 v23, v51, v13 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v27, v52, v18 +; SI-NEXT: v_or_b32_e32 v62, v47, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v41 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v18, v17, v27 +; SI-NEXT: v_and_b32_e32 v17, 0xff, 
v20 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v37, s4, v23 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s56, s12, 8 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v21, v21, v43 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v33, v58, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xff, v45 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: v_or_b32_e32 v25, v54, v17 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v31, v42, v26 +; SI-NEXT: v_or_b32_e32 v32, v32, v60 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v38, v21, v31 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v63, v59, v34 +; SI-NEXT: v_or_b32_e32 v39, s4, v25 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s56, s15, 8 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v48, v32, v63 +; SI-NEXT: v_and_b32_e32 v32, 0xff, v56 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: v_or_b32_e32 v29, v44, v21 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v26, v26, v57 +; SI-NEXT: v_or_b32_e32 v34, v61, v32 +; SI-NEXT: v_or_b32_e32 v32, s4, v29 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s56, s42, 8 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_alignbit_b32 v17, v18, v25, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v33, 16 +; SI-NEXT: v_or_b32_e32 v33, s4, v33 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s56, s44, 8 +; SI-NEXT: s_and_b32 
s14, s14, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s56 +; SI-NEXT: s_or_b32 s14, s14, s57 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_alignbit_b32 v1, s41, v1, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v15, 16 +; SI-NEXT: v_alignbit_b32 v13, v36, v23, 16 +; SI-NEXT: v_alignbit_b32 v21, v38, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, v48, v34, 16 +; SI-NEXT: v_or_b32_e32 v34, s4, v34 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s57, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: s_cbranch_execnz .LBB99_3 +; SI-NEXT: .LBB99_2: ; %cmp.true +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v61, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, 0x3000000, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v45 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: v_add_i32_e32 v48, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 
0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v33, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v55 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s15, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v32, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v42, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: v_add_i32_e32 v38, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; 
SI-NEXT: v_add_i32_e32 v39, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v52, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v37, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v1, v49, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: v_add_i32_e32 v36, vcc, 0x3000000, v1 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; 
SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s46, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s14, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_i32_e32 v35, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: s_add_i32 s47, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; 
SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: s_add_i32 s41, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v0, s47 +; SI-NEXT: v_alignbit_b32 v1, s41, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s46 +; SI-NEXT: v_alignbit_b32 v5, s14, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, v10, v35, 16 +; SI-NEXT: v_alignbit_b32 v13, v36, v37, 16 +; SI-NEXT: v_alignbit_b32 v17, v18, v39, 16 +; SI-NEXT: v_alignbit_b32 v21, v38, v32, 16 +; SI-NEXT: v_alignbit_b32 v25, v26, v33, 16 +; SI-NEXT: v_alignbit_b32 v29, v48, v34, 16 +; SI-NEXT: s_lshr_b32 s56, s41, 16 +; SI-NEXT: s_lshr_b32 s57, s14, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v48 +; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, s47 +; SI-NEXT: v_mov_b32_e32 v2, s41 +; SI-NEXT: v_mov_b32_e32 v3, s56 +; SI-NEXT: v_mov_b32_e32 v4, s46 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s57 +; SI-NEXT: v_mov_b32_e32 v8, v35 +; SI-NEXT: v_mov_b32_e32 v12, v37 +; SI-NEXT: v_mov_b32_e32 v14, v36 +; SI-NEXT: v_mov_b32_e32 v16, v39 +; SI-NEXT: v_mov_b32_e32 v20, v32 +; SI-NEXT: v_mov_b32_e32 v22, v38 +; SI-NEXT: v_mov_b32_e32 v24, v33 +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB99_4: +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr48 +; 
SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB99_2 +; +; VI-LABEL: bitcast_v64i8_to_v32i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, v20 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v34, v12 +; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], 
s32 offset:4 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v51, v23 +; VI-NEXT: v_mov_b32_e32 v30, v26 +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v51 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v31 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v33 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 +; VI-NEXT: s_cbranch_scc0 .LBB99_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_or_b32_sdwa v1, v34, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v49, v7 +; VI-NEXT: 
v_or_b32_sdwa v3, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v17, v11 +; VI-NEXT: v_mov_b32_e32 v19, v13 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v21, v15 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: v_mov_b32_e32 v25, v23 +; VI-NEXT: v_mov_b32_e32 v48, v51 +; VI-NEXT: 
v_mov_b32_e32 v23, v26 +; VI-NEXT: v_mov_b32_e32 v26, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v34, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; 
VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v30, v34 +; VI-NEXT: s_cbranch_execnz .LBB99_3 +; VI-NEXT: .LBB99_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 +; VI-NEXT: v_or_b32_sdwa v13, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v41 +; VI-NEXT: v_or_b32_sdwa v12, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 +; VI-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v11, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 
3, v28 +; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 +; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 +; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v7, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v6, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v5, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, 
vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v2 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: .LBB99_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB99_4: +; VI-NEXT: v_mov_b32_e32 v25, v23 +; VI-NEXT: v_mov_b32_e32 v23, v26 +; VI-NEXT: v_mov_b32_e32 v26, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v51 +; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v21, v15 +; VI-NEXT: v_mov_b32_e32 v19, v13 +; VI-NEXT: v_mov_b32_e32 v17, v11 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v49, v7 +; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB99_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v32i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, 
s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v34, v30 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v57, 
off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v51, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v38 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v33 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v49 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded 
Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, 
v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v16, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: 
v_mov_b32_e32 v55, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v42, v15 +; GFX9-NEXT: v_mov_b32_e32 v27, v25 +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: v_mov_b32_e32 v39, v26 +; 
GFX9-NEXT: v_mov_b32_e32 v35, v28 +; GFX9-NEXT: v_mov_b32_e32 v54, v31 +; GFX9-NEXT: v_mov_b32_e32 v31, v51 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v18, v22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v20, v24 +; GFX9-NEXT: s_cbranch_execnz .LBB99_3 +; GFX9-NEXT: .LBB99_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 
v22, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v16, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v22 +; GFX9-NEXT: .LBB99_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB99_4: +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v54, v31 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_mov_b32_e32 v39, v26 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, v22 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v35, v28 +; GFX9-NEXT: v_mov_b32_e32 v37, v24 +; GFX9-NEXT: v_mov_b32_e32 v31, v51 +; GFX9-NEXT: v_mov_b32_e32 v27, v25 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v42, v15 +; GFX9-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-NEXT: v_mov_b32_e32 v55, v11 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB99_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: 
s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8 +; 
GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v96, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v86, 16, v87 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v96, 16, v97 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-TRUE16-NEXT: .LBB99_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; 
GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 
v3, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v22, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, 
v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB99_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB99_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB99_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 
v83, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v39 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v31 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v5, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v6, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v14, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-FAKE16-NEXT: .LBB99_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v22 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v29, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v27, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v25, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v55, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v54, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v53, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v33 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v52, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v51, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v50, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v49, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v48, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 
v39, v4 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB99_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB99_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB99_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v31 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 
-; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 
v22, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB50_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v58 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v6, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 
0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v28 -; GCN-NEXT: 
v_lshlrev_b32_e32 v4, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v55 -; GCN-NEXT: .LBB50_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, 
s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v24 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v25 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v26 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v27 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v29 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v30 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; 
SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v42 +; SI-NEXT: 
v_lshlrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB100_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 
+; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v58 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_add_f32_e32 v11, 
0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, 
v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: .LBB100_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v32bf16: ; VI: ; %bb.0: @@ -37021,7 +73967,7 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: s_cbranch_execz .LBB100_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 @@ -37072,68 +74018,535 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v19, v2 ; VI-NEXT: v_or_b32_e32 v1, v18, v1 ; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB50_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB100_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f16_to_v32bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB100_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB100_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f16_to_v32bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB100_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 
op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB100_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + +define inreg <32 x bfloat> @bitcast_v32f16_to_v32bf16_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v32, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 +; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: .LBB101_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v26, 
0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: 
v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; SI-NEXT: 
v_lshlrev_b32_e32 v23, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB101_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB101_2 +; +; VI-LABEL: bitcast_v32f16_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, 
s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB101_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB101_3 +; VI-NEXT: .LBB101_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 +; VI-NEXT: v_add_f16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v19, v15 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v14 +; VI-NEXT: v_add_f16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v19, v14 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v13 +; VI-NEXT: v_add_f16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v19, v13 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v12 +; VI-NEXT: v_add_f16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v19, v12 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 +; VI-NEXT: v_add_f16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v19, v11 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 +; VI-NEXT: v_add_f16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v19, v10 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v19, v9 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v19, v8 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v19, v7 +; 
VI-NEXT: v_add_f16_e32 v19, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v19, v6 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v19, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v19, v4 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v17, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_or_b32_e32 v2, v19, v2 +; VI-NEXT: v_or_b32_e32 v1, v18, v1 +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: .LBB101_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB101_4: +; VI-NEXT: s_branch .LBB101_2 ; -; GFX9-LABEL: bitcast_v32f16_to_v32bf16: +; GFX9-LABEL: bitcast_v32f16_to_v32bf16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB50_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_movk_i32 s6, 0x200 -; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] -; GFX9-NEXT: 
v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB50_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB101_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB101_3 +; GFX9-NEXT: .LBB101_2: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB101_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB101_4: +; GFX9-NEXT: s_branch .LBB101_2 ; -; GFX11-LABEL: bitcast_v32f16_to_v32bf16: +; GFX11-LABEL: bitcast_v32f16_to_v32bf16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB50_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; 
GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB50_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s13, s1 +; GFX11-NEXT: s_mov_b32 s12, s0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX11-NEXT: .LBB101_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s12 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB101_3: +; GFX11-NEXT: s_branch .LBB101_2 +; GFX11-NEXT: .LBB101_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: 
v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -37153,346 +74566,347 @@ end: } define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v63 -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v31 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: 
$vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v36 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v38 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v49 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v47 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v59 -; GCN-NEXT: 
v_lshrrev_b32_e32 v28, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v61 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v62 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; 
implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB51_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v59 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v56 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v46 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v45 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v42 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v41 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v53 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v48 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v39 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, 
v38 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v33 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NEXT: 
v_lshrrev_b32_e32 v36, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v52 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v20, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: .LBB51_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v32bf16_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v9 +; 
SI-NEXT: v_mul_f32_e32 v50, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v30 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; 
SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: 
$vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB102_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v0, 
0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v0, 
0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: .LBB102_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v32f16: ; VI: ; %bb.0: @@ -37501,7 +74915,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: s_cbranch_execz .LBB102_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -37792,7 +75206,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 -; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 
s[30:31] ; @@ -37803,7 +75217,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: s_cbranch_execz .LBB102_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -38047,7 +75461,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v2, v18, s6 ; GFX9-NEXT: v_perm_b32 v1, v1, v17, s6 ; GFX9-NEXT: v_perm_b32 v0, v0, v16, s6 -; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: .LBB102_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -38059,7 +75473,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v6 @@ -38330,7 +75744,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v13, v30 ; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v14, v29 -; GFX11-TRUE16-NEXT: .LBB51_2: ; %end +; GFX11-TRUE16-NEXT: .LBB102_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -38342,7 +75756,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v0 @@ -38592,7 +76006,7 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v29, v32, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v11, v11, v27, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB51_2: ; %end +; GFX11-FAKE16-NEXT: .LBB102_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -38612,761 +76026,2489 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v10 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v13 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v15 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v40, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s29 +; SI-NEXT: s_cbranch_scc0 .LBB103_4 +; SI-NEXT: ; %bb.1: ; 
%cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: 
v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: s_cbranch_execnz .LBB103_3 +; SI-NEXT: .LBB103_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: 
v_lshrrev_b32_e32 v22, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: 
v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: 
v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB103_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 
+; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB103_2 +; +; VI-LABEL: bitcast_v32bf16_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_readfirstlane_b32 s30, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s31, v1 +; VI-NEXT: s_cbranch_scc0 .LBB103_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB103_4 +; VI-NEXT: .LBB103_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_lshl_b32 s5, s30, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s5, s30, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s5, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_lshl_b32 s5, s31, 16 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s5, v1 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_and_b32 s5, s31, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s5, v1 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: s_lshl_b32 s4, s29, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: 
s_lshl_b32 s4, s28, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s27, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s26, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s25, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s24, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 
0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: 
v_alignbit_b32 v5, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v1 +; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s4, v1 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: 
v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 +; VI-NEXT: v_add_f32_e32 v16, s4, v1 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_add_f32_e32 v17, s4, v1 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 +; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; VI-NEXT: s_branch .LBB103_5 +; VI-NEXT: .LBB103_3: +; VI-NEXT: s_branch .LBB103_2 +; VI-NEXT: .LBB103_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, 
s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v14, s30 +; VI-NEXT: v_mov_b32_e32 v15, s31 +; VI-NEXT: .LBB103_5: ; %end +; VI-NEXT: v_readlane_b32 s31, v20, 1 +; VI-NEXT: v_readlane_b32 s30, v20, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v20, s30, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_writelane_b32 v20, s31, 1 +; GFX9-NEXT: v_readfirstlane_b32 s30, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s31, v1 +; GFX9-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB103_4 +; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: s_and_b32 s5, s30, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s5, s30, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s5, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b32 s5, s31, 0xffff0000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s5, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; 
GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s5, s31, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s5, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX9-NEXT: s_and_b32 s4, s29, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v3, 16, v4 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s29, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s28, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s28, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s27, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s27, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s26, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s26, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 
v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s25, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s25, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s24, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
+; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; 
GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, 
v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v17, vcc +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v2 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v17, v17, v1 +; GFX9-NEXT: v_add_u32_e32 v17, 0x7fff, v17 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: 
s_and_b32 s4, s16, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v17, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v17, s4, v0 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v17 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v18, v18, v0 +; GFX9-NEXT: v_add_u32_e32 v18, 0x7fff, v18 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v16 +; GFX9-NEXT: s_branch .LBB103_5 +; GFX9-NEXT: .LBB103_3: +; GFX9-NEXT: s_branch .LBB103_2 +; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: .LBB103_5: ; %end +; GFX9-NEXT: v_readlane_b32 s31, v20, 1 +; GFX9-NEXT: v_readlane_b32 s30, v20, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded 
Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-TRUE16-NEXT: .LBB103_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s12, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s13, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s14, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 
0x400000, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v33, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v9, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s15, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v4 :: v_dual_add_nc_u32 v3, v6, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 
0x400000, v5 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v3, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_nc_u32 v5, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v5, v7 :: v_dual_add_nc_u32 v5, v8, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, v6, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: 
v_dual_add_nc_u32 v9, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc_lo +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v10, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, v9, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v12, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v7, v9, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v13, v8 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s20, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v9, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v12, v13 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s21, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v10, v10, v14 :: v_dual_add_nc_u32 v11, 0x7fff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v21, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, v10, v13 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s22, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, v14, v15 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, v13, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v14, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s23, 16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, v26, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v12, 16, 1 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, v27, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s24, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v26, v27 +; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s25, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, v15, v13 +; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v26, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v14, v14, v29 :: v_dual_add_nc_u32 v15, 0x7fff, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, v30, v26 +; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v27, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v15, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v26 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, v14, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s26, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v35, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v30, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v14, v32, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, v34, v29 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, v36, v33 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, v26, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, v32, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v27, v37 :: v_dual_add_nc_u32 v26, 0x7fff, v26 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 
0x7fff, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v26, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v30, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v28, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v31, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v32, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v23, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v22, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v18, 16, 
v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v17, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v16, 16, v23 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB103_3: +; GFX11-TRUE16-NEXT: s_branch .LBB103_2 +; GFX11-TRUE16-NEXT: .LBB103_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v32f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s1 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-FAKE16-NEXT: .LBB103_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s0, s12, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s12, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s13, 0xffff0000 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s13, 16 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 
0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s14, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s14, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v33, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v9, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s15, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s15, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v3, v4 :: v_dual_add_nc_u32 v3, v6, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v8, 16, 1 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v33, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v3, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s16, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s17, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v7 :: v_dual_add_nc_u32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s18, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v11, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v8 :: v_dual_add_nc_u32 v8, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; 
GFX11-FAKE16-NEXT: s_and_b32 s0, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s19, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v12, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v13, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s20, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v11 :: v_dual_add_nc_u32 v10, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 
0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v12, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s21, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v14 :: v_dual_add_nc_u32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v10, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s22, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_add_f32_e64 
v11, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v15 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v12 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v14, 16, 1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s23, 16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v11 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, v26, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v13, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, v27, v12 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s24, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v13, v15 :: v_dual_add_nc_u32 v14, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v27, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v28, 16, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v26, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s25, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, v15, v13 +; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v27, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v29 :: v_dual_add_nc_u32 v15, 0x7fff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v30, v26 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v27, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v15, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x7fff, v30 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v26 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s27, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v14, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s26, 16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 +; 
GFX11-FAKE16-NEXT: v_add_f32_e64 v35, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v32, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v34, v29 +; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v33 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, v26, v30 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v35 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v30 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v37 :: v_dual_add_nc_u32 v26, 0x7fff, v26 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x7fff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v30, 16, v27 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v31, 16, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v26, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v32, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v5, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v24, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v23, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v21, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v18, 16, v19 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v17, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v16, 16, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v23 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB103_3: +; GFX11-FAKE16-NEXT: s_branch .LBB103_2 +; GFX11-FAKE16-NEXT: .LBB103_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v32f16_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v39 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; 
GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: 
killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; kill: killed $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v1 -; GCN-NEXT: v_bfe_u32 v33, v31, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v33, v8, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v33, v6, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_bfe_u32 v33, v5, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v33, v4, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v33, v3, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v33, v2, 8, 8 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v57, v34, v10 -; GCN-NEXT: v_or_b32_e32 v56, v32, v11 -; GCN-NEXT: v_or_b32_e32 v47, v35, v12 -; GCN-NEXT: v_or_b32_e32 v45, v7, v14 -; GCN-NEXT: v_or_b32_e32 v41, v36, v20 -; GCN-NEXT: v_or_b32_e32 v53, v9, v23 -; GCN-NEXT: v_or_b32_e32 v48, v37, v25 -; GCN-NEXT: v_or_b32_e32 v39, v13, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v33, v16, v29 -; GCN-NEXT: v_or_b32_e32 v30, v18, v30 -; GCN-NEXT: v_or_b32_e32 v25, v15, v38 -; GCN-NEXT: v_or_b32_e32 v23, v22, v49 -; GCN-NEXT: v_or_b32_e32 v20, v17, v24 -; GCN-NEXT: v_or_b32_e32 v14, v27, v50 -; GCN-NEXT: v_or_b32_e32 v12, v21, v28 -; GCN-NEXT: v_or_b32_e32 v10, v19, v51 -; GCN-NEXT: v_alignbit_b32 v42, v56, v57, 24 -; GCN-NEXT: v_alignbit_b32 v43, v56, v57, 16 -; GCN-NEXT: v_alignbit_b32 v61, v56, v57, 8 -; GCN-NEXT: v_alignbit_b32 v55, v45, v47, 24 -; GCN-NEXT: v_alignbit_b32 v40, v45, v47, 16 -; GCN-NEXT: v_alignbit_b32 v60, v45, v47, 8 -; GCN-NEXT: v_alignbit_b32 v7, v53, v41, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v53, v41, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v59, v53, v41, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v39, v48, 24 -; GCN-NEXT: buffer_store_dword 
v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v39, v48, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v46, v39, v48, 8 -; GCN-NEXT: v_alignbit_b32 v29, v30, v33, 24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v30, v33, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v26, v30, v33, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; 
GCN-NEXT: v_lshrrev_b32_e32 v11, 8, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 8, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v23 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v14 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: .LBB52_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 
v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; 
GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_lshlrev_b32_e32 
v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v31 -; GCN-NEXT: v_bfe_u32 v12, v31, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v12, v8, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v12, v6, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v12, v5, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v12, v4, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v12, v3, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v12, v2, 8, 8 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v11, v10 -; GCN-NEXT: v_or_b32_e32 v10, v33, v30 -; GCN-NEXT: v_or_b32_e32 v20, v14, v34 -; GCN-NEXT: v_or_b32_e32 v14, v17, v35 -; GCN-NEXT: v_or_b32_e32 v25, v15, v19 -; GCN-NEXT: v_or_b32_e32 v23, v21, 
v36 -; GCN-NEXT: v_or_b32_e32 v33, v16, v37 -; GCN-NEXT: v_or_b32_e32 v30, v18, v38 -; GCN-NEXT: v_or_b32_e32 v48, v24, v22 -; GCN-NEXT: v_or_b32_e32 v39, v13, v39 -; GCN-NEXT: v_or_b32_e32 v41, v26, v49 -; GCN-NEXT: v_or_b32_e32 v53, v9, v50 -; GCN-NEXT: v_or_b32_e32 v47, v27, v51 -; GCN-NEXT: v_or_b32_e32 v45, v7, v52 -; GCN-NEXT: v_or_b32_e32 v57, v29, v28 -; GCN-NEXT: v_or_b32_e32 v56, v32, v54 -; GCN-NEXT: v_alignbit_b32 v42, v56, v57, 24 -; GCN-NEXT: v_alignbit_b32 v43, v56, v57, 16 -; GCN-NEXT: v_alignbit_b32 v61, v56, v57, 8 -; GCN-NEXT: v_alignbit_b32 v55, v45, v47, 24 -; GCN-NEXT: v_alignbit_b32 v40, v45, v47, 16 -; GCN-NEXT: v_alignbit_b32 v60, v45, v47, 8 -; GCN-NEXT: v_alignbit_b32 v7, v53, v41, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v53, v41, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v59, v53, v41, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v39, v48, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v39, v48, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v46, v39, v48, 8 -; GCN-NEXT: v_alignbit_b32 v29, v30, v33, 24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v30, v33, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v26, v30, v33, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v11, 8, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 8, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v23 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v14 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: .LBB52_4: ; 
%end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v61 -; GCN-NEXT: v_or_b32_e32 v13, v7, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v11 -; GCN-NEXT: v_or_b32_e32 v15, v7, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v60 -; GCN-NEXT: v_or_b32_e32 v16, v7, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v63 -; GCN-NEXT: v_or_b32_e32 v17, v7, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v59 -; GCN-NEXT: v_or_b32_e32 v19, v7, v9 -; GCN-NEXT: v_and_b32_e32 v37, 0xff, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v42 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v31 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v62 -; GCN-NEXT: v_or_b32_e32 v21, v7, v9 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v55 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v46 -; GCN-NEXT: v_or_b32_e32 v22, v9, v11 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v58 -; GCN-NEXT: v_or_b32_e32 v24, v8, v11 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v11 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v26 -; GCN-NEXT: v_or_b32_e32 v26, v11, v18 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v44 -; GCN-NEXT: v_or_b32_e32 v28, v6, v18 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v18 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v25 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; GCN-NEXT: v_or_b32_e32 v25, v18, v25 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v23 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GCN-NEXT: v_or_b32_e32 v23, v5, v23 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v29 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: buffer_load_dword 
v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; GCN-NEXT: v_or_b32_e32 v29, v20, v29 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GCN-NEXT: v_or_b32_e32 v14, v4, v14 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v47, 0xff, v53 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v53 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v53 -; GCN-NEXT: v_or_b32_e32 v53, v12, v53 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GCN-NEXT: v_or_b32_e32 v41, v3, v10 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v10 -; GCN-NEXT: buffer_load_dword v10, 
off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 24, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_or_b32_e32 v37, v38, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GCN-NEXT: v_or_b32_e32 v34, v34, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v59, 0xff, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v45, 24, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v35 -; GCN-NEXT: v_or_b32_e32 v35, v36, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v60, 24, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 
v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v27, v27, v39 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v31, v31, v49 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v33, v33, v54 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v39, v50, v42 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v49, v52, v44 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v50, v55, v46 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v51, v40, v47 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v52, v43, v56 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v38, v38, v57 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v54, v45, v58 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v41 -; GCN-NEXT: v_or_b32_e32 v36, v60, v36 -; GCN-NEXT: v_or_b32_e32 v13, v13, v37 -; GCN-NEXT: v_or_b32_e32 v15, v15, v34 -; GCN-NEXT: v_or_b32_e32 v16, v16, v35 -; GCN-NEXT: v_or_b32_e32 v17, v17, v48 -; GCN-NEXT: v_or_b32_e32 v19, v19, v27 -; GCN-NEXT: v_or_b32_e32 v21, v21, v31 -; GCN-NEXT: v_or_b32_e32 v22, v22, v30 -; GCN-NEXT: v_or_b32_e32 v24, v24, v33 -; GCN-NEXT: v_or_b32_e32 v26, v26, v39 -; GCN-NEXT: v_or_b32_e32 v27, v28, v49 -; GCN-NEXT: v_or_b32_e32 v25, v25, v50 -; GCN-NEXT: v_or_b32_e32 v23, v23, v51 -; GCN-NEXT: v_or_b32_e32 v28, v29, v52 -; GCN-NEXT: v_or_b32_e32 v14, v14, v38 -; GCN-NEXT: v_or_b32_e32 v29, v53, v54 -; GCN-NEXT: v_or_b32_e32 v30, v55, v36 -; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v21, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32f16_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: 
$vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, 
v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_or_b32_e32 v54, v33, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v31 +; SI-NEXT: v_or_b32_e32 v50, v32, v7 +; SI-NEXT: v_alignbit_b32 v7, v50, v54, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v50, v54, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_or_b32_e32 v21, v36, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_or_b32_e32 v20, v35, v7 +; SI-NEXT: v_alignbit_b32 v7, v20, v21, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v20, v21, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v20, v21, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_or_b32_e32 v18, v39, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_or_b32_e32 v19, v38, v7 +; SI-NEXT: v_alignbit_b32 v7, v19, v18, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v19, v18, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v19, v18, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v52 +; SI-NEXT: v_or_b32_e32 v16, v51, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: 
v_or_b32_e32 v17, v49, v7 +; SI-NEXT: v_alignbit_b32 v7, v17, v16, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v17, v16, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v40 +; SI-NEXT: v_or_b32_e32 v15, v55, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_or_b32_e32 v14, v53, v7 +; SI-NEXT: v_alignbit_b32 v7, v14, v15, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v14, v15, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v14, v15, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v42 +; SI-NEXT: v_or_b32_e32 v12, v41, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_or_b32_e32 v13, v22, v7 +; SI-NEXT: v_alignbit_b32 v7, v13, v12, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v13, v12, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v13, v12, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v26, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_or_b32_e32 v11, v25, v7 +; SI-NEXT: v_alignbit_b32 v7, v11, v10, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v11, v10, 16 +; 
SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v11, v10, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_or_b32_e32 v9, v29, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_alignbit_b32 v22, v7, v9, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v7, v9, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v7, v9, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v11 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v22, v1, 8, 8 +; SI-NEXT: v_alignbit_b32 v43, v50, v54, 24 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v13 +; SI-NEXT: v_bfe_u32 v23, v31, 8, 8 +; SI-NEXT: v_bfe_u32 v62, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v61, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v59, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v57, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v47, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v45, v2, 8, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; 
implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: .LBB104_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v28 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v9, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: 
v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v22 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v15, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v38 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v21, v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 
v31, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v54, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v50, v24, v22 +; SI-NEXT: v_alignbit_b32 v22, v50, v54, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v50, v54, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v20, v21, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v20, v21, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v20, v21, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v19, v18, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v19, v18, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v19, v18, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v17, v16, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v17, v16, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v14, v15, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v14, v15, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v14, v15, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v13, v12, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v13, v12, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v13, v12, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v7, v9, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v7, v9, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v7, v9, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v11 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_alignbit_b32 v43, v50, v54, 24 +; SI-NEXT: v_alignbit_b32 v27, v17, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v13 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v23, v31, 8, 8 +; SI-NEXT: v_bfe_u32 v62, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v61, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v59, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v57, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v47, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v45, v2, 8, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v22, v1, 8, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: .LBB104_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v43 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v63 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v60 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v8, v21, v8 +; SI-NEXT: v_or_b32_e32 v8, v20, v8 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v8, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_and_b32_e32 v8, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v8, v8, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_or_b32_e32 v8, v8, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v8, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v58 +; SI-NEXT: v_or_b32_e32 v8, v8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v61 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v18, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v8, v16, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 
0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v59 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v57 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload 
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v44 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v47 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v2, v4, 
v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f16_to_v64i8: ; VI: ; %bb.0: @@ -39441,7 +78583,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: s_cbranch_execz .LBB104_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -39478,9 +78620,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[5:6] ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v1 -; VI-NEXT: .LBB52_2: ; %Flow +; VI-NEXT: .LBB104_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: s_cbranch_execz .LBB104_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; 
VI-NEXT: v_add_f16_sdwa v51, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -39582,7 +78724,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v61, v38, 8, 8 ; VI-NEXT: v_bfe_u32 v54, v49, 8, 8 ; VI-NEXT: v_bfe_u32 v40, v51, 8, 8 -; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: .LBB104_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 ; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -39792,7 +78934,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: s_cbranch_execz .LBB104_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -39845,9 +78987,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB52_2: ; %Flow +; GFX9-NEXT: .LBB104_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: s_cbranch_execz .LBB104_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] @@ -39916,7 +79058,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB52_4: ; %end +; GFX9-NEXT: .LBB104_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, 
v23 @@ -40076,7 +79218,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -40110,9 +79252,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB52_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB104_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB104_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] @@ -40162,7 +79304,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-TRUE16-NEXT: .LBB52_4: ; %end +; GFX11-TRUE16-NEXT: .LBB104_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -40374,7 +79516,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; 
%cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -40424,9 +79566,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB104_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] @@ -40492,7 +79634,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB52_4: ; %end +; GFX11-FAKE16-NEXT: .LBB104_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -40662,718 +79804,3160 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32f16_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v22, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, 
s27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_or_b32_e32 v37, v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_or_b32_e32 v32, v9, v8 +; SI-NEXT: v_alignbit_b32 v8, v32, v37, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v32, v37, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v32, v37, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_or_b32_e32 v24, v12, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_or_b32_e32 v23, v11, v8 +; SI-NEXT: v_alignbit_b32 v8, v23, v24, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v23, v24, 16 +; SI-NEXT: buffer_store_dword v8, 
off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v23, v24, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v42, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_or_b32_e32 v19, v14, v8 +; SI-NEXT: v_alignbit_b32 v8, v19, v18, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v19, v18, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v19, v18, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 +; SI-NEXT: v_or_b32_e32 v16, v25, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; SI-NEXT: v_or_b32_e32 v17, v28, v8 +; SI-NEXT: v_alignbit_b32 v8, v17, v16, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v17, v16, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v17, v16, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 +; SI-NEXT: v_or_b32_e32 v15, v21, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_or_b32_e32 v14, v62, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v12, v34, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_or_b32_e32 v13, v30, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_or_b32_e32 v10, v50, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; SI-NEXT: v_or_b32_e32 v11, v48, v8 +; 
SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_or_b32_e32 v9, v40, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v8, v9, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v8, v9, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v8, v9, 8 +; SI-NEXT: v_alignbit_b32 v57, v14, v15, 24 +; SI-NEXT: v_alignbit_b32 v58, v14, v15, 16 +; SI-NEXT: v_alignbit_b32 v61, v14, v15, 8 +; SI-NEXT: v_alignbit_b32 v44, v13, v12, 24 +; SI-NEXT: v_alignbit_b32 v47, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v56, v13, v12, 8 +; SI-NEXT: v_alignbit_b32 v43, v11, v10, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v32 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v23 +; SI-NEXT: v_lshrrev_b32_e32 v36, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v8 +; SI-NEXT: v_bfe_u32 v54, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v51, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v49, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v38, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v33, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v29, v3, 8, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v22, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v60, v1, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: .LBB105_2: ; %cmp.true +; SI-NEXT: 
v_cvt_f32_f16_e32 v18, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v9, v9, v8 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: 
v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v15, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; 
SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v21, v18 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_alignbit_b32 v57, v14, v15, 24 +; SI-NEXT: v_alignbit_b32 v58, v14, v15, 16 +; SI-NEXT: v_alignbit_b32 v61, v14, v15, 8 +; SI-NEXT: v_alignbit_b32 v44, v13, v12, 24 +; SI-NEXT: v_alignbit_b32 v47, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v56, v13, v12, 8 +; SI-NEXT: v_alignbit_b32 v43, v11, v10, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v8 +; SI-NEXT: v_bfe_u32 v54, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v51, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v49, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v38, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v33, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v29, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v60, v1, 8, 8 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshrrev_b32_e32 v36, 8, v19 +; 
SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v24, v22, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v37, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_or_b32_e32 v32, v25, v21 +; SI-NEXT: v_alignbit_b32 v21, v32, v37, 24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v32, v37, 16 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v32, v37, 8 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v23, v24, 24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v23, v24, 16 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 
offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v23, v24, 8 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v19, v18, 24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v19, v18, 16 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v19, v18, 8 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v17, v16, 8 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v8, v9, 24 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v8, v9, 16 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v21, v8, v9, 8 +; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v32 +; SI-NEXT: v_bfe_u32 v22, v2, 8, 8 +; SI-NEXT: buffer_store_dword v21, 
off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v52 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v54 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v7, v25, v7 +; SI-NEXT: v_or_b32_e32 v7, v21, v7 +; SI-NEXT: v_add_i32_e32 v21, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v7, v21, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v7, 0xff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v7, v7, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v24, v21 +; SI-NEXT: v_or_b32_e32 v7, v7, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v7, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v51 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v18, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v6, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v36 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v31 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v61 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v57 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: 
v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v56 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v63 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v29 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: 
v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v45 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v60 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 
offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB105_4: +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; 
implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_branch .LBB105_2 +; +; VI-LABEL: bitcast_v32f16_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 
offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB105_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s75, s5, 24 +; VI-NEXT: s_lshr_b32 s36, s5, 16 +; VI-NEXT: s_lshr_b32 s58, s5, 8 +; VI-NEXT: s_lshr_b32 s37, s4, 16 +; VI-NEXT: s_lshr_b32 s56, s4, 8 +; VI-NEXT: s_lshr_b32 s77, s29, 24 +; VI-NEXT: s_lshr_b32 s38, s29, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 8 +; VI-NEXT: s_lshr_b32 s39, s28, 16 +; VI-NEXT: s_lshr_b32 s57, s28, 8 +; VI-NEXT: s_lshr_b32 s79, s27, 24 +; VI-NEXT: s_lshr_b32 s48, s27, 16 +; VI-NEXT: s_lshr_b32 s74, s27, 8 +; VI-NEXT: s_lshr_b32 s49, s26, 16 +; VI-NEXT: s_lshr_b32 s59, s26, 8 +; VI-NEXT: s_lshr_b32 s89, s25, 24 +; VI-NEXT: s_lshr_b32 s50, s25, 16 +; VI-NEXT: s_lshr_b32 s76, s25, 8 +; VI-NEXT: s_lshr_b32 s51, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s24, 8 +; VI-NEXT: s_lshr_b32 s91, s23, 24 +; VI-NEXT: s_lshr_b32 s52, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s23, 8 +; VI-NEXT: s_lshr_b32 s53, s22, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 8 +; VI-NEXT: s_lshr_b32 s31, s21, 24 +; VI-NEXT: s_lshr_b32 s54, s21, 16 +; VI-NEXT: s_lshr_b32 s88, s21, 8 +; VI-NEXT: s_lshr_b32 s55, s20, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 8 +; VI-NEXT: s_lshr_b32 s34, s19, 24 +; VI-NEXT: s_lshr_b32 s64, s19, 16 +; VI-NEXT: s_lshr_b32 s90, s19, 8 +; VI-NEXT: s_lshr_b32 s65, s18, 16 +; VI-NEXT: s_lshr_b32 s72, s18, 8 +; VI-NEXT: s_lshr_b32 s35, s17, 24 +; VI-NEXT: s_lshr_b32 s66, s17, 16 +; VI-NEXT: s_lshr_b32 s30, s17, 8 +; VI-NEXT: s_lshr_b32 s67, s16, 16 +; VI-NEXT: s_lshr_b32 s73, s16, 8 +; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; VI-NEXT: 
s_lshr_b64 s[14:15], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB105_4 +; VI-NEXT: .LBB105_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s6, s17, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_e32 v12, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; VI-NEXT: v_add_f16_e32 v27, s17, v1 +; VI-NEXT: v_add_f16_e32 v19, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s19, 16 +; VI-NEXT: v_or_b32_e32 v10, v27, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; VI-NEXT: v_add_f16_e32 v35, s16, v1 +; VI-NEXT: v_add_f16_e32 v13, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s18, 16 +; VI-NEXT: v_or_b32_e32 v9, v35, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_add_f16_e32 v28, s19, v1 +; VI-NEXT: v_add_f16_e32 v20, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s21, 16 +; VI-NEXT: v_or_b32_e32 v62, v28, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; VI-NEXT: v_add_f16_e32 v36, s18, v1 +; VI-NEXT: v_add_f16_e32 v14, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s20, 16 +; VI-NEXT: v_or_b32_e32 v61, v36, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; VI-NEXT: v_add_f16_e32 v29, s21, v1 +; VI-NEXT: v_add_f16_e32 v21, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s23, 16 +; VI-NEXT: v_or_b32_e32 v8, v29, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; VI-NEXT: v_add_f16_e32 v37, s20, v1 +; VI-NEXT: v_add_f16_e32 v15, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s22, 16 +; VI-NEXT: v_or_b32_e32 v7, v37, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; VI-NEXT: v_add_f16_e32 v30, s23, v1 +; VI-NEXT: v_add_f16_e32 v22, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s25, 16 +; VI-NEXT: v_or_b32_e32 v47, v30, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; VI-NEXT: v_add_f16_e32 v38, s22, v1 +; VI-NEXT: v_add_f16_e32 v16, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s24, 16 +; VI-NEXT: v_or_b32_e32 v46, v38, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 
16, v16 +; VI-NEXT: v_add_f16_e32 v31, s25, v1 +; VI-NEXT: v_add_f16_e32 v23, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s27, 16 +; VI-NEXT: v_or_b32_e32 v6, v31, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; VI-NEXT: v_add_f16_e32 v39, s24, v1 +; VI-NEXT: v_add_f16_e32 v17, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s26, 16 +; VI-NEXT: v_or_b32_e32 v5, v39, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; VI-NEXT: v_add_f16_e32 v32, s27, v1 +; VI-NEXT: v_add_f16_e32 v24, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: v_or_b32_e32 v43, v32, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; VI-NEXT: v_add_f16_e32 v48, s26, v1 +; VI-NEXT: v_add_f16_e32 v18, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s28, 16 +; VI-NEXT: v_or_b32_e32 v42, v48, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_add_f16_e32 v33, s29, v1 +; VI-NEXT: v_add_f16_e32 v25, s6, v1 +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; VI-NEXT: v_or_b32_e32 v55, v33, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; VI-NEXT: v_add_f16_e32 v49, s28, v1 +; VI-NEXT: v_add_f16_e32 v11, s6, v1 +; VI-NEXT: v_add_f16_e32 v34, s5, v1 +; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: v_or_b32_e32 v54, v49, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; VI-NEXT: v_add_f16_e32 v26, s5, v1 +; VI-NEXT: v_or_b32_e32 v52, v34, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; VI-NEXT: v_add_f16_e32 v50, s4, v1 +; VI-NEXT: v_or_b32_e32 v51, v50, v2 +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[51:52] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[42:43] +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[54:55] +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v42 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v6 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[46:47] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[6:7], 24, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v47 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[61:62] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v51 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v55 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v43 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v46 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v62 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v61 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v10 +; VI-NEXT: v_bfe_u32 v9, v11, 8, 8 +; VI-NEXT: v_bfe_u32 v10, v18, 8, 8 +; VI-NEXT: v_bfe_u32 v40, v17, 8, 8 +; VI-NEXT: v_bfe_u32 v43, v16, 8, 8 +; VI-NEXT: v_bfe_u32 v46, v15, 8, 8 +; VI-NEXT: v_bfe_u32 v57, v14, 8, 8 +; VI-NEXT: v_bfe_u32 v59, v13, 8, 8 +; VI-NEXT: v_bfe_u32 v62, v12, 8, 8 +; VI-NEXT: s_branch .LBB105_5 +; VI-NEXT: .LBB105_3: +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; 
implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: s_branch .LBB105_2 +; VI-NEXT: .LBB105_4: +; VI-NEXT: v_mov_b32_e32 v1, s58 +; VI-NEXT: v_mov_b32_e32 v53, s56 +; VI-NEXT: v_mov_b32_e32 v52, s42 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v52, s44 +; VI-NEXT: v_mov_b32_e32 v19, s67 +; VI-NEXT: v_mov_b32_e32 v12, s66 +; VI-NEXT: v_mov_b32_e32 v20, s65 +; VI-NEXT: v_mov_b32_e32 v13, s64 +; VI-NEXT: v_mov_b32_e32 v21, s55 +; VI-NEXT: v_mov_b32_e32 v14, s54 +; VI-NEXT: v_mov_b32_e32 v22, s53 +; VI-NEXT: v_mov_b32_e32 v15, s52 +; VI-NEXT: v_mov_b32_e32 v23, s51 +; VI-NEXT: v_mov_b32_e32 v16, s50 +; VI-NEXT: v_mov_b32_e32 v24, s49 +; VI-NEXT: v_mov_b32_e32 v17, s48 +; VI-NEXT: v_mov_b32_e32 v25, s39 +; VI-NEXT: v_mov_b32_e32 v18, s38 +; VI-NEXT: v_mov_b32_e32 v26, s37 +; VI-NEXT: v_mov_b32_e32 v11, s36 +; VI-NEXT: v_mov_b32_e32 v35, s16 +; VI-NEXT: v_mov_b32_e32 v27, s17 
+; VI-NEXT: v_mov_b32_e32 v36, s18 +; VI-NEXT: v_mov_b32_e32 v28, s19 +; VI-NEXT: v_mov_b32_e32 v37, s20 +; VI-NEXT: v_mov_b32_e32 v29, s21 +; VI-NEXT: v_mov_b32_e32 v38, s22 +; VI-NEXT: v_mov_b32_e32 v30, s23 +; VI-NEXT: v_mov_b32_e32 v39, s24 +; VI-NEXT: v_mov_b32_e32 v31, s25 +; VI-NEXT: v_mov_b32_e32 v48, s26 +; VI-NEXT: v_mov_b32_e32 v32, s27 +; VI-NEXT: v_mov_b32_e32 v49, s28 +; VI-NEXT: v_mov_b32_e32 v33, s29 +; VI-NEXT: v_mov_b32_e32 v50, s4 +; VI-NEXT: v_mov_b32_e32 v34, s5 +; VI-NEXT: v_mov_b32_e32 v62, s35 +; VI-NEXT: v_mov_b32_e32 v59, s34 +; VI-NEXT: v_mov_b32_e32 v57, s31 +; VI-NEXT: v_mov_b32_e32 v46, s91 +; VI-NEXT: v_mov_b32_e32 v43, s89 +; VI-NEXT: v_mov_b32_e32 v40, s79 +; VI-NEXT: v_mov_b32_e32 v10, s77 +; VI-NEXT: v_mov_b32_e32 v61, s30 +; VI-NEXT: v_mov_b32_e32 v58, s90 +; VI-NEXT: v_mov_b32_e32 v47, s88 +; VI-NEXT: v_mov_b32_e32 v45, s78 +; VI-NEXT: v_mov_b32_e32 v42, s76 +; VI-NEXT: v_mov_b32_e32 v55, s74 +; VI-NEXT: v_mov_b32_e32 v54, s57 +; VI-NEXT: v_mov_b32_e32 v41, s59 +; VI-NEXT: v_mov_b32_e32 v44, s60 +; VI-NEXT: v_mov_b32_e32 v56, s61 +; VI-NEXT: v_mov_b32_e32 v60, s63 +; VI-NEXT: v_mov_b32_e32 v51, s72 +; VI-NEXT: v_mov_b32_e32 v1, s73 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_mov_b32_e32 v6, s10 +; VI-NEXT: v_mov_b32_e32 v5, s12 +; VI-NEXT: v_mov_b32_e32 v4, s14 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_mov_b32_e32 v9, s75 +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v52, s62 +; VI-NEXT: .LBB105_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v62 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v12, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v20, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v59 +; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v13, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v21, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_add_u32_e32 v6, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v57 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v56 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v46 +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v15, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v23, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 
32, v0 +; VI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v42 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v43 +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v40 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: 
v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v9 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f16_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v63, s30, 0 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s34, 2 +; GFX9-NEXT: v_writelane_b32 v63, s35, 3 +; GFX9-NEXT: v_writelane_b32 v63, s36, 4 +; GFX9-NEXT: v_writelane_b32 v63, s37, 5 +; GFX9-NEXT: v_writelane_b32 v63, s38, 6 +; GFX9-NEXT: v_writelane_b32 v63, s39, 7 +; GFX9-NEXT: v_writelane_b32 v63, s48, 8 +; GFX9-NEXT: v_writelane_b32 v63, s49, 9 +; GFX9-NEXT: v_writelane_b32 v63, s50, 10 +; GFX9-NEXT: v_writelane_b32 v63, s51, 11 +; GFX9-NEXT: v_writelane_b32 v63, s52, 12 +; GFX9-NEXT: v_writelane_b32 v63, s53, 13 +; GFX9-NEXT: v_writelane_b32 v63, s54, 14 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v63, s55, 15 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s56, s5, 24 +; GFX9-NEXT: s_lshr_b32 s57, s5, 16 +; GFX9-NEXT: s_lshr_b32 s59, s5, 8 +; GFX9-NEXT: s_lshr_b32 s58, s4, 16 +; GFX9-NEXT: s_lshr_b32 s60, s4, 8 +; GFX9-NEXT: s_lshr_b32 s61, s29, 24 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s72, s29, 8 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s73, s28, 8 +; GFX9-NEXT: s_lshr_b32 s74, s27, 24 +; GFX9-NEXT: s_lshr_b32 s75, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s27, 8 +; GFX9-NEXT: s_lshr_b32 s76, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s26, 8 +; GFX9-NEXT: s_lshr_b32 s79, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 8 +; GFX9-NEXT: s_lshr_b32 s89, s24, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 8 +; GFX9-NEXT: s_lshr_b32 s92, s23, 24 +; GFX9-NEXT: s_lshr_b32 s93, s23, 16 +; GFX9-NEXT: s_lshr_b32 s95, s23, 8 +; GFX9-NEXT: s_lshr_b32 s94, s22, 16 +; GFX9-NEXT: s_lshr_b32 s30, s22, 8 +; GFX9-NEXT: s_lshr_b32 s31, s21, 24 +; GFX9-NEXT: s_lshr_b32 s34, s21, 16 +; GFX9-NEXT: s_lshr_b32 s36, s21, 8 +; GFX9-NEXT: s_lshr_b32 s35, s20, 16 +; GFX9-NEXT: s_lshr_b32 s37, s20, 8 +; GFX9-NEXT: s_lshr_b32 s38, s19, 24 +; 
GFX9-NEXT: s_lshr_b32 s39, s19, 16 +; GFX9-NEXT: s_lshr_b32 s49, s19, 8 +; GFX9-NEXT: s_lshr_b32 s48, s18, 16 +; GFX9-NEXT: s_lshr_b32 s50, s18, 8 +; GFX9-NEXT: s_lshr_b32 s51, s17, 24 +; GFX9-NEXT: s_lshr_b32 s52, s17, 16 +; GFX9-NEXT: s_lshr_b32 s54, s17, 8 +; GFX9-NEXT: s_lshr_b32 s53, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB105_4 +; GFX9-NEXT: .LBB105_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v1, 0x200 +; GFX9-NEXT: v_pk_add_f16 v20, s17, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, s16, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, s19, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, s18, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s21, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s20, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s23, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s22, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s25, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s24, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s27, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s26, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s29, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s28, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s5, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s4, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v59, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: s_branch .LBB105_5 +; GFX9-NEXT: .LBB105_3: +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; 
GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: s_branch .LBB105_2 +; GFX9-NEXT: .LBB105_4: +; GFX9-NEXT: v_mov_b32_e32 v21, s44 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s42 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v11, s20 +; GFX9-NEXT: v_mov_b32_e32 v12, s21 +; GFX9-NEXT: v_mov_b32_e32 v9, s22 +; GFX9-NEXT: v_mov_b32_e32 v10, s23 +; GFX9-NEXT: v_mov_b32_e32 v7, s24 +; GFX9-NEXT: v_mov_b32_e32 v8, s25 +; GFX9-NEXT: v_mov_b32_e32 v5, s26 +; GFX9-NEXT: v_mov_b32_e32 v6, s27 +; GFX9-NEXT: v_mov_b32_e32 v3, s28 +; GFX9-NEXT: v_mov_b32_e32 v4, s29 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v17, s55 +; GFX9-NEXT: v_mov_b32_e32 v62, s53 +; GFX9-NEXT: v_mov_b32_e32 v13, s54 +; GFX9-NEXT: v_mov_b32_e32 v60, s52 +; GFX9-NEXT: v_mov_b32_e32 v61, s51 +; GFX9-NEXT: v_mov_b32_e32 v58, s50 +; GFX9-NEXT: v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v57, s49 +; GFX9-NEXT: v_mov_b32_e32 v47, s39 +; GFX9-NEXT: v_mov_b32_e32 v56, s38 +; GFX9-NEXT: v_mov_b32_e32 v46, s37 +; GFX9-NEXT: v_mov_b32_e32 v45, s35 +; GFX9-NEXT: v_mov_b32_e32 v44, s36 +; GFX9-NEXT: v_mov_b32_e32 v42, s34 +; GFX9-NEXT: v_mov_b32_e32 v43, s31 +; GFX9-NEXT: v_mov_b32_e32 v41, s30 +; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: v_mov_b32_e32 v55, s95 +; GFX9-NEXT: v_mov_b32_e32 v53, s93 +; GFX9-NEXT: v_mov_b32_e32 v54, s92 +; GFX9-NEXT: v_mov_b32_e32 v52, s91 +; GFX9-NEXT: v_mov_b32_e32 v51, s89 +; GFX9-NEXT: v_mov_b32_e32 v50, s90 +; GFX9-NEXT: v_mov_b32_e32 v48, s88 +; GFX9-NEXT: 
v_mov_b32_e32 v49, s79 +; GFX9-NEXT: v_mov_b32_e32 v39, s78 +; GFX9-NEXT: v_mov_b32_e32 v38, s76 +; GFX9-NEXT: v_mov_b32_e32 v37, s77 +; GFX9-NEXT: v_mov_b32_e32 v35, s75 +; GFX9-NEXT: v_mov_b32_e32 v36, s74 +; GFX9-NEXT: v_mov_b32_e32 v34, s73 +; GFX9-NEXT: v_mov_b32_e32 v33, s63 +; GFX9-NEXT: v_mov_b32_e32 v32, s72 +; GFX9-NEXT: v_mov_b32_e32 v30, s62 +; GFX9-NEXT: v_mov_b32_e32 v31, s61 +; GFX9-NEXT: v_mov_b32_e32 v29, s60 +; GFX9-NEXT: v_mov_b32_e32 v28, s58 +; GFX9-NEXT: v_mov_b32_e32 v27, s59 +; GFX9-NEXT: v_mov_b32_e32 v14, s57 +; GFX9-NEXT: v_mov_b32_e32 v18, s56 +; GFX9-NEXT: v_mov_b32_e32 v23, s12 +; GFX9-NEXT: v_mov_b32_e32 v24, s10 +; GFX9-NEXT: v_mov_b32_e32 v25, s8 +; GFX9-NEXT: v_mov_b32_e32 v26, s6 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v21, s40 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: .LBB105_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v19, v62, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v13, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v57 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v12, v42, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v9, 
v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX9-NEXT: v_or_b32_sdwa v8, v48, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_lshlrev_b32_e32 v7, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v6, v35, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s55, v63, 15 +; GFX9-NEXT: v_readlane_b32 s54, v63, 14 +; GFX9-NEXT: v_readlane_b32 s53, v63, 13 +; GFX9-NEXT: v_readlane_b32 s52, v63, 12 +; GFX9-NEXT: v_readlane_b32 s51, v63, 11 +; GFX9-NEXT: v_readlane_b32 s50, v63, 10 +; GFX9-NEXT: v_readlane_b32 s49, v63, 9 +; GFX9-NEXT: v_readlane_b32 s48, v63, 8 +; GFX9-NEXT: v_readlane_b32 s39, v63, 7 +; GFX9-NEXT: v_readlane_b32 s38, v63, 6 +; GFX9-NEXT: v_readlane_b32 s37, v63, 5 +; GFX9-NEXT: v_readlane_b32 s36, v63, 4 +; GFX9-NEXT: v_readlane_b32 s35, v63, 3 +; GFX9-NEXT: v_readlane_b32 s34, v63, 2 +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v32f16_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s37, 5 +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s50, 10 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s1, 24 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s39, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-TRUE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; 
GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 
8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: s_branch .LBB105_5 +; GFX11-TRUE16-NEXT: .LBB105_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB105_2 +; GFX11-TRUE16-NEXT: .LBB105_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, s50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, s49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, s48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, s39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, s38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, s37 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, s36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, s35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, s34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, s31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, s30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, s95 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, s94 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, s93 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, s92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, s91 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, s90 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s89 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s88 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s79 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s78 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s77 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s76 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s75 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s74 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s73 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s72 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s62 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s61 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s60 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s59 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s58 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s57 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s47 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s4 +; GFX11-TRUE16-NEXT: .LBB105_5: 
; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v96 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v87, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xff, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xff, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v86 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v81, v85, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v83 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v82, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v22, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, v80, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v17, v70 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v69, v27 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v67, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v21, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v22, v67 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v17, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v18, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v18, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v23, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v21, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v23, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v38, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v13, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v14, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v9, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v10, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v7, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v18, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 
0xff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v18, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v6 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[66:69], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v40, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v40, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v40, 5 +; 
GFX11-TRUE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v32f16_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s42, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s49, 9 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s23, 24 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s49, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-FAKE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: 
v_pk_add_f16 v13, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[25:26], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[26:27], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[27:28], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[28:29], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 24, 
v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v23 +; GFX11-FAKE16-NEXT: s_branch .LBB105_5 +; GFX11-FAKE16-NEXT: .LBB105_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: s_branch .LBB105_2 +; GFX11-FAKE16-NEXT: .LBB105_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s0 :: v_dual_mov_b32 v24, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 
v19, s2 :: v_dual_mov_b32 v20, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s16 :: v_dual_mov_b32 v16, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s18 :: v_dual_mov_b32 v14, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s22 :: v_dual_mov_b32 v6, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s24 :: v_dual_mov_b32 v4, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s26 :: v_dual_mov_b32 v2, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v96, s49 :: v_dual_mov_b32 v87, s39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s48 :: v_dual_mov_b32 v85, s38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v84, s37 :: v_dual_mov_b32 v83, s36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s34 :: v_dual_mov_b32 v81, s35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s31 :: v_dual_mov_b32 v71, s30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, vcc_hi :: v_dual_mov_b32 v69, s94 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s95 :: v_dual_mov_b32 v67, s93 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s92 :: v_dual_mov_b32 v65, s91 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s89 :: v_dual_mov_b32 v55, s90 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s88 :: v_dual_mov_b32 v53, s79 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s78 :: v_dual_mov_b32 v51, s76 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s77 :: v_dual_mov_b32 v49, s75 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s74 :: v_dual_mov_b32 v39, s73 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s63 :: v_dual_mov_b32 v37, s72 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s62 :: v_dual_mov_b32 v35, s61 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s60 :: v_dual_mov_b32 v33, s58 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s59 :: v_dual_mov_b32 v31, s57 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s56 :: v_dual_mov_b32 v29, s47 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v7, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s46 :: v_dual_mov_b32 v11, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s44 :: v_dual_mov_b32 v17, s14 
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v21, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s10 :: v_dual_mov_b32 v26, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s6 :: v_dual_mov_b32 v28, s4 +; GFX11-FAKE16-NEXT: .LBB105_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xff, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v23, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v87, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v82, v23, v28 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v85, v84 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v66 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v83 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v87, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v69, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, v67, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v83, v24, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v84, v19, v23 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v85, v20, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v15, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v16, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 8, v21 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v10, 0xff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 8, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v38 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v25, v26 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v28, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v13, v15 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v14, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v9, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v10, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v5, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 8, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 8, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v33 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v9, v10 
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v17, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v31 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 8, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v22 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v12, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v5, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v8 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[82:85], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v40, 9 +; 
GFX11-FAKE16-NEXT: v_readlane_b32 s48, v40, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v32f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 
4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; 
GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 
v1, 8, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v26 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr1 -; 
GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v2, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v22 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v43 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v60 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v58 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v57 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v56 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v42 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v40 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v62 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v61 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v41 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v59 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v29, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v31, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v32, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v33, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v34, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v5, v32 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v34, v7, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v36, v9, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v11, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v48, v13, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v50, v15, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v52, v17, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v54, v19, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v5 -; GCN-NEXT: v_or_b32_e32 v40, v21, v44 -; GCN-NEXT: v_or_b32_e32 v22, v22, v45 -; GCN-NEXT: v_or_b32_e32 v41, v23, v46 -; GCN-NEXT: v_or_b32_e32 v24, v24, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v28 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v33, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v24 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: 
$vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; 
kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; kill: killed $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: .LBB53_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v59 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v47, v1 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v26 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v46, v3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v28 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v45, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v41 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v43 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 
v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v32, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v32, v13 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v32, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v32, v23 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v32, v24 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v32, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v32, v27 
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v32, v28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v32, v29 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v32, v30 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v32, v31 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v32, v22 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v32, v16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v32, v14 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v32, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v32, v10 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v32, v8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v32, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v32, v4 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v32, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v32, v0 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x300, v1 -; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v41, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v42, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v43, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v44, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v38, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v28 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v29 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v30 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v31 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, 
v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v40 -; GCN-NEXT: .LBB53_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v55 -; GCN-NEXT: v_mov_b32_e32 v2, v39 -; GCN-NEXT: v_mov_b32_e32 v4, v35 -; GCN-NEXT: v_mov_b32_e32 v6, v51 -; GCN-NEXT: v_mov_b32_e32 v8, v33 -; GCN-NEXT: v_mov_b32_e32 v10, v37 -; GCN-NEXT: v_mov_b32_e32 v12, v49 -; GCN-NEXT: v_mov_b32_e32 v14, v53 -; GCN-NEXT: v_mov_b32_e32 v16, v32 -; GCN-NEXT: v_mov_b32_e32 v18, v34 -; GCN-NEXT: v_mov_b32_e32 v20, v36 -; GCN-NEXT: v_mov_b32_e32 v22, v38 -; GCN-NEXT: v_mov_b32_e32 v24, v48 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mov_b32_e32 v26, v50 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, v52 -; GCN-NEXT: v_mov_b32_e32 v30, v54 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 
offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v32f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], 
s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, 
v11 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v29 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v6 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v35 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v36 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v37 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: 
$vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v39 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v48 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v49 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 
; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v39 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v35 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_or_b32_e32 v0, v6, v0 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v17, v17, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; 
SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v15, v15, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_or_b32_e32 v0, v0, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_or_b32_e32 v0, v0, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed 
$vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v18, 0xff, 
v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v11 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v63 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: .LBB106_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v30 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v11, v1, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: 
v_add_i32_e32 v25, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v14, v6 +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v4, v47, v4 +; SI-NEXT: v_or_b32_e32 v5, v43, v5 +; SI-NEXT: v_or_b32_e32 v3, v55, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v4 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v39 +; 
SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_or_b32_e32 v23, v57, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v12, v7 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v16, v7 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v18, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v20, v7 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, 
off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v28, v7 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v26, v7 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v18 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v28, v31, v28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v22, v7 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 
offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v63, v7 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v22, v59, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v24, v7 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 
0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v40, v7 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v62, v7 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v26, v58, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v61, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: .LBB106_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v8, v33 +; SI-NEXT: v_mov_b32_e32 v10, v37 +; SI-NEXT: v_mov_b32_e32 v12, v49 +; SI-NEXT: v_mov_b32_e32 v14, v53 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: v_mov_b32_e32 v18, v34 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v22, v38 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v24, v48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_mov_b32_e32 v26, v50 +; SI-NEXT: v_mov_b32_e32 v28, v52 +; SI-NEXT: v_mov_b32_e32 v30, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v32f16: ; VI: ; %bb.0: @@ -41487,7 +83071,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB53_2 +; VI-NEXT: s_cbranch_execz .LBB106_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -41650,9 +83234,9 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: .LBB53_2: ; %Flow +; VI-NEXT: .LBB106_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB53_4 +; VI-NEXT: s_cbranch_execz .LBB106_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v38 @@ -41807,7 +83391,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v23 ; VI-NEXT: v_or_b32_e32 v3, v3, v19 -; VI-NEXT: .LBB53_4: ; %end +; VI-NEXT: .LBB106_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -41951,7 +83535,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_2 +; GFX9-NEXT: s_cbranch_execz .LBB106_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -42116,9 +83700,9 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: .LBB53_2: ; %Flow +; GFX9-NEXT: .LBB106_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_4 +; GFX9-NEXT: s_cbranch_execz .LBB106_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 @@ -42271,7 +83855,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 -; GFX9-NEXT: .LBB53_4: ; %end +; GFX9-NEXT: .LBB106_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -42402,15 +83986,15 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-TRUE16-NEXT: .LBB53_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-TRUE16-NEXT: .LBB106_2: ; %end 
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.l @@ -42541,8 +84125,8 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-TRUE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 @@ -42772,15 +84356,15 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-FAKE16-NEXT: .LBB53_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-FAKE16-NEXT: .LBB106_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 @@ -42927,8 +84511,8 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 
x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-FAKE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-FAKE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 @@ -43098,796 +84682,2813 @@ end: ret <32 x half> %phi } +define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v32f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: v_readfirstlane_b32 s46, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v32, s30, 0 +; SI-NEXT: v_writelane_b32 v32, s31, 1 +; SI-NEXT: v_writelane_b32 v32, s34, 2 +; SI-NEXT: v_writelane_b32 v32, s35, 3 +; SI-NEXT: v_writelane_b32 v32, s36, 4 +; SI-NEXT: v_writelane_b32 v32, s37, 5 +; SI-NEXT: v_writelane_b32 v32, s38, 6 +; SI-NEXT: v_writelane_b32 v32, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s74, v30 +; SI-NEXT: v_readfirstlane_b32 s61, v29 +; SI-NEXT: v_readfirstlane_b32 s63, v28 +; SI-NEXT: v_readfirstlane_b32 s59, v27 +; SI-NEXT: v_readfirstlane_b32 s60, v26 +; SI-NEXT: v_readfirstlane_b32 s57, v25 +; SI-NEXT: v_readfirstlane_b32 s58, v24 +; SI-NEXT: v_readfirstlane_b32 s47, v23 +; SI-NEXT: v_readfirstlane_b32 s56, v22 +; SI-NEXT: v_readfirstlane_b32 s44, v21 +; SI-NEXT: v_readfirstlane_b32 s34, v19 +; SI-NEXT: v_readfirstlane_b32 s37, v18 +; SI-NEXT: v_readfirstlane_b32 s94, v17 +; SI-NEXT: v_readfirstlane_b32 s31, v16 +; SI-NEXT: v_readfirstlane_b32 s90, v15 +; SI-NEXT: 
v_readfirstlane_b32 s93, v14 +; SI-NEXT: v_readfirstlane_b32 s79, v13 +; SI-NEXT: v_readfirstlane_b32 s39, v12 +; SI-NEXT: v_readfirstlane_b32 s36, v11 +; SI-NEXT: v_readfirstlane_b32 s38, v10 +; SI-NEXT: v_readfirstlane_b32 s30, v9 +; SI-NEXT: v_readfirstlane_b32 s35, v8 +; SI-NEXT: v_readfirstlane_b32 s92, v7 +; SI-NEXT: v_readfirstlane_b32 s95, v6 +; SI-NEXT: v_readfirstlane_b32 s89, v5 +; SI-NEXT: v_readfirstlane_b32 s91, v4 +; SI-NEXT: v_readfirstlane_b32 s78, v3 +; SI-NEXT: v_readfirstlane_b32 s88, v2 +; SI-NEXT: v_readfirstlane_b32 s76, v1 +; SI-NEXT: v_readfirstlane_b32 s77, v0 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s6, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s9, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s7, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s11, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s8, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s12, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s10, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s15, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s13, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: v_readfirstlane_b32 s41, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s14, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s43, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s40, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s45, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s42, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s73, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s62, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s72, v31 +; SI-NEXT: s_cbranch_scc0 .LBB107_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, 
s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s77, 0xff +; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s91, 0xff +; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s95, 0xff +; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s35, 0xff +; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s38, 0xff +; SI-NEXT: s_lshl_b32 s5, s36, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s39, 0xff +; SI-NEXT: s_lshl_b32 s5, s79, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s93, 0xff +; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s31, 0xff +; SI-NEXT: s_lshl_b32 s5, s94, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s37, 0xff +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_and_b32 s4, s58, 0xff +; 
SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_and_b32 s4, s60, 0xff +; SI-NEXT: s_lshl_b32 s5, s59, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_and_b32 s4, s74, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_and_b32 s4, s75, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_cbranch_execnz .LBB107_3 +; SI-NEXT: .LBB107_2: ; %cmp.true +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: 
s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s11, 0xff +; SI-NEXT: s_lshl_b32 s6, s7, 8 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s12, 0xff +; SI-NEXT: s_lshl_b32 s7, s8, 8 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s15, 0xff +; SI-NEXT: s_lshl_b32 s8, s10, 8 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s41, 0xff +; SI-NEXT: s_lshl_b32 s9, s13, 8 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s43, 0xff +; SI-NEXT: s_lshl_b32 s10, s14, 8 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s45, 0xff +; SI-NEXT: s_lshl_b32 s11, s40, 8 +; SI-NEXT: s_add_i32 s73, s73, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s73, 0xff +; SI-NEXT: s_lshl_b32 s12, s42, 8 +; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s75, 0xff +; SI-NEXT: s_lshl_b32 s13, s62, 8 +; SI-NEXT: s_add_i32 s74, s74, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s74, 0xff +; SI-NEXT: s_lshl_b32 s14, s72, 8 +; SI-NEXT: s_add_i32 s63, s63, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s63, 0xff +; SI-NEXT: s_lshl_b32 s15, s61, 8 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s60, 0xff +; SI-NEXT: s_lshl_b32 s40, s59, 8 +; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_or_b32 s15, s40, s15 +; SI-NEXT: s_and_b32 s40, s58, 0xff +; SI-NEXT: s_lshl_b32 s41, s57, 8 +; SI-NEXT: s_add_i32 s56, s56, 3 +; SI-NEXT: s_or_b32 s40, s41, s40 +; SI-NEXT: s_and_b32 s41, s56, 0xff +; SI-NEXT: s_lshl_b32 s42, s47, 8 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_or_b32 s41, s42, s41 +; SI-NEXT: s_and_b32 s42, s46, 0xff +; SI-NEXT: s_lshl_b32 s43, s44, 8 +; SI-NEXT: s_add_i32 s37, s37, 3 +; SI-NEXT: s_or_b32 s42, s43, s42 +; SI-NEXT: s_and_b32 s43, 
s37, 0xff +; SI-NEXT: s_lshl_b32 s44, s34, 8 +; SI-NEXT: s_add_i32 s31, s31, 3 +; SI-NEXT: s_or_b32 s43, s44, s43 +; SI-NEXT: s_and_b32 s44, s31, 0xff +; SI-NEXT: s_lshl_b32 s45, s94, 8 +; SI-NEXT: s_add_i32 s93, s93, 3 +; SI-NEXT: s_or_b32 s44, s45, s44 +; SI-NEXT: s_and_b32 s45, s93, 0xff +; SI-NEXT: s_lshl_b32 s46, s90, 8 +; SI-NEXT: s_add_i32 s39, s39, 3 +; SI-NEXT: s_or_b32 s45, s46, s45 +; SI-NEXT: s_and_b32 s46, s39, 0xff +; SI-NEXT: s_lshl_b32 s47, s79, 8 +; SI-NEXT: s_add_i32 s38, s38, 3 +; SI-NEXT: s_or_b32 s46, s47, s46 +; SI-NEXT: s_and_b32 s47, s38, 0xff +; SI-NEXT: s_lshl_b32 s56, s36, 8 +; SI-NEXT: s_add_i32 s35, s35, 3 +; SI-NEXT: s_or_b32 s47, s56, s47 +; SI-NEXT: s_and_b32 s56, s35, 0xff +; SI-NEXT: s_lshl_b32 s57, s30, 8 +; SI-NEXT: s_add_i32 s95, s95, 3 +; SI-NEXT: s_or_b32 s56, s57, s56 +; SI-NEXT: s_and_b32 s57, s95, 0xff +; SI-NEXT: s_lshl_b32 s58, s92, 8 +; SI-NEXT: s_add_i32 s91, s91, 3 +; SI-NEXT: s_or_b32 s57, s58, s57 +; SI-NEXT: s_and_b32 s58, s91, 0xff +; SI-NEXT: s_lshl_b32 s59, s89, 8 +; SI-NEXT: s_add_i32 s88, s88, 3 +; SI-NEXT: s_or_b32 s58, s59, s58 +; SI-NEXT: s_and_b32 s59, s88, 0xff +; SI-NEXT: s_lshl_b32 s60, s78, 8 +; SI-NEXT: s_add_i32 s77, s77, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s59, s60, s59 +; SI-NEXT: s_and_b32 s60, s77, 0xff +; SI-NEXT: s_lshl_b32 s61, s76, 8 +; SI-NEXT: s_and_b32 s28, s28, 0xff +; SI-NEXT: s_lshl_b32 s29, s29, 8 +; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: s_lshl_b32 s27, s27, 8 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_lshl_b32 s25, s25, 8 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_lshl_b32 s23, s23, 8 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 8 +; SI-NEXT: 
s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s60, s61, s60 +; SI-NEXT: s_or_b32 s28, s29, s28 +; SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_addk_i32 s40, 0x300 +; SI-NEXT: s_addk_i32 s41, 0x300 +; SI-NEXT: s_addk_i32 s42, 0x300 +; SI-NEXT: s_addk_i32 s43, 0x300 +; SI-NEXT: s_addk_i32 s44, 0x300 +; SI-NEXT: s_addk_i32 s45, 0x300 +; SI-NEXT: s_addk_i32 s46, 0x300 +; SI-NEXT: s_addk_i32 s47, 0x300 +; SI-NEXT: s_addk_i32 s56, 0x300 +; SI-NEXT: s_addk_i32 s57, 0x300 +; SI-NEXT: s_addk_i32 s58, 0x300 +; SI-NEXT: s_addk_i32 s59, 0x300 +; SI-NEXT: s_addk_i32 s60, 0x300 +; SI-NEXT: s_addk_i32 s28, 0x300 +; SI-NEXT: s_addk_i32 s26, 0x300 +; SI-NEXT: s_addk_i32 s24, 0x300 +; SI-NEXT: s_addk_i32 s22, 0x300 +; SI-NEXT: s_addk_i32 s20, 0x300 +; SI-NEXT: s_addk_i32 s18, 0x300 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s45 +; 
SI-NEXT: v_cvt_f32_f16_e32 v15, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: .LBB107_3: ; %end +; SI-NEXT: v_readlane_b32 s39, v32, 7 +; SI-NEXT: v_readlane_b32 s38, v32, 6 +; SI-NEXT: v_readlane_b32 s37, v32, 5 +; SI-NEXT: v_readlane_b32 s36, v32, 4 +; SI-NEXT: v_readlane_b32 s35, v32, 3 +; SI-NEXT: v_readlane_b32 s34, v32, 2 +; SI-NEXT: v_readlane_b32 s31, v32, 1 +; SI-NEXT: v_readlane_b32 s30, v32, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB107_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; 
SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB107_2 +; +; VI-LABEL: bitcast_v64i8_to_v32f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; 
VI-NEXT: v_mov_b32_e32 v55, v20 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v34, v12 +; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v51, v23 +; VI-NEXT: v_mov_b32_e32 v30, v26 +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v51 +; VI-NEXT: 
v_lshlrev_b32_e32 v58, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v31 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v33 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 +; VI-NEXT: s_cbranch_scc0 .LBB107_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_or_b32_sdwa v1, v34, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
+; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v49, v7 +; VI-NEXT: v_or_b32_sdwa v3, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v17, v11 +; VI-NEXT: v_mov_b32_e32 v19, v13 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v21, v15 +; VI-NEXT: s_and_b32 s4, s4, 
0xffff +; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: v_mov_b32_e32 v25, v23 +; VI-NEXT: v_mov_b32_e32 v48, v51 +; VI-NEXT: v_mov_b32_e32 v23, v26 +; VI-NEXT: v_mov_b32_e32 v26, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v34, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v30, v34 +; VI-NEXT: s_cbranch_execnz .LBB107_3 +; VI-NEXT: .LBB107_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 +; VI-NEXT: v_or_b32_sdwa v13, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v41 +; VI-NEXT: v_or_b32_sdwa v12, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 +; VI-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, 
v52 +; VI-NEXT: v_or_b32_sdwa v11, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 +; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 +; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v7, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v6, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v5, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, 
v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v2 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: .LBB107_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB107_4: +; VI-NEXT: v_mov_b32_e32 v25, v23 +; VI-NEXT: v_mov_b32_e32 v23, v26 +; VI-NEXT: v_mov_b32_e32 v26, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v51 +; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v21, v15 +; VI-NEXT: v_mov_b32_e32 v19, v13 +; VI-NEXT: v_mov_b32_e32 v17, v11 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v49, v7 +; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB107_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v32f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, 
off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v34, v30 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; 
GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v51, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v38 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v33 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: 
v_lshlrev_b32_e32 v17, 8, v48 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v49 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v16, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: v_mov_b32_e32 v55, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: s_or_b32 s7, s7, 
s8 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v42, v15 +; GFX9-NEXT: v_mov_b32_e32 v27, v25 +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: v_mov_b32_e32 v39, v26 +; GFX9-NEXT: v_mov_b32_e32 v35, v28 +; GFX9-NEXT: v_mov_b32_e32 v54, v31 +; GFX9-NEXT: v_mov_b32_e32 v31, v51 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v18, v22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v20, v24 +; GFX9-NEXT: s_cbranch_execnz .LBB107_3 +; GFX9-NEXT: .LBB107_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
GFX9-NEXT: v_add_u32_e32 v36, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v3, 
v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s10, s11, s10 
+; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v16, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v22 +; GFX9-NEXT: .LBB107_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB107_4: +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v54, v31 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_mov_b32_e32 v39, v26 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, v22 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v35, v28 +; GFX9-NEXT: v_mov_b32_e32 v37, v24 +; GFX9-NEXT: v_mov_b32_e32 v31, v51 +; GFX9-NEXT: v_mov_b32_e32 v27, v25 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v42, v15 +; GFX9-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-NEXT: v_mov_b32_e32 v55, v11 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: 
v_mov_b32_e32 v50, v3 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch .LBB107_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v36.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 
s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, 
v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v96, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v86, 16, v87 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v96, 16, v97 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-TRUE16-NEXT: .LBB107_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; 
GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, 
v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB107_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB107_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB107_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: 
scratch_load_u16 v65, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, 
v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v39 +; GFX11-FAKE16-NEXT: 
s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v31 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v5, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v6, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-FAKE16-NEXT: .LBB107_2: 
; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v29, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v27, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v25, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 
3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v55, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v54, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v53, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v33 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 
+; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v52, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v51, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v50, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v49, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v48, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v39, v4 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 +; GFX11-FAKE16-NEXT: 
v_lshl_or_b32 v13, v13, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB107_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB107_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB107_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v32bf16_to_v64i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], 
s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v31 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; 
implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v54 -; GCN-NEXT: 
v_lshrrev_b32_e32 v8, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v35 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v37 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v50 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v18 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v55 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v28 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v29 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v57, v5, v34, 16 -; GCN-NEXT: v_alignbit_b32 v56, v14, v32, 16 -; GCN-NEXT: v_alignbit_b32 v47, v11, v36, 16 -; GCN-NEXT: v_alignbit_b32 v45, v10, v7, 16 -; GCN-NEXT: v_alignbit_b32 v41, v13, v38, 16 -; GCN-NEXT: v_alignbit_b32 v53, v8, v9, 16 -; GCN-NEXT: 
v_alignbit_b32 v48, v16, v49, 16 -; GCN-NEXT: v_alignbit_b32 v39, v6, v15, 16 -; GCN-NEXT: v_alignbit_b32 v33, v21, v17, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v19, 16 -; GCN-NEXT: v_alignbit_b32 v26, v24, v20, 16 -; GCN-NEXT: v_alignbit_b32 v24, v3, v22, 16 -; GCN-NEXT: v_alignbit_b32 v21, v51, v23, 16 -; GCN-NEXT: v_alignbit_b32 v16, v2, v25, 16 -; GCN-NEXT: v_alignbit_b32 v13, v52, v27, 16 -; GCN-NEXT: v_alignbit_b32 v11, v1, v30, 16 -; GCN-NEXT: v_alignbit_b32 v5, v56, v57, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v56, v57, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v61, v56, v57, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v45, v47, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v45, v47, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v60, v45, v47, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v53, v41, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v53, v41, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v59, v53, v41, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 8, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v31 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v16 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v11 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: 
.LBB54_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v46 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v23 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v44 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v20 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v22 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v55 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v49 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v40 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v50 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v54 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v34 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v51 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v35 -; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v6 -; GCN-NEXT: 
v_add_f32_e32 v36, 0x40c00000, v8 -; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v10 -; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v11 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v13 -; GCN-NEXT: v_add_f32_e32 v39, 0x40c00000, v14 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v20 -; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v18 -; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v21 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v22 -; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v23 -; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v24 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v25 -; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 -; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v27 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v28 -; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v29 -; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v30 -; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v31 -; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v35 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v37 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 24, v24 -; GCN-NEXT: buffer_store_dword v24, 
off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v21, 24, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v16 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v13 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v37 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v35 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v13, v26, v33, 16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v11, v1, v34, 16 -; GCN-NEXT: v_alignbit_b32 v21, v27, v5, 16 -; GCN-NEXT: v_alignbit_b32 v16, v2, v36, 16 -; GCN-NEXT: v_alignbit_b32 v26, v28, v38, 16 -; GCN-NEXT: v_alignbit_b32 v24, v3, v39, 16 -; GCN-NEXT: v_alignbit_b32 v33, v29, v17, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v19, 16 -; GCN-NEXT: v_alignbit_b32 v48, v30, v18, 16 -; GCN-NEXT: v_alignbit_b32 v39, v6, v15, 16 -; GCN-NEXT: v_alignbit_b32 v41, v32, v20, 16 -; GCN-NEXT: v_alignbit_b32 v53, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v47, v49, v22, 16 -; GCN-NEXT: v_alignbit_b32 v45, v10, v7, 16 -; GCN-NEXT: v_alignbit_b32 v57, v50, v23, 16 -; GCN-NEXT: v_alignbit_b32 v56, v14, v25, 16 -; GCN-NEXT: v_alignbit_b32 v5, v56, v57, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v56, v57, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v61, v56, v57, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v45, v47, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v45, v47, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v60, v45, v47, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v53, v41, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v53, v41, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v59, v53, v41, 8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 
24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 16 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 8 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 8, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v31 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v24 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v16 -; GCN-NEXT: buffer_store_dword v7, 
off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v11 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: .LBB54_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v61 -; GCN-NEXT: v_or_b32_e32 v12, v7, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GCN-NEXT: v_or_b32_e32 v15, v7, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v60 -; GCN-NEXT: v_or_b32_e32 v17, v5, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v63 -; GCN-NEXT: v_or_b32_e32 v18, v5, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v59 -; GCN-NEXT: v_or_b32_e32 v19, v5, v7 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v5 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v5 -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v14 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v62 -; GCN-NEXT: v_or_b32_e32 v20, v5, v7 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v7 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v48 -; GCN-NEXT: buffer_load_dword 
v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GCN-NEXT: v_or_b32_e32 v22, v7, v9 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v10 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v9 -; GCN-NEXT: v_and_b32_e32 v9, 0xff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v58 -; GCN-NEXT: v_or_b32_e32 v23, v9, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v10 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v33 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GCN-NEXT: v_or_b32_e32 v25, v10, v14 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v8 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v31 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GCN-NEXT: v_or_b32_e32 v28, v8, v14 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 20, v0 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v14 
-; GCN-NEXT: v_and_b32_e32 v14, 0xff, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v26, 8, v26 -; GCN-NEXT: v_or_b32_e32 v26, v14, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v42, 0xff, v6 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 8, v24 -; GCN-NEXT: v_or_b32_e32 v24, v6, v24 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v44, 0xff, v33 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v33 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v33 -; GCN-NEXT: v_or_b32_e32 v33, v21, v33 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v46, 0xff, v4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-NEXT: v_or_b32_e32 v16, v4, v16 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_and_b32_e32 v47, 0xff, v53 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v53 -; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v53 -; GCN-NEXT: v_or_b32_e32 v53, v13, v53 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xff, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GCN-NEXT: v_or_b32_e32 v41, v3, v11 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v57, 0xff, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v43, 24, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v58, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GCN-NEXT: v_or_b32_e32 v32, v32, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v59, 0xff, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload 
-; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v45, 24, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v60, 24, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v48, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v27, v27, v48 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v29, v29, v49 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v31, v31, v51 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v39, v39, v54 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v48, v50, v42 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v49, v52, v44 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v50, v55, v46 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v51, v40, v47 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GCN-NEXT: v_or_b32_e32 v52, v43, v56 -; 
GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v37, v37, v57 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v54, v45, v58 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v41 -; GCN-NEXT: v_or_b32_e32 v35, v60, v35 -; GCN-NEXT: v_or_b32_e32 v12, v12, v36 -; GCN-NEXT: v_or_b32_e32 v15, v15, v32 -; GCN-NEXT: v_or_b32_e32 v17, v17, v34 -; GCN-NEXT: v_or_b32_e32 v18, v18, v38 -; GCN-NEXT: v_or_b32_e32 v19, v19, v27 -; GCN-NEXT: v_or_b32_e32 v20, v20, v29 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v23, v23, v39 -; GCN-NEXT: v_or_b32_e32 v25, v25, v48 -; GCN-NEXT: v_or_b32_e32 v27, v28, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v51 -; GCN-NEXT: v_or_b32_e32 v28, v33, v52 -; GCN-NEXT: v_or_b32_e32 v16, v16, v37 -; GCN-NEXT: v_or_b32_e32 v29, v53, v54 -; GCN-NEXT: v_or_b32_e32 v31, v55, v35 -; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v32bf16_to_v64i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; 
implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v26, 
1.0, v26 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v30 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB108_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v31 +; SI-NEXT: 
v_alignbit_b32 v48, v1, v38, 16 +; SI-NEXT: v_alignbit_b32 v50, v37, v35, 16 +; SI-NEXT: v_alignbit_b32 v1, v50, v48, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v50, v48, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v50, v48, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_alignbit_b32 v23, v1, v52, 16 +; SI-NEXT: v_alignbit_b32 v21, v19, v49, 16 +; SI-NEXT: v_alignbit_b32 v1, v21, v23, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v21, v23, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_alignbit_b32 v17, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v18, v16, v53, 16 +; SI-NEXT: v_alignbit_b32 v1, v18, v17, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v18, v17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v18, v17, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v34 +; SI-NEXT: v_alignbit_b32 v14, v1, v42, 16 
+; SI-NEXT: v_alignbit_b32 v15, v13, v40, 16 +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_alignbit_b32 v11, v1, v45, 16 +; SI-NEXT: v_alignbit_b32 v12, v10, v43, 16 +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_alignbit_b32 v8, v1, v47, 16 +; SI-NEXT: v_alignbit_b32 v9, v7, v24, 16 +; SI-NEXT: v_alignbit_b32 v1, v9, v8, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v9, v8, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v9, v8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_alignbit_b32 v5, v1, v56, 16 +; SI-NEXT: v_alignbit_b32 v6, v4, v28, 16 
+; SI-NEXT: v_alignbit_b32 v1, v6, v5, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v6, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v6, v5, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_alignbit_b32 v2, v1, v57, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 +; SI-NEXT: v_alignbit_b32 v3, v1, v29, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v18 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v30 +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v34 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v15 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v39 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v33 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v12 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: .LBB108_2: ; %Flow +; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB108_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_alignbit_b32 v23, v20, v19, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v32 +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36 +; SI-NEXT: v_alignbit_b32 v21, v19, v20, 16 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_alignbit_b32 v48, v30, v20, 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v50, v37, v20, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v20, v50, v48, 24 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: 
v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v41 +; SI-NEXT: v_alignbit_b32 v17, v17, v16, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v50, v48, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v50, v48, 8 +; SI-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v21, v23, 24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 +; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v21, v23, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v34 +; SI-NEXT: v_alignbit_b32 v18, v16, v18, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v21, v23, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; 
SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v13 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 24 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v11, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 +; SI-NEXT: v_alignbit_b32 v15, v13, v15, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v18, v17, 8 +; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v10 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v15, v14, 24 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v24 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v15, v14, 16 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v12, v10, v12, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v15, v14, 8 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v7 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_alignbit_b32 v20, v12, v11, 24 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v12, v11, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_alignbit_b32 v9, v7, v9, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v12, v11, 8 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v4 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v9, v8, 24 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v6, v4, v6, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v9, v8, 8 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v3, v1, v3, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v6, v5, 8 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 24 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v27 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v18 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v15 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v21 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v12 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte 
Folded Spill +; SI-NEXT: .LBB108_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v63 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v37 +; SI-NEXT: 
v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v20, v20, v24 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v59 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v60 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], 
s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v61 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; 
SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 +; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v58 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32bf16_to_v64i8: ; VI: ; %bb.0: @@ -43961,7 +87562,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_2 +; VI-NEXT: s_cbranch_execz .LBB108_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -44014,9 +87615,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[22:23], 
24, v[1:2] -; VI-NEXT: .LBB54_2: ; %Flow +; VI-NEXT: .LBB108_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_4 +; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -44357,7 +87958,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 ; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: .LBB54_4: ; %end +; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 @@ -44566,7 +88167,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_2 +; GFX9-NEXT: s_cbranch_execz .LBB108_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -44619,9 +88220,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] ; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: .LBB54_2: ; %Flow +; GFX9-NEXT: .LBB108_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_4 +; GFX9-NEXT: s_cbranch_execz .LBB108_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -44932,7 +88533,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v28 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; 
GFX9-NEXT: .LBB54_4: ; %end +; GFX9-NEXT: .LBB108_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 @@ -45118,7 +88719,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -45176,9 +88777,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v15.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v16.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v16.h -; GFX11-TRUE16-NEXT: .LBB54_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 @@ -45476,7 +89077,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7 -; GFX11-TRUE16-NEXT: .LBB54_4: ; %end +; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v28.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v113.l @@ -45689,7 +89290,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] @@ -45739,9 +89340,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 -; GFX11-FAKE16-NEXT: .LBB54_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB108_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v1 @@ -46062,7 +89663,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 8, v28 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 8, v27 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 8, v26 -; GFX11-FAKE16-NEXT: .LBB54_4: ; %end +; GFX11-FAKE16-NEXT: .LBB108_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -46232,779 +89833,4500 @@ end: ret <64 x i8> %phi } +define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v32bf16_to_v64i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, 
s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 
+; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: 
s_cbranch_scc0 .LBB109_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; SI-NEXT: v_alignbit_b32 v27, v1, v3, 16 +; SI-NEXT: v_alignbit_b32 v30, v24, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v30, v27, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v27, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v30, v27, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_alignbit_b32 v21, v1, v6, 16 +; SI-NEXT: v_alignbit_b32 v19, v17, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v19, v21, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v21, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v19, v21, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: v_alignbit_b32 v15, v1, v9, 16 +; SI-NEXT: v_alignbit_b32 v16, v13, v7, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_alignbit_b32 v10, v1, v11, 16 +; SI-NEXT: v_alignbit_b32 v11, v9, v20, 16 +; SI-NEXT: v_alignbit_b32 v1, v11, v10, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v11, v10, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_alignbit_b32 v6, v1, v28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_alignbit_b32 v3, v1, v34, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56 +; SI-NEXT: v_alignbit_b32 v2, v1, v35, 16 +; SI-NEXT: v_alignbit_b32 v8, v7, v33, 16 +; SI-NEXT: v_alignbit_b32 v4, v8, v2, 24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 +; SI-NEXT: v_alignbit_b32 v1, v1, v39, 16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 +; SI-NEXT: v_alignbit_b32 v5, v4, v32, 16 +; SI-NEXT: v_mov_b32_e32 v31, v23 +; SI-NEXT: v_alignbit_b32 v20, v18, v23, 16 +; SI-NEXT: v_alignbit_b32 v14, v12, v29, 16 +; SI-NEXT: v_alignbit_b32 v23, v5, v1, 24 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_alignbit_b32 v36, v20, v6, 24 +; SI-NEXT: v_alignbit_b32 v25, v14, v3, 24 +; SI-NEXT: v_alignbit_b32 v50, v8, v2, 16 +; SI-NEXT: v_mov_b32_e32 v53, v32 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v23, v5, v1, 16 +; SI-NEXT: v_alignbit_b32 v32, v5, v1, 8 
+; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 +; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 +; SI-NEXT: v_mov_b32_e32 v35, v29 +; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 +; SI-NEXT: v_mov_b32_e32 v37, v33 +; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v23, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 +; SI-NEXT: v_mov_b32_e32 v28, v26 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 +; SI-NEXT: v_mov_b32_e32 v29, v43 +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v43 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 +; SI-NEXT: v_mov_b32_e32 v34, v44 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 +; SI-NEXT: v_mov_b32_e32 v33, v56 +; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v56 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v48 +; SI-NEXT: v_mov_b32_e32 v48, v32 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: v_mov_b32_e32 v50, v25 +; SI-NEXT: v_mov_b32_e32 v25, v36 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; SI-NEXT: s_cbranch_execnz .LBB109_3 +; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v3 +; SI-NEXT: 
v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_alignbit_b32 v5, v4, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v45 +; SI-NEXT: v_alignbit_b32 v48, v5, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v43 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v42 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v8, v7, v3, 16 +; SI-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v32, v8, v2, 16 +; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_alignbit_b32 v15, v15, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v21, v19, v17, 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v14, v12, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v50, v14, v3, 24 +; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 +; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 
v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v17 +; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v56 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 +; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 +; SI-NEXT: v_alignbit_b32 v16, v13, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v20, v18, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v25, v20, v6, 24 +; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 +; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v10, 
v10, v9, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v59 +; SI-NEXT: v_alignbit_b32 v30, v24, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, v30, v27, 24 +; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v30, v27, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v30, v27, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v19, v21, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v19, v21, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v19, v21, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v16, v15, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v16, v15, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: 
v_add_f32_e32 v11, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47 +; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v47 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v11, v10, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v8, v2, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v5, v1, 24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v5, v1, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v45 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; 
SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v27, v36 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v35, 0xff, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v23 +; SI-NEXT: v_or_b32_e32 v33, v33, v35 +; SI-NEXT: v_or_b32_e32 v27, v27, v33 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v62 +; SI-NEXT: v_or_b32_e32 v27, v27, v30 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v22, v27, v22 +; SI-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: 
v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v46 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v41 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v17, v21, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v57 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v61 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v13, v16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v63 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v40 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v47 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v59 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v50 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v60 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v56 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 
+; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_mov_b32_e32 v53, v32 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v37, v33 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v33, v56 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v29 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v34, v44 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v31, v23 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v29, v43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v28, v26 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v23, v41 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; 
implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_branch .LBB109_2 +; +; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v63, s30, 0 +; VI-NEXT: v_writelane_b32 v63, s31, 1 +; VI-NEXT: v_writelane_b32 v63, s34, 2 +; VI-NEXT: v_writelane_b32 v63, s35, 3 +; VI-NEXT: v_writelane_b32 v63, s36, 4 +; VI-NEXT: v_writelane_b32 v63, s37, 5 +; VI-NEXT: v_writelane_b32 v63, s38, 6 +; VI-NEXT: v_writelane_b32 v63, s39, 7 +; VI-NEXT: v_writelane_b32 v63, s48, 8 +; VI-NEXT: v_writelane_b32 v63, s49, 9 +; VI-NEXT: v_writelane_b32 v63, s50, 10 +; VI-NEXT: v_writelane_b32 v63, s51, 11 +; VI-NEXT: v_writelane_b32 v63, s52, 12 +; VI-NEXT: v_writelane_b32 v63, s53, 13 +; VI-NEXT: v_writelane_b32 v63, s54, 14 +; VI-NEXT: v_writelane_b32 v63, s55, 15 +; VI-NEXT: v_writelane_b32 v63, s64, 16 +; VI-NEXT: v_writelane_b32 v63, s65, 17 +; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], 
s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s56, s5, 24 +; VI-NEXT: s_lshr_b32 s57, s5, 16 +; VI-NEXT: s_lshr_b32 s59, s5, 8 +; VI-NEXT: s_lshr_b32 s58, s4, 16 +; VI-NEXT: s_lshr_b32 s60, s4, 8 +; VI-NEXT: s_lshr_b32 s61, s29, 24 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s72, s29, 8 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s73, s28, 8 +; VI-NEXT: s_lshr_b32 s74, s27, 24 +; VI-NEXT: s_lshr_b32 s75, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s27, 8 +; VI-NEXT: s_lshr_b32 s76, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s26, 8 +; VI-NEXT: s_lshr_b32 s79, s25, 24 +; VI-NEXT: s_lshr_b32 s88, s25, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 8 +; VI-NEXT: s_lshr_b32 s89, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 8 +; VI-NEXT: s_lshr_b32 s30, s23, 24 +; VI-NEXT: s_lshr_b32 s31, s23, 16 +; VI-NEXT: s_lshr_b32 s35, s23, 8 +; VI-NEXT: s_lshr_b32 s34, s22, 16 +; VI-NEXT: s_lshr_b32 s36, s22, 8 +; VI-NEXT: s_lshr_b32 s37, s21, 24 +; VI-NEXT: s_lshr_b32 s38, s21, 16 +; VI-NEXT: s_lshr_b32 s48, s21, 8 +; VI-NEXT: s_lshr_b32 s39, s20, 16 +; VI-NEXT: s_lshr_b32 s49, s20, 8 +; VI-NEXT: s_lshr_b32 s50, s19, 24 +; VI-NEXT: s_lshr_b32 s51, s19, 16 +; VI-NEXT: s_lshr_b32 s53, s19, 8 +; VI-NEXT: s_lshr_b32 s52, s18, 16 +; VI-NEXT: s_lshr_b32 s54, s18, 8 +; VI-NEXT: s_lshr_b32 s55, s17, 24 +; VI-NEXT: s_lshr_b32 s64, s17, 16 +; VI-NEXT: s_lshr_b32 s66, s17, 8 +; VI-NEXT: s_lshr_b32 s65, s16, 16 +; VI-NEXT: s_lshr_b32 s67, s16, 8 +; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 +; 
VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: .LBB109_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_mov_b32_e32 v15, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s6, v15 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s6, v15 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s6, s16, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s6, v15 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v15 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; VI-NEXT: v_add_f32_e32 v3, s6, v15 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v3, v3 +; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s6, v15 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_lshl_b32 s6, s18, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; VI-NEXT: v_add_f32_e32 v3, s6, v15 +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s6, v15 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16 +; VI-NEXT: v_add_f32_e32 v5, s6, v15 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v6, s6, v15 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: s_lshl_b32 s6, s20, 16 +; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16 +; VI-NEXT: 
v_add_f32_e32 v5, s6, v15 +; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc +; VI-NEXT: v_add_f32_e32 v7, s6, v15 +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16 +; VI-NEXT: v_add_f32_e32 v7, s6, v15 +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s6, v15 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_lshl_b32 s6, s22, 16 +; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 +; VI-NEXT: v_add_f32_e32 v7, s6, v15 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; VI-NEXT: v_add_f32_e32 v9, s6, v15 +; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 
v11, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 +; VI-NEXT: v_add_f32_e32 v9, s6, v15 +; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; VI-NEXT: v_add_f32_e32 v10, s6, v15 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: s_lshl_b32 s6, s24, 16 +; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 +; VI-NEXT: v_add_f32_e32 v9, s6, v15 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc +; VI-NEXT: v_add_f32_e32 v11, s6, v15 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 +; VI-NEXT: v_add_f32_e32 v11, s6, v15 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 
vcc, v11, v11 +; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_add_f32_e32 v12, s6, v15 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: s_lshl_b32 s6, s26, 16 +; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 +; VI-NEXT: v_add_f32_e32 v11, s6, v15 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s6, v15 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: s_lshl_b32 s6, s29, 16 +; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 +; VI-NEXT: v_add_f32_e32 v13, s6, v15 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc +; VI-NEXT: v_add_f32_e32 v14, s6, v15 +; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, 
v14 +; VI-NEXT: s_lshl_b32 s6, s28, 16 +; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 +; VI-NEXT: v_add_f32_e32 v13, s6, v15 +; VI-NEXT: v_bfe_u32 v16, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v13 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc +; VI-NEXT: v_add_f32_e32 v16, s6, v15 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_lshl_b32 s6, s5, 16 +; VI-NEXT: v_alignbit_b32 v13, v16, v13, 16 +; VI-NEXT: v_add_f32_e32 v16, s6, v15 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_add_f32_e32 v17, s5, v15 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: s_lshl_b32 s5, s4, 16 +; VI-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; VI-NEXT: v_add_f32_e32 v17, s5, v15 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v15, s4, v15 +; VI-NEXT: v_cndmask_b32_e32 
v17, v18, v19, vcc +; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; VI-NEXT: 
v_lshrrev_b32_e32 v53, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 +; VI-NEXT: s_branch .LBB109_5 +; VI-NEXT: .LBB109_3: +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; 
implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: s_branch .LBB109_2 +; VI-NEXT: .LBB109_4: +; VI-NEXT: v_mov_b32_e32 v19, s44 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v19, s42 +; VI-NEXT: v_mov_b32_e32 v1, s16 +; VI-NEXT: v_mov_b32_e32 v2, s17 +; VI-NEXT: v_mov_b32_e32 v3, s18 +; VI-NEXT: v_mov_b32_e32 v4, s19 +; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_mov_b32_e32 v6, s21 +; VI-NEXT: v_mov_b32_e32 v7, s22 +; VI-NEXT: v_mov_b32_e32 v8, s23 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v11, s26 +; VI-NEXT: v_mov_b32_e32 v12, s27 +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: v_mov_b32_e32 v16, s5 +; VI-NEXT: v_mov_b32_e32 v18, s67 +; VI-NEXT: v_mov_b32_e32 v62, s65 +; VI-NEXT: v_mov_b32_e32 v17, s66 +; VI-NEXT: v_mov_b32_e32 v60, s64 +; VI-NEXT: v_mov_b32_e32 v61, s55 +; VI-NEXT: v_mov_b32_e32 v58, s54 +; VI-NEXT: v_mov_b32_e32 v59, s52 +; VI-NEXT: v_mov_b32_e32 v57, s53 +; VI-NEXT: v_mov_b32_e32 v47, s51 +; VI-NEXT: v_mov_b32_e32 v56, s50 +; VI-NEXT: v_mov_b32_e32 v46, s49 +; VI-NEXT: v_mov_b32_e32 v45, s39 +; VI-NEXT: v_mov_b32_e32 v44, s48 +; VI-NEXT: v_mov_b32_e32 v42, s38 +; VI-NEXT: v_mov_b32_e32 v43, s37 +; VI-NEXT: v_mov_b32_e32 v41, s36 +; VI-NEXT: v_mov_b32_e32 v40, s34 
+; VI-NEXT: v_mov_b32_e32 v55, s35 +; VI-NEXT: v_mov_b32_e32 v53, s31 +; VI-NEXT: v_mov_b32_e32 v54, s30 +; VI-NEXT: v_mov_b32_e32 v52, s91 +; VI-NEXT: v_mov_b32_e32 v51, s89 +; VI-NEXT: v_mov_b32_e32 v50, s90 +; VI-NEXT: v_mov_b32_e32 v48, s88 +; VI-NEXT: v_mov_b32_e32 v49, s79 +; VI-NEXT: v_mov_b32_e32 v39, s78 +; VI-NEXT: v_mov_b32_e32 v38, s76 +; VI-NEXT: v_mov_b32_e32 v37, s77 +; VI-NEXT: v_mov_b32_e32 v35, s75 +; VI-NEXT: v_mov_b32_e32 v36, s74 +; VI-NEXT: v_mov_b32_e32 v34, s73 +; VI-NEXT: v_mov_b32_e32 v33, s63 +; VI-NEXT: v_mov_b32_e32 v32, s72 +; VI-NEXT: v_mov_b32_e32 v30, s62 +; VI-NEXT: v_mov_b32_e32 v31, s61 +; VI-NEXT: v_mov_b32_e32 v29, s60 +; VI-NEXT: v_mov_b32_e32 v28, s58 +; VI-NEXT: v_mov_b32_e32 v27, s59 +; VI-NEXT: v_mov_b32_e32 v25, s57 +; VI-NEXT: v_mov_b32_e32 v26, s56 +; VI-NEXT: v_mov_b32_e32 v21, s12 +; VI-NEXT: v_mov_b32_e32 v22, s10 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v20, s14 +; VI-NEXT: .LBB109_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v62, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, 
v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v58 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v21 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: 
v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa 
v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v64i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte 
Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v4, s30, 0 +; GFX9-NEXT: v_writelane_b32 v4, s31, 1 +; GFX9-NEXT: v_writelane_b32 v4, s34, 2 +; GFX9-NEXT: v_writelane_b32 v4, s35, 3 +; GFX9-NEXT: v_writelane_b32 v4, s36, 4 +; GFX9-NEXT: v_writelane_b32 v4, s37, 5 +; GFX9-NEXT: v_writelane_b32 v4, s38, 6 +; GFX9-NEXT: v_writelane_b32 v4, s39, 7 +; GFX9-NEXT: v_writelane_b32 v4, s48, 8 +; GFX9-NEXT: v_writelane_b32 v4, s49, 9 +; GFX9-NEXT: v_writelane_b32 v4, s50, 10 +; GFX9-NEXT: v_writelane_b32 v4, s51, 11 +; GFX9-NEXT: v_writelane_b32 v4, s52, 12 +; GFX9-NEXT: v_writelane_b32 v4, s53, 13 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_writelane_b32 v4, s54, 14 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: v_writelane_b32 v4, s55, 15 +; GFX9-NEXT: s_cbranch_scc0 .LBB109_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s92, s5, 24 +; GFX9-NEXT: s_lshr_b32 s91, s5, 16 +; GFX9-NEXT: s_lshr_b32 s93, s5, 8 +; GFX9-NEXT: s_lshr_b32 s94, s4, 16 +; GFX9-NEXT: s_lshr_b32 s95, s4, 8 +; GFX9-NEXT: s_lshr_b32 s30, s29, 24 +; GFX9-NEXT: s_lshr_b32 s90, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s29, 8 +; GFX9-NEXT: s_lshr_b32 s31, s28, 16 +; GFX9-NEXT: s_lshr_b32 s74, s28, 8 +; GFX9-NEXT: s_lshr_b32 s34, s27, 24 +; GFX9-NEXT: s_lshr_b32 s89, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s27, 8 +; GFX9-NEXT: s_lshr_b32 s35, s26, 16 +; GFX9-NEXT: s_lshr_b32 s72, s26, 8 +; GFX9-NEXT: s_lshr_b32 s36, s25, 24 +; GFX9-NEXT: s_lshr_b32 s88, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s25, 8 +; GFX9-NEXT: s_lshr_b32 s37, s24, 16 +; GFX9-NEXT: s_lshr_b32 s62, s24, 8 +; GFX9-NEXT: s_lshr_b32 s38, s23, 24 +; GFX9-NEXT: s_lshr_b32 s79, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s23, 8 +; GFX9-NEXT: s_lshr_b32 s39, s22, 16 +; GFX9-NEXT: s_lshr_b32 s60, s22, 8 +; GFX9-NEXT: s_lshr_b32 s48, s21, 24 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s21, 
8 +; GFX9-NEXT: s_lshr_b32 s49, s20, 16 +; GFX9-NEXT: s_lshr_b32 s58, s20, 8 +; GFX9-NEXT: s_lshr_b32 s50, s19, 24 +; GFX9-NEXT: s_lshr_b32 s77, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s19, 8 +; GFX9-NEXT: s_lshr_b32 s51, s18, 16 +; GFX9-NEXT: s_lshr_b32 s56, s18, 8 +; GFX9-NEXT: s_lshr_b32 s52, s17, 24 +; GFX9-NEXT: s_lshr_b32 s76, s17, 16 +; GFX9-NEXT: s_lshr_b32 s53, s17, 8 +; GFX9-NEXT: s_lshr_b32 s54, s16, 16 +; GFX9-NEXT: s_lshr_b32 s55, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB109_3 +; GFX9-NEXT: .LBB109_2: ; %cmp.true +; GFX9-NEXT: s_and_b32 s6, s17, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s76, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s17, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s17, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s16, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: 
s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s16, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s16, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s19, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s46, s16, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s19, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s19, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s18, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, 
s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s18, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s18, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s21, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s56, s18, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s78, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s21, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s21, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s20, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s20, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: 
s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s20, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s23, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s58, s20, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s79, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s23, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s23, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s22, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s22, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 
s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s22, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s25, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s60, s22, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s88, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s25, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s25, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s24, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s24, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s24, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s27, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; 
GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s62, s24, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s89, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s27, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s27, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s26, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s26, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s29, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s72, s26, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: 
s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s90, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s29, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s29, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s28, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s6, s28, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_add_i32 s9, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s10, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_lshr_b32 s28, s6, 16 +; GFX9-NEXT: s_and_b32 s6, s5, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s74, s28, s8 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_add_f32_e32 v2, s5, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: s_lshr_b32 s91, s6, 
16 +; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_add_i32 s8, s6, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s5, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s6, s4, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_lshr_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s8, s7, 0x7fff +; GFX9-NEXT: s_or_b32 s9, s6, 0x400000 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s6, s9, s8 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_bfe_u32 s6, s4, 0x10010 +; GFX9-NEXT: s_add_i32 s6, s6, s4 +; GFX9-NEXT: s_add_i32 s9, s6, 0x7fff +; GFX9-NEXT: s_bitset1_b32 s4, 22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX9-NEXT: s_cselect_b32 s4, s4, s9 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s47, s17, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s57, s19, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s59, s21, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s61, s23, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s63, s25, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s73, s27, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s75, s29, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s31, s5, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s30, s4, s8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[30:31], 24 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[74:75], 24 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[72:73], 24 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[62:63], 24 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[60:61], 24 +; GFX9-NEXT: s_lshr_b64 s[40:41], s[58:59], 24 +; GFX9-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 +; GFX9-NEXT: s_lshr_b64 s[44:45], s[46:47], 24 +; GFX9-NEXT: s_lshr_b32 s92, s31, 24 
+; GFX9-NEXT: s_lshr_b32 s93, s31, 8 +; GFX9-NEXT: s_lshr_b32 s94, s30, 16 +; GFX9-NEXT: s_lshr_b32 s95, s30, 8 +; GFX9-NEXT: s_lshr_b32 s30, s75, 24 +; GFX9-NEXT: s_lshr_b32 s75, s75, 8 +; GFX9-NEXT: s_lshr_b32 s31, s74, 16 +; GFX9-NEXT: s_lshr_b32 s74, s74, 8 +; GFX9-NEXT: s_lshr_b32 s34, s73, 24 +; GFX9-NEXT: s_lshr_b32 s73, s73, 8 +; GFX9-NEXT: s_lshr_b32 s35, s72, 16 +; GFX9-NEXT: s_lshr_b32 s72, s72, 8 +; GFX9-NEXT: s_lshr_b32 s36, s63, 24 +; GFX9-NEXT: s_lshr_b32 s63, s63, 8 +; GFX9-NEXT: s_lshr_b32 s37, s62, 16 +; GFX9-NEXT: s_lshr_b32 s62, s62, 8 +; GFX9-NEXT: s_lshr_b32 s38, s61, 24 +; GFX9-NEXT: s_lshr_b32 s61, s61, 8 +; GFX9-NEXT: s_lshr_b32 s39, s60, 16 +; GFX9-NEXT: s_lshr_b32 s60, s60, 8 +; GFX9-NEXT: s_lshr_b32 s48, s59, 24 +; GFX9-NEXT: s_lshr_b32 s59, s59, 8 +; GFX9-NEXT: s_lshr_b32 s49, s58, 16 +; GFX9-NEXT: s_lshr_b32 s58, s58, 8 +; GFX9-NEXT: s_lshr_b32 s50, s57, 24 +; GFX9-NEXT: s_lshr_b32 s57, s57, 8 +; GFX9-NEXT: s_lshr_b32 s51, s56, 16 +; GFX9-NEXT: s_lshr_b32 s56, s56, 8 +; GFX9-NEXT: s_lshr_b32 s52, s47, 24 +; GFX9-NEXT: s_lshr_b32 s53, s47, 8 +; GFX9-NEXT: s_lshr_b32 s54, s46, 16 +; GFX9-NEXT: s_lshr_b32 s55, s46, 8 +; GFX9-NEXT: .LBB109_3: ; %end +; GFX9-NEXT: s_and_b32 s7, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s55, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s44, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s17, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s53, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s76, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s52, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 
s7, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s56, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s51, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s42, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s19, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s57, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s50, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s58, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s40, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s21, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s59, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s78, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s48, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s60, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s14, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: 
s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s61, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s79, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s38, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s62, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s37, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s12, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s25, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s63, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s88, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s36, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s11 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s72, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s35, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s27, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s73, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s89, 0xff +; GFX9-NEXT: 
s_lshl_b32 s10, s34, 8 +; GFX9-NEXT: s_or_b32 s9, s9, s10 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s9, s9, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s74, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s9 +; GFX9-NEXT: s_and_b32 s9, s31, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s7, s29, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s75, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s90, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s30, 8 +; GFX9-NEXT: s_or_b32 s8, s8, s9 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s95, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s7 +; GFX9-NEXT: s_and_b32 s7, s94, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s93, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s91, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s92, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_readlane_b32 s55, v4, 15 +; GFX9-NEXT: v_readlane_b32 s54, v4, 14 +; GFX9-NEXT: v_readlane_b32 s53, v4, 13 +; GFX9-NEXT: v_readlane_b32 s52, v4, 12 +; GFX9-NEXT: v_readlane_b32 s51, v4, 11 +; GFX9-NEXT: v_readlane_b32 s50, v4, 10 +; GFX9-NEXT: v_readlane_b32 s49, v4, 9 +; GFX9-NEXT: v_readlane_b32 s48, v4, 8 +; GFX9-NEXT: v_readlane_b32 s39, v4, 7 +; GFX9-NEXT: v_readlane_b32 s38, v4, 6 +; GFX9-NEXT: v_readlane_b32 s37, v4, 5 +; GFX9-NEXT: v_readlane_b32 s36, v4, 4 +; GFX9-NEXT: v_readlane_b32 s35, v4, 3 +; GFX9-NEXT: v_readlane_b32 s34, v4, 2 +; GFX9-NEXT: v_readlane_b32 s31, v4, 1 +; GFX9-NEXT: v_readlane_b32 s30, v4, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB109_4: +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; 
implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: s_branch .LBB109_2 +; +; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v64i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v17, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s30, 0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s49, 9 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s27, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s44, s26, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s25, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s24, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s23, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s23, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s22, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s21, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s21, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s20, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s19, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s19, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s18, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s17, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s17, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s16, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s3, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s3, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], s[26:27], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 
s[28:29], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_3 +; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_and_b32 s4, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s4, 0x10010 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s4, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s4, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s1, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s5, s1 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s1, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s1, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s0, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s1, s46 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s4, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-TRUE16-NEXT: 
s_bitset1_b32 s4, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s4, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s0, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s0 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s0, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s3, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s0, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s5 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s5, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s3, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s6, s3 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s3, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff +; 
GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s5 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s2, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, s2 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s2, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s17, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: 
s_lshl_b32 s7, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s6, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s8, s7, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s8, s7 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s17, s6, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s17, s56 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s8, s7, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s8, s7 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s8, s7, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, s7 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s19, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s16, s7, 16 
+; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s16, s6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s9, s8 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s7, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s9, s8 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s19, s7, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s31, s19, s57 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s9, s8 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: 
v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, s8 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s18, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s30, s18, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s10, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s10, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s20, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s21, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s93, s21, s58 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s10, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s20, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s92, s20, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s23, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s89, s23, s59 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s25, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s22, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s22, s12 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s25, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s25, s60 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; 
GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s27, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s24, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s24, s13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s8, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s11, s9, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s11, s9 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s11, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s26, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s27, s11, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s2, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s3, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 vcc_hi, s27, 
s61 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s5, s4, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s4, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s4, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX11-TRUE16-NEXT: s_add_i32 s4, s6, s5 +; GFX11-TRUE16-NEXT: s_bitset1_b32 s5, 22 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cselect_b32 s26, s5, s4 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[76:77], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s26, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[88:89], 24 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 vcc_lo, s26, s49 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[10:11], s[92:93], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[12:13], s[30:31], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[14:15], s[44:45], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[28:29], s[8:9], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[40:41], s[42:43], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, vcc_hi, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, vcc_hi, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s77, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s77, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s76, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s76, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s89, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s89, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, 
s88, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s88, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s93, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s93, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s92, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s92, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s31, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s31, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s94, s30, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s95, s30, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s30, s45, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s45, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s31, s44, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s44, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s9, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s9, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s8, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s43, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s43, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s42, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[8:9], vcc, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, vcc_lo, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, vcc_lo, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s42, 8 +; GFX11-TRUE16-NEXT: .LBB109_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s49 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s40 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s48 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s46 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s39 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 
s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s38 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s37 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s28 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s36 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s47 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s35 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s31 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s14 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s45 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s56 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s30 +; 
GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s95 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s94 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s12 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s93 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s57 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s92 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s91 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s90 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: 
s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s89 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s58 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s88 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s79 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s78 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s59 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s76 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s0 :: 
v_dual_mov_b32 v10, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s75 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s73 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s60 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s72 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s44 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s63 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s61 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s62 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; 
GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v17, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v17, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v17, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v17, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v17, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v17, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v17, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v17, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v17, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v17, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v17, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB109_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr28_lo16 +; GFX11-TRUE16-NEXT: 
; implicit-def: $sgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: s_branch 
.LBB109_2 +; +; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v64i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v17, s32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s30, 0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s50, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v17, s51, 10 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s27, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s25, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s24, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s23, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s23, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s22, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s21, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s21, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s20, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 
s92, s19, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s19, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s18, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s17, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s17, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s16, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s34, s3, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s3, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[22:23], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[18:19], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_3 +; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_and_b32 s4, s1, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s4, 0x10010 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s4, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 
0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s4, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s1, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s5, s1 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s1, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s1, s4 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s0, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s43, s1, s46 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s4, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s4, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s0, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s0 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s0, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: 
s_cselect_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s3, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s42, s0, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, s5 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s5, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s5, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s3, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s6, s3 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s3, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s2, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, s5 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s5, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s2, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, s2 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s7, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s2, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s17, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s7, s6, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, s6 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s6, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s6, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s8, s7, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s8, s7 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s17, s6, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: 
s_pack_ll_b32_b16 s45, s17, s56 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s8, s7, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s8, s7 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s8, s7, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, s7 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s7, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s19, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s16, s7, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s44, s16, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s9, s8 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s7, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s9, s8 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s19, s7, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s31, s19, s57 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s9, s8 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s9, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s9, s8, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, s8 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s8, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s21, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s18, s8, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s30, s18, s7 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s10, s9 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s8, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s10, s9 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s20, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s21, s8, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s93, s21, s58 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s10, s9 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; 
GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s10, s9, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, s9 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s9, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s23, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s20, s9, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s92, s20, s8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s10, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s11, s10 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s10, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s9, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s10, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s11, s10 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s10, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s23, s9, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s89, s23, s59 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s10, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s11, s10 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s10, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s11, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s9, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s11, s10, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, s10 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s10, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s11, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s25, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s22, s10, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s88, s22, s9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s11, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s12, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s10, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; 
GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s11, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s12, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s25, s10, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s77, s25, s60 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s11, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s12, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s10, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s11, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s12, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s27, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s24, s10, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s76, s24, s13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s11, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s12, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s10, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s12, s11, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s12, s11 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s11, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s12, s11, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s26, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshr_b32 s27, s12, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s2, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s3, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s51, s27, s61 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s5, s4, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s4 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s4, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s4, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, 
s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s6, s5 +; GFX11-FAKE16-NEXT: s_bitset1_b32 s5, 22 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x7fff +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cselect_b32 s26, s5, s4 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[76:77], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s26, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[88:89], 24 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s50, s26, s48 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[8:9], s[92:93], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[12:13], s[30:31], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[14:15], s[44:45], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[28:29], s[10:11], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[40:41], s[42:43], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s51, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s51, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s77, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s77, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s76, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s76, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s89, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s89, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s88, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s88, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s93, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s93, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s92, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s92, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s31, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s31, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s94, s30, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s95, s30, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 vcc_hi, s45, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s45, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s30, s44, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s31, s44, 8 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s34, s11, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s35, s11, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s36, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s37, s10, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s38, s43, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s39, s43, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s48, s42, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[10:11], s[50:51], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s50, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s50, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s42, 8 +; GFX11-FAKE16-NEXT: .LBB109_3: ; %end +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s42, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s48, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s40, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s39, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s46, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s38, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s37, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s36, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s28, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s35, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s47, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s34, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s3, s3, 0xffff +; 
GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s31, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s30, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s14, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s17, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s45, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s56, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, vcc_hi, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s95, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s94, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s12, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s19, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s93, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s57, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s92, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s9, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; 
GFX11-FAKE16-NEXT: s_and_b32 s0, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s91, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s90, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s8, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s21, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s89, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s58, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s88, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s79, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s78, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s23, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s77, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s59, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s76, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s75, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s74, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s4, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; 
GFX11-FAKE16-NEXT: s_and_b32 s2, s25, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s73, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s60, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s72, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s44, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s43, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s10, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s27, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s63, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s61, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s62, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s2 :: v_dual_mov_b32 v16, s3 +; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v17, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v17, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v17, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v17, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, 
v17, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v17, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v17, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v17, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v17, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v17, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v17, 0 +; GFX11-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v17, off, s32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB109_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vcc_hi +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr95 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr94 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: s_branch .LBB109_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v64i8_to_v32bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:76 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v7 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v13 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; 
GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v19 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v21 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v23 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v27 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v32 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v30 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte 
Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v26 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v24 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v10 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v17 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v13 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v30, 8, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v7 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr32 -; 
GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v3, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v12 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v4, v2 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v5, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v5, v2 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_and_b32_e32 v13, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v31, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v33, 0xff, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v61 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v60 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v56 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v44 -; GCN-NEXT: v_and_b32_e32 
v37, 0xff, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v45 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v63 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v46 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v58 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v57 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 24, v17 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v34 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v35, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v38 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v39, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v50 -; GCN-NEXT: v_or_b32_e32 v26, v51, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v52 -; GCN-NEXT: v_or_b32_e32 
v30, v53, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v54 -; GCN-NEXT: v_or_b32_e32 v43, v5, v55 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v35, v5, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v3, v41 -; GCN-NEXT: v_or_b32_e32 v33, v0, v21 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v0, v22 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v0, v7 -; GCN-NEXT: v_or_b32_e32 v51, v1, v19 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v55, v0, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v11 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v0, v13 -; GCN-NEXT: v_or_b32_e32 v32, v4, v15 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v0, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v0, v14 -; GCN-NEXT: v_or_b32_e32 v36, v8, v16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v37, v0, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v23 
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v0, v27 -; GCN-NEXT: v_or_b32_e32 v48, v9, v28 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v0, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v31 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v0, v42 -; GCN-NEXT: v_or_b32_e32 v52, v2, v44 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v53, v0, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v26 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v0, v25 -; GCN-NEXT: v_or_b32_e32 v40, v10, v24 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v0, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v30 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v0, v47 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; 
GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; 
GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: .LBB55_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v14 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v30, v3 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v17 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v5, v7, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v26, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v58 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: s_movk_i32 s7, 0x300 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v18 -; GCN-NEXT: s_mov_b32 s6, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v6 -; 
GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v45 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v56 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v61 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v9 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v34 -; GCN-NEXT: v_and_b32_e32 v34, 0xff, v35 -; GCN-NEXT: v_and_b32_e32 v35, 0xff, v36 -; GCN-NEXT: v_and_b32_e32 v36, 0xff, v38 -; GCN-NEXT: v_and_b32_e32 v38, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v39, 0xff, v17 -; GCN-NEXT: v_and_b32_e32 v48, 0xff, v18 -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v19 -; GCN-NEXT: v_and_b32_e32 v50, 0xff, v21 -; GCN-NEXT: v_and_b32_e32 v51, 0xff, v22 -; GCN-NEXT: v_and_b32_e32 v52, 0xff, v23 -; GCN-NEXT: v_and_b32_e32 v53, 0xff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GCN-NEXT: v_and_b32_e32 v54, 0xff, v20 -; GCN-NEXT: v_and_b32_e32 v15, 0xff, v29 -; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xff, v30 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v31 -; GCN-NEXT: v_and_b32_e32 v19, 0xff, v32 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v21, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v23, 0xff, v33 -; GCN-NEXT: v_and_b32_e32 v24, 0xff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 
v29, v29, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v39 -; GCN-NEXT: v_or_b32_e32 v14, v14, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v49 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v32, v32, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v51 -; GCN-NEXT: v_or_b32_e32 v13, v13, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v53 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v35, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v11, v11, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v2, v2, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v17, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; GCN-NEXT: v_or_b32_e32 v8, v8, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GCN-NEXT: v_or_b32_e32 v0, v0, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v24 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v22, v9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v22, v6 -; GCN-NEXT: 
v_add_i32_e32 v7, vcc, s7, v7 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v22, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v22, v1 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s7, v29 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v30 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v31 -; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v32 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v30, v30, v33 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v31, v31, v34 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v32, v26 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v32, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, s7, v28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v32, v15 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v16, v32, v16 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v32, v17 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v32, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v3, v9, v3 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_or_b32_e32 v4, v4, v7 -; GCN-NEXT: v_or_b32_e32 v1, v1, v10 -; GCN-NEXT: v_or_b32_e32 v6, v23, v22 -; GCN-NEXT: v_or_b32_e32 v7, v24, v14 -; GCN-NEXT: v_or_b32_e32 v9, v30, v29 -; GCN-NEXT: v_or_b32_e32 v10, v31, v13 -; GCN-NEXT: v_or_b32_e32 v13, v26, v25 -; GCN-NEXT: v_or_b32_e32 v11, v27, v11 -; GCN-NEXT: v_or_b32_e32 v14, v15, v28 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; 
GCN-NEXT: v_or_b32_e32 v12, v17, v12 -; GCN-NEXT: v_or_b32_e32 v8, v18, v8 -; GCN-NEXT: v_or_b32_e32 v15, v20, v19 -; GCN-NEXT: v_or_b32_e32 v0, v21, v0 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v11 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v20 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v19 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v10 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v9 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v18 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 -; 
GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v1 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v17 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v16 -; GCN-NEXT: .LBB55_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v43 -; GCN-NEXT: v_mov_b32_e32 v1, v35 -; GCN-NEXT: v_mov_b32_e32 v2, v49 -; GCN-NEXT: v_mov_b32_e32 v4, v33 -; GCN-NEXT: v_mov_b32_e32 v6, v39 -; GCN-NEXT: v_mov_b32_e32 v8, v51 -; GCN-NEXT: v_mov_b32_e32 v9, v55 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mov_b32_e32 v10, v41 -; GCN-NEXT: v_mov_b32_e32 v12, v32 -; GCN-NEXT: v_mov_b32_e32 v14, v34 -; GCN-NEXT: v_mov_b32_e32 v16, v36 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mov_b32_e32 v17, v37 -; GCN-NEXT: v_mov_b32_e32 v18, v38 -; GCN-NEXT: v_mov_b32_e32 v20, v48 -; GCN-NEXT: v_mov_b32_e32 v22, v50 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mov_b32_e32 v24, v52 -; GCN-NEXT: v_mov_b32_e32 v25, v53 -; GCN-NEXT: v_mov_b32_e32 v26, v54 -; GCN-NEXT: v_mov_b32_e32 v28, v40 -; GCN-NEXT: v_mov_b32_e32 v30, v42 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v64i8_to_v32bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword 
v12, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v27 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 
v18, 24, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v25 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v31 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v34 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v3 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], 
s32 offset:60 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: 
$vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v3, v5 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 
; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_or_b32_e32 v33, v7, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v63 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v51, v13, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v55, v13, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v11, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v32, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v13, v10, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v15, v14, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v36, v1, v0 +; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v37, v26, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v19, v18, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v48, v1, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v57 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v21, v30, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v23, v25, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v52, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v53, v16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v27, v45, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 24, v49 +; SI-NEXT: v_or_b32_e32 v40, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v29, v56, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v31, v61, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: 
killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: .LBB110_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v28 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: 
v_lshlrev_b32_e32 v7, 8, v49 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v9 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v57 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v16, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v3, v12, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v25, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, 
v62 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v28 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v20 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v17 +; SI-NEXT: v_or_b32_e32 v3, v8, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v18, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v8 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v26, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v14, v3 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v0 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_or_b32_e32 v3, v63, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v13, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v3, v13, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v10 +; SI-NEXT: .LBB110_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v2, v43 +; SI-NEXT: v_mov_b32_e32 v10, v41 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v28, v40 +; SI-NEXT: v_mov_b32_e32 v30, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v4, v33 +; SI-NEXT: v_mov_b32_e32 v6, v39 +; SI-NEXT: v_mov_b32_e32 v8, v51 +; SI-NEXT: v_mov_b32_e32 v9, v55 +; SI-NEXT: v_mov_b32_e32 v12, v32 +; SI-NEXT: v_mov_b32_e32 v14, v34 +; SI-NEXT: v_mov_b32_e32 v16, v36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v17, v37 +; SI-NEXT: 
v_mov_b32_e32 v18, v38 +; SI-NEXT: v_mov_b32_e32 v20, v48 +; SI-NEXT: v_mov_b32_e32 v22, v50 +; SI-NEXT: v_mov_b32_e32 v24, v52 +; SI-NEXT: v_mov_b32_e32 v25, v53 +; SI-NEXT: v_mov_b32_e32 v26, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i8_to_v32bf16: ; VI: ; %bb.0: @@ -47118,7 +94440,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB55_2 +; VI-NEXT: s_cbranch_execz .LBB110_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -47281,9 +94603,9 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: .LBB55_2: ; %Flow +; VI-NEXT: .LBB110_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB55_4 +; VI-NEXT: s_cbranch_execz .LBB110_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v38 @@ -47438,7 +94760,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: v_add_u16_e32 v3, 0x300, v23 ; VI-NEXT: v_or_b32_e32 v3, v3, v19 -; VI-NEXT: .LBB55_4: ; %end +; VI-NEXT: .LBB110_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -47582,7 +94904,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB55_2 +; GFX9-NEXT: s_cbranch_execz .LBB110_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload @@ -47747,9 +95069,9 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: .LBB55_2: ; %Flow +; GFX9-NEXT: .LBB110_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB55_4 +; GFX9-NEXT: s_cbranch_execz .LBB110_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 @@ -47902,7 +95224,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 -; GFX9-NEXT: .LBB55_4: ; %end +; GFX9-NEXT: .LBB110_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -48033,15 +95355,15 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: 
s_cbranch_execnz .LBB55_4 -; GFX11-TRUE16-NEXT: .LBB55_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-TRUE16-NEXT: .LBB110_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.l @@ -48172,8 +95494,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-TRUE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3 @@ -48403,15 +95725,15 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-FAKE16-NEXT: .LBB55_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-FAKE16-NEXT: .LBB110_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 @@ -48558,8 +95880,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-FAKE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-FAKE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v70, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v67, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v69, 3 @@ -48728,3 +96050,2004 @@ end: %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <32 x bfloat> %phi } + +define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i8_to_v32bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: 
buffer_load_dword v54, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: v_readfirstlane_b32 s46, v30 +; SI-NEXT: v_readfirstlane_b32 s44, v23 +; SI-NEXT: v_readfirstlane_b32 s45, v22 +; SI-NEXT: v_readfirstlane_b32 s41, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s58, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v32 +; SI-NEXT: v_readfirstlane_b32 s56, v33 +; SI-NEXT: v_readfirstlane_b32 s57, v34 +; SI-NEXT: v_readfirstlane_b32 s47, v35 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v37 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v38 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v39 +; 
SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v48 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v45 +; SI-NEXT: s_cbranch_scc0 .LBB111_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s8, s5, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s9, s5, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s11, s4, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_or_b32 s13, s5, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s25, 24 +; SI-NEXT: s_or_b32 s14, s5, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_or_b32 s15, s5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s40, s4, 16 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_or_b32 s42, s5, s4 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s60, s4, 16 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s41, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 +; SI-NEXT: s_lshl_b32 s61, s4, 16 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v19 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: 
v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v51 +; SI-NEXT: v_or_b32_e32 v37, v13, v9 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v39, v21, v17 +; SI-NEXT: s_lshl_b32 s62, s4, 16 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v24 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: v_or_b32_e32 v32, v29, v25 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v38, v1, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v33, v14, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v26 +; SI-NEXT: s_lshl_b32 s63, s4, 16 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: v_or_b32_e32 v34, v42, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v54 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v48, v15, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v36, v23, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v49 +; SI-NEXT: s_lshl_b32 s72, s4, 16 +; SI-NEXT: v_or_b32_e32 v35, v31, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v53 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, 
v17 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v9, v0, v9 +; SI-NEXT: v_or_b32_e32 v13, v5, v13 +; SI-NEXT: v_or_b32_e32 v15, v6, v15 +; SI-NEXT: v_or_b32_e32 v17, v7, v17 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v23, v30, v23 +; SI-NEXT: v_or_b32_e32 v25, v41, v25 +; SI-NEXT: v_or_b32_e32 v29, v44, v29 +; SI-NEXT: s_lshl_b32 s73, s4, 16 +; SI-NEXT: v_or_b32_e32 v31, v45, v31 +; SI-NEXT: s_cbranch_execnz .LBB111_4 +; SI-NEXT: .LBB111_2: ; %cmp.true +; SI-NEXT: s_add_i32 s59, s59, 3 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v43 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v9, v45, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: v_add_i32_e32 v43, vcc, 0x3000000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v54 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v13, v44, v13 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: s_add_i32 s57, s57, 3 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v9 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v40 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 
s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v9, v42, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v51 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: v_add_i32_e32 v32, vcc, 0x3000000, v9 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v50 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v9, v30, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v27 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v13, v22, v13 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v9 +; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v9, v14, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: 
v_add_i32_e32 v21, vcc, 0x3000000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v19 +; SI-NEXT: v_or_b32_e32 v9, v13, v9 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v13 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s41, 8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s27, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; 
SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s22, 0xff +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v12 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_lshl_b32 s8, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s9, s18, 0xff +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s8, s19, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v5, v5, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; 
SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s7, 16 +; SI-NEXT: s_and_b32 s13, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s6, 16 +; SI-NEXT: s_and_b32 s15, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s5, 16 +; SI-NEXT: s_and_b32 s42, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s4, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v0 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v43 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 +; SI-NEXT: s_branch .LBB111_5 +; SI-NEXT: .LBB111_3: +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr9 +; 
SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB111_2 +; SI-NEXT: .LBB111_4: +; SI-NEXT: v_mov_b32_e32 v10, s60 +; SI-NEXT: v_mov_b32_e32 v14, s61 +; SI-NEXT: v_mov_b32_e32 v18, s62 +; SI-NEXT: v_mov_b32_e32 v22, s63 +; SI-NEXT: v_mov_b32_e32 v26, s72 +; SI-NEXT: v_mov_b32_e32 v30, s73 +; SI-NEXT: .LBB111_5: ; %end +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_mov_b32_e32 v3, s13 +; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s40 +; SI-NEXT: v_mov_b32_e32 v7, s42 +; SI-NEXT: v_mov_b32_e32 v8, v37 +; SI-NEXT: v_mov_b32_e32 v11, v38 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v16, 
v39 +; SI-NEXT: v_mov_b32_e32 v19, v33 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: v_mov_b32_e32 v24, v32 +; SI-NEXT: v_mov_b32_e32 v27, v34 +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i8_to_v32bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v55, v20 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v34, v12 +; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v51, v23 +; VI-NEXT: v_mov_b32_e32 v30, v26 +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v51 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v31 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v33 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v35 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 +; VI-NEXT: s_cbranch_scc0 .LBB111_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_or_b32_sdwa v1, v34, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v13 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v49, v7 +; VI-NEXT: v_or_b32_sdwa v3, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v26, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v17, v11 +; VI-NEXT: v_mov_b32_e32 v19, v13 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v21, v15 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; 
VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: v_mov_b32_e32 v25, v23 +; VI-NEXT: v_mov_b32_e32 v48, v51 +; VI-NEXT: v_mov_b32_e32 v23, v26 +; VI-NEXT: v_mov_b32_e32 v26, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v34, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff 
+; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v30, v34 +; VI-NEXT: s_cbranch_execnz .LBB111_3 +; VI-NEXT: .LBB111_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 +; VI-NEXT: v_or_b32_sdwa v13, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 +; VI-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v41 +; VI-NEXT: v_or_b32_sdwa v12, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 +; VI-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v11, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: 
v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 +; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 +; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 +; VI-NEXT: v_or_b32_sdwa v7, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v36 +; VI-NEXT: v_or_b32_sdwa v6, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v3 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v5, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 
s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, 
0x3000000, v11 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v2 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 +; VI-NEXT: .LBB111_3: ; %end +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB111_4: +; VI-NEXT: v_mov_b32_e32 v25, v23 +; VI-NEXT: v_mov_b32_e32 v23, v26 +; VI-NEXT: v_mov_b32_e32 v26, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v51 +; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_mov_b32_e32 v36, v34 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v21, v15 +; VI-NEXT: v_mov_b32_e32 v19, v13 +; VI-NEXT: v_mov_b32_e32 v17, v11 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v49, v7 +; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_branch .LBB111_2 +; +; GFX9-LABEL: bitcast_v64i8_to_v32bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, 
s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v34, v30 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 
offset:56 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v51, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v51 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v38 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v33 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, 
v49 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: 
v_or_b32_sdwa v1, v12, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v16, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: v_mov_b32_e32 v55, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v42, v15 +; GFX9-NEXT: 
v_mov_b32_e32 v27, v25 +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: v_mov_b32_e32 v39, v26 +; GFX9-NEXT: v_mov_b32_e32 v35, v28 +; GFX9-NEXT: v_mov_b32_e32 v54, v31 +; GFX9-NEXT: v_mov_b32_e32 v31, v51 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v18, v22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v20, v24 +; GFX9-NEXT: s_cbranch_execnz .LBB111_3 +; GFX9-NEXT: .LBB111_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 
+; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, 
v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 +; GFX9-NEXT: s_or_b32 s9, s10, s9 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_or_b32 s10, s11, s10 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 
0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_addk_i32 s9, 0x300 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v16, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: 
v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v22 +; GFX9-NEXT: .LBB111_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB111_4: +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v54, v31 +; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_mov_b32_e32 v39, v26 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v16, v22 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v35, v28 +; GFX9-NEXT: v_mov_b32_e32 v37, v24 +; GFX9-NEXT: v_mov_b32_e32 v31, v51 +; GFX9-NEXT: v_mov_b32_e32 v27, v25 +; GFX9-NEXT: v_mov_b32_e32 v23, v21 +; GFX9-NEXT: v_mov_b32_e32 v42, v15 +; GFX9-NEXT: v_mov_b32_e32 v19, v13 +; GFX9-NEXT: v_mov_b32_e32 v55, v11 +; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_branch 
.LBB111_2 +; +; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32bf16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v96, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v3, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v86, 16, v87 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v96, 16, v97 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-TRUE16-NEXT: .LBB111_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-TRUE16-NEXT: s_add_i32 
s20, s20, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83 +; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 
3, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-TRUE16-NEXT: .LBB111_3: ; %end +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB111_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-TRUE16-NEXT: s_branch .LBB111_2 +; +; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32bf16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2 +; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 +; 
GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v15 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v19 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: 
v_lshlrev_b32_e32 v70, 8, v8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v39 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v31 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v36 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v5, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v6, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 
v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-FAKE16-NEXT: .LBB111_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v6, 3, v30 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v29, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v24 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v27, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v25, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v10, 0x300, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v55, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v54, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v53, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v33 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v31 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v37 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 
0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v52, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v51, v4 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10 +; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2 +; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v50, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v49, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80 +; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300 +; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v48, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v39, v4 +; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-FAKE16-NEXT: .LBB111_3: ; %end +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB111_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-FAKE16-NEXT: s_branch .LBB111_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 8ae7b58330256..dda05a8897979 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -1,42 +1,42 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <18 x float> @bitcast_v18i32_to_v18f32(<18 x i32> %a, i32 %b) { -; GCN-LABEL: 
bitcast_v18i32_to_v18f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i32_to_v18f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 
+; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i32_to_v18f32: ; VI: ; %bb.0: @@ -148,37 +148,251 @@ end: ret <18 x float> %phi } +define inreg <18 x float> @bitcast_v18i32_to_v18f32_scalar(<18 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i32_to_v18f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, 
v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v18i32_to_v18f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: 
v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v18i32_to_v18f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: 
v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v18i32_to_v18f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s29, s29, 3 +; GFX11-NEXT: s_add_i32 s28, s28, 3 +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i32> %a, splat (i32 3) + %a2 = bitcast <18 x i32> %a1 to <18 x float> + br label %end + +cmp.false: + %a3 = bitcast <18 x i32> %a to <18 x float> + br label %end + +end: + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f32_to_v18i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f32_to_v18i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: 
s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f32_to_v18i32: ; VI: ; %bb.0: @@ -187,7 +401,7 @@ define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -207,7 +421,7 @@ define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -218,7 +432,7 @@ define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: 
v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -238,7 +452,7 @@ define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -250,7 +464,7 @@ define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 @@ -261,7 +475,7 @@ define <18 x i32> @bitcast_v18f32_to_v18i32(<18 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -281,37 +495,304 @@ end: ret <18 x i32> %phi } +define inreg <18 x i32> @bitcast_v18f32_to_v18i32_scalar(<18 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f32_to_v18i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: 
v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v18f32_to_v18i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; 
VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v18f32_to_v18i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: 
v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v18f32_to_v18i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s47, s23 +; GFX11-NEXT: s_mov_b32 s46, s22 +; GFX11-NEXT: s_mov_b32 s45, s21 +; GFX11-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-NEXT: s_mov_b32 s44, s20 +; GFX11-NEXT: s_mov_b32 s43, s19 +; GFX11-NEXT: s_mov_b32 s42, s18 +; GFX11-NEXT: s_mov_b32 s41, s17 +; GFX11-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-NEXT: s_mov_b32 s40, s16 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s37, s1 +; GFX11-NEXT: s_mov_b32 s36, s0 +; GFX11-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; 
GFX11-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-NEXT: s_mov_b32 s48, s24 +; GFX11-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-NEXT: s_mov_b32 s49, s25 +; GFX11-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-NEXT: s_mov_b32 s50, s26 +; GFX11-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-NEXT: s_mov_b32 s51, s27 +; GFX11-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-NEXT: s_mov_b32 s52, s28 +; GFX11-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-NEXT: s_mov_b32 s53, s29 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v17, s53, 1.0 +; GFX11-NEXT: v_add_f32_e64 v16, s52, 1.0 +; GFX11-NEXT: v_add_f32_e64 v15, s51, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s50, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s49, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s48, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s47, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s46, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s45, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s44, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s43, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s42, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s41, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s40, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s39, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s38, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s37, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s36, 1.0 +; GFX11-NEXT: s_branch .LBB3_5 +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-NEXT: 
v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-NEXT: .LBB3_5: ; %end +; GFX11-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <18 x float> %a1 to <18 x i32> + br label %end + +cmp.false: + %a3 = bitcast <18 x float> %a to <18 x i32> + br label %end + +end: + %phi = phi <18 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i32> %phi +} + define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i32_to_v9i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: 
s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i32_to_v9i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; 
SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i32_to_v9i64: ; VI: ; %bb.0: @@ -320,7 +801,7 @@ define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 @@ -340,7 +821,7 @@ define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -351,7 +832,7 @@ define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 ; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 @@ -371,7 +852,7 @@ define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -383,7 +864,7 @@ define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; 
%bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 @@ -403,7 +884,7 @@ define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -423,37 +904,251 @@ end: ret <9 x i64> %phi } +define inreg <9 x i64> @bitcast_v18i32_to_v9i64_scalar(<18 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i32_to_v9i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; 
SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v18i32_to_v9i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 
v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v18i32_to_v9i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; 
GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v18i32_to_v9i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s29, s29, 3 +; GFX11-NEXT: s_add_i32 s28, s28, 3 +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; 
GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i32> %a, splat (i32 3) + %a2 = bitcast <18 x i32> %a1 to <9 x i64> + br label %end + +cmp.false: + %a3 = bitcast <18 x i32> %a to <9 x i64> + br label %end + +end: + %phi = phi <9 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i64> %phi +} + define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i64_to_v18i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i64_to_v18i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i64_to_v18i32: ; VI: ; %bb.0: @@ -462,7 +1157,7 @@ define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: s_cbranch_execz .LBB6_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc @@ -482,7 +1177,7 @@ define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -493,7 +1188,7 @@ define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc @@ -513,7 +1208,7 @@ define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -525,7 +1220,7 @@ define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -550,7 +1245,7 @@ define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -570,77 +1265,291 @@ end: ret <18 x i32> %phi } -define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i32_to_v9f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; 
%bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <18 x i32> @bitcast_v9i64_to_v18i32_scalar(<9 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i64_to_v18i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 
v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 ; -; VI-LABEL: bitcast_v18i32_to_v9f64: +; VI-LABEL: bitcast_v9i64_to_v18i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: 
s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 ; -; GFX9-LABEL: bitcast_v18i32_to_v9f64: +; GFX9-LABEL: bitcast_v9i64_to_v18i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; 
GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v9i64_to_v18i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; 
GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s28, s28, 3 +; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <18 x i32> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> %a to <18 x i32> + br label %end + +end: + %phi = phi <18 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i32> %phi +} + +define <9 x double> 
@bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v18i32_to_v9f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v18i32_to_v9f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; 
VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v18i32_to_v9f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 ; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 @@ -660,7 +1569,7 @@ define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -672,7 +1581,7 @@ define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 @@ -692,7 +1601,7 @@ define <9 x double> @bitcast_v18i32_to_v9f64(<18 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -712,28 +1621,242 @@ end: ret <9 x double> %phi } +define inreg <9 x double> @bitcast_v18i32_to_v9f64_scalar(<18 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i32_to_v9f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: 
.LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v18i32_to_v9f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v18i32_to_v9f64_scalar: 
+; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v18i32_to_v9f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: 
s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s29, s29, 3 +; GFX11-NEXT: s_add_i32 s28, s28, 3 +; GFX11-NEXT: s_add_i32 s27, s27, 3 +; GFX11-NEXT: s_add_i32 s26, s26, 3 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s23, s23, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i32> %a, splat (i32 3) + %a2 = bitcast <18 x i32> %a1 to <9 x double> + br label %end + +cmp.false: + %a3 = bitcast <18 x i32> %a to <9 x double> + br label %end + +end: + %phi = phi <9 x double> [ %a2, %cmp.true ], [ %a3, 
%cmp.false ] + ret <9 x double> %phi +} + define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f64_to_v18i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f64_to_v18i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v18i32: ; VI: ; %bb.0: @@ -742,7 +1865,7 @@ define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -753,7 +1876,7 @@ define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -764,7 +1887,7 @@ define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -775,7 +1898,7 @@ define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -787,7 +1910,7 @@ define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -798,7 +1921,7 @@ define <18 x i32> @bitcast_v9f64_to_v18i32(<9 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 
; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -818,184 +1941,432 @@ end: ret <18 x i32> %phi } +define inreg <18 x i32> @bitcast_v9f64_to_v18i32_scalar(<9 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f64_to_v18i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v9f64_to_v18i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v9f64_to_v18i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 
+; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v9f64_to_v18i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s47, s23 +; GFX11-NEXT: s_mov_b32 s46, s22 +; GFX11-NEXT: s_mov_b32 s45, s21 +; GFX11-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-NEXT: s_mov_b32 s44, s20 +; GFX11-NEXT: s_mov_b32 s43, s19 +; GFX11-NEXT: s_mov_b32 s42, s18 +; GFX11-NEXT: s_mov_b32 s41, s17 +; GFX11-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-NEXT: s_mov_b32 s40, s16 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s37, s1 +; GFX11-NEXT: s_mov_b32 s36, s0 +; GFX11-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v32, s48, 4 +; 
GFX11-NEXT: s_mov_b32 s48, s24 +; GFX11-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-NEXT: s_mov_b32 s49, s25 +; GFX11-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-NEXT: s_mov_b32 s50, s26 +; GFX11-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-NEXT: s_mov_b32 s51, s27 +; GFX11-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-NEXT: s_mov_b32 s52, s28 +; GFX11-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-NEXT: s_mov_b32 s53, s29 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[16:17], s[52:53], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], s[50:51], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[48:49], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[46:47], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[44:45], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[42:43], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[40:41], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[38:39], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[36:37], 1.0 +; GFX11-NEXT: s_branch .LBB11_5 +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 
v25, s61 +; GFX11-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-NEXT: .LBB11_5: ; %end +; GFX11-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <9 x double> %a1 to <18 x i32> + br label %end + +cmp.false: + %a3 = bitcast <9 x double> %a to <18 x i32> + br label %end + +end: + %phi = phi <18 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i32> %phi +} + define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i32_to_v36i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr26 
-; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: 
v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v1, v1, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v36 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v4, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v27 -; GCN-NEXT: 
v_add_i32_e32 v27, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v9, v9, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v10, v10, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v13, v13, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v14, v14, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v15, v15, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v16, v16, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v18, v18, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v2, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v23, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i32_to_v36i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; 
SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; 
SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i32_to_v36i16: ; VI: ; %bb.0: @@ -1021,7 +2392,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -1041,9 +2412,9 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 @@ 
-1081,7 +2452,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -1145,7 +2516,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -1165,9 +2536,9 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 ; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 @@ -1205,7 +2576,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -1236,7 +2607,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: 
s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 @@ -1256,7 +2627,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1285,7 +2656,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -1305,9 +2676,9 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 @@ -1345,7 +2716,7 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; 
GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 @@ -1384,269 +2755,1030 @@ end: ret <36 x i16> %phi } +define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i32_to_v36i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v4, s26 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s29, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s27, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s25, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s21, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s19, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s17, v9, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: 
s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v4, s26 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s29, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s27, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s25, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s21, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s19, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s17, v9, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 +; 
SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: 
s_lshl_b32 s5, s13, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: 
$vgpr8 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v18i32_to_v36i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v0 +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, 
s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s59, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s58, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s57, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s56, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s47, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s46, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s45, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s43, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 
0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s42, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s41, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s40, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s26, s15 +; VI-NEXT: s_and_b32 s26, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s14, s26, s14 +; VI-NEXT: s_or_b32 s9, s9, s13 +; VI-NEXT: s_or_b32 s8, s8, s12 +; VI-NEXT: s_or_b32 s6, s6, s11 +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v14, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; 
implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v18i32_to_v36i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 
3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s29, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; 
GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v13, s14 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v18i32_to_v36i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s18, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s19, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s18, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s19, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s18, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s19, s4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v18i32_to_v36i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s20, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s24, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s25, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s26, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s27, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s28, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s29, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i32> %a, splat (i32 3) + %a2 = bitcast <18 x i32> %a1 to <36 x i16> + br label %end + +cmp.false: + %a3 = bitcast <18 x i32> %a to <36 x i16> + br label %end + +end: + %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x i16> %phi +} + define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v36i16_to_v18i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; 
GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v40 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; 
GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v18 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: v_or_b32_e32 v8, v8, v44 -; GCN-NEXT: v_or_b32_e32 v9, v9, v45 -; GCN-NEXT: v_or_b32_e32 v10, v10, v46 -; GCN-NEXT: v_or_b32_e32 v11, v11, v47 -; GCN-NEXT: v_or_b32_e32 v12, v12, v56 -; GCN-NEXT: v_or_b32_e32 v13, v13, v57 -; GCN-NEXT: v_or_b32_e32 v14, v14, v58 -; GCN-NEXT: v_or_b32_e32 v15, v15, v59 -; GCN-NEXT: v_or_b32_e32 v16, v16, v60 -; GCN-NEXT: v_or_b32_e32 v17, v17, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; 
implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, 
v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v0, v40, v0 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v18, v4 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v18, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_or_b32_e32 v8, v44, v8 -; GCN-NEXT: v_or_b32_e32 v9, v45, v9 -; GCN-NEXT: v_or_b32_e32 v10, v46, v10 -; GCN-NEXT: v_or_b32_e32 v11, v47, v11 -; GCN-NEXT: v_or_b32_e32 v12, v56, v12 -; GCN-NEXT: v_or_b32_e32 v13, v57, v13 -; GCN-NEXT: v_or_b32_e32 v14, v58, v14 -; GCN-NEXT: v_or_b32_e32 v15, v59, v15 -; GCN-NEXT: v_or_b32_e32 v16, v60, v16 -; GCN-NEXT: v_or_b32_e32 v17, v61, v17 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: 
v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] 
+; SI-LABEL: bitcast_v36i16_to_v18i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 
offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v62 +; SI-NEXT: v_or_b32_e32 v6, v6, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; 
implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v46 +; SI-NEXT: v_or_b32_e32 v14, v14, v45 +; SI-NEXT: v_or_b32_e32 v15, v15, v44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; 
SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_or_b32_e32 v5, v62, v5 +; SI-NEXT: v_or_b32_e32 v6, v61, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v59, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v56, v11 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v17, v42, v17 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: 
v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v46, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v44, v15 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36i16_to_v18i32: ; VI: ; %bb.0: @@ -1675,7 +3807,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1732,9 +3864,9 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v41 @@ -1791,7 +3923,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v18, 3, v32 ; VI-NEXT: v_add_u16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: 
.LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -1861,7 +3993,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -1926,9 +4058,9 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -1975,7 +4107,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2004,7 +4136,7 @@ define <18 x 
i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2024,7 +4156,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2072,7 +4204,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2092,7 +4224,7 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2112,337 +4244,1115 @@ end: ret <18 x i32> %phi } +define inreg <18 x i32> 
@bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36i16_to_v18i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v20 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v34, v16 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v8 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v7, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v8, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: 
v_or_b32_e32 v9, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v16, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v17, v0, v51 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 
v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff 
+; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v36i16_to_v18i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, 
s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 
+; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: 
s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 
+; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 
v13, s6 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v36i16_to_v18i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 
s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 
3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v18i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_branch .LBB15_5 +; GFX11-TRUE16-NEXT: .LBB15_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB15_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 
4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v18i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_3: +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <36 x i16> %a, splat (i16 3) + %a2 = bitcast <36 x i16> %a1 to <18 x i32> + br label %end + +cmp.false: + %a3 = bitcast <36 x i16> %a to <18 x i32> + br label %end + +end: + %phi = phi <18 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i32> %phi +} + define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v18i32_to_v36f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; 
GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: 
v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 
v4, v42 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v53 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v51 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v49 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v39 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v37 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; 
GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v8, v7 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v10, v35, v37 -; GCN-NEXT: v_or_b32_e32 v11, v33, v49 -; GCN-NEXT: v_or_b32_e32 v13, v31, v52 -; GCN-NEXT: v_or_b32_e32 v14, v29, v50 -; GCN-NEXT: v_or_b32_e32 v16, v27, v48 -; GCN-NEXT: v_or_b32_e32 v17, v25, v38 -; GCN-NEXT: v_or_b32_e32 v24, v24, v36 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18i32_to_v36f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 
+; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; 
SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
+; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18i32_to_v36f16: ; VI: ; %bb.0: @@ -2468,7 +5378,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -2488,9 +5398,9 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x 
i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 +; VI-NEXT: s_cbranch_execz .LBB16_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 @@ -2528,7 +5438,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB8_4: ; %end +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -2592,7 +5502,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -2612,9 +5522,9 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 ; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 @@ -2652,7 +5562,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; 
GFX9-NEXT: .LBB8_4: ; %end +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -2683,7 +5593,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 @@ -2703,7 +5613,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2732,7 +5642,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -2752,9 +5662,9 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: 
s_cbranch_execz .LBB16_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 @@ -2792,7 +5702,7 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 @@ -2831,345 +5741,1213 @@ end: ret <36 x half> %phi } +define inreg <36 x half> @bitcast_v18i32_to_v36f16_scalar(<18 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18i32_to_v36f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: 
s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 
s5, s17, 16 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s20, 16 +; SI-NEXT: s_lshr_b32 s13, s21, 16 +; SI-NEXT: s_lshr_b32 s14, s22, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s24, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s26, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s28, 16 +; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_lshr_b32 s46, s8, 16 +; SI-NEXT: s_lshr_b32 s47, s7, 16 +; SI-NEXT: s_lshr_b32 s56, s6, 16 +; SI-NEXT: s_lshr_b32 s57, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: .LBB17_3: ; 
%end +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: 
v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 
+; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; 
SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v18i32_to_v36f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v0 +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; 
VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s59, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s58, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s57, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s56, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s47, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s46, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s45, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s43, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; 
VI-NEXT: s_lshl_b32 s24, s42, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s41, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s40, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s26, s15 +; VI-NEXT: s_and_b32 s26, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s14, s26, s14 +; VI-NEXT: s_or_b32 s9, s9, s13 +; VI-NEXT: s_or_b32 s8, s8, s12 +; VI-NEXT: s_or_b32 s6, s6, s11 +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v14, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; 
VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v18i32_to_v36f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 
s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s29, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 
v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v13, s14 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-TRUE16-LABEL: bitcast_v18i32_to_v36f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 
16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s18, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s19, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s18, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s19, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s18, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s19, s4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s7 :: 
v_dual_mov_b32 v15, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v18i32_to_v36f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 
16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s20, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s24, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s25, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s26, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s27, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s28, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s29, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i32> %a, splat (i32 3) + %a2 = bitcast <18 x i32> %a1 to <36 x half> + br label %end + +cmp.false: + %a3 = bitcast <18 x i32> %a to <36 x half> + br label %end + +end: + %phi = phi <36 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x half> %phi +} + define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v36f16_to_v18i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, 
s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v22 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v55, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v32, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; GCN-NEXT: v_or_b32_e32 v3, v60, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 
16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_or_b32_e32 v5, v44, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v40, v7 -; GCN-NEXT: v_or_b32_e32 v8, v54, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v51, v10 -; GCN-NEXT: v_or_b32_e32 v11, v48, v11 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v18, v12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v18, v14 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v18, v15 -; GCN-NEXT: v_or_b32_e32 v16, v37, v16 -; GCN-NEXT: v_or_b32_e32 v17, v36, v17 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; 
implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; 
GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v60 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v55 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v53 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v50 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v49 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v36 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; 
GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_or_b32_e32 v6, v9, v8 -; GCN-NEXT: v_or_b32_e32 v7, v11, v10 -; GCN-NEXT: v_or_b32_e32 v8, v13, v12 -; GCN-NEXT: v_or_b32_e32 v9, v15, v14 -; GCN-NEXT: v_or_b32_e32 v10, v17, v16 -; GCN-NEXT: v_or_b32_e32 v11, v19, v18 -; GCN-NEXT: v_or_b32_e32 v12, v21, v20 -; GCN-NEXT: v_or_b32_e32 v13, v23, v22 -; GCN-NEXT: v_or_b32_e32 v14, v25, v24 -; GCN-NEXT: v_or_b32_e32 v15, v27, v26 -; GCN-NEXT: v_or_b32_e32 v16, v29, v28 -; GCN-NEXT: v_or_b32_e32 v17, v31, v30 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword 
v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36f16_to_v18i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], 
s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; 
SI-NEXT: v_cvt_f16_f32_e32 v46, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; 
kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v44, v7 +; SI-NEXT: v_or_b32_e32 v8, v42, v8 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v52, v11 +; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: 
; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v17, v36, v17 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, 
v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: 
v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 
0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36f16_to_v18i32: ; VI: ; %bb.0: @@ -3198,7 +6976,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -3255,9 +7033,9 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v41, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3314,7 +7092,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -3384,7 +7162,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 
s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -3449,9 +7227,9 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB18_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -3499,7 +7277,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: .LBB18_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -3528,7 +7306,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -3548,7 +7326,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 
0x200, v15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3596,7 +7374,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -3616,7 +7394,7 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3636,37 +7414,912 @@ end: ret <18 x i32> %phi } +define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36f16_to_v18i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v33, 
s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v63, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v37, v11 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v13, v29, v13 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_or_b32_e32 v15, v25, v15 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: 
v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: 
v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, 
v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, 
v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v23 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v39 +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v25 +; SI-NEXT: v_mov_b32_e32 v32, v44 +; SI-NEXT: v_mov_b32_e32 v44, v49 +; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v53, v30 +; SI-NEXT: v_mov_b32_e32 v54, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v24, v39 +; SI-NEXT: v_mov_b32_e32 v39, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v37 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v19, v36 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v54 +; SI-NEXT: v_mov_b32_e32 v30, v53 +; SI-NEXT: v_mov_b32_e32 v29, v52 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v49, v44 +; SI-NEXT: v_mov_b32_e32 v44, v32 +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: v_mov_b32_e32 v48, v43 +; SI-NEXT: s_branch .LBB19_2 +; +; 
VI-LABEL: bitcast_v36f16_to_v18i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; 
VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: 
v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; 
VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v35, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v16, v18, v16 +; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v36f16_to_v18i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; 
%bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v18i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_branch .LBB19_5 +; GFX11-TRUE16-NEXT: .LBB19_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB19_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: 
s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v18i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: 
s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_3: +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 
v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <36 x half> %a, splat (half 0xH0200) + %a2 = bitcast <36 x half> %a1 to <18 x i32> + br label %end + +cmp.false: + %a3 = bitcast <36 x half> %a to <18 x i32> + br label %end + +end: + %phi = phi <18 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i32> %phi +} + define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f32_to_v9i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: 
v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f32_to_v9i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f32_to_v9i64: ; VI: ; %bb.0: @@ -3675,7 +8328,7 @@ define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { ; VI-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -3695,7 +8348,7 @@ define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3706,7 +8359,7 @@ define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -3726,7 +8379,7 @@ define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3738,7 +8391,7 @@ define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 @@ -3749,7 +8402,7 @@ define <9 x i64> @bitcast_v18f32_to_v9i64(<18 x float> %a, i32 %b) { ; GFX11-NEXT: 
v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3769,37 +8422,304 @@ end: ret <9 x i64> %phi } +define inreg <9 x i64> @bitcast_v18f32_to_v9i64_scalar(<18 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f32_to_v9i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 
v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v18f32_to_v9i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 
1.0, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v18f32_to_v9i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; 
GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v18f32_to_v9i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s47, s23 +; GFX11-NEXT: s_mov_b32 s46, s22 +; GFX11-NEXT: s_mov_b32 s45, s21 +; GFX11-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-NEXT: s_mov_b32 s44, s20 +; GFX11-NEXT: s_mov_b32 s43, s19 +; GFX11-NEXT: s_mov_b32 s42, s18 +; GFX11-NEXT: s_mov_b32 s41, s17 +; GFX11-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-NEXT: s_mov_b32 s40, s16 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s37, s1 +; GFX11-NEXT: s_mov_b32 s36, s0 +; GFX11-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-NEXT: s_mov_b32 s48, s24 +; GFX11-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-NEXT: s_mov_b32 s49, s25 +; GFX11-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-NEXT: s_mov_b32 s50, s26 +; GFX11-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-NEXT: s_mov_b32 s51, s27 +; GFX11-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-NEXT: s_mov_b32 s52, s28 +; GFX11-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-NEXT: s_mov_b32 s53, s29 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v17, s53, 1.0 +; GFX11-NEXT: v_add_f32_e64 v16, s52, 1.0 +; GFX11-NEXT: v_add_f32_e64 v15, s51, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s50, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s49, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s48, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s47, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s46, 
1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s45, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s44, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s43, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s42, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s41, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s40, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s39, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s38, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s37, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s36, 1.0 +; GFX11-NEXT: s_branch .LBB21_5 +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: s_branch .LBB21_2 +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-NEXT: .LBB21_5: ; %end +; GFX11-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-NEXT: v_readlane_b32 s37, v32, 1 +; 
GFX11-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <18 x float> %a1 to <9 x i64> + br label %end + +cmp.false: + %a3 = bitcast <18 x float> %a to <9 x i64> + br label %end + +end: + %phi = phi <9 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i64> %phi +} + define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i64_to_v18f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; 
SI-LABEL: bitcast_v9i64_to_v18f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i64_to_v18f32: ; VI: ; %bb.0: @@ -3808,7 +8728,7 @@ define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc @@ -3828,7 +8748,7 @@ define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: 
.LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3839,7 +8759,7 @@ define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc @@ -3859,7 +8779,7 @@ define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3871,7 +8791,7 @@ define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -3896,67 +8816,453 @@ define <18 x float> @bitcast_v9i64_to_v18f32(<9 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <18 x float> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> 
%a to <18 x float> + br label %end + +end: + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + +define inreg <18 x float> @bitcast_v9i64_to_v18f32_scalar(<9 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i64_to_v18f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v9i64_to_v18f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: s_setpc_b64 
s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v9i64_to_v18f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, 
vcc +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v9i64_to_v18f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s28, s28, 3 +; GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: s_branch .LBB23_2 + %cmp 
= icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <18 x float> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> %a to <18 x float> + br label %end + +end: + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + +define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v18f32_to_v9f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v18f32_to_v9f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: 
v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v18f32_to_v9f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_v18f32_to_v9f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB24_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <9 x i64> %a, splat (i64 3) - %a2 = bitcast <9 x i64> %a1 to <18 x float> + %a1 = fadd <18 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <18 x float> %a1 to <9 x double> br label %end cmp.false: - %a3 = bitcast <9 x i64> %a to <18 x float> + %a3 = bitcast <18 x float> %a to <9 x double> br label %end end: - %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <18 x float> %phi + %phi = phi <9 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x double> %phi } -define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f32_to_v9f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <9 x double> @bitcast_v18f32_to_v9f64_scalar(<18 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f32_to_v9f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; 
SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 ; -; VI-LABEL: bitcast_v18f32_to_v9f64: +; VI-LABEL: bitcast_v18f32_to_v9f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], 
vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -3975,19 +9281,39 @@ define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB25_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 ; -; GFX9-LABEL: bitcast_v18f32_to_v9f64: +; GFX9-LABEL: bitcast_v18f32_to_v9f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; 
GFX9-NEXT: .LBB25_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 ; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 @@ -4006,31 +9332,106 @@ define <9 x double> @bitcast_v18f32_to_v9f64(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB25_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 ; -; GFX11-LABEL: bitcast_v18f32_to_v9f64: +; GFX11-LABEL: bitcast_v18f32_to_v9f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; 
GFX11-NEXT: s_mov_b32 s47, s23 +; GFX11-NEXT: s_mov_b32 s46, s22 +; GFX11-NEXT: s_mov_b32 s45, s21 +; GFX11-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-NEXT: s_mov_b32 s44, s20 +; GFX11-NEXT: s_mov_b32 s43, s19 +; GFX11-NEXT: s_mov_b32 s42, s18 +; GFX11-NEXT: s_mov_b32 s41, s17 +; GFX11-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-NEXT: s_mov_b32 s40, s16 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s37, s1 +; GFX11-NEXT: s_mov_b32 s36, s0 +; GFX11-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-NEXT: s_mov_b32 s48, s24 +; GFX11-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-NEXT: s_mov_b32 s49, s25 +; GFX11-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-NEXT: s_mov_b32 s50, s26 +; GFX11-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-NEXT: s_mov_b32 s51, s27 +; GFX11-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-NEXT: s_mov_b32 s52, s28 +; GFX11-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-NEXT: s_mov_b32 s53, s29 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v17, s53, 1.0 +; GFX11-NEXT: v_add_f32_e64 v16, s52, 1.0 +; GFX11-NEXT: v_add_f32_e64 v15, s51, 1.0 +; GFX11-NEXT: v_add_f32_e64 v14, s50, 1.0 +; GFX11-NEXT: v_add_f32_e64 v13, s49, 1.0 +; GFX11-NEXT: v_add_f32_e64 v12, s48, 1.0 +; GFX11-NEXT: v_add_f32_e64 v11, s47, 1.0 +; GFX11-NEXT: v_add_f32_e64 v10, s46, 1.0 +; GFX11-NEXT: v_add_f32_e64 v9, s45, 1.0 +; GFX11-NEXT: v_add_f32_e64 v8, s44, 1.0 +; GFX11-NEXT: v_add_f32_e64 v7, s43, 1.0 +; GFX11-NEXT: v_add_f32_e64 v6, s42, 1.0 +; GFX11-NEXT: v_add_f32_e64 v5, s41, 1.0 +; GFX11-NEXT: v_add_f32_e64 v4, s40, 1.0 +; GFX11-NEXT: v_add_f32_e64 v3, s39, 1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s38, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s37, 1.0 +; 
GFX11-NEXT: v_add_f32_e64 v0, s36, 1.0 +; GFX11-NEXT: s_branch .LBB25_5 +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-NEXT: .LBB25_5: ; %end +; GFX11-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4050,27 +9451,27 @@ end: } define <18 x float> 
@bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f64_to_v18f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f64_to_v18f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v18f32: ; VI: ; %bb.0: @@ -4079,7 +9480,7 @@ define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -4090,7 +9491,7 @@ define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4101,7 +9502,7 @@ define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -4112,7 +9513,7 @@ define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4124,7 +9525,7 @@ define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -4135,7 +9536,7 @@ define <18 x float> @bitcast_v9f64_to_v18f32(<9 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], 
v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4155,184 +9556,432 @@ end: ret <18 x float> %phi } +define inreg <18 x float> @bitcast_v9f64_to_v18f32_scalar(<9 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f64_to_v18f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v9f64_to_v18f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: 
v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v9f64_to_v18f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, 
s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v9f64_to_v18f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s47, s23 +; GFX11-NEXT: s_mov_b32 s46, s22 +; GFX11-NEXT: s_mov_b32 s45, s21 +; GFX11-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-NEXT: s_mov_b32 s44, s20 +; GFX11-NEXT: s_mov_b32 s43, s19 +; GFX11-NEXT: s_mov_b32 s42, s18 +; GFX11-NEXT: s_mov_b32 s41, s17 +; GFX11-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-NEXT: s_mov_b32 s40, s16 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s37, s1 +; GFX11-NEXT: s_mov_b32 s36, s0 +; GFX11-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-NEXT: s_mov_b32 s48, s24 +; 
GFX11-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-NEXT: s_mov_b32 s49, s25 +; GFX11-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-NEXT: s_mov_b32 s50, s26 +; GFX11-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-NEXT: s_mov_b32 s51, s27 +; GFX11-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-NEXT: s_mov_b32 s52, s28 +; GFX11-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-NEXT: s_mov_b32 s53, s29 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[16:17], s[52:53], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], s[50:51], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[48:49], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[46:47], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[44:45], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[42:43], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[40:41], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[38:39], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], s[36:37], 1.0 +; GFX11-NEXT: s_branch .LBB27_5 +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-NEXT: 
v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-NEXT: .LBB27_5: ; %end +; GFX11-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <9 x double> %a1 to <18 x float> + br label %end + +cmp.false: + %a3 = bitcast <9 x double> %a to <18 x float> + br label %end + +end: + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f32_to_v36i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; 
implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_alignbit_b32 v19, 
v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v1, v1, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v36 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v4, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 
0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v9, v9, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v10, v10, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v13, v13, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v14, v14, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v15, v15, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v16, v16, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v18, v18, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword 
v3, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v23, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f32_to_v36i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, 
v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v27, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: 
v_lshrrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 
24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f32_to_v36i16: ; VI: ; %bb.0: @@ -4358,7 +10007,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -4378,9 +10027,9 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB28_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -4418,7 +10067,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: 
v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -4482,7 +10131,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -4502,9 +10151,9 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -4542,7 +10191,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -4573,7 +10222,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; 
GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 @@ -4584,7 +10233,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4613,7 +10262,7 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -4633,9 +10282,9 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 @@ -4649,42 +10298,852 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v19, 16, v16 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <18 x float> %a1 to <36 x i16> + br label %end + +cmp.false: + %a3 = bitcast <18 x float> %a to <36 x i16> + br label %end + +end: + %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x i16> %phi +} + +define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f32_to_v36i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v7, s27 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v5, s29 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v17, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v7, v8, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v33, v18, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 
+; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_alignbit_b32 v17, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v21, v5, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v7, v8, 16 +; SI-NEXT: v_alignbit_b32 v25, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v27, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v29, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v31, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v33, v18, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 
16, v33 +; SI-NEXT: v_or_b32_e32 v19, v19, v33 +; SI-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v18f32_to_v36i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v17, s17 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v15, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: 
v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; 
VI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; VI-NEXT: v_or_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; VI-NEXT: v_or_b32_sdwa v21, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, 
v35 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; VI-NEXT: v_or_b32_sdwa v18, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v19, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v20 +; VI-NEXT: v_mov_b32_e32 v1, v21 +; VI-NEXT: v_mov_b32_e32 v2, v18 +; VI-NEXT: v_mov_b32_e32 v3, v19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v18f32_to_v36i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; 
GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v17, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v14, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; 
GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 
0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v16, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v5, v34, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v17, v22, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v21 +; GFX9-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, s28, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, s26, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, s25, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, s23, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s21, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s20, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s18, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, s16, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s0, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-TRUE16-NEXT: s_branch .LBB29_5 +; GFX11-TRUE16-NEXT: .LBB29_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s5 +; GFX11-TRUE16-NEXT: .LBB29_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v18f32_to_v36i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s28, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s7, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s28, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, s26, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, s25, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s23, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, s21, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, s20, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s18, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s16, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s0, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; 
GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: s_branch .LBB29_5 +; GFX11-FAKE16-NEXT: .LBB29_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v3, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v11, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v9, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v17, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v13, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s46 :: 
v_dual_mov_b32 v34, s45 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5 +; GFX11-FAKE16-NEXT: .LBB29_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, 
v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4704,268 +11163,267 @@ end: } define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v36i16_to_v18f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; GCN-NEXT: 
v_lshlrev_b32_e32 v46, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v40 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 
v17, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v18 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: v_or_b32_e32 v8, v8, v44 -; GCN-NEXT: v_or_b32_e32 v9, v9, v45 -; GCN-NEXT: v_or_b32_e32 v10, v10, v46 -; GCN-NEXT: v_or_b32_e32 v11, v11, v47 -; GCN-NEXT: v_or_b32_e32 v12, v12, v56 -; GCN-NEXT: v_or_b32_e32 v13, v13, v57 -; GCN-NEXT: v_or_b32_e32 v14, v14, v58 -; GCN-NEXT: v_or_b32_e32 v15, v15, v59 -; GCN-NEXT: v_or_b32_e32 v16, v16, v60 -; GCN-NEXT: v_or_b32_e32 v17, v17, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; 
implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: 
v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v0, v40, v0 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v18, v4 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v18, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_or_b32_e32 v8, v44, v8 -; GCN-NEXT: v_or_b32_e32 v9, v45, v9 -; GCN-NEXT: v_or_b32_e32 v10, v46, v10 -; GCN-NEXT: v_or_b32_e32 v11, v47, v11 -; GCN-NEXT: v_or_b32_e32 v12, v56, v12 -; GCN-NEXT: v_or_b32_e32 v13, v57, v13 -; GCN-NEXT: v_or_b32_e32 v14, v58, v14 -; GCN-NEXT: v_or_b32_e32 v15, v59, v15 -; GCN-NEXT: v_or_b32_e32 v16, v60, v16 -; GCN-NEXT: v_or_b32_e32 v17, v61, v17 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: 
v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36i16_to_v18f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 
+; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v62 +; SI-NEXT: v_or_b32_e32 v6, v6, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: 
$vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v46 +; SI-NEXT: v_or_b32_e32 v14, v14, v45 +; SI-NEXT: v_or_b32_e32 v15, v15, v44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, 
v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_or_b32_e32 v5, v62, v5 +; SI-NEXT: v_or_b32_e32 v6, v61, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v59, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v56, v11 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v17, v42, v17 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; 
SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v46, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v44, v15 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36i16_to_v18f32: ; VI: ; %bb.0: @@ -4994,7 +11452,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -5051,9 +11509,9 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v41 @@ -5110,7 +11568,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v18, 3, v32 ; VI-NEXT: v_add_u16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB15_4: ; %end +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5180,7 +11638,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -5245,9 +11703,9 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: .LBB30_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -5294,7 +11752,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end +; GFX9-NEXT: .LBB30_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5323,7 +11781,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -5343,7 +11801,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5391,7 +11849,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -5411,7 +11869,7 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5431,337 +11889,1115 @@ end: ret <18 x float> %phi } +define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36i16_to_v18f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 
offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v20 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v34, v16 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v8 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v7, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v8, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; 
SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v16, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v17, v0, v51 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; 
SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v36i16_to_v18f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 
s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 
0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: 
s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 
+; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch 
.LBB31_2 +; +; GFX9-LABEL: bitcast_v36i16_to_v18f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v18f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; 
GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_branch .LBB31_5 +; GFX11-TRUE16-NEXT: .LBB31_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 
v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB31_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v18f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; 
GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_3: +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <36 x i16> %a, splat (i16 3) + %a2 = bitcast <36 x i16> %a1 to <18 x float> + br label %end + +cmp.false: + %a3 = bitcast <36 x i16> %a to <18 x float> + br label %end + +end: + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v18f32_to_v36f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr49 -; 
GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; GCN-NEXT: 
s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: 
$vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v42 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v53 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v51 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v49 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v13, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v39 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v37 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 
16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v8, v7 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v10, v35, v37 -; GCN-NEXT: v_or_b32_e32 v11, v33, v49 -; GCN-NEXT: v_or_b32_e32 v13, v31, v52 -; GCN-NEXT: v_or_b32_e32 v14, v29, v50 -; GCN-NEXT: v_or_b32_e32 v16, v27, v48 -; GCN-NEXT: v_or_b32_e32 v17, v25, v38 -; GCN-NEXT: v_or_b32_e32 v24, v24, v36 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v54, s[0:3], 0 
offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v18f32_to_v36f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; 
SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 
v39, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; 
implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 
v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, 
v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v18f32_to_v36f16: ; VI: ; %bb.0: @@ -5787,7 +13023,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -5807,9 +13043,9 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB32_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -5847,7 +13083,7 @@ define <36 x half> 
@bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -5911,7 +13147,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -5931,9 +13167,9 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow +; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cbranch_execz .LBB32_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 ; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -5971,7 +13207,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end +; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -6002,7 +13238,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz 
.LBB16_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 @@ -6013,7 +13249,7 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6042,78 +13278,986 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end +; 
GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <18 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <18 x float> %a1 to <36 x half> + br label %end + +cmp.false: + %a3 = bitcast <18 x float> %a to <36 x half> + br label %end + +end: + %phi = phi <36 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x half> %phi +} + +define inreg <36 x half> @bitcast_v18f32_to_v36f16_scalar(<18 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v18f32_to_v36f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_and_b64 s[4:5], vcc, 
exec +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; 
SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, 
v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: 
v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v18f32_to_v36f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v17, s17 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v15, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; 
VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 
16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; VI-NEXT: 
v_or_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; VI-NEXT: v_or_b32_sdwa v21, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; VI-NEXT: v_or_b32_sdwa v18, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v19 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v19, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v20 +; VI-NEXT: v_mov_b32_e32 v1, v21 +; VI-NEXT: v_mov_b32_e32 v2, v18 +; VI-NEXT: v_mov_b32_e32 v3, v19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; 
implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v18f32_to_v36f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v17, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v15, s19 +; GFX9-NEXT: v_mov_b32_e32 v14, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: 
v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v24, 
16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v16, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v5, v34, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v17, v22, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v21 +; GFX9-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: 
$vgpr22 +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v14, s28, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v16, s26, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v17, s25, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, s23, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v11, s21, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v12, s20, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s18, 1.0 +; 
GFX11-TRUE16-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, s16, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, s3, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, s0, 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-TRUE16-NEXT: s_branch .LBB33_5 +; GFX11-TRUE16-NEXT: .LBB33_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s5 +; GFX11-TRUE16-NEXT: 
.LBB33_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
bitcast_v18f32_to_v36f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-FAKE16-NEXT: 
v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s29, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s28, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v15, s27, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, s26, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, s25, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s24, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s23, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, s22, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, s21, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, s20, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, s19, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, s18, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, s17, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, s16, 1.0 +; 
GFX11-FAKE16-NEXT: v_add_f32_e64 v7, s3, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, s0, 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 
0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: s_branch .LBB33_5 +; GFX11-FAKE16-NEXT: .LBB33_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s0 :: 
v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v3, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v11, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v9, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v17, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v13, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s46 :: v_dual_mov_b32 v34, s45 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s44 :: v_dual_mov_b32 v32, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s42 :: v_dual_mov_b32 v30, s41 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, s40 :: v_dual_mov_b32 v28, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s14 :: v_dual_mov_b32 v26, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s12 :: v_dual_mov_b32 v24, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s10 :: v_dual_mov_b32 v22, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v20, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s5 +; GFX11-FAKE16-NEXT: .LBB33_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 
v6, v29, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6133,344 +14277,361 @@ end: } define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v36f16_to_v18f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, 
v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v32, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; GCN-NEXT: v_or_b32_e32 v3, v60, v3 -; 
GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_or_b32_e32 v5, v44, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v40, v7 -; GCN-NEXT: v_or_b32_e32 v8, v54, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v51, v10 -; GCN-NEXT: v_or_b32_e32 v11, v48, v11 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v18, v12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v18, v14 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v18, v15 -; GCN-NEXT: v_or_b32_e32 v16, v37, v16 -; GCN-NEXT: v_or_b32_e32 v17, v36, v17 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 
-; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v63 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v3, v62 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v60 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v55 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v53 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v50 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v49 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v36 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_or_b32_e32 v6, v9, v8 -; GCN-NEXT: v_or_b32_e32 v7, v11, v10 -; GCN-NEXT: v_or_b32_e32 v8, v13, v12 -; GCN-NEXT: v_or_b32_e32 v9, v15, v14 -; GCN-NEXT: v_or_b32_e32 v10, v17, v16 -; GCN-NEXT: v_or_b32_e32 v11, v19, v18 -; GCN-NEXT: v_or_b32_e32 v12, v21, v20 -; GCN-NEXT: v_or_b32_e32 v13, v23, v22 -; GCN-NEXT: v_or_b32_e32 v14, v25, v24 -; GCN-NEXT: v_or_b32_e32 v15, v27, v26 -; GCN-NEXT: v_or_b32_e32 v16, v29, v28 -; GCN-NEXT: v_or_b32_e32 v17, v31, v30 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 
offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36f16_to_v18f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: ; 
implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v44, v7 +; SI-NEXT: v_or_b32_e32 v8, v42, v8 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v52, v11 +; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; 
implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v17, v36, v17 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: 
v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 
v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: 
v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36f16_to_v18f32: ; VI: ; %bb.0: @@ 
-6499,7 +14660,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -6556,9 +14717,9 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow +; VI-NEXT: .LBB34_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 +; VI-NEXT: s_cbranch_execz .LBB34_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v41, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -6615,7 +14776,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -6685,7 +14846,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -6750,9 +14911,9 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -6800,7 +14961,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -6829,7 +14990,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; 
%cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -6849,7 +15010,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6897,7 +15058,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -6917,7 +15078,7 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6933,51 +15094,1112 @@ cmp.false: br label %end end: - %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <18 x float> %phi + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + +define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36f16_to_v18f32_scalar: +; SI: 
; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: 
v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v63, v3 +; SI-NEXT: v_or_b32_e32 
v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v37, v11 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v13, v29, v13 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_or_b32_e32 v15, v25, v15 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; 
SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: 
v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 
v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: 
buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v23 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v39 +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v25 +; SI-NEXT: v_mov_b32_e32 v32, v44 +; SI-NEXT: v_mov_b32_e32 v44, v49 +; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v53, v30 +; SI-NEXT: v_mov_b32_e32 v54, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v24, v39 +; SI-NEXT: v_mov_b32_e32 v39, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v37 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v19, v36 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 ; 
4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v54 +; SI-NEXT: v_mov_b32_e32 v30, v53 +; SI-NEXT: v_mov_b32_e32 v29, v52 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v49, v44 +; SI-NEXT: v_mov_b32_e32 v44, v32 +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: v_mov_b32_e32 v48, v43 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v36f16_to_v18f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; 
VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 
v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v35, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 
0x200, v33 +; VI-NEXT: v_or_b32_e32 v16, v18, v16 +; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v36f16_to_v18f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v18f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 
s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_branch .LBB35_5 +; GFX11-TRUE16-NEXT: 
.LBB35_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB35_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; 
GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v18f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: 
s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 
0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_3: +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <36 x half> %a, splat (half 0xH0200) + %a2 = bitcast <36 x half> %a1 to <18 x float> + br label %end + +cmp.false: + %a3 = bitcast <36 x half> %a to <18 x float> + br label %end + +end: + %phi = phi <18 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x float> %phi +} + +define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v9i64_to_v9f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v9i64_to_v9f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: 
v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v9i64_to_v9f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v9i64_to_v9f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 +; 
GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <9 x double> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> %a to <9 x double> + br label %end + +end: + %phi = phi <9 x double> [ %a2, %cmp.true 
], [ %a3, %cmp.false ] + ret <9 x double> %phi } -define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i64_to_v9f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <9 x double> @bitcast_v9i64_to_v9f64_scalar(<9 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i64_to_v9f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; 
SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 ; -; VI-LABEL: bitcast_v9i64_to_v9f64: +; VI-LABEL: bitcast_v9i64_to_v9f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: 
v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ -6996,19 +16218,39 @@ define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: .LBB18_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB37_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 ; -; GFX9-LABEL: bitcast_v9i64_to_v9f64: +; GFX9-LABEL: bitcast_v9i64_to_v9f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, 
s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 @@ -7027,46 +16269,53 @@ define <9 x double> @bitcast_v9i64_to_v9f64(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: .LBB18_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB37_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 ; -; GFX11-LABEL: bitcast_v9i64_to_v9f64: +; GFX11-LABEL: bitcast_v9i64_to_v9f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_3 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_add_u32 s2, s2, 3 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_add_u32 s16, s16, 3 +; GFX11-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-NEXT: s_add_u32 s18, s18, 3 +; GFX11-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-NEXT: s_add_u32 s20, s20, 3 +; GFX11-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-NEXT: s_add_u32 s22, s22, 3 +; GFX11-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-NEXT: s_add_u32 s24, s24, 3 +; GFX11-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-NEXT: s_add_u32 s26, s26, 3 +; GFX11-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-NEXT: s_add_u32 s28, s28, 3 +; 
GFX11-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-NEXT: .LBB37_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: s_branch .LBB37_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7085,27 +16334,27 @@ end: } define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f64_to_v9i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f64_to_v9i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v9i64: ; VI: ; %bb.0: @@ -7114,7 +16363,7 @@ define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -7125,7 +16374,7 @@ define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7136,7 +16385,7 @@ define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -7147,7 +16396,7 @@ define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[12:13], 
v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7159,7 +16408,7 @@ define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -7170,7 +16419,7 @@ define <9 x i64> @bitcast_v9f64_to_v9i64(<9 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7190,184 +16439,432 @@ end: ret <9 x i64> %phi } +define inreg <9 x i64> @bitcast_v9f64_to_v9i64_scalar(<9 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f64_to_v9i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; 
SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v9f64_to_v9i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v4 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], 
v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v9f64_to_v9i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v9f64_to_v9i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-NEXT: scratch_store_b32 
off, v32, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_mov_b32 s47, s23 +; GFX11-NEXT: s_mov_b32 s46, s22 +; GFX11-NEXT: s_mov_b32 s45, s21 +; GFX11-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-NEXT: s_mov_b32 s44, s20 +; GFX11-NEXT: s_mov_b32 s43, s19 +; GFX11-NEXT: s_mov_b32 s42, s18 +; GFX11-NEXT: s_mov_b32 s41, s17 +; GFX11-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-NEXT: s_mov_b32 s40, s16 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s37, s1 +; GFX11-NEXT: s_mov_b32 s36, s0 +; GFX11-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-NEXT: s_mov_b32 s48, s24 +; GFX11-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-NEXT: s_mov_b32 s49, s25 +; GFX11-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-NEXT: s_mov_b32 s50, s26 +; GFX11-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-NEXT: s_mov_b32 s51, s27 +; GFX11-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-NEXT: s_mov_b32 s52, s28 +; GFX11-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-NEXT: s_mov_b32 s53, s29 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[36:37], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], s[38:39], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], s[40:41], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], s[42:43], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], s[44:45], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], s[46:47], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[48:49], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], s[50:51], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], s[52:53], 1.0 +; GFX11-NEXT: s_branch .LBB39_5 +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: 
v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-NEXT: .LBB39_5: ; %end +; GFX11-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <9 x double> %a1 to <9 x i64> + br label %end + +cmp.false: + %a3 = bitcast <9 x double> %a to <9 x i64> + br label %end + +end: + %phi = 
phi <9 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i64> %phi +} + define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i64_to_v36i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v24, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v24, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v1, v1, v33 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v36 -; GCN-NEXT: 
v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v3, v3, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v4, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v7, v7, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v10, v10, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v13, v13, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: 
v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v15, v15, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v16, v16, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v18, v18, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v23, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i64_to_v36i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, 
vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; 
SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i64_to_v36i16: ; VI: ; %bb.0: @@ 
-7393,7 +16890,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -7413,9 +16910,9 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow +; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 +; VI-NEXT: s_cbranch_execz .LBB40_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc @@ -7453,7 +16950,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -7517,7 +17014,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -7537,9 +17034,9 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow +; GFX9-NEXT: .LBB40_2: ; 
%Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 +; GFX9-NEXT: s_cbranch_execz .LBB40_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc @@ -7577,7 +17074,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -7608,7 +17105,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -7633,7 +17130,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7662,7 +17159,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; 
%cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -7682,9 +17179,9 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -7727,7 +17224,7 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 @@ -7766,269 +17263,1030 @@ end: ret <36 x i16> %phi } +define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i64_to_v36i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v4, s26 +; 
SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s29, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s27, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s25, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s21, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s19, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s17, v9, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v4, s26 +; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s20 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s29, v3, 16 +; 
SI-NEXT: v_alignbit_b32 v4, s27, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s25, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s23, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s21, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s19, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s17, v9, 16 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: s_lshr_b32 s13, s27, 16 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: s_lshr_b32 s15, s23, 16 +; SI-NEXT: s_lshr_b32 s40, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_add_i32_e32 v7, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: 
s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s11, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v9i64_to_v36i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v0 +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, 
v3 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 
s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s59, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s58, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s57, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s56, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s47, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s46, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s45, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s43, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s42, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s41, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s40, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s26, s15 +; VI-NEXT: s_and_b32 s26, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s14, s26, s14 +; VI-NEXT: s_or_b32 s9, s9, s13 +; VI-NEXT: s_or_b32 s8, s8, s12 +; VI-NEXT: s_or_b32 s6, s6, 
s11 +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v14, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v9i64_to_v36i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: 
s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s59 
+; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s29, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v13, s14 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; 
implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-TRUE16-LABEL: bitcast_v9i64_to_v36i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 
+; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s18, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s19, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s18, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s27 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s19, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s18, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s19, s4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v9i64_to_v36i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 
+; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s20, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s24, s9 
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s25, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s26, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s27, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s28, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s29, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <36 
x i16> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> %a to <36 x i16> + br label %end + +end: + %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x i16> %phi +} + define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v36i16_to_v9i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; 
GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v40 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v18 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: v_or_b32_e32 v8, 
v8, v44 -; GCN-NEXT: v_or_b32_e32 v9, v9, v45 -; GCN-NEXT: v_or_b32_e32 v10, v10, v46 -; GCN-NEXT: v_or_b32_e32 v11, v11, v47 -; GCN-NEXT: v_or_b32_e32 v12, v12, v56 -; GCN-NEXT: v_or_b32_e32 v13, v13, v57 -; GCN-NEXT: v_or_b32_e32 v14, v14, v58 -; GCN-NEXT: v_or_b32_e32 v15, v15, v59 -; GCN-NEXT: v_or_b32_e32 v16, v16, v60 -; GCN-NEXT: v_or_b32_e32 v17, v17, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; 
GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v0, v40, v0 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, 
v32, v3 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v18, v4 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v18, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_or_b32_e32 v8, v44, v8 -; GCN-NEXT: v_or_b32_e32 v9, v45, v9 -; GCN-NEXT: v_or_b32_e32 v10, v46, v10 -; GCN-NEXT: v_or_b32_e32 v11, v47, v11 -; GCN-NEXT: v_or_b32_e32 v12, v56, v12 -; GCN-NEXT: v_or_b32_e32 v13, v57, v13 -; GCN-NEXT: v_or_b32_e32 v14, v58, v14 -; GCN-NEXT: v_or_b32_e32 v15, v59, v15 -; GCN-NEXT: v_or_b32_e32 v16, v60, v16 -; GCN-NEXT: v_or_b32_e32 v17, v61, v17 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36i16_to_v9i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: 
v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: 
v_and_b32_e32 v11, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v62 +; SI-NEXT: v_or_b32_e32 v6, v6, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 
0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v46 +; SI-NEXT: v_or_b32_e32 v14, v14, v45 +; SI-NEXT: v_or_b32_e32 v15, v15, v44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: 
v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_or_b32_e32 v5, v62, v5 +; SI-NEXT: v_or_b32_e32 v6, v61, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v59, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v56, v11 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v17, v42, v17 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v46, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v44, v15 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; 
SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36i16_to_v9i64: ; VI: ; %bb.0: @@ -8057,7 +18315,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -8114,9 +18372,9 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: .LBB42_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v41 @@ -8173,7 +18431,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v18, 3, v32 ; VI-NEXT: v_add_u16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -8243,7 +18501,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: 
s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -8308,9 +18566,9 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: .LBB42_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -8357,7 +18615,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -8386,7 +18644,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -8406,7 +18664,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] 
; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8454,7 +18712,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -8474,7 +18732,7 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8494,337 +18752,1115 @@ end: ret <9 x i64> %phi } +define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36i16_to_v9i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded 
Spill +; SI-NEXT: v_mov_b32_e32 v32, v20 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v34, v16 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v8 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v7, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v8, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, 
s25, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v16, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v17, v0, v51 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v17, 
vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v36i16_to_v9i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 
.LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; 
VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v36i16_to_v9i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 
16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; 
GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v9i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: 
v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_branch .LBB43_5 +; GFX11-TRUE16-NEXT: .LBB43_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, 
s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB43_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v9i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, 
s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_3: +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <36 x i16> %a, splat (i16 3) + %a2 = bitcast <36 x i16> %a1 to <9 x i64> + br label %end + +cmp.false: + %a3 = bitcast <36 x i16> %a to <9 x i64> + br label %end + +end: + %phi = phi <9 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i64> %phi +} + define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v9i64_to_v36f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; 
implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; 
GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v42 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v53 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v51 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v49 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v41 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v14, v39 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v37 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 
v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v8, v7 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v10, v35, v37 -; GCN-NEXT: v_or_b32_e32 v11, v33, v49 -; GCN-NEXT: v_or_b32_e32 v13, v31, v52 -; GCN-NEXT: v_or_b32_e32 v14, v29, v50 -; GCN-NEXT: v_or_b32_e32 v16, v27, v48 -; GCN-NEXT: v_or_b32_e32 v17, v25, v38 -; GCN-NEXT: v_or_b32_e32 v24, v24, v36 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword 
v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9i64_to_v36f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; 
implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, 
v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: 
$vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: 
v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 
v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; 
SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9i64_to_v36f16: ; VI: ; %bb.0: @@ -8850,7 +19886,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -8870,9 +19906,9 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow +; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 +; VI-NEXT: s_cbranch_execz .LBB44_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 ; VI-NEXT: 
v_addc_u32_e32 v17, vcc, 0, v17, vcc @@ -8910,7 +19946,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB22_4: ; %end +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -8974,7 +20010,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -8994,9 +20030,9 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB44_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc @@ -9034,7 +20070,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -9065,7 +20101,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9090,7 +20126,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9119,7 +20155,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -9139,9 +20175,9 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9184,7 +20220,7 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 
%b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 @@ -9223,345 +20259,1213 @@ end: ret <36 x half> %phi } +define inreg <36 x half> @bitcast_v9i64_to_v36f16_scalar(<9 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9i64_to_v36f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s7, v1 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s10, s4, 16 +; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: s_add_u32 s12, s18, 3 +; SI-NEXT: s_addc_u32 s13, s19, 0 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: s_lshr_b32 s15, s13, 16 +; SI-NEXT: s_add_u32 s16, s20, 3 +; SI-NEXT: s_addc_u32 s17, s21, 0 +; SI-NEXT: s_lshr_b32 s18, s16, 16 +; SI-NEXT: s_lshr_b32 s19, s17, 16 +; SI-NEXT: s_add_u32 s20, s22, 3 +; SI-NEXT: s_addc_u32 s21, s23, 0 +; SI-NEXT: s_lshr_b32 s22, s20, 16 +; SI-NEXT: s_lshr_b32 s23, s21, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s40, s24, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s42, s26, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; 
SI-NEXT: s_lshr_b32 s44, s28, 16 +; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: s_lshr_b32 s47, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s56, s6, 16 +; SI-NEXT: s_lshr_b32 s57, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; 
SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: 
v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; 
SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; 
SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v9i64_to_v36f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_readfirstlane_b32 s9, v0 +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; 
VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s10, s7, 16 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s8, 16 +; VI-NEXT: s_lshr_b32 s13, s9, 16 +; VI-NEXT: s_lshr_b32 s14, s29, 16 +; VI-NEXT: s_lshr_b32 s15, s28, 16 +; VI-NEXT: s_lshr_b32 s40, s27, 16 +; VI-NEXT: s_lshr_b32 s41, s26, 16 +; VI-NEXT: s_lshr_b32 s42, s25, 16 +; VI-NEXT: s_lshr_b32 s43, s24, 16 +; VI-NEXT: s_lshr_b32 s44, s23, 16 +; VI-NEXT: s_lshr_b32 s45, s22, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s57, s18, 16 +; VI-NEXT: s_lshr_b32 s58, s17, 16 +; VI-NEXT: s_lshr_b32 s59, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s59, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s58, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s57, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s56, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s47, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s46, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s45, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s44, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s43, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s42, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s41, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: 
s_lshl_b32 s26, s40, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s26, s15 +; VI-NEXT: s_and_b32 s26, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s14, s26, s14 +; VI-NEXT: s_or_b32 s9, s9, s13 +; VI-NEXT: s_or_b32 s8, s8, s12 +; VI-NEXT: s_or_b32 s6, s6, s11 +; VI-NEXT: s_or_b32 s7, s7, s10 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s15 +; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v14, s9 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v17, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; 
implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v9i64_to_v36f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_lshr_b32 
s12, s7, 16 +; GFX9-NEXT: s_lshr_b32 s13, s6, 16 +; GFX9-NEXT: s_lshr_b32 s14, s29, 16 +; GFX9-NEXT: s_lshr_b32 s15, s28, 16 +; GFX9-NEXT: s_lshr_b32 s40, s27, 16 +; GFX9-NEXT: s_lshr_b32 s41, s26, 16 +; GFX9-NEXT: s_lshr_b32 s42, s25, 16 +; GFX9-NEXT: s_lshr_b32 s43, s24, 16 +; GFX9-NEXT: s_lshr_b32 s44, s23, 16 +; GFX9-NEXT: s_lshr_b32 s45, s22, 16 +; GFX9-NEXT: s_lshr_b32 s46, s21, 16 +; GFX9-NEXT: s_lshr_b32 s47, s20, 16 +; GFX9-NEXT: s_lshr_b32 s56, s19, 16 +; GFX9-NEXT: s_lshr_b32 s57, s18, 16 +; GFX9-NEXT: s_lshr_b32 s58, s17, 16 +; GFX9-NEXT: s_lshr_b32 s59, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s29, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s15 +; GFX9-NEXT: v_mov_b32_e32 v13, s14 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 
v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-TRUE16-LABEL: bitcast_v9i64_to_v36f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 
s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 
s18, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s18, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s19, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s18, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s19, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s18, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s19, s4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 +; +; GFX11-FAKE16-LABEL: bitcast_v9i64_to_v36f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s4, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, 
s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, 
s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s18, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s19, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s20, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s24, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s25, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s26, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s27, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s28, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s29, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s15 :: v_dual_mov_b32 v7, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s7 :: v_dual_mov_b32 v15, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v17, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <36 x half> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> %a to <36 x half> + br label %end + +end: + %phi = phi <36 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x half> %phi +} + define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v36f16_to_v9i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v27 -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v32, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; GCN-NEXT: v_or_b32_e32 v3, v60, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, 
v50 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_or_b32_e32 v5, v44, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v40, v7 -; GCN-NEXT: v_or_b32_e32 v8, v54, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v51, v10 -; GCN-NEXT: v_or_b32_e32 v11, v48, v11 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v18, v12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v18, v14 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v18, v15 -; GCN-NEXT: v_or_b32_e32 v16, v37, v16 -; GCN-NEXT: v_or_b32_e32 v17, v36, v17 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: 
$vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v60 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, 
v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v55 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v53 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v50 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v49 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v36 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: 
v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_or_b32_e32 v6, v9, v8 -; GCN-NEXT: v_or_b32_e32 v7, v11, v10 -; GCN-NEXT: v_or_b32_e32 v8, v13, v12 -; GCN-NEXT: v_or_b32_e32 v9, v15, v14 -; GCN-NEXT: v_or_b32_e32 v10, v17, v16 -; GCN-NEXT: v_or_b32_e32 v11, v19, v18 -; GCN-NEXT: v_or_b32_e32 v12, v21, v20 -; GCN-NEXT: v_or_b32_e32 v13, v23, v22 -; GCN-NEXT: v_or_b32_e32 v14, v25, v24 -; GCN-NEXT: v_or_b32_e32 v15, v27, v26 -; GCN-NEXT: v_or_b32_e32 v16, v29, v28 -; GCN-NEXT: v_or_b32_e32 v17, v31, v30 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; 
GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36f16_to_v9i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_cvt_f16_f32_e32 
v41, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; 
SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v44, v7 +; SI-NEXT: v_or_b32_e32 v8, v42, v8 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v52, v11 +; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; 
implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v17, v36, v17 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: 
v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v13, 
0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: 
s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36f16_to_v9i64: ; VI: ; %bb.0: @@ -9590,7 +21494,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz 
.LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -9647,9 +21551,9 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v41, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9706,7 +21610,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -9776,7 +21680,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -9841,9 +21745,9 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: 
$vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -9891,7 +21795,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -9920,7 +21824,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -9940,7 +21844,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9988,7 +21892,7 @@ define 
<9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -10008,7 +21912,7 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -10028,175 +21932,1067 @@ end: ret <9 x i64> %phi } +define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36f16_to_v9i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s26 +; 
SI-NEXT: v_cvt_f16_f32_e32 v59, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v63, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v37, v11 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v13, v29, v13 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_or_b32_e32 v15, v25, v15 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: 
v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: 
v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: 
v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, 
off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v23 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v39 +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v25 +; SI-NEXT: v_mov_b32_e32 
v32, v44 +; SI-NEXT: v_mov_b32_e32 v44, v49 +; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v53, v30 +; SI-NEXT: v_mov_b32_e32 v54, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v24, v39 +; SI-NEXT: v_mov_b32_e32 v39, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v37 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v19, v36 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v54 +; SI-NEXT: v_mov_b32_e32 v30, v53 +; SI-NEXT: v_mov_b32_e32 v29, v52 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v49, v44 +; SI-NEXT: v_mov_b32_e32 v44, v32 +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: v_mov_b32_e32 v48, v43 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v36f16_to_v9i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 
16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, 
v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: 
v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, 
v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v35, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v16, v18, v16 +; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v36f16_to_v9i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: 
v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; 
GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v9i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 
8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_branch .LBB47_5 +; GFX11-TRUE16-NEXT: .LBB47_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 
v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB47_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v9i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-FAKE16-NEXT: .LBB47_2: ; 
%cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_3: +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <36 x half> %a, splat (half 0xH0200) + %a2 = bitcast <36 x half> %a1 to <9 x i64> + br label %end + +cmp.false: + %a3 = bitcast <36 x half> %a to <9 x i64> + br label %end + +end: + %phi = phi <9 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i64> %phi +} + define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f64_to_v36i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 
v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v23, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v24, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v26, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v28, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v31, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v20, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v23, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v24, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v26, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v28, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v31, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; 
GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v1, v1, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v36 -; GCN-NEXT: v_or_b32_e32 v2, v2, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v3, v3, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v4, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v5, v5, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v7, v7, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 24, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v8, v8, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: 
v_or_b32_e32 v9, v9, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v10, v10, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v12, v12, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v13, v13, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v15, v15, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v16, v16, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v17, v17, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v18, v18, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword 
v14, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v25, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f64_to_v36i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: 
v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v22, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v24, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, 
v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; 
SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v36i16: ; VI: ; 
%bb.0: @@ -10222,7 +23018,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -10242,9 +23038,9 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -10273,7 +23069,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -10337,7 +23133,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -10357,9 +23153,9 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB24_2: ; 
%Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -10388,7 +23184,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -10419,7 +23215,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -10430,7 +23226,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10459,7 +23255,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -10479,9 +23275,9 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -10510,7 +23306,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 @@ -10549,269 +23345,1033 @@ end: ret <36 x i16> %phi } +define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f64_to_v36i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 +; SI-NEXT: v_mov_b32_e32 v16, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: v_mov_b32_e32 
v6, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v21, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v29, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v31, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v19, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_alignbit_b32 v5, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v21, v7, v6, 16 +; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 +; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 +; SI-NEXT: v_alignbit_b32 v29, v15, v14, 16 +; SI-NEXT: v_alignbit_b32 v31, v17, v16, 16 +; SI-NEXT: v_alignbit_b32 v34, v19, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 
16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v34 +; SI-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 
0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v22 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v9f64_to_v36i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: 
v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: 
v_lshrrev_b32_e32 v5, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; VI-NEXT: v_or_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; VI-NEXT: v_or_b32_sdwa v21, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; 
VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v18, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; VI-NEXT: v_or_b32_sdwa v5, v15, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v19, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v20 +; VI-NEXT: v_mov_b32_e32 v1, v21 +; VI-NEXT: v_mov_b32_e32 v2, v18 +; VI-NEXT: v_mov_b32_e32 v3, v19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v9f64_to_v36i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: 
v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v14, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v15, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v19, v34, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v16, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: 
v_lshl_or_b32 v7, v32, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v17, v22, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v21 +; GFX9-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, 
s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], s[28:29], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], s[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], s[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], s[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; GFX11-TRUE16-NEXT: s_branch .LBB49_5 +; GFX11-TRUE16-NEXT: .LBB49_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s46 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s5 +; GFX11-TRUE16-NEXT: .LBB49_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v34, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v33, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v32, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v26, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v12, 16, v17 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v35, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v31, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v22, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v23, 16, v19 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v9f64_to_v36i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; 
GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], s[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], s[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], s[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], s[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; GFX11-FAKE16-NEXT: s_branch .LBB49_5 +; GFX11-FAKE16-NEXT: .LBB49_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v19, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v10, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v8, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v20, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v4, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v18, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v16, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v14, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v7, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v27, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v21, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v35, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v33, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s15 :: v_dual_mov_b32 v31, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v29, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v25, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v23, s5 +; GFX11-FAKE16-NEXT: .LBB49_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v34, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v33, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v32, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v26, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v35, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v31, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v22, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v23, 16, v19 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <9 
x double> %a1 to <36 x i16> + br label %end + +cmp.false: + %a3 = bitcast <9 x double> %a to <36 x i16> + br label %end + +end: + %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x i16> %phi +} + define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v36i16_to_v9f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: 
v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v40 -; GCN-NEXT: v_or_b32_e32 v1, v1, v41 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v4, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v5, v18 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: v_or_b32_e32 v8, 
v8, v44 -; GCN-NEXT: v_or_b32_e32 v9, v9, v45 -; GCN-NEXT: v_or_b32_e32 v10, v10, v46 -; GCN-NEXT: v_or_b32_e32 v11, v11, v47 -; GCN-NEXT: v_or_b32_e32 v12, v12, v56 -; GCN-NEXT: v_or_b32_e32 v13, v13, v57 -; GCN-NEXT: v_or_b32_e32 v14, v14, v58 -; GCN-NEXT: v_or_b32_e32 v15, v15, v59 -; GCN-NEXT: v_or_b32_e32 v16, v16, v60 -; GCN-NEXT: v_or_b32_e32 v17, v17, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; 
GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v0, v40, v0 -; GCN-NEXT: v_or_b32_e32 v1, v41, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, 
v32, v3 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v18, v4 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v18, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v43, v7 -; GCN-NEXT: v_or_b32_e32 v8, v44, v8 -; GCN-NEXT: v_or_b32_e32 v9, v45, v9 -; GCN-NEXT: v_or_b32_e32 v10, v46, v10 -; GCN-NEXT: v_or_b32_e32 v11, v47, v11 -; GCN-NEXT: v_or_b32_e32 v12, v56, v12 -; GCN-NEXT: v_or_b32_e32 v13, v57, v13 -; GCN-NEXT: v_or_b32_e32 v14, v58, v14 -; GCN-NEXT: v_or_b32_e32 v15, v59, v15 -; GCN-NEXT: v_or_b32_e32 v16, v60, v16 -; GCN-NEXT: v_or_b32_e32 v17, v61, v17 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36i16_to_v9f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: 
v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: 
v_and_b32_e32 v11, 0xffff, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v40 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v0, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v63 +; SI-NEXT: v_or_b32_e32 v5, v5, v62 +; SI-NEXT: v_or_b32_e32 v6, v6, v61 +; SI-NEXT: v_or_b32_e32 v7, v7, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v59 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v56 +; SI-NEXT: v_or_b32_e32 v16, v16, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v13, 
0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v12, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v46 +; SI-NEXT: v_or_b32_e32 v14, v14, v45 +; SI-NEXT: v_or_b32_e32 v15, v15, v44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v36 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: 
v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: v_or_b32_e32 v5, v62, v5 +; SI-NEXT: v_or_b32_e32 v6, v61, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v59, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v56, v11 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v17, v42, v17 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v12, v47, v12 +; SI-NEXT: v_or_b32_e32 v13, v46, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v44, v15 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; 
SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36i16_to_v9f64: ; VI: ; %bb.0: @@ -10840,7 +24400,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -10897,9 +24457,9 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v41 @@ -10956,7 +24516,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v18, 3, v32 ; VI-NEXT: v_add_u16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -11026,7 +24586,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: 
s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -11091,9 +24651,9 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -11140,7 +24700,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -11169,7 +24729,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -11189,7 +24749,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 
op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11237,7 +24797,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -11257,7 +24817,7 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -11277,319 +24837,1097 @@ end: ret <9 x double> %phi } +define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36i16_to_v9f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, 
off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v20 +; SI-NEXT: v_mov_b32_e32 v33, v18 +; SI-NEXT: v_mov_b32_e32 v34, v16 +; SI-NEXT: v_mov_b32_e32 v35, v14 +; SI-NEXT: v_mov_b32_e32 v36, v12 +; SI-NEXT: v_mov_b32_e32 v37, v10 +; SI-NEXT: v_mov_b32_e32 v38, v8 +; SI-NEXT: v_mov_b32_e32 v39, v6 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v50, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v21 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v7, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v8, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v9, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 
0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v16, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v17, v0, v51 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 
+; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v36i16_to_v9f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec 
+; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; 
VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: 
v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v36i16_to_v9f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, 
s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, 
s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v9f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: 
v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_branch .LBB51_5 +; GFX11-TRUE16-NEXT: .LBB51_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, 
s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB51_5: ; %end +; GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v9f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, 
s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_3: +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <36 x i16> %a, splat (i16 3) + %a2 = bitcast <36 x i16> %a1 to <9 x double> + br label %end + +cmp.false: + %a3 = bitcast <36 x i16> %a to <9 x double> + br label %end + +end: + %phi = phi <9 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x double> %phi +} + define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v9f64_to_v36f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr37 -; 
GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: 
v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 
-; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v42 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v53 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v51 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v49 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v39 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v37 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v8, v7 -; 
GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v10, v35, v37 -; GCN-NEXT: v_or_b32_e32 v11, v33, v49 -; GCN-NEXT: v_or_b32_e32 v13, v31, v52 -; GCN-NEXT: v_or_b32_e32 v14, v29, v50 -; GCN-NEXT: v_or_b32_e32 v16, v27, v48 -; GCN-NEXT: v_or_b32_e32 v17, v25, v38 -; GCN-NEXT: v_or_b32_e32 v24, v24, v36 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, 
off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v9f64_to_v36f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 
+; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 
v20, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v39, 
16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f64_to_v36f16: ; VI: ; %bb.0: @@ -11615,7 +25953,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, 
v17 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -11635,9 +25973,9 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: s_cbranch_execz .LBB52_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -11666,7 +26004,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 @@ -11730,7 +26068,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -11750,9 +26088,9 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: .LBB52_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: s_cbranch_execz .LBB52_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -11781,7 +26119,7 @@ define <36 x half> 
@bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v35, v0, s4 @@ -11812,7 +26150,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -11823,7 +26161,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11852,7 +26190,7 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 @@ -11872,9 +26210,9 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 ; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -11899,31 +26237,894 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, 
v19, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <9 x double> %a1 to <36 x half> + br label %end + +cmp.false: + %a3 = bitcast <9 x double> %a to <36 x half> + br label %end + +end: + %phi = phi <36 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x half> %phi +} + +define inreg <36 x half> @bitcast_v9f64_to_v36f16_scalar(<9 x double> 
inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v9f64_to_v36f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v4 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s8, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: s_lshr_b32 s8, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 +; SI-NEXT: s_lshr_b32 s8, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: s_lshr_b32 s8, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: s_lshr_b32 s8, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: s_lshr_b32 s8, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: s_lshr_b32 s8, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 +; SI-NEXT: s_lshr_b32 s8, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s8 +; SI-NEXT: s_lshr_b32 s8, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s8 +; SI-NEXT: s_lshr_b32 s8, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s8 +; SI-NEXT: s_lshr_b32 s8, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s8 +; SI-NEXT: s_lshr_b32 s8, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s8 +; SI-NEXT: s_lshr_b32 s8, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s8 +; SI-NEXT: s_lshr_b32 s8, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s8 +; SI-NEXT: s_lshr_b32 s8, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s8 +; SI-NEXT: s_lshr_b32 s8, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s8 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s8 +; SI-NEXT: s_lshr_b32 s8, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: 
v_cvt_f32_f16_e32 v14, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[28:29], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[25:26], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[21:22], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[17:18], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[13:14], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, 
v13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, 
v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword 
v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 
offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v9f64_to_v36f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v18, s16 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v16, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v14, s20 +; 
VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; 
VI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; VI-NEXT: v_or_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_e32 v14, 16, v25 +; VI-NEXT: v_or_b32_sdwa v21, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v18, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 +; VI-NEXT: v_or_b32_sdwa v5, v15, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v19, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v20 +; VI-NEXT: v_mov_b32_e32 v1, v21 +; VI-NEXT: v_mov_b32_e32 v2, v18 +; VI-NEXT: v_mov_b32_e32 v3, v19 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: s_branch .LBB53_2 
+; +; GFX9-LABEL: bitcast_v9f64_to_v36f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mov_b32_e32 v18, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v16, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v14, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; 
GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v35, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v15, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v19, v34, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; 
GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v16, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v17, v22, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v21 +; GFX9-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 
s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], s[28:29], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], s[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], s[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], s[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], s[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; GFX11-TRUE16-NEXT: s_branch .LBB53_5 +; GFX11-TRUE16-NEXT: .LBB53_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s23 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s46 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s5 +; GFX11-TRUE16-NEXT: .LBB53_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v34, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v33, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v32, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 
0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v26, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v12, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v35, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v31, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v22, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v23, 16, v19 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v9f64_to_v36f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s16, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s12, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_4 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], s[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], s[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], s[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], s[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], s[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], s[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], s[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], s[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; GFX11-FAKE16-NEXT: s_branch .LBB53_5 +; GFX11-FAKE16-NEXT: .LBB53_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v19, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v10, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v8, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v20, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v4, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v18, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v16, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v14, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s46 :: v_dual_mov_b32 v7, s43 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v27, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s44 :: v_dual_mov_b32 v21, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v35, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v33, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s15 :: v_dual_mov_b32 v31, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v29, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v25, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, 
s6 :: v_dual_mov_b32 v23, s5 +; GFX11-FAKE16-NEXT: .LBB53_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v34, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v33, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v32, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v26, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v35, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v31, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v22, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v23, 16, v19 ; GFX11-FAKE16-NEXT: s_setpc_b64 
s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11943,344 +27144,361 @@ end: } define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v36f16_to_v9f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 -; GCN-NEXT: 
buffer_load_dword v39, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v32, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; GCN-NEXT: v_or_b32_e32 v2, v62, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; GCN-NEXT: v_or_b32_e32 v3, v60, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v46, v4 -; GCN-NEXT: v_or_b32_e32 v5, v44, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v40, v7 -; GCN-NEXT: v_or_b32_e32 v8, v54, v8 -; GCN-NEXT: v_or_b32_e32 v9, v52, v9 -; GCN-NEXT: v_or_b32_e32 v10, v51, v10 -; GCN-NEXT: v_or_b32_e32 v11, v48, v11 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v18, v12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v18, v14 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v18, v15 -; GCN-NEXT: v_or_b32_e32 v16, v37, v16 -; GCN-NEXT: v_or_b32_e32 v17, v36, v17 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; kill: killed $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: 
s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v60 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v41 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v19, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v55 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v53 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v50 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v49 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v36 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: 
v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; 
GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_or_b32_e32 v5, v7, v6 -; GCN-NEXT: v_or_b32_e32 v6, v9, v8 -; GCN-NEXT: v_or_b32_e32 v7, v11, v10 -; GCN-NEXT: v_or_b32_e32 v8, v13, v12 -; GCN-NEXT: v_or_b32_e32 v9, v15, v14 -; GCN-NEXT: v_or_b32_e32 v10, v17, v16 -; GCN-NEXT: v_or_b32_e32 v11, v19, v18 -; GCN-NEXT: v_or_b32_e32 v12, v21, v20 -; GCN-NEXT: v_or_b32_e32 v13, v23, v22 -; GCN-NEXT: v_or_b32_e32 v14, v25, v24 -; GCN-NEXT: v_or_b32_e32 v15, v27, v26 -; GCN-NEXT: v_or_b32_e32 v16, v29, v28 -; GCN-NEXT: v_or_b32_e32 v17, v31, v30 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], 
s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36f16_to_v9f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_or_b32_e32 v4, v58, v4 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, 
v6 +; SI-NEXT: v_or_b32_e32 v7, v44, v7 +; SI-NEXT: v_or_b32_e32 v8, v42, v8 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v52, v11 +; SI-NEXT: v_or_b32_e32 v12, v50, v12 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr18 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; 
SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v17, v36, v17 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, 
v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; 
SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], 
s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36f16_to_v9f64: ; VI: ; %bb.0: @@ -12309,7 +27527,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v17, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v17, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -12366,9 +27584,9 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v41, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -12425,7 
+27643,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12495,7 +27713,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; kill: killed $vgpr18 @@ -12560,9 +27778,9 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -12610,7 +27828,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] 
-; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12639,7 +27857,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -12659,7 +27877,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12707,7 +27925,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -12727,7 +27945,7 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -12747,420 +27965,1308 @@ end: ret <9 x double> %phi } +define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36f16_to_v9f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v0 +; SI-NEXT: 
v_cvt_f16_f32_e32 v47, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: 
v_lshlrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v3, v63, v3 +; SI-NEXT: v_or_b32_e32 v4, v41, v4 +; SI-NEXT: v_or_b32_e32 v5, v55, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v51, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_or_b32_e32 v11, v37, v11 +; SI-NEXT: v_or_b32_e32 v12, v31, v12 +; SI-NEXT: v_or_b32_e32 v13, v29, v13 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_or_b32_e32 v15, v25, v15 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_or_b32_e32 v7, 
v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v27 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: 
v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v38, v23 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v39 +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: v_mov_b32_e32 v43, v48 +; SI-NEXT: v_mov_b32_e32 v48, v25 +; SI-NEXT: v_mov_b32_e32 v32, v44 +; SI-NEXT: v_mov_b32_e32 v44, v49 +; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v27 +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v53, v30 +; SI-NEXT: v_mov_b32_e32 v54, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v24, v39 +; SI-NEXT: v_mov_b32_e32 v39, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v38 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v37 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v19, v36 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v54 +; SI-NEXT: v_mov_b32_e32 v30, v53 +; SI-NEXT: v_mov_b32_e32 v29, v52 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v49, v44 +; SI-NEXT: v_mov_b32_e32 v44, v32 +; SI-NEXT: v_mov_b32_e32 v25, v48 +; SI-NEXT: v_mov_b32_e32 v48, v43 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v36f16_to_v9f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_mov_b32_e32 v32, v3 +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v34, v1 +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 
s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v35, v17 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v16, v18, v16 +; VI-NEXT: v_add_f16_sdwa v17, v32, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v36f16_to_v9f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v3 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v34, v1 +; GFX9-NEXT: v_mov_b32_e32 v35, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v33 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; 
GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v9f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v32, s32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s36, 0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 
vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s37, 1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s38, 2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s39, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s48, 4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s49, 5 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s50, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s51, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s52, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s53, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s54, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s55, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s64, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s65, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s66, 14 +; GFX11-TRUE16-NEXT: v_writelane_b32 v32, s67, 15 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s36, s40, s77 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s25 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s37, s41, s76 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s48, s56, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s49, s57, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s38, s40, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s39, s41, s74 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s41, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s42, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s43, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s44, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s45, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s50, s56, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s51, s57, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s40, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s41, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s42, s42, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s43, s43, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s44, s44, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s45, s45, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s46, s46, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s52, s56, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s53, s57, s4 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s75 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s74 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] 
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_branch .LBB55_5 +; GFX11-TRUE16-NEXT: .LBB55_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v5, s41 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v7, s43 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s44 :: v_dual_mov_b32 v9, s45 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s46 :: v_dual_mov_b32 v11, s47 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s48 :: v_dual_mov_b32 v13, s49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s50 :: v_dual_mov_b32 v15, s51 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v17, s53 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s54 :: v_dual_mov_b32 v19, s55 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s56 :: v_dual_mov_b32 v21, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s58 :: v_dual_mov_b32 v23, s59 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s60 :: v_dual_mov_b32 v25, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s62 :: v_dual_mov_b32 v27, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s64 :: v_dual_mov_b32 v29, s65 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s66 :: v_dual_mov_b32 v31, s67 +; GFX11-TRUE16-NEXT: .LBB55_5: ; %end +; 
GFX11-TRUE16-NEXT: v_readlane_b32 s67, v32, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s66, v32, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v32, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v32, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s55, v32, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v32, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v32, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v32, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v32, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v32, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v32, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v32, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v32, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v32, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v32, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v32, 0 +; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v9f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 
s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_3: +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s22 :: v_dual_mov_b32 v23, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s26 :: v_dual_mov_b32 v27, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s28 :: v_dual_mov_b32 v29, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <36 x half> %a, splat (half 
0xH0200) + %a2 = bitcast <36 x half> %a1 to <9 x double> + br label %end + +cmp.false: + %a3 = bitcast <36 x half> %a to <9 x double> + br label %end + +end: + %phi = phi <9 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x double> %phi +} + define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v36i16_to_v36f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 
-; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 
v49, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v30 -; GCN-NEXT: s_waitcnt vmcnt(9) -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v39 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: .LBB28_2: ; %Flow -; 
GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v39 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v37 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v35 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 
v52, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v41 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v42 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v43 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v44 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v45 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v46 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v47 -; GCN-NEXT: buffer_load_dword v20, off, 
s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v56 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v57 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v48 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v49 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v50 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v51 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: 
v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v8, v7 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v14, v13 -; GCN-NEXT: v_or_b32_e32 v8, v17, v16 -; GCN-NEXT: v_or_b32_e32 v10, v20, v19 -; GCN-NEXT: v_or_b32_e32 v11, v23, v22 -; GCN-NEXT: v_or_b32_e32 v13, v26, v25 -; GCN-NEXT: v_or_b32_e32 v14, v29, v28 -; GCN-NEXT: v_or_b32_e32 v16, v35, v34 -; GCN-NEXT: v_or_b32_e32 v17, v38, v37 -; GCN-NEXT: v_or_b32_e32 v19, v49, v48 -; GCN-NEXT: v_or_b32_e32 v20, v52, v51 -; GCN-NEXT: v_or_b32_e32 v22, v53, v42 -; GCN-NEXT: v_or_b32_e32 v23, v54, v31 -; GCN-NEXT: v_or_b32_e32 v25, v55, v32 -; GCN-NEXT: v_or_b32_e32 v26, v40, v33 -; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword 
v20, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36i16_to_v36f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; 
SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 +; SI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; 
implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, 
v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 
v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 
offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36i16_to_v36f16: ; VI: ; %bb.0: @@ -13187,7 +29293,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v35, 3, v35 @@ -13225,7 +29331,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v20, 3, v20 ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_add_u16_e32 v19, 3, v19 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13290,7 +29396,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], 
vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v17, v35, v17, s6 @@ -13347,7 +29453,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v18, v0, s4 @@ -13378,7 +29484,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] @@ -13398,7 +29504,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13428,7 +29534,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; 
GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v17, v36, v17, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 @@ -13484,7 +29590,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v19, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v20, v1, 0x5040100 @@ -13522,323 +29628,1274 @@ end: ret <36 x half> %phi } +define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36i16_to_v36f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: 
s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: 
v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: 
v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 
v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v36i16_to_v36f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; 
VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; 
VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: s_or_b32 s6, s18, s6 
+; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v36i16_to_v36f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v13, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: 
s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v10, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v9, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v8, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v7, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v6, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v5, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v19, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v18, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v21, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, 
v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s42 +; GFX9-NEXT: v_mov_b32_e32 v24, s41 +; GFX9-NEXT: v_mov_b32_e32 v25, s40 +; GFX9-NEXT: v_mov_b32_e32 v26, s15 +; GFX9-NEXT: v_mov_b32_e32 v27, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v29, s12 +; GFX9-NEXT: v_mov_b32_e32 v30, s11 +; GFX9-NEXT: v_mov_b32_e32 v31, s10 +; GFX9-NEXT: v_mov_b32_e32 v32, s9 +; GFX9-NEXT: v_mov_b32_e32 v33, s8 +; GFX9-NEXT: v_mov_b32_e32 v34, s7 +; GFX9-NEXT: v_mov_b32_e32 v35, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 
v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v20, v35, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v34, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v4, v31, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v29, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v28, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v27, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v26, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v23, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v22, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v21 +; GFX9-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v36f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, 
s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s14, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s13, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s12, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s11, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s10, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v33.l, s5 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v28, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v29, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v23, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v24, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v27, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v24 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-FAKE16-LABEL: bitcast_v36i16_to_v36f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v8, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v10, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v4, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v0, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <36 x i16> %a, splat (i16 3) + %a2 = bitcast <36 x i16> %a1 to <36 x half> + br label %end + +cmp.false: + %a3 = 
bitcast <36 x i16> %a to <36 x half> + br label %end + +end: + %phi = phi <36 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x half> %phi +} + define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v36f16_to_v36i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v14 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v11, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v30 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v55 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v44 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v4 -; GCN-NEXT: v_or_b32_e32 v3, v3, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; GCN-NEXT: v_or_b32_e32 v5, v5, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v31 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v32 -; GCN-NEXT: v_or_b32_e32 v31, v21, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v33 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v34 -; GCN-NEXT: v_or_b32_e32 v33, v21, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v35 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v21 
-; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v36 -; GCN-NEXT: v_or_b32_e32 v35, v21, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v52, v22, v21 -; GCN-NEXT: v_or_b32_e32 v50, v24, v23 -; GCN-NEXT: v_or_b32_e32 v48, v26, v25 -; GCN-NEXT: v_or_b32_e32 v38, v28, v27 -; GCN-NEXT: v_or_b32_e32 v37, v30, v29 -; GCN-NEXT: v_or_b32_e32 v18, v18, v20 -; GCN-NEXT: v_or_b32_e32 v16, v16, v19 -; GCN-NEXT: v_or_b32_e32 v14, v14, v17 -; GCN-NEXT: v_or_b32_e32 v13, v13, v15 -; GCN-NEXT: v_alignbit_b32 v54, v35, v21, 16 -; GCN-NEXT: v_alignbit_b32 v53, v33, v23, 16 -; GCN-NEXT: v_alignbit_b32 v51, v31, v25, 16 -; GCN-NEXT: v_alignbit_b32 v49, v11, v27, 16 -; GCN-NEXT: v_alignbit_b32 v39, v9, v29, 16 -; GCN-NEXT: v_alignbit_b32 v20, v7, v20, 16 -; GCN-NEXT: v_alignbit_b32 v19, v5, v19, 16 -; GCN-NEXT: v_alignbit_b32 v17, v3, v17, 16 -; GCN-NEXT: v_alignbit_b32 v15, v1, v15, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v54 -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v36 -; GCN-NEXT: v_or_b32_e32 v22, v22, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 
v24, 0xffff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v53 -; GCN-NEXT: v_or_b32_e32 v24, v24, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v34 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v51 -; GCN-NEXT: v_or_b32_e32 v28, v28, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; GCN-NEXT: v_or_b32_e32 v30, v30, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v49 -; GCN-NEXT: v_or_b32_e32 v32, v32, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v39 -; GCN-NEXT: v_or_b32_e32 v34, v34, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v18, v18, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_or_b32_e32 v16, v16, v19 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: v_add_i32_e32 
v6, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_or_b32_e32 v14, v14, v17 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v13, v13, v15 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v36f16_to_v36i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v52, 
v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v43 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: 
v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v53 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_or_b32_e32 v31, v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v33, v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: 
v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v36, v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v39, v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v50, v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v49, v21, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_or_b32_e32 v38, v24, v21 +; SI-NEXT: 
v_cvt_f32_f16_e32 v24, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v35, v25, v23 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_or_b32_e32 v15, v15, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v18 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_alignbit_b32 v55, v39, v20, 16 +; SI-NEXT: v_alignbit_b32 v54, v36, v22, 16 +; SI-NEXT: v_alignbit_b32 v53, v33, v21, 16 +; SI-NEXT: v_alignbit_b32 v52, v31, v23, 16 +; SI-NEXT: v_alignbit_b32 v51, v13, v24, 16 +; SI-NEXT: v_alignbit_b32 v19, v10, v19, 16 +; SI-NEXT: v_alignbit_b32 v18, v6, v18, 16 +; SI-NEXT: v_alignbit_b32 v16, v3, v16, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; SI-NEXT: 
v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v17, v17, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 32, v0 +; 
SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v17, v20, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, 
v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v36f16_to_v36i16: ; VI: ; %bb.0: @@ -13865,7 +30922,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v35, 0x200, v35 @@ -13903,7 +30960,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 ; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13968,7 +31025,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v17, v35, 
v17, s6 @@ -14026,7 +31083,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v18, v0, s4 @@ -14057,7 +31114,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] @@ -14077,7 +31134,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -14107,7 +31164,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v17, v36, v17, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100 @@ -14163,7 +31220,7 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v19, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v20, v1, 0x5040100 @@ -14200,3 +31257,937 @@ end: %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <36 x i16> %phi } + +define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v36f16_to_v36i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s25 +; 
SI-NEXT: v_cvt_f16_f32_e32 v13, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; 
SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v23, v23, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; 
SI-NEXT: v_or_b32_e32 v24, v24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v27, v27, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v13, v13, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v37 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: 
v_or_b32_e32 v18, v18, v34 +; SI-NEXT: v_or_b32_e32 v15, v15, v33 +; SI-NEXT: v_or_b32_e32 v29, v29, v32 +; SI-NEXT: v_or_b32_e32 v26, v26, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: v_alignbit_b32 v36, v19, v36, 16 +; SI-NEXT: v_alignbit_b32 v35, v16, v35, 16 +; SI-NEXT: v_alignbit_b32 v34, v13, v34, 16 +; SI-NEXT: v_alignbit_b32 v33, v27, v33, 16 +; SI-NEXT: v_alignbit_b32 v32, v24, v32, 16 +; SI-NEXT: v_alignbit_b32 v31, v23, v31, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v30, 16 +; SI-NEXT: v_alignbit_b32 v12, v3, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v22, v22, v36 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; 
SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v10, v11, 
s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v36f16_to_v36i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; 
VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v22, 0x200 +; VI-NEXT: v_add_f16_e32 v20, s16, v22 +; VI-NEXT: v_add_f16_e32 v35, s43, v22 +; VI-NEXT: v_add_f16_e32 v21, s17, v22 +; VI-NEXT: v_add_f16_e32 v34, s42, v22 +; VI-NEXT: v_add_f16_e32 v18, s18, v22 +; VI-NEXT: v_add_f16_e32 v33, s41, v22 +; VI-NEXT: v_add_f16_e32 v19, s19, v22 +; VI-NEXT: v_add_f16_e32 v32, s40, v22 +; VI-NEXT: v_add_f16_e32 v4, s20, v22 +; VI-NEXT: v_add_f16_e32 v31, s15, v22 +; VI-NEXT: v_add_f16_e32 v5, s21, v22 +; VI-NEXT: v_add_f16_e32 v30, s14, v22 +; VI-NEXT: v_add_f16_e32 v6, s22, v22 +; VI-NEXT: v_add_f16_e32 v29, s13, v22 +; VI-NEXT: v_add_f16_e32 v7, s23, v22 +; VI-NEXT: v_add_f16_e32 v28, s12, v22 +; VI-NEXT: v_add_f16_e32 v8, s24, v22 +; VI-NEXT: v_add_f16_e32 v27, s11, v22 +; VI-NEXT: v_add_f16_e32 v9, s25, v22 +; VI-NEXT: v_add_f16_e32 v26, s10, v22 +; VI-NEXT: v_add_f16_e32 v10, s26, v22 +; VI-NEXT: v_add_f16_e32 v25, s9, v22 +; VI-NEXT: v_add_f16_e32 v11, s27, v22 +; VI-NEXT: v_add_f16_e32 v24, s8, v22 +; VI-NEXT: v_add_f16_e32 v12, s28, v22 +; VI-NEXT: v_add_f16_e32 v23, s7, v22 +; VI-NEXT: v_add_f16_e32 v13, s29, v22 +; VI-NEXT: v_add_f16_e32 v22, s6, v22 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v22, s6 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v23, s7 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v24, s8 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v25, s9 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; 
VI-NEXT: v_mov_b32_e32 v26, s10 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v27, s11 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v28, s12 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v29, s13 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v30, s14 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v31, s15 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v32, s40 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v33, s41 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v34, s42 +; VI-NEXT: v_mov_b32_e32 v21, s17 +; VI-NEXT: v_mov_b32_e32 v35, s43 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, 
v24 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_or_b32_sdwa v4, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v20 +; VI-NEXT: v_mov_b32_e32 v1, v21 +; VI-NEXT: v_mov_b32_e32 v2, v18 +; VI-NEXT: v_mov_b32_e32 v3, v19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v36f16_to_v36i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; 
GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v11, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v10, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v9, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v8, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v7, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: 
s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v6, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v5, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v4, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 v19, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v18, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v21, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v20, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 
+; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s42 +; GFX9-NEXT: v_mov_b32_e32 v24, s41 +; GFX9-NEXT: v_mov_b32_e32 v25, s40 +; GFX9-NEXT: v_mov_b32_e32 v26, s15 +; GFX9-NEXT: v_mov_b32_e32 v27, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s13 +; GFX9-NEXT: v_mov_b32_e32 v29, s12 +; GFX9-NEXT: v_mov_b32_e32 v30, s11 +; GFX9-NEXT: v_mov_b32_e32 v31, s10 +; GFX9-NEXT: v_mov_b32_e32 v32, s9 +; GFX9-NEXT: v_mov_b32_e32 v33, s8 +; GFX9-NEXT: v_mov_b32_e32 v34, s7 +; GFX9-NEXT: v_mov_b32_e32 v35, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v20, v35, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v34, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v4, v31, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v29, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v28, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v27, 16, v8 +; GFX9-NEXT: 
v_lshl_or_b32 v9, v26, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v23, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v22, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v20 +; GFX9-NEXT: v_mov_b32_e32 v1, v21 +; GFX9-NEXT: v_mov_b32_e32 v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, v19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v36i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s14, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s13, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s12, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s11, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s10, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s5 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v28, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v29, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v23, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v24, 16, v11 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v35, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v27, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v24 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v36i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 
s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s11 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v8, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v10, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v4, s18 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v0, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s45 :: v_dual_mov_b32 v19, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s43 :: v_dual_mov_b32 v21, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s15 :: v_dual_mov_b32 v25, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s11 :: v_dual_mov_b32 v29, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s9 :: v_dual_mov_b32 v31, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s6 :: v_dual_mov_b32 v33, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s4 :: v_dual_mov_b32 v35, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v33, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v31, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v29, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v28, 16, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v27, 16, v12 +; GFX11-FAKE16-NEXT: 
v_lshl_or_b32 v9, v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v24, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v23, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v21, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v20, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v19, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v24 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <36 x half> %a, splat (half 0xH0200) + %a2 = bitcast <36 x half> %a1 to <36 x i16> + br label %end + +cmp.false: + %a3 = bitcast <36 x half> %a to <36 x i16> + br label %end + +end: + %phi = phi <36 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <36 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 67e035ba7d934..0ac06bbd1b996 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -1,44 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn 
-mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <20 x float> @bitcast_v20i32_to_v20f32(<20 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v20i32_to_v20f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20i32_to_v20f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; 
SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v20f32: ; VI: ; %bb.0: @@ -156,39 +156,269 @@ end: ret <20 x float> %phi } +define inreg <20 x float> @bitcast_v20i32_to_v20f32_scalar(<20 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i32_to_v20f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: 
v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v20i32_to_v20f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; 
VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v20i32_to_v20f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: 
v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v20i32_to_v20f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: 
v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <20 x float> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <20 x float> + br label %end + +end: + %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x float> %phi +} + define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f32_to_v20i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f32_to_v20i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, 
v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f32_to_v20i32: ; VI: ; %bb.0: @@ -197,7 +427,7 @@ define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -219,7 +449,7 @@ define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -230,7 +460,7 @@ define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -252,7 +482,7 @@ define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -264,7 +494,7 @@ define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { ; GFX11-NEXT: 
s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 @@ -276,7 +506,7 @@ define <20 x i32> @bitcast_v20f32_to_v20i32(<20 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -296,39 +526,259 @@ end: ret <20 x i32> %phi } +define inreg <20 x i32> @bitcast_v20f32_to_v20i32_scalar(<20 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f32_to_v20i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: 
s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v20f32_to_v20i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; 
%cmp.true +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v20f32_to_v20i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: 
.LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v20f32_to_v20i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: 
; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <20 x float> %a1 to <20 x i32> + br label %end + +cmp.false: + %a3 = bitcast <20 x float> %a to <20 x i32> + br label %end + +end: + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi +} + define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v20i32_to_v10i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: 
v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20i32_to_v10i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; 
SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v10i64: ; VI: ; %bb.0: @@ -337,7 +787,7 @@ define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 @@ -359,7 +809,7 @@ define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -370,7 +820,7 @@ define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 @@ -392,7 +842,7 @@ define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -404,7 +854,7 @@ define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 
; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -426,69 +876,497 @@ define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <10 x i64> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <10 x i64> + br label %end + +end: + %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi +} + +define inreg <10 x i64> @bitcast_v20i32_to_v10i64_scalar(<20 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i32_to_v10i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; 
SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v20i32_to_v10i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, 
v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v20i32_to_v10i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: 
v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v20i32_to_v10i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: 
s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <10 x i64> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <10 x i64> + br label %end + +end: + %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi +} + +define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v10i64_to_v20i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, 
vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i64_to_v20i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i64_to_v20i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i64_to_v20i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 
+; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <20 x i32> %a, splat (i32 3) - %a2 = bitcast <20 x i32> %a1 to <10 x i64> + %a1 = add <10 x i64> %a, splat (i64 3) + %a2 = bitcast <10 x i64> %a1 to <20 x i32> br label %end cmp.false: - %a3 = bitcast <20 x i32> 
%a to <10 x i64> + %a3 = bitcast <10 x i64> %a to <20 x i32> br label %end end: - %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <10 x i64> %phi + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi } -define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i64_to_v20i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <20 x i32> @bitcast_v10i64_to_v20i32_scalar(<10 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i64_to_v20i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; 
SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 ; -; VI-LABEL: bitcast_v10i64_to_v20i32: +; VI-LABEL: bitcast_v10i64_to_v20i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 @@ -509,19 +1387,41 @@ define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB7_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 ; -; GFX9-LABEL: bitcast_v10i64_to_v20i32: +; GFX9-LABEL: bitcast_v10i64_to_v20i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; 
GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 @@ -542,20 +1442,37 @@ define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB7_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 ; -; GFX11-LABEL: bitcast_v10i64_to_v20i32: +; GFX11-LABEL: bitcast_v10i64_to_v20i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 -; GFX11-NEXT: s_xor_b32 
s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo @@ -581,8 +1498,6 @@ define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -602,38 +1517,38 @@ end: } define <10 x double> 
@bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v20i32_to_v10f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20i32_to_v10f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: 
v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v10f64: ; VI: ; %bb.0: @@ -642,7 +1557,7 @@ define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 @@ -664,7 +1579,7 @@ define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -675,7 +1590,7 @@ define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 @@ -697,7 +1612,7 @@ define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { ; 
GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -709,7 +1624,7 @@ define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -731,7 +1646,7 @@ define <10 x double> @bitcast_v20i32_to_v10f64(<20 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -751,29 +1666,259 @@ end: ret <10 x double> %phi } +define inreg <10 x double> @bitcast_v20i32_to_v10f64_scalar(<20 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i32_to_v10f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 
v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v20i32_to_v10f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: 
v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v20i32_to_v10f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, 
s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v20i32_to_v10f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: 
v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <10 x double> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <10 x double> + br label %end + +end: + %phi = phi <10 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x double> %phi +} + define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f64_to_v20i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f64_to_v20i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v20i32: ; VI: ; %bb.0: @@ -782,7 +1927,7 @@ define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: 
s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -794,7 +1939,7 @@ define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -805,7 +1950,7 @@ define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -817,7 +1962,7 @@ define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -829,7 +1974,7 @@ define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -841,7 +1986,7 @@ define <20 x i32> @bitcast_v10f64_to_v20i32(<10 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: 
.LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -861,202 +2006,411 @@ end: ret <20 x i32> %phi } +define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f64_to_v20i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v10f64_to_v20i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
VI-NEXT: v_mov_b32_e32 v12, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v10f64_to_v20i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v10f64_to_v20i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; 
GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <10 x double> %a1 to <20 x i32> + br label %end + +cmp.false: + %a3 = bitcast <10 x double> %a to <20 x i32> + br label %end + +end: + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi +} + define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v20i32_to_v40i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: 
; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v25, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v27, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: 
v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v25, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v27, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; GCN-NEXT: 
v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v2, v2, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v9, v9, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v10, v10, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v11, v11, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v13, v13, v25 -; GCN-NEXT: 
v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v14, v14, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v15, v15, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x4c, v0 -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: v_or_b32_e32 v18, v18, v26 -; GCN-NEXT: v_or_b32_e32 v19, v19, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v24 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20i32_to_v40i16: +; SI: ; %bb.0: +; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 
v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20i32_to_v40i16: ; VI: ; %bb.0: @@ -1084,7 +2438,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -1106,9 +2460,9 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 @@ -1150,7 +2504,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: 
s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -1220,7 +2574,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -1242,9 +2596,9 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 @@ -1286,7 +2640,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -1319,7 +2673,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -1341,7 +2695,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x 
i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1372,7 +2726,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -1394,9 +2748,9 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -1438,7 +2792,7 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 @@ -1479,321 +2833,1158 @@ end: ret <40 x i16> %phi 
} +define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i32_to_v40i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s28 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s29, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s27, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s25, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s23, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s21, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s17, v10, 16 +; SI-NEXT: s_lshr_b32 s12, s6, 16 +; SI-NEXT: s_lshr_b32 s13, s8, 16 +; SI-NEXT: s_lshr_b32 s14, s10, 16 +; SI-NEXT: s_lshr_b32 s15, s29, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 
+; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s28 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s29, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s27, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s25, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s23, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s21, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s17, v10, 16 +; SI-NEXT: s_lshr_b32 s12, s6, 16 +; SI-NEXT: s_lshr_b32 s13, s8, 16 +; SI-NEXT: s_lshr_b32 s14, s10, 16 +; SI-NEXT: s_lshr_b32 s15, s29, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v10, 
v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 
v6, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 
0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v20i32_to_v40i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: v_readfirstlane_b32 s8, v3 +; VI-NEXT: v_readfirstlane_b32 s6, v4 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v5 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: 
s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: 
s_lshl_b32 s5, s63, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s62, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s61, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s60, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s59, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s58, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s57, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s56, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s47, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s45, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s44, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s43, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; 
VI-NEXT: s_or_b32 s9, s9, s15 +; VI-NEXT: s_or_b32 s8, s8, s14 +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_or_b32 s7, s7, s12 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v17, s8 +; VI-NEXT: v_mov_b32_e32 v18, s6 +; VI-NEXT: v_mov_b32_e32 v19, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v20i32_to_v40i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; 
GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 
+; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: 
v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v20i32_to_v40i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, 
s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s22, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s23, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: 
v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v20i32_to_v40i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s58, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s25, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s26, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s27, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s28, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s29, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, 
s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <40 x i16> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <40 x i16> + br 
label %end + +end: + %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i16> %phi +} + define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i16_to_v20i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 
v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte 
Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v45 -; GCN-NEXT: v_or_b32_e32 v1, v1, v46 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v44 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v43 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v41 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: 
v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v20 -; GCN-NEXT: v_or_b32_e32 v13, v13, v47 -; GCN-NEXT: v_or_b32_e32 v14, v14, v56 -; GCN-NEXT: v_or_b32_e32 v15, v15, v57 -; GCN-NEXT: v_or_b32_e32 v16, v16, v58 -; GCN-NEXT: v_or_b32_e32 v17, v17, v59 -; GCN-NEXT: v_or_b32_e32 v18, v18, v60 -; GCN-NEXT: v_or_b32_e32 v19, v19, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; 
implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 
3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v45, v0 -; GCN-NEXT: v_or_b32_e32 v1, v46, v1 -; GCN-NEXT: v_or_b32_e32 v2, v44, v2 -; GCN-NEXT: v_or_b32_e32 v3, v43, v3 -; GCN-NEXT: v_or_b32_e32 v4, v42, v4 -; GCN-NEXT: v_or_b32_e32 v5, v41, v5 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v20, v7 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v20, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: v_or_b32_e32 v13, v47, v13 -; GCN-NEXT: v_or_b32_e32 v14, v56, v14 -; GCN-NEXT: v_or_b32_e32 v15, v57, v15 -; GCN-NEXT: v_or_b32_e32 v16, v58, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v60, v18 -; GCN-NEXT: v_or_b32_e32 v19, v61, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, 
vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i16_to_v20i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword 
v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, 
s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: 
v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v40 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 
0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_or_b32_e32 v12, v12, v59 +; SI-NEXT: v_or_b32_e32 v13, v13, v58 +; SI-NEXT: v_or_b32_e32 v14, v14, v57 +; SI-NEXT: v_or_b32_e32 v15, v15, v56 +; SI-NEXT: v_or_b32_e32 v17, v17, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v44 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 
3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 +; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 +; SI-NEXT: v_or_b32_e32 v14, v57, v14 +; SI-NEXT: v_or_b32_e32 v15, v56, v15 +; SI-NEXT: v_or_b32_e32 v17, v46, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i16_to_v20i32: ; VI: ; %bb.0: @@ -1826,7 +4017,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1889,9 +4080,9 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v43 @@ -1954,7 +4145,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v20, 3, v32 ; 
VI-NEXT: v_add_u16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2034,7 +4225,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -2113,9 +4304,9 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -2174,7 +4365,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, 
s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2203,7 +4394,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2225,7 +4416,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2277,7 +4468,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2299,7 +4490,7 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2319,381 +4510,1184 @@ end: ret <20 x i32> %phi } +define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i16_to_v20i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v24 +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v4 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 
16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v7, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v8, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v10, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 
0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v18, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v19, v0, v53 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 
v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v40i16_to_v20i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 
s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 
+; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, 
s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v40i16_to_v20i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; 
GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 
+; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: 
s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v20i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 
0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; 
%bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i16> %a, splat (i16 3) + %a2 = bitcast <40 x i16> %a1 to <20 x i32> + br label %end + +cmp.false: + %a3 = bitcast <40 x i16> %a to <20 x i32> + br label %end + +end: + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi +} + define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v20i32_to_v40f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; 
%cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: 
v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 
v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v42 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v40 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v54 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v52 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v50 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 32, v0 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v54, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: 
v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v20, v19 -; GCN-NEXT: v_or_b32_e32 v13, v38, v50 -; GCN-NEXT: v_or_b32_e32 v14, v36, v54 -; GCN-NEXT: v_or_b32_e32 v15, v34, v41 -; GCN-NEXT: v_or_b32_e32 v16, v32, v55 -; GCN-NEXT: v_or_b32_e32 v17, v30, v53 -; GCN-NEXT: v_or_b32_e32 v19, v28, v51 -; GCN-NEXT: v_or_b32_e32 v20, v27, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v39 -; GCN-NEXT: v_or_b32_e32 v25, v25, v37 -; GCN-NEXT: v_or_b32_e32 v24, v24, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v43, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v19, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20i32_to_v40f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, 
off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: 
$vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: 
s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, 
v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; 
SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: 
bitcast_v20i32_to_v40f16: ; VI: ; %bb.0: @@ -2721,7 +5715,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -2743,9 +5737,9 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 +; VI-NEXT: s_cbranch_execz .LBB16_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 @@ -2787,7 +5781,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB8_4: ; %end +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -2857,7 +5851,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -2879,9 +5873,9 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: 
.LBB8_2: ; %Flow +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 @@ -2923,7 +5917,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -2956,7 +5950,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -2978,7 +5972,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3009,7 +6003,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v20, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -3031,9 +6025,9 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 @@ -3075,7 +6069,7 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 @@ -3116,405 +6110,1371 @@ end: ret <40 x half> %phi } +define inreg <40 x half> @bitcast_v20i32_to_v40f16_scalar(<20 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20i32_to_v40f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; 
SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 
+; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s12, s18, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s20, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s41, s23, 16 +; SI-NEXT: s_lshr_b32 s42, s24, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 16 +; SI-NEXT: s_lshr_b32 s44, s26, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s28, 16 +; SI-NEXT: s_lshr_b32 s47, s29, 16 +; SI-NEXT: s_lshr_b32 s56, s11, 16 +; SI-NEXT: s_lshr_b32 s57, s10, 16 +; SI-NEXT: s_lshr_b32 s58, s8, 16 +; SI-NEXT: s_lshr_b32 s59, s7, 16 +; SI-NEXT: s_lshr_b32 s60, s6, 16 +; SI-NEXT: s_lshr_b32 s61, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 +; 
SI-NEXT: v_cvt_f32_f16_e32 v26, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, 
s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 
v23, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, 
v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: 
$vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v20i32_to_v40f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: v_readfirstlane_b32 s8, v3 +; VI-NEXT: v_readfirstlane_b32 s6, v4 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v5 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 
s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s63, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s62, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s61, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s60, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s59, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s58, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s57, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s56, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 
0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s47, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s45, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s44, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s43, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_or_b32 s9, s9, s15 +; VI-NEXT: s_or_b32 s8, s8, s14 +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_or_b32 s7, s7, s12 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v17, s8 +; VI-NEXT: v_mov_b32_e32 v18, s6 +; VI-NEXT: v_mov_b32_e32 v19, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; 
VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v20i32_to_v40f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: 
s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, 
s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; 
implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-TRUE16-LABEL: bitcast_v20i32_to_v40f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: 
s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s22, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s22, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s23, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v20i32_to_v40f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s58, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, 
s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; 
GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s25, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s26, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s27, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s28, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s29, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <40 x half> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <40 x half> + br label %end + +end: + %phi = phi <40 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x half> %phi +} + define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v40f16_to_v20i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 
v59, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v50 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; GCN-NEXT: v_or_b32_e32 v2, v34, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; GCN-NEXT: v_or_b32_e32 v5, v60, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v46, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v42, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, 
v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v20, v13 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v20, v14 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v20, v15 -; GCN-NEXT: v_or_b32_e32 v16, v53, v16 -; GCN-NEXT: v_or_b32_e32 v17, v51, v17 -; GCN-NEXT: v_or_b32_e32 v18, v49, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed 
$vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 
v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v32 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v62 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v60 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v56 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v47 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v45 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v43 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v41 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v40 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v48 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 
-; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: 
v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_or_b32_e32 v7, v9, v8 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v19, v18 -; GCN-NEXT: v_or_b32_e32 v13, v21, v20 -; GCN-NEXT: v_or_b32_e32 v14, v23, v22 -; GCN-NEXT: v_or_b32_e32 v15, v25, v24 -; GCN-NEXT: v_or_b32_e32 v16, v27, v26 -; GCN-NEXT: v_or_b32_e32 v17, v29, v28 -; GCN-NEXT: v_or_b32_e32 v18, v31, v30 -; GCN-NEXT: v_or_b32_e32 v19, v33, v32 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; 
GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40f16_to_v20i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], 
s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; 
SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; 
implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; 
SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v50, v18 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 
0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_or_b32_e32 
v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: 
v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 
16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v20i32: ; VI: ; %bb.0: @@ -3547,7 +7507,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -3610,9 +7570,9 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v43, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3675,7 +7635,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB9_4: ; %end +; 
VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -3755,7 +7715,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -3834,9 +7794,9 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB18_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -3896,7 +7856,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: .LBB18_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -3925,7 +7885,7 @@ define <20 x i32> 
@bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -3947,7 +7907,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3999,7 +7959,7 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -4021,69 +7981,1173 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, 
label %cmp.false cmp.true: - %a1 = fadd <40 x half> %a, splat (half 0xH0200) - %a2 = bitcast <40 x half> %a1 to <20 x i32> + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <20 x i32> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <20 x i32> + br label %end + +end: + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi +} + +define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40f16_to_v20i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 
v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; 
SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v55, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 +; SI-NEXT: v_or_b32_e32 v13, v51, v13 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_or_b32_e32 v17, v27, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: 
v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: 
v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB19_3: ; 
%end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v48, v21 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v49, v20 +; SI-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v22 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v45, v52 +; SI-NEXT: v_mov_b32_e32 v52, v27 +; SI-NEXT: v_mov_b32_e32 v46, v53 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v32 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v47, v54 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: v_mov_b32_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v56, v55 +; SI-NEXT: v_mov_b32_e32 v55, v30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v32, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v25, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v32 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; 
SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v32, v41 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v51 +; SI-NEXT: v_mov_b32_e32 v51, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v20, v49 +; SI-NEXT: v_mov_b32_e32 v49, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v48 +; SI-NEXT: v_mov_b32_e32 v48, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v28, v53 +; SI-NEXT: v_mov_b32_e32 v53, v46 +; SI-NEXT: v_mov_b32_e32 v27, v52 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v40f16_to_v20i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; 
VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: 
s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v19, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v37, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v36, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v18, v20, v18 +; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: 
s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v40f16_to_v20i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; 
GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; 
GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 
0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 
s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v20i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: 
v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: 
.LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <20 x i32> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <20 x i32> + br label %end + +end: + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi +} + +define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v20f32_to_v10i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
bitcast_v20f32_to_v10i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f32_to_v10i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 
v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f32_to_v10i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <20 x float> %a1 to <10 x i64> br label %end cmp.false: 
- %a3 = bitcast <40 x half> %a to <20 x i32> + %a3 = bitcast <20 x float> %a to <10 x i64> br label %end end: - %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <20 x i32> %phi + %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi } -define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f32_to_v10i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <10 x i64> @bitcast_v20f32_to_v10i64_scalar(<20 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f32_to_v10i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: 
v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 ; -; VI-LABEL: bitcast_v20f32_to_v10i64: +; VI-LABEL: bitcast_v20f32_to_v10i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 @@ -4104,19 +9168,41 @@ define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB21_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 ; -; GFX9-LABEL: bitcast_v20f32_to_v10i64: +; GFX9-LABEL: bitcast_v20f32_to_v10i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 ; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 @@ -4137,20 +9223,37 @@ define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB21_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 ; -; GFX11-LABEL: bitcast_v20f32_to_v10i64: +; GFX11-LABEL: bitcast_v20f32_to_v10i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; 
GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 ; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 @@ -4161,8 +9264,6 @@ define <10 x i64> @bitcast_v20f32_to_v10i64(<20 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4182,38 +9283,38 @@ end: } define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i64_to_v20f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i64_to_v20f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; 
SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i64_to_v20f32: ; VI: ; %bb.0: @@ -4222,7 +9323,7 @@ define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc @@ -4244,7 +9345,7 @@ define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4255,7 +9356,7 @@ define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc @@ -4277,7 +9378,7 @@ define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { ; 
GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4289,7 +9390,7 @@ define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -4316,7 +9417,7 @@ define <20 x float> @bitcast_v10i64_to_v20f32(<10 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4336,39 +9437,274 @@ end: ret <20 x float> %phi } +define inreg <20 x float> @bitcast_v10i64_to_v20f32_scalar(<10 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i64_to_v20f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; 
SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v10i64_to_v20f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, 
s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v10i64_to_v20f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, 
s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v10i64_to_v20f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB23_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: .LBB23_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i64> %a, splat (i64 3) + %a2 = bitcast <10 x i64> %a1 to <20 x float> + br label %end + +cmp.false: + %a3 = bitcast <10 x i64> %a to <20 x float> + br label %end + +end: + %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x float> %phi +} + define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f32_to_v10f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 
v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f32_to_v10f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f32_to_v10f64: ; VI: ; %bb.0: @@ -4377,7 +9713,7 @@ define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -4399,7 +9735,7 @@ 
define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4410,7 +9746,7 @@ define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -4432,7 +9768,7 @@ define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4444,7 +9780,7 @@ define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 @@ -4456,8 +9792,228 @@ define <10 x double> @bitcast_v20f32_to_v10f64(<20 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: 
.LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <20 x float> %a1 to <10 x double> + br label %end + +cmp.false: + %a3 = bitcast <20 x float> %a to <10 x double> + br label %end + +end: + %phi = phi <10 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x double> %phi +} + +define inreg <10 x double> @bitcast_v20f32_to_v10f64_scalar(<20 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f32_to_v10f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 
1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v20f32_to_v10f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, 
v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v20f32_to_v10f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; 
GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v20f32_to_v10f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: 
v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4477,28 +10033,28 @@ end: } define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f64_to_v20f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f64_to_v20f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: 
s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v20f32: ; VI: ; %bb.0: @@ -4507,7 +10063,7 @@ define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -4519,7 +10075,7 @@ define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4530,7 +10086,7 @@ define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -4542,7 +10098,7 @@ define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], 
v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4554,7 +10110,7 @@ define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -4566,7 +10122,7 @@ define <20 x float> @bitcast_v10f64_to_v20f32(<10 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4586,202 +10142,411 @@ end: ret <20 x float> %phi } +define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f64_to_v20f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 
+; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v10f64_to_v20f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v12, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; 
VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v10f64_to_v20f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; 
GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v10f64_to_v20f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB27_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: .LBB27_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label 
%cmp.false + +cmp.true: + %a1 = fadd <10 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <10 x double> %a1 to <20 x float> + br label %end + +cmp.false: + %a3 = bitcast <10 x double> %a to <20 x float> + br label %end + +end: + %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x float> %phi +} + define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f32_to_v40i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v25, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v27, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; 
GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v25, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v27, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GCN-NEXT: 
v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v2, v2, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: 
v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v9, v9, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v10, v10, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v11, v11, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v13, v13, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v14, v14, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v15, v15, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x4c, v0 -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: v_or_b32_e32 v18, v18, v26 -; GCN-NEXT: v_or_b32_e32 v19, v19, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v24 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v48, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v7, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f32_to_v40i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; 
SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v26, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 
v31, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f32_to_v40i16: ; VI: ; %bb.0: @@ -4809,7 +10574,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; 
%cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -4831,9 +10596,9 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB28_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -4875,7 +10640,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -4945,7 +10710,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -4967,9 +10732,9 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -5011,7 +10776,7 @@ define <40 x i16> 
@bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -5044,7 +10809,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 @@ -5056,7 +10821,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5087,7 +10852,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -5109,9 +10874,9 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 @@ -5143,7 +10908,7 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 @@ -5184,321 +10949,1144 @@ end: ret <40 x i16> %phi } +define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f32_to_v40i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_mov_b32_e32 v21, s16 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_mov_b32_e32 v17, s19 +; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v13, s23 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: v_mov_b32_e32 v11, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; 
%cmp.false +; SI-NEXT: v_alignbit_b32 v18, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v24, v7, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v33, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v35, v17, v19, 16 +; SI-NEXT: v_alignbit_b32 v37, v20, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_alignbit_b32 v18, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v24, v7, v8, 16 +; SI-NEXT: v_alignbit_b32 v26, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v28, v11, v12, 16 +; 
SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v33, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v35, v17, v19, 16 +; SI-NEXT: v_alignbit_b32 v37, v20, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; 
SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 +; 
SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: 
$vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v20f32_to_v40i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: 
v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; 
VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_or_b32_sdwa v24, v20, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; VI-NEXT: v_or_b32_sdwa v25, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v20, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v21, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v22, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: 
v_or_b32_sdwa v23, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v20 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v4, v22 +; VI-NEXT: v_mov_b32_e32 v5, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v20f32_to_v40i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 
+; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 
1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v30, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v29, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 
v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v28, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v39, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v18, v27, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v7, v38, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v37, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v36, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v35, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v34, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v32, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v19, v26, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-NEXT: v_mov_b32_e32 v3, v21 +; GFX9-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; 
implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-TRUE16-LABEL: bitcast_v20f32_to_v40i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v17, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v15, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 
16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-TRUE16-NEXT: .LBB29_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v36, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v33, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v32, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v28, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v27, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v21 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v38, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v37, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v29, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v24, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v20 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 +; +; GFX11-FAKE16-LABEL: bitcast_v20f32_to_v40i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 
1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: .LBB29_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v38, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v37, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20 +; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <20 x float> %a1 to <40 x i16> + br label %end + +cmp.false: + %a3 = bitcast <20 x float> %a to <40 x i16> + br label %end + +end: + %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i16> %phi +} + define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i16_to_v20f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 
offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: 
v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v45 -; GCN-NEXT: v_or_b32_e32 v1, v1, v46 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v44 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v43 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v41 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v20 
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v20 -; GCN-NEXT: v_or_b32_e32 v13, v13, v47 -; GCN-NEXT: v_or_b32_e32 v14, v14, v56 -; GCN-NEXT: v_or_b32_e32 v15, v15, v57 -; GCN-NEXT: v_or_b32_e32 v16, v16, v58 -; GCN-NEXT: v_or_b32_e32 v17, v17, v59 -; GCN-NEXT: v_or_b32_e32 v18, v18, v60 -; GCN-NEXT: v_or_b32_e32 v19, v19, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; 
GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 
3, v63 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v45, v0 -; GCN-NEXT: v_or_b32_e32 v1, v46, v1 -; GCN-NEXT: v_or_b32_e32 v2, v44, v2 -; GCN-NEXT: v_or_b32_e32 v3, v43, v3 -; GCN-NEXT: v_or_b32_e32 v4, v42, v4 -; GCN-NEXT: v_or_b32_e32 v5, v41, v5 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v20, v7 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v20, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: v_or_b32_e32 v13, v47, v13 -; GCN-NEXT: v_or_b32_e32 v14, v56, v14 -; GCN-NEXT: v_or_b32_e32 v15, v57, v15 -; GCN-NEXT: v_or_b32_e32 v16, v58, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v60, v18 -; GCN-NEXT: v_or_b32_e32 v19, v61, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; 
GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i16_to_v20f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; 
SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, 
s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v40 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 
+; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_or_b32_e32 v12, v12, v59 +; SI-NEXT: v_or_b32_e32 v13, v13, v58 +; SI-NEXT: v_or_b32_e32 v14, v14, v57 +; SI-NEXT: v_or_b32_e32 v15, v15, v56 +; SI-NEXT: v_or_b32_e32 v17, v17, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v44 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; 
implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 
+; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 +; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 +; SI-NEXT: v_or_b32_e32 v14, v57, v14 +; SI-NEXT: v_or_b32_e32 v15, v56, v15 +; 
SI-NEXT: v_or_b32_e32 v17, v46, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: 
bitcast_v40i16_to_v20f32: ; VI: ; %bb.0: @@ -5531,7 +12119,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -5594,9 +12182,9 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v43 @@ -5659,7 +12247,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v20, 3, v32 ; VI-NEXT: v_add_u16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB15_4: ; %end +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5739,7 +12327,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -5818,9 +12406,9 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: .LBB30_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -5879,7 +12467,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end +; GFX9-NEXT: .LBB30_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5908,7 +12496,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -5930,7 +12518,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5982,7 +12570,7 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -6004,9 +12592,801 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i16> %a, splat (i16 3) + %a2 = bitcast <40 x i16> %a1 to <20 x float> + br label %end + +cmp.false: + %a3 = bitcast <40 x i16> %a to <20 x float> + br label %end + +end: + %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x float> %phi +} + 
+define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i16_to_v20f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v24 +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v4 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v17 +; 
SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v7, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v8, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v10, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v18, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: 
v_or_b32_e32 v19, v0, v53 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; 
SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; 
SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v40i16_to_v20f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: 
v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 
+; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: 
v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 
0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v40i16_to_v20f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; 
GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: 
.LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB31_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v20f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 
16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 
v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 
v17, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB31_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6025,380 +13405,391 @@ end: } define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v20f32_to_v40f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; 
GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; 
GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v42 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v40 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v54 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v52 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v50 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 
v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v20, v19 -; GCN-NEXT: v_or_b32_e32 v13, v38, v50 -; GCN-NEXT: v_or_b32_e32 v14, v36, v54 -; GCN-NEXT: v_or_b32_e32 v15, v34, v41 -; GCN-NEXT: v_or_b32_e32 v16, v32, v55 -; GCN-NEXT: v_or_b32_e32 v17, v30, v53 -; GCN-NEXT: v_or_b32_e32 v19, v28, v51 -; GCN-NEXT: v_or_b32_e32 v20, v27, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v39 -; GCN-NEXT: v_or_b32_e32 v25, v25, v37 -; GCN-NEXT: v_or_b32_e32 v24, v24, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v58, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v21, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v20f32_to_v40f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: 
v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v20f32_to_v40f16: ; VI: ; %bb.0: @@ -6426,7 +13817,7 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ 
-6448,9 +13839,9 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB32_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -6492,7 +13883,7 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -6562,7 +13953,7 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -6584,9 +13975,9 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow +; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cbranch_execz .LBB32_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 ; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 @@ -6628,7 +14019,7 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; 
GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end +; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -6661,7 +14052,7 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 @@ -6673,7 +14064,7 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6699,91 +14090,1038 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 -; GFX11-FAKE16-NEXT: 
v_perm_b32 v14, v25, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: 
v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; 
GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <20 x float> %a1 to <40 x half> + br label %end + +cmp.false: + %a3 = bitcast <20 x float> %a to <40 x half> + br label %end + +end: + %phi = phi <40 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x half> %phi +} + +define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v20f32_to_v40f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: 
v_readfirstlane_b32 s6, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: 
v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; SI-NEXT: 
v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 
v48, v48 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: 
v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, 
v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v20f32_to_v40f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_mov_b32_e32 v19, s17 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 
v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; 
VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_or_b32_sdwa v24, v20, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; VI-NEXT: v_or_b32_sdwa v25, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v20, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v21, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v22, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v23, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v20 +; 
VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v4, v22 +; VI-NEXT: v_mov_b32_e32 v5, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v20f32_to_v40f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v19, s17 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 
16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v30, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v29, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v28, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v39, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v18, v27, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v7, v38, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v37, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v36, 16, v9 +; GFX9-NEXT: 
v_lshl_or_b32 v10, v35, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v34, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v32, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v19, v26, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-NEXT: v_mov_b32_e32 v3, v21 +; GFX9-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-TRUE16-LABEL: bitcast_v20f32_to_v40f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v17, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v15, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 
v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-TRUE16-NEXT: .LBB33_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v36, 16, v9 
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v33, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v32, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v28, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v27, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v38, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v37, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v29, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v24, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v20 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; +; GFX11-FAKE16-LABEL: bitcast_v20f32_to_v40f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v19, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v3, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v9, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v5, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, 
v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: .LBB33_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v36, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 
0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v38, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v37, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6802,404 +15140,431 @@ end: } define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v40f16_to_v20f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 
offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v27 -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; GCN-NEXT: v_or_b32_e32 v2, v34, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; GCN-NEXT: v_or_b32_e32 v5, v60, v5 -; GCN-NEXT: 
v_lshlrev_b32_e32 v6, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v46, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v42, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v20, v13 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v20, v14 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v20, v15 -; GCN-NEXT: v_or_b32_e32 v16, v53, v16 -; GCN-NEXT: 
v_or_b32_e32 v17, v51, v17 -; GCN-NEXT: v_or_b32_e32 v18, v49, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; 
GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v32 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v62 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v60 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: 
v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v56 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v47 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v45 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v43 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v41 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v40 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v52 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v31, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v48 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_or_b32_e32 v7, v9, v8 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v19, v18 -; GCN-NEXT: v_or_b32_e32 v13, v21, v20 -; GCN-NEXT: v_or_b32_e32 v14, v23, v22 -; GCN-NEXT: v_or_b32_e32 v15, v25, v24 -; GCN-NEXT: v_or_b32_e32 v16, v27, v26 -; GCN-NEXT: v_or_b32_e32 v17, v29, v28 -; GCN-NEXT: v_or_b32_e32 v18, v31, v30 -; GCN-NEXT: v_or_b32_e32 v19, v33, v32 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, 
off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40f16_to_v20f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], 
s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: 
v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; 
SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v50, v18 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: 
v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: 
v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte 
Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; 
SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v20f32: ; VI: ; %bb.0: @@ -7232,7 +15597,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; 
%bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -7295,9 +15660,9 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow +; VI-NEXT: .LBB34_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 +; VI-NEXT: s_cbranch_execz .LBB34_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v43, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7360,7 +15725,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -7440,7 +15805,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -7519,9 +15884,9 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 
; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -7581,7 +15946,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -7610,7 +15975,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -7632,7 +15997,7 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7684,7 +16049,7 @@ 
define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -7706,69 +16071,1188 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <40 x half> %a, splat (half 0xH0200) - %a2 = bitcast <40 x half> %a1 to <20 x float> + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <20 x float> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <20 x float> + br label %end + +end: + %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x float> %phi +} + +define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40f16_to_v20f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 
offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 
+; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; 
SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v55, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 +; SI-NEXT: v_or_b32_e32 v13, v51, v13 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_or_b32_e32 v17, v27, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 
+; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; 
SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; 
SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v48, v21 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v49, v20 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v22 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v45, v52 +; SI-NEXT: v_mov_b32_e32 v52, v27 +; SI-NEXT: v_mov_b32_e32 v46, v53 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v32 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v47, v54 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: v_mov_b32_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v56, v55 +; SI-NEXT: v_mov_b32_e32 v55, v30 +; SI-NEXT: s_waitcnt 
expcnt(1) +; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v32, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v25, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v32 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v32, v41 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v51 +; SI-NEXT: v_mov_b32_e32 v51, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v20, v49 +; SI-NEXT: v_mov_b32_e32 v49, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v48 +; SI-NEXT: v_mov_b32_e32 v48, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v28, v53 +; SI-NEXT: v_mov_b32_e32 v53, v46 +; SI-NEXT: v_mov_b32_e32 v27, v52 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v40f16_to_v20f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: 
s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v19, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v37, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_add_f16_e32 v15, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v36, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v18, v20, v18 +; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v40f16_to_v20f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: 
s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: 
v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] 
+; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] 
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v20f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s8 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <20 x float> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <20 x float> + br label %end + +end: + %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x float> %phi +} + +define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v10i64_to_v10f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc 
+; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i64_to_v10f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, 
v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i64_to_v10f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 
s[30:31] +; +; GFX11-LABEL: bitcast_v10i64_to_v10f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i64> %a, splat (i64 3) + %a2 = bitcast <10 x i64> %a1 to <10 x double> br label %end cmp.false: - %a3 = bitcast <40 x half> %a to <20 x float> + %a3 = bitcast <10 x i64> %a to <10 x double> br label %end end: - %phi = phi <20 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <20 x float> %phi + %phi = phi <10 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x double> %phi } -define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i64_to_v10f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 
s[30:31] +define inreg <10 x double> @bitcast_v10i64_to_v10f64_scalar(<10 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i64_to_v10f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 
v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 ; -; VI-LABEL: bitcast_v10i64_to_v10f64: +; VI-LABEL: bitcast_v10i64_to_v10f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ -7789,19 +17273,41 @@ define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: .LBB18_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB37_3: ; %end ; VI-NEXT: 
s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 ; -; GFX9-LABEL: bitcast_v10i64_to_v10f64: +; GFX9-LABEL: bitcast_v10i64_to_v10f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 @@ -7822,20 +17328,37 @@ define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: .LBB18_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB37_3: ; %end ; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 ; -; GFX11-LABEL: bitcast_v10i64_to_v10f64: +; GFX11-LABEL: bitcast_v10i64_to_v10f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB37_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: .LBB37_4: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -7861,8 +17384,6 @@ define <10 x double> @bitcast_v10i64_to_v10f64(<10 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v17, 
null, 0, v17, vcc_lo ; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7882,28 +17403,28 @@ end: } define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f64_to_v10i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f64_to_v10i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: 
v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v10i64: ; VI: ; %bb.0: @@ -7912,7 +17433,7 @@ define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -7924,7 +17445,7 @@ define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7935,7 +17456,7 @@ define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -7947,7 +17468,7 @@ define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7959,7 +17480,7 @@ define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -7971,7 +17492,7 @@ define <10 x i64> @bitcast_v10f64_to_v10i64(<10 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7991,202 +17512,411 @@ end: ret <10 x i64> %phi } +define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f64_to_v10i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; 
SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v10f64_to_v10i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v12, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: .LBB39_3: ; %end +; 
VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v10f64_to_v10i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v10f64_to_v10i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: 
v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <10 x double> %a1 to <10 x i64> + br label %end + +cmp.false: + %a3 = bitcast <10 x double> %a to <10 x i64> + br label %end + +end: + %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi +} + define <40 x i16> 
@bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i64_to_v40i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v26, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; 
GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v26, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; 
GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v1, v1, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v2, v2, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v7, v7, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v8, v8, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v29 -; 
GCN-NEXT: v_add_i32_e32 v29, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v10, v10, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v11, v11, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v12, v12, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v13, v13, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v15, v15, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x4c, v0 -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: v_or_b32_e32 v18, v18, v27 -; GCN-NEXT: v_or_b32_e32 v19, v19, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v38, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v11, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i64_to_v40i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v30, v8, v7, 16 +; SI-NEXT: 
v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v30, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; 
SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i64_to_v40i16: ; VI: ; %bb.0: @@ -8214,7 +17944,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -8236,9 
+17966,9 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow +; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 +; VI-NEXT: s_cbranch_execz .LBB40_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc @@ -8280,7 +18010,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -8350,7 +18080,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -8372,9 +18102,9 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow +; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 +; GFX9-NEXT: s_cbranch_execz .LBB40_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc @@ -8416,7 +18146,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, 
v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -8449,7 +18179,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -8476,7 +18206,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8507,7 +18237,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -8529,9 +18259,9 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: 
.LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -8578,7 +18308,7 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 @@ -8619,321 +18349,1158 @@ end: ret <40 x i16> %phi } +define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i64_to_v40i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s28 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: 
v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s29, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s27, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s25, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s23, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s21, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s17, v10, 16 +; SI-NEXT: s_lshr_b32 s12, s6, 16 +; SI-NEXT: s_lshr_b32 s13, s8, 16 +; SI-NEXT: s_lshr_b32 s14, s10, 16 +; SI-NEXT: s_lshr_b32 s15, s29, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s28 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v6, s24 +; SI-NEXT: v_mov_b32_e32 v7, s22 +; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s29, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s27, v5, 16 
+; SI-NEXT: v_alignbit_b32 v6, s25, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s23, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s21, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s19, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s17, v10, 16 +; SI-NEXT: s_lshr_b32 s12, s6, 16 +; SI-NEXT: s_lshr_b32 s13, s8, 16 +; SI-NEXT: s_lshr_b32 s14, s10, 16 +; SI-NEXT: s_lshr_b32 s15, s29, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; 
SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: 
$vgpr3 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v10i64_to_v40i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: v_readfirstlane_b32 s8, v3 +; VI-NEXT: v_readfirstlane_b32 s6, v4 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v5 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: 
s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s63, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s62, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s61, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s60, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s59, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s58, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s57, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s56, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s47, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s46, 16 +; 
VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s45, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s44, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s43, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_or_b32 s9, s9, s15 +; VI-NEXT: s_or_b32 s8, s8, s14 +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_or_b32 s7, s7, s12 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v17, s8 +; VI-NEXT: v_mov_b32_e32 v18, s6 +; VI-NEXT: v_mov_b32_e32 v19, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 
+; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v10i64_to_v40i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s10, 
s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s45 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s25, s27, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-TRUE16-LABEL: 
bitcast_v10i64_to_v40i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; 
GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s22, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s23, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v10i64_to_v40i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s58, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz 
.LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s25, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s26, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s27, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s28, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s29, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: 
; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i64> %a, splat (i64 3) + %a2 = bitcast <10 x i64> %a1 to <40 x i16> + br label %end + +cmp.false: + %a3 = bitcast <10 x i64> %a to <40 x i16> + br label %end + +end: + %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i16> %phi +} + define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i16_to_v10i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; GCN-NEXT: 
v_lshlrev_b32_e32 v45, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 
s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v45 -; GCN-NEXT: v_or_b32_e32 v1, v1, v46 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v44 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v43 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v41 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, 
v9, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v20 -; GCN-NEXT: v_or_b32_e32 v13, v13, v47 -; GCN-NEXT: v_or_b32_e32 v14, v14, v56 -; GCN-NEXT: v_or_b32_e32 v15, v15, v57 -; GCN-NEXT: v_or_b32_e32 v16, v16, v58 -; GCN-NEXT: v_or_b32_e32 v17, v17, v59 -; GCN-NEXT: v_or_b32_e32 v18, v18, v60 -; GCN-NEXT: v_or_b32_e32 v19, v19, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; 
implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; 
GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v45, v0 -; GCN-NEXT: v_or_b32_e32 v1, v46, v1 -; GCN-NEXT: v_or_b32_e32 v2, v44, v2 -; GCN-NEXT: v_or_b32_e32 v3, v43, v3 -; GCN-NEXT: v_or_b32_e32 v4, v42, v4 -; GCN-NEXT: v_or_b32_e32 v5, v41, v5 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v20, v7 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v20, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: v_or_b32_e32 v13, v47, v13 -; GCN-NEXT: 
v_or_b32_e32 v14, v56, v14 -; GCN-NEXT: v_or_b32_e32 v15, v57, v15 -; GCN-NEXT: v_or_b32_e32 v16, v58, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v60, v18 -; GCN-NEXT: v_or_b32_e32 v19, v61, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, 
s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i16_to_v10i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v40 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; 
implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_or_b32_e32 v12, v12, v59 +; SI-NEXT: v_or_b32_e32 v13, v13, v58 +; SI-NEXT: v_or_b32_e32 v14, v14, v57 +; SI-NEXT: v_or_b32_e32 v15, v15, v56 +; SI-NEXT: v_or_b32_e32 v17, v17, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v44 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; 
SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 +; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 +; SI-NEXT: v_or_b32_e32 v14, v57, v14 +; SI-NEXT: v_or_b32_e32 v15, v56, v15 +; SI-NEXT: v_or_b32_e32 v17, v46, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: 
v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i16_to_v10i64: ; VI: ; %bb.0: @@ -8966,7 +19533,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -9029,9 +19596,9 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: .LBB42_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v43 @@ -9094,7 +19661,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v20, 3, v32 ; VI-NEXT: v_add_u16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -9174,7 +19741,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: 
s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -9253,9 +19820,9 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: .LBB42_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -9314,7 +19881,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -9343,7 +19910,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -9365,7 +19932,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 
op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9417,7 +19984,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -9439,7 +20006,7 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9459,381 +20026,1184 @@ end: ret <10 x i64> %phi } +define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i16_to_v10i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v24 +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v4 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v7, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v8, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v10, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; 
SI-NEXT: v_or_b32_e32 v11, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v18, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v19, v0, v53 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 
s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v40i16_to_v10i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; 
VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa 
v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: 
v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v40i16_to_v10i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 
s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] 
+; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, 
v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v10i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 
v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, 
label %cmp.false + +cmp.true: + %a1 = add <40 x i16> %a, splat (i16 3) + %a2 = bitcast <40 x i16> %a1 to <10 x i64> + br label %end + +cmp.false: + %a3 = bitcast <40 x i16> %a to <10 x i64> + br label %end + +end: + %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi +} + define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v10i64_to_v40f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: 
$vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: s_waitcnt 
expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; 
implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, 
v5 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v42 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v40 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v54 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v52 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v50 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, 
v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v20, v19 -; GCN-NEXT: v_or_b32_e32 v13, v38, v50 -; GCN-NEXT: v_or_b32_e32 v14, v36, v54 -; GCN-NEXT: v_or_b32_e32 v15, v34, v41 -; GCN-NEXT: v_or_b32_e32 v16, v32, v55 -; GCN-NEXT: v_or_b32_e32 v17, v30, v53 -; GCN-NEXT: v_or_b32_e32 v19, v28, v51 -; GCN-NEXT: v_or_b32_e32 v20, v27, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v39 -; GCN-NEXT: v_or_b32_e32 v25, v25, v37 -; GCN-NEXT: v_or_b32_e32 v24, v24, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, 
s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10i64_to_v40f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; 
SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: 
v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; 
SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: 
v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, 
v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10i64_to_v40f16: ; VI: ; %bb.0: @@ -9861,7 +21231,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -9883,9 +21253,9 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: 
v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow +; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 +; VI-NEXT: s_cbranch_execz .LBB44_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc @@ -9927,7 +21297,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB22_4: ; %end +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -9997,7 +21367,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -10019,9 +21389,9 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB44_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc @@ -10063,7 +21433,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; 
GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -10096,7 +21466,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10123,7 +21493,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10154,7 +21524,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -10176,9 +21546,9 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; 
GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10225,7 +21595,7 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 @@ -10266,405 +21636,1371 @@ end: ret <40 x half> %phi } +define inreg <40 x half> @bitcast_v10i64_to_v40f16_scalar(<10 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10i64_to_v40f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: v_readfirstlane_b32 s11, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; 
SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s12, s4, 16 +; SI-NEXT: s_lshr_b32 s13, s5, 16 +; SI-NEXT: s_add_u32 s14, s18, 3 +; SI-NEXT: s_addc_u32 s15, s19, 0 +; SI-NEXT: s_lshr_b32 s16, s14, 16 +; 
SI-NEXT: s_lshr_b32 s17, s15, 16 +; SI-NEXT: s_add_u32 s18, s20, 3 +; SI-NEXT: s_addc_u32 s19, s21, 0 +; SI-NEXT: s_lshr_b32 s20, s18, 16 +; SI-NEXT: s_lshr_b32 s21, s19, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s40, s22, 16 +; SI-NEXT: s_lshr_b32 s41, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s42, s24, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s44, s26, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s46, s28, 16 +; SI-NEXT: s_lshr_b32 s47, s29, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s56, s10, 16 +; SI-NEXT: s_lshr_b32 s57, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s58, s7, 16 +; SI-NEXT: s_lshr_b32 s59, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s60, s6, 16 +; SI-NEXT: s_lshr_b32 s61, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s60 +; 
SI-NEXT: v_cvt_f32_f16_e32 v4, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s12 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 12, v0 +; 
SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: 
v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch 
.LBB45_2 +; +; VI-LABEL: bitcast_v10i64_to_v40f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_readfirstlane_b32 s11, v0 +; VI-NEXT: v_readfirstlane_b32 s10, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: v_readfirstlane_b32 s8, v3 +; VI-NEXT: v_readfirstlane_b32 s6, v4 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v5 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: 
s_lshr_b32 s12, s7, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 16 +; VI-NEXT: s_lshr_b32 s14, s8, 16 +; VI-NEXT: s_lshr_b32 s15, s9, 16 +; VI-NEXT: s_lshr_b32 s40, s10, 16 +; VI-NEXT: s_lshr_b32 s41, s11, 16 +; VI-NEXT: s_lshr_b32 s42, s29, 16 +; VI-NEXT: s_lshr_b32 s43, s28, 16 +; VI-NEXT: s_lshr_b32 s44, s27, 16 +; VI-NEXT: s_lshr_b32 s45, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: s_lshr_b32 s56, s23, 16 +; VI-NEXT: s_lshr_b32 s57, s22, 16 +; VI-NEXT: s_lshr_b32 s58, s21, 16 +; VI-NEXT: s_lshr_b32 s59, s20, 16 +; VI-NEXT: s_lshr_b32 s60, s19, 16 +; VI-NEXT: s_lshr_b32 s61, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s17, 16 +; VI-NEXT: s_lshr_b32 s63, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s63, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s62, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s61, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s60, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s59, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s58, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s57, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s56, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s47, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s46, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s45, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 
s26, s44, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s43, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_or_b32 s9, s9, s15 +; VI-NEXT: s_or_b32 s8, s8, s14 +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_or_b32 s7, s7, s12 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s11 +; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v17, s8 +; VI-NEXT: v_mov_b32_e32 v18, s6 +; VI-NEXT: v_mov_b32_e32 v19, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; 
implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v10i64_to_v40f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: 
s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_lshr_b32 s14, s9, 16 +; GFX9-NEXT: s_lshr_b32 s15, s8, 16 +; GFX9-NEXT: s_lshr_b32 s40, s7, 16 +; GFX9-NEXT: s_lshr_b32 s41, s6, 16 +; GFX9-NEXT: s_lshr_b32 s42, s29, 16 +; GFX9-NEXT: s_lshr_b32 s43, s28, 16 +; GFX9-NEXT: s_lshr_b32 s44, s27, 16 +; GFX9-NEXT: s_lshr_b32 s45, s26, 16 +; GFX9-NEXT: s_lshr_b32 s46, s25, 16 +; GFX9-NEXT: s_lshr_b32 s47, s24, 16 +; GFX9-NEXT: s_lshr_b32 s56, s23, 16 +; GFX9-NEXT: s_lshr_b32 s57, s22, 16 +; GFX9-NEXT: s_lshr_b32 s58, s21, 16 +; GFX9-NEXT: s_lshr_b32 s59, s20, 16 +; GFX9-NEXT: s_lshr_b32 s60, s19, 16 +; GFX9-NEXT: s_lshr_b32 s61, s18, 16 +; GFX9-NEXT: s_lshr_b32 s62, s17, 16 +; GFX9-NEXT: s_lshr_b32 s63, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s7, s7, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-TRUE16-LABEL: bitcast_v10i64_to_v40f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; 
GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 
+; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s23, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s45 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s22, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s23, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 +; +; GFX11-FAKE16-LABEL: bitcast_v10i64_to_v40f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: s_mov_b32 s58, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s6, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s58 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; 
GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 
s16, s16, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s22, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s25, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s26, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s27, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s28, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s29, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v15, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s9 :: v_dual_mov_b32 v17, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s4 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i64> %a, splat (i64 3) + %a2 = bitcast <10 x i64> %a1 to <40 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x i64> %a to <40 x half> + br label %end + +end: + %phi = phi <40 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x half> %phi +} + define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v40f16_to_v10i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; GCN-NEXT: v_or_b32_e32 v2, v34, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; GCN-NEXT: v_or_b32_e32 v5, v60, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v46, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v42, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v20, v13 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v20, v14 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v20, v15 -; GCN-NEXT: v_or_b32_e32 v16, v53, v16 -; GCN-NEXT: v_or_b32_e32 v17, v51, v17 -; GCN-NEXT: v_or_b32_e32 v18, v49, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 
-; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v32 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 
-; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v62 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v60 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v56 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v47 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v45 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v43 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v41 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v40 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v48 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; 
GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_or_b32_e32 v7, v9, v8 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, 
v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v19, v18 -; GCN-NEXT: v_or_b32_e32 v13, v21, v20 -; GCN-NEXT: v_or_b32_e32 v14, v23, v22 -; GCN-NEXT: v_or_b32_e32 v15, v25, v24 -; GCN-NEXT: v_or_b32_e32 v16, v27, v26 -; GCN-NEXT: v_or_b32_e32 v17, v29, v28 -; GCN-NEXT: v_or_b32_e32 v18, v31, v30 -; GCN-NEXT: v_or_b32_e32 v19, v33, v32 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40f16_to_v10i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: v_cvt_f16_f32_e32 
v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed 
$vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; 
SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 
16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v50, v18 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; 
SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; 
SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; 
SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v10i64: ; VI: ; %bb.0: @@ -10697,7 +23033,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -10760,9 +23096,9 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v43, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -10825,7 +23161,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -10905,7 +23241,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -10984,9 +23320,9 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -11046,7 +23382,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -11075,7 +23411,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: 
; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -11097,7 +23433,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11149,7 +23485,7 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -11171,9 +23507,930 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <10 x i64> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <10 x i64> + br label %end + +end: + %phi = phi <10 x i64> [ %a2, 
%cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi +} + +define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40f16_to_v10i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; 
SI-NEXT: v_cvt_f16_f32_e32 v55, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; 
SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v55, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 +; SI-NEXT: v_or_b32_e32 v13, v51, v13 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_or_b32_e32 v17, v27, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; 
SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v48, v21 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v49, v20 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v22 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v51 +; SI-NEXT: 
v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v45, v52 +; SI-NEXT: v_mov_b32_e32 v52, v27 +; SI-NEXT: v_mov_b32_e32 v46, v53 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v32 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v47, v54 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: v_mov_b32_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v56, v55 +; SI-NEXT: v_mov_b32_e32 v55, v30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v32, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v25, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v32 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v32, v41 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v51 +; SI-NEXT: v_mov_b32_e32 v51, v44 +; SI-NEXT: buffer_load_dword v44, 
off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v20, v49 +; SI-NEXT: v_mov_b32_e32 v49, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v48 +; SI-NEXT: v_mov_b32_e32 v48, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v28, v53 +; SI-NEXT: v_mov_b32_e32 v53, v46 +; SI-NEXT: v_mov_b32_e32 v27, v52 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v40f16_to_v10i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, 
s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 
16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: 
v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 
v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v19, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v37, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v36, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v18, v20, v18 +; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: 
bitcast_v40f16_to_v10i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, 
v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, 
s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, 
s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB47_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v10i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, 
s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB47_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false 
@@ -11192,191 +24449,210 @@ end: } define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f64_to_v40i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v26, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v37, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v38, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 
16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_alignbit_b32 v21, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v22, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v23, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v26, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v27, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v29, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v37, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v38, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 
-; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v38, v49, v38 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v48, v50, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v3, v3, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 24, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v4, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v5, v5, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v6, v6, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v7, v7, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 40, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v8, v8, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v9, v9, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v10, v10, 
v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v11, v11, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v12, v12, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v13, v13, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v14, v14, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v15, v15, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v16, v16, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v0 -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: v_or_b32_e32 v18, v18, v30 -; GCN-NEXT: v_or_b32_e32 v19, v19, v21 -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v32, 
s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f64_to_v40i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v30, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: 
v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_alignbit_b32 v21, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v23, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v24, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v30, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v37 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, 
vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v40i16: ; VI: ; %bb.0: @@ -11404,7 +24680,7 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -11426,9 +24702,9 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -11460,7 +24736,7 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 
v38, 16, v38 @@ -11530,7 +24806,7 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -11552,9 +24828,9 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -11586,7 +24862,7 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -11619,7 +24895,7 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -11631,22 +24907,901 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: 
v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, 
label %cmp.false + +cmp.true: + %a1 = fadd <10 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <10 x double> %a1 to <40 x i16> + br label %end + +cmp.false: + %a3 = bitcast <10 x double> %a to <40 x i16> + br label %end + +end: + %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i16> %phi +} + +define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f64_to_v40i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v17, s18 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: v_mov_b32_e32 v8, s29 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v24, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v28, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v37, v20, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, 
v18 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v24, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v28, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v37, v20, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v19, v19, v37 +; SI-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 
v19, 16, v35 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v10f64_to_v40i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_mov_b32_e32 v21, s17 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: 
s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; VI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; VI-NEXT: v_or_b32_sdwa v24, v20, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v25, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v21, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v22, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_or_b32_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; VI-NEXT: v_or_b32_sdwa v23, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v20 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v4, v22 +; VI-NEXT: v_mov_b32_e32 v5, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; 
VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v10f64_to_v40i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, 
v20 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v15, v30, 16, v0 
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v29, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v39, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v17, v28, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v21, v38, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v18, v27, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v8, v37, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v36, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v35, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v34, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v32, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v19, v26, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-NEXT: v_mov_b32_e32 v3, v21 +; GFX9-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; 
implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-TRUE16-LABEL: bitcast_v10f64_to_v40i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s0 :: v_dual_mov_b32 v20, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s16 :: v_dual_mov_b32 v22, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v18, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: .LBB49_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v35, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v36, 16, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v33, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v32, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v14, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v27, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v26, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v25, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v23, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v21 :: v_dual_mov_b32 v0, v20 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 ; -; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40i16: +; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 
v18, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], 
v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: .LBB49_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v36, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, 
v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v35, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB49_4: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 ; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr9 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 @@ -11657,91 +25812,7 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; 
GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 
0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11760,320 +25831,317 @@ end: } define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i16_to_v10f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; GCN-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v0 -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v45 -; GCN-NEXT: v_or_b32_e32 v1, v1, v46 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v44 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v43 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v42 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v41 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v62 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v6, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v7, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 
offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v8, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v9, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v20 -; GCN-NEXT: v_or_b32_e32 v13, v13, v47 -; GCN-NEXT: v_or_b32_e32 v14, v14, v56 -; GCN-NEXT: v_or_b32_e32 v15, v15, v57 -; GCN-NEXT: v_or_b32_e32 v16, v16, v58 -; GCN-NEXT: v_or_b32_e32 v17, v17, v59 -; GCN-NEXT: v_or_b32_e32 v18, v18, v60 -; GCN-NEXT: v_or_b32_e32 v19, v19, v61 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; 
implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, 
v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v0, v45, v0 -; GCN-NEXT: v_or_b32_e32 v1, v46, v1 -; GCN-NEXT: v_or_b32_e32 v2, v44, v2 -; GCN-NEXT: v_or_b32_e32 v3, v43, v3 -; GCN-NEXT: v_or_b32_e32 v4, v42, v4 -; GCN-NEXT: v_or_b32_e32 v5, v41, v5 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v20, v6 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v20, v7 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v20, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: v_or_b32_e32 v13, v47, v13 -; GCN-NEXT: v_or_b32_e32 v14, v56, v14 -; GCN-NEXT: v_or_b32_e32 v15, v57, v15 -; GCN-NEXT: v_or_b32_e32 v16, v58, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v60, v18 -; GCN-NEXT: v_or_b32_e32 v19, v61, v19 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], 
s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i16_to_v10f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v38, v18 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 +; SI-NEXT: 
v_lshlrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v3, v3, v35 +; SI-NEXT: v_or_b32_e32 v4, v4, v40 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v62 +; SI-NEXT: v_or_b32_e32 v10, v10, v61 +; SI-NEXT: v_or_b32_e32 v16, v16, v47 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: 
; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v11, v11, v60 +; SI-NEXT: v_or_b32_e32 v12, v12, v59 +; SI-NEXT: v_or_b32_e32 v13, v13, v58 +; SI-NEXT: v_or_b32_e32 v14, v14, v57 +; SI-NEXT: v_or_b32_e32 v15, v15, v56 +; SI-NEXT: v_or_b32_e32 v17, v17, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v44 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: 
$vgpr44 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: 
v_or_b32_e32 v2, v41, v2 +; SI-NEXT: v_or_b32_e32 v3, v35, v3 +; SI-NEXT: v_or_b32_e32 v4, v40, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v6, v33, v6 +; SI-NEXT: v_or_b32_e32 v7, v32, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v62, v9 +; SI-NEXT: v_or_b32_e32 v10, v61, v10 +; SI-NEXT: v_or_b32_e32 v16, v47, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v11, v60, v11 +; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v13, v58, v13 +; SI-NEXT: v_or_b32_e32 v14, v57, v14 +; SI-NEXT: v_or_b32_e32 v15, v56, v15 +; SI-NEXT: v_or_b32_e32 v17, v46, v17 +; SI-NEXT: 
v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v44, v19 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i16_to_v10f64: ; VI: ; %bb.0: @@ -12106,7 +26174,7 
@@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -12169,9 +26237,9 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v43 @@ -12234,7 +26302,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v20, 3, v32 ; VI-NEXT: v_add_u16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12314,7 +26382,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -12393,9 +26461,9 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -12454,7 +26522,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12483,7 +26551,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; 
%cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -12505,7 +26573,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12557,7 +26625,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -12579,7 +26647,7 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -12599,361 +26667,1164 @@ end: ret <10 x double> %phi } +define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i16_to_v10f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v24 +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v50, v4 +; SI-NEXT: v_mov_b32_e32 v51, v2 +; SI-NEXT: v_mov_b32_e32 v52, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v7, v0, v57 +; 
SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v8, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v10, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v11, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v12, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v13, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v18, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v19, v0, v53 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; 
%cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 
s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v40i16_to_v10f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 
s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; 
VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; 
VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v40i16_to_v10f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 
s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: 
v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 
s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v10f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i16> %a, splat (i16 3) + %a2 = bitcast <40 x i16> %a1 to <10 x double> + br label %end + +cmp.false: + %a3 = bitcast <40 x i16> %a to <10 x double> + br label %end + +end: + %phi = phi <10 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x double> %phi +} + define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v10f64_to_v40f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 -; GCN-NEXT: ; implicit-def: $vgpr56 
-; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 
-; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 
16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v42 
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v40 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v54 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v52 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v50 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: 
v_add_i32_e32 v57, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v20, v19 -; GCN-NEXT: v_or_b32_e32 v13, v38, v50 -; GCN-NEXT: v_or_b32_e32 v14, v36, v54 -; GCN-NEXT: v_or_b32_e32 v15, v34, v41 -; GCN-NEXT: v_or_b32_e32 v16, v32, v55 -; GCN-NEXT: v_or_b32_e32 v17, v30, v53 -; GCN-NEXT: v_or_b32_e32 v19, v28, v51 -; GCN-NEXT: v_or_b32_e32 v20, v27, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v39 -; GCN-NEXT: v_or_b32_e32 v25, v25, v37 -; GCN-NEXT: v_or_b32_e32 v24, v24, v35 -; GCN-NEXT: 
v_or_b32_e32 v23, v23, v33 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v10f64_to_v40f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: 
$vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: 
v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; 
SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, 
v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, 
v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: 
s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f64_to_v40f16: ; VI: ; %bb.0: @@ -12981,7 +27852,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -13003,9 +27874,9 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: s_cbranch_execz .LBB52_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -13037,7 +27908,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 @@ -13107,7 +27978,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -13129,9 +28000,9 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: .LBB52_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: s_cbranch_execz .LBB52_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -13163,7 +28034,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v39, v0, s4 @@ -13196,7 +28067,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -13208,7 +28079,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13239,7 +28110,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: 
s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18 @@ -13261,9 +28132,9 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -13295,7 +28166,7 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100 @@ -13336,405 +28207,1351 @@ end: ret <40 x half> %phi } +define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v10f64_to_v40f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_and_b64 s[10:11], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v6 +; SI-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s10, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: s_lshr_b32 s10, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; SI-NEXT: s_lshr_b32 s10, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s10 +; SI-NEXT: s_lshr_b32 s10, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s10 +; SI-NEXT: s_lshr_b32 s10, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s10 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s10 +; SI-NEXT: s_lshr_b32 s10, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: s_lshr_b32 s10, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: s_lshr_b32 s10, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s10 +; SI-NEXT: s_lshr_b32 s10, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s10 +; SI-NEXT: s_lshr_b32 s10, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s10 +; SI-NEXT: s_lshr_b32 s10, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s10 +; SI-NEXT: s_lshr_b32 s10, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s10 +; SI-NEXT: s_lshr_b32 s10, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s10 +; SI-NEXT: s_lshr_b32 s10, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s10 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s10 +; SI-NEXT: s_lshr_b32 s10, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s10 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s10 +; SI-NEXT: s_lshr_b32 s10, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; 
SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[34:35], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[29:30], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[25:26], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[21:22], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[19:20], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: s_waitcnt 
expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: 
v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: 
v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 
v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: 
$vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v10f64_to_v40f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_mov_b32_e32 v21, s17 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: 
v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 
v7, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v8, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; VI-NEXT: v_or_b32_sdwa v9, v9, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v10, v10, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; VI-NEXT: v_or_b32_sdwa v24, v20, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; VI-NEXT: v_or_b32_sdwa v11, v11, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_or_b32_sdwa v25, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_or_b32_sdwa v20, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_or_b32_sdwa v21, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v22, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_or_b32_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; VI-NEXT: v_or_b32_sdwa v23, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v20 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v4, v22 +; VI-NEXT: v_mov_b32_e32 v5, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: 
$vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v10f64_to_v40f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v16, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 
v24, 16, v20 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v20 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v15, 
v30, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v29, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v39, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v17, v28, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v21, v38, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v18, v27, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v8, v37, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v36, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v35, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v34, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v33, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v32, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v19, v26, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-NEXT: v_mov_b32_e32 v3, v21 +; GFX9-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; 
GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-TRUE16-LABEL: bitcast_v10f64_to_v40f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s0 :: v_dual_mov_b32 v20, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s16 :: v_dual_mov_b32 v22, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v18, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, 
v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: .LBB53_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v35, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v36, 16, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v33, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v32, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v14, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v36 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v27, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v26, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v25, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v23, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v21 :: v_dual_mov_b32 v0, v20 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 +; +; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 
v8, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s20 :: v_dual_mov_b32 v6, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: 
v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: .LBB53_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v36, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 
v21, v38, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v35, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v28, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v22, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v27, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v26, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v25, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v23, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v20 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <10 x double> %a1 to <40 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x double> %a to <40 x half> + br label %end + +end: + %phi = phi <40 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x half> %phi +} + define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v40f16_to_v10f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v19 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v25 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; GCN-NEXT: v_or_b32_e32 v0, v38, v0 -; GCN-NEXT: v_or_b32_e32 v1, v36, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; GCN-NEXT: v_or_b32_e32 v2, v34, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v63 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; GCN-NEXT: v_or_b32_e32 v5, v60, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v41 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v46, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v42, v8 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v20, v9 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v20, v10 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v20, v11 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v20, v12 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v20, v13 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v20, v14 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v20, v15 -; GCN-NEXT: v_or_b32_e32 v16, v53, v16 -; GCN-NEXT: v_or_b32_e32 v17, v51, v17 -; GCN-NEXT: v_or_b32_e32 v18, v49, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: 
killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; kill: killed $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v4, v32 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v62 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v60 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v56 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v47 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v45 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v43 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v41 -; 
GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v40 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v48 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; 
GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 
v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_or_b32_e32 v7, v9, v8 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v19, v18 -; GCN-NEXT: v_or_b32_e32 v13, v21, v20 -; GCN-NEXT: v_or_b32_e32 v14, v23, v22 -; GCN-NEXT: v_or_b32_e32 v15, v25, v24 -; GCN-NEXT: v_or_b32_e32 v16, v27, v26 -; GCN-NEXT: v_or_b32_e32 v17, v29, v28 -; GCN-NEXT: v_or_b32_e32 v18, v31, v30 -; GCN-NEXT: v_or_b32_e32 v19, v33, v32 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40f16_to_v10f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: 
buffer_load_dword v53, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: 
v_cvt_f16_f32_e32 v43, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; 
SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v1, v36, v1 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v7, v56, v7 +; SI-NEXT: v_or_b32_e32 v8, v46, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v19, v48, v19 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; 
implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v50, v18 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: 
v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 +; SI-NEXT: 
v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, 
v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 
; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v51 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v50 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:88 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v10f64: ; VI: ; %bb.0: @@ -13767,7 +29584,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v19, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v19, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -13830,9 +29647,9 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v19, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v43, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -13895,7 +29712,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, 
exec, s[4:5] ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -13975,7 +29792,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; kill: killed $vgpr20 @@ -14054,9 +29871,9 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -14116,7 +29933,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -14145,7 +29962,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> 
%a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -14167,7 +29984,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -14219,7 +30036,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -14241,7 +30058,7 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -14261,513 +30078,1448 @@ end: ret <10 x double> 
%phi } +define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40f16_to_v10f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 +; SI-NEXT: 
v_cvt_f16_f32_e32 v54, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; SI-NEXT: 
v_lshlrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v35, v4 +; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_or_b32_e32 v8, v62, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v55, v11 +; SI-NEXT: v_or_b32_e32 v12, v53, v12 +; SI-NEXT: v_or_b32_e32 v13, v51, v13 +; SI-NEXT: v_or_b32_e32 v14, v49, v14 +; SI-NEXT: v_or_b32_e32 v15, v31, v15 +; SI-NEXT: v_or_b32_e32 v16, v29, v16 +; SI-NEXT: v_or_b32_e32 v17, v27, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 
v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 
v8, v63 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: 
v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v48, v21 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v49, v20 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v22 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v51 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_mov_b32_e32 v45, v52 +; SI-NEXT: 
v_mov_b32_e32 v52, v27 +; SI-NEXT: v_mov_b32_e32 v46, v53 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v32 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v47, v54 +; SI-NEXT: v_mov_b32_e32 v54, v29 +; SI-NEXT: v_mov_b32_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v56, v55 +; SI-NEXT: v_mov_b32_e32 v55, v30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v57, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v39, v58 +; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v32, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v25, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v32 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v58, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v32, v41 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v23, v51 +; SI-NEXT: v_mov_b32_e32 v51, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: 
v_mov_b32_e32 v22, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v20, v49 +; SI-NEXT: v_mov_b32_e32 v49, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v48 +; SI-NEXT: v_mov_b32_e32 v48, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v28, v53 +; SI-NEXT: v_mov_b32_e32 v53, v46 +; SI-NEXT: v_mov_b32_e32 v27, v52 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v40f16_to_v10f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_mov_b32_e32 v32, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v34, v3 +; VI-NEXT: v_mov_b32_e32 v35, v2 +; VI-NEXT: v_mov_b32_e32 v36, v1 +; VI-NEXT: v_mov_b32_e32 v37, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 
0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 
+; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v19, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v37, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v36, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v18, v20, v18 +; VI-NEXT: v_add_f16_sdwa v19, v32, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v40f16_to_v10f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v5 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v34, v3 +; GFX9-NEXT: v_mov_b32_e32 v35, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 
v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v19 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, 
v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v33, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v13, s76 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB55_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; +; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v10f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; 
GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s1, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s2, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s18, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s19, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s20, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s21, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s25, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s26, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v17, s2 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; 
GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v33, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB55_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <40 x half> %a, splat 
(half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <10 x double> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <10 x double> + br label %end + +end: + %phi = phi <10 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x double> %phi +} + define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v40i16_to_v40f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 
-; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: 
$vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v12 -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v39 -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) 
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v55 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; 
GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v55 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v53 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v51 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v49 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v20 
-; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v43 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 
v9, v44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v45 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v46 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v47 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v56 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v57 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v58 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v59 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v60 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 
offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v61 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v62 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v63 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 0x48, v0 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v5, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v8, v11, v10 -; GCN-NEXT: v_or_b32_e32 v9, v13, v12 -; GCN-NEXT: v_or_b32_e32 v10, v15, v14 -; GCN-NEXT: v_or_b32_e32 v11, v17, v16 -; GCN-NEXT: v_or_b32_e32 v12, v20, v19 -; GCN-NEXT: v_or_b32_e32 v13, v23, v22 -; GCN-NEXT: v_or_b32_e32 v14, v26, v25 -; GCN-NEXT: v_or_b32_e32 v15, v29, v28 -; GCN-NEXT: v_or_b32_e32 v16, v39, v38 -; GCN-NEXT: v_or_b32_e32 v17, v50, v49 -; GCN-NEXT: v_or_b32_e32 v19, v53, v52 -; GCN-NEXT: v_or_b32_e32 v20, v55, v31 -; GCN-NEXT: v_or_b32_e32 v22, v44, v32 -; GCN-NEXT: v_or_b32_e32 v23, v46, v33 -; GCN-NEXT: v_or_b32_e32 v25, v56, v34 -; GCN-NEXT: v_or_b32_e32 v26, v40, v35 -; GCN-NEXT: v_or_b32_e32 v28, v41, v36 -; GCN-NEXT: v_or_b32_e32 v29, v42, v37 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40i16_to_v40f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, 
off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: 
; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v48 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; 
implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; 
%cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v55, vcc, 
3, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, 
v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; 
SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, 
vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], 
s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40i16_to_v40f16: ; VI: ; %bb.0: @@ -14796,7 +31548,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v39, 3, v39 @@ -14838,7 +31590,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_add_u16_e32 v19, 3, v19 ; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -14909,7 +31661,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v19, v39, v19, s6 @@ -14972,7 +31724,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v20, v0, s4 @@ -15005,7 +31757,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -15027,7 +31779,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15059,7 +31811,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v18, v39, v18, 0x5040100 @@ -15121,7 +31873,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v21, v0, 0x5040100 ; 
GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 @@ -15161,370 +31913,1448 @@ end: ret <40 x half> %phi } +define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40i16_to_v40f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_cvt_f32_f16_e32 v63, s18 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s21 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s23 +; SI-NEXT: 
v_cvt_f32_f16_e32 v46, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, 
vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, 
s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 +; 
SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v40i16_to_v40f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: 
s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: s_or_b32 
s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_or_b32_sdwa v14, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; VI-NEXT: s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v40i16_to_v40f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; 
GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v13, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v10, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v9, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v8, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v7, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v6, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v23, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v22, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: 
s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v21, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v20, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v25, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v23, s21 +; GFX9-NEXT: v_mov_b32_e32 v22, s20 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v25, s17 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v26, s43 +; GFX9-NEXT: v_mov_b32_e32 v27, s42 +; GFX9-NEXT: v_mov_b32_e32 v28, s41 +; GFX9-NEXT: v_mov_b32_e32 v29, s40 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: v_mov_b32_e32 v32, s13 +; GFX9-NEXT: v_mov_b32_e32 v33, s12 +; GFX9-NEXT: v_mov_b32_e32 v34, s11 +; GFX9-NEXT: v_mov_b32_e32 v35, s10 +; GFX9-NEXT: v_mov_b32_e32 v36, s9 +; GFX9-NEXT: v_mov_b32_e32 v37, s8 +; GFX9-NEXT: v_mov_b32_e32 v38, s7 +; GFX9-NEXT: v_mov_b32_e32 v39, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 
+; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v29, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-NEXT: v_mov_b32_e32 v3, v21 +; GFX9-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v40f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s14, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s13, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s12, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s10, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s11, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s10, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v31, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v38.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v39.l +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s13 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s6 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v28, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v29, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v23, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v24, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v25, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v2, v35, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v36, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v37, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v30, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v27, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v25, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v21 :: v_dual_mov_b32 v0, v20 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v40f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; 
GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, 
s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v10, s26 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s25 :: v_dual_mov_b32 v12, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s23 :: v_dual_mov_b32 v14, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v6, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v2, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v21, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s43 :: v_dual_mov_b32 v25, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v27, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v36, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v31, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v31, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v30, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v25, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v26, 16, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v22, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v21 :: v_dual_mov_b32 v0, v20 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i16> %a, splat (i16 3) + %a2 = bitcast <40 x i16> %a1 to <40 x half> + br label %end + +cmp.false: + %a3 = bitcast <40 x i16> %a to <40 x half> + br label %end + +end: + %phi = phi <40 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x half> %phi +} + define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v40f16_to_v40i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword 
v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v4 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v51, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v40 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v50 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 
0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v4 -; GCN-NEXT: v_or_b32_e32 v9, v9, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v5 -; GCN-NEXT: v_or_b32_e32 v10, v10, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_or_b32_e32 v11, v11, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v7 -; GCN-NEXT: v_or_b32_e32 v12, v12, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_or_b32_e32 v13, v13, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v32 -; GCN-NEXT: v_or_b32_e32 v14, v14, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v31 -; GCN-NEXT: v_or_b32_e32 v53, v25, v24 -; GCN-NEXT: v_or_b32_e32 v51, v27, v26 -; GCN-NEXT: v_or_b32_e32 
v49, v29, v28 -; GCN-NEXT: v_or_b32_e32 v39, v39, v30 -; GCN-NEXT: v_or_b32_e32 v37, v37, v48 -; GCN-NEXT: v_or_b32_e32 v36, v36, v38 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_or_b32_e32 v19, v19, v22 -; GCN-NEXT: v_or_b32_e32 v17, v17, v20 -; GCN-NEXT: v_or_b32_e32 v16, v16, v18 -; GCN-NEXT: v_or_b32_e32 v15, v15, v54 -; GCN-NEXT: v_or_b32_e32 v35, v35, v52 -; GCN-NEXT: v_or_b32_e32 v34, v34, v55 -; GCN-NEXT: v_or_b32_e32 v33, v33, v50 -; GCN-NEXT: v_alignbit_b32 v55, v33, v24, 16 -; GCN-NEXT: v_alignbit_b32 v54, v34, v26, 16 -; GCN-NEXT: v_alignbit_b32 v52, v35, v28, 16 -; GCN-NEXT: v_alignbit_b32 v50, v15, v30, 16 -; GCN-NEXT: v_alignbit_b32 v48, v14, v48, 16 -; GCN-NEXT: v_alignbit_b32 v38, v13, v38, 16 -; GCN-NEXT: v_alignbit_b32 v23, v12, v23, 16 -; GCN-NEXT: v_alignbit_b32 v22, v11, v22, 16 -; GCN-NEXT: v_alignbit_b32 v20, v10, v20, 16 -; GCN-NEXT: v_alignbit_b32 v18, v9, v18, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v55 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v31 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v54 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v52 -; GCN-NEXT: v_or_b32_e32 v24, v24, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v26, v26, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 24, v0 -; GCN-NEXT: 
v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v49, v49, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v8, v35, v8 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v39, v39, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v7, v15, v7 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v37, v37, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v6, v14, v6 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v36, v36, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v5, v13, v5 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v4, v12, v4 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v22 -; GCN-NEXT: 
v_add_i32_e32 v22, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v3, v11, v3 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 -; GCN-NEXT: v_or_b32_e32 v17, v17, v20 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_or_b32_e32 v10, v16, v18 -; GCN-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 
; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v40f16_to_v40i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; SI-NEXT: 
s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v30 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v42 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v44 +; 
SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v56 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v57 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v59 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v23, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 +; SI-NEXT: v_or_b32_e32 v50, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v54 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v49, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v53 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v36, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v52 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: 
v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v34, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v32, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_or_b32_e32 v9, v9, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; SI-NEXT: v_or_b32_e32 v12, v12, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_or_b32_e32 v35, v30, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v33, v33, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v48 +; SI-NEXT: v_cvt_f16_f32_e32 
v37, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v37 +; SI-NEXT: v_or_b32_e32 v48, v30, v39 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_or_b32_e32 v39, v20, v30 +; SI-NEXT: v_or_b32_e32 v15, v15, v22 +; SI-NEXT: v_or_b32_e32 v13, v13, v21 +; SI-NEXT: v_or_b32_e32 v11, v11, v28 +; SI-NEXT: v_or_b32_e32 v6, v6, v19 +; SI-NEXT: v_or_b32_e32 v4, v4, v29 +; SI-NEXT: v_alignbit_b32 v40, v39, v23, 16 +; SI-NEXT: v_alignbit_b32 v55, v48, v24, 16 +; SI-NEXT: v_alignbit_b32 v54, v33, v25, 16 +; SI-NEXT: v_alignbit_b32 v53, v35, v26, 16 +; SI-NEXT: v_alignbit_b32 v52, v18, v27, 16 +; SI-NEXT: v_alignbit_b32 v51, v12, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, v9, v28, 16 +; SI-NEXT: v_alignbit_b32 v20, v3, v19, 16 +; SI-NEXT: v_alignbit_b32 v19, v5, v29, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v38 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v31 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v23, v16 +; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v16, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_or_b32_e32 v16, v16, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 32, v0 +; SI-NEXT: 
buffer_store_dword v16, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; 
SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v40f16_to_v40i16: ; VI: ; %bb.0: @@ -15553,7 +33383,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, 
i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v39, 0x200, v39 @@ -15595,7 +33425,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 ; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 ; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -15666,7 +33496,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v19, v39, v19, s6 @@ -15730,7 +33560,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v20, v0, s4 @@ -15763,7 +33593,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true 
; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] @@ -15785,7 +33615,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15817,7 +33647,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v18, v39, v18, 0x5040100 @@ -15879,7 +33709,7 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v21, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 @@ -15918,3 +33748,1035 @@ end: %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <40 x i16> %phi } + +define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v40f16_to_v40i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 +; SI-NEXT: 
v_cvt_f16_f32_e32 v34, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v27, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 
v49, v26 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v39 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v34, v25, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: 
v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_or_b32_e32 v12, v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v28, v28, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: 
v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v33, v25, v33 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v35, v35, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v25 +; SI-NEXT: v_or_b32_e32 v22, v22, v26 +; SI-NEXT: v_or_b32_e32 v21, v21, 
v27 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v30, v30, v38 +; SI-NEXT: v_or_b32_e32 v29, v29, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v51 +; SI-NEXT: v_or_b32_e32 v6, v6, v23 +; SI-NEXT: v_or_b32_e32 v4, v4, v52 +; SI-NEXT: v_alignbit_b32 v49, v19, v26, 16 +; SI-NEXT: v_alignbit_b32 v26, v20, v27, 16 +; SI-NEXT: v_alignbit_b32 v25, v14, v24, 16 +; SI-NEXT: v_alignbit_b32 v24, v35, v48, 16 +; SI-NEXT: v_alignbit_b32 v48, v33, v50, 16 +; SI-NEXT: v_alignbit_b32 v39, v28, v38, 16 +; SI-NEXT: v_alignbit_b32 v38, v12, v37, 16 +; SI-NEXT: v_alignbit_b32 v37, v9, v51, 16 +; SI-NEXT: v_alignbit_b32 v36, v3, v23, 16 +; SI-NEXT: v_alignbit_b32 v23, v5, v52, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; 
SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v38 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v10, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v40f16_to_v40i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: 
s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v26, 0x200 +; VI-NEXT: v_add_f16_e32 v24, s16, v26 +; VI-NEXT: v_add_f16_e32 v39, s43, v26 +; VI-NEXT: v_add_f16_e32 v25, s17, v26 +; VI-NEXT: v_add_f16_e32 v38, s42, v26 +; VI-NEXT: v_add_f16_e32 v20, s18, v26 +; VI-NEXT: v_add_f16_e32 v37, s41, v26 +; VI-NEXT: v_add_f16_e32 v21, s19, v26 +; VI-NEXT: v_add_f16_e32 v36, s40, v26 +; VI-NEXT: v_add_f16_e32 v22, s20, v26 +; VI-NEXT: v_add_f16_e32 v35, s15, v26 +; VI-NEXT: v_add_f16_e32 v23, s21, v26 +; VI-NEXT: v_add_f16_e32 v34, s14, v26 +; VI-NEXT: v_add_f16_e32 v6, s22, v26 +; VI-NEXT: v_add_f16_e32 v33, s13, v26 +; VI-NEXT: v_add_f16_e32 v7, s23, v26 +; VI-NEXT: v_add_f16_e32 v32, s12, v26 +; VI-NEXT: v_add_f16_e32 v8, s24, v26 +; VI-NEXT: v_add_f16_e32 v31, s11, v26 +; VI-NEXT: v_add_f16_e32 v9, s25, v26 +; VI-NEXT: v_add_f16_e32 v30, s10, v26 +; VI-NEXT: v_add_f16_e32 v10, s26, v26 +; VI-NEXT: v_add_f16_e32 v29, s9, v26 +; VI-NEXT: v_add_f16_e32 v11, s27, v26 +; VI-NEXT: v_add_f16_e32 v28, s8, v26 +; VI-NEXT: v_add_f16_e32 v12, s28, v26 +; VI-NEXT: v_add_f16_e32 v27, s7, v26 +; VI-NEXT: v_add_f16_e32 v13, s29, 
v26 +; VI-NEXT: v_add_f16_e32 v26, s6, v26 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v26, s6 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v27, s7 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v28, s8 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v29, s9 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v30, s10 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v31, s11 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v32, s12 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v33, s13 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v34, s14 +; VI-NEXT: v_mov_b32_e32 v23, s21 +; VI-NEXT: v_mov_b32_e32 v35, s15 +; VI-NEXT: v_mov_b32_e32 v22, s20 +; VI-NEXT: v_mov_b32_e32 v36, s40 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v37, s41 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v38, s42 +; VI-NEXT: v_mov_b32_e32 v25, s17 +; VI-NEXT: v_mov_b32_e32 v39, s43 +; VI-NEXT: v_mov_b32_e32 v24, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: 
v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_or_b32_sdwa v24, v24, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v9, v9, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v20 +; VI-NEXT: v_mov_b32_e32 v3, v21 +; VI-NEXT: v_mov_b32_e32 v4, v22 +; VI-NEXT: v_mov_b32_e32 v5, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v40f16_to_v40i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: 
; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v11, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v10, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v9, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v8, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v7, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v6, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v23, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v22, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: 
v_pk_add_f16 v21, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v20, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v25, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v24, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v23, s21 +; GFX9-NEXT: v_mov_b32_e32 v22, s20 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v25, s17 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v26, s43 +; GFX9-NEXT: v_mov_b32_e32 v27, s42 +; GFX9-NEXT: v_mov_b32_e32 v28, s41 +; GFX9-NEXT: v_mov_b32_e32 
v29, s40 +; GFX9-NEXT: v_mov_b32_e32 v30, s15 +; GFX9-NEXT: v_mov_b32_e32 v31, s14 +; GFX9-NEXT: v_mov_b32_e32 v32, s13 +; GFX9-NEXT: v_mov_b32_e32 v33, s12 +; GFX9-NEXT: v_mov_b32_e32 v34, s11 +; GFX9-NEXT: v_mov_b32_e32 v35, s10 +; GFX9-NEXT: v_mov_b32_e32 v36, s9 +; GFX9-NEXT: v_mov_b32_e32 v37, s8 +; GFX9-NEXT: v_mov_b32_e32 v38, s7 +; GFX9-NEXT: v_mov_b32_e32 v39, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v33, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v31, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, 
v29, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v26, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v20 +; GFX9-NEXT: v_mov_b32_e32 v3, v21 +; GFX9-NEXT: v_mov_b32_e32 v4, v22 +; GFX9-NEXT: v_mov_b32_e32 v5, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v40i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s14, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s13, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s12, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s11, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s10, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 
v17, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v38.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v39.l +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v16.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s6 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v28, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v29, 16, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v23, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v24, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v25, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v35, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v36, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v37, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v30, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v27, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v22, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v24, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v25, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v21 :: v_dual_mov_b32 v0, v20 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
bitcast_v40f16_to_v40i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, 
s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v38, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v10, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s25 :: v_dual_mov_b32 v12, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s23 :: v_dual_mov_b32 v14, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v6, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v2, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s1 :: v_dual_mov_b32 v21, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s43 :: v_dual_mov_b32 v25, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s41 :: v_dual_mov_b32 v27, s40 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v36, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v31, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v30, 16, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v25, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v15 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v28, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v26, 16, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v24, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v23, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v22, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v21 :: v_dual_mov_b32 v0, v20 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <40 x half> %a, splat (half 0xH0200) + %a2 = bitcast <40 x half> %a1 to <40 x i16> + br label %end + +cmp.false: + %a3 = bitcast <40 x half> %a to <40 x i16> + br label %end + +end: + %phi = phi <40 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 8fa9b3c46ae93..a6e041b2d8300 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -1,26 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define double @bitcast_i64_to_f64(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i64_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_f64: ; VI: ; %bb.0: @@ -82,20 +81,106 @@ end: ret double %phi } +define inreg double @bitcast_i64_to_f64_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_i64_to_f64_scalar: +; VI: ; 
%bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_i64_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_i64_to_f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to double + br label %end + +cmp.false: + %a3 = bitcast i64 %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ 
%a3, %cmp.false ] + ret double %phi +} + define i64 @bitcast_f64_to_i64(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f64_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_i64: ; VI: ; %bb.0: @@ -131,10 +216,10 @@ define i64 @bitcast_f64_to_i64(double %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -154,21 +239,106 @@ end: ret i64 %phi } +define inreg i64 @bitcast_f64_to_i64_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; 
SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast double %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + define <2 x i32> @bitcast_i64_to_v2i32(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i64_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v2i32: ; VI: ; %bb.0: @@ -230,21 +400,107 @@ end: ret <2 x i32> %phi } +define inreg <2 x i32> @bitcast_i64_to_v2i32_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: 
.LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_i64_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_i64_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_i64_to_v2i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB5_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_4: +; GFX11-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label 
%cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + define i64 @bitcast_v2i32_to_i64(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_i64: ; VI: ; %bb.0: @@ -305,21 +561,107 @@ end: ret i64 %phi } +define inreg i64 @bitcast_v2i32_to_i64_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; 
SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v2i32_to_i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v2i32_to_i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v2i32_to_i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = 
bitcast <2 x i32> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + define <2 x float> @bitcast_i64_to_v2f32(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i64_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v2f32: ; VI: ; %bb.0: @@ -381,21 +723,107 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_i64_to_v2f32_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; 
SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_i64_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_i64_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_i64_to_v2f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = 
bitcast i64 %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define i64 @bitcast_v2f32_to_i64(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_i64: ; VI: ; %bb.0: @@ -455,32 +883,120 @@ end: ret i64 %phi } +define inreg i64 @bitcast_v2f32_to_i64_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB11_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_4 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_3: +; SI-NEXT: s_branch .LBB11_2 +; SI-NEXT: .LBB11_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
bitcast_v2f32_to_i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = 
bitcast <2 x float> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + define <4 x i16> @bitcast_i64_to_v4i16(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i64_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 
exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v4i16: ; VI: ; %bb.0: @@ -542,46 +1058,142 @@ end: ret <4 x i16> %phi } +define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_i64_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_i64_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 
s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_i64_to_v4i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + define i64 @bitcast_v4i16_to_i64(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; 
%cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: 
v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_i64: ; VI: ; %bb.0: @@ -647,81 +1259,192 @@ end: ret i64 %phi } -define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg i64 @bitcast_v4i16_to_i64_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_i64_to_v4f16: +; VI-LABEL: bitcast_v4i16_to_i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: 
s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: s_branch .LBB15_2 ; -; GFX9-LABEL: bitcast_i64_to_v4f16: +; GFX9-LABEL: bitcast_v4i16_to_i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_i64_to_v4f16: +; GFX11-LABEL: bitcast_v4i16_to_i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 
v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + +define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) { +; SI-LABEL: bitcast_i64_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, 
v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i64_to_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i64_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i64_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo ; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -750,55 +1473,156 @@ end: ret <4 x half> %phi } +define inreg <4 x half> @bitcast_i64_to_v4f16_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; 
SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_i64_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_i64_to_v4f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_i64_to_v4f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: 
s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: 
s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_i64: ; VI: ; %bb.0: @@ -865,44 +1689,164 @@ end: ret i64 %phi } +define inreg i64 @bitcast_v4f16_to_i64_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v4f16_to_i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_4 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_3: +; VI-NEXT: s_branch .LBB19_2 +; VI-NEXT: .LBB19_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + define <4 x bfloat> @bitcast_i64_to_v4bf16(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; 
GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i64_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB20_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v4bf16: ; VI: ; %bb.0: @@ -964,51 +1908,152 @@ end: ret <4 x bfloat> %phi } +define inreg 
<4 x bfloat> @bitcast_i64_to_v4bf16_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_and_b32 s6, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s5, 16 +; SI-NEXT: s_and_b32 s8, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s4, 16 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_i64_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_i64_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; 
GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_i64_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB21_3 +; GFX11-NEXT: .LBB21_2: ; %cmp.true +; GFX11-NEXT: s_add_u32 s0, s0, 3 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB21_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_4: +; GFX11-NEXT: s_branch .LBB21_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_4 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: 
s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB11_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: .LBB11_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_4 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB22_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: ; 
implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: .LBB22_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_i64: ; VI: ; %bb.0: @@ -1017,7 +2062,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -1056,7 +2101,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1067,7 +2112,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: 
v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -1101,7 +2146,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s7 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1113,7 +2158,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1155,7 +2200,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 -; GFX11-TRUE16-NEXT: .LBB11_2: ; %end +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1167,7 +2212,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1202,7 +2247,7 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB11_2: ; %end +; GFX11-FAKE16-NEXT: .LBB22_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1222,47 +2267,330 @@ end: ret i64 %phi } +define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v4bf16_to_i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_4 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: 
s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_3: +; VI-NEXT: s_branch .LBB23_2 +; VI-NEXT: .LBB23_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_4 +; GFX9-NEXT: 
.LBB23_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_branch .LBB23_2 +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: 
v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-TRUE16-NEXT: .LBB23_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v4, v10 :: v_dual_add_nc_u32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v5, v6 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB23_3: +; GFX11-TRUE16-NEXT: s_branch .LBB23_2 +; GFX11-TRUE16-NEXT: .LBB23_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB23_4 +; GFX11-FAKE16-NEXT: .LBB23_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: 
v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB23_3: +; GFX11-FAKE16-NEXT: s_branch .LBB23_2 +; GFX11-FAKE16-NEXT: .LBB23_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) { -; GCN-LABEL: bitcast_i64_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB12_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB12_4 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB12_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; 
GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: .LBB12_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_i64_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB24_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB24_4 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB24_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: .LBB24_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 
8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_i64_to_v8i8: ; VI: ; %bb.0: @@ -1400,7 +2728,7 @@ define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1412,7 +2740,7 @@ define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; GFX11-FAKE16-NEXT: .LBB12_4: ; %end +; GFX11-FAKE16-NEXT: .LBB24_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -1434,97 +2762,319 @@ end: ret <8 x i8> %phi } -define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_4 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; 
GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB13_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: .LBB13_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: 
s_setpc_b64 s[30:31] +define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_i64_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB25_2 ; -; VI-LABEL: bitcast_v8i8_to_i64: +; VI-LABEL: bitcast_i64_to_v8i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: 
s_cbranch_execnz .LBB13_3 +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s9, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s9, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_i64_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; 
GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-TRUE16-LABEL: bitcast_i64_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-TRUE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-TRUE16-NEXT: .LBB25_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s7 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB25_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB25_2 +; +; GFX11-FAKE16-LABEL: bitcast_i64_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB25_3 +; GFX11-FAKE16-NEXT: .LBB25_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-FAKE16-NEXT: .LBB25_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2 
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB25_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB25_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + +define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v8i8_to_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_4 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB26_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, 
v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: .LBB26_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i8_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB26_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB13_4 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB26_4 +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB13_3: ; %cmp.false +; VI-NEXT: .LBB26_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1540,8 +3090,8 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: .LBB13_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: .LBB26_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1572,14 +3122,14 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: s_cbranch_execnz .LBB26_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB13_4 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB26_4 +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB13_3: ; %cmp.false +; GFX9-NEXT: .LBB26_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1595,8 +3145,8 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: .LBB13_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: .LBB26_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1629,14 +3179,14 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-TRUE16-NEXT: .LBB13_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -1665,8 +3215,8 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-TRUE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-TRUE16-NEXT: 
s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -1714,14 +3264,14 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_4 -; GFX11-FAKE16-NEXT: .LBB13_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-FAKE16-NEXT: .LBB26_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB13_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -1750,8 +3300,8 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-FAKE16-NEXT: .LBB13_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -1802,20 +3352,278 @@ end: ret i64 %phi } +define inreg i64 @bitcast_v8i8_to_i64_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false 
+; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v8i8_to_i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: 
; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v8i8_to_i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: 
s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: 
bitcast_v8i8_to_i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 +; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; 
GFX11-NEXT: .LBB27_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB27_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + define <2 x i32> @bitcast_f64_to_v2i32(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f64_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v2i32: ; VI: ; %bb.0: @@ -1851,10 +3659,10 @@ define <2 x i32> @bitcast_f64_to_v2i32(double %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-NEXT: ; %bb.1: ; 
%cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB14_2: ; %end +; GFX11-NEXT: .LBB28_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1874,21 +3682,106 @@ end: ret <2 x i32> %phi } +define inreg <2 x i32> @bitcast_f64_to_v2i32_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB29_4 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_3: +; SI-NEXT: s_branch .LBB29_2 +; SI-NEXT: .LBB29_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 
s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_v2i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast double %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + define double @bitcast_v2i32_to_f64(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 
+; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_f64: ; VI: ; %bb.0: @@ -1949,20 +3842,106 @@ end: ret double %phi } +define inreg double @bitcast_v2i32_to_f64_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v2i32_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v2i32_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-LABEL: bitcast_v2i32_to_f64_scalar: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB31_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + define <2 x float> @bitcast_f64_to_v2f32(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f64_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: 
bitcast_f64_to_v2f32: ; VI: ; %bb.0: @@ -1998,10 +3977,10 @@ define <2 x float> @bitcast_f64_to_v2f32(double %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: .LBB32_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2021,21 +4000,106 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_f64_to_v2f32_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_3: +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; 
GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_v2f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast double %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define double @bitcast_v2f32_to_f64(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v2f32_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_f64: ; VI: ; %bb.0: @@ -2095,33 +4159,121 @@ end: ret double %phi } +define inreg double @bitcast_v2f32_to_f64_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB35_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB35_4 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_3: +; SI-NEXT: s_branch .LBB35_2 +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: +; VI-NEXT: s_branch .LBB35_2 +; VI-NEXT: .LBB35_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; 
GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + define <4 x i16> @bitcast_f64_to_v4i16(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 
v1, v5, v4, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_alignbit_b32 v1, v5, v4, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v4 -; GCN-NEXT: v_mov_b32_e32 v2, v5 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f64_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v4 +; SI-NEXT: v_mov_b32_e32 v2, v5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4i16: ; VI: ; %bb.0: @@ -2157,10 +4309,10 @@ define <4 x i16> @bitcast_f64_to_v4i16(double %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: .LBB36_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ 
-2180,46 +4332,143 @@ end: ret <4 x i16> %phi } +define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 +; SI-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: s_branch .LBB37_5 +; SI-NEXT: .LBB37_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: .LBB37_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v4 +; SI-NEXT: v_mov_b32_e32 v2, v5 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_4 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_3: +; VI-NEXT: s_branch .LBB37_2 +; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_4 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: 
.LBB37_3: +; GFX9-NEXT: s_branch .LBB37_2 +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_v4i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: s_branch .LBB37_2 +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast double %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + define double @bitcast_v4i16_to_f64(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_4 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB19_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 
0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: .LBB19_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_4 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB38_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: .LBB38_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_f64: ; VI: ; %bb.0: @@ -2285,42 +4534,153 @@ end: ret double %phi } -define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v4 -; GCN-NEXT: v_mov_b32_e32 v1, v5 -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg double @bitcast_v4i16_to_f64_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; 
SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v4i16_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v4i16_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: 
s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: s_branch .LBB39_2 +; GFX11-NEXT: .LBB39_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + +define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) { +; SI-LABEL: bitcast_f64_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; 
SI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v4 +; SI-NEXT: v_mov_b32_e32 v1, v5 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4f16: ; VI: ; %bb.0: @@ -2356,10 +4716,10 @@ define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: .LBB40_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2379,55 +4739,154 @@ end: ret <4 x half> %phi } +define inreg <4 x half> @bitcast_f64_to_v4f16_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: 
s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_f64_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: +; VI-NEXT: s_branch .LBB41_2 +; VI-NEXT: .LBB41_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v4f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: +; GFX9-NEXT: s_branch .LBB41_2 +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_v4f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, 
exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: +; GFX11-NEXT: s_branch .LBB41_2 +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast double %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, 
v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_f64: ; VI: ; %bb.0: @@ -2494,38 +4953,156 @@ end: ret double %phi } +define inreg double @bitcast_v4f16_to_f64_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 
+; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v4f16_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB43_4 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_3: +; VI-NEXT: s_branch .LBB43_2 +; VI-NEXT: .LBB43_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; 
GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: +; GFX11-NEXT: s_branch .LBB43_2 +; GFX11-NEXT: .LBB43_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; 
GCN-NEXT: v_mov_b32_e32 v0, v5 -; GCN-NEXT: v_mov_b32_e32 v1, v4 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f64_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v5 +; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v4bf16: ; VI: ; %bb.0: @@ -2561,10 +5138,10 @@ define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: .LBB44_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2584,51 +5161,151 @@ end: ret <4 x bfloat> %phi } +define inreg <4 x bfloat> @bitcast_f64_to_v4bf16_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_v4bf16_scalar: +; 
SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s9, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s17, 16 +; SI-NEXT: s_and_b32 s7, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB45_4 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_3: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: s_branch .LBB45_2 +; SI-NEXT: .LBB45_4: +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB45_4 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_3: +; VI-NEXT: s_branch .LBB45_2 +; VI-NEXT: .LBB45_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB45_4 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_3: +; GFX9-NEXT: s_branch 
.LBB45_2 +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_4 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_3: +; GFX11-NEXT: s_branch .LBB45_2 +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast double %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: 
v_lshrrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; 
implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_f64: ; VI: ; %bb.0: @@ -2637,7 +5314,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -2676,7 +5353,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2687,7 +5364,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -2721,7 +5398,7 @@ define double 
@bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s7 -; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: .LBB46_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2733,7 +5410,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -2775,7 +5452,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2787,7 +5464,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2822,7 +5499,7 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-FAKE16-NEXT: 
.LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2842,45 +5519,326 @@ end: ret double %phi } +define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v4bf16_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB47_4 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, 
v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_3: +; VI-NEXT: s_branch .LBB47_2 +; VI-NEXT: .LBB47_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, 
0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: 
bitcast_v4bf16_to_f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v4, v10 :: v_dual_add_nc_u32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v5, v6 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_3: +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16 +; 
GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_3: +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { -; GCN-LABEL: bitcast_f64_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v1 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v3, v9, v8, 24 -; GCN-NEXT: v_alignbit_b32 v2, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v9, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; 
GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_alignbit_b32 v3, v9, v8, 24 -; GCN-NEXT: v_alignbit_b32 v2, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v9, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v8 -; GCN-NEXT: v_mov_b32_e32 v4, v9 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_f64_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v3, v9, v8, 24 +; SI-NEXT: v_alignbit_b32 v2, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v9, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_alignbit_b32 v3, v9, v8, 24 +; SI-NEXT: v_alignbit_b32 v2, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v9, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v4, v9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v8i8: ; VI: ; %bb.0: @@ -2905,7 +5863,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; VI-NEXT: 
v_lshrrev_b32_e32 v1, 8, v8 ; VI-NEXT: ; %bb.2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] @@ -2914,7 +5872,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v8 ; VI-NEXT: v_mov_b32_e32 v4, v9 @@ -2943,7 +5901,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX9-NEXT: ; %bb.2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] @@ -2952,7 +5910,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, v9 @@ -2977,7 +5935,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2985,7 +5943,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v7, 24, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-TRUE16-NEXT: .LBB24_4: ; %end +; GFX11-TRUE16-NEXT: .LBB48_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h @@ -3017,7 +5975,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3027,7 +5985,7 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -3049,101 +6007,339 @@ end: ret <8 x i8> %phi } -define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; 
GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 
0x3000000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_f64_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB49_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s8, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB49_4 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[8:9], s[16:17], 1.0 +; SI-NEXT: v_alignbit_b32 v3, v9, v8, 24 +; SI-NEXT: v_alignbit_b32 v2, v9, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v9, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; SI-NEXT: s_branch .LBB49_5 +; SI-NEXT: .LBB49_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: .LBB49_4: +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: .LBB49_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v4, v9 +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v8i8_to_f64: +; VI-LABEL: bitcast_f64_to_v8i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v9, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 
-; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB25_3 -; VI-NEXT: ; %bb.1: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB25_4 -; VI-NEXT: .LBB25_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB25_3: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s9, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s5, s17, 8 +; VI-NEXT: s_lshr_b32 s11, s16, 16 +; VI-NEXT: s_lshr_b32 s10, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], s[16:17], 1.0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; VI-NEXT: s_branch .LBB49_5 +; VI-NEXT: .LBB49_3: +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v8, s16 +; VI-NEXT: v_mov_b32_e32 v9, s17 +; VI-NEXT: v_mov_b32_e32 v2, s11 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: v_mov_b32_e32 
v3, s4 +; VI-NEXT: v_mov_b32_e32 v7, s9 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .LBB49_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v8 +; VI-NEXT: v_mov_b32_e32 v4, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s9, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s5, s17, 8 +; GFX9-NEXT: s_lshr_b32 s11, s16, 16 +; GFX9-NEXT: s_lshr_b32 s10, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], s[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: s_branch .LBB49_5 +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB49_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_f64_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: 
s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[3:4], s[0:1], 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB49_5 +; GFX11-TRUE16-NEXT: .LBB49_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s6 +; GFX11-TRUE16-NEXT: .LBB49_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
bitcast_f64_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], s[0:1], 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-FAKE16-NEXT: s_branch .LBB49_5 +; GFX11-FAKE16-NEXT: .LBB49_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v1, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v6, s5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s3 +; GFX11-FAKE16-NEXT: .LBB49_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp 
= icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast double %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + +define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { +; SI-LABEL: bitcast_v8i8_to_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i8_to_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB50_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB50_4 +; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB50_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr9 @@ -3155,8 +6351,8 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 -; VI-NEXT: .LBB25_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: .LBB50_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3187,14 +6383,14 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: s_cbranch_execnz .LBB50_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB25_4 -; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB50_4 +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB25_3: ; %cmp.false +; GFX9-NEXT: .LBB50_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ 
-3210,8 +6406,8 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 -; GFX9-NEXT: .LBB25_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: .LBB50_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3244,14 +6440,14 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -3280,8 +6476,8 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-TRUE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ 
-3329,14 +6525,14 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_4 -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_4 +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB25_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -3365,8 +6561,8 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-FAKE16-NEXT: .LBB25_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -3417,21 +6613,279 @@ end: ret double %phi } +define inreg double @bitcast_v8i8_to_f64_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 
24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v8i8_to_f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; 
VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v8i8_to_f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff 
+; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-LABEL: bitcast_v8i8_to_f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-NEXT: ; %bb.1: 
; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB51_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: ; implicit-def: 
$sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + define <2 x float> @bitcast_v2i32_to_v2f32(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v2f32: ; VI: ; %bb.0: @@ -3492,21 +6946,107 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_v2i32_to_v2f32_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: 
s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v2i32_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v2i32_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-LABEL: bitcast_v2i32_to_v2f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB53_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: 
.LBB53_4: +; GFX11-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define <2 x i32> @bitcast_v2f32_to_v2i32(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v2i32: ; VI: ; %bb.0: @@ -3566,32 +7106,120 @@ end: ret <2 x i32> %phi } +define inreg <2 x i32> @bitcast_v2f32_to_v2i32_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB55_4 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: 
v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_3: +; SI-NEXT: s_branch .LBB55_2 +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: +; VI-NEXT: s_branch .LBB55_2 +; VI-NEXT: .LBB55_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_v2i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; 
GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + define <4 x i16> @bitcast_v2i32_to_v4i16(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 
16, v4 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v4i16: ; VI: ; %bb.0: @@ -3652,46 +7280,142 @@ end: ret <4 x i16> %phi } +define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v2i32_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: 
s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v2i32_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_3 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB57_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: s_branch .LBB57_2 +; +; GFX11-LABEL: bitcast_v2i32_to_v4i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_3 +; GFX11-NEXT: .LBB57_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB57_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: s_branch .LBB57_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + define <2 x i32> @bitcast_v4i16_to_v2i32(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_4 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB29_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: .LBB29_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_4 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB58_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; 
SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: .LBB58_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v2i32: ; VI: ; %bb.0: @@ -3757,48 +7481,159 @@ end: ret <2 x i32> %phi } +define inreg <2 x i32> @bitcast_v4i16_to_v2i32_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB59_2 +; 
+; VI-LABEL: bitcast_v4i16_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v4i16_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_v2i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] 
+; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + define <4 x half> @bitcast_v2i32_to_v4f16(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB30_4 -; GCN-NEXT: .LBB30_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB30_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB30_2 -; GCN-NEXT: .LBB30_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB60_4 +; SI-NEXT: .LBB60_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB60_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB60_2 +; SI-NEXT: .LBB60_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v4f16: ; VI: ; %bb.0: @@ -3859,55 +7694,156 @@ end: ret <4 x half> %phi } +define inreg <4 x half> @bitcast_v2i32_to_v4f16_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB61_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB61_3 +; SI-NEXT: .LBB61_2: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s16, 3 +; SI-NEXT: s_add_i32 s6, s17, 3 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: .LBB61_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB61_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB61_2 +; +; VI-LABEL: bitcast_v2i32_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB61_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB61_3 +; VI-NEXT: .LBB61_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB61_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB61_4: +; VI-NEXT: s_branch .LBB61_2 +; +; GFX9-LABEL: bitcast_v2i32_to_v4f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB61_3 +; GFX9-NEXT: .LBB61_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB61_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 
s[30:31] +; GFX9-NEXT: .LBB61_4: +; GFX9-NEXT: s_branch .LBB61_2 +; +; GFX11-LABEL: bitcast_v2i32_to_v4f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB61_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB61_3 +; GFX11-NEXT: .LBB61_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB61_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB61_4: +; GFX11-NEXT: s_branch .LBB61_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB31_4 -; GCN-NEXT: .LBB31_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB31_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 
-; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB31_2 -; GCN-NEXT: .LBB31_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB62_4 +; SI-NEXT: .LBB62_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB62_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; 
SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB62_2 +; SI-NEXT: .LBB62_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v2i32: ; VI: ; %bb.0: @@ -3974,44 +7910,164 @@ end: ret <2 x i32> %phi } +define inreg <2 x i32> @bitcast_v4f16_to_v2i32_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_cbranch_execnz .LBB63_3 +; SI-NEXT: .LBB63_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB63_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB63_2 +; +; VI-LABEL: bitcast_v4f16_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB63_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB63_4 +; VI-NEXT: .LBB63_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB63_3: +; VI-NEXT: s_branch .LBB63_2 +; VI-NEXT: .LBB63_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB63_4 +; GFX9-NEXT: .LBB63_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: 
.LBB63_3: +; GFX9-NEXT: s_branch .LBB63_2 +; GFX9-NEXT: .LBB63_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_v2i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB63_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB63_4 +; GFX11-NEXT: .LBB63_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB63_3: +; GFX11-NEXT: s_branch .LBB63_2 +; GFX11-NEXT: .LBB63_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + define <4 x bfloat> @bitcast_v2i32_to_v4bf16(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB32_4 -; GCN-NEXT: .LBB32_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, 
s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB32_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB32_2 -; GCN-NEXT: .LBB32_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB64_4 +; SI-NEXT: .LBB64_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB64_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB64_2 +; SI-NEXT: .LBB64_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v4bf16: ; VI: ; %bb.0: @@ -4072,51 +8128,152 @@ end: ret <4 x bfloat> %phi } +define inreg <4 x bfloat> @bitcast_v2i32_to_v4bf16_scalar(<2 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2i32_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB65_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB65_3 +; SI-NEXT: .LBB65_2: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s16, 3 +; SI-NEXT: s_add_i32 s5, s17, 3 +; SI-NEXT: s_and_b32 s6, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s5, 16 +; SI-NEXT: s_and_b32 s8, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s4, 16 +; SI-NEXT: .LBB65_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB65_4: +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB65_2 +; +; VI-LABEL: bitcast_v2i32_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB65_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB65_3 +; VI-NEXT: .LBB65_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB65_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB65_4: +; VI-NEXT: s_branch .LBB65_2 +; +; 
GFX9-LABEL: bitcast_v2i32_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB65_3 +; GFX9-NEXT: .LBB65_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB65_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB65_4: +; GFX9-NEXT: s_branch .LBB65_2 +; +; GFX11-LABEL: bitcast_v2i32_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB65_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB65_3 +; GFX11-NEXT: .LBB65_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB65_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB65_4: +; GFX11-NEXT: s_branch .LBB65_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 
1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB33_4 -; GCN-NEXT: .LBB33_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB33_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB33_2 -; GCN-NEXT: .LBB33_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB66_4 +; SI-NEXT: .LBB66_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB66_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB66_2 +; SI-NEXT: .LBB66_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v2i32: ; VI: ; %bb.0: @@ -4125,7 +8282,7 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -4164,7 +8321,7 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 
exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4175,7 +8332,7 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: s_cbranch_execz .LBB66_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -4209,7 +8366,7 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s7 -; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: .LBB66_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4221,7 +8378,7 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -4244,74 +8401,357 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; 
GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX11-TRUE16-NEXT: .LBB66_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB66_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: 
v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB66_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 
0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + +define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_v2i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB67_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: s_cbranch_execnz .LBB67_3 +; SI-NEXT: .LBB67_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB67_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB67_2 +; +; VI-LABEL: bitcast_v4bf16_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB67_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB67_4 +; VI-NEXT: .LBB67_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; 
VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB67_3: +; VI-NEXT: s_branch .LBB67_2 +; VI-NEXT: .LBB67_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB67_4 +; GFX9-NEXT: .LBB67_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; 
GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB67_3: +; GFX9-NEXT: s_branch .LBB67_2 +; GFX9-NEXT: .LBB67_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v2i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX11-TRUE16-NEXT: .LBB67_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v4, v10 :: v_dual_add_nc_u32 v7, 0x7fff, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v5, v6 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 -; GFX11-TRUE16-NEXT: .LBB33_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB67_3: +; GFX11-TRUE16-NEXT: s_branch .LBB67_2 +; GFX11-TRUE16-NEXT: .LBB67_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2i32: +; GFX11-FAKE16-LABEL: 
bitcast_v4bf16_to_v2i32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v2 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB33_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB67_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB67_4 +; GFX11-FAKE16-NEXT: .LBB67_2: ; %cmp.true +; GFX11-FAKE16-NEXT: 
s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo -; 
GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB33_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB67_3: +; GFX11-FAKE16-NEXT: s_branch .LBB67_2 +; GFX11-FAKE16-NEXT: .LBB67_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4331,46 +8771,46 @@ end: } define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v2i32_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: 
; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB34_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB34_4 -; GCN-NEXT: .LBB34_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB34_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB34_2 -; GCN-NEXT: .LBB34_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2i32_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB68_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz 
.LBB68_4 +; SI-NEXT: .LBB68_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB68_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB68_2 +; SI-NEXT: .LBB68_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2i32_to_v8i8: ; VI: ; %bb.0: @@ -4507,7 +8947,7 @@ define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB68_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 @@ -4518,7 +8958,7 @@ define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-FAKE16-NEXT: .LBB34_4: ; %end +; GFX11-FAKE16-NEXT: .LBB68_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -4540,76 +8980,298 @@ end: ret <8 x i8> %phi } +define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 
inreg %b) { +; SI-LABEL: bitcast_v2i32_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB69_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB69_3 +; SI-NEXT: .LBB69_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: .LBB69_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB69_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB69_2 +; +; VI-LABEL: bitcast_v2i32_to_v8i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB69_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s9, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB69_3 +; VI-NEXT: .LBB69_2: ; %cmp.true +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b64 s[4:5], 
s[16:17], 24 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s9, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: .LBB69_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB69_4: +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: s_branch .LBB69_2 +; +; GFX9-LABEL: bitcast_v2i32_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB69_3 +; GFX9-NEXT: .LBB69_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: .LBB69_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB69_4: +; 
GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: s_branch .LBB69_2 +; +; GFX11-TRUE16-LABEL: bitcast_v2i32_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-TRUE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-TRUE16-NEXT: .LBB69_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB69_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB69_2 +; +; GFX11-FAKE16-LABEL: bitcast_v2i32_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB69_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB69_3 +; GFX11-FAKE16-NEXT: .LBB69_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-FAKE16-NEXT: .LBB69_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB69_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB69_2 + %cmp 
= icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB35_4 -; GCN-NEXT: .LBB35_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB35_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr6 -; 
GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB35_2 -; GCN-NEXT: .LBB35_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i8_to_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB70_4 +; SI-NEXT: .LBB70_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB70_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: 
v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB70_2 +; SI-NEXT: .LBB70_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v2i32: ; VI: ; %bb.0: @@ -4623,14 +9285,14 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: s_cbranch_execnz .LBB70_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB35_4 -; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB70_4 +; VI-NEXT: .LBB70_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB35_3: ; %cmp.false +; VI-NEXT: .LBB70_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4646,8 +9308,8 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB35_2 -; VI-NEXT: .LBB35_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB70_2 +; VI-NEXT: .LBB70_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4678,14 +9340,14 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: s_cbranch_execnz .LBB70_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB35_4 -; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB70_4 +; GFX9-NEXT: .LBB70_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB35_3: ; %cmp.false +; GFX9-NEXT: .LBB70_3: ; %cmp.false ; 
GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4701,8 +9363,8 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB35_2 -; GFX9-NEXT: .LBB35_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB70_2 +; GFX9-NEXT: .LBB70_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4735,14 +9397,14 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-TRUE16-NEXT: .LBB35_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-TRUE16-NEXT: .LBB70_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -4771,8 +9433,8 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, 
s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-TRUE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -4820,14 +9482,14 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_4 -; GFX11-FAKE16-NEXT: .LBB35_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_4 +; GFX11-FAKE16-NEXT: .LBB70_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB35_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -4856,8 +9518,8 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB35_2 -; GFX11-FAKE16-NEXT: .LBB35_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB70_2 +; GFX11-FAKE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -4908,32 +9570,289 @@ end: ret <2 x i32> %phi } +define inreg <2 x i32> @bitcast_v8i8_to_v2i32_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_v2i32_scalar: +; SI: ; %bb.0: +; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB71_2 +; +; VI-LABEL: 
bitcast_v8i8_to_v2i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB71_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB71_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB71_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch 
.LBB71_2 +; +; GFX9-LABEL: bitcast_v8i8_to_v2i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB71_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 
+; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB71_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB71_2 +; +; GFX11-LABEL: bitcast_v8i8_to_v2i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 +; GFX11-NEXT: .LBB71_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; 
GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB71_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB71_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB71_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + define <4 x i16> @bitcast_v2f32_to_v4i16(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: .LBB36_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB36_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: .LBB36_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; 
implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; %bb.2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; %bb.4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4i16: ; VI: ; %bb.0: @@ -4993,46 +9912,144 @@ end: ret <4 x i16> %phi } +define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB73_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB73_4 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB73_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB73_2 +; SI-NEXT: .LBB73_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB73_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB73_4 +; VI-NEXT: 
.LBB73_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB73_3: +; VI-NEXT: s_branch .LBB73_2 +; VI-NEXT: .LBB73_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB73_4 +; GFX9-NEXT: .LBB73_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB73_3: +; GFX9-NEXT: s_branch .LBB73_2 +; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_v4i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 +; GFX11-NEXT: .LBB73_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB73_3: +; GFX11-NEXT: s_branch .LBB73_2 +; GFX11-NEXT: .LBB73_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + define <2 x float> 
@bitcast_v4i16_to_v2f32(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB37_4 -; GCN-NEXT: .LBB37_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB37_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB37_2 -; GCN-NEXT: .LBB37_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz 
.LBB74_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB74_4 +; SI-NEXT: .LBB74_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB74_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB74_2 +; SI-NEXT: .LBB74_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v2f32: ; VI: ; %bb.0: @@ -5098,48 +10115,159 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_v4i16_to_v2f32_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB75_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB75_3 +; SI-NEXT: .LBB75_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: 
s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: .LBB75_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB75_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB75_2 +; +; VI-LABEL: bitcast_v4i16_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB75_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB75_3 +; VI-NEXT: .LBB75_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB75_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB75_4: +; VI-NEXT: s_branch .LBB75_2 +; +; GFX9-LABEL: bitcast_v4i16_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB75_4 +; GFX9-NEXT: .LBB75_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB75_3: +; GFX9-NEXT: s_branch .LBB75_2 +; GFX9-NEXT: .LBB75_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_v2f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: 
s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB75_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB75_4 +; GFX11-NEXT: .LBB75_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB75_3: +; GFX11-NEXT: s_branch .LBB75_2 +; GFX11-NEXT: .LBB75_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB38_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB38_4 -; GCN-NEXT: .LBB38_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB38_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; 
GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB38_2 -; GCN-NEXT: .LBB38_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB76_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB76_4 +; SI-NEXT: .LBB76_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB76_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB76_2 +; SI-NEXT: .LBB76_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; 
SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4f16: ; VI: ; %bb.0: @@ -5155,32 +10283,135 @@ define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) { ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v2f32_to_v4f16: +; GFX9-LABEL: bitcast_v2f32_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + +define inreg <4 x half> @bitcast_v2f32_to_v4f16_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB77_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB77_3 +; SI-NEXT: .LBB77_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB77_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB77_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB77_2 +; +; VI-LABEL: bitcast_v2f32_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB77_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB77_4 +; VI-NEXT: .LBB77_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB77_3: +; VI-NEXT: s_branch .LBB77_2 +; VI-NEXT: .LBB77_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v4f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, 
s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB77_4 +; GFX9-NEXT: .LBB77_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB77_3: +; GFX9-NEXT: s_branch .LBB77_2 +; GFX9-NEXT: .LBB77_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v2f32_to_v4f16: +; GFX11-LABEL: bitcast_v2f32_to_v4f16_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB77_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB77_4 +; GFX11-NEXT: .LBB77_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB77_3: +; GFX11-NEXT: s_branch .LBB77_2 +; GFX11-NEXT: .LBB77_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5200,54 +10431,54 @@ end: } define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB39_4 -; GCN-NEXT: .LBB39_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB39_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v4, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB39_2 -; GCN-NEXT: .LBB39_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB78_4 +; SI-NEXT: .LBB78_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB78_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB78_2 +; SI-NEXT: .LBB78_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v2f32: ; VI: ; %bb.0: @@ -5314,44 +10545,164 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_v4f16_to_v2f32_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: s_cmp_lg_u32 s20, 
0 +; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_cbranch_execnz .LBB79_3 +; SI-NEXT: .LBB79_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB79_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB79_2 +; +; VI-LABEL: bitcast_v4f16_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB79_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB79_4 +; VI-NEXT: .LBB79_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB79_3: +; VI-NEXT: s_branch .LBB79_2 +; VI-NEXT: 
.LBB79_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB79_4 +; GFX9-NEXT: .LBB79_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB79_3: +; GFX9-NEXT: s_branch .LBB79_2 +; GFX9-NEXT: .LBB79_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_v2f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB79_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB79_4 +; GFX11-NEXT: .LBB79_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB79_3: +; GFX11-NEXT: s_branch .LBB79_2 +; GFX11-NEXT: .LBB79_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define <4 x bfloat> @bitcast_v2f32_to_v4bf16(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_v4bf16: -; GCN: 
; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB40_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB40_4 -; GCN-NEXT: .LBB40_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB40_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB40_2 -; GCN-NEXT: .LBB40_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB80_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB80_4 +; 
SI-NEXT: .LBB80_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB80_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB80_2 +; SI-NEXT: .LBB80_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4bf16: ; VI: ; %bb.0: @@ -5411,51 +10762,155 @@ end: ret <4 x bfloat> %phi } +define inreg <4 x bfloat> @bitcast_v2f32_to_v4bf16_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB81_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_and_b32 s8, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB81_4 +; SI-NEXT: .LBB81_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB81_3: +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB81_2 +; SI-NEXT: .LBB81_4: +; SI-NEXT: v_mov_b32_e32 v0, s9 +; 
SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB81_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB81_4 +; VI-NEXT: .LBB81_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB81_3: +; VI-NEXT: s_branch .LBB81_2 +; VI-NEXT: .LBB81_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB81_4 +; GFX9-NEXT: .LBB81_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB81_3: +; GFX9-NEXT: s_branch .LBB81_2 +; GFX9-NEXT: .LBB81_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB81_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB81_4 +; GFX11-NEXT: .LBB81_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB81_3: +; GFX11-NEXT: s_branch .LBB81_2 +; GFX11-NEXT: .LBB81_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 
s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB41_4 -; GCN-NEXT: .LBB41_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB41_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v0, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB41_2 -; GCN-NEXT: .LBB41_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 
0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB82_4 +; SI-NEXT: .LBB82_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB82_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB82_2 +; SI-NEXT: .LBB82_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] 
; ; VI-LABEL: bitcast_v4bf16_to_v2f32: ; VI: ; %bb.0: @@ -5464,7 +10919,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -5503,7 +10958,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5514,7 +10969,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: s_cbranch_execz .LBB82_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -5548,7 +11003,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s7 -; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: .LBB82_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5560,7 +11015,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -5602,7 +11057,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 -; GFX11-TRUE16-NEXT: .LBB41_2: ; %end +; GFX11-TRUE16-NEXT: .LBB82_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5614,7 +11069,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5649,7 +11104,7 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB41_2: ; %end +; GFX11-FAKE16-NEXT: .LBB82_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5669,47 +11124,330 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB83_4 +; 
SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: s_cbranch_execnz .LBB83_3 +; SI-NEXT: .LBB83_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB83_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_branch .LBB83_2 +; +; VI-LABEL: bitcast_v4bf16_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB83_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB83_4 +; VI-NEXT: .LBB83_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 
s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB83_3: +; VI-NEXT: s_branch .LBB83_2 +; VI-NEXT: .LBB83_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB83_4 +; GFX9-NEXT: .LBB83_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: 
v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB83_3: +; GFX9-NEXT: s_branch .LBB83_2 +; GFX9-NEXT: .LBB83_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v2f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB83_4 +; GFX11-TRUE16-NEXT: .LBB83_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 
+; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v4, v10 :: v_dual_add_nc_u32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v5, v6 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB83_3: +; GFX11-TRUE16-NEXT: s_branch .LBB83_2 +; GFX11-TRUE16-NEXT: .LBB83_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB83_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB83_4 +; GFX11-FAKE16-NEXT: .LBB83_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB83_3: +; GFX11-FAKE16-NEXT: s_branch .LBB83_2 +; GFX11-FAKE16-NEXT: .LBB83_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + 
+cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v2f32_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB42_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB42_4 -; GCN-NEXT: .LBB42_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB42_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB42_2 -; GCN-NEXT: .LBB42_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v2f32_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB84_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB84_4 +; SI-NEXT: .LBB84_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB84_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB84_2 +; SI-NEXT: .LBB84_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v8i8: ; VI: ; %bb.0: @@ -5845,7 +11583,7 @@ define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB84_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5855,7 +11593,7 @@ define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-FAKE16-NEXT: .LBB42_4: ; %end +; GFX11-FAKE16-NEXT: .LBB84_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -5877,76 +11615,317 @@ end: ret <8 x i8> %phi } +define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v2f32_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s18, 0 +; SI-NEXT: s_cbranch_scc0 .LBB85_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB85_4 +; SI-NEXT: .LBB85_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB85_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB85_2 +; SI-NEXT: .LBB85_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: 
v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_v8i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB85_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s5, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s9, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB85_4 +; VI-NEXT: .LBB85_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v9, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v8, s16, 1.0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; VI-NEXT: s_branch .LBB85_5 +; VI-NEXT: .LBB85_3: +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: s_branch .LBB85_2 +; VI-NEXT: .LBB85_4: +; VI-NEXT: v_mov_b32_e32 v8, s16 +; VI-NEXT: v_mov_b32_e32 v9, s17 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB85_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v8 +; VI-NEXT: v_mov_b32_e32 v4, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: 
s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB85_4 +; GFX9-NEXT: .LBB85_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v9, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v8, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: s_branch .LBB85_5 +; GFX9-NEXT: .LBB85_3: +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: s_branch .LBB85_2 +; GFX9-NEXT: .LBB85_4: +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB85_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v2f32_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-TRUE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-TRUE16-NEXT: 
v_add_f32_e64 v4, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s0, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB85_5 +; GFX11-TRUE16-NEXT: .LBB85_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB85_2 +; GFX11-TRUE16-NEXT: .LBB85_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: .LBB85_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v2f32_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB85_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, 
s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB85_4 +; GFX11-FAKE16-NEXT: .LBB85_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s0, 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-FAKE16-NEXT: s_branch .LBB85_5 +; GFX11-FAKE16-NEXT: .LBB85_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB85_2 +; GFX11-FAKE16-NEXT: .LBB85_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-FAKE16-NEXT: .LBB85_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + define <2 x 
float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB43_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB43_4 -; GCN-NEXT: .LBB43_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB43_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB43_2 -; GCN-NEXT: .LBB43_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 
-; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v10, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i8_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB86_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB86_4 +; SI-NEXT: .LBB86_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB86_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 
+; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB86_2 +; SI-NEXT: .LBB86_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_or_b32_e32 v0, v10, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v2f32: ; VI: ; %bb.0: @@ -5960,14 +11939,14 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: s_cbranch_execnz .LBB86_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_4 -; VI-NEXT: .LBB43_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB86_4 +; VI-NEXT: .LBB86_2: ; %end ; VI-NEXT: 
s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB43_3: ; %cmp.false +; VI-NEXT: .LBB86_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5983,8 +11962,8 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB43_2 -; VI-NEXT: .LBB43_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB86_2 +; VI-NEXT: .LBB86_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6015,14 +11994,14 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: s_cbranch_execnz .LBB86_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_4 -; GFX9-NEXT: .LBB43_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB86_4 +; GFX9-NEXT: .LBB86_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB43_3: ; %cmp.false +; GFX9-NEXT: .LBB86_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -6038,8 +12017,8 @@ define <2 x float> 
@bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB43_2 -; GFX9-NEXT: .LBB43_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB86_2 +; GFX9-NEXT: .LBB86_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6072,14 +12051,14 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-TRUE16-NEXT: .LBB43_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-TRUE16-NEXT: .LBB86_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -6108,8 +12087,8 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-TRUE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -6157,14 
+12136,14 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_4 -; GFX11-FAKE16-NEXT: .LBB43_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_4 +; GFX11-FAKE16-NEXT: .LBB86_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB43_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -6193,8 +12172,8 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB43_2 -; GFX11-FAKE16-NEXT: .LBB43_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB86_2 +; GFX11-FAKE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -6245,50 +12224,309 @@ end: ret <2 x float> %phi } +define inreg <2 x float> @bitcast_v8i8_to_v2f32_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_v2f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB87_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; 
SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB87_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_branch .LBB87_2 +; +; VI-LABEL: bitcast_v8i8_to_v2f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB87_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: 
s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB87_3 +; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB87_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB87_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB87_2 +; +; GFX9-LABEL: bitcast_v8i8_to_v2f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; 
GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB87_3 +; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB87_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB87_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB87_2 +; +; GFX11-LABEL: bitcast_v8i8_to_v2f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: 
s_cbranch_scc0 .LBB87_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB87_3 +; GFX11-NEXT: .LBB87_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB87_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: 
.LBB87_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB87_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + define <4 x half> @bitcast_v4i16_to_v4f16(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v3 -; GCN-NEXT: v_mov_b32_e32 v5, v2 -; GCN-NEXT: v_mov_b32_e32 v6, v1 -; GCN-NEXT: v_mov_b32_e32 v7, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB44_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB44_4 -; GCN-NEXT: .LBB44_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB44_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB44_2 -; GCN-NEXT: .LBB44_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB88_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB88_4 +; SI-NEXT: .LBB88_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB88_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: .LBB88_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v4f16: ; VI: ; %bb.0: @@ -6354,40 +12592,148 @@ end: ret <4 x half> %phi } +define inreg <4 x half> @bitcast_v4i16_to_v4f16_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_v4f16_scalar: +; SI: ; %bb.0: +; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB89_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: s_cbranch_execnz .LBB89_3 +; SI-NEXT: .LBB89_2: ; %cmp.true +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: .LBB89_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB89_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB89_2 +; +; VI-LABEL: bitcast_v4i16_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB89_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB89_3 +; VI-NEXT: .LBB89_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB89_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB89_4: +; VI-NEXT: s_branch .LBB89_2 +; +; GFX9-LABEL: bitcast_v4i16_to_v4f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: 
s_cbranch_execnz .LBB89_4 +; GFX9-NEXT: .LBB89_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB89_3: +; GFX9-NEXT: s_branch .LBB89_2 +; GFX9-NEXT: .LBB89_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_v4f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB89_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB89_4 +; GFX11-NEXT: .LBB89_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB89_3: +; GFX11-NEXT: s_branch .LBB89_2 +; GFX11-NEXT: .LBB89_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + define <4 x i16> @bitcast_v4f16_to_v4i16(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz 
.LBB45_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: .LBB45_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB90_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, 
v1, 16 +; SI-NEXT: .LBB90_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v4i16: ; VI: ; %bb.0: @@ -6454,48 +12800,164 @@ end: ret <4 x i16> %phi } +define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB91_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB91_3 +; SI-NEXT: .LBB91_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB91_4: +; SI-NEXT: s_branch .LBB91_2 +; +; VI-LABEL: bitcast_v4f16_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB91_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB91_4 +; VI-NEXT: .LBB91_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; 
VI-NEXT: v_add_f16_e32 v2, s16, v0 +; VI-NEXT: v_add_f16_sdwa v3, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v2, v3 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB91_3: +; VI-NEXT: s_branch .LBB91_2 +; VI-NEXT: .LBB91_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB91_4 +; GFX9-NEXT: .LBB91_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB91_3: +; GFX9-NEXT: s_branch .LBB91_2 +; GFX9-NEXT: .LBB91_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_v4i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB91_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB91_4 +; GFX11-NEXT: .LBB91_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB91_3: +; GFX11-NEXT: s_branch .LBB91_2 +; GFX11-NEXT: .LBB91_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, 
label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + define <4 x bfloat> @bitcast_v4i16_to_v4bf16(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v6, v2 -; GCN-NEXT: v_mov_b32_e32 v5, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB46_4 -; GCN-NEXT: .LBB46_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB46_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB46_2 -; GCN-NEXT: .LBB46_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v3, v0 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; 
GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB92_4 +; SI-NEXT: .LBB92_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB92_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_2 +; SI-NEXT: .LBB92_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v4bf16: ; VI: ; %bb.0: @@ -6561,56 +13023,174 @@ end: ret <4 x bfloat> %phi } +define inreg <4 x bfloat> @bitcast_v4i16_to_v4bf16_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 
.LBB93_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s9, s18, 16 +; SI-NEXT: s_lshl_b32 s8, s19, 16 +; SI-NEXT: s_cbranch_execnz .LBB93_3 +; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_lshl_b32 s6, s17, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_and_b32 s7, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_and_b32 s8, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s4, 16 +; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB93_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB93_2 +; +; VI-LABEL: bitcast_v4i16_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB93_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB93_3 +; VI-NEXT: .LBB93_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB93_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB93_4: +; VI-NEXT: s_branch .LBB93_2 +; +; 
GFX9-LABEL: bitcast_v4i16_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB93_4 +; GFX9-NEXT: .LBB93_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB93_3: +; GFX9-NEXT: s_branch .LBB93_2 +; GFX9-NEXT: .LBB93_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB93_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB93_4 +; GFX11-NEXT: .LBB93_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB93_3: +; GFX11-NEXT: s_branch .LBB93_2 +; GFX11-NEXT: .LBB93_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 
1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB47_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB47_4 -; GCN-NEXT: .LBB47_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB47_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB47_2 -; GCN-NEXT: .LBB47_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 
v5, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB94_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB94_4 +; SI-NEXT: .LBB94_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB94_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB94_2 +; SI-NEXT: .LBB94_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v4i16: ; VI: ; %bb.0: @@ -6619,7 +13199,7 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB47_2 +; 
VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -6658,7 +13238,7 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: .LBB47_2: ; %end +; VI-NEXT: .LBB94_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6669,7 +13249,7 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -6703,7 +13283,7 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s6 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 -; GFX9-NEXT: .LBB47_2: ; %end +; GFX9-NEXT: .LBB94_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6715,7 +13295,7 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 @@ -6760,7 +13340,7 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; 
GFX11-TRUE16-NEXT: .LBB47_2: ; %end +; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6772,44 +13352,318 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 
v10, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB94_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ 
%a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + +define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s19 +; SI-NEXT: s_cbranch_scc0 .LBB95_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: s_cbranch_execnz .LBB95_3 +; SI-NEXT: .LBB95_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB95_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB95_2 +; +; VI-LABEL: bitcast_v4bf16_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB95_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB95_4 +; VI-NEXT: .LBB95_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 
+; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB95_3: +; VI-NEXT: s_branch .LBB95_2 +; VI-NEXT: .LBB95_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB95_4 +; GFX9-NEXT: .LBB95_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff0000 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v0, v3, v4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB95_3: +; GFX9-NEXT: s_branch .LBB95_2 +; GFX9-NEXT: .LBB95_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v4i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; 
GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-TRUE16-NEXT: .LBB95_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 
16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB95_3: +; GFX11-TRUE16-NEXT: s_branch .LBB95_2 +; GFX11-TRUE16-NEXT: .LBB95_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v4i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB95_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB95_4 +; GFX11-FAKE16-NEXT: .LBB95_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 
0x400000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB47_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB95_3: +; GFX11-FAKE16-NEXT: s_branch .LBB95_2 +; GFX11-FAKE16-NEXT: .LBB95_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -6829,66 +13683,66 @@ end: } define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v4i16_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v3 -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v10 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB48_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB48_4 -; GCN-NEXT: .LBB48_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB48_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v11 -; GCN-NEXT: v_or_b32_e32 v4, v1, v12 
-; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_bfe_u32 v7, v10, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB48_2 -; GCN-NEXT: .LBB48_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v0, v11, v0 -; GCN-NEXT: v_or_b32_e32 v1, v12, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v1 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4i16_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB96_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB96_4 +; 
SI-NEXT: .LBB96_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB96_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v4, v1, v11 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; SI-NEXT: v_bfe_u32 v7, v10, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB96_2 +; SI-NEXT: .LBB96_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4i16_to_v8i8: ; VI: ; %bb.0: @@ -6914,7 +13768,7 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: ; %bb.2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v2, 3 ; VI-NEXT: v_add_u16_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -6929,7 +13783,7 @@ define <8 x i8> 
@bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v9 ; VI-NEXT: v_mov_b32_e32 v1, v4 @@ -7033,7 +13887,7 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB96_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] @@ -7044,7 +13898,7 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -7066,84 +13920,347 @@ end: ret <8 x i8> %phi } +define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4i16_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB97_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v3, s7, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s7, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 8 +; SI-NEXT: s_lshr_b32 s9, s7, 
8 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_bfe_u32 s8, s19, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB97_3 +; SI-NEXT: .LBB97_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v3, s7, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s7, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 8 +; SI-NEXT: s_lshr_b32 s8, s7, 24 +; SI-NEXT: s_lshr_b32 s10, s7, 16 +; SI-NEXT: s_lshr_b32 s9, s7, 8 +; SI-NEXT: .LBB97_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB97_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB97_2 +; +; VI-LABEL: bitcast_v4i16_to_v8i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB97_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s10, s17, 24 +; VI-NEXT: s_lshr_b32 s8, s17, 16 +; VI-NEXT: s_lshr_b32 s5, s17, 8 +; VI-NEXT: s_lshr_b32 s11, s16, 16 +; VI-NEXT: s_lshr_b32 s12, s16, 8 +; VI-NEXT: s_mov_b32 s9, s17 +; VI-NEXT: s_cbranch_execnz .LBB97_3 +; VI-NEXT: .LBB97_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s5, s17, 16 +; VI-NEXT: s_add_i32 s9, s17, 3 +; VI-NEXT: s_add_i32 s8, s5, 3 +; VI-NEXT: s_and_b32 s4, s9, 
0xffff +; VI-NEXT: s_lshl_b32 s5, s8, 16 +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s5, s16, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s6, s4, 0x30000 +; VI-NEXT: s_lshr_b64 s[4:5], s[6:7], 24 +; VI-NEXT: s_lshr_b32 s5, s7, 8 +; VI-NEXT: s_lshr_b32 s11, s6, 16 +; VI-NEXT: s_lshr_b32 s12, s6, 8 +; VI-NEXT: s_bfe_u32 s10, s8, 0x80008 +; VI-NEXT: .LBB97_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: v_mov_b32_e32 v2, s11 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s9 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v7, s10 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB97_4: +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: s_branch .LBB97_2 +; +; GFX9-LABEL: bitcast_v4i16_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB97_4 +; GFX9-NEXT: .LBB97_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: s_branch 
.LBB97_5 +; GFX9-NEXT: .LBB97_3: +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: s_branch .LBB97_2 +; GFX9-NEXT: .LBB97_4: +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB97_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4i16_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; 
GFX11-TRUE16-NEXT: s_branch .LBB97_5 +; GFX11-TRUE16-NEXT: .LBB97_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB97_2 +; GFX11-TRUE16-NEXT: .LBB97_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: .LBB97_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4i16_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB97_4 +; GFX11-FAKE16-NEXT: .LBB97_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-FAKE16-NEXT: s_branch .LBB97_5 +; GFX11-FAKE16-NEXT: .LBB97_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB97_2 +; GFX11-FAKE16-NEXT: .LBB97_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-FAKE16-NEXT: .LBB97_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v2 -; GCN-NEXT: v_mov_b32_e32 v10, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; 
GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB49_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB49_4 -; GCN-NEXT: .LBB49_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB49_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v4, v7, v1 -; GCN-NEXT: v_or_b32_e32 v1, v5, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v2, v0, v4 -; GCN-NEXT: v_or_b32_e32 v0, v3, v1 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB49_2 -; GCN-NEXT: .LBB49_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v8, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v11, v2 -; GCN-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v5, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v1 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i8_to_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v1 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB98_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB98_4 +; SI-NEXT: .LBB98_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB98_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v5, v1 +; SI-NEXT: v_or_b32_e32 v2, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_and_b32_e32 v4, 
0xffff, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB98_2 +; SI-NEXT: .LBB98_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v9 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v4i16: ; VI: ; %bb.0: @@ -7157,14 +14274,14 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: s_cbranch_execnz .LBB98_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; 
VI-NEXT: s_cbranch_execnz .LBB49_4 -; VI-NEXT: .LBB49_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB98_4 +; VI-NEXT: .LBB98_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB49_3: ; %cmp.false +; VI-NEXT: .LBB98_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -7180,8 +14297,8 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_2 -; VI-NEXT: .LBB49_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB98_2 +; VI-NEXT: .LBB98_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -7212,14 +14329,14 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: s_cbranch_execnz .LBB98_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB49_4 -; GFX9-NEXT: .LBB49_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB98_4 +; GFX9-NEXT: .LBB98_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB49_3: ; %cmp.false +; GFX9-NEXT: .LBB98_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -7235,8 +14352,8 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB49_2 -; GFX9-NEXT: .LBB49_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB98_2 +; GFX9-NEXT: .LBB98_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -7269,14 +14386,14 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-TRUE16-NEXT: .LBB49_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-TRUE16-NEXT: .LBB98_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -7305,8 +14422,8 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-TRUE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 
v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -7354,14 +14471,14 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_4 -; GFX11-FAKE16-NEXT: .LBB49_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB98_4 +; GFX11-FAKE16-NEXT: .LBB98_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB49_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -7390,8 +14507,8 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-FAKE16-NEXT: .LBB49_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 +; GFX11-FAKE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -7442,58 +14559,327 @@ end: ret <4 x i16> %phi } +define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_v4i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB99_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 
8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_or_b32 s4, s6, s4 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s8, s17, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 16 +; SI-NEXT: s_or_b32 s6, s6, s4 +; SI-NEXT: s_lshr_b32 s8, s5, 16 +; SI-NEXT: s_cbranch_execnz .LBB99_3 +; SI-NEXT: .LBB99_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s7, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 16 +; SI-NEXT: s_lshr_b32 s8, s7, 16 +; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB99_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; 
SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: s_branch .LBB99_2 +; +; VI-LABEL: bitcast_v8i8_to_v4i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB99_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB99_3 +; VI-NEXT: .LBB99_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB99_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; 
VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB99_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB99_2 +; +; GFX9-LABEL: bitcast_v8i8_to_v4i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB99_3 +; GFX9-NEXT: .LBB99_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 
s5, s5, s6 +; GFX9-NEXT: .LBB99_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB99_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB99_2 +; +; GFX11-LABEL: bitcast_v8i8_to_v4i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB99_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB99_3 +; GFX11-NEXT: .LBB99_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; 
GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB99_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB99_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB99_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + define <4 x bfloat> @bitcast_v4f16_to_v4bf16(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB50_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB50_4 -; GCN-NEXT: .LBB50_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB50_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GCN-NEXT: ; implicit-def: $vgpr7 -; 
GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_2 -; GCN-NEXT: .LBB50_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB100_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB100_4 +; SI-NEXT: .LBB100_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB100_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: 
$vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB100_2 +; SI-NEXT: .LBB100_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v4bf16: ; VI: ; %bb.0: @@ -7560,62 +14946,185 @@ end: ret <4 x bfloat> %phi } +define inreg <4 x bfloat> @bitcast_v4f16_to_v4bf16_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s19 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB101_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: s_cbranch_execnz .LBB101_3 +; SI-NEXT: .LBB101_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 
0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: .LBB101_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB101_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB101_2 +; +; VI-LABEL: bitcast_v4f16_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB101_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB101_4 +; VI-NEXT: .LBB101_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_f16_e32 v2, s16, v0 +; VI-NEXT: v_add_f16_sdwa v3, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v2, v3 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB101_3: +; VI-NEXT: s_branch .LBB101_2 +; VI-NEXT: .LBB101_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB101_4 +; GFX9-NEXT: .LBB101_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 
v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB101_3: +; GFX9-NEXT: s_branch .LBB101_2 +; GFX9-NEXT: .LBB101_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB101_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccnz .LBB101_4 +; GFX11-NEXT: .LBB101_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB101_3: +; GFX11-NEXT: s_branch .LBB101_2 +; GFX11-NEXT: .LBB101_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB51_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB51_4 -; GCN-NEXT: .LBB51_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB51_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_2 -; GCN-NEXT: .LBB51_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; SI-NEXT: 
v_mul_f32_e32 v7, 1.0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB102_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB102_4 +; SI-NEXT: .LBB102_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB102_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB102_2 +; SI-NEXT: .LBB102_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v4f16: ; VI: ; %bb.0: @@ -7624,7 +15133,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> 
%a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: s_cbranch_execz .LBB102_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -7663,7 +15172,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7674,7 +15183,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: s_cbranch_execz .LBB102_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -7708,7 +15217,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_mov_b32 s6, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s6 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 -; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: .LBB102_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7720,7 +15229,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -7762,7 +15271,7 @@ define <4 x half> 
@bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0 -; GFX11-TRUE16-NEXT: .LBB51_2: ; %end +; GFX11-TRUE16-NEXT: .LBB102_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7774,7 +15283,7 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -7794,24 +15303,316 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: .LBB102_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + +define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s19 +; SI-NEXT: s_cbranch_scc0 .LBB103_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_cbranch_execnz .LBB103_3 +; SI-NEXT: .LBB103_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB103_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_branch .LBB103_2 +; +; VI-LABEL: bitcast_v4bf16_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB103_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB103_4 +; VI-NEXT: .LBB103_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB103_3: +; VI-NEXT: s_branch .LBB103_2 +; VI-NEXT: .LBB103_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v4f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB103_4 +; GFX9-NEXT: .LBB103_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v3, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB103_3: +; GFX9-NEXT: s_branch .LBB103_2 +; GFX9-NEXT: .LBB103_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v4f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-TRUE16-NEXT: .LBB103_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 
0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; 
GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v8 :: v_dual_and_b32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB103_3: +; GFX11-TRUE16-NEXT: s_branch .LBB103_2 +; GFX11-TRUE16-NEXT: .LBB103_4: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v4f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB103_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB103_4 +; GFX11-FAKE16-NEXT: .LBB103_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; 
GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB51_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v4, v8 :: v_dual_and_b32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB103_3: +; GFX11-FAKE16-NEXT: s_branch .LBB103_2 +; GFX11-FAKE16-NEXT: .LBB103_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7831,69 +15632,69 @@ end: } define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v4f16_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB52_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB52_4 -; GCN-NEXT: .LBB52_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB52_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: v_or_b32_e32 v0, v9, v0 -; GCN-NEXT: v_or_b32_e32 v4, v8, v1 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: ; 
implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_2 -; GCN-NEXT: .LBB52_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v4, v2, v3 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4f16_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB104_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB104_4 +; SI-NEXT: .LBB104_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; 
SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB104_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_or_b32_e32 v4, v8, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB104_2 +; SI-NEXT: .LBB104_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f16_to_v8i8: ; VI: ; %bb.0: @@ -7916,7 +15717,7 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; VI-NEXT: ; %bb.2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: s_cbranch_execz .LBB104_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v0, 0x200 ; VI-NEXT: v_add_f16_sdwa v6, v9, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7931,7 +15732,7 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: .LBB104_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v8 ; VI-NEXT: v_mov_b32_e32 v4, v9 @@ -7960,7 +15761,7 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX9-NEXT: ; %bb.2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: s_cbranch_execz .LBB104_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] @@ -7971,7 +15772,7 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX9-NEXT: .LBB52_4: ; %end +; GFX9-NEXT: .LBB104_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, v9 @@ -8036,7 +15837,7 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB104_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] @@ -8047,7 +15848,7 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX11-FAKE16-NEXT: .LBB52_4: ; %end +; GFX11-FAKE16-NEXT: 
.LBB104_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -8069,76 +15870,337 @@ end: ret <8 x i8> %phi } +define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4f16_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: s_cbranch_scc0 .LBB105_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_or_b32_e32 v4, v8, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB105_3 +; SI-NEXT: .LBB105_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: 
.LBB105_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_branch .LBB105_2 +; +; VI-LABEL: bitcast_v4f16_to_v8i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB105_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s8, s17, 24 +; VI-NEXT: s_lshr_b32 s10, s17, 16 +; VI-NEXT: s_lshr_b32 s5, s17, 8 +; VI-NEXT: s_lshr_b32 s11, s16, 16 +; VI-NEXT: s_lshr_b32 s9, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB105_4 +; VI-NEXT: .LBB105_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_add_f16_e32 v6, s4, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; VI-NEXT: v_add_f16_e32 v8, s17, v0 +; VI-NEXT: v_add_f16_e32 v2, s4, v0 +; VI-NEXT: v_or_b32_e32 v10, v8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v9, v0, v1 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v9 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: v_mov_b32_e32 v4, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB105_3: +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: s_branch .LBB105_2 +; VI-NEXT: .LBB105_4: +; VI-NEXT: v_mov_b32_e32 v2, s11 +; VI-NEXT: v_mov_b32_e32 v6, s10 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; 
VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s5, s17, 24 +; GFX9-NEXT: s_lshr_b32 s8, s17, 16 +; GFX9-NEXT: s_lshr_b32 s9, s17, 8 +; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s11, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB105_4 +; GFX9-NEXT: .LBB105_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: s_branch .LBB105_5 +; GFX9-NEXT: .LBB105_3: +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: s_branch .LBB105_2 +; GFX9-NEXT: .LBB105_4: +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB105_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4f16_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 
.LBB105_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-TRUE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB105_5 +; GFX11-TRUE16-NEXT: .LBB105_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB105_2 +; GFX11-TRUE16-NEXT: .LBB105_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: .LBB105_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 
v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4f16_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB105_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB105_4 +; GFX11-FAKE16-NEXT: .LBB105_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-FAKE16-NEXT: s_branch .LBB105_5 +; GFX11-FAKE16-NEXT: .LBB105_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: s_branch .LBB105_2 +; GFX11-FAKE16-NEXT: .LBB105_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, s3 +; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-FAKE16-NEXT: .LBB105_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v2 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB53_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB53_4 -; GCN-NEXT: .LBB53_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB53_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v11 -; GCN-NEXT: v_or_b32_e32 v2, v2, v8 -; GCN-NEXT: v_or_b32_e32 v3, v3, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_2 -; GCN-NEXT: .LBB53_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v9 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v5, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v1 -; GCN-NEXT: v_or_b32_e32 v2, v11, v2 -; GCN-NEXT: v_or_b32_e32 v3, v12, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x300, v0 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x300, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i8_to_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v2 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB106_3 +; SI-NEXT: ; %bb.1: ; %Flow +; 
SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB106_4 +; SI-NEXT: .LBB106_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB106_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB106_2 +; SI-NEXT: .LBB106_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v4f16: ; VI: ; %bb.0: 
@@ -8152,14 +16214,14 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: s_cbranch_execnz .LBB106_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB53_4 -; VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB106_4 +; VI-NEXT: .LBB106_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB53_3: ; %cmp.false +; VI-NEXT: .LBB106_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -8175,8 +16237,8 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB53_2 -; VI-NEXT: .LBB53_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB106_2 +; VI-NEXT: .LBB106_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -8207,14 +16269,14 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: s_cbranch_execnz .LBB106_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB53_4 -; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB106_4 +; GFX9-NEXT: .LBB106_2: ; %end ; 
GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB53_3: ; %cmp.false +; GFX9-NEXT: .LBB106_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -8230,8 +16292,8 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_2 -; GFX9-NEXT: .LBB53_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB106_2 +; GFX9-NEXT: .LBB106_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -8264,14 +16326,14 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-TRUE16-NEXT: .LBB53_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-TRUE16-NEXT: .LBB106_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -8300,8 +16362,8 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> 
%a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-TRUE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -8349,14 +16411,14 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_4 -; GFX11-FAKE16-NEXT: .LBB53_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB106_4 +; GFX11-FAKE16-NEXT: .LBB106_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB53_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -8385,8 +16447,8 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB53_2 -; GFX11-FAKE16-NEXT: .LBB53_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB106_2 +; GFX11-FAKE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -8437,68 +16499,324 @@ end: 
ret <4 x half> %phi } +define inreg <4 x half> @bitcast_v8i8_to_v4f16_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_v4f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB107_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_cbranch_execnz .LBB107_3 +; SI-NEXT: .LBB107_2: ; %cmp.true +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s7, s19, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_lshl_b32 s8, s17, 8 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: .LBB107_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB107_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: 
$vgpr3 +; SI-NEXT: s_branch .LBB107_2 +; +; VI-LABEL: bitcast_v8i8_to_v4f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB107_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB107_3 +; VI-NEXT: .LBB107_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB107_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB107_4: +; 
VI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB107_2 +; +; GFX9-LABEL: bitcast_v8i8_to_v4f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB107_3 +; GFX9-NEXT: .LBB107_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB107_3: ; %end +; 
GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB107_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB107_2 +; +; GFX11-LABEL: bitcast_v8i8_to_v4f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB107_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX11-NEXT: .LBB107_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; 
GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB107_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB107_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB107_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v4bf16_to_v8i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v2 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB54_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB54_4 -; GCN-NEXT: .LBB54_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB54_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; 
GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; GCN-NEXT: v_alignbit_b32 v0, v0, v10, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_2 -; GCN-NEXT: .LBB54_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v2, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v4bf16_to_v8i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB108_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB108_4 +; SI-NEXT: .LBB108_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB108_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v9, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB108_2 +; SI-NEXT: .LBB108_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4bf16_to_v8i8: ; VI: ; %bb.0: @@ -8523,7 +16841,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; VI-NEXT: ; %bb.2: ; 
%Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_4 +; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -8569,7 +16887,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; VI-NEXT: .LBB54_4: ; %end +; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v8 ; VI-NEXT: v_mov_b32_e32 v4, v9 @@ -8598,7 +16916,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX9-NEXT: ; %bb.2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_4 +; GFX9-NEXT: s_cbranch_execz .LBB108_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v8 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -8640,7 +16958,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v11 -; GFX9-NEXT: .LBB54_4: ; %end +; GFX9-NEXT: .LBB108_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, v9 @@ -8660,7 +16978,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v2 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[8:9] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 @@ -8669,9 +16987,9 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v0.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v9.h -; GFX11-TRUE16-NEXT: .LBB54_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v8.l @@ -8715,7 +17033,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[8:9] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; GFX11-TRUE16-NEXT: .LBB54_4: ; %end +; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h @@ -8748,7 +17066,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-FAKE16-NEXT: ; %bb.2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8792,7 +17110,7 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 -; GFX11-FAKE16-NEXT: .LBB54_4: ; %end +; GFX11-FAKE16-NEXT: .LBB108_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v8 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v9 @@ -8814,80 +17132,482 @@ end: ret <8 x i8> %phi } +define inreg <8 x 
i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v4bf16_to_v8i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s20, 0 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: s_cbranch_scc0 .LBB109_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v9, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_cbranch_execnz .LBB109_3 +; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; 
SI-NEXT: s_branch .LBB109_2 +; +; VI-LABEL: bitcast_v4bf16_to_v8i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s18, 0 +; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s8, s17, 24 +; VI-NEXT: s_lshr_b32 s5, s17, 16 +; VI-NEXT: s_lshr_b32 s9, s17, 8 +; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: .LBB109_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 
v2, v6, v3, 16 +; VI-NEXT: v_alignbit_b32 v1, v0, v4, 16 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_mov_b32_e32 v4, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB109_3: +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: s_branch .LBB109_2 +; VI-NEXT: .LBB109_4: +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v8i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s9, s17, 24 +; GFX9-NEXT: s_lshr_b32 s11, s17, 16 +; GFX9-NEXT: s_lshr_b32 s10, s17, 8 +; GFX9-NEXT: s_lshr_b32 s8, s16, 16 +; GFX9-NEXT: s_lshr_b32 s5, s16, 8 +; GFX9-NEXT: s_cbranch_execnz .LBB109_4 +; GFX9-NEXT: .LBB109_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 
+; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v9, v2, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v10 +; GFX9-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB109_3: +; GFX9-NEXT: ; implicit-def: $sgpr5 +; GFX9-NEXT: ; implicit-def: $sgpr8 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: s_branch .LBB109_2 +; GFX9-NEXT: .LBB109_4: +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, 
s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v8i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s0, 8 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-TRUE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s0 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s0, 0, s1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v2 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v4, v10 :: v_dual_add_nc_u32 v7, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v6, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[10:11] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; 
GFX11-TRUE16-NEXT: .LBB109_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB109_2 +; GFX11-TRUE16-NEXT: .LBB109_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v8i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB109_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB109_4 +; GFX11-FAKE16-NEXT: .LBB109_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s1, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; 
GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v6, v6, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v6, v10 :: v_dual_add_nc_u32 v3, v3, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v2, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v6, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v10 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB109_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr5 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr2 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr7 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB109_2 +; GFX11-FAKE16-NEXT: .LBB109_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v1, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v8i8_to_v4bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v10, v1 -; GCN-NEXT: v_mov_b32_e32 v9, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; 
GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v8, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v7, v4 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: .LBB55_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v10 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v11, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v7, 
v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 -; GCN-NEXT: v_or_b32_e32 v2, v8, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GCN-NEXT: .LBB55_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v5 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v8i8_to_v4bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v10 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v3, v5, v2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; 
implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: .LBB110_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB110_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v10 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: .LBB110_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v7 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8i8_to_v4bf16: ; VI: ; %bb.0: @@ -8901,14 +17621,14 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: s_cbranch_execnz .LBB110_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB55_4 -; VI-NEXT: .LBB55_2: ; %end +; VI-NEXT: s_cbranch_execnz 
.LBB110_4 +; VI-NEXT: .LBB110_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB55_3: ; %cmp.false +; VI-NEXT: .LBB110_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -8924,8 +17644,8 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr6 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB55_2 -; VI-NEXT: .LBB55_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB110_2 +; VI-NEXT: .LBB110_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v9 ; VI-NEXT: v_add_u16_e32 v1, 3, v2 ; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -8956,14 +17676,14 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: s_cbranch_execnz .LBB110_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB55_4 -; GFX9-NEXT: .LBB55_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB110_4 +; GFX9-NEXT: .LBB110_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB55_3: ; %cmp.false +; GFX9-NEXT: .LBB110_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD @@ -8979,8 +17699,8 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB55_2 -; GFX9-NEXT: .LBB55_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB110_2 +; GFX9-NEXT: .LBB110_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -9013,14 +17733,14 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-TRUE16-NEXT: .LBB55_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-TRUE16-NEXT: .LBB110_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l @@ -9049,8 +17769,8 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-TRUE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 
v4.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 @@ -9098,14 +17818,14 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_4 -; GFX11-FAKE16-NEXT: .LBB55_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB110_4 +; GFX11-FAKE16-NEXT: .LBB110_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB55_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -9134,8 +17854,8 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB55_2 -; GFX11-FAKE16-NEXT: .LBB55_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB110_2 +; GFX11-FAKE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v9, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v2, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -9185,3 +17905,269 @@ end: %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <4 x bfloat> %phi } + +define inreg <4 x bfloat> @bitcast_v8i8_to_v4bf16_scalar(<8 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v8i8_to_v4bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s24, 0 +; SI-NEXT: s_cbranch_scc0 .LBB111_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 
0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_or_b32 s9, s5, s4 +; SI-NEXT: s_cbranch_execnz .LBB111_3 +; SI-NEXT: .LBB111_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s16, 0xff +; SI-NEXT: s_lshl_b32 s6, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s18, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_and_b32 s7, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_and_b32 s9, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: .LBB111_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB111_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; 
SI-NEXT: s_branch .LBB111_2 +; +; VI-LABEL: bitcast_v8i8_to_v4bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s24, 0 +; VI-NEXT: s_cbranch_scc0 .LBB111_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_cbranch_execnz .LBB111_3 +; VI-NEXT: .LBB111_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: .LBB111_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB111_4: +; VI-NEXT: ; 
implicit-def: $sgpr4_sgpr5 +; VI-NEXT: s_branch .LBB111_2 +; +; GFX9-LABEL: bitcast_v8i8_to_v4bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s24, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_cbranch_execnz .LBB111_3 +; GFX9-NEXT: .LBB111_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: .LBB111_3: ; %end +; GFX9-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB111_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_branch .LBB111_2 +; +; GFX11-LABEL: bitcast_v8i8_to_v4bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB111_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s7, s8 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX11-NEXT: .LBB111_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s3, 8 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s17, 8 +; GFX11-NEXT: s_and_b32 s4, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s19, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: 
s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_or_b32 s5, s2, s3 +; GFX11-NEXT: .LBB111_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB111_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: s_branch .LBB111_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 08590a3af70f5..d8fe5f27e9ac8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -1,46 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <22 x float> @bitcast_v22i32_to_v22f32(<22 x i32> %a, 
i32 %b) { -; GCN-LABEL: bitcast_v22i32_to_v22f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i32_to_v22f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: 
v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i32_to_v22f32: ; VI: ; %bb.0: @@ -164,41 +164,286 @@ end: ret <22 x float> %phi } +define inreg <22 x float> @bitcast_v22i32_to_v22f32_scalar(<22 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i32_to_v22f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: 
s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v22i32_to_v22f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, 
s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v22i32_to_v22f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; 
GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v22i32_to_v22f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 
s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i32> %a, splat (i32 3) + %a2 = bitcast <22 x i32> %a1 to <22 x float> + br label %end + 
+cmp.false: + %a3 = bitcast <22 x i32> %a to <22 x float> + br label %end + +end: + %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x float> %phi +} + define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f32_to_v22i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f32_to_v22i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: 
v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v22i32: ; VI: ; %bb.0: @@ -207,7 +452,7 @@ define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -231,7 +476,7 @@ define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -242,7 +487,7 @@ define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz 
.LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -266,7 +511,7 @@ define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -278,7 +523,7 @@ define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 @@ -291,7 +536,7 @@ define <22 x i32> @bitcast_v22f32_to_v22i32(<22 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -311,41 +556,275 @@ end: ret <22 x i32> %phi } +define inreg <22 x i32> @bitcast_v22f32_to_v22i32_scalar(<22 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f32_to_v22i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 
+; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v22f32_to_v22i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; 
VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v22f32_to_v22i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, 
v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: 
bitcast_v22f32_to_v22i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: 
v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <22 x i32> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <22 x i32> + br label %end + +end: + %phi = phi <22 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i32> %phi +} + define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i32_to_v11i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; 
GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i32_to_v11i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i32_to_v11i64: ; VI: ; %bb.0: @@ -354,7 +833,7 @@ define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 @@ -378,7 +857,7 @@ define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, 
v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -389,7 +868,7 @@ define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 @@ -413,7 +892,7 @@ define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -425,7 +904,7 @@ define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 @@ -449,7 +928,7 @@ define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -469,109 +948,354 @@ end: ret <11 x i64> %phi } -define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i64_to_v22i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <11 x i64> @bitcast_v22i32_to_v11i64_scalar(<22 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i32_to_v11i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; 
SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 ; -; VI-LABEL: bitcast_v11i64_to_v22i32: +; VI-LABEL: bitcast_v22i32_to_v11i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; 
VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v7, 
vcc, 3, v7 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB5_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 ; -; GFX9-LABEL: bitcast_v11i64_to_v22i32: +; GFX9-LABEL: bitcast_v22i32_to_v11i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 
+; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v22i32_to_v11i64_scalar: +; GFX11: ; %bb.0: 
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; 
GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i32> %a, splat (i32 3) + %a2 = bitcast <22 x i32> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <22 x i32> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + +define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v11i64_to_v22i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 
v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11i64_to_v22i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11i64_to_v22i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; 
GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -583,7 +1307,7 @@ define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -613,7 +1337,7 @@ define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -633,41 +1357,292 @@ end: ret <22 x i32> %phi } +define inreg <22 x i32> @bitcast_v11i64_to_v22i32_scalar(<11 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i64_to_v22i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 
3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v11i64_to_v22i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 
0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v11i64_to_v22i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; 
GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v11i64_to_v22i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; 
GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i64> %a, splat (i64 3) + %a2 = bitcast <11 x i64> %a1 to <22 x i32> + br label %end + +cmp.false: + %a3 = bitcast <11 x i64> %a to <22 x i32> + br label %end + +end: + %phi = phi <22 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i32> %phi +} + define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i32_to_v11f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v22i32_to_v11f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i32_to_v11f64: ; VI: ; %bb.0: @@ -676,7 +1651,7 @@ define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 @@ -700,7 +1675,7 @@ define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 
3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -711,7 +1686,7 @@ define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 @@ -735,7 +1710,7 @@ define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -747,7 +1722,7 @@ define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 @@ -771,7 +1746,7 @@ define <11 x double> @bitcast_v22i32_to_v11f64(<22 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -791,52 +1766,297 @@ end: ret <11 x double> %phi } -define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f64_to_v22i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <11 x double> @bitcast_v22i32_to_v11f64_scalar(<22 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i32_to_v11f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: 
s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 ; -; VI-LABEL: bitcast_v11f64_to_v22i32: +; VI-LABEL: bitcast_v22i32_to_v11f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; 
VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v22i32_to_v11f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v22i32_to_v11f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: 
v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i32> %a, splat (i32 3) + %a2 = bitcast <22 x i32> %a1 to <11 x double> + br label %end + +cmp.false: + %a3 = bitcast <22 x i32> %a to <11 x double> + br label %end + +end: + %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x double> %phi +} + +define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v11f64_to_v22i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11f64_to_v22i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -847,7 +2067,7 @@ define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -860,7 +2080,7 @@ define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -872,7 +2092,7 @@ define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ 
-885,7 +2105,7 @@ define <22 x i32> @bitcast_v11f64_to_v22i32(<11 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -905,220 +2125,442 @@ end: ret <22 x i32> %phi } +define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f64_to_v22i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: 
v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v11f64_to_v22i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v10, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; 
+; GFX9-LABEL: bitcast_v11f64_to_v22i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v11f64_to_v22i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 
v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <11 x double> %a1 to <22 x i32> + br label %end + +cmp.false: + %a3 = bitcast <11 x double> %a to <22 x i32> + br label %end + +end: 
+ %phi = phi <22 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i32> %phi +} + define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i32_to_v44i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, 
v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; 
GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v1, v1, v49 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v52 -; GCN-NEXT: v_or_b32_e32 v2, v2, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v5, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v7, v7, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v9, v9, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v10, 
v10, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v11, v11, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v12, v12, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v13, v13, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v14, v14, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v16, v16, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v17, v17, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v20, v20, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; 
GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v22, v22, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22i32_to_v44i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; 
implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 
3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 
+; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22i32_to_v44i16: ; VI: ; %bb.0: @@ -1148,7 +2590,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -1172,9 +2614,9 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; 
%cmp.true ; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 @@ -1220,7 +2662,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -1296,7 +2738,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -1320,9 +2762,9 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 @@ -1368,7 +2810,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -1403,7 +2845,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 @@ -1427,7 +2869,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1460,7 +2902,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -1484,9 +2926,9 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 @@ -1532,7 +2974,7 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, 
v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 @@ -1575,372 +3017,1283 @@ end: ret <44 x i16> %phi } +define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i32_to_v44i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v7 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s28 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s29, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s27, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s25, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s23, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s19, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s17, v11, 16 +; SI-NEXT: s_lshr_b32 s14, s6, 16 +; SI-NEXT: s_lshr_b32 s15, s8, 16 +; SI-NEXT: s_lshr_b32 
s40, s10, 16 +; SI-NEXT: s_lshr_b32 s41, s12, 16 +; SI-NEXT: s_lshr_b32 s42, s29, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s28 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s29, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s27, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s25, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s23, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s19, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s17, v11, 16 +; SI-NEXT: s_lshr_b32 s14, s6, 16 +; 
SI-NEXT: s_lshr_b32 s15, s8, 16 +; SI-NEXT: s_lshr_b32 s40, s10, 16 +; SI-NEXT: s_lshr_b32 s41, s12, 16 +; SI-NEXT: s_lshr_b32 s42, s29, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; 
SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: 
buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr8 +; 
SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v22i32_to_v44i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v0 +; VI-NEXT: v_readfirstlane_b32 s12, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v3 +; VI-NEXT: v_readfirstlane_b32 s9, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s6, v6 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v7 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; 
%cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s75, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s74, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s73, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s72, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 
+; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s63, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s62, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s61, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s60, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s59, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s58, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s57, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s47, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_or_b32 s6, s6, s15 +; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; 
VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s13 +; VI-NEXT: v_mov_b32_e32 v15, s12 +; VI-NEXT: v_mov_b32_e32 v16, s11 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v18, s9 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v20, s6 +; VI-NEXT: v_mov_b32_e32 v21, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v22i32_to_v44i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 
s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: 
s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; 
GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v22i32_to_v44i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; 
GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: 
s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s26, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s27, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, 
s22, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s26, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s27, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v19, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v22i32_to_v44i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s62, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 
+; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 
s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s28, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s29, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, 
s15 :: v_dual_mov_b32 v15, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i32> %a, splat (i32 3) + %a2 = bitcast <22 x i32> %a1 to <44 x i16> + br label %end + +cmp.false: + %a3 = bitcast <22 x i32> %a to <44 x i16> + br label %end + +end: + %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x i16> %phi +} + define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v44i16_to_v22i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill 
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v32, v30 -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, 
off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte 
Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v57 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v47 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v45 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: 
v_or_b32_e32 v4, v4, v44 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v43 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v8, v8, v61 -; GCN-NEXT: v_or_b32_e32 v9, v9, v60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload 
-; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; 
implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: 
v_add_i32_e32 v19, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v0, v56, v0 -; GCN-NEXT: v_or_b32_e32 v1, v57, v1 -; GCN-NEXT: v_or_b32_e32 v2, v47, v2 -; GCN-NEXT: v_or_b32_e32 v3, v45, v3 -; GCN-NEXT: v_or_b32_e32 v4, v44, v4 -; GCN-NEXT: v_or_b32_e32 v5, v43, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v41, v7 -; GCN-NEXT: v_or_b32_e32 v8, v61, v8 -; GCN-NEXT: v_or_b32_e32 v9, v60, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v22, v16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v22, v17 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v22, v18 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v22, v19 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v22, v20 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v22, v21 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 
v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44i16_to_v22i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword 
v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 
v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, 
s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, 
s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v41 +; SI-NEXT: v_or_b32_e32 v8, 
v8, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v59 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v63 +; SI-NEXT: v_or_b32_e32 v13, v13, v62 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; 
SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_or_b32_e32 v17, v17, v58 +; SI-NEXT: v_or_b32_e32 v18, v18, v57 +; SI-NEXT: v_or_b32_e32 v19, v19, v56 +; SI-NEXT: v_or_b32_e32 v20, v20, v47 +; SI-NEXT: v_or_b32_e32 v21, v21, v46 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, 
vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v42, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: 
s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_or_b32_e32 v17, v58, v17 +; SI-NEXT: v_or_b32_e32 v18, v57, v18 +; SI-NEXT: v_or_b32_e32 v19, v56, v19 +; SI-NEXT: v_or_b32_e32 v20, v47, v20 +; SI-NEXT: v_or_b32_e32 v21, v46, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44i16_to_v22i32: ; VI: ; %bb.0: @@ -1977,7 +4330,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2046,9 +4399,9 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v45 @@ -2117,7 +4470,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v22, 3, v32 ; VI-NEXT: v_add_u16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2207,7 +4560,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -2300,9 +4653,9 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: 
s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -2373,7 +4726,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2402,7 +4755,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2426,7 +4779,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2482,7 +4835,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2506,7 +4859,7 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2526,466 +4879,1339 @@ end: ret <22 x i32> %phi } -define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v22i32_to_v44f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 
offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; 
GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; 
GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; 
GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 
16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v61 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v56 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v46 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v44 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v40 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v54 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v24, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v52 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v50 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_or_b32_e32 v23, v25, v24 -; GCN-NEXT: v_or_b32_e32 v24, v27, v26 -; GCN-NEXT: v_or_b32_e32 v25, v48, v28 -; GCN-NEXT: v_or_b32_e32 v26, v38, v52 -; GCN-NEXT: v_or_b32_e32 v27, v37, v40 -; GCN-NEXT: v_or_b32_e32 v28, 
v35, v41 -; GCN-NEXT: v_or_b32_e32 v33, v33, v55 -; GCN-NEXT: v_or_b32_e32 v31, v31, v53 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_or_b32_e32 v35, v46, v49 -; GCN-NEXT: v_or_b32_e32 v37, v47, v39 -; GCN-NEXT: v_or_b32_e32 v36, v56, v36 -; GCN-NEXT: v_or_b32_e32 v34, v57, v34 -; GCN-NEXT: v_or_b32_e32 v32, v58, v32 -; GCN-NEXT: v_or_b32_e32 v29, v59, v29 -; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword 
v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44i16_to_v22i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v8 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v4 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v9, v0, v59 +; SI-NEXT: v_and_b32_e32 
v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v10, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v11, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v60 +; SI-NEXT: v_or_b32_e32 v21, v0, v55 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: 
s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: 
s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; 
SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v22i32_to_v44f16: +; VI-LABEL: bitcast_v44i16_to_v22i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: 
$vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 
s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa 
v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; 
VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; 
VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: 
s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v44i16_to_v22i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; 
GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, 
s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: 
s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 
s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 
16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v22i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 
v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <44 x i16> %a, splat (i16 3) + %a2 = bitcast <44 x i16> %a1 to <22 x i32> + br label %end + +cmp.false: + %a3 = bitcast <44 x i16> %a to <22 x i32> + br label %end + +end: + %phi = phi <22 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i32> %phi +} + +define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v22i32_to_v44f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; 
implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: 
v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: 
$vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; 
SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: 
v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; 
SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v22i32_to_v44f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr24 @@ -2993,7 +6219,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -3017,9 +6243,9 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 +; VI-NEXT: 
s_cbranch_execz .LBB16_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 @@ -3065,7 +6291,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB8_4: ; %end +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -3141,7 +6367,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -3165,9 +6391,9 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 @@ -3213,7 +6439,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -3248,7 +6474,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 
s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 @@ -3272,7 +6498,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3305,7 +6531,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -3329,9 +6555,9 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 @@ -3377,7 +6603,7 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 
16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 @@ -3420,465 +6646,1527 @@ end: ret <44 x half> %phi } +define inreg <44 x half> @bitcast_v22i32_to_v44f16_scalar(<22 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22i32_to_v44f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_readfirstlane_b32 s7, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: 
s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: 
s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s22, 16 +; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_lshr_b32 s44, s24, 16 +; SI-NEXT: s_lshr_b32 s45, s25, 16 +; SI-NEXT: s_lshr_b32 s46, s26, 16 +; SI-NEXT: s_lshr_b32 s47, s27, 16 +; SI-NEXT: s_lshr_b32 s56, s28, 16 +; SI-NEXT: s_lshr_b32 s57, s29, 16 +; SI-NEXT: s_lshr_b32 s58, s13, 16 +; SI-NEXT: s_lshr_b32 s59, s12, 16 +; SI-NEXT: s_lshr_b32 s60, s11, 16 +; SI-NEXT: s_lshr_b32 s61, s10, 16 +; SI-NEXT: s_lshr_b32 s62, s8, 16 +; SI-NEXT: s_lshr_b32 s63, s7, 16 +; SI-NEXT: s_lshr_b32 s72, s6, 16 +; SI-NEXT: s_lshr_b32 s73, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: 
v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, 
v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_add_i32_e32 v25, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; 
SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 
+; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v22i32_to_v44f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v0 +; VI-NEXT: v_readfirstlane_b32 s12, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v3 +; VI-NEXT: v_readfirstlane_b32 s9, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s6, v6 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v7 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, 
s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s75, 16 
+; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s74, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s73, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s72, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s63, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s62, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s61, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s60, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s59, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s58, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s57, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s47, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s9, 
s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_or_b32 s6, s6, s15 +; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s13 +; VI-NEXT: v_mov_b32_e32 v15, s12 +; VI-NEXT: v_mov_b32_e32 v16, s11 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v18, s9 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v20, s6 +; VI-NEXT: v_mov_b32_e32 v21, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: 
bitcast_v22i32_to_v44f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 
+; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s10, s10, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-TRUE16-LABEL: bitcast_v22i32_to_v44f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; 
GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s26, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s27, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s26, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s27, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v19, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v22i32_to_v44f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s62, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 
s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s28, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s29, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <22 x i32> %a, splat (i32 3) + %a2 = bitcast <22 x i32> %a1 to <44 x half> + br label %end + +cmp.false: 
+ %a3 = bitcast <22 x i32> %a to <44 x half> + br label %end + +end: + %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x half> %phi +} + define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v44f16_to_v22i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; GCN-NEXT: 
buffer_load_dword v55, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; GCN-NEXT: v_or_b32_e32 v0, v50, v0 -; GCN-NEXT: v_or_b32_e32 v1, v48, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; GCN-NEXT: v_or_b32_e32 v7, v60, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v46 -; GCN-NEXT: 
v_lshlrev_b32_e32 v18, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v54 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v22, v8 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v22, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: v_or_b32_e32 v16, v45, v16 -; GCN-NEXT: v_or_b32_e32 v17, v43, v17 -; GCN-NEXT: v_or_b32_e32 v18, v41, v18 -; GCN-NEXT: v_or_b32_e32 v19, v55, v19 -; GCN-NEXT: v_or_b32_e32 v20, v53, v20 -; GCN-NEXT: v_or_b32_e32 v21, v52, v21 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: 
$vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v36 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v32 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v62 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; 
GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v60 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v59 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v57 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload 
-; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v56 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v52 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: 
v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: 
v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_or_b32_e32 v9, v11, v10 -; GCN-NEXT: v_or_b32_e32 v10, v13, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_or_b32_e32 v12, v17, v16 -; GCN-NEXT: v_or_b32_e32 v13, v19, v18 -; GCN-NEXT: v_or_b32_e32 v14, v21, v20 -; GCN-NEXT: v_or_b32_e32 v15, v23, v22 -; GCN-NEXT: v_or_b32_e32 v16, v25, v24 -; GCN-NEXT: v_or_b32_e32 v17, v27, v26 -; GCN-NEXT: v_or_b32_e32 v18, v29, v28 -; GCN-NEXT: v_or_b32_e32 v19, v31, v30 -; GCN-NEXT: v_or_b32_e32 v20, v33, v32 -; GCN-NEXT: v_or_b32_e32 v21, v35, v34 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; 
GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44f16_to_v22i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v55, off, 
s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 
v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 
+; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_or_b32_e32 v20, v54, v20 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; 
kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, 
v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v40, v19 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, 
v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: 
v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; 
SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 
v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v41 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 
0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v54 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44f16_to_v22i32: ; VI: ; %bb.0: @@ -3915,7 
+8203,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -3984,9 +8272,9 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v45, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4055,7 +8343,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -4145,7 +8433,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -4238,9 +8526,9 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB18_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -4312,7 +8600,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: .LBB18_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -4341,7 +8629,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -4365,7 +8653,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4421,7 +8709,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -4445,7 +8733,7 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4465,41 +8753,1050 @@ end: ret <22 x i32> %phi } +define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44f16_to_v22i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: 
v_cvt_f16_f32_e32 v42, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; 
SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v11, v59, v11 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v55, v17 +; SI-NEXT: v_or_b32_e32 v18, v53, v18 +; SI-NEXT: v_or_b32_e32 v19, v31, v19 +; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: 
v_cvt_f32_f16_e32 v5, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; 
SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_add_f32_e32 v7, 
0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v52 +; SI-NEXT: v_mov_b32_e32 v52, v23 +; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: v_mov_b32_e32 v53, v22 +; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v43 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v54, v24 +; SI-NEXT: v_mov_b32_e32 v50, v34 +; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v55 +; SI-NEXT: v_mov_b32_e32 v55, v25 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, 
v45 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v38, v27 +; SI-NEXT: v_mov_b32_e32 v37, v28 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v28, v37 +; SI-NEXT: v_mov_b32_e32 v27, v38 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v25, v55 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v34 +; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v43, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v23, v52 +; SI-NEXT: v_mov_b32_e32 v52, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v44f16_to_v22i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; 
VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: 
v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v21, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v39, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v38, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v37, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v36, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v35, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v20, v22, v20 +; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; 
VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v44f16_to_v22i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 
s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 
v21, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: 
v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v22i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: 
v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <22 x i32> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <22 x i32> + br label %end + +end: + %phi = phi <22 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x i32> %phi +} + define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f32_to_v11i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: 
v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f32_to_v11i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v11i64: ; VI: ; %bb.0: @@ -4508,7 +9805,7 @@ define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: 
v_add_f32_e32 v20, 1.0, v20 @@ -4532,7 +9829,7 @@ define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4543,7 +9840,7 @@ define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -4567,11 +9864,392 @@ define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v22f32_to_v11i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, 
v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + +define inreg <11 x i64> @bitcast_v22f32_to_v11i64_scalar(<22 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f32_to_v11i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; 
%cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v22f32_to_v11i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 
v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v22f32_to_v11i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: 
v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v22f32_to_v11i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; 
GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + +define <22 x float> 
@bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v11i64_to_v22f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11i64_to_v22f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: 
v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11i64_to_v22f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: 
v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v22f32_to_v11i64: +; GFX11-LABEL: bitcast_v11i64_to_v22f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -4579,84 +10257,148 @@ define <11 x i64> @bitcast_v22f32_to_v11i64(<22 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <22 x float> %a1 to <11 x i64> + %a1 = add <11 x i64> %a, splat (i64 3) + %a2 = bitcast <11 x i64> %a1 to <22 x float> br label %end cmp.false: 
- %a3 = bitcast <22 x float> %a to <11 x i64> + %a3 = bitcast <11 x i64> %a to <22 x float> br label %end end: - %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <11 x i64> %phi + %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x float> %phi } -define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i64_to_v22f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <22 x float> @bitcast_v11i64_to_v22f32_scalar(<11 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i64_to_v22f32_scalar: +; SI: ; %bb.0: 
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 
3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 ; -; VI-LABEL: bitcast_v11i64_to_v22f32: +; VI-LABEL: bitcast_v11i64_to_v22f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 ; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 @@ -4679,19 +10421,43 @@ define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, 
s[4:5] +; VI-NEXT: .LBB23_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 ; -; GFX9-LABEL: bitcast_v11i64_to_v22f32: +; GFX9-LABEL: bitcast_v11i64_to_v22f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 @@ -4714,20 +10480,38 @@ define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; 
GFX9-NEXT: .LBB11_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB23_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 ; -; GFX11-LABEL: bitcast_v11i64_to_v22f32: +; GFX11-LABEL: bitcast_v11i64_to_v22f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB23_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: .LBB23_4: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: 
v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo @@ -4756,8 +10540,6 @@ define <22 x float> @bitcast_v11i64_to_v22f32(<11 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4777,40 +10559,40 @@ end: } define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f32_to_v11f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f32_to_v11f64: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v11f64: ; VI: ; %bb.0: @@ -4819,7 +10601,7 @@ define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -4843,7 +10625,7 @@ define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 
exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -4854,7 +10636,7 @@ define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -4878,7 +10660,7 @@ define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4890,7 +10672,7 @@ define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 @@ -4903,60 +10685,455 @@ define <11 x double> @bitcast_v22f32_to_v11f64(<22 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end +; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <11 x double> + br label %end + 
+cmp.false: + %a3 = bitcast <22 x float> %a to <11 x double> + br label %end + +end: + %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x double> %phi +} + +define inreg <11 x double> @bitcast_v22f32_to_v11f64_scalar(<22 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f32_to_v11f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 
+; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v22f32_to_v11f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: 
v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v22f32_to_v11f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 
1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v22f32_to_v11f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; 
GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <11 x double> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <11 x double> + br label %end + +end: + %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x double> %phi +} + +define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v11f64_to_v22f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: 
v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11f64_to_v22f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11f64_to_v22f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; 
GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v11f64_to_v22f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <22 x float> %a1 to <11 x double> + %a1 = fadd <11 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <11 x double> %a1 to <22 x float> br label %end cmp.false: - %a3 = bitcast <22 x float> %a to <11 x double> + %a3 = bitcast <11 x double> %a to <22 x float> br label %end end: - %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <11 x double> %phi + %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x float> %phi } -define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) { -; GCN-LABEL: 
bitcast_v11f64_to_v22f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f64_to_v22f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; 
SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 ; -; VI-LABEL: bitcast_v11f64_to_v22f32: +; VI-LABEL: bitcast_v11f64_to_v22f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v10, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, 
s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -4968,19 +11145,43 @@ define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB27_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 ; -; GFX9-LABEL: bitcast_v11f64_to_v22f32: +; GFX9-LABEL: bitcast_v11f64_to_v22f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: 
v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -4992,20 +11193,38 @@ define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB27_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 ; -; GFX11-LABEL: bitcast_v11f64_to_v22f32: +; GFX11-LABEL: bitcast_v11f64_to_v22f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, 
s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB27_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: .LBB27_4: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 @@ -5017,8 +11236,6 @@ define <22 x float> @bitcast_v11f64_to_v22f32(<11 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5038,219 +11255,240 @@ end: } define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f32_to_v44i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: 
; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; 
GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v1, v1, v49 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v52 -; GCN-NEXT: v_or_b32_e32 v2, v2, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, 
v37 -; GCN-NEXT: v_or_b32_e32 v5, v5, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v7, v7, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v9, v9, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v10, v10, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v11, v11, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v12, v12, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v13, v13, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v14, v14, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v16, v16, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 
0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v17, v17, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v20, v20, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v22, v22, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v18, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f32_to_v44i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: 
v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v28, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v37, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; 
SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v44i16: ; VI: ; 
%bb.0: @@ -5280,7 +11518,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -5304,9 +11542,9 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB28_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -5352,7 +11590,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -5428,7 +11666,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -5452,9 +11690,9 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; 
GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -5500,7 +11738,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -5535,7 +11773,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 @@ -5548,7 +11786,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5581,7 +11819,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: 
s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -5605,9 +11843,9 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 @@ -5642,7 +11880,7 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 @@ -5685,372 +11923,1265 @@ end: ret <44 x i16> %phi } +define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v22f32_to_v44i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_mov_b32_e32 v23, s16 +; SI-NEXT: v_mov_b32_e32 v22, s17 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v17, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 
v15, s23 +; SI-NEXT: v_mov_b32_e32 v14, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v10, s28 +; SI-NEXT: v_mov_b32_e32 v9, s29 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v20, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v13, 16 +; SI-NEXT: v_alignbit_b32 v32, v12, v14, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: 
v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_alignbit_b32 v20, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v28, v9, v10, 16 +; SI-NEXT: v_alignbit_b32 v30, v11, v13, 16 +; SI-NEXT: v_alignbit_b32 v32, v12, v14, 16 +; SI-NEXT: v_alignbit_b32 v34, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v37, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v39, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_or_b32_e32 v23, v23, v49 +; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 
v21, 16, v51 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v22f32_to_v44i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v21, s17 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 
+; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: 
v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v26, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; VI-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v50 +; VI-NEXT: v_or_b32_sdwa v27, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v49 +; VI-NEXT: v_or_b32_sdwa v28, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; VI-NEXT: v_or_b32_sdwa v29, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_or_b32_sdwa v22, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v23, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; VI-NEXT: v_or_b32_sdwa v18, 
v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v22 +; VI-NEXT: v_mov_b32_e32 v7, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v22f32_to_v44i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: 
v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v14, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: 
v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 +; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v9, v50, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v49, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v48, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v21, v30, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: 
$vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v15, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s27 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: .LBB29_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v36, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v50, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v49, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v37, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v35, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v31, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v33, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v32, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 +; +; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v12, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v13, 1.0, v13 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 
1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: .LBB29_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v18, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v49, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v37, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <44 x i16> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <44 x i16> + br label %end + +end: + %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x i16> %phi +} + define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v44i16_to_v22f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded 
Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v32, v30 -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:20 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v57 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v47 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v45 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v44 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v43 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: 
v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v8, v8, v61 -; GCN-NEXT: v_or_b32_e32 v9, v9, v60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v22 -; GCN-NEXT: 
buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: 
killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v0, v56, v0 -; GCN-NEXT: v_or_b32_e32 v1, v57, v1 -; GCN-NEXT: v_or_b32_e32 v2, v47, v2 -; GCN-NEXT: v_or_b32_e32 v3, v45, v3 -; GCN-NEXT: v_or_b32_e32 v4, v44, v4 -; GCN-NEXT: v_or_b32_e32 v5, v43, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v41, v7 -; GCN-NEXT: v_or_b32_e32 v8, v61, v8 -; GCN-NEXT: v_or_b32_e32 v9, v60, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; 
GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v22, v16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v22, v17 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v22, v18 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v22, v19 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v22, v20 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v22, v21 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; 
GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44i16_to_v22f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, 
s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v41 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v59 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: 
$vgpr45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v63 +; SI-NEXT: v_or_b32_e32 v13, v13, v62 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_or_b32_e32 v17, v17, v58 +; SI-NEXT: v_or_b32_e32 v18, v18, v57 +; SI-NEXT: v_or_b32_e32 v19, v19, v56 +; SI-NEXT: v_or_b32_e32 v20, v20, v47 +; SI-NEXT: v_or_b32_e32 v21, v21, v46 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: 
$vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 
v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v42, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: 
v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_or_b32_e32 v17, v58, v17 +; SI-NEXT: v_or_b32_e32 v18, v57, v18 +; SI-NEXT: v_or_b32_e32 v19, v56, v19 +; SI-NEXT: v_or_b32_e32 v20, v47, v20 +; SI-NEXT: v_or_b32_e32 v21, v46, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44i16_to_v22f32: ; VI: ; %bb.0: @@ -6087,7 +13218,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -6156,9 +13287,9 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 +; 
VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v45 @@ -6227,7 +13358,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v22, 3, v32 ; VI-NEXT: v_add_u16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB15_4: ; %end +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -6317,7 +13448,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -6410,9 +13541,9 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: .LBB30_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -6483,7 +13614,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: 
v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end +; GFX9-NEXT: .LBB30_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -6512,7 +13643,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -6536,7 +13667,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6592,7 +13723,7 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -6616,9 +13747,879 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> 
%a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <44 x i16> %a, splat (i16 3) + %a2 = bitcast <44 x i16> %a1 to <22 x float> + br label %end + +cmp.false: + %a3 = bitcast <44 x i16> %a to <22 x float> + br label %end + +end: + %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x float> %phi +} + +define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44i16_to_v22f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v8 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v4 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v9, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v10, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v11, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 
v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v60 +; SI-NEXT: v_or_b32_e32 v21, v0, v55 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, 
s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, 
off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v44i16_to_v22f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 
+; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: 
v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: 
v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: 
v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v44i16_to_v22f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; 
GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; 
GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22f32_scalar: +; GFX11-TRUE16: ; 
%bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB31_3: ; %end +; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v22f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB31_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label 
%cmp.false @@ -6637,443 +14638,446 @@ end: } define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v22f32_to_v44f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: 
$vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; 
GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; 
GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], 
s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v61 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v56 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v46 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v44 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v40 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v54 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v52 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v50 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 
v44, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x54, v0 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_or_b32_e32 v23, v25, v24 -; GCN-NEXT: v_or_b32_e32 v24, v27, v26 -; GCN-NEXT: v_or_b32_e32 v25, v48, v28 -; GCN-NEXT: v_or_b32_e32 v26, v38, v52 -; GCN-NEXT: v_or_b32_e32 v27, v37, v40 -; GCN-NEXT: v_or_b32_e32 v28, v35, v41 -; GCN-NEXT: v_or_b32_e32 v33, v33, v55 -; GCN-NEXT: v_or_b32_e32 v31, v31, v53 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_or_b32_e32 v35, v46, v49 -; GCN-NEXT: v_or_b32_e32 v37, v47, v39 -; GCN-NEXT: v_or_b32_e32 v36, v56, v36 -; GCN-NEXT: v_or_b32_e32 v34, v57, v34 -; GCN-NEXT: v_or_b32_e32 v32, v58, v32 -; GCN-NEXT: v_or_b32_e32 v29, v59, v29 -; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v7, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload 
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v22f32_to_v44f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; 
SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, 
v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: 
v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: 
v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: 
v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 
v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v22f32_to_v44f16: ; VI: ; %bb.0: @@ -7103,7 +15107,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -7127,9 +15131,9 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: 
v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB32_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -7175,7 +15179,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -7251,7 +15255,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -7275,9 +15279,9 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow +; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cbranch_execz .LBB32_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 ; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 @@ -7323,7 +15327,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end +; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 
; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -7358,7 +15362,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 @@ -7371,15 +15375,1157 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: 
v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <22 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <22 x float> %a1 to <44 x half> + br label %end + +cmp.false: + %a3 = bitcast <22 x float> %a to <44 x half> + br label %end + +end: + %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x half> %phi +} + +define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, i32 inreg %b) { 
+; SI-LABEL: bitcast_v22f32_to_v44f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_readfirstlane_b32 s7, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, 
s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, 
s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; 
SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 
v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 
v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 +; 
SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; 
implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v22f32_to_v44f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v21, s17 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v17, s21 +; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v14, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; 
VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v26, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; VI-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v50 +; VI-NEXT: v_or_b32_sdwa v27, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v49 +; VI-NEXT: v_or_b32_sdwa v28, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; VI-NEXT: v_or_b32_sdwa v29, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_or_b32_sdwa v22, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v23, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v22 +; VI-NEXT: v_mov_b32_e32 v7, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; 
VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v22f32_to_v44f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v21, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v17, s21 +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v14, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 
v25, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v9, v50, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v49, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v48, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; 
GFX9-NEXT: v_lshl_or_b32 v21, v30, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v15, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, 
s26 :: v_dual_mov_b32 v13, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s27 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; 
GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: .LBB33_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v19 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v36, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v50, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v49, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v37, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v35, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v31, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v33, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v32, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; +; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v21, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v19, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v5, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s17 :: v_dual_mov_b32 v11, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v9, s21 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v7, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v12, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 
v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v13, 1.0, v13 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: .LBB33_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v48, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v50, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v49, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v37, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 
v13, v34, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 @@ -7397,100 +16543,7 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 
16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v27, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, 
v27, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7509,464 +16562,499 @@ end: } define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v44f16_to_v22f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 
v57, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v55 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; GCN-NEXT: v_or_b32_e32 v0, v50, v0 -; GCN-NEXT: v_or_b32_e32 v1, v48, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; GCN-NEXT: v_or_b32_e32 v7, v60, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v54 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v22, v8 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v22, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: v_or_b32_e32 v16, v45, v16 -; GCN-NEXT: v_or_b32_e32 v17, v43, v17 -; GCN-NEXT: v_or_b32_e32 v18, v41, v18 -; GCN-NEXT: v_or_b32_e32 v19, v55, v19 -; GCN-NEXT: v_or_b32_e32 v20, v53, v20 -; GCN-NEXT: v_or_b32_e32 v21, v52, v21 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; 
GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v36 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v32 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v62 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v60 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v59 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v57 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v56 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v52 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 
v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: 
v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_or_b32_e32 v9, v11, v10 -; GCN-NEXT: v_or_b32_e32 v10, v13, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_or_b32_e32 v12, v17, v16 -; GCN-NEXT: v_or_b32_e32 v13, v19, v18 -; GCN-NEXT: v_or_b32_e32 v14, v21, v20 -; GCN-NEXT: v_or_b32_e32 v15, v23, v22 -; GCN-NEXT: v_or_b32_e32 v16, v25, v24 -; GCN-NEXT: v_or_b32_e32 v17, v27, v26 -; GCN-NEXT: v_or_b32_e32 v18, v29, v28 -; GCN-NEXT: v_or_b32_e32 v19, v31, v30 -; GCN-NEXT: v_or_b32_e32 v20, v33, v32 -; GCN-NEXT: v_or_b32_e32 v21, v35, v34 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, 
s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44f16_to_v22f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; 
SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 +; 
SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed 
$vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_or_b32_e32 v20, v54, v20 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: 
; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: 
v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v40, v19 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, 
s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword 
v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: 
s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: 
v_cvt_f32_f16_e32 v19, v41 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v54 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44f16_to_v22f32: ; VI: ; %bb.0: @@ -8003,7 +17091,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -8072,9 +17160,9 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow +; VI-NEXT: .LBB34_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 +; VI-NEXT: s_cbranch_execz .LBB34_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v45, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -8143,7 +17231,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -8233,7 +17321,7 @@ 
define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -8326,9 +17414,9 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -8400,7 +17488,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -8429,7 +17517,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: 
s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -8453,7 +17541,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8509,7 +17597,7 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -8533,71 +17621,1291 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <44 x half> %a, splat (half 0xH0200) - %a2 = bitcast <44 x half> %a1 to <22 x float> + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <22 x 
float> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <22 x float> + br label %end + +end: + %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x float> %phi +} + +define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44f16_to_v22f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: 
v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 
v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v11, v59, v11 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v55, v17 +; SI-NEXT: v_or_b32_e32 v18, v53, v18 +; SI-NEXT: v_or_b32_e32 v19, v31, v19 +; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 
v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; 
SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, 
v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: 
v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v52 +; SI-NEXT: v_mov_b32_e32 v52, v23 +; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: v_mov_b32_e32 v53, v22 +; SI-NEXT: v_mov_b32_e32 
v35, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v43 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v54, v24 +; SI-NEXT: v_mov_b32_e32 v50, v34 +; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v55 +; SI-NEXT: v_mov_b32_e32 v55, v25 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v38, v27 +; SI-NEXT: v_mov_b32_e32 v37, v28 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v28, v37 +; SI-NEXT: v_mov_b32_e32 v27, v38 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v25, v55 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v34 +; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v43, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v23, v52 +; SI-NEXT: v_mov_b32_e32 v52, 
v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v44f16_to_v22f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: 
s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: 
s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 
v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: 
v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v21, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v39, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v38, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v37, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v36, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v35, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v34 +; VI-NEXT: 
v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v20, v22, v20 +; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v44f16_to_v22f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: 
v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 
+; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v14, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v22f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 
s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <22 x float> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <22 x float> + br label %end + +end: + %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <22 x float> %phi +} + +define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v11i64_to_v11f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, 
vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11i64_to_v11f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, 
v21, vcc +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11i64_to_v11f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v11i64_to_v11f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i64> 
%a, splat (i64 3) + %a2 = bitcast <11 x i64> %a1 to <11 x double> br label %end cmp.false: - %a3 = bitcast <44 x half> %a to <22 x float> + %a3 = bitcast <11 x i64> %a to <11 x double> br label %end end: - %phi = phi <22 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <22 x float> %phi + %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x double> %phi } -define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i64_to_v11f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <11 x double> 
@bitcast_v11i64_to_v11f64_scalar(<11 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i64_to_v11f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, 
vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 ; -; VI-LABEL: bitcast_v11i64_to_v11f64: +; VI-LABEL: bitcast_v11i64_to_v11f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ -8620,19 +18928,43 @@ define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc ; VI-NEXT: 
v_add_u32_e32 v20, vcc, 3, v20 ; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: .LBB18_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB37_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 ; -; GFX9-LABEL: bitcast_v11i64_to_v11f64: +; GFX9-LABEL: bitcast_v11i64_to_v11f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 @@ -8655,20 +18987,38 @@ define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: 
v_addc_co_u32_e32 v19, vcc, 0, v19, vcc ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: .LBB18_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB37_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 ; -; GFX11-LABEL: bitcast_v11i64_to_v11f64: +; GFX11-LABEL: bitcast_v11i64_to_v11f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB37_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: .LBB37_4: ; %cmp.true ; 
GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -8697,8 +19047,6 @@ define <11 x double> @bitcast_v11i64_to_v11f64(<11 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8718,29 +19066,29 @@ end: } define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f64_to_v11i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11f64_to_v11i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz 
.LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f64_to_v11i64: ; VI: ; %bb.0: @@ -8749,7 +19097,7 @@ define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -8762,7 +19110,7 @@ define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -8773,7 +19121,7 @@ define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -8786,7 +19134,7 @@ define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { ; GFX9-NEXT: 
v_add_f64 v[16:17], v[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -8798,7 +19146,7 @@ define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -8811,7 +19159,7 @@ define <11 x i64> @bitcast_v11f64_to_v11i64(<11 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8831,220 +19179,442 @@ end: ret <11 x i64> %phi } +define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f64_to_v11i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; 
SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v11f64_to_v11i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v10, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 
.LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v11f64_to_v11i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; 
GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v11f64_to_v11i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 
1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <11 x double> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <11 x double> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i64_to_v44i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; 
%cmp.false -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, 
vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v39, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v49, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v1, v1, v49 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v52 -; GCN-NEXT: v_or_b32_e32 v2, v2, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v5, v5, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 16, v0 
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v7, v7, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v8, v8, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v9, v9, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v10, v10, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v11, v11, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v12, v12, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v14, v14, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v16, v16, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v17, v17, 
v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v22, v22, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, 
s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11i64_to_v44i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: 
v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; 
VI-LABEL: bitcast_v11i64_to_v44i16: ; VI: ; %bb.0: @@ -9074,7 +19644,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -9098,9 +19668,9 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow +; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 +; VI-NEXT: s_cbranch_execz .LBB40_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 ; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc @@ -9146,7 +19716,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -9222,7 +19792,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -9246,9 +19816,9 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 
16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow +; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 +; GFX9-NEXT: s_cbranch_execz .LBB40_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc @@ -9294,7 +19864,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -9329,7 +19899,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9359,7 +19929,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9392,7 +19962,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; 
GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -9416,9 +19986,9 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -9470,7 +20040,7 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 @@ -9513,372 +20083,1283 @@ end: ret <44 x i16> %phi } +define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i64_to_v44i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_readfirstlane_b32 s9, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v7 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; 
SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s28 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s29, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s27, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s25, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s23, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s19, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s17, v11, 16 +; SI-NEXT: s_lshr_b32 s14, s6, 16 +; SI-NEXT: s_lshr_b32 s15, s8, 16 +; SI-NEXT: s_lshr_b32 s40, s10, 16 +; SI-NEXT: s_lshr_b32 s41, s12, 16 +; SI-NEXT: s_lshr_b32 s42, s29, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, 
s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s28 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v8, s22 +; SI-NEXT: v_mov_b32_e32 v9, s20 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s29, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s27, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s25, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s23, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s21, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s19, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s17, v11, 16 +; SI-NEXT: s_lshr_b32 s14, s6, 16 +; SI-NEXT: s_lshr_b32 s15, s8, 16 +; SI-NEXT: s_lshr_b32 s40, s10, 16 +; SI-NEXT: s_lshr_b32 s41, s12, 16 +; SI-NEXT: s_lshr_b32 s42, s29, 16 +; SI-NEXT: s_lshr_b32 s43, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; 
SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v8, v7, 
s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s15, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 
s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v11i64_to_v44i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v0 +; VI-NEXT: v_readfirstlane_b32 s12, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v3 +; VI-NEXT: v_readfirstlane_b32 s9, v4 +; VI-NEXT: 
v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s6, v6 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v7 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 
16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s75, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s74, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s73, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s72, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s63, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s62, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s61, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s60, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s59, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s58, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s57, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; 
VI-NEXT: s_lshl_b32 s27, s47, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_or_b32 s6, s6, s15 +; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s13 +; VI-NEXT: v_mov_b32_e32 v15, s12 +; VI-NEXT: v_mov_b32_e32 v16, s11 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v18, s9 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v20, s6 +; VI-NEXT: v_mov_b32_e32 v21, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; 
VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v11i64_to_v44i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, 
s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s73 +; 
GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: 
; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-TRUE16-LABEL: bitcast_v11i64_to_v44i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s26, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s27, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s26, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s27, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: 
v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v19, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v11i64_to_v44i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-FAKE16-NEXT: 
v_readfirstlane_b32 s6, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s62, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s6, s6, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: 
s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, 
s22, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s28, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s29, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i64> %a, splat (i64 3) + %a2 = bitcast <11 x i64> %a1 to <44 x i16> + br label %end + +cmp.false: + %a3 = bitcast <11 x i64> %a to <44 x i16> + br label %end + +end: + %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x i16> %phi +} + define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v44i16_to_v11i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill 
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v32, v30 -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; GCN-NEXT: 
v_lshlrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v57 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v47 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v45 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v44 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v43 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v63 -; GCN-NEXT: 
v_and_b32_e32 v21, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v8, v8, v61 -; GCN-NEXT: v_or_b32_e32 v9, v9, v60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: 
killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; 
GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v0, v56, v0 -; GCN-NEXT: v_or_b32_e32 v1, v57, v1 -; GCN-NEXT: v_or_b32_e32 v2, v47, v2 -; GCN-NEXT: v_or_b32_e32 v3, v45, v3 -; GCN-NEXT: v_or_b32_e32 v4, v44, v4 -; GCN-NEXT: v_or_b32_e32 v5, v43, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v41, v7 -; GCN-NEXT: v_or_b32_e32 v8, v61, v8 -; GCN-NEXT: v_or_b32_e32 v9, v60, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v22, v16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v22, v17 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_or_b32_e32 v18, v22, v18 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v22, v19 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v22, v20 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v22, v21 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; 
GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44i16_to_v11i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; 
SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; 
SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v41 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v59 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: 
$vgpr34 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 v11, v11, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v63 +; SI-NEXT: v_or_b32_e32 v13, v13, v62 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_or_b32_e32 v17, v17, v58 +; SI-NEXT: v_or_b32_e32 v18, v18, v57 +; SI-NEXT: v_or_b32_e32 v19, v19, v56 +; SI-NEXT: v_or_b32_e32 v20, v20, v47 +; SI-NEXT: v_or_b32_e32 v21, v21, v46 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; 
SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v42, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 
+; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_or_b32_e32 v17, v58, v17 +; SI-NEXT: v_or_b32_e32 v18, v57, v18 +; SI-NEXT: v_or_b32_e32 v19, v56, v19 +; SI-NEXT: v_or_b32_e32 v20, v47, v20 +; SI-NEXT: v_or_b32_e32 v21, v46, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44i16_to_v11i64: ; VI: ; %bb.0: @@ -9915,7 +21396,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -9984,9 +21465,9 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: .LBB42_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v45 @@ -10055,7 +21536,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v22, 3, v32 ; VI-NEXT: v_add_u16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB21_4: 
; %end +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -10145,7 +21626,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -10238,9 +21719,9 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: .LBB42_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -10311,7 +21792,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -10340,7 +21821,7 @@ define <11 
x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -10364,7 +21845,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10420,7 +21901,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -10444,7 +21925,7 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -10464,444 +21945,1317 @@ end: ret <11 x i64> %phi } 
+define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44i16_to_v11i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v8 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v4 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v9, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v10, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v11, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s7, s7, s8 +; 
SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v60 +; SI-NEXT: v_or_b32_e32 v21, v0, v55 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 
s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v44i16_to_v11i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: 
s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: 
s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; 
VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; 
VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v44i16_to_v11i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, 
s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 
0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v11i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <44 x i16> %a, splat (i16 3) + %a2 = bitcast <44 x i16> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <44 x i16> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v11i64_to_v44f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, 
off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; 
implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: 
v_lshrrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: 
v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: 
v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v61 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v56 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v23 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v15, v46 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v44 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v40 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v54 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v52 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v50 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; 
GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; 
GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_or_b32_e32 v23, v25, v24 -; GCN-NEXT: v_or_b32_e32 v24, v27, v26 -; GCN-NEXT: v_or_b32_e32 v25, v48, v28 -; GCN-NEXT: v_or_b32_e32 v26, v38, v52 -; GCN-NEXT: v_or_b32_e32 v27, v37, v40 -; GCN-NEXT: v_or_b32_e32 v28, v35, v41 -; GCN-NEXT: v_or_b32_e32 v33, v33, v55 -; GCN-NEXT: v_or_b32_e32 v31, v31, v53 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_or_b32_e32 v35, v46, v49 -; GCN-NEXT: v_or_b32_e32 v37, v47, v39 -; GCN-NEXT: v_or_b32_e32 v36, v56, v36 -; GCN-NEXT: v_or_b32_e32 v34, v57, v34 -; GCN-NEXT: v_or_b32_e32 v32, v58, v32 -; GCN-NEXT: v_or_b32_e32 v29, v59, v29 -; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword 
v37, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11i64_to_v44f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; 
SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; 
SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: s_waitcnt 
expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; 
SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: 
v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 
v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, 
v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11i64_to_v44f16: ; VI: ; %bb.0: @@ -10931,7 +23285,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -10955,9 +23309,9 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow +; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 +; VI-NEXT: s_cbranch_execz .LBB44_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 ; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc @@ -11003,7 +23357,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; 
VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB22_4: ; %end +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -11079,7 +23433,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -11103,9 +23457,9 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB44_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc @@ -11151,7 +23505,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -11186,7 +23540,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: 
s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11216,7 +23570,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11249,7 +23603,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -11273,9 +23627,9 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11327,7 +23681,7 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 @@ -11370,465 +23724,1527 @@ end: ret <44 x half> %phi } +define inreg <44 x half> @bitcast_v11i64_to_v44f16_scalar(<11 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11i64_to_v44f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v1 +; SI-NEXT: v_readfirstlane_b32 s13, v2 +; SI-NEXT: v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: v_readfirstlane_b32 s8, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 
+; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s14, s4, 16 +; SI-NEXT: s_lshr_b32 s15, s5, 16 +; SI-NEXT: s_add_u32 s16, s18, 3 +; SI-NEXT: s_addc_u32 s17, s19, 0 +; SI-NEXT: s_lshr_b32 s18, s16, 16 +; SI-NEXT: s_lshr_b32 s19, s17, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s40, s20, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 
+; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s42, s22, 16 +; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s44, s24, 16 +; SI-NEXT: s_lshr_b32 s45, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s46, s26, 16 +; SI-NEXT: s_lshr_b32 s47, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s56, s28, 16 +; SI-NEXT: s_lshr_b32 s57, s29, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s58, s12, 16 +; SI-NEXT: s_lshr_b32 s59, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s60, s10, 16 +; SI-NEXT: s_lshr_b32 s61, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s62, s7, 16 +; SI-NEXT: s_lshr_b32 s63, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s72, s6, 16 +; SI-NEXT: s_lshr_b32 s73, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, 
s73 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s14 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: 
buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v23, 
v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: 
; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v11i64_to_v44f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_readfirstlane_b32 s13, v0 +; VI-NEXT: v_readfirstlane_b32 s12, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v3 +; VI-NEXT: v_readfirstlane_b32 s9, v4 +; VI-NEXT: v_readfirstlane_b32 s8, v5 +; VI-NEXT: v_readfirstlane_b32 s6, v6 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v7 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; 
VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s6, 16 +; VI-NEXT: s_lshr_b32 s40, s8, 16 +; VI-NEXT: s_lshr_b32 s41, s9, 16 +; VI-NEXT: s_lshr_b32 s42, s10, 16 +; VI-NEXT: s_lshr_b32 s43, s11, 16 +; VI-NEXT: s_lshr_b32 s44, s12, 16 +; VI-NEXT: s_lshr_b32 s45, s13, 16 +; VI-NEXT: s_lshr_b32 s46, s29, 16 +; VI-NEXT: s_lshr_b32 s47, s28, 16 +; VI-NEXT: s_lshr_b32 s56, s27, 16 +; VI-NEXT: s_lshr_b32 s57, s26, 16 +; VI-NEXT: s_lshr_b32 s58, s25, 16 +; VI-NEXT: s_lshr_b32 s59, s24, 16 +; VI-NEXT: s_lshr_b32 s60, s23, 16 +; VI-NEXT: s_lshr_b32 s61, s22, 16 +; VI-NEXT: s_lshr_b32 s62, s21, 16 +; VI-NEXT: s_lshr_b32 s63, s20, 16 +; VI-NEXT: s_lshr_b32 s72, s19, 16 +; VI-NEXT: s_lshr_b32 s73, s18, 16 +; VI-NEXT: s_lshr_b32 s74, s17, 16 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s75, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; 
VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s74, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s73, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s72, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s63, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s62, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s61, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s60, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s59, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s58, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s57, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s56, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s47, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 
0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_or_b32 s6, s6, s15 +; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s13 +; VI-NEXT: v_mov_b32_e32 v15, s12 +; VI-NEXT: v_mov_b32_e32 v16, s11 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v18, s9 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v20, s6 +; VI-NEXT: v_mov_b32_e32 v21, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v11i64_to_v44f16_scalar: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; 
GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_lshr_b32 s15, s12, 16 +; GFX9-NEXT: s_lshr_b32 s40, s11, 16 +; GFX9-NEXT: s_lshr_b32 s41, s10, 16 +; GFX9-NEXT: s_lshr_b32 s42, s9, 16 +; GFX9-NEXT: s_lshr_b32 s43, s8, 16 +; GFX9-NEXT: s_lshr_b32 s44, s7, 16 +; GFX9-NEXT: s_lshr_b32 s45, s6, 16 +; GFX9-NEXT: s_lshr_b32 s46, s29, 16 +; GFX9-NEXT: s_lshr_b32 s47, s28, 16 +; GFX9-NEXT: s_lshr_b32 s56, s27, 16 +; GFX9-NEXT: s_lshr_b32 s57, s26, 16 +; GFX9-NEXT: s_lshr_b32 s58, s25, 16 +; GFX9-NEXT: s_lshr_b32 s59, s24, 16 +; GFX9-NEXT: s_lshr_b32 s60, s23, 16 +; GFX9-NEXT: s_lshr_b32 s61, s22, 16 +; GFX9-NEXT: s_lshr_b32 s62, s21, 16 +; GFX9-NEXT: s_lshr_b32 s63, s20, 16 +; GFX9-NEXT: s_lshr_b32 s72, s19, 16 +; GFX9-NEXT: s_lshr_b32 s73, s18, 16 +; GFX9-NEXT: s_lshr_b32 s74, s17, 16 +; GFX9-NEXT: s_lshr_b32 s75, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s11, s11, s40 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-TRUE16-LABEL: bitcast_v11i64_to_v44f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v3 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: 
s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s26, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s27, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s26, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s27, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v19, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 +; +; GFX11-FAKE16-LABEL: bitcast_v11i64_to_v44f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-FAKE16-NEXT: s_mov_b32 s62, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s8, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, 
s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s62 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s6, s6, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s25, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s41, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s27, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s28, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s29, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i64> %a, splat (i64 3) + %a2 = bitcast <11 x i64> %a1 to <44 x half> + br label %end + +cmp.false: + %a3 = bitcast <11 x 
i64> %a to <44 x half> + br label %end + +end: + %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x half> %phi +} + define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v44f16_to_v11i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v55, off, 
s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; GCN-NEXT: v_or_b32_e32 v0, v50, v0 -; GCN-NEXT: v_or_b32_e32 v1, v48, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; GCN-NEXT: v_or_b32_e32 v7, v60, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v46 -; GCN-NEXT: 
v_lshlrev_b32_e32 v18, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v54 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v22, v8 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v22, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: v_or_b32_e32 v16, v45, v16 -; GCN-NEXT: v_or_b32_e32 v17, v43, v17 -; GCN-NEXT: v_or_b32_e32 v18, v41, v18 -; GCN-NEXT: v_or_b32_e32 v19, v55, v19 -; GCN-NEXT: v_or_b32_e32 v20, v53, v20 -; GCN-NEXT: v_or_b32_e32 v21, v52, v21 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: 
$vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v36 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v32 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v62 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; 
GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v60 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v59 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v57 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload 
-; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v56 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v52 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: 
v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: 
v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_or_b32_e32 v9, v11, v10 -; GCN-NEXT: v_or_b32_e32 v10, v13, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 -; GCN-NEXT: v_or_b32_e32 v12, v17, v16 -; GCN-NEXT: v_or_b32_e32 v13, v19, v18 -; GCN-NEXT: v_or_b32_e32 v14, v21, v20 -; GCN-NEXT: v_or_b32_e32 v15, v23, v22 -; GCN-NEXT: v_or_b32_e32 v16, v25, v24 -; GCN-NEXT: v_or_b32_e32 v17, v27, v26 -; GCN-NEXT: v_or_b32_e32 v18, v29, v28 -; GCN-NEXT: v_or_b32_e32 v19, v31, v30 -; GCN-NEXT: v_or_b32_e32 v20, v33, v32 -; GCN-NEXT: v_or_b32_e32 v21, v35, v34 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; 
GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44f16_to_v11i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v55, off, 
s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 
v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 
+; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_or_b32_e32 v20, v54, v20 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; 
kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, 
v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v40, v19 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, 
v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: 
v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; 
SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 
v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v41 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 
0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v54 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44f16_to_v11i64: ; VI: ; %bb.0: @@ 
-11865,7 +25281,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -11934,9 +25350,9 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v45, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -12005,7 +25421,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12095,7 +25511,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -12188,9 +25604,9 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -12262,7 +25678,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12291,7 +25707,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true 
; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -12315,7 +25731,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12371,7 +25787,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -12395,7 +25811,7 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -12415,209 +25831,1239 @@ end: ret <11 x i64> %phi } +define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44f16_to_v11i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: 
v_cvt_f16_f32_e32 v42, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; 
SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v11, v59, v11 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v55, v17 +; SI-NEXT: v_or_b32_e32 v18, v53, v18 +; SI-NEXT: v_or_b32_e32 v19, v31, v19 +; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: 
v_cvt_f32_f16_e32 v5, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; 
SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_add_f32_e32 v7, 
0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v52 +; SI-NEXT: v_mov_b32_e32 v52, v23 +; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: v_mov_b32_e32 v53, v22 +; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v43 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v54, v24 +; SI-NEXT: v_mov_b32_e32 v50, v34 +; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v55 +; SI-NEXT: v_mov_b32_e32 v55, v25 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, 
v45 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v38, v27 +; SI-NEXT: v_mov_b32_e32 v37, v28 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v28, v37 +; SI-NEXT: v_mov_b32_e32 v27, v38 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v25, v55 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v34 +; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v43, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v23, v52 +; SI-NEXT: v_mov_b32_e32 v52, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v44f16_to_v11i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; 
VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: 
v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v21, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v39, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v38, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v37, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v36, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v35, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v20, v22, v20 +; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; 
VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v44f16_to_v11i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 
s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 
v21, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: 
v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB47_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v11i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: 
v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB47_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <11 x i64> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <11 x i64> + br label %end + +end: + %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i64> %phi +} + define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f64_to_v44i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; 
GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v30, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v38, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_alignbit_b32 v23, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v24, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v25, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v27, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v28, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v29, v10, v9, 16 -; GCN-NEXT: 
v_alignbit_b32 v30, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v35, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v38, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v1, v1, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v52 -; GCN-NEXT: v_or_b32_e32 v2, v2, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v3, v3, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 8, v0 -; 
GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v5, v5, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v7, v7, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 24, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v8, v8, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v9, v9, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v10, v10, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v11, v11, v28 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 40, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v12, v12, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v14, v14, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v16, v16, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v17, v17, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v18, v18, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; 
GCN-NEXT: v_add_i32_e32 v24, vcc, 0x48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v20, v20, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x4c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x50, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x54, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11f64_to_v44i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; 
SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 
v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_alignbit_b32 v23, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v25, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v34, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v39, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v49, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v49 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f64_to_v44i16: ; VI: ; %bb.0: @@ -12647,7 +27093,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -12671,9 +27117,9 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -12708,7 +27154,7 @@ 
define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -12784,7 +27230,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -12808,9 +27254,9 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -12845,7 +27291,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -12880,7 +27326,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -12893,7 +27339,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12926,7 +27372,7 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -12950,9 +27396,9 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -12980,39 +27426,904 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 ; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <11 x double> %a1 to <44 x i16> + br label %end + +cmp.false: + %a3 = bitcast <11 x double> %a to <44 x i16> + br label %end + +end: 
+ %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x i16> %phi +} + +define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f64_to_v44i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_mov_b32_e32 v21, s16 +; SI-NEXT: v_mov_b32_e32 v22, s17 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s21 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s23 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s25 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s29 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v27, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v36, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 
v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_alignbit_b32 v23, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v27, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v36, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v49 +; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 
16, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v11f64_to_v44i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v23, s17 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v16, s26 +; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; 
VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; VI-NEXT: v_or_b32_sdwa v22, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; VI-NEXT: v_or_b32_sdwa v23, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v11, v17, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; VI-NEXT: v_or_b32_sdwa 
v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v22 +; VI-NEXT: v_mov_b32_e32 v7, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v11f64_to_v44i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v23, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: 
v_mov_b32_e32 v9, s24 +; GFX9-NEXT: v_mov_b32_e32 v10, s25 +; GFX9-NEXT: v_mov_b32_e32 v16, s26 +; GFX9-NEXT: v_mov_b32_e32 v17, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: 
v_lshl_or_b32 v17, v34, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v9, v49, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v48, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v21, v30, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; 
GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: .LBB49_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v50, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v48, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v39, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v37, 16, v10 +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v10, v36, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v11, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v35, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; +; GFX11-FAKE16-LABEL: 
bitcast_v11f64_to_v44i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 
16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: .LBB49_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v48, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v39, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v11, 16, v20 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -13031,371 +28342,364 @@ end: } define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v44i16_to_v11f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v32, v30 -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v57 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v47 -; GCN-NEXT: v_and_b32_e32 v3, 
0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v45 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v44 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v43 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v59 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v58 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v63 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v62 -; GCN-NEXT: v_or_b32_e32 v8, v8, v61 -; GCN-NEXT: v_or_b32_e32 v9, v9, v60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v15, v15, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v22 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; 
implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, 
v59 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v62 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v0, v56, v0 -; GCN-NEXT: v_or_b32_e32 v1, v57, v1 -; GCN-NEXT: v_or_b32_e32 v2, v47, v2 -; GCN-NEXT: v_or_b32_e32 v3, v45, v3 -; GCN-NEXT: v_or_b32_e32 v4, v44, v4 -; GCN-NEXT: v_or_b32_e32 v5, v43, v5 -; GCN-NEXT: v_or_b32_e32 v6, v42, v6 -; GCN-NEXT: v_or_b32_e32 v7, v41, v7 -; GCN-NEXT: v_or_b32_e32 v8, v61, v8 -; GCN-NEXT: v_or_b32_e32 v9, v60, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v22, v16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v22, v17 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v22, v18 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v22, v19 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v22, v20 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v22, v21 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; 
GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v44i16_to_v11f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: v_or_b32_e32 v2, v2, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 +; SI-NEXT: v_or_b32_e32 v4, v4, v36 +; 
SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v35 +; SI-NEXT: v_or_b32_e32 v7, v7, v41 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_or_b32_e32 v16, v16, v59 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v9, v9, v40 +; SI-NEXT: v_or_b32_e32 v10, v10, v33 +; SI-NEXT: v_or_b32_e32 
v11, v11, v32 +; SI-NEXT: v_or_b32_e32 v12, v12, v63 +; SI-NEXT: v_or_b32_e32 v13, v13, v62 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v60 +; SI-NEXT: v_or_b32_e32 v17, v17, v58 +; SI-NEXT: v_or_b32_e32 v18, v18, v57 +; SI-NEXT: v_or_b32_e32 v19, v19, v56 +; SI-NEXT: v_or_b32_e32 v20, v20, v47 +; SI-NEXT: v_or_b32_e32 v21, v21, v46 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v43, v2 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v42, v5 +; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v7, v41, v7 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: v_or_b32_e32 v16, v59, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: 
s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v9, v40, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 +; SI-NEXT: v_or_b32_e32 v11, v32, v11 +; SI-NEXT: v_or_b32_e32 v12, v63, v12 +; SI-NEXT: v_or_b32_e32 v13, v62, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v60, v15 +; SI-NEXT: v_or_b32_e32 v17, v58, v17 +; SI-NEXT: v_or_b32_e32 v18, v57, v18 +; SI-NEXT: v_or_b32_e32 v19, v56, v19 +; SI-NEXT: v_or_b32_e32 v20, v47, v20 +; SI-NEXT: v_or_b32_e32 v21, v46, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: .LBB50_4: ; %end +; 
SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44i16_to_v11f64: ; VI: ; %bb.0: @@ -13432,7 +28736,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; 
VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -13501,9 +28805,9 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v45 @@ -13572,7 +28876,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v22, 3, v32 ; VI-NEXT: v_add_u16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -13662,7 +28966,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; kill: killed $vgpr22 @@ -13755,9 +29059,9 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; 
implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -13828,7 +29132,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -13857,7 +29161,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -13881,7 +29185,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13937,7 +29241,7 @@ define <11 x double> 
@bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -13961,7 +29265,7 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -13981,422 +29285,1295 @@ end: ret <11 x double> %phi } +define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44i16_to_v11f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v50, v8 +; SI-NEXT: v_mov_b32_e32 v51, v6 +; SI-NEXT: v_mov_b32_e32 v52, v4 +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: v_mov_b32_e32 v54, v0 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v9, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: 
v_or_b32_e32 v10, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v11, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v20, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v60 +; SI-NEXT: v_or_b32_e32 v21, v0, v55 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; 
SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; 
SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 
+; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v44i16_to_v11f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 
16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 
v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; 
VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: 
s_add_i32 s28, s28, 3 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: 
.LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v44i16_to_v11f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; 
GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: 
s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 
:: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 
v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v11f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: 
s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; 
GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; 
GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <44 x i16> %a, splat (i16 3) + %a2 = bitcast <44 x i16> %a1 to <11 x double> + br label %end + +cmp.false: + %a3 = bitcast <44 x i16> %a to <11 x double> + br label %end + +end: + %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x double> %phi +} + define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v11f64_to_v44f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v23 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; 
%cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, 
v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; 
GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; 
GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v61 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v24 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v13, v56 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v46 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v63 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v44 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v40 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v54 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v52 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v50 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: buffer_load_dword 
v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: 
v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_or_b32_e32 v23, v25, v24 -; GCN-NEXT: v_or_b32_e32 v24, v27, v26 -; GCN-NEXT: v_or_b32_e32 v25, v48, v28 -; GCN-NEXT: v_or_b32_e32 v26, v38, v52 -; GCN-NEXT: v_or_b32_e32 v27, v37, v40 -; GCN-NEXT: v_or_b32_e32 v28, v35, v41 -; GCN-NEXT: v_or_b32_e32 v33, v33, v55 -; GCN-NEXT: v_or_b32_e32 v31, v31, v53 -; GCN-NEXT: v_or_b32_e32 v30, v30, v51 -; GCN-NEXT: v_or_b32_e32 v35, v46, v49 -; GCN-NEXT: v_or_b32_e32 v37, v47, v39 -; GCN-NEXT: v_or_b32_e32 v36, v56, v36 -; GCN-NEXT: v_or_b32_e32 v34, v57, v34 -; GCN-NEXT: v_or_b32_e32 v32, v58, v32 -; GCN-NEXT: v_or_b32_e32 v29, v59, v29 -; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v44, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v30, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v11f64_to_v44f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: ; 
implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; 
implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, 
v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v24 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; 
SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 
+; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 
52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f64_to_v44f16: ; VI: ; %bb.0: @@ -14426,7 +30603,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -14450,9 +30627,9 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: s_cbranch_execz .LBB52_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -14487,7 +30664,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 @@ -14563,7 +30740,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -14587,9 
+30764,9 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: .LBB52_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: s_cbranch_execz .LBB52_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -14624,7 +30801,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v51, v0, s4 @@ -14659,7 +30836,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -14672,7 +30849,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -14705,7 +30882,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, 
vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20 @@ -14729,9 +30906,9 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -14766,7 +30943,7 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100 @@ -14809,465 +30986,1516 @@ end: ret <44 x half> %phi } +define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v11f64_to_v44f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: v_readfirstlane_b32 s11, v2 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: 
v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_readfirstlane_b32 s7, v6 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: s_and_b64 s[12:13], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v8 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s12, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 +; SI-NEXT: s_lshr_b32 s12, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s12 +; SI-NEXT: s_lshr_b32 s12, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s12 +; SI-NEXT: s_lshr_b32 s12, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s12 +; SI-NEXT: s_lshr_b32 s12, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: s_lshr_b32 s12, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s12 +; SI-NEXT: s_lshr_b32 s12, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s12 +; SI-NEXT: s_lshr_b32 s12, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s12 +; SI-NEXT: s_lshr_b32 s12, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: s_lshr_b32 s12, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: s_lshr_b32 s12, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 +; SI-NEXT: s_lshr_b32 s12, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s12 +; SI-NEXT: s_lshr_b32 s12, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s12 +; SI-NEXT: s_lshr_b32 
s12, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s12 +; SI-NEXT: s_lshr_b32 s12, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s12 +; SI-NEXT: s_lshr_b32 s12, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s12 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s12 +; SI-NEXT: s_lshr_b32 s12, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s12 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s12 +; SI-NEXT: s_lshr_b32 s12, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s12 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s12 +; SI-NEXT: s_lshr_b32 s12, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[36:37], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[33:34], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[29:30], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[25:26], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[21:22], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 +; SI-NEXT: 
v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 
v39, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v39, v39, v48 +; SI-NEXT: 
v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v35, v35, v36 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; 
SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 
v11, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], 
s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v11f64_to_v44f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v23, s17 +; VI-NEXT: v_mov_b32_e32 v20, s18 
+; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_mov_b32_e32 v9, s24 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: v_mov_b32_e32 v16, s26 +; VI-NEXT: v_mov_b32_e32 v17, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; 
VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; VI-NEXT: v_or_b32_sdwa v22, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; VI-NEXT: v_or_b32_sdwa v23, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; VI-NEXT: v_or_b32_sdwa v12, v12, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v49 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v10, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v11, v17, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: 
v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v22 +; VI-NEXT: v_mov_b32_e32 v7, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v11f64_to_v44f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v23, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s21 +; GFX9-NEXT: v_mov_b32_e32 v14, s22 +; GFX9-NEXT: v_mov_b32_e32 v15, s23 +; GFX9-NEXT: v_mov_b32_e32 v9, s24 +; GFX9-NEXT: v_mov_b32_e32 v10, s25 +; GFX9-NEXT: v_mov_b32_e32 v16, s26 +; GFX9-NEXT: v_mov_b32_e32 v17, s27 +; 
GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v35, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: 
v_lshl_or_b32 v29, v29, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v9, v49, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v48, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v21, v30, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-TRUE16-LABEL: 
bitcast_v11f64_to_v44f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 
16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-TRUE16-NEXT: .LBB53_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v50, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v48, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v39, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v37, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v36, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v11, 16, v20 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v35, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v33, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 +; +; GFX11-FAKE16-LABEL: bitcast_v11f64_to_v44f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s16 :: v_dual_mov_b32 v6, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s22 :: v_dual_mov_b32 v8, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v15, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-FAKE16-NEXT: .LBB53_3: ; %end +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v48, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v39, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v36, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v11, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v15 +; GFX11-FAKE16-NEXT: 
v_lshl_or_b32 v14, v33, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v29, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v28, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v27, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <11 x double> %a1 to <44 x half> + br label %end + +cmp.false: + %a3 = bitcast <11 x double> %a to <44 x half> + br label %end + +end: + %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x half> %phi +} + define <11 x 
double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v44f16_to_v11f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; 
GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v19 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) 
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 
-; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; GCN-NEXT: v_or_b32_e32 v0, v50, v0 -; GCN-NEXT: v_or_b32_e32 v1, v48, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; GCN-NEXT: v_or_b32_e32 v2, v38, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; GCN-NEXT: v_or_b32_e32 v3, v36, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v35 -; GCN-NEXT: v_or_b32_e32 v4, v34, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v33 -; GCN-NEXT: v_or_b32_e32 v5, v32, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v63 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; GCN-NEXT: v_or_b32_e32 v7, v60, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v57 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v54 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v22, 
v8 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v22, v9 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v22, v10 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v22, v11 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v22, v12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v22, v13 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v22, v14 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v22, v15 -; GCN-NEXT: v_or_b32_e32 v16, v45, v16 -; GCN-NEXT: v_or_b32_e32 v17, v43, v17 -; GCN-NEXT: v_or_b32_e32 v18, v41, v18 -; GCN-NEXT: v_or_b32_e32 v19, v55, v19 -; GCN-NEXT: v_or_b32_e32 v20, v53, v20 -; GCN-NEXT: v_or_b32_e32 v21, v52, v21 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; 
implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; kill: killed $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v36 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v32 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v62 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v60 -; GCN-NEXT: v_add_f32_e32 v7, 
0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v59 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v57 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v56 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v52 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 
0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_or_b32_e32 v9, v11, v10 -; GCN-NEXT: v_or_b32_e32 v10, v13, v12 -; GCN-NEXT: v_or_b32_e32 v11, v15, v14 
-; GCN-NEXT: v_or_b32_e32 v12, v17, v16 -; GCN-NEXT: v_or_b32_e32 v13, v19, v18 -; GCN-NEXT: v_or_b32_e32 v14, v21, v20 -; GCN-NEXT: v_or_b32_e32 v15, v23, v22 -; GCN-NEXT: v_or_b32_e32 v16, v25, v24 -; GCN-NEXT: v_or_b32_e32 v17, v27, v26 -; GCN-NEXT: v_or_b32_e32 v18, v29, v28 -; GCN-NEXT: v_or_b32_e32 v19, v31, v30 -; GCN-NEXT: v_or_b32_e32 v20, v33, v32 -; GCN-NEXT: v_or_b32_e32 v21, v35, v34 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44f16_to_v11f64: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], 
s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v57 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, 
v51 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v38, v2 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_or_b32_e32 v5, v32, v5 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_or_b32_e32 v20, v54, v20 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v40, v19 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, 
v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: 
v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt 
vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v41 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v55 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v54 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, 
v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44f16_to_v11f64: ; VI: ; %bb.0: @@ -15304,7 +32532,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v21, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v21, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -15373,9 +32601,9 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v21, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v45, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -15444,7 +32672,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -15534,7 +32762,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: ; implicit-def: $vgpr22 ; 
GFX9-NEXT: ; kill: killed $vgpr22 @@ -15627,9 +32855,9 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload @@ -15701,7 +32929,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -15730,7 +32958,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -15754,7 +32982,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: 
.LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15810,7 +33038,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -15834,7 +33062,7 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -15854,607 +33082,1630 @@ end: ret <11 x double> %phi } +define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44f16_to_v11f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 +; SI-NEXT: 
v_cvt_f16_f32_e32 v23, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 +; 
SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 +; SI-NEXT: v_or_b32_e32 v6, v26, v6 +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v9, v49, v9 +; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v11, v59, v11 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v43, v15 +; SI-NEXT: v_or_b32_e32 v16, v41, v16 +; SI-NEXT: v_or_b32_e32 v17, v55, v17 +; SI-NEXT: v_or_b32_e32 v18, v53, v18 +; SI-NEXT: v_or_b32_e32 v19, v31, v19 +; SI-NEXT: v_or_b32_e32 v20, v24, v20 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: 
v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; 
SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v59 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v45 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v40 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: 
v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 
offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v52 +; SI-NEXT: v_mov_b32_e32 v52, v23 +; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: v_mov_b32_e32 v53, v22 +; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v43 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v54, v24 +; SI-NEXT: v_mov_b32_e32 v50, v34 +; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v55 +; SI-NEXT: v_mov_b32_e32 v55, v25 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v38, v27 +; SI-NEXT: v_mov_b32_e32 v37, v28 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v28, v37 +; SI-NEXT: v_mov_b32_e32 v27, v38 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v25, v55 +; SI-NEXT: v_mov_b32_e32 v55, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v34 +; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v43, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v35 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v23, v52 +; SI-NEXT: v_mov_b32_e32 v52, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v44f16_to_v11f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_mov_b32_e32 v32, v7 +; VI-NEXT: v_mov_b32_e32 v33, v6 +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_mov_b32_e32 v36, v3 +; VI-NEXT: v_mov_b32_e32 v37, v2 +; VI-NEXT: v_mov_b32_e32 v38, v1 +; VI-NEXT: v_mov_b32_e32 v39, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: 
v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: 
v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v21, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v39, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v38, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v37, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v36, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v35, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v33, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v20, v22, v20 +; VI-NEXT: v_add_f16_sdwa v21, v32, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch 
.LBB55_2 +; +; GFX9-LABEL: bitcast_v44f16_to_v11f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v7 +; GFX9-NEXT: v_mov_b32_e32 v33, v6 +; GFX9-NEXT: v_mov_b32_e32 v34, v5 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v36, v3 +; GFX9-NEXT: v_mov_b32_e32 v37, v2 +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_mov_b32_e32 v39, v0 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 
v17, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, 
v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v33, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v1 :: v_dual_mov_b32 v35, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, 
s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, s73 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: 
v_dual_mov_b32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v35.h +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s43 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v17, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB55_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; 
GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v11f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, 
s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s26, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s15 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s0 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, 
v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v36 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB55_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 + %cmp = icmp eq i32 %b, 0 
+ br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <11 x double> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <11 x double> + br label %end + +end: + %phi = phi <11 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x double> %phi +} + define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v44i16_to_v44f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; GCN-NEXT: 
buffer_load_dword v54, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; kill: killed $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; kill: killed $vgpr35 -; 
GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; kill: killed $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; kill: killed $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; kill: killed $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; kill: killed $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: 
v_cvt_f32_f16_e32 v60, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v37, 
v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v51 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v53 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v55 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: 
$vgpr47 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v30 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v53 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 
v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v59 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v60 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v61 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v62 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v63 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v31 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v32 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v34 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v35 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v36 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v37 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; 
GCN-NEXT: v_add_i32_e32 v36, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v38 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v58 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v57 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 
-; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, 
v58 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_or_b32_e32 v23, v25, v24 -; GCN-NEXT: v_or_b32_e32 v24, v27, v26 -; GCN-NEXT: v_or_b32_e32 v25, v29, v28 -; GCN-NEXT: v_or_b32_e32 v26, v32, v31 -; GCN-NEXT: v_or_b32_e32 v27, v35, v34 -; GCN-NEXT: v_or_b32_e32 v28, v38, v37 -; GCN-NEXT: v_or_b32_e32 v29, v51, v39 -; GCN-NEXT: v_or_b32_e32 v31, v53, v48 -; GCN-NEXT: v_or_b32_e32 v32, v55, v49 -; GCN-NEXT: v_or_b32_e32 v34, v41, v40 -; GCN-NEXT: v_or_b32_e32 v35, v43, v42 -; GCN-NEXT: v_or_b32_e32 v37, v45, v44 -; GCN-NEXT: v_or_b32_e32 v38, v47, v46 -; GCN-NEXT: v_or_b32_e32 v39, v57, v56 -; GCN-NEXT: v_or_b32_e32 v48, v59, v58 -; GCN-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; 
GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44i16_to_v44f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; 
implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; 
implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 +; SI-NEXT: s_waitcnt 
vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: 
v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: 
v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, 
s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 
v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; 
SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: 
bitcast_v44i16_to_v44f16: ; VI: ; %bb.0: @@ -16485,7 +34736,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v22, 3, v22 @@ -16531,7 +34782,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: v_add_u16_e32 v23, 3, v23 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -16608,7 +34859,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v21, v51, v21, s6 @@ -16677,7 +34928,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v22, v0, s4 @@ -16712,7 +34963,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: 
s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -16736,7 +34987,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16770,7 +35021,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 @@ -16838,7 +35089,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v24, v1, 0x5040100 @@ -16880,404 +35131,1630 @@ end: ret <44 x half> %phi } +define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44i16_to_v44f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; 
SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 +; SI-NEXT: buffer_store_dword 
v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 +; 
SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_add_i32_e32 v29, 
vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: 
v_cvt_f32_f16_e32 v60, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 
v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; 
implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v44i16_to_v44f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: 
s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: s_or_b32 s12, s18, s12 
+; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_or_b32_sdwa v14, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; VI-NEXT: s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: 
v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v44i16_to_v44f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v13, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v10, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: 
s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v9, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v8, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v23, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v22, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v29, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v28, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v26, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v25, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s23 +; GFX9-NEXT: v_mov_b32_e32 v22, s22 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: v_mov_b32_e32 v26, s18 +; GFX9-NEXT: v_mov_b32_e32 v25, s17 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s43 +; GFX9-NEXT: v_mov_b32_e32 v31, s42 +; GFX9-NEXT: v_mov_b32_e32 v32, s41 +; GFX9-NEXT: v_mov_b32_e32 v33, s40 +; GFX9-NEXT: v_mov_b32_e32 v34, s15 +; GFX9-NEXT: v_mov_b32_e32 v35, s14 +; GFX9-NEXT: v_mov_b32_e32 v36, s13 +; GFX9-NEXT: v_mov_b32_e32 v37, s12 +; 
GFX9-NEXT: v_mov_b32_e32 v38, s11 +; GFX9-NEXT: v_mov_b32_e32 v39, s10 +; GFX9-NEXT: v_mov_b32_e32 v48, s9 +; GFX9-NEXT: v_mov_b32_e32 v49, s8 +; GFX9-NEXT: v_mov_b32_e32 v50, s7 +; GFX9-NEXT: v_mov_b32_e32 v51, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v24, v51, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v49, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v48, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v39, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v38, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v22, v37, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v36, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v35, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v34, 16, v9 +; 
GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v32, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v31, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v30, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v44f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v5, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s14, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s13, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s12, s9 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, 
s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s11, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s9, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, 
v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v48.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v49.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v51.l +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s8 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s7 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v28, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v38, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v39, 16, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v29, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.h +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v27, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v44f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: 
v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v30, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v12, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s27 :: v_dual_mov_b32 v14, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s25 :: v_dual_mov_b32 v16, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s23 :: v_dual_mov_b32 v8, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s19 :: v_dual_mov_b32 v4, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v23, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s1 :: v_dual_mov_b32 v25, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s45 :: v_dual_mov_b32 v27, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s13 :: v_dual_mov_b32 v35, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s11 :: v_dual_mov_b32 v37, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s9 :: v_dual_mov_b32 v39, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s6 :: v_dual_mov_b32 v49, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v51, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v50, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v49, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v48, 16, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v38, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v39, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v37, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v30, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v29, 16, v14 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v28, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v27, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <44 x i16> %a, splat (i16 3) + %a2 = bitcast <44 x i16> %a1 to <44 x half> + br label %end + +cmp.false: + %a3 = bitcast <44 x i16> %a to <44 x half> + br label %end + +end: + %phi = phi <44 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x half> %phi +} + define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v44f16_to_v44i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(5) -; 
GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v30 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v41 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; GCN-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v42 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v6 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v27 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_or_b32_e32 v4, v4, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: 
v_lshlrev_b32_e32 v27, 16, v9 -; GCN-NEXT: v_or_b32_e32 v7, v7, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v13 -; GCN-NEXT: v_or_b32_e32 v11, v11, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v10 -; GCN-NEXT: v_or_b32_e32 v8, v8, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v14 -; GCN-NEXT: v_or_b32_e32 v12, v12, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, 
v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 -; GCN-NEXT: v_or_b32_e32 v31, v27, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v33 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v34 -; GCN-NEXT: v_or_b32_e32 v33, v27, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v35 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v36 -; GCN-NEXT: v_or_b32_e32 v35, v27, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: 
v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: 
v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v54, v28, v27 -; GCN-NEXT: v_or_b32_e32 v52, v30, v29 -; GCN-NEXT: v_or_b32_e32 v50, v50, v53 -; GCN-NEXT: v_or_b32_e32 v48, v48, v51 -; GCN-NEXT: v_or_b32_e32 v38, v38, v49 -; GCN-NEXT: v_or_b32_e32 v37, v37, v39 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: v_or_b32_e32 v18, v18, v23 -; GCN-NEXT: v_or_b32_e32 v22, v22, v26 -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: v_or_b32_e32 v17, v17, v21 -; GCN-NEXT: v_alignbit_b32 v40, v35, v27, 16 -; GCN-NEXT: v_alignbit_b32 v55, v33, v29, 16 -; GCN-NEXT: v_alignbit_b32 v53, v31, v53, 16 -; GCN-NEXT: v_alignbit_b32 v51, v15, v51, 16 -; GCN-NEXT: v_alignbit_b32 v49, v12, v49, 16 -; GCN-NEXT: v_alignbit_b32 v39, v8, v39, 16 -; GCN-NEXT: v_alignbit_b32 v25, v3, v25, 16 -; GCN-NEXT: v_alignbit_b32 v23, v11, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v7, v26, 16 -; GCN-NEXT: v_alignbit_b32 v24, v4, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v1, v21, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v40 -; GCN-NEXT: v_or_b32_e32 v27, v27, v28 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v36 -; GCN-NEXT: v_or_b32_e32 v28, v28, v29 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v55 -; GCN-NEXT: v_or_b32_e32 v30, v30, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v33, v33, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v53 -; GCN-NEXT: v_or_b32_e32 v36, v36, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; 
GCN-NEXT: v_or_b32_e32 v31, v31, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v48, v48, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v38, v38, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v12, v12, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v37, v37, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v8, v8, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v20, v20, v25 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v18, v18, v23 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v11, v11, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v22, v22, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, 
v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v4, v4, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_or_b32_e32 v17, v17, v21 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, 
v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v44f16_to_v44i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v22 
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v45 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v46 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v57 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; 
SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v8, v8, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, 
v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v31, v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_or_b32_e32 v34, v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v37, v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v48, v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: 
v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v51, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v55 +; SI-NEXT: v_or_b32_e32 v50, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v39, v30, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v36 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v16, 
v16 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v36, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v33, v33, v30 +; SI-NEXT: v_or_b32_e32 v21, v21, v52 +; SI-NEXT: v_or_b32_e32 v17, v17, v25 +; SI-NEXT: v_or_b32_e32 v13, v13, v24 +; SI-NEXT: v_or_b32_e32 v7, v7, v22 +; SI-NEXT: v_or_b32_e32 v16, v16, v23 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_alignbit_b32 v41, v48, v26, 16 +; SI-NEXT: v_alignbit_b32 v40, v37, v27, 16 +; SI-NEXT: v_alignbit_b32 v55, v34, v28, 16 +; SI-NEXT: v_alignbit_b32 v54, v31, v29, 16 +; SI-NEXT: v_alignbit_b32 v53, v19, v30, 16 +; SI-NEXT: v_alignbit_b32 v52, v14, v52, 16 +; SI-NEXT: v_alignbit_b32 v25, v11, v25, 16 +; SI-NEXT: v_alignbit_b32 v24, v5, v24, 16 +; SI-NEXT: v_alignbit_b32 v22, v1, v22, 16 +; SI-NEXT: v_alignbit_b32 v23, v8, v23, 16 +; SI-NEXT: v_alignbit_b32 v18, v2, v18, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v26, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v54 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, 
vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; 
SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v44f16_to_v44i16: ; VI: ; %bb.0: @@ -17308,7 +36785,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: 
v_add_f16_e32 v22, 0x200, v22 @@ -17354,7 +36831,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 ; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -17431,7 +36908,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v21, v51, v21, s6 @@ -17501,7 +36978,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v22, v0, s4 @@ -17536,7 +37013,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -17560,7 +37037,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17594,7 +37071,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100 @@ -17662,7 +37139,7 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v19 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v24, v1, 0x5040100 @@ -17703,3 +37180,1129 @@ end: %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <44 x i16> %phi } + +define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v44f16_to_v44i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 
v31, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 
+; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v3, v3, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, 
v28 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v9, v9, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_or_b32_e32 v13, v13, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_or_b32_e32 v31, v31, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; 
SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v34, v34, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v37, v37, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v18, v18, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, 
v52 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v27, v27, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v30 +; SI-NEXT: v_or_b32_e32 v23, v23, v29 +; SI-NEXT: v_or_b32_e32 v20, v20, v28 +; SI-NEXT: v_or_b32_e32 v39, v39, v51 +; SI-NEXT: v_or_b32_e32 v36, v36, v50 +; SI-NEXT: v_or_b32_e32 v33, v33, v49 +; SI-NEXT: v_or_b32_e32 v15, v15, v48 +; SI-NEXT: v_or_b32_e32 v11, v11, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_alignbit_b32 v52, v24, v52, 16 +; SI-NEXT: v_alignbit_b32 v30, v21, v30, 16 +; SI-NEXT: v_alignbit_b32 v29, v18, v29, 16 +; SI-NEXT: v_alignbit_b32 v28, v37, v28, 16 +; SI-NEXT: v_alignbit_b32 v51, v34, v51, 16 +; SI-NEXT: v_alignbit_b32 v50, v31, v50, 16 +; SI-NEXT: v_alignbit_b32 v49, v13, v49, 16 +; SI-NEXT: v_alignbit_b32 v48, v9, v48, 16 +; SI-NEXT: v_alignbit_b32 v17, v6, v17, 16 +; SI-NEXT: v_alignbit_b32 v16, v3, v16, 16 +; SI-NEXT: v_alignbit_b32 v12, v1, v12, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v27, v27, v52 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: 
v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 +; SI-NEXT: 
buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 
0x48, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v44f16_to_v44i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v30, 
0x200 +; VI-NEXT: v_add_f16_e32 v24, s16, v30 +; VI-NEXT: v_add_f16_e32 v51, s43, v30 +; VI-NEXT: v_add_f16_e32 v25, s17, v30 +; VI-NEXT: v_add_f16_e32 v50, s42, v30 +; VI-NEXT: v_add_f16_e32 v26, s18, v30 +; VI-NEXT: v_add_f16_e32 v49, s41, v30 +; VI-NEXT: v_add_f16_e32 v27, s19, v30 +; VI-NEXT: v_add_f16_e32 v48, s40, v30 +; VI-NEXT: v_add_f16_e32 v28, s20, v30 +; VI-NEXT: v_add_f16_e32 v39, s15, v30 +; VI-NEXT: v_add_f16_e32 v29, s21, v30 +; VI-NEXT: v_add_f16_e32 v38, s14, v30 +; VI-NEXT: v_add_f16_e32 v22, s22, v30 +; VI-NEXT: v_add_f16_e32 v37, s13, v30 +; VI-NEXT: v_add_f16_e32 v23, s23, v30 +; VI-NEXT: v_add_f16_e32 v36, s12, v30 +; VI-NEXT: v_add_f16_e32 v8, s24, v30 +; VI-NEXT: v_add_f16_e32 v35, s11, v30 +; VI-NEXT: v_add_f16_e32 v9, s25, v30 +; VI-NEXT: v_add_f16_e32 v34, s10, v30 +; VI-NEXT: v_add_f16_e32 v10, s26, v30 +; VI-NEXT: v_add_f16_e32 v33, s9, v30 +; VI-NEXT: v_add_f16_e32 v11, s27, v30 +; VI-NEXT: v_add_f16_e32 v32, s8, v30 +; VI-NEXT: v_add_f16_e32 v12, s28, v30 +; VI-NEXT: v_add_f16_e32 v31, s7, v30 +; VI-NEXT: v_add_f16_e32 v13, s29, v30 +; VI-NEXT: v_add_f16_e32 v30, s6, v30 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v30, s6 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v31, s7 +; VI-NEXT: 
v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v32, s8 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v33, s9 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v34, s10 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v35, s11 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v36, s12 +; VI-NEXT: v_mov_b32_e32 v23, s23 +; VI-NEXT: v_mov_b32_e32 v37, s13 +; VI-NEXT: v_mov_b32_e32 v22, s22 +; VI-NEXT: v_mov_b32_e32 v38, s14 +; VI-NEXT: v_mov_b32_e32 v29, s21 +; VI-NEXT: v_mov_b32_e32 v39, s15 +; VI-NEXT: v_mov_b32_e32 v28, s20 +; VI-NEXT: v_mov_b32_e32 v48, s40 +; VI-NEXT: v_mov_b32_e32 v27, s19 +; VI-NEXT: v_mov_b32_e32 v49, s41 +; VI-NEXT: v_mov_b32_e32 v26, s18 +; VI-NEXT: v_mov_b32_e32 v50, s42 +; VI-NEXT: v_mov_b32_e32 v25, s17 +; VI-NEXT: v_mov_b32_e32 v51, s43 +; VI-NEXT: v_mov_b32_e32 v24, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 
v36, 16, v36 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: v_or_b32_sdwa v24, v24, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: v_or_b32_sdwa v8, v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v30 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v24 +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v22 +; VI-NEXT: v_mov_b32_e32 v7, v23 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v44f16_to_v44i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 
v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v11, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v10, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v9, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v8, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v23, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v22, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v29, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v28, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 v27, 
s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v26, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v25, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v24, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s23 +; GFX9-NEXT: v_mov_b32_e32 v22, s22 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: v_mov_b32_e32 v26, s18 +; GFX9-NEXT: v_mov_b32_e32 v25, s17 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s43 +; GFX9-NEXT: v_mov_b32_e32 
v31, s42 +; GFX9-NEXT: v_mov_b32_e32 v32, s41 +; GFX9-NEXT: v_mov_b32_e32 v33, s40 +; GFX9-NEXT: v_mov_b32_e32 v34, s15 +; GFX9-NEXT: v_mov_b32_e32 v35, s14 +; GFX9-NEXT: v_mov_b32_e32 v36, s13 +; GFX9-NEXT: v_mov_b32_e32 v37, s12 +; GFX9-NEXT: v_mov_b32_e32 v38, s11 +; GFX9-NEXT: v_mov_b32_e32 v39, s10 +; GFX9-NEXT: v_mov_b32_e32 v48, s9 +; GFX9-NEXT: v_mov_b32_e32 v49, s8 +; GFX9-NEXT: v_mov_b32_e32 v50, s7 +; GFX9-NEXT: v_mov_b32_e32 v51, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v24, v51, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v49, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v48, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v39, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v38, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v22, v37, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v36, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 
+; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v35, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v34, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v33, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v32, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v31, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v30, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v24 +; GFX9-NEXT: v_mov_b32_e32 v1, v25 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v22 +; GFX9-NEXT: v_mov_b32_e32 v7, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v44i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: 
s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v5, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s14, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s13, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s12, s9 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, 
s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s11, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s9, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 
16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v48.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v49.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v51.l +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s40 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s7 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v34, 16, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v28, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v38, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v39, 16, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v29, 16, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v19, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v20, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v21, 16, v14 +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v15, v26, 16, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v27, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v26, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v44i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, 
s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s29 :: v_dual_mov_b32 v12, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s27 :: v_dual_mov_b32 v14, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s25 :: v_dual_mov_b32 v16, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s23 :: v_dual_mov_b32 v8, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s19 :: v_dual_mov_b32 v4, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v6, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s3 :: v_dual_mov_b32 v23, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s1 :: v_dual_mov_b32 v25, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s45 :: v_dual_mov_b32 v27, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s43 :: v_dual_mov_b32 v29, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s15 :: v_dual_mov_b32 v33, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s13 
:: v_dual_mov_b32 v35, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s11 :: v_dual_mov_b32 v37, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s9 :: v_dual_mov_b32 v39, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s6 :: v_dual_mov_b32 v49, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s4 :: v_dual_mov_b32 v51, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v51, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v50, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v49, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v48, 16, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v38, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v39, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v37, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v33, 16, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v30, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v29, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v28, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v27, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v26, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v22 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <44 x half> %a, splat (half 0xH0200) + %a2 = bitcast <44 x half> %a1 to <44 x i16> + br label %end + +cmp.false: + %a3 = bitcast <44 x half> %a to <44 x i16> + br label %end + +end: + %phi = phi <44 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <44 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index b1a194f8a3a7d..79adc25903ac7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -1,48 +1,48 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga 
< %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <24 x float> @bitcast_v24i32_to_v24f32(<24 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i32_to_v24f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i32_to_v24f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i32_to_v24f32: ; VI: ; %bb.0: @@ -172,43 +172,300 @@ end: ret <24 x float> %phi } +define inreg <24 x float> @bitcast_v24i32_to_v24f32_scalar(<24 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i32_to_v24f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: 
v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v24i32_to_v24f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, 
vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v24i32_to_v24f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: 
v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v24i32_to_v24f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: 
v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i32> %a, splat (i32 3) + %a2 = bitcast <24 x i32> %a1 to <24 x float> + br label %end + +cmp.false: + %a3 = bitcast <24 x i32> %a to <24 x float> + br label %end + +end: + %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x float> %phi +} + define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f32_to_v24i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 
v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f32_to_v24i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 
v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v24i32: ; VI: ; %bb.0: @@ -217,7 +474,7 @@ define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -243,7 +500,7 @@ define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -254,7 +511,7 @@ define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -280,7 +537,7 @@ define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -292,7 +549,7 @@ define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz 
.LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 @@ -306,7 +563,7 @@ define <24 x i32> @bitcast_v24f32_to_v24i32(<24 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -326,43 +583,288 @@ end: ret <24 x i32> %phi } +define inreg <24 x i32> @bitcast_v24f32_to_v24i32_scalar(<24 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f32_to_v24i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; 
%cmp.true +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v24f32_to_v24i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 
v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v24f32_to_v24i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: 
v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v24f32_to_v24i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, 
label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <24 x i32> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <24 x i32> + br label %end + +end: + %phi = phi <24 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i32> %phi +} + define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i32_to_v12i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i32_to_v12i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i32_to_v12i64: ; VI: ; %bb.0: @@ -371,7 +873,7 @@ define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 @@ -397,7 +899,7 @@ define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 
; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -408,7 +910,7 @@ define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 @@ -434,7 +936,7 @@ define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -446,7 +948,7 @@ define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 @@ -472,7 +974,7 @@ define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -492,93 +994,350 @@ end: ret <12 x i64> %phi } -define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i64_to_v24i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; 
GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <12 x i64> @bitcast_v24i32_to_v12i64_scalar(<24 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i32_to_v12i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 
v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 ; -; VI-LABEL: bitcast_v12i64_to_v24i32: +; VI-LABEL: bitcast_v24i32_to_v12i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: 
v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB5_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 ; -; GFX9-LABEL: bitcast_v12i64_to_v24i32: +; GFX9-LABEL: bitcast_v24i32_to_v12i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v24i32_to_v12i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: 
v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; 
GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i32> %a, splat (i32 3) + %a2 = bitcast <24 x i32> %a1 to <12 x i64> + br label %end + +cmp.false: + %a3 = bitcast <24 x i32> %a to <12 x i64> + br label %end + +end: + %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i64> %phi +} + +define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v12i64_to_v24i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i64_to_v24i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i64_to_v24i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc @@ -600,7 +1359,7 @@ define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -612,7 +1371,7 @@ define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -644,7 +1403,7 @@ define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -664,43 +1423,306 @@ end: ret <24 x i32> %phi } +define inreg <24 x i32> @bitcast_v12i64_to_v24i32_scalar(<12 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i64_to_v24i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: 
v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; 
SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v12i64_to_v24i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; 
VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v12i64_to_v24i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: 
v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v12i64_to_v24i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: 
v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, 
vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i64> %a, splat (i64 3) + %a2 = bitcast <12 x i64> %a1 to <24 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x i64> %a to <24 x i32> + br label %end + +end: + %phi = phi <24 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i32> %phi +} + define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i32_to_v12f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, 
vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i32_to_v12f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i32_to_v12f64: ; VI: ; %bb.0: @@ -709,7 +1731,7 @@ define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: 
s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 @@ -735,7 +1757,7 @@ define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -746,7 +1768,7 @@ define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 @@ -772,7 +1794,7 @@ define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -784,7 +1806,7 @@ define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 @@ -810,7 +1832,7 @@ define <12 x double> @bitcast_v24i32_to_v12f64(<24 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -830,54 +1852,311 @@ end: ret <12 x double> %phi } -define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f64_to_v24i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <12 x double> @bitcast_v24i32_to_v12f64_scalar(<24 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i32_to_v12f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: 
v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 ; -; VI-LABEL: bitcast_v12f64_to_v24i32: +; VI-LABEL: bitcast_v24i32_to_v12f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; 
VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, 
v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v24i32_to_v12f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; 
GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v24i32_to_v12f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 
+; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i32> %a, splat (i32 3) + %a2 = bitcast <24 x i32> %a1 to <12 x double> + br label %end + +cmp.false: + %a3 = bitcast <24 x i32> %a to <12 x double> + br label %end + +end: + %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x double> %phi +} + +define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v12f64_to_v24i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f64_to_v24i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -888,7 +2167,7 @@ define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -902,7 +2181,7 @@ define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -914,7 +2193,7 @@ define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -928,7 +2207,7 @@ define <24 x i32> @bitcast_v12f64_to_v24i32(<12 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -948,269 +2227,474 @@ end: ret <24 x i32> %phi } +define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f64_to_v24i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: 
v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v12f64_to_v24i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 
+; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v12f64_to_v24i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: 
v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v12f64_to_v24i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, 
s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <12 x double> %a1 to <24 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x double> %a to <24 x i32> + br label %end + +end: + %phi = phi <24 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i32> %phi +} + define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i32_to_v48i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v30, v16, v15, 16 -; GCN-NEXT: 
v_alignbit_b32 v32, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v51, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, 
vcc, 3, v23 -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v30, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v51, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, 
v38 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v53, v41, v53 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v3, v3, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v4, v4, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v5, v5, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v6, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: 
v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v8, v8, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v9, v9, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v10, v10, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v11, v11, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v12, v12, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_or_b32_e32 v14, v14, v39 -; GCN-NEXT: v_or_b32_e32 v15, v15, v30 -; GCN-NEXT: v_or_b32_e32 v16, v16, v37 -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: v_or_b32_e32 v18, v18, v35 -; GCN-NEXT: v_or_b32_e32 v19, v19, v27 -; GCN-NEXT: v_or_b32_e32 v20, v20, v33 -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v23, v23, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v29 -; GCN-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v56, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24i32_to_v48i16: +; SI: 
; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v36, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: 
v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v36, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 
v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24i32_to_v48i16: ; VI: ; %bb.0: @@ -1242,7 +2726,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -1268,9 +2752,9 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 @@ -1320,7 +2804,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 
16, v54 @@ -1402,7 +2886,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -1428,9 +2912,9 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 @@ -1480,7 +2964,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -1517,7 +3001,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 @@ -1543,7 +3027,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1578,7 +3062,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -1604,9 +3088,9 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 @@ -1656,7 +3140,7 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -1701,419 +3185,1413 @@ end: ret <48 x i16> %phi } +define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i32 
inreg %b) { +; SI-LABEL: bitcast_v24i32_to_v48i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v9 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s27, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s25, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s23, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s21, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s19, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s17, v12, 16 +; SI-NEXT: s_lshr_b32 s40, s6, 16 +; SI-NEXT: s_lshr_b32 s41, s8, 16 +; SI-NEXT: s_lshr_b32 s42, s10, 16 +; SI-NEXT: s_lshr_b32 s43, s12, 16 +; SI-NEXT: s_lshr_b32 s44, s14, 16 +; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_lshr_b32 s56, s23, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_lshr_b32 s58, s19, 16 +; 
SI-NEXT: s_lshr_b32 s59, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s27, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s25, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s23, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s21, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s19, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s17, v12, 16 +; SI-NEXT: s_lshr_b32 s40, s6, 16 +; SI-NEXT: s_lshr_b32 s41, s8, 16 +; SI-NEXT: s_lshr_b32 s42, s10, 16 +; SI-NEXT: s_lshr_b32 s43, s12, 16 +; SI-NEXT: 
s_lshr_b32 s44, s14, 16 +; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_lshr_b32 s56, s23, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_lshr_b32 s58, s19, 16 +; SI-NEXT: s_lshr_b32 s59, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; 
SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 
v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 
v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v24i32_to_v48i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v0 +; VI-NEXT: v_readfirstlane_b32 s14, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: v_readfirstlane_b32 s12, v3 +; VI-NEXT: v_readfirstlane_b32 s11, v4 +; VI-NEXT: v_readfirstlane_b32 s10, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s8, v7 +; VI-NEXT: v_readfirstlane_b32 s6, v8 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v9 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; 
VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; 
VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s79, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s78, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s77, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s76, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s75, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s74, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s73, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s72, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s63, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s62, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s61, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s60, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s59, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s58, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s28, s57, 16 +; VI-NEXT: s_or_b32 s15, s15, s28 +; 
VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s28, s56, 16 +; VI-NEXT: s_or_b32 s14, s14, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s47, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_or_b32 s7, s7, s28 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s15 +; VI-NEXT: v_mov_b32_e32 v15, s14 +; VI-NEXT: v_mov_b32_e32 v16, s13 +; VI-NEXT: v_mov_b32_e32 v17, s12 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_mov_b32_e32 v19, s10 +; VI-NEXT: v_mov_b32_e32 v20, s9 +; VI-NEXT: v_mov_b32_e32 v21, s8 +; VI-NEXT: v_mov_b32_e32 v22, s6 +; VI-NEXT: v_mov_b32_e32 v23, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; 
VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v24i32_to_v48i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 
s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, 
s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s40 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: 
v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v24i32_to_v48i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 
+; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v24i32_to_v48i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-FAKE16-NEXT: s_mov_b32 s74, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, 
s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 
s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s15 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i32> %a, splat (i32 3) + %a2 = bitcast <24 x i32> %a1 to <48 x i16> + br label %end + +cmp.false: + %a3 = bitcast <24 x i32> %a to <48 x i16> + br label %end + +end: + %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x i16> %phi +} + define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v48i16_to_v24i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v58 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v56 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v45 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v44 -; GCN-NEXT: v_and_b32_e32 
v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v43 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v41 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v61 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v57 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v24 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr57 -; 
GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 
3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: 
v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v0, v59, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_or_b32_e32 v4, v47, v4 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: v_or_b32_e32 v6, v45, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v43, v8 -; GCN-NEXT: v_or_b32_e32 v9, v41, v9 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v24, v16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v17 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v24, v18 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v24, v19 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v24, v20 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v24, v21 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v24, v22 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 
s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48i16_to_v24i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], 
s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: 
v_lshlrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed 
$vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v45 +; SI-NEXT: v_or_b32_e32 v5, v5, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: 
s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v42 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_or_b32_e32 v16, v16, v63 +; SI-NEXT: v_or_b32_e32 v17, v17, v62 +; SI-NEXT: v_or_b32_e32 v18, v18, v61 +; SI-NEXT: v_or_b32_e32 v19, v19, v60 +; SI-NEXT: v_or_b32_e32 v20, v20, v59 +; SI-NEXT: v_or_b32_e32 v21, v21, v58 +; SI-NEXT: v_or_b32_e32 v22, v22, v57 +; SI-NEXT: v_or_b32_e32 v23, v23, v56 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: 
v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v44, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 
+; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: v_or_b32_e32 v12, v41, v12 +; SI-NEXT: v_or_b32_e32 v13, v33, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v16, v63, v16 +; SI-NEXT: v_or_b32_e32 v17, v62, v17 +; SI-NEXT: v_or_b32_e32 v18, v61, v18 +; SI-NEXT: v_or_b32_e32 v19, v60, v19 +; SI-NEXT: v_or_b32_e32 v20, v59, v20 +; SI-NEXT: v_or_b32_e32 v21, v58, v21 +; SI-NEXT: v_or_b32_e32 v22, v57, v22 +; SI-NEXT: v_or_b32_e32 v23, v56, v23 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 
offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v24i32: ; VI: ; %bb.0: @@ -2154,7 +4632,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 @@ -2229,9 +4707,9 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v47 @@ -2306,7 +4784,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v24, 3, v32 ; VI-NEXT: v_add_u16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2405,7 +4883,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -2513,9 +4991,9 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; 
%Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -2597,7 +5075,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2626,7 +5104,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2652,7 +5130,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2712,7 +5190,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; 
GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2738,7 +5216,7 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2758,543 +5236,1540 @@ end: ret <24 x i32> %phi } -define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v24i32_to_v48f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded 
Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; 
implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 
-; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_mov_b32_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v49 -; GCN-NEXT: v_mov_b32_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; 
implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GCN-NEXT: 
v_lshrrev_b32_e32 v26, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v14 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v61 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v59 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v57 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v47 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v46 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v44 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v42 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v40 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v55 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v53 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: 
v_add_i32_e32 v15, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v41 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v25, v27, v25 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: 
v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_or_b32_e32 v37, v39, v38 -; GCN-NEXT: v_or_b32_e32 v38, v49, v48 -; GCN-NEXT: v_or_b32_e32 v39, v51, v50 -; GCN-NEXT: v_or_b32_e32 v48, v54, v53 -; GCN-NEXT: v_or_b32_e32 v49, v55, v52 -; GCN-NEXT: v_or_b32_e32 v50, v41, v40 -; GCN-NEXT: v_or_b32_e32 v51, v43, v42 -; GCN-NEXT: v_or_b32_e32 v52, v45, v44 -; GCN-NEXT: v_or_b32_e32 v53, v47, v46 -; GCN-NEXT: v_or_b32_e32 v54, v57, v56 -; GCN-NEXT: v_or_b32_e32 v55, v59, v58 -; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v21, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v49, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48i16_to_v24i32_scalar: +; SI: ; %bb.0: +; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; 
SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff 
+; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v58 +; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: 
v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: s_or_b32 s4, s5, s4 
+; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v63, 
off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: v_mov_b32_e32 v43, v34 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v25 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; 
SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v36, v24 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v25, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v41, v42 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v24, v36 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v24i32_to_v48f16: +; VI-LABEL: 
bitcast_v48i16_to_v24i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; 
VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, 
v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: 
s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: 
v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v48i16_to_v24i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: 
v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 
16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 
s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v24i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 
:: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: 
v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <48 x i16> %a, splat (i16 3) + %a2 = bitcast <48 x i16> %a1 to <24 x i32> + br label %end + +cmp.false: + %a3 = bitcast <48 x i16> %a to <24 x i32> + br label %end + +end: + %phi = phi <24 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i32> %phi +} + +define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v24i32_to_v48f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 
+; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 +; SI-NEXT: 
v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 
+; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; 
SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, 
v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; 
SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v24i32_to_v48f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: 
; implicit-def: $vgpr39 @@ -3315,7 +6790,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -3341,9 +6816,9 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 +; VI-NEXT: s_cbranch_execz .LBB16_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 @@ -3393,7 +6868,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB8_4: ; %end +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -3475,7 +6950,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -3501,9 +6976,9 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow 
+; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 @@ -3553,7 +7028,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -3590,7 +7065,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 @@ -3616,7 +7091,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3651,7 +7126,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -3677,9 +7152,9 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 @@ -3729,7 +7204,7 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -3774,538 +7249,1700 @@ end: ret <48 x half> %phi } +define inreg <48 x half> @bitcast_v24i32_to_v48f16_scalar(<24 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24i32_to_v48f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s7, v8 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill 
+; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; 
SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s18, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s22, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s24, 16 +; SI-NEXT: 
s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_lshr_b32 s56, s26, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s58, s28, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s60, s15, 16 +; SI-NEXT: s_lshr_b32 s61, s14, 16 +; SI-NEXT: s_lshr_b32 s62, s13, 16 +; SI-NEXT: s_lshr_b32 s63, s12, 16 +; SI-NEXT: s_lshr_b32 s72, s11, 16 +; SI-NEXT: s_lshr_b32 s73, s10, 16 +; SI-NEXT: s_lshr_b32 s74, s8, 16 +; SI-NEXT: s_lshr_b32 s75, s7, 16 +; SI-NEXT: s_lshr_b32 s76, s6, 16 +; SI-NEXT: s_lshr_b32 s77, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s56 +; 
SI-NEXT: v_cvt_f32_f16_e32 v29, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 
offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: 
; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v24i32_to_v48f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v0 +; VI-NEXT: v_readfirstlane_b32 s14, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: v_readfirstlane_b32 s12, v3 +; VI-NEXT: v_readfirstlane_b32 s11, v4 +; VI-NEXT: v_readfirstlane_b32 s10, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s8, v7 +; VI-NEXT: v_readfirstlane_b32 s6, v8 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v9 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s40, 
s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; 
VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s79, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s78, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s77, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s76, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s75, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s74, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s73, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s72, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s63, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s62, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s61, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s60, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s59, 16 +; 
VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s58, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s28, s57, 16 +; VI-NEXT: s_or_b32 s15, s15, s28 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s28, s56, 16 +; VI-NEXT: s_or_b32 s14, s14, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s47, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_or_b32 s7, s7, s28 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s15 +; VI-NEXT: v_mov_b32_e32 v15, s14 +; VI-NEXT: v_mov_b32_e32 v16, s13 +; VI-NEXT: v_mov_b32_e32 v17, s12 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_mov_b32_e32 v19, s10 +; VI-NEXT: v_mov_b32_e32 v20, s9 +; VI-NEXT: v_mov_b32_e32 v21, s8 +; VI-NEXT: 
v_mov_b32_e32 v22, s6 +; VI-NEXT: v_mov_b32_e32 v23, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v24i32_to_v48f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, 
s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; 
GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s40 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: 
v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-TRUE16-LABEL: bitcast_v24i32_to_v48f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0 
+; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; 
GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v24i32_to_v48f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-FAKE16-NEXT: s_mov_b32 s74, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 
s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; 
GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-FAKE16-NEXT: 
s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <24 x i32> %a, splat (i32 3) + %a2 = bitcast <24 x i32> %a1 to <48 x half> + br label %end + +cmp.false: + %a3 = bitcast <24 x i32> %a to <48 x half> + br label %end + +end: + %phi = phi <48 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x half> %phi +} + define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v48f16_to_v24i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword 
v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, 
v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v52, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: v_or_b32_e32 v2, v50, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v38, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v36, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v34, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword 
v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: v_or_b32_e32 v16, v61, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: v_or_b32_e32 v21, v43, v21 -; GCN-NEXT: v_or_b32_e32 v22, v41, v22 -; GCN-NEXT: v_or_b32_e32 v23, v40, v23 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; 
GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v50 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 
v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v48 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v36 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v34 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, 
s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 
v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; 
GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_or_b32_e32 v11, v13, v12 -; GCN-NEXT: v_or_b32_e32 v12, v15, v14 -; GCN-NEXT: v_or_b32_e32 v13, v17, v16 -; GCN-NEXT: v_or_b32_e32 v14, v19, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_or_b32_e32 v16, v23, v22 -; GCN-NEXT: v_or_b32_e32 v17, v25, v24 -; GCN-NEXT: v_or_b32_e32 v18, v27, v26 -; GCN-NEXT: v_or_b32_e32 v19, v29, v28 -; GCN-NEXT: v_or_b32_e32 v20, v31, v30 -; GCN-NEXT: v_or_b32_e32 v21, v33, v32 -; GCN-NEXT: v_or_b32_e32 v22, v35, v34 -; GCN-NEXT: v_or_b32_e32 v23, v37, v36 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48f16_to_v24i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: 
v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: 
killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v21, v44, v21 +; SI-NEXT: v_or_b32_e32 v22, v42, v22 +; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; 
SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; 
SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v46, v20 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; 
SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: 
v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: 
v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48f16_to_v24i32: ; VI: ; %bb.0: @@ -4346,7 +8983,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -4421,9 +9058,9 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v47, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4498,7 +9135,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -4597,7 +9234,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -4705,9 +9342,9 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB18_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -4790,7 +9427,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: .LBB18_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -4819,7 +9456,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: 
s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -4845,7 +9482,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4905,7 +9542,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -4931,7 +9568,7 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -4951,43 +9588,1169 @@ end: ret <24 x i32> %phi } +define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48f16_to_v24i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: 
v_cvt_f16_f32_e32 v48, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v34 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v55, v7 +; SI-NEXT: 
v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v37, v12 +; SI-NEXT: v_or_b32_e32 v13, v61, v13 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v15, v59, v15 +; SI-NEXT: v_or_b32_e32 v16, v57, v16 +; SI-NEXT: v_or_b32_e32 v17, v47, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v25, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v28, v21 +; SI-NEXT: v_or_b32_e32 v22, v26, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: 
v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v28 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 
v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 
+; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; 
SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 
offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v59 +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v55, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v53, v43 +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v25 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v27, v43 +; SI-NEXT: v_mov_b32_e32 v25, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v52 +; SI-NEXT: v_mov_b32_e32 v43, v53 +; SI-NEXT: v_mov_b32_e32 v52, v55 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v28, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: v_mov_b32_e32 v59, v34 +; SI-NEXT: v_mov_b32_e32 v32, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v48f16_to_v24i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 
v49, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, 
s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: 
v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: 
v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v23, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v39, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v38, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v37, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v35, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v34, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v33, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v22, v24, v22 +; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v48f16_to_v24i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: 
v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; 
GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 
v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 
op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v24i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, 
s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, 
s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 
v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <48 x half> %a, splat (half 0xH0200) + %a2 = bitcast <48 x half> %a1 to <24 x i32> + br label %end + +cmp.false: + %a3 = bitcast <48 x half> %a to <24 x i32> + br label %end + +end: + %phi = phi <24 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x i32> %phi +} + define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f32_to_v12i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; 
GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f32_to_v12i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v12i64: ; VI: ; %bb.0: @@ -4996,7 +10759,7 @@ define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) { ; 
VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -5022,7 +10785,7 @@ define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5033,7 +10796,7 @@ define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -5059,11 +10822,410 @@ define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v24f32_to_v12i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; 
GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <12 x i64> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <12 x i64> + br label %end + +end: + %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i64> %phi +} + +define inreg <12 x i64> @bitcast_v24f32_to_v12i64_scalar(<24 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f32_to_v12i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; 
SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v24f32_to_v12i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 
v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v24f32_to_v12i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; 
GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB21_3: ; 
%end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v24f32_to_v12i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: 
v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <12 x i64> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <12 x i64> + br label %end + +end: + %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i64> %phi +} + +define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v12i64_to_v24f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 
3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i64_to_v24f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
bitcast_v12i64_to_v24f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v24f32_to_v12i64: +; GFX11-LABEL: bitcast_v12i64_to_v24f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -5071,87 +11233,154 @@ define <12 x i64> @bitcast_v24f32_to_v12i64(<24 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; 
GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <24 x float> %a1 to <12 x i64> + %a1 = add <12 x i64> %a, splat (i64 3) + %a2 = bitcast <12 x i64> %a1 to <24 x float> br label %end cmp.false: - %a3 = bitcast <24 x float> %a to <12 x i64> + %a3 = bitcast <12 x i64> %a to <24 x float> br label %end end: - %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <12 x i64> %phi + %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x float> %phi } -define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i64_to_v24f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <24 x float> @bitcast_v12i64_to_v24f32_scalar(<12 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i64_to_v24f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; 
SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 ; -; VI-LABEL: bitcast_v12i64_to_v24f32: +; VI-LABEL: bitcast_v12i64_to_v24f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; 
VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 ; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 @@ -5176,19 +11405,44 @@ define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB23_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 ; -; GFX9-LABEL: bitcast_v12i64_to_v24f32: +; GFX9-LABEL: bitcast_v12i64_to_v24f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; 
GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc ; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 @@ -5213,20 +11467,39 @@ define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB23_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 ; -; GFX11-LABEL: bitcast_v12i64_to_v24f32: +; GFX11-LABEL: bitcast_v12i64_to_v24f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB23_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: .LBB23_4: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo @@ -5257,8 +11530,6 @@ define <24 x float> @bitcast_v12i64_to_v24f32(<12 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: 
v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5278,42 +11549,42 @@ end: } define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f32_to_v12f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f32_to_v12f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; 
SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v12f64: ; VI: ; %bb.0: @@ -5322,7 +11593,7 @@ define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -5348,7 +11619,7 @@ define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5359,7 +11630,7 @@ define <12 x 
double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -5385,7 +11656,7 @@ define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5397,7 +11668,7 @@ define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 ; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 @@ -5411,61 +11682,473 @@ define <12 x double> @bitcast_v24f32_to_v12f64(<24 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end +; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <12 x double> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <12 x double> + br label %end + +end: + 
%phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x double> %phi +} + +define inreg <12 x double> @bitcast_v24f32_to_v12f64_scalar(<24 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f32_to_v12f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: 
v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v24f32_to_v12f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 
1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v24f32_to_v12f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: 
v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v24f32_to_v12f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 
+; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <12 x double> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <12 x double> + br label %end + +end: + %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x double> %phi +} + +define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v12f64_to_v24f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; 
SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f64_to_v24f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f64_to_v24f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; 
GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f64_to_v24f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <24 x float> %a, splat (float 
1.000000e+00) - %a2 = bitcast <24 x float> %a1 to <12 x double> + %a1 = fadd <12 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <12 x double> %a1 to <24 x float> br label %end cmp.false: - %a3 = bitcast <24 x float> %a to <12 x double> + %a3 = bitcast <12 x double> %a to <24 x float> br label %end end: - %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <12 x double> %phi + %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x float> %phi } -define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f64_to_v24f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f64_to_v24f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: 
v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 ; -; VI-LABEL: bitcast_v12f64_to_v24f32: +; VI-LABEL: bitcast_v12f64_to_v24f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 
v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -5478,19 +12161,44 @@ define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB27_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 ; -; GFX9-LABEL: bitcast_v12f64_to_v24f32: +; GFX9-LABEL: bitcast_v12f64_to_v24f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: 
v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -5503,20 +12211,39 @@ define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB27_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 ; -; GFX11-LABEL: bitcast_v12f64_to_v24f32: +; GFX11-LABEL: bitcast_v12f64_to_v24f32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 -; 
GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB27_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: .LBB27_4: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 @@ -5529,8 +12256,6 @@ define <24 x float> @bitcast_v12f64_to_v24f32(<12 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -5550,268 +12275,264 @@ end: } define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { -; GCN-LABEL: 
bitcast_v24f32_to_v48i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; 
implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v30, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v51, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; 
GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v30, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v51, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: 
v_add_i32_e32 v43, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v53, v41, v53 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v3, v3, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v4, v4, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 60, v0 -; 
GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v5, v5, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v6, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v8, v8, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v9, v9, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v10, v10, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v11, v11, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v12, v12, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_or_b32_e32 v14, v14, v39 -; GCN-NEXT: v_or_b32_e32 v15, v15, v30 -; GCN-NEXT: v_or_b32_e32 v16, v16, v37 -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: v_or_b32_e32 v18, v18, v35 -; GCN-NEXT: v_or_b32_e32 v19, v19, v27 -; GCN-NEXT: v_or_b32_e32 v20, v20, v33 -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v23, v23, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v29 -; GCN-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f32_to_v48i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v36, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 
16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; 
SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v36, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v48i16: ; VI: ; %bb.0: @@ -5843,7 +12564,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -5869,9 +12590,9 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB28_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -5921,7 +12642,7 @@ define <48 x i16> 
@bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -6003,7 +12724,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -6029,9 +12750,9 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -6081,7 +12802,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -6118,7 +12839,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz 
.LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 @@ -6132,7 +12853,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6167,7 +12888,7 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -6193,9 +12914,9 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 @@ -6233,7 +12954,7 @@ define <48 x i16> 
@bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -6278,419 +12999,1388 @@ end: ret <48 x i16> %phi } +define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f32_to_v48i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v26, s16 +; SI-NEXT: v_mov_b32_e32 v24, s17 +; SI-NEXT: v_mov_b32_e32 v23, s18 +; SI-NEXT: v_mov_b32_e32 v22, s19 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v18, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v16, s25 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v11, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v25, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v34, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v36, v16, v17, 16 +; SI-NEXT: v_alignbit_b32 v38, v15, v18, 16 +; SI-NEXT: v_alignbit_b32 v48, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v51, v22, v23, 16 +; SI-NEXT: v_alignbit_b32 v53, v24, v26, 16 +; 
SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v25, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v31, v11, v12, 16 +; SI-NEXT: v_alignbit_b32 v34, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v36, v16, v17, 16 +; SI-NEXT: v_alignbit_b32 v38, v15, v18, 16 +; SI-NEXT: v_alignbit_b32 v48, v19, v20, 16 +; 
SI-NEXT: v_alignbit_b32 v51, v22, v23, 16 +; SI-NEXT: v_alignbit_b32 v53, v24, v26, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_or_b32_e32 v26, v26, v53 +; SI-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v24, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v54 +; SI-NEXT: 
v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 
v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v24f32_to_v48i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: 
v_mov_b32_e32 v21, s18 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v20, s26 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; 
VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v21, v22 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v28, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; VI-NEXT: v_or_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v29, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v30 +; VI-NEXT: v_mov_b32_e32 v1, v31 +; VI-NEXT: v_mov_b32_e32 v2, v32 +; VI-NEXT: v_mov_b32_e32 v3, v33 +; VI-NEXT: v_mov_b32_e32 v4, v24 +; VI-NEXT: v_mov_b32_e32 v5, v25 +; VI-NEXT: v_mov_b32_e32 v6, v26 +; VI-NEXT: v_mov_b32_e32 v7, v27 +; VI-NEXT: v_mov_b32_e32 v8, v28 +; VI-NEXT: v_mov_b32_e32 v9, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; 
implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v24f32_to_v48i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v21, s18 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_mov_b32_e32 v10, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v20, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, 
v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v55, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v54, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v53, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v52, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-NEXT: v_mov_b32_e32 v8, v28 +; GFX9-NEXT: v_mov_b32_e32 v9, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; 
implicit-def: $vgpr34 +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-TRUE16-LABEL: bitcast_v24f32_to_v48i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v16, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: .LBB29_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v51, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v49, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v39, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v36, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v54, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v52, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v48, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 +; +; GFX11-FAKE16-LABEL: bitcast_v24f32_to_v48i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v14, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, 
v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: .LBB29_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, 
v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v51, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v49, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v48, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v53, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v17 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <48 x i16> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <48 x i16> + br label %end + +end: + %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + 
ret <48 x i16> %phi +} + define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v48i16_to_v24f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, 
v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v58 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v56 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v45 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v44 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v43 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v41 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v61 -; GCN-NEXT: v_and_b32_e32 v19, 
0xffff, v57 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v24 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: 
killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v17, 
vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v0, v59, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_or_b32_e32 v4, v47, v4 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: v_or_b32_e32 v6, v45, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v43, v8 -; GCN-NEXT: v_or_b32_e32 v9, v41, v9 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v24, v16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v17 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v24, v18 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v24, v19 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v24, v20 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v24, v21 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v24, v22 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48i16_to_v24f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, 
v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, 
s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, 
v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v45 +; SI-NEXT: v_or_b32_e32 v5, v5, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v42 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_or_b32_e32 v16, v16, v63 +; SI-NEXT: v_or_b32_e32 v17, v17, v62 +; SI-NEXT: v_or_b32_e32 v18, v18, v61 +; SI-NEXT: v_or_b32_e32 v19, v19, v60 +; SI-NEXT: v_or_b32_e32 v20, v20, v59 +; SI-NEXT: v_or_b32_e32 v21, v21, v58 +; SI-NEXT: v_or_b32_e32 v22, v22, v57 +; SI-NEXT: v_or_b32_e32 v23, v23, v56 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v44, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: 
v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: v_or_b32_e32 v12, v41, v12 +; 
SI-NEXT: v_or_b32_e32 v13, v33, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v16, v63, v16 +; SI-NEXT: v_or_b32_e32 v17, v62, v17 +; SI-NEXT: v_or_b32_e32 v18, v61, v18 +; SI-NEXT: v_or_b32_e32 v19, v60, v19 +; SI-NEXT: v_or_b32_e32 v20, v59, v20 +; SI-NEXT: v_or_b32_e32 v21, v58, v21 +; SI-NEXT: v_or_b32_e32 v22, v57, v22 +; SI-NEXT: v_or_b32_e32 v23, v56, v23 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, 
off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v24f32: ; VI: ; %bb.0: @@ -6731,7 +14421,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -6806,9 +14496,9 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v47 @@ -6883,7 +14573,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v24, 3, v32 ; VI-NEXT: 
v_add_u16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB15_4: ; %end +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -6982,7 +14672,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -7090,9 +14780,9 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: .LBB30_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -7174,7 +14864,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end +; 
GFX9-NEXT: .LBB30_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -7203,7 +14893,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -7229,7 +14919,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7289,7 +14979,7 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -7315,9 +15005,1028 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 
op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <48 x i16> %a, splat (i16 3) + %a2 = bitcast <48 x i16> %a1 to <24 x float> + br label %end + +cmp.false: + %a3 = bitcast <48 x i16> %a to <24 x float> + br label %end + +end: + %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x float> %phi +} + +define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48i16_to_v24f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: 
v_or_b32_e32 v9, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v58 +; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; 
SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; 
SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; 
SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: v_mov_b32_e32 v43, v34 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v25 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v36, v24 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v25, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: 
v_mov_b32_e32 v31, v57 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v41, v42 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v24, v36 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v48i16_to_v24f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, 
s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 
v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; 
VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: 
v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v48i16_to_v24f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: 
v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB31_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v24f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 
s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 
16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: 
v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB31_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -7336,531 +16045,509 @@ end: } define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v24f32_to_v48f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword 
v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; 
GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 
v62, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, 
v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_mov_b32_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v49 -; GCN-NEXT: v_mov_b32_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; 
implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: 
v_lshrrev_b32_e32 v32, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill 
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v61 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v59 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v57 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v47 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v46 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v44 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v42 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v40 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v55 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v53 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: 
v_add_i32_e32 v13, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v41 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 
-; GCN-NEXT: v_or_b32_e32 v25, v27, v25 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_or_b32_e32 v37, v39, v38 -; GCN-NEXT: v_or_b32_e32 v38, v49, v48 -; GCN-NEXT: v_or_b32_e32 v39, v51, v50 -; GCN-NEXT: v_or_b32_e32 v48, v54, v53 -; GCN-NEXT: v_or_b32_e32 v49, v55, v52 -; GCN-NEXT: v_or_b32_e32 v50, v41, v40 -; GCN-NEXT: v_or_b32_e32 v51, v43, v42 -; GCN-NEXT: v_or_b32_e32 v52, v45, v44 -; GCN-NEXT: v_or_b32_e32 v53, v47, v46 -; GCN-NEXT: v_or_b32_e32 v54, v57, v56 -; GCN-NEXT: v_or_b32_e32 v55, v59, v58 -; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v15, s[0:3], 0 
offen -; GCN-NEXT: buffer_store_dword v38, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; 
GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v24f32_to_v48f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; 
SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 +; SI-NEXT: 
v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 +; SI-NEXT: v_lshrrev_b32_e32 
v25, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; 
SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 
+; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: 
v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
+; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, 
s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v24f32_to_v48f16: ; VI: ; %bb.0: @@ -7892,7 +16579,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -7918,9 +16605,9 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB32_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -7970,7 +16657,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -8052,7 +16739,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz 
.LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -8078,9 +16765,9 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow +; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cbranch_execz .LBB32_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 ; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 @@ -8130,7 +16817,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end +; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -8167,7 +16854,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 @@ -8181,7 +16868,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; 
GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8216,7 +16903,7 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -8242,9 +16929,9 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 @@ -8258,58 +16945,1201 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 -; GFX11-FAKE16-NEXT: 
v_perm_b32 v15, v32, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <24 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <24 x float> %a1 to <48 x half> + br label %end + +cmp.false: + %a3 = bitcast <24 x float> %a to <48 x half> + br label %end + +end: + %phi = phi <48 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x half> %phi +} + +define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v24f32_to_v48f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s7, v8 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 
16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: 
v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, 
v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; 
SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 
v49, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; 
SI-NEXT: v_add_i32_e32 v25, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 
offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; 
SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v24f32_to_v48f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v21, s18 +; VI-NEXT: v_mov_b32_e32 v19, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v15, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v10, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v20, s26 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_mov_b32_e32 v16, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, 
v21 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; 
VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v28, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; VI-NEXT: v_or_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v29, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v55 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v30 +; VI-NEXT: v_mov_b32_e32 v1, v31 +; VI-NEXT: v_mov_b32_e32 v2, v32 +; VI-NEXT: v_mov_b32_e32 v3, v33 +; VI-NEXT: v_mov_b32_e32 v4, v24 +; VI-NEXT: v_mov_b32_e32 v5, v25 +; VI-NEXT: v_mov_b32_e32 v6, v26 +; VI-NEXT: v_mov_b32_e32 v7, 
v27 +; VI-NEXT: v_mov_b32_e32 v8, v28 +; VI-NEXT: v_mov_b32_e32 v9, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v24f32_to_v48f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v21, s18 +; GFX9-NEXT: v_mov_b32_e32 v19, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v15, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_mov_b32_e32 v10, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v20, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: v_mov_b32_e32 v16, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; 
GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v55, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v54, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v53, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v52, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-NEXT: v_mov_b32_e32 v8, v28 +; GFX9-NEXT: v_mov_b32_e32 v9, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; 
implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-TRUE16-LABEL: bitcast_v24f32_to_v48f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v13, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v16, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v14, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: 
v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: .LBB33_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v51, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v49, 16, v18 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v39, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v36, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v54, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v52, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v48, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v24 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; +; GFX11-FAKE16-LABEL: bitcast_v24f32_to_v48f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v23, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v21, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v19, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v7, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 
v13, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v9, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v14, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: 
v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: .LBB33_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v51, 16, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v49, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v48, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v54, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v53, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -8328,537 +18158,576 @@ end: } define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v48f16_to_v24f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; GCN-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, 
v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v52, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: v_or_b32_e32 v2, v50, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; GCN-NEXT: 
v_or_b32_e32 v4, v38, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v36, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v34, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; 
GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: v_or_b32_e32 v16, v61, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: v_or_b32_e32 v21, v43, v21 -; GCN-NEXT: v_or_b32_e32 v22, v41, v22 -; GCN-NEXT: v_or_b32_e32 v23, v40, v23 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; 
GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; 
implicit-def: $vgpr40 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v50 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v48 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v36 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, 
v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v34 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: 
v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v22, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_or_b32_e32 v11, v13, v12 -; GCN-NEXT: v_or_b32_e32 v12, v15, v14 -; GCN-NEXT: v_or_b32_e32 v13, v17, v16 -; GCN-NEXT: v_or_b32_e32 v14, v19, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_or_b32_e32 v16, v23, 
v22 -; GCN-NEXT: v_or_b32_e32 v17, v25, v24 -; GCN-NEXT: v_or_b32_e32 v18, v27, v26 -; GCN-NEXT: v_or_b32_e32 v19, v29, v28 -; GCN-NEXT: v_or_b32_e32 v20, v31, v30 -; GCN-NEXT: v_or_b32_e32 v21, v33, v32 -; GCN-NEXT: v_or_b32_e32 v22, v35, v34 -; GCN-NEXT: v_or_b32_e32 v23, v37, v36 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48f16_to_v24f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: 
s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v63 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr24 +; 
SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 +; SI-NEXT: 
v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v21, v44, v21 +; SI-NEXT: v_or_b32_e32 v22, v42, v22 +; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 
offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: 
v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v46, v20 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; 
SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: 
v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 
+; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 
0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload 
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], 
s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48f16_to_v24f32: ; VI: ; %bb.0: @@ -8899,7 +18768,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -8974,9 +18843,9 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow +; VI-NEXT: .LBB34_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 +; VI-NEXT: s_cbranch_execz .LBB34_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v47, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -9051,7 +18920,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -9150,7 +19019,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -9258,9 +19127,9 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; 
GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -9343,7 +19212,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -9372,7 +19241,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -9398,7 +19267,7 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9458,7 +19327,7 @@ 
define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -9484,73 +19353,1420 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <48 x half> %a, splat (half 0xH0200) - %a2 = bitcast <48 x half> %a1 to <24 x float> + %a1 = fadd <48 x half> %a, splat (half 0xH0200) + %a2 = bitcast <48 x half> %a1 to <24 x float> + br label %end + +cmp.false: + %a3 = bitcast <48 x half> %a to <24 x float> + br label %end + +end: + %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x float> %phi +} + +define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48f16_to_v24f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 
offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 
v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v34 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 +; SI-NEXT: 
v_cvt_f16_f32_e32 v35, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v55, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v37, v12 +; SI-NEXT: v_or_b32_e32 v13, v61, v13 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v15, v59, v15 +; SI-NEXT: v_or_b32_e32 v16, v57, v16 
+; SI-NEXT: v_or_b32_e32 v17, v47, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v25, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v28, v21 +; SI-NEXT: v_or_b32_e32 v22, v26, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: 
v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v28 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, 
v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: 
v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, 
v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, 
s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v59 +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v55, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v53, v43 +; SI-NEXT: 
v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v25 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v27, v43 +; SI-NEXT: v_mov_b32_e32 v25, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v52 +; SI-NEXT: v_mov_b32_e32 v43, v53 +; SI-NEXT: v_mov_b32_e32 v52, v55 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v28, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; 
SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: v_mov_b32_e32 v59, v34 +; SI-NEXT: v_mov_b32_e32 v32, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v48f16_to_v24f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, 
s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, 
s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; 
VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 
s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v23, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v39, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v38, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v37, v23 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v35, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v34, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v33, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v22, v24, v22 +; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v48f16_to_v24f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, 
s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 
+; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; 
GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], 
s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; 
GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, 
s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 
0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: 
bitcast_v48f16_to_v24f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: 
s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <48 x half> %a, splat (half 0xH0200) + %a2 = bitcast <48 x half> %a1 to <24 x float> + br label %end + +cmp.false: + %a3 = bitcast <48 x half> %a to <24 x float> + br label %end + +end: + %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <24 x float> %phi +} + +define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v12i64_to_v12f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 
3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i64_to_v12f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i64_to_v12f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i64_to_v12f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i64> %a, splat (i64 3) + %a2 = bitcast <12 x i64> %a1 to <12 x double> br 
label %end cmp.false: - %a3 = bitcast <48 x half> %a to <24 x float> + %a3 = bitcast <12 x i64> %a to <12 x double> br label %end end: - %phi = phi <24 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <24 x float> %phi + %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x double> %phi } -define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i64_to_v12f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <12 x double> 
@bitcast_v12i64_to_v12f64_scalar(<12 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i64_to_v12f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: 
v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 ; -; VI-LABEL: bitcast_v12i64_to_v12f64: +; VI-LABEL: bitcast_v12i64_to_v12f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 @@ 
-9575,19 +20791,44 @@ define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 ; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: .LBB18_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB37_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 ; -; GFX9-LABEL: bitcast_v12i64_to_v12f64: +; GFX9-LABEL: bitcast_v12i64_to_v12f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 @@ -9612,20 +20853,39 @@ define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: .LBB18_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB37_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 ; -; GFX11-LABEL: bitcast_v12i64_to_v12f64: +; GFX11-LABEL: bitcast_v12i64_to_v12f64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 
+; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB37_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: .LBB37_4: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -9656,8 +20916,6 @@ define <12 x double> @bitcast_v12i64_to_v12f64(<12 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo ; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -9677,30 +20935,30 @@ end: } define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f64_to_v12i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] 
+; SI-LABEL: bitcast_v12f64_to_v12i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f64_to_v12i64: ; VI: ; %bb.0: @@ -9709,7 +20967,7 @@ define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -9723,7 +20981,7 @@ define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: .LBB38_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9734,7 +20992,7 @@ define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 
; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -9748,7 +21006,7 @@ define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: .LBB38_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -9760,7 +21018,7 @@ define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 @@ -9774,7 +21032,7 @@ define <12 x i64> @bitcast_v12f64_to_v12i64(<12 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -9794,269 +21052,474 @@ end: ret <12 x i64> %phi } +define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f64_to_v12i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 
v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v12f64_to_v12i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: 
v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v12f64_to_v12i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 
v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v12f64_to_v12i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; 
GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <12 x double> %a1 to <12 x i64> + br label %end + +cmp.false: + %a3 = bitcast <12 x double> %a to <12 x i64> + br label %end + +end: + %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i64> %phi +} + define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i64_to_v48i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, 
off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 
-; GCN-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, 
v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: s_waitcnt 
expcnt(6) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v53, v41, v53 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v3, v3, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v4, v4, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v5, v5, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: 
v_or_b32_e32 v6, v6, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v8, v8, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v9, v9, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v11, v11, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v12, v12, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_or_b32_e32 v13, v13, v32 -; GCN-NEXT: v_or_b32_e32 v14, v14, v39 -; GCN-NEXT: v_or_b32_e32 v15, v15, v29 -; GCN-NEXT: v_or_b32_e32 v16, v16, v37 -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: v_or_b32_e32 v18, v18, v35 -; GCN-NEXT: v_or_b32_e32 v19, v19, v27 -; GCN-NEXT: v_or_b32_e32 v20, v20, v33 -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: v_or_b32_e32 v22, v22, v31 -; GCN-NEXT: v_or_b32_e32 v23, v23, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v30 -; GCN-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v46, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v8, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i64_to_v48i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v35, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: 
v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v35, v10, v9, 16 +; 
SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i64_to_v48i16: ; VI: ; %bb.0: @@ -10088,7 +21551,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -10114,9 +21577,9 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow +; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 +; VI-NEXT: s_cbranch_execz .LBB40_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 ; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc @@ -10166,7 +21629,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 
v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -10248,7 +21711,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -10274,9 +21737,9 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow +; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 +; GFX9-NEXT: s_cbranch_execz .LBB40_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc @@ -10326,7 +21789,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -10363,7 +21826,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; 
GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10395,7 +21858,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10430,7 +21893,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -10456,9 +21919,9 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10514,7 +21977,7 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; 
GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -10559,419 +22022,1413 @@ end: ret <48 x i16> %phi } +define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i64_to_v48i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v5 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_readfirstlane_b32 s9, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v9 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s27, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s25, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s23, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s21, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s19, v11, 16 +; SI-NEXT: v_alignbit_b32 
v12, s17, v12, 16 +; SI-NEXT: s_lshr_b32 s40, s6, 16 +; SI-NEXT: s_lshr_b32 s41, s8, 16 +; SI-NEXT: s_lshr_b32 s42, s10, 16 +; SI-NEXT: s_lshr_b32 s43, s12, 16 +; SI-NEXT: s_lshr_b32 s44, s14, 16 +; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_lshr_b32 s56, s23, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_lshr_b32 s58, s19, 16 +; SI-NEXT: s_lshr_b32 s59, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s15, s15, 3 +; SI-NEXT: s_addc_u32 s14, s14, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s28 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s22 +; SI-NEXT: v_mov_b32_e32 v10, s20 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s29, 
v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s27, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s25, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s23, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s21, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s19, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s17, v12, 16 +; SI-NEXT: s_lshr_b32 s40, s6, 16 +; SI-NEXT: s_lshr_b32 s41, s8, 16 +; SI-NEXT: s_lshr_b32 s42, s10, 16 +; SI-NEXT: s_lshr_b32 s43, s12, 16 +; SI-NEXT: s_lshr_b32 s44, s14, 16 +; SI-NEXT: s_lshr_b32 s45, s29, 16 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_lshr_b32 s56, s23, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_lshr_b32 s58, s19, 16 +; SI-NEXT: s_lshr_b32 s59, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: 
v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s41, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v12i64_to_v48i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v0 +; VI-NEXT: v_readfirstlane_b32 s14, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: v_readfirstlane_b32 s12, v3 +; VI-NEXT: v_readfirstlane_b32 s11, v4 +; VI-NEXT: v_readfirstlane_b32 s10, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; VI-NEXT: v_readfirstlane_b32 s8, v7 +; VI-NEXT: v_readfirstlane_b32 s6, v8 +; VI-NEXT: 
s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v9 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; 
VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s79, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s78, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s77, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s76, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s75, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s74, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s73, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s72, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s63, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s62, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s61, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: 
s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s60, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s59, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s58, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s28, s57, 16 +; VI-NEXT: s_or_b32 s15, s15, s28 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s28, s56, 16 +; VI-NEXT: s_or_b32 s14, s14, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s47, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_or_b32 s7, s7, s28 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s15 +; VI-NEXT: v_mov_b32_e32 v15, s14 +; VI-NEXT: v_mov_b32_e32 v16, s13 +; VI-NEXT: 
v_mov_b32_e32 v17, s12 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_mov_b32_e32 v19, s10 +; VI-NEXT: v_mov_b32_e32 v20, s9 +; VI-NEXT: v_mov_b32_e32 v21, s8 +; VI-NEXT: v_mov_b32_e32 v22, s6 +; VI-NEXT: v_mov_b32_e32 v23, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v12i64_to_v48i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; 
GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 
+; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s40 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: 
v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-TRUE16-LABEL: bitcast_v12i64_to_v48i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v2 +; 
GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 
0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; 
GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v12i64_to_v48i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-FAKE16-NEXT: s_mov_b32 s74, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 
s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i64> %a, splat (i64 3) + %a2 = bitcast <12 x i64> %a1 to <48 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x i64> %a to <48 x i16> + br label %end + +end: + %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x i16> %phi +} + define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v48i16_to_v12i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 -; GCN-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v60 -; GCN-NEXT: 
v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v58 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v56 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v45 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v44 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v43 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v41 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v61 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v57 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v24 -; GCN-NEXT: buffer_load_dword 
v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v24 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; 
implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed 
$vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 
0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v0, v59, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; GCN-NEXT: v_or_b32_e32 v4, v47, v4 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: v_or_b32_e32 v6, v45, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v43, v8 -; GCN-NEXT: v_or_b32_e32 v9, v41, v9 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: buffer_load_dword v24, 
off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v24, v16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v17 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v24, v18 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v24, v19 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v24, v20 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v24, v21 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v24, v22 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: 
v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48i16_to_v12i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 
offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded 
Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt 
vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; 
SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v45 +; SI-NEXT: v_or_b32_e32 v5, v5, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; 
implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v42 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_or_b32_e32 v16, v16, v63 +; SI-NEXT: v_or_b32_e32 v17, v17, v62 +; SI-NEXT: v_or_b32_e32 v18, v18, v61 +; SI-NEXT: v_or_b32_e32 v19, v19, v60 +; SI-NEXT: v_or_b32_e32 v20, v20, v59 +; SI-NEXT: v_or_b32_e32 v21, v21, v58 +; SI-NEXT: v_or_b32_e32 v22, v22, v57 +; SI-NEXT: v_or_b32_e32 v23, v23, v56 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; 
implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, 
s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v44, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 
v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: v_or_b32_e32 v12, v41, v12 +; SI-NEXT: v_or_b32_e32 v13, v33, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v16, v63, v16 +; SI-NEXT: v_or_b32_e32 v17, v62, v17 +; SI-NEXT: v_or_b32_e32 v18, v61, v18 +; SI-NEXT: v_or_b32_e32 v19, v60, v19 +; SI-NEXT: v_or_b32_e32 v20, v59, v20 +; SI-NEXT: v_or_b32_e32 v21, v58, v21 +; SI-NEXT: v_or_b32_e32 v22, v57, v22 +; SI-NEXT: v_or_b32_e32 v23, v56, v23 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, 
v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v12i64: ; VI: ; %bb.0: @@ -11012,7 +23469,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x 
i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -11087,9 +23544,9 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: .LBB42_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v47 @@ -11164,7 +23621,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v24, 3, v32 ; VI-NEXT: v_add_u16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -11263,7 +23720,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -11371,9 +23828,9 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: .LBB42_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -11455,7 +23912,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -11484,7 +23941,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -11510,7 +23967,7 @@ define <12 x i64> 
@bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11570,7 +24027,7 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -11596,9 +24053,1028 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <48 x i16> %a, splat (i16 3) + %a2 = bitcast <48 x i16> %a1 to <12 x i64> + br label %end + +cmp.false: + %a3 = bitcast <48 x i16> %a to <12 x i64> + br label %end + +end: + %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i64> %phi +} + +define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48i16_to_v12i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: 
v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; 
SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v58 +; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: 
v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: s_or_b32 s4, s5, s4 
+; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v63, 
off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: v_mov_b32_e32 v43, v34 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v25 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; 
SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v36, v24 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v25, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v41, v42 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v24, v36 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v48i16_to_v12i64_scalar: +; VI: ; 
%bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; 
VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: 
v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; 
VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v48i16_to_v12i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; 
GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 
+; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, 
v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v12i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, 
s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: 
.LBB43_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11617,531 +25093,509 @@ end: } define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i64_to_v48f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, 
s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; 
GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; 
GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_mov_b32_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v49 -; GCN-NEXT: v_mov_b32_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; 
implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 
-; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: s_waitcnt expcnt(0) 
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v61 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v59 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v57 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v47 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v46 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v44 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v42 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v40 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v55 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v53 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: 
v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v41 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v25, v27, v25 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x50, v0 -; 
GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 
16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_or_b32_e32 v37, v39, v38 -; GCN-NEXT: v_or_b32_e32 v38, v49, v48 -; GCN-NEXT: v_or_b32_e32 v39, v51, v50 -; GCN-NEXT: v_or_b32_e32 v48, v54, v53 -; GCN-NEXT: v_or_b32_e32 v49, v55, v52 -; GCN-NEXT: v_or_b32_e32 v50, v41, v40 -; GCN-NEXT: v_or_b32_e32 v51, v43, v42 -; GCN-NEXT: v_or_b32_e32 v52, v45, v44 -; GCN-NEXT: v_or_b32_e32 v53, v47, v46 -; GCN-NEXT: v_or_b32_e32 v54, v57, v56 -; GCN-NEXT: v_or_b32_e32 v55, v59, v58 -; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i64_to_v48f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; 
SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, 
v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, 
off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 
v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: 
v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; 
SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: 
bitcast_v12i64_to_v48f16: ; VI: ; %bb.0: @@ -12173,7 +25627,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -12199,9 +25653,9 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow +; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 +; VI-NEXT: s_cbranch_execz .LBB44_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 ; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc @@ -12251,7 +25705,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB22_4: ; %end +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -12333,7 +25787,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -12359,9 +25813,9 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 
16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB44_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc @@ -12411,7 +25865,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -12448,7 +25902,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -12480,7 +25934,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12515,7 +25969,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: 
s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -12541,9 +25995,9 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -12599,7 +26053,7 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -12644,538 +26098,1700 @@ end: ret <48 x half> %phi } +define inreg <48 x half> @bitcast_v12i64_to_v48f16_scalar(<12 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i64_to_v48f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s14, v1 +; SI-NEXT: v_readfirstlane_b32 s15, v2 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_readfirstlane_b32 s13, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v7 +; 
SI-NEXT: v_readfirstlane_b32 s8, v8 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, 
s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s16, s4, 16 +; SI-NEXT: s_lshr_b32 s17, s5, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s40, s18, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s44, s22, 16 +; SI-NEXT: s_lshr_b32 s45, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s46, s24, 16 +; SI-NEXT: s_lshr_b32 s47, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s56, s26, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; 
SI-NEXT: s_lshr_b32 s58, s28, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s60, s14, 16 +; SI-NEXT: s_lshr_b32 s61, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s62, s12, 16 +; SI-NEXT: s_lshr_b32 s63, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s72, s10, 16 +; SI-NEXT: s_lshr_b32 s73, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s74, s7, 16 +; SI-NEXT: s_lshr_b32 s75, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s76, s6, 16 +; SI-NEXT: s_lshr_b32 s77, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s62 +; 
SI-NEXT: v_cvt_f32_f16_e32 v16, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s16 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; SI-NEXT: 
v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; 
implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v12i64_to_v48f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_readfirstlane_b32 s15, v0 +; VI-NEXT: v_readfirstlane_b32 s14, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: v_readfirstlane_b32 s12, v3 +; VI-NEXT: v_readfirstlane_b32 s11, v4 +; VI-NEXT: v_readfirstlane_b32 s10, v5 +; VI-NEXT: v_readfirstlane_b32 s9, v6 +; 
VI-NEXT: v_readfirstlane_b32 s8, v7 +; VI-NEXT: v_readfirstlane_b32 s6, v8 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v9 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 
s40, s7, 16 +; VI-NEXT: s_lshr_b32 s41, s6, 16 +; VI-NEXT: s_lshr_b32 s42, s8, 16 +; VI-NEXT: s_lshr_b32 s43, s9, 16 +; VI-NEXT: s_lshr_b32 s44, s10, 16 +; VI-NEXT: s_lshr_b32 s45, s11, 16 +; VI-NEXT: s_lshr_b32 s46, s12, 16 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b32 s56, s14, 16 +; VI-NEXT: s_lshr_b32 s57, s15, 16 +; VI-NEXT: s_lshr_b32 s58, s29, 16 +; VI-NEXT: s_lshr_b32 s59, s28, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 16 +; VI-NEXT: s_lshr_b32 s61, s26, 16 +; VI-NEXT: s_lshr_b32 s62, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s24, 16 +; VI-NEXT: s_lshr_b32 s72, s23, 16 +; VI-NEXT: s_lshr_b32 s73, s22, 16 +; VI-NEXT: s_lshr_b32 s74, s21, 16 +; VI-NEXT: s_lshr_b32 s75, s20, 16 +; VI-NEXT: s_lshr_b32 s76, s19, 16 +; VI-NEXT: s_lshr_b32 s77, s18, 16 +; VI-NEXT: s_lshr_b32 s78, s17, 16 +; VI-NEXT: s_lshr_b32 s79, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s79, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s78, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s77, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s76, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s75, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s74, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s73, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s72, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s63, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s62, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; 
VI-NEXT: s_lshl_b32 s25, s61, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s60, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s59, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s58, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s28, s57, 16 +; VI-NEXT: s_or_b32 s15, s15, s28 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s28, s56, 16 +; VI-NEXT: s_or_b32 s14, s14, s28 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s28, s47, 16 +; VI-NEXT: s_or_b32 s13, s13, s28 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s28, s46, 16 +; VI-NEXT: s_or_b32 s12, s12, s28 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s28, s45, 16 +; VI-NEXT: s_or_b32 s11, s11, s28 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s28, s44, 16 +; VI-NEXT: s_or_b32 s10, s10, s28 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s28, s43, 16 +; VI-NEXT: s_or_b32 s9, s9, s28 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s28, s42, 16 +; VI-NEXT: s_or_b32 s8, s8, s28 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s28, s41, 16 +; VI-NEXT: s_or_b32 s6, s6, s28 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s28, s40, 16 +; VI-NEXT: s_or_b32 s7, s7, s28 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s15 +; 
VI-NEXT: v_mov_b32_e32 v15, s14 +; VI-NEXT: v_mov_b32_e32 v16, s13 +; VI-NEXT: v_mov_b32_e32 v17, s12 +; VI-NEXT: v_mov_b32_e32 v18, s11 +; VI-NEXT: v_mov_b32_e32 v19, s10 +; VI-NEXT: v_mov_b32_e32 v20, s9 +; VI-NEXT: v_mov_b32_e32 v21, s8 +; VI-NEXT: v_mov_b32_e32 v22, s6 +; VI-NEXT: v_mov_b32_e32 v23, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v12i64_to_v48f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; 
GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s40, s15, 16 +; GFX9-NEXT: s_lshr_b32 s41, s14, 16 +; GFX9-NEXT: s_lshr_b32 s42, s13, 16 +; GFX9-NEXT: s_lshr_b32 s43, s12, 16 +; GFX9-NEXT: s_lshr_b32 s44, s11, 
16 +; GFX9-NEXT: s_lshr_b32 s45, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 16 +; GFX9-NEXT: s_lshr_b32 s47, s8, 16 +; GFX9-NEXT: s_lshr_b32 s56, s7, 16 +; GFX9-NEXT: s_lshr_b32 s57, s6, 16 +; GFX9-NEXT: s_lshr_b32 s58, s29, 16 +; GFX9-NEXT: s_lshr_b32 s59, s28, 16 +; GFX9-NEXT: s_lshr_b32 s60, s27, 16 +; GFX9-NEXT: s_lshr_b32 s61, s26, 16 +; GFX9-NEXT: s_lshr_b32 s62, s25, 16 +; GFX9-NEXT: s_lshr_b32 s63, s24, 16 +; GFX9-NEXT: s_lshr_b32 s72, s23, 16 +; GFX9-NEXT: s_lshr_b32 s73, s22, 16 +; GFX9-NEXT: s_lshr_b32 s74, s21, 16 +; GFX9-NEXT: s_lshr_b32 s75, s20, 16 +; GFX9-NEXT: s_lshr_b32 s76, s19, 16 +; GFX9-NEXT: s_lshr_b32 s77, s18, 16 +; GFX9-NEXT: s_lshr_b32 s78, s17, 16 +; GFX9-NEXT: s_lshr_b32 s79, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s40 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; 
GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: ; implicit-def: $sgpr41 +; GFX9-NEXT: ; implicit-def: $sgpr40 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-TRUE16-LABEL: bitcast_v12i64_to_v48f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v0 +; 
GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 
s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 
s1, s1, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s5 +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v23, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 +; +; GFX11-FAKE16-LABEL: bitcast_v12i64_to_v48f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-FAKE16-NEXT: s_mov_b32 s74, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s10, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: 
s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s74 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 
s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s58 +; GFX11-FAKE16-NEXT: 
s_pack_ll_b32_b16 s20, s20, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i64> %a, splat (i64 3) + %a2 = bitcast <12 x i64> %a1 to <48 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x i64> %a to <48 x half> + br label %end + +end: + %phi = phi <48 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x half> %phi +} + define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v48f16_to_v12i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; 
GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v43 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v52, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: v_or_b32_e32 v2, v50, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v38, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v36, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v34, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; 
GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: v_or_b32_e32 v16, v61, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: v_or_b32_e32 v21, v43, v21 -; GCN-NEXT: v_or_b32_e32 v22, v41, v22 -; GCN-NEXT: v_or_b32_e32 v23, v40, v23 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; 
GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v50 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: 
v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v48 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v36 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v34 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 
0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_or_b32_e32 v11, v13, v12 -; GCN-NEXT: v_or_b32_e32 v12, v15, v14 -; GCN-NEXT: v_or_b32_e32 v13, v17, v16 -; GCN-NEXT: v_or_b32_e32 v14, v19, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_or_b32_e32 v16, v23, v22 -; GCN-NEXT: v_or_b32_e32 v17, v25, v24 -; GCN-NEXT: v_or_b32_e32 v18, v27, v26 -; GCN-NEXT: v_or_b32_e32 v19, v29, v28 -; GCN-NEXT: v_or_b32_e32 v20, v31, v30 -; GCN-NEXT: v_or_b32_e32 v21, v33, v32 -; GCN-NEXT: v_or_b32_e32 v22, v35, v34 -; GCN-NEXT: v_or_b32_e32 v23, v37, v36 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, 
s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48f16_to_v12i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, 
s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: 
v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 +; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; 
SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v21, v44, v21 +; SI-NEXT: v_or_b32_e32 v22, v42, v22 +; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: 
$vgpr33 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: 
v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v46, v20 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB46_2: ; 
%Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; 
SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 
0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; 
SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: 
v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 
0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48f16_to_v12i64: ; VI: ; %bb.0: @@ -13216,7 +27832,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -13291,9 +27907,9 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v47, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -13368,7 +27984,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -13467,7 +28083,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -13575,9 +28191,9 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -13660,7 +28276,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -13689,7 
+28305,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -13715,7 +28331,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13775,7 +28391,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -13801,7 +28417,7 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 
%b, 0 @@ -13821,257 +28437,1379 @@ end: ret <12 x i64> %phi } +define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48f16_to_v12i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 +; SI-NEXT: 
v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v34 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v21, 
16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v55, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v37, v12 +; SI-NEXT: v_or_b32_e32 v13, v61, v13 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v15, v59, v15 +; SI-NEXT: v_or_b32_e32 v16, v57, v16 +; SI-NEXT: v_or_b32_e32 v17, v47, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v25, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v28, v21 +; SI-NEXT: v_or_b32_e32 v22, v26, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: 
v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v28 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v2, 
v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: 
v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: 
v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, 
off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v59 +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v55, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v53, v43 +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v25 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v27, v43 +; SI-NEXT: v_mov_b32_e32 v25, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 
v50, v52 +; SI-NEXT: v_mov_b32_e32 v43, v53 +; SI-NEXT: v_mov_b32_e32 v52, v55 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v28, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: v_mov_b32_e32 v59, v34 +; SI-NEXT: v_mov_b32_e32 v32, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v48f16_to_v12i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; 
VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 
s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: 
v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v23, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v48, 
v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v39, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v38, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v37, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v35, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v34, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v33, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v22, v24, v22 +; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 
+; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v48f16_to_v12i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 
+; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: 
v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 
+; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: 
s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, 
s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 
0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB47_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v12i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 
:: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: 
v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB47_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <48 x half> %a, splat (half 0xH0200) + %a2 = bitcast <48 x half> %a1 to <12 x i64> + br label %end + +cmp.false: + %a3 = bitcast <48 x half> %a to <12 x i64> + br label %end + +end: + %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i64> %phi +} + define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f64_to_v48i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 
offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v30, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_alignbit_b32 v25, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v26, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v27, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v28, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v29, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v30, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v53, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GCN-NEXT: 
v_lshrrev_b32_e32 v36, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v43, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; 
GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v53, v41, v53 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v40, v42, v40 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v50, v43, v50 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v55, v44, v55 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v48, v45, v48 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v54, v46, v54 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 0x44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v39, v47, v39 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v52, v56, v52 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 0x4c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v9, v9, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x50, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v10, v10, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: 
v_or_b32_e32 v11, v11, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x58, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v12, v12, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_or_b32_e32 v13, v13, v30 -; GCN-NEXT: v_or_b32_e32 v14, v14, v36 -; GCN-NEXT: v_or_b32_e32 v15, v15, v29 -; GCN-NEXT: v_or_b32_e32 v16, v16, v35 -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: v_or_b32_e32 v18, v18, v34 -; GCN-NEXT: v_or_b32_e32 v19, v19, v27 -; GCN-NEXT: v_or_b32_e32 v20, v20, v33 -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: v_or_b32_e32 v22, v22, v32 -; GCN-NEXT: v_or_b32_e32 v23, v23, v25 -; GCN-NEXT: v_or_b32_e32 v24, v24, v31 -; GCN-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v37, s[0:3], 
0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f64_to_v48i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; 
SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v35, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: 
v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_alignbit_b32 v25, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v26, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v27, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v28, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v29, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v35, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v53, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v53 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f64_to_v48i16: ; VI: ; %bb.0: @@ -14103,7 +29841,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -14129,9 +29867,9 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: 
v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -14169,7 +29907,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -14251,7 +29989,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -14277,9 +30015,9 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -14317,7 +30055,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; 
GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -14354,7 +30092,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -14368,7 +30106,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -14403,7 +30141,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -14429,9 +30167,9 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; 
GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -14469,7 +30207,7 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -14514,419 +30252,1354 @@ end: ret <48 x i16> %phi } +define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f64_to_v48i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_mov_b32_e32 v23, s16 +; SI-NEXT: v_mov_b32_e32 v24, s17 +; SI-NEXT: v_mov_b32_e32 v21, s18 +; SI-NEXT: v_mov_b32_e32 v22, s19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v17, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 +; SI-NEXT: v_mov_b32_e32 v15, s24 +; SI-NEXT: v_mov_b32_e32 v16, s25 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v11, s28 +; SI-NEXT: v_mov_b32_e32 v12, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v33, 
v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v35, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v50, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v53, v24, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v35, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v50, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v53, v24, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v32, 
16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_or_b32_e32 v23, v23, v53 +; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v54 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, 
v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v11, 
v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, 
v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v12f64_to_v48i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v24, s17 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; 
VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v21, s24 +; VI-NEXT: v_mov_b32_e32 v22, s25 +; VI-NEXT: v_mov_b32_e32 v17, s26 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: 
v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 +; VI-NEXT: v_or_b32_sdwa v24, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; VI-NEXT: 
v_or_b32_sdwa v26, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v28, v21, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v29, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v17, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v30 +; VI-NEXT: v_mov_b32_e32 v1, v31 +; VI-NEXT: v_mov_b32_e32 v2, v32 +; VI-NEXT: v_mov_b32_e32 v3, v33 +; VI-NEXT: v_mov_b32_e32 v4, v24 +; VI-NEXT: v_mov_b32_e32 v5, v25 +; VI-NEXT: v_mov_b32_e32 v6, v26 +; VI-NEXT: v_mov_b32_e32 v7, v27 +; VI-NEXT: v_mov_b32_e32 v8, v28 +; VI-NEXT: v_mov_b32_e32 v9, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v12f64_to_v48i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v24, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v11, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v21, s24 +; GFX9-NEXT: v_mov_b32_e32 v22, s25 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; 
GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX9-NEXT: 
v_and_b32_e32 v12, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v11, v54, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v53, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v52, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-NEXT: 
v_mov_b32_e32 v8, v28 +; GFX9-NEXT: v_mov_b32_e32 v9, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-TRUE16-LABEL: bitcast_v12f64_to_v48i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v17, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v15, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, 
exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; 
GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: .LBB49_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v52, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v51, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v39, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v37, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v49, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v48, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v38, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v36, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; +; GFX11-FAKE16-LABEL: bitcast_v12f64_to_v48i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: 
v_dual_mov_b32 v21, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz 
.LBB49_3 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: 
.LBB49_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v52, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v51, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; 
GFX11-FAKE16-NEXT: s_branch .LBB49_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <12 x double> %a1 to <48 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x double> %a to <48 x i16> + br label %end + +end: + %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x i16> %phi +} + define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v48i16_to_v12f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v33, v28 -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; GCN-NEXT: 
v_lshlrev_b32_e32 v58, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v59 -; GCN-NEXT: v_or_b32_e32 v1, v1, v60 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v58 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v56 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v46 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v45 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v44 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v43 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v41 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; 
GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v62 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v61 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v57 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v42 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v10, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v11, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v24 -; 
GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v24 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v24 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; 
implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; 
GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v63 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v0, v59, v0 -; GCN-NEXT: v_or_b32_e32 v1, v60, v1 -; GCN-NEXT: v_or_b32_e32 v2, v58, v2 -; GCN-NEXT: v_or_b32_e32 v3, v56, v3 -; 
GCN-NEXT: v_or_b32_e32 v4, v47, v4 -; GCN-NEXT: v_or_b32_e32 v5, v46, v5 -; GCN-NEXT: v_or_b32_e32 v6, v45, v6 -; GCN-NEXT: v_or_b32_e32 v7, v44, v7 -; GCN-NEXT: v_or_b32_e32 v8, v43, v8 -; GCN-NEXT: v_or_b32_e32 v9, v41, v9 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v24, v16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v24, v17 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v24, v18 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v24, v19 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v24, v20 -; GCN-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v24, v21 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v24, v22 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v24, v23 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 
offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48i16_to_v12f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 
offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 
v35, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) 
+; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; 
SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v46 +; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v4, v4, v45 +; SI-NEXT: v_or_b32_e32 v5, v5, v37 +; SI-NEXT: v_or_b32_e32 v6, v6, v44 +; SI-NEXT: v_or_b32_e32 v7, v7, v36 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v21, 
0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v43 +; SI-NEXT: v_or_b32_e32 v10, v10, v42 +; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_or_b32_e32 v12, v12, v41 +; SI-NEXT: v_or_b32_e32 v13, v13, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v40 +; SI-NEXT: v_or_b32_e32 v15, v15, v32 +; SI-NEXT: v_or_b32_e32 v16, v16, v63 +; SI-NEXT: v_or_b32_e32 v17, v17, v62 +; SI-NEXT: v_or_b32_e32 v18, v18, v61 +; SI-NEXT: v_or_b32_e32 v19, v19, v60 +; SI-NEXT: v_or_b32_e32 v20, v20, v59 +; SI-NEXT: v_or_b32_e32 v21, v21, v58 +; SI-NEXT: v_or_b32_e32 v22, v22, v57 +; SI-NEXT: v_or_b32_e32 v23, v23, v56 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; 
SI-NEXT: v_or_b32_e32 v2, v46, v2 +; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 +; SI-NEXT: v_or_b32_e32 v5, v37, v5 +; SI-NEXT: v_or_b32_e32 v6, v44, v6 +; SI-NEXT: v_or_b32_e32 v7, v36, v7 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; 
SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v8, v35, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v42, v10 +; SI-NEXT: v_or_b32_e32 v11, v34, v11 +; SI-NEXT: v_or_b32_e32 v12, v41, v12 +; SI-NEXT: v_or_b32_e32 v13, v33, v13 +; SI-NEXT: v_or_b32_e32 v14, v40, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v16, v63, v16 +; SI-NEXT: v_or_b32_e32 v17, v62, v17 +; SI-NEXT: v_or_b32_e32 v18, v61, v18 +; SI-NEXT: v_or_b32_e32 v19, v60, v19 +; SI-NEXT: v_or_b32_e32 v20, v59, v20 +; SI-NEXT: v_or_b32_e32 v21, v58, v21 +; SI-NEXT: v_or_b32_e32 v22, v57, v22 +; SI-NEXT: v_or_b32_e32 v23, v56, v23 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 
offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v12f64: ; VI: ; %bb.0: @@ -14967,7 +31640,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -15042,9 +31715,9 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: 
.LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v47 @@ -15119,7 +31792,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v24, 3, v32 ; VI-NEXT: v_add_u16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -15218,7 +31891,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -15326,9 +31999,9 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 
offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -15410,7 +32083,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -15439,7 +32112,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -15465,7 +32138,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15525,7 +32198,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; 
GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -15551,7 +32224,7 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -15571,508 +32244,1505 @@ end: ret <12 x double> %phi } +define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48i16_to_v12f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_mov_b32_e32 v38, v16 +; SI-NEXT: v_mov_b32_e32 v39, v14 +; SI-NEXT: v_mov_b32_e32 v48, v12 +; SI-NEXT: v_mov_b32_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; 
SI-NEXT: v_or_b32_e32 v7, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v16, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v17, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 
s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v58 +; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 
3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, 
v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: v_mov_b32_e32 v43, v34 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v25 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v49 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v36, v24 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v25, v44 +; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v41, v42 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v24, v36 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v32, v56 +; SI-NEXT: v_mov_b32_e32 v33, v47 +; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v48i16_to_v12f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: 
s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: 
v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, 
s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: 
v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; 
VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v48i16_to_v12f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: 
v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 
v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v12f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: 
v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s12, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <48 x i16> %a, splat (i16 3) + %a2 = bitcast <48 x i16> %a1 to <12 x double> + br label %end + +cmp.false: + %a3 = bitcast <48 x i16> %a to <12 x double> + br label %end + +end: + %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x double> %phi +} + define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v12f64_to_v48f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: 
; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; kill: killed $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 
v33, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v25 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v26 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v27 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v28 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_mov_b32_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 
v29, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v39 -; GCN-NEXT: v_mov_b32_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v49 -; GCN-NEXT: v_mov_b32_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 
v56, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v7 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v33 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v61 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v59 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v57 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v47 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v46 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v44 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v42 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v40 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v55 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v53 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v43 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v41 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v54 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v25, v27, v25 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; 
GCN-NEXT: v_add_i32_e32 v29, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 
16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_or_b32_e32 v37, v39, v38 -; GCN-NEXT: v_or_b32_e32 v38, v49, v48 -; GCN-NEXT: v_or_b32_e32 v39, v51, v50 -; GCN-NEXT: v_or_b32_e32 v48, v54, v53 -; GCN-NEXT: v_or_b32_e32 v49, v55, v52 -; GCN-NEXT: v_or_b32_e32 v50, v41, v40 -; GCN-NEXT: v_or_b32_e32 v51, v43, v42 -; GCN-NEXT: v_or_b32_e32 v52, v45, v44 -; GCN-NEXT: v_or_b32_e32 v53, v47, v46 -; GCN-NEXT: v_or_b32_e32 v54, v57, v56 -; GCN-NEXT: v_or_b32_e32 v55, v59, v58 -; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12f64_to_v48f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; 
implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v25 +; SI-NEXT: 
v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 
offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; 
SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v30, 
v2 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_mov_b32_e32 v35, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 +; 
SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 
v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, 
off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12f64_to_v48f16: ; VI: ; %bb.0: @@ -16104,7 +33774,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -16130,9 +33800,9 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: s_cbranch_execz .LBB52_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 
; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -16170,7 +33840,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 @@ -16252,7 +33922,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -16278,9 +33948,9 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: .LBB52_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: s_cbranch_execz .LBB52_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -16318,7 +33988,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v55, v0, s4 @@ -16355,7 +34025,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -16369,7 +34039,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16404,7 +34074,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 @@ -16430,9 +34100,9 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 @@ -16470,7 +34140,7 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v53, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100 @@ -16515,538 +34185,1688 @@ end: ret <48 x half> %phi } +define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12f64_to_v48f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; SI-NEXT: v_readfirstlane_b32 s12, v1 +; SI-NEXT: v_readfirstlane_b32 s13, v2 +; SI-NEXT: v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: v_readfirstlane_b32 s7, v8 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: s_and_b64 s[14:15], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v10 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s14, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 +; SI-NEXT: s_lshr_b32 s14, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s14 +; SI-NEXT: s_lshr_b32 s14, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s14 +; SI-NEXT: s_lshr_b32 s14, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s14 +; SI-NEXT: s_lshr_b32 s14, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s14 +; SI-NEXT: s_lshr_b32 s14, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s14 +; SI-NEXT: s_lshr_b32 s14, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s14 +; SI-NEXT: s_lshr_b32 s14, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s14 +; SI-NEXT: s_lshr_b32 s14, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s14 +; SI-NEXT: s_lshr_b32 s14, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s14 +; SI-NEXT: s_lshr_b32 s14, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: s_lshr_b32 s14, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s14 +; SI-NEXT: s_lshr_b32 s14, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s14 +; SI-NEXT: s_lshr_b32 s14, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s14 +; SI-NEXT: s_lshr_b32 s14, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s14 +; SI-NEXT: s_lshr_b32 s14, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s14 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s14 +; SI-NEXT: s_lshr_b32 s14, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s14 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s14 +; SI-NEXT: s_lshr_b32 s14, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s14 +; SI-NEXT: s_lshr_b32 s14, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s14 +; SI-NEXT: s_lshr_b32 s14, s18, 16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v52, s14 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s14 +; SI-NEXT: s_lshr_b32 s14, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[50:51], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[37:38], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[33:34], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[29:30], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[27:28], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[22:23], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; SI-NEXT: 
v_lshrrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v51, v51, v52 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 
v33, v33 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 40, v0 +; 
SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, 
v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; 
implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v12f64_to_v48f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v24, s17 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v11, s22 +; VI-NEXT: v_mov_b32_e32 v12, s23 +; VI-NEXT: v_mov_b32_e32 v21, s24 +; VI-NEXT: v_mov_b32_e32 v22, s25 +; VI-NEXT: v_mov_b32_e32 v17, s26 +; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: 
v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, 
v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v23, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 +; VI-NEXT: v_or_b32_sdwa v24, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 +; VI-NEXT: 
v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v28, v21, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v29, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v17, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v30 +; VI-NEXT: v_mov_b32_e32 v1, v31 +; VI-NEXT: v_mov_b32_e32 v2, v32 +; VI-NEXT: v_mov_b32_e32 v3, v33 +; VI-NEXT: v_mov_b32_e32 v4, v24 +; 
VI-NEXT: v_mov_b32_e32 v5, v25 +; VI-NEXT: v_mov_b32_e32 v6, v26 +; VI-NEXT: v_mov_b32_e32 v7, v27 +; VI-NEXT: v_mov_b32_e32 v8, v28 +; VI-NEXT: v_mov_b32_e32 v9, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v12f64_to_v48f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v24, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v11, s22 +; GFX9-NEXT: v_mov_b32_e32 v12, s23 +; GFX9-NEXT: v_mov_b32_e32 v21, s24 +; GFX9-NEXT: v_mov_b32_e32 v22, s25 +; GFX9-NEXT: v_mov_b32_e32 v17, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v13, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 
16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v50, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; 
GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v22, v35, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v11, v54, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v53, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v52, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v23, v34, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-NEXT: v_mov_b32_e32 v8, v28 +; GFX9-NEXT: v_mov_b32_e32 v9, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; 
GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-TRUE16-LABEL: bitcast_v12f64_to_v48f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v17, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v15, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-TRUE16-NEXT: .LBB53_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v52, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v51, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v39, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v37, 16, v18 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v49, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v48, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v38, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v36, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 +; +; GFX11-FAKE16-LABEL: bitcast_v12f64_to_v48f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s18 :: v_dual_mov_b32 v8, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v31, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v24 +; GFX11-FAKE16-NEXT: .LBB53_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v52, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v51, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v35, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v49, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v38, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v37, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v36, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v34, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v32, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v31, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v30, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <12 x double> %a1 to <48 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x double> %a to <48 x half> + br label %end + +end: + %phi = phi <48 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x half> %phi +} + define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v48f16_to_v12f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, 
off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; 
GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 
v59, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v53 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v52, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; GCN-NEXT: v_or_b32_e32 v2, v50, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v39 -; GCN-NEXT: v_or_b32_e32 v4, v38, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; GCN-NEXT: v_or_b32_e32 v5, v36, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GCN-NEXT: v_or_b32_e32 v6, v34, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, 
v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v24, v10 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v24, v11 -; GCN-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v24, v12 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v24, v13 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v24, v14 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v24, v15 -; GCN-NEXT: v_or_b32_e32 v16, v61, v16 -; GCN-NEXT: v_or_b32_e32 v17, v59, v17 -; GCN-NEXT: v_or_b32_e32 v18, v57, v18 -; GCN-NEXT: v_or_b32_e32 v19, v47, v19 -; GCN-NEXT: v_or_b32_e32 v20, v32, v20 -; GCN-NEXT: v_or_b32_e32 v21, v43, v21 -; GCN-NEXT: v_or_b32_e32 v22, v41, v22 -; GCN-NEXT: v_or_b32_e32 v23, v40, v23 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: 
killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; kill: killed $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v50 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v48 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v36 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v34 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v33 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, 
v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: 
v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, 
v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_or_b32_e32 v11, v13, v12 -; GCN-NEXT: v_or_b32_e32 v12, v15, v14 -; GCN-NEXT: v_or_b32_e32 v13, v17, v16 -; GCN-NEXT: v_or_b32_e32 v14, v19, v18 -; GCN-NEXT: v_or_b32_e32 v15, v21, v20 -; GCN-NEXT: v_or_b32_e32 v16, v23, v22 -; GCN-NEXT: v_or_b32_e32 v17, v25, v24 -; GCN-NEXT: v_or_b32_e32 v18, v27, v26 -; GCN-NEXT: v_or_b32_e32 v19, v29, v28 -; GCN-NEXT: v_or_b32_e32 v20, v31, v30 -; GCN-NEXT: v_or_b32_e32 v21, v33, v32 -; GCN-NEXT: v_or_b32_e32 v22, v35, v34 -; GCN-NEXT: v_or_b32_e32 v23, v37, v36 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; 
GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48f16_to_v12f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; 
SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, 
v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; 
kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_or_b32_e32 v2, v50, v2 +; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v4, v38, v4 +; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 +; SI-NEXT: v_or_b32_e32 v21, v44, v21 +; SI-NEXT: v_or_b32_e32 v22, v42, v22 +; SI-NEXT: v_or_b32_e32 v23, v40, v23 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; 
SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_or_b32_e32 v20, v46, v20 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: 
buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: 
v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: 
v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v45 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v43 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v42 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48f16_to_v12f64: ; VI: ; %bb.0: @@ -17087,7 +35907,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v23, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v23, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -17162,9 +35982,9 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v23, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v47, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -17239,7 +36059,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; 
VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -17338,7 +36158,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload @@ -17446,9 +36266,9 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; kill: killed $vgpr24 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -17531,7 +36351,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v23, v23, 
s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -17560,7 +36380,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -17586,7 +36406,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17646,7 +36466,7 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -17672,9 +36492,1135 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] ; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <48 x half> %a, splat (half 0xH0200) + %a2 = bitcast <48 x half> %a1 to <12 x double> + br label %end + +cmp.false: + %a3 = bitcast <48 x half> %a to <12 x double> + br label %end + +end: + %phi = phi <12 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x double> %phi +} + +define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48f16_to_v12f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s18 +; SI-NEXT: 
v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v34 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v8, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: 
v_lshlrev_b32_e32 v6, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v54, v4 +; SI-NEXT: v_or_b32_e32 v5, v35, v5 +; SI-NEXT: v_or_b32_e32 v6, v30, v6 +; SI-NEXT: v_or_b32_e32 v7, v55, v7 +; SI-NEXT: v_or_b32_e32 v8, v63, v8 +; SI-NEXT: v_or_b32_e32 v9, v43, v9 +; SI-NEXT: v_or_b32_e32 v10, v49, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v37, v12 +; SI-NEXT: v_or_b32_e32 v13, v61, v13 +; SI-NEXT: v_or_b32_e32 v14, v33, v14 +; SI-NEXT: v_or_b32_e32 v15, v59, v15 +; SI-NEXT: v_or_b32_e32 v16, v57, v16 +; SI-NEXT: v_or_b32_e32 v17, v47, v17 +; SI-NEXT: v_or_b32_e32 v18, v45, v18 +; SI-NEXT: v_or_b32_e32 v19, v25, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v28, v21 +; SI-NEXT: v_or_b32_e32 v22, v26, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v47 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: 
v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v41 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v28 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: 
v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 
+; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v46 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v44 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: 
v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v26 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: 
.LBB55_4: +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v32 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v59 +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v60 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v42 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v55, v52 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v53, v43 +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: v_mov_b32_e32 v61, v56 +; SI-NEXT: v_mov_b32_e32 v56, v25 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v44 +; SI-NEXT: v_mov_b32_e32 v44, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v27, v43 +; SI-NEXT: v_mov_b32_e32 v25, v56 +; SI-NEXT: v_mov_b32_e32 v56, v61 +; SI-NEXT: v_mov_b32_e32 v61, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v52 +; SI-NEXT: v_mov_b32_e32 v43, v53 +; SI-NEXT: v_mov_b32_e32 v52, v55 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v28, v44 +; SI-NEXT: v_mov_b32_e32 v44, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v24, v41 +; SI-NEXT: v_mov_b32_e32 v41, v46 +; SI-NEXT: v_mov_b32_e32 v46, v59 +; SI-NEXT: v_mov_b32_e32 v59, v34 +; SI-NEXT: v_mov_b32_e32 v32, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: 
bitcast_v48f16_to_v12f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v32, v9 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v34, v7 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v36, v5 +; VI-NEXT: v_mov_b32_e32 v37, v4 +; VI-NEXT: v_mov_b32_e32 v38, v3 +; VI-NEXT: v_mov_b32_e32 v39, v2 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; 
VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v14, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v16, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v17, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v18, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v19, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; 
VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_or_b32_sdwa v20, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v21, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: 
v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 
v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v23, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v49, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v48, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v39, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v38, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v37, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v35, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v34, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v34 +; VI-NEXT: v_or_b32_e32 
v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v33, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v22, v24, v22 +; VI-NEXT: v_add_f16_sdwa v23, v32, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v48f16_to_v12f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v9 +; GFX9-NEXT: v_mov_b32_e32 v33, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v7 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: v_mov_b32_e32 v36, v5 +; GFX9-NEXT: v_mov_b32_e32 v37, v4 +; GFX9-NEXT: v_mov_b32_e32 v38, v3 +; GFX9-NEXT: v_mov_b32_e32 v39, v2 +; GFX9-NEXT: v_mov_b32_e32 v48, v1 +; GFX9-NEXT: v_mov_b32_e32 v49, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v49 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v0 +; GFX9-NEXT: 
v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v43, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v42, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v41, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v40, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v55, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v54, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v53, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v52, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v23 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v3 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s57 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v25, 
16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB55_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v12f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v2 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v5 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s0, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s18, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s19, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s20, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s21, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s22, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s23, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s24, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s25, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s26, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, 
vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s15 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v37, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v35, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v33, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s7 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB55_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -17693,697 +37639,713 @@ end: } define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v48i16_to_v48f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; GCN-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed 
$vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; kill: killed $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; 
GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v24 -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; 
implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 
v59, vcc, 3, v59 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v35 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v36 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v37 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v38 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v39 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v48 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v49 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v51 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v10 
-; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v52 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v53 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v44, off, 
s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: 
v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v33, v38, v33 -; GCN-NEXT: v_or_b32_e32 v32, v39, v32 -; GCN-NEXT: v_or_b32_e32 v38, v49, v48 -; GCN-NEXT: v_or_b32_e32 v39, v51, v50 -; GCN-NEXT: v_or_b32_e32 v48, v53, v52 -; GCN-NEXT: v_or_b32_e32 v49, v55, v54 -; GCN-NEXT: v_or_b32_e32 v50, v41, v40 -; GCN-NEXT: v_or_b32_e32 v51, v43, v42 -; GCN-NEXT: v_or_b32_e32 v52, v45, v44 -; GCN-NEXT: v_or_b32_e32 v53, v47, v46 -; GCN-NEXT: v_or_b32_e32 v54, v57, v56 -; GCN-NEXT: v_or_b32_e32 v55, v59, v58 -; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v48i16_to_v48f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 +; SI-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:8 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; 
implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v40 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, 
v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; 
SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, 
v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
+; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48i16_to_v48f16: ; VI: ; %bb.0: @@ -18416,7 +38378,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v55, 3, v55 @@ -18466,7 +38428,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: v_add_u16_e32 v23, 3, v23 ; VI-NEXT: v_add_u16_e32 v25, 3, v25 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_or_b32_sdwa v4, v4, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -18549,7 +38511,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v23, v55, v23, s6 @@ -18624,7 +38586,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { 
; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v23 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v24, v0, s4 @@ -18661,7 +38623,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] @@ -18687,7 +38649,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18723,7 +38685,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v22, v55, v22, 0x5040100 @@ -18797,7 +38759,7 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v22 ; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v64, 16, v23 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v25, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v26, v1, 0x5040100 @@ -18841,450 +38803,1845 @@ end: ret <48 x half> %phi } +define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48i16_to_v48f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v54, off, 
s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v41 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s21 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 +; SI-NEXT: v_cvt_f32_f16_e32 
v38, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v54 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 +; 
SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, 
v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: 
$vgpr63 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v48i16_to_v48f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, 
s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, 
v17 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_or_b32_sdwa v14, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: s_or_b32 s11, s18, 
s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; VI-NEXT: s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 
+; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v48i16_to_v48f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v13, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v11, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v10, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v29, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v28, s4, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v26, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v25, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v24, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v33, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v32, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v31, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v29, s25 +; GFX9-NEXT: v_mov_b32_e32 v28, s24 +; GFX9-NEXT: v_mov_b32_e32 v27, s23 +; GFX9-NEXT: v_mov_b32_e32 v26, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s21 +; GFX9-NEXT: v_mov_b32_e32 v24, s20 +; GFX9-NEXT: v_mov_b32_e32 v33, s19 +; GFX9-NEXT: v_mov_b32_e32 v32, s18 +; GFX9-NEXT: v_mov_b32_e32 v31, s17 +; GFX9-NEXT: v_mov_b32_e32 v30, s16 +; GFX9-NEXT: v_mov_b32_e32 v34, s43 +; GFX9-NEXT: v_mov_b32_e32 v35, s42 +; GFX9-NEXT: 
v_mov_b32_e32 v36, s41 +; GFX9-NEXT: v_mov_b32_e32 v37, s40 +; GFX9-NEXT: v_mov_b32_e32 v38, s15 +; GFX9-NEXT: v_mov_b32_e32 v39, s14 +; GFX9-NEXT: v_mov_b32_e32 v48, s13 +; GFX9-NEXT: v_mov_b32_e32 v49, s12 +; GFX9-NEXT: v_mov_b32_e32 v50, s11 +; GFX9-NEXT: v_mov_b32_e32 v51, s10 +; GFX9-NEXT: v_mov_b32_e32 v52, s9 +; GFX9-NEXT: v_mov_b32_e32 v53, s8 +; GFX9-NEXT: v_mov_b32_e32 v54, s7 +; GFX9-NEXT: v_mov_b32_e32 v55, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v30, v55, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v31, v54, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v32, v53, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v52, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v51, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v49, 16, v26 +; GFX9-NEXT: 
v_lshl_or_b32 v27, v48, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v39, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v38, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v37, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v36, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v35, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v34, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-NEXT: v_mov_b32_e32 v8, v28 +; GFX9-NEXT: v_mov_b32_e32 v9, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v48f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v7, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s14, s11 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s13, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s11, s7 
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s10, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, 
v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v51.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v52.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v53.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v55.l +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s23 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v12.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s9 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v33, 
16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v49, 16, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v39, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v19, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v30, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v48, 16, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v37, 16, v38 +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v11, v32, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v48f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; 
GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 
v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, 
v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s29 :: v_dual_mov_b32 v15, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v17, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v10, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s21 :: v_dual_mov_b32 v6, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s17 :: v_dual_mov_b32 v29, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s3 :: v_dual_mov_b32 v25, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s1 :: v_dual_mov_b32 v27, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v31, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s43 :: v_dual_mov_b32 v33, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s41 :: v_dual_mov_b32 v35, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s15 :: v_dual_mov_b32 v37, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s13 :: v_dual_mov_b32 v39, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s11 :: v_dual_mov_b32 v49, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s9 :: v_dual_mov_b32 v51, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s6 :: v_dual_mov_b32 v53, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 
0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v49, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v48, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 
0xffff, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <48 x i16> %a, splat (i16 3) + %a2 = bitcast <48 x i16> %a1 to <48 x half> + br label %end + +cmp.false: + %a3 = bitcast <48 x i16> %a to <48 x half> + br label %end + +end: + %phi = phi <48 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x half> %phi +} + define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v48f16_to_v48i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 
4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], 
s32 offset:4 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v44 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v23, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; 
GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v53, v31 -; GCN-NEXT: v_or_b32_e32 v51, v51, v32 -; GCN-NEXT: v_or_b32_e32 v49, v49, v54 -; GCN-NEXT: v_or_b32_e32 v39, v39, v52 -; GCN-NEXT: v_or_b32_e32 v37, v37, v50 -; GCN-NEXT: v_or_b32_e32 v36, v36, v48 -; GCN-NEXT: v_or_b32_e32 v35, v35, v38 -; GCN-NEXT: v_or_b32_e32 v28, v28, v30 -; GCN-NEXT: v_or_b32_e32 v26, v26, v29 -; GCN-NEXT: v_or_b32_e32 v24, v24, v27 -; GCN-NEXT: v_or_b32_e32 v22, v22, v25 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v55 -; GCN-NEXT: 
v_or_b32_e32 v13, v13, v40 -; GCN-NEXT: v_or_b32_e32 v15, v15, v42 -; GCN-NEXT: v_or_b32_e32 v17, v17, v43 -; GCN-NEXT: v_or_b32_e32 v20, v20, v44 -; GCN-NEXT: v_or_b32_e32 v19, v19, v46 -; GCN-NEXT: v_or_b32_e32 v18, v18, v47 -; GCN-NEXT: v_or_b32_e32 v16, v16, v56 -; GCN-NEXT: v_or_b32_e32 v14, v14, v57 -; GCN-NEXT: v_or_b32_e32 v4, v4, v58 -; GCN-NEXT: v_or_b32_e32 v3, v3, v59 -; GCN-NEXT: v_or_b32_e32 v2, v2, v60 -; GCN-NEXT: v_alignbit_b32 v40, v2, v31, 16 -; GCN-NEXT: v_alignbit_b32 v55, v3, v32, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v54, 16 -; GCN-NEXT: v_alignbit_b32 v52, v14, v52, 16 -; GCN-NEXT: v_alignbit_b32 v50, v16, v50, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v48, 16 -; GCN-NEXT: v_alignbit_b32 v38, v19, v38, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v30, 16 -; GCN-NEXT: v_alignbit_b32 v29, v17, v29, 16 -; GCN-NEXT: v_alignbit_b32 v27, v15, v27, 16 -; GCN-NEXT: v_alignbit_b32 v25, v13, v25, 16 -; GCN-NEXT: v_alignbit_b32 v23, v12, v23, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; 
GCN-NEXT: v_add_i32_e32 v47, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v53, v53, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v51, v51, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v49, v49, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v4, v4, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v39, v39, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v9, v14, v9 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: 
v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v37, v37, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v10, v16, v10 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v36, v36, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v11, v18, v11 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x5c, v0 -; GCN-NEXT: v_or_b32_e32 v35, v35, v38 -; GCN-NEXT: v_or_b32_e32 v6, v19, v6 -; GCN-NEXT: v_or_b32_e32 v19, v28, v30 -; GCN-NEXT: v_or_b32_e32 v5, v20, v5 -; GCN-NEXT: v_or_b32_e32 v20, v26, v29 -; GCN-NEXT: v_or_b32_e32 v17, v17, v34 -; GCN-NEXT: v_or_b32_e32 v24, v24, v27 -; GCN-NEXT: v_or_b32_e32 v15, v15, v33 -; GCN-NEXT: v_or_b32_e32 v22, v22, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v45 -; GCN-NEXT: v_or_b32_e32 v21, v21, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v41 -; GCN-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v39, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v55, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v5, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v48f16_to_v48i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 +; SI-NEXT: 
s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v57 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v58 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v59 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v60 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v61 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v62 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v34 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v53 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v52, v21 +; SI-NEXT: v_mov_b32_e32 v21, v6 +; SI-NEXT: v_mov_b32_e32 v6, v7 +; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: v_mov_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v58, v2 +; SI-NEXT: v_mov_b32_e32 v60, v50 +; SI-NEXT: v_mov_b32_e32 v50, v3 +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v29, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v58 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 
+; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_or_b32_e32 v60, v30, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v26 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_or_b32_e32 v58, v33, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v52 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v54 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v52, v35, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v25 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v57 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_or_b32_e32 v57, v33, v23 +; 
SI-NEXT: v_add_f32_e32 v33, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v56, v24, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v3 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v34, v24, v47 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, 
v38 +; SI-NEXT: v_or_b32_e32 v59, v2, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_or_b32_e32 v5, v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v11, v11, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_or_b32_e32 v10, v10, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v36 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_or_b32_e32 v14, v14, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v20, v20, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v18 +; SI-NEXT: v_or_b32_e32 v19, v19, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v16 +; SI-NEXT: v_or_b32_e32 v17, v17, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v35 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v37 +; SI-NEXT: v_or_b32_e32 v38, v33, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v42 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v37, v35, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v41 +; SI-NEXT: 
v_cvt_f32_f16_e32 v35, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v39 +; SI-NEXT: v_or_b32_e32 v51, v33, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v50, v24, v33 +; SI-NEXT: v_or_b32_e32 v8, v8, v29 +; SI-NEXT: v_or_b32_e32 v7, v7, v30 +; SI-NEXT: v_or_b32_e32 v6, v6, v55 +; SI-NEXT: v_or_b32_e32 v21, v21, v45 +; SI-NEXT: v_or_b32_e32 v28, v28, v25 +; SI-NEXT: v_or_b32_e32 v27, v27, v46 +; SI-NEXT: v_alignbit_b32 v44, v50, v31, 16 +; SI-NEXT: v_alignbit_b32 v43, v51, v32, 16 +; SI-NEXT: v_alignbit_b32 v42, v37, v29, 16 +; SI-NEXT: v_alignbit_b32 v41, v38, v30, 16 +; SI-NEXT: v_alignbit_b32 v40, v17, v55, 16 +; SI-NEXT: v_alignbit_b32 v55, v19, v45, 16 +; SI-NEXT: v_alignbit_b32 v54, v20, v26, 16 +; SI-NEXT: 
v_alignbit_b32 v26, v14, v25, 16 +; SI-NEXT: v_alignbit_b32 v25, v10, v46, 16 +; SI-NEXT: v_alignbit_b32 v24, v11, v23, 16 +; SI-NEXT: v_alignbit_b32 v23, v5, v22, 16 +; SI-NEXT: v_alignbit_b32 v22, v59, v47, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v44 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v39 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v42 +; SI-NEXT: v_or_b32_e32 v8, v8, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v8, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v8, v29, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v40 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v55 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v54 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 
v4, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v48f16_to_v48i16: ; VI: ; %bb.0: @@ -19317,7 +40674,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz 
.LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 @@ -19367,7 +40724,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 ; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_or_b32_sdwa v4, v4, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -19450,7 +40807,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v23, v55, v23, s6 @@ -19526,7 +40883,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v22 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v23 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v24, v0, s4 @@ -19563,7 +40920,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] @@ -19589,7 +40946,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x 
half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19625,7 +40982,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v22, v55, v22, 0x5040100 @@ -19699,7 +41056,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v21 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v22 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v23 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v25, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v26, v1, 0x5040100 @@ -19742,3 +41099,1251 @@ end: %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <48 x i16> %phi } + +define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, 
s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s22 +; SI-NEXT: 
v_cvt_f16_f32_e32 v19, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v41 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v53 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, 
v51 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 
+; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v10, v10, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v14, v14, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; 
SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v35, v35, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_or_b32_e32 v34, v34, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_or_b32_e32 v38, v38, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v49, v49, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 +; SI-NEXT: v_or_b32_e32 v23, v23, v50 +; SI-NEXT: 
v_lshlrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v53 +; SI-NEXT: v_or_b32_e32 v22, v22, v50 +; SI-NEXT: v_or_b32_e32 v25, v25, v30 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_or_b32_e32 v21, v21, v41 +; SI-NEXT: v_or_b32_e32 v16, v16, v28 +; SI-NEXT: v_or_b32_e32 v48, v48, v54 +; SI-NEXT: v_or_b32_e32 v39, v39, v42 +; SI-NEXT: v_or_b32_e32 v32, v32, v52 +; SI-NEXT: v_or_b32_e32 v31, v31, v51 +; SI-NEXT: v_or_b32_e32 v15, v15, v43 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_or_b32_e32 v7, v7, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v44 +; SI-NEXT: v_alignbit_b32 v40, v22, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v23, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, v18, v41, 16 +; SI-NEXT: v_alignbit_b32 v28, v49, v28, 16 +; SI-NEXT: v_alignbit_b32 v55, v38, v54, 16 +; SI-NEXT: v_alignbit_b32 v54, v34, v42, 16 +; SI-NEXT: v_alignbit_b32 v53, v35, v52, 16 +; SI-NEXT: v_alignbit_b32 v52, v14, v51, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v43, 16 +; SI-NEXT: v_alignbit_b32 v50, v11, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, v5, v26, 16 +; SI-NEXT: v_alignbit_b32 v26, v2, v44, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v54 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 
offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v16, v12 +; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 +; SI-NEXT: v_or_b32_e32 v12, v12, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v51 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v8, 
v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v8, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v48f16_to_v48i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 
s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v34, 0x200 +; VI-NEXT: v_add_f16_e32 v30, s16, v34 +; VI-NEXT: v_add_f16_e32 v55, s43, v34 +; VI-NEXT: v_add_f16_e32 v31, s17, v34 +; VI-NEXT: v_add_f16_e32 v54, s42, v34 +; VI-NEXT: v_add_f16_e32 v32, s18, v34 +; VI-NEXT: v_add_f16_e32 v53, s41, v34 +; VI-NEXT: v_add_f16_e32 v33, s19, v34 +; VI-NEXT: v_add_f16_e32 v52, s40, v34 +; VI-NEXT: v_add_f16_e32 v24, s20, v34 +; VI-NEXT: v_add_f16_e32 v51, s15, v34 +; VI-NEXT: v_add_f16_e32 v25, s21, v34 +; VI-NEXT: v_add_f16_e32 v50, s14, v34 +; VI-NEXT: v_add_f16_e32 v26, s22, v34 +; VI-NEXT: v_add_f16_e32 v49, s13, v34 +; VI-NEXT: v_add_f16_e32 v27, s23, v34 +; VI-NEXT: v_add_f16_e32 v48, s12, v34 +; VI-NEXT: v_add_f16_e32 v28, s24, v34 +; VI-NEXT: v_add_f16_e32 v39, s11, v34 +; VI-NEXT: v_add_f16_e32 v29, s25, v34 +; VI-NEXT: v_add_f16_e32 v38, s10, v34 +; VI-NEXT: v_add_f16_e32 v10, s26, v34 +; VI-NEXT: v_add_f16_e32 v37, s9, v34 +; VI-NEXT: v_add_f16_e32 v11, 
s27, v34 +; VI-NEXT: v_add_f16_e32 v36, s8, v34 +; VI-NEXT: v_add_f16_e32 v12, s28, v34 +; VI-NEXT: v_add_f16_e32 v35, s7, v34 +; VI-NEXT: v_add_f16_e32 v13, s29, v34 +; VI-NEXT: v_add_f16_e32 v34, s6, v34 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v34, s6 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v35, s7 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v36, s8 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v37, s9 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v38, s10 +; VI-NEXT: v_mov_b32_e32 v29, s25 +; VI-NEXT: v_mov_b32_e32 v39, s11 +; VI-NEXT: v_mov_b32_e32 v28, s24 +; VI-NEXT: v_mov_b32_e32 v48, s12 +; VI-NEXT: v_mov_b32_e32 v27, s23 +; VI-NEXT: v_mov_b32_e32 v49, s13 +; VI-NEXT: v_mov_b32_e32 v26, s22 +; VI-NEXT: v_mov_b32_e32 v50, s14 +; VI-NEXT: v_mov_b32_e32 v25, s21 +; VI-NEXT: v_mov_b32_e32 v51, s15 +; VI-NEXT: v_mov_b32_e32 v24, s20 +; VI-NEXT: v_mov_b32_e32 v52, s40 +; VI-NEXT: v_mov_b32_e32 v33, s19 +; VI-NEXT: v_mov_b32_e32 v53, s41 +; VI-NEXT: v_mov_b32_e32 v32, s18 +; VI-NEXT: v_mov_b32_e32 v54, 
s42 +; VI-NEXT: v_mov_b32_e32 v31, s17 +; VI-NEXT: v_mov_b32_e32 v55, s43 +; VI-NEXT: v_mov_b32_e32 v30, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v30, v30, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v33, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v10, v10, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v30 +; VI-NEXT: v_mov_b32_e32 v1, v31 +; VI-NEXT: v_mov_b32_e32 v2, v32 +; VI-NEXT: v_mov_b32_e32 v3, v33 +; VI-NEXT: v_mov_b32_e32 v4, v24 +; VI-NEXT: v_mov_b32_e32 v5, v25 +; VI-NEXT: v_mov_b32_e32 v6, v26 +; VI-NEXT: v_mov_b32_e32 v7, v27 +; 
VI-NEXT: v_mov_b32_e32 v8, v28 +; VI-NEXT: v_mov_b32_e32 v9, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v48f16_to_v48i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 +; GFX9-NEXT: 
v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v11, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v10, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v29, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v28, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v27, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v26, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v25, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v24, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 
v33, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v32, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v31, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v30, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v29, s25 +; GFX9-NEXT: v_mov_b32_e32 v28, s24 +; GFX9-NEXT: v_mov_b32_e32 v27, s23 +; GFX9-NEXT: v_mov_b32_e32 v26, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s21 +; GFX9-NEXT: v_mov_b32_e32 v24, s20 +; GFX9-NEXT: v_mov_b32_e32 v33, s19 +; GFX9-NEXT: v_mov_b32_e32 v32, s18 +; GFX9-NEXT: v_mov_b32_e32 v31, s17 +; 
GFX9-NEXT: v_mov_b32_e32 v30, s16 +; GFX9-NEXT: v_mov_b32_e32 v34, s43 +; GFX9-NEXT: v_mov_b32_e32 v35, s42 +; GFX9-NEXT: v_mov_b32_e32 v36, s41 +; GFX9-NEXT: v_mov_b32_e32 v37, s40 +; GFX9-NEXT: v_mov_b32_e32 v38, s15 +; GFX9-NEXT: v_mov_b32_e32 v39, s14 +; GFX9-NEXT: v_mov_b32_e32 v48, s13 +; GFX9-NEXT: v_mov_b32_e32 v49, s12 +; GFX9-NEXT: v_mov_b32_e32 v50, s11 +; GFX9-NEXT: v_mov_b32_e32 v51, s10 +; GFX9-NEXT: v_mov_b32_e32 v52, s9 +; GFX9-NEXT: v_mov_b32_e32 v53, s8 +; GFX9-NEXT: v_mov_b32_e32 v54, s7 +; GFX9-NEXT: v_mov_b32_e32 v55, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v30, v55, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v31, v54, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v32, v53, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v52, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v51, 
16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v50, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v49, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v48, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v39, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v38, 16, v29 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v10, v37, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v11, v36, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v35, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v34, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v30 +; GFX9-NEXT: v_mov_b32_e32 v1, v31 +; GFX9-NEXT: v_mov_b32_e32 v2, v32 +; GFX9-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-NEXT: v_mov_b32_e32 v4, v24 +; GFX9-NEXT: v_mov_b32_e32 v5, v25 +; GFX9-NEXT: v_mov_b32_e32 v6, v26 +; GFX9-NEXT: v_mov_b32_e32 v7, v27 +; GFX9-NEXT: v_mov_b32_e32 v8, v28 +; GFX9-NEXT: v_mov_b32_e32 v9, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v48i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v7, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s15, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s14, s11 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s11 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: s_mov_b32 s11, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s13, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s11, s7 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s10, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v25, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v51.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v52.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v53.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v55.l +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v16.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s9 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, 
v28, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v31, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v33, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v49, 16, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v30, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v34, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v20, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v39, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v19, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v30, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v48, 16, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v35, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v36, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v37, 16, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v32, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v28 :: v_dual_mov_b32 v5, v29 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v48i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v39, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s29 :: v_dual_mov_b32 v15, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v17, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v10, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s21 :: v_dual_mov_b32 v6, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s17 :: v_dual_mov_b32 v29, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s3 :: v_dual_mov_b32 v25, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s1 :: v_dual_mov_b32 v27, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s45 :: v_dual_mov_b32 v31, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s43 :: v_dual_mov_b32 v33, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s41 :: v_dual_mov_b32 v35, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s15 :: v_dual_mov_b32 v37, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s13 :: v_dual_mov_b32 v39, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, 
s11 :: v_dual_mov_b32 v49, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s9 :: v_dual_mov_b32 v51, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s6 :: v_dual_mov_b32 v53, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s4 :: v_dual_mov_b32 v55, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v49, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v48, 16, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v53 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, 
v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v35, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v34, 16, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v33, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v32, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v31, 16, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v30, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v24 :: v_dual_mov_b32 v1, v25 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <48 x half> %a, splat (half 0xH0200) + %a2 = bitcast <48 x half> %a1 to <48 x i16> + br label %end + +cmp.false: + %a3 = bitcast <48 x half> %a to <48 x i16> + br label %end + +end: + %phi = phi <48 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <48 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 75baa36ca3d11..e19eba6270957 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -1,50 +1,50 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s 
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <26 x float> @bitcast_v26i32_to_v26f32(<26 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v26i32_to_v26f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: 
v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26i32_to_v26f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26i32_to_v26f32: ; VI: ; %bb.0: @@ -180,45 +180,317 @@ end: ret <26 x float> %phi } +define inreg <26 x float> @bitcast_v26i32_to_v26f32_scalar(<26 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v26i32_to_v26f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: 
v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v26i32_to_v26f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 
+; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v26i32_to_v26f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: 
v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v26i32_to_v26f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: 
v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label 
%cmp.false + +cmp.true: + %a1 = add <26 x i32> %a, splat (i32 3) + %a2 = bitcast <26 x i32> %a1 to <26 x float> + br label %end + +cmp.false: + %a3 = bitcast <26 x i32> %a to <26 x float> + br label %end + +end: + %phi = phi <26 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x float> %phi +} + define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v26f32_to_v26i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26f32_to_v26i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26f32_to_v26i32: ; VI: ; %bb.0: @@ -227,7 +499,7 @@ define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -255,7 +527,7 @@ define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: 
v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -266,7 +538,7 @@ define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -294,7 +566,7 @@ define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -306,7 +578,7 @@ define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -321,7 +593,7 @@ define <26 x i32> @bitcast_v26f32_to_v26i32(<26 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -341,45 +613,304 @@ end: ret <26 x i32> %phi } +define inreg <26 x i32> @bitcast_v26f32_to_v26i32_scalar(<26 x float> 
inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26f32_to_v26i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 
1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v26f32_to_v26i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 
v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v26f32_to_v26i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz 
.LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v26f32_to_v26i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 
:: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <26 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <26 x float> %a1 to <26 x i32> + br label %end + +cmp.false: + %a3 = bitcast <26 x float> %a to <26 x i32> + br label %end + +end: + %phi = phi 
<26 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x i32> %phi +} + define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v26i32_to_v13i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26i32_to_v13i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26i32_to_v13i64: ; VI: ; %bb.0: @@ -388,7 +919,7 @@ define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 @@ -416,7 +947,7 @@ define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; 
%end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -427,7 +958,7 @@ define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 ; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 @@ -455,7 +986,7 @@ define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -467,7 +998,7 @@ define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 ; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 @@ -495,7 +1026,7 @@ define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -515,82 +1046,354 @@ end: ret <13 x i64> %phi } -define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v13i64_to_v26i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <13 x i64> @bitcast_v26i32_to_v13i64_scalar(<26 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26i32_to_v13i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 
v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 ; -; VI-LABEL: 
bitcast_v13i64_to_v26i32: +; VI-LABEL: bitcast_v26i32_to_v13i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: 
v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v26i32_to_v13i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v26i32_to_v13i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 
3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <26 x i32> %a, splat (i32 3) + %a2 = bitcast <26 x i32> %a1 to <13 x i64> + br label %end + +cmp.false: + %a3 = bitcast <26 x i32> %a to <13 x i64> + br label %end + +end: + %phi = phi <13 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x i64> %phi +} + +define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v13i64_to_v26i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; 
SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13i64_to_v26i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: 
s_setpc_b64 s[30:31] ; @@ -601,7 +1404,7 @@ define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc @@ -629,7 +1432,7 @@ define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -641,7 +1444,7 @@ define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -676,7 +1479,7 @@ define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -696,45 +1499,324 @@ end: ret <26 x i32> %phi } +define inreg <26 x i32> @bitcast_v13i64_to_v26i32_scalar(<13 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13i64_to_v26i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; 
SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v13i64_to_v26i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, 
vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v13i64_to_v26i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, 
s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v13i64_to_v26i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; 
GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <26 x i32> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <26 x i32> + br label %end + +end: + %phi = phi <26 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x i32> %phi +} + define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v26i32_to_v13f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 
v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26i32_to_v13f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: 
v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26i32_to_v13f64: ; VI: ; %bb.0: @@ -743,7 +1825,7 @@ define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 @@ -771,7 +1853,7 @@ define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -782,7 +1864,7 @@ define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 ; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 @@ -810,7 +1892,7 @@ define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { ; 
GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -822,7 +1904,7 @@ define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 ; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 @@ -850,7 +1932,7 @@ define <13 x double> @bitcast_v26i32_to_v13f64(<26 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -870,56 +1952,328 @@ end: ret <13 x double> %phi } -define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v13f64_to_v26i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 
1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <13 x double> @bitcast_v26i32_to_v13f64_scalar(<26 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26i32_to_v13f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; 
SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 ; -; VI-LABEL: bitcast_v13f64_to_v26i32: +; VI-LABEL: bitcast_v26i32_to_v13f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: 
s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v26i32_to_v13f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; 
GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; 
GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v26i32_to_v13f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; 
GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <26 x i32> %a, splat (i32 3) + %a2 = bitcast <26 x i32> %a1 to <13 x double> + br label %end + +cmp.false: + %a3 = bitcast <26 x i32> %a to <13 x double> + br label %end + +end: + %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x double> %phi +} + +define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v13f64_to_v26i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 
v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13f64_to_v26i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -930,7 +2284,7 @@ define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -945,7 +2299,7 @@ define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], 
v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -957,7 +2311,7 @@ define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -972,7 +2326,7 @@ define <26 x i32> @bitcast_v13f64_to_v26i32(<13 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -992,287 +2346,522 @@ end: ret <26 x i32> %phi } +define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13f64_to_v26i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: 
v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v13f64_to_v26i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: 
v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v13f64_to_v26i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; 
GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v13f64_to_v26i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: 
v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <13 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <13 x double> %a1 to <26 x i32> + br label %end + +cmp.false: + %a3 = bitcast <13 x double> %a to <26 x i32> + br label %end + +end: + %phi = phi <26 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x i32> %phi +} + define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v26i32_to_v52i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 
s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v26, 
v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v32, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v38, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 
-; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v32, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v38, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff, v3 -; GCN-NEXT: 
v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v40, v45, v40 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v54, v47, v54 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v43, v56, v43 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v52, v57, v52 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v7, 
v7, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v8, v8, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v9, v9, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v10, v10, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v11, v11, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v13, v13, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v14, v14, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v15, v15, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v16, v16, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v17, v17, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x64, v0 -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 
-; GCN-NEXT: v_or_b32_e32 v20, v20, v37 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: v_or_b32_e32 v22, v22, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: v_or_b32_e32 v24, v24, v33 -; GCN-NEXT: v_or_b32_e32 v25, v25, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v31 -; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26i32_to_v52i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; 
implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v48, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v50, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v48, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v50, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; 
SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 
v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, 
v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 
16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26i32_to_v52i16: ; VI: ; %bb.0: @@ -1310,7 +2899,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -1338,9 +2927,9 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 
v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 @@ -1394,7 +2983,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -1491,7 +3080,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -1519,9 +3108,9 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 ; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 @@ -1575,7 +3164,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -1619,7 +3208,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 @@ -1647,7 +3236,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1684,7 +3273,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -1712,9 +3301,9 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v25, 3, v25 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 @@ -1768,7 +3357,7 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -1815,470 +3404,1534 @@ end: ret <52 x i16> %phi } +define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26i32_to_v52i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_readfirstlane_b32 s41, v1 +; SI-NEXT: v_readfirstlane_b32 s40, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: 
v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s29, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s27, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s25, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s23, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s21, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s17, v13, 16 +; SI-NEXT: s_lshr_b32 s42, s6, 16 +; SI-NEXT: s_lshr_b32 s43, s8, 16 +; SI-NEXT: s_lshr_b32 s44, s10, 16 +; SI-NEXT: s_lshr_b32 s45, s12, 16 +; SI-NEXT: s_lshr_b32 s46, s14, 16 +; SI-NEXT: s_lshr_b32 s47, s40, 16 +; SI-NEXT: s_lshr_b32 s56, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s58, s25, 16 +; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_lshr_b32 s60, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s19, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; 
SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s29, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s27, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s25, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s23, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s21, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s17, v13, 16 +; SI-NEXT: s_lshr_b32 s42, s6, 16 +; SI-NEXT: s_lshr_b32 s43, s8, 16 +; SI-NEXT: s_lshr_b32 s44, s10, 16 +; SI-NEXT: s_lshr_b32 s45, s12, 16 +; SI-NEXT: s_lshr_b32 s46, s14, 16 +; SI-NEXT: s_lshr_b32 s47, s40, 16 +; SI-NEXT: s_lshr_b32 s56, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s58, s25, 16 +; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_lshr_b32 s60, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s19, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: 
s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: 
buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 
0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; 
implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v26i32_to_v52i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v0 +; VI-NEXT: v_readfirstlane_b32 s40, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: v_readfirstlane_b32 s14, v3 +; VI-NEXT: v_readfirstlane_b32 s13, v4 +; VI-NEXT: v_readfirstlane_b32 s12, v5 +; VI-NEXT: v_readfirstlane_b32 s11, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s9, v8 +; VI-NEXT: v_readfirstlane_b32 s8, v9 +; VI-NEXT: v_readfirstlane_b32 s6, v10 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v11 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 
s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 
16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s91, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s90, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s89, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s88, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s79, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s78, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s77, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s76, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s75, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s74, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s73, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s72, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s63, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s62, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s29, s61, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 
+; VI-NEXT: s_and_b32 s29, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s40, s60, 16 +; VI-NEXT: s_or_b32 s29, s29, s40 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s40, s59, 16 +; VI-NEXT: s_or_b32 s15, s15, s40 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s40, s58, 16 +; VI-NEXT: s_or_b32 s14, s14, s40 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s40, s57, 16 +; VI-NEXT: s_or_b32 s13, s13, s40 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s40, s56, 16 +; VI-NEXT: s_or_b32 s12, s12, s40 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s40, s47, 16 +; VI-NEXT: s_or_b32 s11, s11, s40 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s40, s46, 16 +; VI-NEXT: s_or_b32 s10, s10, s40 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s40, s45, 16 +; VI-NEXT: s_or_b32 s9, s9, s40 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s40, s44, 16 +; VI-NEXT: s_or_b32 s8, s8, s40 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s40, s43, 16 +; VI-NEXT: s_or_b32 s6, s6, s40 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s40, s42, 16 +; VI-NEXT: s_or_b32 s7, s7, s40 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s15 +; VI-NEXT: v_mov_b32_e32 v17, s14 +; VI-NEXT: v_mov_b32_e32 v18, s13 +; VI-NEXT: v_mov_b32_e32 v19, s12 +; VI-NEXT: v_mov_b32_e32 v20, s11 +; VI-NEXT: v_mov_b32_e32 v21, s10 +; VI-NEXT: v_mov_b32_e32 v22, s9 +; VI-NEXT: 
v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: v_mov_b32_e32 v25, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v26i32_to_v52i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 
s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 
16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s44 
+; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; 
implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v26i32_to_v52i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 
16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, 
s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s11 :: v_dual_mov_b32 v19, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s9 :: v_dual_mov_b32 v21, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s7 :: v_dual_mov_b32 v23, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v26i32_to_v52i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: s_mov_b32 s78, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 
s5, s5, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v25, s10 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <26 x i32> %a, splat (i32 3) + %a2 = bitcast <26 x i32> %a1 to <52 x i16> + br label %end + +cmp.false: + %a3 = bitcast <26 x i32> %a to <52 x i16> + br label %end + +end: + %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x i16> %phi +} + define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v52i16_to_v26i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v62 -; GCN-NEXT: v_or_b32_e32 v1, v1, v63 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v59 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v58 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v57 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v56 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v47 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v46 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v44 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v43 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v42 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v32 -; GCN-NEXT: 
v_and_b32_e32 v20, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v26 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: 
; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; 
GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 
0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v63, v1 -; GCN-NEXT: v_or_b32_e32 v2, v61, v2 -; GCN-NEXT: v_or_b32_e32 v3, v59, v3 -; GCN-NEXT: v_or_b32_e32 v4, v58, v4 -; GCN-NEXT: v_or_b32_e32 v5, v57, v5 -; GCN-NEXT: v_or_b32_e32 v6, v56, v6 -; GCN-NEXT: v_or_b32_e32 v7, v47, v7 -; GCN-NEXT: v_or_b32_e32 v8, v46, v8 -; GCN-NEXT: v_or_b32_e32 v9, v44, v9 -; GCN-NEXT: v_or_b32_e32 v10, v43, v10 -; GCN-NEXT: v_or_b32_e32 v11, v42, v11 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v26, v17 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v26, v18 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v26, v19 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v26, v20 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v26, v21 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v26, v22 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v26, v23 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v26, v25 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 
v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52i16_to_v26i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], 
s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, 
off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v47 +; SI-NEXT: v_or_b32_e32 v6, v6, v38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; 
implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 
+; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v35 +; SI-NEXT: v_or_b32_e32 v11, v11, v45 +; SI-NEXT: v_or_b32_e32 v12, v12, v44 +; SI-NEXT: v_or_b32_e32 v13, v13, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v43 +; SI-NEXT: v_or_b32_e32 v15, v15, v42 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_or_b32_e32 v17, v17, v41 +; SI-NEXT: v_or_b32_e32 v18, v18, v40 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v20, v20, v63 +; SI-NEXT: v_or_b32_e32 v21, v21, v62 +; SI-NEXT: v_or_b32_e32 v22, v22, v61 +; SI-NEXT: v_or_b32_e32 v23, v23, v60 +; SI-NEXT: v_or_b32_e32 v24, v24, v59 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; 
implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 
offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: 
v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v36, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v35, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v44, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v43, v14 +; SI-NEXT: v_or_b32_e32 v15, v42, v15 +; SI-NEXT: v_or_b32_e32 v16, v33, v16 +; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 +; SI-NEXT: v_or_b32_e32 v19, v32, v19 +; SI-NEXT: v_or_b32_e32 v20, v63, v20 +; SI-NEXT: v_or_b32_e32 v21, v62, v21 +; SI-NEXT: v_or_b32_e32 v22, v61, v22 +; SI-NEXT: v_or_b32_e32 v23, v60, v23 +; SI-NEXT: v_or_b32_e32 v24, v59, v24 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; 
SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52i16_to_v26i32: ; VI: ; %bb.0: @@ -2323,7 +4976,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v25, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2404,9 +5057,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v57 @@ -2487,7 +5140,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v26, 3, v32 ; VI-NEXT: v_add_u16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; 
VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2594,7 +5247,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -2718,9 +5371,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -2812,7 +5465,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2841,7 +5494,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, 
exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2869,7 +5522,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2933,7 +5586,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -2961,7 +5614,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2981,593 +5634,1697 @@ end: ret <26 x i32> %phi } -define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { -; GCN-LABEL: 
bitcast_v26i32_to_v52f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; 
implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed 
$vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: 
buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v51 -; GCN-NEXT: v_mov_b32_e32 v51, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v53 -; GCN-NEXT: v_mov_b32_e32 v53, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v55 -; GCN-NEXT: v_mov_b32_e32 v55, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_mov_b32_e32 v41, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v52 -; GCN-NEXT: v_mov_b32_e32 v52, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v54 -; GCN-NEXT: v_mov_b32_e32 v54, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: 
$vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; 
GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill 
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v27 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v62 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v60 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v58 -; GCN-NEXT: v_add_i32_e32 v7, 
vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v31 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v63 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v61 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v59 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v55 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v53 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v51 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v41 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_or_b32_e32 v53, v55, v54 -; GCN-NEXT: v_or_b32_e32 v54, v41, v40 -; GCN-NEXT: v_or_b32_e32 v55, v43, v42 -; GCN-NEXT: v_or_b32_e32 v40, v45, v44 -; GCN-NEXT: v_or_b32_e32 v41, v47, v46 -; GCN-NEXT: v_or_b32_e32 v42, v57, v56 -; GCN-NEXT: v_or_b32_e32 v43, v59, v58 -; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 
offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: 
bitcast_v52i16_to_v26i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: 
v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v62 +; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, 
s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 
v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v24, 
vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v57 +; SI-NEXT: v_mov_b32_e32 v57, v32 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v43 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; 
SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v43, v62 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v62 +; SI-NEXT: v_mov_b32_e32 v62, v43 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v44 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v43, v46 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v57, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v26i32_to_v52f16: +; VI-LABEL: 
bitcast_v52i16_to_v26i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, 
v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v52i16_to_v26i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, 
s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: 
s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: 
v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 
:: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 
v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v26i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 
16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <52 x i16> %a, splat (i16 3) + %a2 = bitcast <52 x i16> %a1 to <26 x i32> + br label %end + +cmp.false: + %a3 = bitcast <52 x i16> %a to <26 x i32> + br label %end + +end: + %phi = phi <26 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x i32> %phi +} + +define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v26i32_to_v52f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; 
SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: buffer_store_dword v23, 
off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; 
implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: 
v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_mov_b32_e32 v55, v24 +; SI-NEXT: v_mov_b32_e32 v53, v25 +; SI-NEXT: v_mov_b32_e32 v51, v26 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v26i32_to_v52f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 @@ -3603,7 +7360,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -3631,9 +7388,9 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 +; VI-NEXT: s_cbranch_execz .LBB16_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 @@ -3687,7 +7444,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB8_4: ; %end +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: 
v_lshlrev_b32_e32 v42, 16, v42 @@ -3784,7 +7541,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -3812,9 +7569,9 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 ; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 @@ -3868,7 +7625,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -3912,7 +7669,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 @@ -3940,7 +7697,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3977,7 +7734,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -4005,9 +7762,9 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 @@ -4061,7 +7818,7 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -4108,633 +7865,1862 @@ end: ret <52 x half> %phi } +define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 
x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26i32_to_v52f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_readfirstlane_b32 s41, v1 +; SI-NEXT: v_readfirstlane_b32 s40, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v9 +; SI-NEXT: v_readfirstlane_b32 s7, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: 
s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 +; 
SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s43, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s20, 16 +; SI-NEXT: s_lshr_b32 s45, s21, 16 +; SI-NEXT: s_lshr_b32 s46, s22, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s24, 16 +; SI-NEXT: s_lshr_b32 s57, s25, 16 +; SI-NEXT: s_lshr_b32 s58, s26, 16 +; SI-NEXT: s_lshr_b32 s59, s27, 16 +; SI-NEXT: s_lshr_b32 s60, s28, 16 +; SI-NEXT: s_lshr_b32 s61, s29, 16 +; SI-NEXT: s_lshr_b32 s62, s41, 16 +; SI-NEXT: s_lshr_b32 s63, s40, 16 +; SI-NEXT: s_lshr_b32 s72, s15, 16 +; SI-NEXT: s_lshr_b32 s73, s14, 16 +; SI-NEXT: s_lshr_b32 s74, s13, 16 +; SI-NEXT: s_lshr_b32 s75, s12, 16 +; SI-NEXT: s_lshr_b32 s76, s11, 16 +; SI-NEXT: s_lshr_b32 s77, s10, 16 +; SI-NEXT: s_lshr_b32 s78, s8, 16 +; SI-NEXT: s_lshr_b32 s79, s7, 16 
+; SI-NEXT: s_lshr_b32 s88, s6, 16 +; SI-NEXT: s_lshr_b32 s89, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s44 +; 
SI-NEXT: v_cvt_f32_f16_e32 v54, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_or_b32_e32 v43, v43, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v41, v41, v42 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, 
v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 
offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; 
SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, 
off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; 
SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v26i32_to_v52f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v0 +; VI-NEXT: v_readfirstlane_b32 s40, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: v_readfirstlane_b32 s14, v3 +; VI-NEXT: v_readfirstlane_b32 s13, v4 +; VI-NEXT: v_readfirstlane_b32 s12, v5 +; VI-NEXT: v_readfirstlane_b32 s11, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s9, v8 +; VI-NEXT: v_readfirstlane_b32 s8, v9 +; VI-NEXT: v_readfirstlane_b32 s6, v10 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v11 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 
s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s91, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s90, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, 
s89, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s88, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s79, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s78, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s77, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s76, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s75, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s74, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s73, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s72, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s63, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s62, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s29, s61, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s40, s60, 16 +; VI-NEXT: s_or_b32 s29, s29, s40 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s40, s59, 16 +; VI-NEXT: s_or_b32 s15, s15, s40 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s40, s58, 16 +; VI-NEXT: s_or_b32 s14, s14, s40 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s40, s57, 16 +; VI-NEXT: s_or_b32 s13, s13, s40 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s40, s56, 16 +; VI-NEXT: s_or_b32 s12, s12, s40 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s40, s47, 16 +; VI-NEXT: 
s_or_b32 s11, s11, s40 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s40, s46, 16 +; VI-NEXT: s_or_b32 s10, s10, s40 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s40, s45, 16 +; VI-NEXT: s_or_b32 s9, s9, s40 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s40, s44, 16 +; VI-NEXT: s_or_b32 s8, s8, s40 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s40, s43, 16 +; VI-NEXT: s_or_b32 s6, s6, s40 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s40, s42, 16 +; VI-NEXT: s_or_b32 s7, s7, s40 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s15 +; VI-NEXT: v_mov_b32_e32 v17, s14 +; VI-NEXT: v_mov_b32_e32 v18, s13 +; VI-NEXT: v_mov_b32_e32 v19, s12 +; VI-NEXT: v_mov_b32_e32 v20, s11 +; VI-NEXT: v_mov_b32_e32 v21, s10 +; VI-NEXT: v_mov_b32_e32 v22, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: v_mov_b32_e32 v25, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; 
implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v26i32_to_v52f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: 
s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 
s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 
+; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-TRUE16-LABEL: bitcast_v26i32_to_v52f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v5 +; 
GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; 
GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s11 :: v_dual_mov_b32 v19, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s9 :: v_dual_mov_b32 v21, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s7 :: v_dual_mov_b32 v23, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v26i32_to_v52f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: 
v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: s_mov_b32 s78, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: 
s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 
v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v25, s10 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + 
+cmp.true: + %a1 = add <26 x i32> %a, splat (i32 3) + %a2 = bitcast <26 x i32> %a1 to <52 x half> + br label %end + +cmp.false: + %a3 = bitcast <26 x i32> %a to <52 x half> + br label %end + +end: + %phi = phi <52 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x half> %phi +} + define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v52f16_to_v26i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; GCN-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 
; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; 
GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v33 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v34 -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v44 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_or_b32_e32 v1, v40, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; GCN-NEXT: v_or_b32_e32 v2, v54, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_or_b32_e32 v3, v52, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v50, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 
v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v45 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: v_or_b32_e32 
v17, v48, v17 -; GCN-NEXT: v_or_b32_e32 v18, v38, v18 -; GCN-NEXT: v_or_b32_e32 v19, v36, v19 -; GCN-NEXT: v_or_b32_e32 v20, v34, v20 -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_or_b32_e32 v22, v33, v22 -; GCN-NEXT: v_or_b32_e32 v23, v35, v23 -; GCN-NEXT: v_or_b32_e32 v24, v37, v24 -; GCN-NEXT: v_or_b32_e32 v25, v39, v25 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; 
implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v40 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; 
GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v52 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 
v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 
v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 
v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_or_b32_e32 v13, v15, v14 -; GCN-NEXT: v_or_b32_e32 v14, v17, v16 -; GCN-NEXT: v_or_b32_e32 v15, v19, v18 -; GCN-NEXT: v_or_b32_e32 v16, v21, v20 -; GCN-NEXT: v_or_b32_e32 v17, v23, v22 -; GCN-NEXT: v_or_b32_e32 v18, v25, v24 -; GCN-NEXT: v_or_b32_e32 v19, v27, v26 -; GCN-NEXT: v_or_b32_e32 v20, v29, v28 -; GCN-NEXT: v_or_b32_e32 v21, v31, v30 -; GCN-NEXT: v_or_b32_e32 v22, v33, v32 -; GCN-NEXT: v_or_b32_e32 v23, v35, v34 -; GCN-NEXT: v_or_b32_e32 v24, v37, v36 -; GCN-NEXT: v_or_b32_e32 v25, v39, v38 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], 
s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52f16_to_v26i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], 
s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; 
kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 +; 
SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v21, v56, v21 +; SI-NEXT: v_or_b32_e32 v22, v46, v22 +; SI-NEXT: v_or_b32_e32 v23, v44, v23 +; SI-NEXT: v_or_b32_e32 v24, v34, v24 +; SI-NEXT: v_or_b32_e32 v25, v32, v25 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 
+; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 +; SI-NEXT: v_or_b32_e32 v20, v58, v20 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, 
v40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: 
v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; 
SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 
0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: 
v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v59 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v45 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52f16_to_v26i32: ; VI: ; %bb.0: @@ -4779,7 +9765,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v25, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -4860,9 +9846,9 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB18_2: ; 
%Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v57, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -4943,7 +9929,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5050,7 +10036,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -5174,9 +10160,9 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB18_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword 
v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -5269,7 +10255,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: .LBB18_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5298,7 +10284,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -5326,7 +10312,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5390,7 +10376,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; 
GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -5418,7 +10404,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5438,45 +10424,1265 @@ end: ret <26 x i32> %phi } +define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52f16_to_v26i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 
offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: 
v_cvt_f16_f32_e32 v59, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: 
v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v47, v11 +; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_or_b32_e32 v14, v63, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v17, v35, v17 +; SI-NEXT: v_or_b32_e32 v18, v33, v18 +; SI-NEXT: v_or_b32_e32 v19, v59, v19 +; SI-NEXT: v_or_b32_e32 v20, v27, v20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v37, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; 
SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: 
v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; 
SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: 
v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 
0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v50, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v30 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v59 +; SI-NEXT: v_mov_b32_e32 v59, v31 +; SI-NEXT: v_mov_b32_e32 v48, v61 +; SI-NEXT: v_mov_b32_e32 v61, v26 +; SI-NEXT: v_mov_b32_e32 v49, v62 +; SI-NEXT: v_mov_b32_e32 v62, v27 +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v62 +; SI-NEXT: v_mov_b32_e32 v62, v49 +; SI-NEXT: v_mov_b32_e32 v26, v61 +; SI-NEXT: v_mov_b32_e32 v61, v48 +; SI-NEXT: v_mov_b32_e32 v31, v59 +; SI-NEXT: v_mov_b32_e32 v59, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v52f16_to_v26i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: 
s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 
s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 
+; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v25, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v51, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v14, 
v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v39, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v38, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v37, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v36, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v35, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v34, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v24, v26, v24 +; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v52f16_to_v26i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 
offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: 
v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: 
v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v26i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; 
GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label 
%cmp.false + +cmp.true: + %a1 = fadd <52 x half> %a, splat (half 0xH0200) + %a2 = bitcast <52 x half> %a1 to <26 x i32> + br label %end + +cmp.false: + %a3 = bitcast <52 x half> %a to <26 x i32> + br label %end + +end: + %phi = phi <26 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x i32> %phi +} + define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v26f32_to_v13i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26f32_to_v13i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26f32_to_v13i64: ; VI: ; %bb.0: @@ -5485,7 +11691,7 @@ define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -5513,7 +11719,7 @@ define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 
; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5524,7 +11730,7 @@ define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -5552,7 +11758,7 @@ define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5564,7 +11770,7 @@ define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -5579,7 +11785,7 @@ define <13 x i64> @bitcast_v26f32_to_v13i64(<26 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5599,67 +11805,326 @@ end: ret <13 x i64> %phi } -define <26 x float> 
@bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v13i64_to_v26f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <13 x i64> @bitcast_v26f32_to_v13i64_scalar(<26 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26f32_to_v13i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 
v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; 
SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 ; -; VI-LABEL: bitcast_v13i64_to_v26f32: +; VI-LABEL: bitcast_v26f32_to_v13i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, 
s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v26f32_to_v13i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v26f32_to_v13i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: 
v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <26 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <26 x float> %a1 to <13 x i64> + br label %end + +cmp.false: + %a3 = bitcast <26 x float> %a to <13 x i64> + br label %end + +end: + %phi = phi <13 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x i64> %phi +} + +define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v13i64_to_v26f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: 
v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13i64_to_v26f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 @@ -5674,7 +12139,7 @@ define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5685,7 +12150,7 @@ define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc @@ -5713,7 +12178,7 @@ define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5725,7 +12190,7 @@ define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -5760,7 +12225,7 @@ define <26 x float> @bitcast_v13i64_to_v26f32(<13 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5780,45 +12245,324 @@ end: ret <26 x float> %phi } +define inreg <26 x float> @bitcast_v13i64_to_v26f32_scalar(<13 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13i64_to_v26f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; 
SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 
v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v13i64_to_v26f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, 
vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v13i64_to_v26f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz 
.LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v13i64_to_v26f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: 
v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB23_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: .LBB23_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <26 x float> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <26 x float> + br label %end + +end: + %phi = phi <26 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x float> %phi +} + define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v26f32_to_v13f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, 
v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26f32_to_v13f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: 
v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26f32_to_v13f64: ; VI: ; %bb.0: @@ -5827,7 +12571,7 @@ define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -5855,7 +12599,7 @@ define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5866,7 +12610,7 @@ define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -5894,7 +12638,7 @@ define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5906,7 
+12650,7 @@ define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -5921,7 +12665,7 @@ define <13 x double> @bitcast_v26f32_to_v13f64(<26 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end +; GFX11-NEXT: .LBB24_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5941,56 +12685,315 @@ end: ret <13 x double> %phi } -define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v13f64_to_v26f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 
1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <13 x double> @bitcast_v26f32_to_v13f64_scalar(<26 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26f32_to_v13f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 
+; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 ; -; VI-LABEL: bitcast_v13f64_to_v26f32: +; VI-LABEL: bitcast_v26f32_to_v13f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; 
VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v26f32_to_v13f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; 
GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: 
v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v26f32_to_v13f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; 
GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <26 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <26 x float> %a1 to <13 x double> + br label %end + +cmp.false: + %a3 = bitcast <26 x float> %a to <13 x double> + br label %end + +end: + %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x double> %phi +} + +define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v13f64_to_v26f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 
v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13f64_to_v26f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6001,7 +13004,7 @@ define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -6016,7 +13019,7 @@ define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; 
GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6028,7 +13031,7 @@ define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -6043,7 +13046,7 @@ define <26 x float> @bitcast_v13f64_to_v26f32(<13 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6063,287 +13066,522 @@ end: ret <26 x float> %phi } +define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13f64_to_v26f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 
v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v13f64_to_v26f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], 
vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v13f64_to_v26f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; 
GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v13f64_to_v26f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, 
s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB27_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: .LBB27_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <13 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <13 x double> %a1 to <26 x float> + br label %end + +cmp.false: + %a3 = bitcast <13 x double> %a to <26 x float> + br label %end + +end: + %phi = phi <26 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x float> %phi +} + define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v26f32_to_v52i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, 
v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v32, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v38, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: 
v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v32, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v38, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: 
v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v40, v45, v40 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v54, v47, v54 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v43, v56, v43 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v52, v57, v52 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v7, v7, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v8, v8, v41 -; GCN-NEXT: v_add_i32_e32 
v41, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v9, v9, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v10, v10, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v11, v11, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v13, v13, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v14, v14, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v15, v15, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v16, v16, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v17, v17, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v18, v18, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x64, v0 -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v37 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: v_or_b32_e32 v22, v22, v35 -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: v_or_b32_e32 v24, v24, v33 -; 
GCN-NEXT: v_or_b32_e32 v25, v25, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v31 -; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; 
GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v26f32_to_v52i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; 
SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v48, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v50, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, 
v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v48, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v50, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: 
v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; 
SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v26f32_to_v52i16: ; VI: ; %bb.0: @@ -6381,7 +13619,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -6409,9 +13647,9 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB28_4 ; VI-NEXT: ; %bb.3: ; 
%cmp.true ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -6465,7 +13703,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -6562,7 +13800,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -6590,9 +13828,9 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -6646,7 +13884,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -6690,7 +13928,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -6705,7 +13943,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6742,7 +13980,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -6770,9 +14008,9 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; 
GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -6813,7 +14051,7 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -6860,523 +14098,1587 @@ end: ret <52 x i16> %phi } -define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v52i16_to_v26f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v62 -; GCN-NEXT: v_or_b32_e32 v1, v1, v63 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v59 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v58 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v57 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v56 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v47 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v46 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v44 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v43 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v42 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, 
s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v26 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: 
$vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: 
v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, 
v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v63, v1 -; GCN-NEXT: v_or_b32_e32 v2, v61, v2 -; GCN-NEXT: v_or_b32_e32 v3, v59, v3 -; GCN-NEXT: v_or_b32_e32 v4, v58, v4 -; GCN-NEXT: v_or_b32_e32 v5, v57, v5 -; GCN-NEXT: v_or_b32_e32 v6, v56, v6 -; GCN-NEXT: v_or_b32_e32 v7, v47, v7 -; GCN-NEXT: v_or_b32_e32 v8, v46, v8 -; GCN-NEXT: v_or_b32_e32 v9, v44, v9 -; GCN-NEXT: v_or_b32_e32 v10, v43, v10 -; GCN-NEXT: v_or_b32_e32 v11, v42, v11 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v26, v17 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 
offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v26, v18 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v26, v19 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v26, v20 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v26, v21 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v26, v22 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v26, v23 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v26, v25 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 
v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, 
i32 inreg %b) { +; SI-LABEL: bitcast_v26f32_to_v52i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: v_mov_b32_e32 v27, s17 +; SI-NEXT: v_mov_b32_e32 v25, s18 +; SI-NEXT: v_mov_b32_e32 v24, s19 +; SI-NEXT: v_mov_b32_e32 v21, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v20, s23 +; SI-NEXT: v_mov_b32_e32 v18, s24 +; SI-NEXT: v_mov_b32_e32 v17, s25 +; SI-NEXT: v_mov_b32_e32 v16, s26 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v14, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v50, v20, v22, 16 +; SI-NEXT: v_alignbit_b32 v52, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v54, v24, v25, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v27, v28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 
16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v19 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v27 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v35, v13, v14, 16 +; SI-NEXT: v_alignbit_b32 v37, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v48, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v50, 
v20, v22, 16 +; SI-NEXT: v_alignbit_b32 v52, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v54, v24, v25, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v27, v28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v19 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v27 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v28, v28, v40 +; SI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v44 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v54 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; 
SI-NEXT: v_or_b32_e32 v21, v21, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: 
$vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_branch .LBB29_2 ; -; VI-LABEL: bitcast_v52i16_to_v26f32: +; VI-LABEL: bitcast_v26f32_to_v52i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v25 -; VI-NEXT: v_mov_b32_e32 v33, v24 
-; VI-NEXT: v_mov_b32_e32 v34, v23 -; VI-NEXT: v_mov_b32_e32 v35, v22 -; VI-NEXT: v_mov_b32_e32 v36, v21 -; VI-NEXT: v_mov_b32_e32 v37, v20 -; VI-NEXT: v_mov_b32_e32 v38, v19 -; VI-NEXT: v_mov_b32_e32 v39, v18 -; VI-NEXT: v_mov_b32_e32 v48, v17 -; VI-NEXT: v_mov_b32_e32 v49, v16 -; VI-NEXT: v_mov_b32_e32 v50, v15 -; VI-NEXT: v_mov_b32_e32 v51, v14 -; VI-NEXT: v_mov_b32_e32 v52, v13 -; VI-NEXT: v_mov_b32_e32 v53, v12 -; VI-NEXT: v_mov_b32_e32 v54, v11 -; VI-NEXT: v_mov_b32_e32 v55, v10 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v41, v8 -; VI-NEXT: v_mov_b32_e32 v42, v7 -; VI-NEXT: v_mov_b32_e32 v43, v6 -; VI-NEXT: v_mov_b32_e32 v44, v5 -; VI-NEXT: v_mov_b32_e32 v45, v4 -; VI-NEXT: v_mov_b32_e32 v46, v3 -; VI-NEXT: v_mov_b32_e32 v47, v2 -; VI-NEXT: v_mov_b32_e32 v56, v1 -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v24, s23 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v25, s24 +; VI-NEXT: v_mov_b32_e32 v23, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v19, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB29_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v25, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v25, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v25, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v25, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v25, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; VI-NEXT: 
v_lshrrev_b32_e32 v36, 16, v22 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; VI-NEXT: 
v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v25, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v21, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, 
v19, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v36 +; VI-NEXT: v_mov_b32_e32 v1, v37 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v30 +; VI-NEXT: v_mov_b32_e32 v7, v31 +; VI-NEXT: v_mov_b32_e32 v8, v32 +; VI-NEXT: v_mov_b32_e32 v9, v33 +; VI-NEXT: v_mov_b32_e32 v10, v34 +; VI-NEXT: v_mov_b32_e32 v11, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v26f32_to_v52i16_scalar: +; GFX9: ; 
%bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v24, s23 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v25, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v19, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 
v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 
16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v14, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v12, v43, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v42, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v15, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded 
Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v16, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v36 +; GFX9-NEXT: v_mov_b32_e32 v1, v37 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: 
$vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-TRUE16-LABEL: bitcast_v26f32_to_v52i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 
1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: .LBB29_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v8, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v64, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v53, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v52, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v51, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v50, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v66, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v65, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 +; +; GFX11-FAKE16-LABEL: bitcast_v26f32_to_v52i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: .LBB29_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v64, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v53, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v52, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v65, 16, v21 
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <26 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <26 x float> %a1 to <52 x i16> + br label %end + 
+cmp.false: + %a3 = bitcast <26 x float> %a to <52 x i16> + br label %end + +end: + %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x i16> %phi +} + +define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v52i16_to_v26f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, 
s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v47 +; SI-NEXT: v_or_b32_e32 v6, v6, v38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; 
implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 
+; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v35 +; SI-NEXT: v_or_b32_e32 v11, v11, v45 +; SI-NEXT: v_or_b32_e32 v12, v12, v44 +; SI-NEXT: v_or_b32_e32 v13, v13, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v43 +; SI-NEXT: v_or_b32_e32 v15, v15, v42 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_or_b32_e32 v17, v17, v41 +; SI-NEXT: v_or_b32_e32 v18, v18, v40 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v20, v20, v63 +; SI-NEXT: v_or_b32_e32 v21, v21, v62 +; SI-NEXT: v_or_b32_e32 v22, v22, v61 +; SI-NEXT: v_or_b32_e32 v23, v23, v60 +; SI-NEXT: v_or_b32_e32 v24, v24, v59 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; 
implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: 
v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v36, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v35, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v44, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v43, v14 +; SI-NEXT: v_or_b32_e32 v15, v42, v15 +; SI-NEXT: v_or_b32_e32 v16, v33, v16 +; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 +; SI-NEXT: v_or_b32_e32 v19, v32, v19 +; SI-NEXT: v_or_b32_e32 v20, v63, v20 +; SI-NEXT: v_or_b32_e32 v21, v62, v21 +; SI-NEXT: v_or_b32_e32 v22, v61, v22 +; SI-NEXT: v_or_b32_e32 v23, v60, v23 +; 
SI-NEXT: v_or_b32_e32 v24, v59, v24 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v52i16_to_v26f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v25 +; VI-NEXT: v_mov_b32_e32 v33, v24 +; VI-NEXT: v_mov_b32_e32 v34, v23 +; VI-NEXT: v_mov_b32_e32 v35, v22 +; VI-NEXT: v_mov_b32_e32 v36, v21 +; VI-NEXT: v_mov_b32_e32 v37, v20 +; VI-NEXT: v_mov_b32_e32 v38, v19 +; VI-NEXT: v_mov_b32_e32 v39, v18 +; VI-NEXT: v_mov_b32_e32 v48, v17 +; VI-NEXT: v_mov_b32_e32 v49, v16 +; VI-NEXT: v_mov_b32_e32 v50, v15 +; VI-NEXT: v_mov_b32_e32 v51, v14 +; VI-NEXT: v_mov_b32_e32 v52, v13 +; VI-NEXT: v_mov_b32_e32 v53, v12 +; VI-NEXT: v_mov_b32_e32 v54, v11 +; VI-NEXT: v_mov_b32_e32 v55, v10 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 
v41, v8 +; VI-NEXT: v_mov_b32_e32 v42, v7 +; VI-NEXT: v_mov_b32_e32 v43, v6 +; VI-NEXT: v_mov_b32_e32 v44, v5 +; VI-NEXT: v_mov_b32_e32 v45, v4 +; VI-NEXT: v_mov_b32_e32 v46, v3 +; VI-NEXT: v_mov_b32_e32 v47, v2 +; VI-NEXT: v_mov_b32_e32 v56, v1 +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB30_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v25, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v25, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v25, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v25, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v25, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v6, v25, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v7, v25, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v8, v25, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -7449,9 +15751,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: 
.LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v57 @@ -7532,7 +15834,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v26, 3, v32 ; VI-NEXT: v_add_u16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB15_4: ; %end +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -7639,7 +15941,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -7763,9 +16065,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: .LBB30_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; 
GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -7857,7 +16159,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end +; GFX9-NEXT: .LBB30_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -7886,7 +16188,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -7914,7 +16216,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7978,7 +16280,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 
op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -8006,7 +16308,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -8026,606 +16328,1710 @@ end: ret <26 x float> %phi } -define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v26f32_to_v52f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: 
$vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; GCN-NEXT: 
v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword 
v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v51 -; GCN-NEXT: v_mov_b32_e32 v51, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v53 -; GCN-NEXT: v_mov_b32_e32 v53, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v55 -; GCN-NEXT: v_mov_b32_e32 v55, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_mov_b32_e32 v41, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v52 -; GCN-NEXT: v_mov_b32_e32 v52, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v54 -; GCN-NEXT: v_mov_b32_e32 v54, 
v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 
1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded 
Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 
v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v30 -; GCN-NEXT: 
v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v27 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v62 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v60 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v58 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v31 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v63 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v61 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v59 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v55 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v53 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v51 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; 
GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v41 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 
v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_or_b32_e32 v53, v55, v54 -; GCN-NEXT: v_or_b32_e32 v54, v41, v40 -; GCN-NEXT: v_or_b32_e32 v55, v43, v42 -; GCN-NEXT: v_or_b32_e32 v40, v45, v44 -; GCN-NEXT: v_or_b32_e32 v41, v47, v46 -; GCN-NEXT: v_or_b32_e32 v42, v57, v56 -; GCN-NEXT: v_or_b32_e32 v43, v59, v58 -; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v7, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52i16_to_v26f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: 
v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 
s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v62 +; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 
v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; 
SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v57 +; SI-NEXT: v_mov_b32_e32 v57, v32 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v38 +; SI-NEXT: 
v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v43 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v43, v62 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v62 +; SI-NEXT: v_mov_b32_e32 v62, v43 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v44 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v43, v46 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: 
v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v57, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB31_2 ; -; VI-LABEL: bitcast_v26f32_to_v52f16: +; VI-LABEL: bitcast_v52i16_to_v26f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; 
VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa 
v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; 
VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, 
v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: 
v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v52i16_to_v26f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: 
v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, 
v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, 
vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB31_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v26f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB31_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <52 x i16> %a, splat (i16 3) + %a2 = bitcast <52 x i16> %a1 to <26 x float> + br label %end + +cmp.false: + %a3 = bitcast <52 x i16> %a to <26 x float> + br label %end + +end: + %phi = phi <26 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x float> %phi +} + +define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v26f32_to_v52f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; 
kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; 
SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; 
SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 +; SI-NEXT: 
v_lshrrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte 
Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; 
SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_mov_b32_e32 v55, v24 +; SI-NEXT: v_mov_b32_e32 v53, v25 +; SI-NEXT: v_mov_b32_e32 v51, 
v26 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; 
SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, 
v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, 
v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v26f32_to_v52f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, 
off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 @@ -8648,7 +18054,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: s_cbranch_execz .LBB32_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -8676,9 +18082,9 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB32_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB32_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -8732,7 +18138,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -8829,7 +18235,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: 
s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB32_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -8857,9 +18263,9 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow +; GFX9-NEXT: .LBB32_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cbranch_execz .LBB32_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 ; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 @@ -8913,7 +18319,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end +; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -8957,7 +18363,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -8972,7 +18378,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 
-; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9009,7 +18415,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -9037,9 +18443,9 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 @@ -9080,7 +18486,7 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -9127,738 +18533,677 @@ end: ret <52 x half> %phi } -define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { -; 
GCN-LABEL: bitcast_v52f16_to_v26f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt 
expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v33 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v59, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v44 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_or_b32_e32 v1, v40, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; GCN-NEXT: v_or_b32_e32 v2, v54, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_or_b32_e32 v3, v52, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v50, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v45 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: v_or_b32_e32 v17, v48, v17 -; GCN-NEXT: v_or_b32_e32 v18, v38, v18 -; GCN-NEXT: v_or_b32_e32 v19, v36, v19 -; GCN-NEXT: v_or_b32_e32 v20, v34, v20 -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_or_b32_e32 v22, v33, v22 -; GCN-NEXT: v_or_b32_e32 v23, v35, v23 -; GCN-NEXT: v_or_b32_e32 v24, v37, v24 -; GCN-NEXT: v_or_b32_e32 v25, v39, v25 -; 
GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: 
killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v40 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: 
v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v52 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: 
v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 
offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, 
s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 
0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: 
v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_or_b32_e32 v13, v15, v14 -; GCN-NEXT: v_or_b32_e32 v14, v17, v16 -; GCN-NEXT: v_or_b32_e32 v15, v19, v18 -; GCN-NEXT: v_or_b32_e32 v16, v21, v20 -; GCN-NEXT: v_or_b32_e32 v17, v23, v22 -; GCN-NEXT: v_or_b32_e32 v18, v25, v24 -; GCN-NEXT: v_or_b32_e32 v19, v27, v26 -; GCN-NEXT: v_or_b32_e32 v20, v29, v28 -; GCN-NEXT: v_or_b32_e32 v21, v31, v30 -; GCN-NEXT: v_or_b32_e32 v22, v33, v32 -; GCN-NEXT: v_or_b32_e32 v23, v35, v34 -; GCN-NEXT: v_or_b32_e32 v24, v37, v36 -; GCN-NEXT: v_or_b32_e32 v25, v39, v38 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword 
v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v26f32_to_v52f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_readfirstlane_b32 s41, v1 +; SI-NEXT: v_readfirstlane_b32 s40, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v9 +; SI-NEXT: v_readfirstlane_b32 s7, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 
+; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: 
v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s41, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s40, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v13 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v29 +; 
SI-NEXT: v_cvt_f32_f16_e32 v32, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: .LBB33_3: ; %end 
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42 +; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v30, v41, v30 +; SI-NEXT: buffer_store_dword v30, v4, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v55 +; SI-NEXT: v_add_i32_e32 v55, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v53 +; SI-NEXT: v_add_i32_e32 v53, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v52 +; SI-NEXT: v_add_i32_e32 v51, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 +; SI-NEXT: v_add_i32_e32 v49, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v11 +; SI-NEXT: v_add_i32_e32 v39, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 +; 
SI-NEXT: v_cvt_f16_f32_e32 v30, v13 +; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v12 +; SI-NEXT: v_add_i32_e32 v35, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 +; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 +; SI-NEXT: v_add_i32_e32 v31, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v30, v4 +; SI-NEXT: buffer_store_dword v4, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_i32_e32 v28, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 +; SI-NEXT: v_add_i32_e32 v26, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 +; SI-NEXT: v_add_i32_e32 v24, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 +; 
SI-NEXT: v_cvt_f16_f32_e32 v6, v25 +; SI-NEXT: v_add_i32_e32 v22, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v23 +; SI-NEXT: v_add_i32_e32 v20, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v48 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v7 +; 
SI-NEXT: v_cvt_f16_f32_e32 v6, v38 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v36 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; 
implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB33_2 ; -; VI-LABEL: bitcast_v52f16_to_v26f32: +; VI-LABEL: bitcast_v26f32_to_v52f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v25 -; VI-NEXT: v_mov_b32_e32 v33, v24 -; VI-NEXT: v_mov_b32_e32 v34, v23 -; VI-NEXT: v_mov_b32_e32 v35, v22 -; VI-NEXT: v_mov_b32_e32 v36, v21 -; VI-NEXT: v_mov_b32_e32 v37, v20 -; VI-NEXT: v_mov_b32_e32 v38, v19 -; VI-NEXT: v_mov_b32_e32 v39, v18 -; VI-NEXT: v_mov_b32_e32 v48, 
v17 -; VI-NEXT: v_mov_b32_e32 v49, v16 -; VI-NEXT: v_mov_b32_e32 v50, v15 -; VI-NEXT: v_mov_b32_e32 v51, v14 -; VI-NEXT: v_mov_b32_e32 v52, v13 -; VI-NEXT: v_mov_b32_e32 v53, v12 -; VI-NEXT: v_mov_b32_e32 v54, v11 -; VI-NEXT: v_mov_b32_e32 v55, v10 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v41, v8 -; VI-NEXT: v_mov_b32_e32 v42, v7 -; VI-NEXT: v_mov_b32_e32 v43, v6 -; VI-NEXT: v_mov_b32_e32 v44, v5 -; VI-NEXT: v_mov_b32_e32 v45, v4 -; VI-NEXT: v_mov_b32_e32 v46, v3 -; VI-NEXT: v_mov_b32_e32 v47, v2 -; VI-NEXT: v_mov_b32_e32 v56, v1 -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v18, s18 +; VI-NEXT: v_mov_b32_e32 v17, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v24, s23 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v25, s24 +; VI-NEXT: v_mov_b32_e32 v23, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v19, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB33_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v25, 16 
-; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v25, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v25, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v25, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v25, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v25, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v25, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v25, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v25, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v25, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v25, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v25, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v25, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v25, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v25, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v25, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v25, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v25, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v25, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v25, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v25, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v45, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v44, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v43, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v42, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v41, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v40, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v55, 
v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v54, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v53, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v52, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v51, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v50, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v49, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v48, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v39, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v38, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v36, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v35, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v34, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v33, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v32, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; VI-NEXT: 
v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: 
v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v25, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v21, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v19, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v36 +; VI-NEXT: v_mov_b32_e32 v1, v37 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v30 +; VI-NEXT: v_mov_b32_e32 v7, v31 +; VI-NEXT: v_mov_b32_e32 v8, v32 +; VI-NEXT: v_mov_b32_e32 v9, v33 +; VI-NEXT: v_mov_b32_e32 v10, v34 +; VI-NEXT: v_mov_b32_e32 v11, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; 
implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 @@ -9873,20 +19218,1337 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v25, 0x200 -; VI-NEXT: v_add_f16_sdwa v0, v57, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v57 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v26f32_to_v52f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v18, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v24, s23 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v25, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v19, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; 
GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 
1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; 
GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v14, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v12, v43, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v42, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v15, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v16, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; 
GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v36 +; GFX9-NEXT: v_mov_b32_e32 v1, v37 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; 
implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-TRUE16-LABEL: bitcast_v26f32_to_v52f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: .LBB33_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v8, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v64, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v53, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v52, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v51, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v50, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v66, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v65, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, 
v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; +; GFX11-FAKE16-LABEL: bitcast_v26f32_to_v52f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v25, s1 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v23, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v21, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v19, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v9, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v15, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s26 :: v_dual_mov_b32 v11, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, 
v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: .LBB33_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v64, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v53, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v52, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, 
v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v65, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <26 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <26 x float> %a1 to <52 x half> + br label %end + +cmp.false: + %a3 = bitcast <26 x float> %a to <52 x half> + br label %end + +end: + %phi = phi <52 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x half> %phi +} + +define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v52f16_to_v26f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; 
kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v21, v56, v21 +; SI-NEXT: v_or_b32_e32 v22, v46, v22 +; SI-NEXT: v_or_b32_e32 v23, v44, v23 +; SI-NEXT: v_or_b32_e32 v24, v34, v24 +; SI-NEXT: v_or_b32_e32 v25, v32, v25 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: 
$vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload 
+; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 +; SI-NEXT: v_or_b32_e32 v20, v58, v20 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 
offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; 
SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 
offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: 
v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v59 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v45 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; 
SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v52f16_to_v26f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v25 +; VI-NEXT: v_mov_b32_e32 v33, v24 +; VI-NEXT: v_mov_b32_e32 v34, v23 +; VI-NEXT: v_mov_b32_e32 v35, v22 +; VI-NEXT: v_mov_b32_e32 v36, v21 +; VI-NEXT: v_mov_b32_e32 v37, v20 +; VI-NEXT: v_mov_b32_e32 v38, v19 +; VI-NEXT: v_mov_b32_e32 v39, v18 +; VI-NEXT: v_mov_b32_e32 v48, v17 +; VI-NEXT: v_mov_b32_e32 v49, v16 +; VI-NEXT: v_mov_b32_e32 v50, v15 +; VI-NEXT: v_mov_b32_e32 v51, v14 +; VI-NEXT: v_mov_b32_e32 v52, v13 +; VI-NEXT: v_mov_b32_e32 v53, v12 +; VI-NEXT: v_mov_b32_e32 v54, v11 +; VI-NEXT: v_mov_b32_e32 v55, v10 +; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_mov_b32_e32 v41, v8 +; VI-NEXT: v_mov_b32_e32 v42, v7 +; VI-NEXT: v_mov_b32_e32 v43, v6 +; VI-NEXT: v_mov_b32_e32 v44, v5 +; VI-NEXT: v_mov_b32_e32 v45, v4 +; VI-NEXT: v_mov_b32_e32 v46, v3 +; VI-NEXT: v_mov_b32_e32 v47, v2 +; VI-NEXT: v_mov_b32_e32 v56, v1 +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v25, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v25, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v25, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v25, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v25, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v25, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v25, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v25, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v25, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v25, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v25, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v25, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v25, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, 
v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v25, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v25, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v25, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v25, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v25, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v25, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v25, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v25, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v45, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v44, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v43, v6 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v42, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v41, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v40, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v55, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v54, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v53, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v52, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v51, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v49, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v48, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v39, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v38, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v36, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v35, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v34, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa 
v25, v32, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: .LBB34_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v25, 0x200 +; VI-NEXT: v_add_f16_sdwa v0, v57, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v57 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_add_f16_sdwa v1, v56, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, 0x200, v56 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -9962,7 +20624,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 
; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -10069,7 +20731,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_cbranch_execz .LBB34_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -10193,9 +20855,9 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -10288,7 +20950,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -10317,7 +20979,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, 
i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -10345,7 +21007,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10409,7 +21071,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -10437,9 +21099,1229 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 
+ br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <52 x half> %a, splat (half 0xH0200) + %a2 = bitcast <52 x half> %a1 to <26 x float> + br label %end + +cmp.false: + %a3 = bitcast <52 x half> %a to <26 x float> + br label %end + +end: + %phi = phi <26 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x float> %phi +} + +define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52f16_to_v26f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: 
buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: 
v_cvt_f16_f32_e32 v12, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: 
v_lshlrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v47, v11 +; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_or_b32_e32 v14, v63, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v17, v35, v17 +; SI-NEXT: v_or_b32_e32 v18, v33, v18 +; SI-NEXT: v_or_b32_e32 v19, v59, v19 +; SI-NEXT: v_or_b32_e32 v20, v27, v20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v37, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, 
v38, v25 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: 
v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, 
v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; 
SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, 
s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: 
v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v50, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v30 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v59 +; SI-NEXT: v_mov_b32_e32 v59, v31 +; SI-NEXT: v_mov_b32_e32 v48, v61 +; SI-NEXT: v_mov_b32_e32 v61, v26 +; SI-NEXT: v_mov_b32_e32 v49, v62 +; SI-NEXT: v_mov_b32_e32 v62, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v62 +; SI-NEXT: v_mov_b32_e32 v62, v49 +; SI-NEXT: v_mov_b32_e32 v26, v61 +; SI-NEXT: v_mov_b32_e32 v61, v48 +; SI-NEXT: v_mov_b32_e32 v31, v59 +; SI-NEXT: v_mov_b32_e32 v59, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: 
v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v52f16_to_v26f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: 
s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: 
v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; 
VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v25, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v51, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
VI-NEXT: v_add_f16_e32 v18, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v39, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v38, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v37, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v36, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v35, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v34, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v24, v26, v24 +; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: 
bitcast_v52f16_to_v26f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, 
s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, 
s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v26f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: 
v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10458,44 +22340,44 @@ end: } define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v13i64_to_v13f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; 
GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v13i64_to_v13f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, 
v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13i64_to_v13f64: ; VI: ; %bb.0: @@ -10504,7 +22386,7 @@ define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: s_cbranch_execz .LBB36_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -10532,7 +22414,7 @@ define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 ; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -10543,7 +22425,7 @@ define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -10571,7 +22453,7 @@ define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: .LBB36_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -10583,7 +22465,7 @@ define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -10618,62 +22500,516 @@ define <13 x double> @bitcast_v13i64_to_v13f64(<13 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <13 x double> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <13 x double> + br label %end + +end: + %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x double> %phi +} + +define inreg <13 x double> @bitcast_v13i64_to_v13f64_scalar(<13 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13i64_to_v13f64_scalar: +; 
SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: 
v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v13i64_to_v13f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 
3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v13i64_to_v13f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; 
GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v13i64_to_v13f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: 
v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB37_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: .LBB37_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: 
v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <13 x double> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <13 x double> + br label %end + +end: + %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x double> %phi +} + +define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v13f64_to_v13i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; 
%cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13f64_to_v13i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v13f64_to_v13i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v13f64_to_v13i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: .LBB38_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, 
label %cmp.true, label %cmp.false cmp.true: - %a1 = add <13 x i64> %a, splat (i64 3) - %a2 = bitcast <13 x i64> %a1 to <13 x double> + %a1 = fadd <13 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <13 x double> %a1 to <13 x i64> br label %end cmp.false: - %a3 = bitcast <13 x i64> %a to <13 x double> + %a3 = bitcast <13 x double> %a to <13 x i64> br label %end end: - %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <13 x double> %phi + %phi = phi <13 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x i64> %phi } -define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v13f64_to_v13i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13f64_to_v13i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: 
v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 ; -; VI-LABEL: bitcast_v13f64_to_v13i64: +; VI-LABEL: bitcast_v13f64_to_v13i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; 
VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 @@ -10687,19 +23023,46 @@ define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: .LBB19_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: .LBB39_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 ; -; GFX9-LABEL: bitcast_v13f64_to_v13i64: +; GFX9-LABEL: bitcast_v13f64_to_v13i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 @@ -10713,20 +23076,40 @@ define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB39_3: ; %end ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 ; -; GFX11-LABEL: bitcast_v13f64_to_v13i64: +; GFX11-LABEL: 
bitcast_v13f64_to_v13i64_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v26 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 @@ -10740,8 +23123,6 @@ define <13 x i64> @bitcast_v13f64_to_v13i64(<13 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 
1.0 ; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -10761,286 +23142,301 @@ end: } define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v13i64_to_v52i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: 
$vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v35, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v38, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; 
GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v35, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v38, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v50, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 
v39, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_add_i32_e32 v58, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v40, v45, v40 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v54, v47, v54 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v43, v56, v43 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v52, v57, v52 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v6, v6, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v7, v7, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v8, v8, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v9, v9, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v10, v10, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v11, v11, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v12, v12, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v13, v13, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: 
v_or_b32_e32 v14, v14, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v15, v15, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v16, v16, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v17, v17, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v18, v18, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x64, v0 -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v37 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: v_or_b32_e32 v22, v22, v36 -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: v_or_b32_e32 v24, v24, v34 -; GCN-NEXT: v_or_b32_e32 v25, v25, v27 -; GCN-NEXT: v_or_b32_e32 v26, v26, v32 -; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v41, s[0:3], 
0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v13i64_to_v52i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v39, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v49, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; 
SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_alignbit_b32 v27, v26, 
v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v39, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v49, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, 
s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13i64_to_v52i16: ; VI: ; %bb.0: @@ -11078,7 +23474,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB40_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -11106,9 +23502,9 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow +; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 +; VI-NEXT: s_cbranch_execz .LBB40_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 ; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc @@ -11162,7 +23558,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -11259,7 +23655,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: s_cbranch_execz .LBB40_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; 
GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -11287,9 +23683,9 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow +; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 +; GFX9-NEXT: s_cbranch_execz .LBB40_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc @@ -11343,7 +23739,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -11387,7 +23783,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11422,7 +23818,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] 
; @@ -11459,7 +23855,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -11487,9 +23883,9 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11550,7 +23946,7 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -11597,470 +23993,1534 @@ end: ret <52 x i16> %phi } +define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13i64_to_v52i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: 
v_readfirstlane_b32 s41, v1 +; SI-NEXT: v_readfirstlane_b32 s40, v2 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: v_readfirstlane_b32 s12, v6 +; SI-NEXT: v_readfirstlane_b32 s11, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s29, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s27, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s25, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s23, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s21, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s17, v13, 16 +; SI-NEXT: s_lshr_b32 s42, s6, 16 +; SI-NEXT: s_lshr_b32 s43, s8, 16 +; SI-NEXT: s_lshr_b32 s44, s10, 16 +; SI-NEXT: s_lshr_b32 s45, s12, 16 +; SI-NEXT: s_lshr_b32 s46, s14, 16 +; SI-NEXT: s_lshr_b32 s47, s40, 16 +; SI-NEXT: s_lshr_b32 s56, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s58, s25, 16 +; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_lshr_b32 s60, s21, 16 +; SI-NEXT: 
s_lshr_b32 s61, s19, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s40, s40, 0 +; SI-NEXT: s_add_u32 s15, s15, 3 +; SI-NEXT: s_addc_u32 s14, s14, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: v_mov_b32_e32 v8, s26 +; SI-NEXT: v_mov_b32_e32 v9, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v12, s18 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s29, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s27, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s25, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s23, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s21, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s19, v12, 16 +; SI-NEXT: 
v_alignbit_b32 v13, s17, v13, 16 +; SI-NEXT: s_lshr_b32 s42, s6, 16 +; SI-NEXT: s_lshr_b32 s43, s8, 16 +; SI-NEXT: s_lshr_b32 s44, s10, 16 +; SI-NEXT: s_lshr_b32 s45, s12, 16 +; SI-NEXT: s_lshr_b32 s46, s14, 16 +; SI-NEXT: s_lshr_b32 s47, s40, 16 +; SI-NEXT: s_lshr_b32 s56, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s58, s25, 16 +; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_lshr_b32 s60, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s19, 16 +; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 20, v0 +; SI-NEXT: 
v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; 
SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: buffer_store_dword v4, v3, 
s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s42, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v13i64_to_v52i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v12 +; VI-NEXT: v_readfirstlane_b32 s41, v0 +; VI-NEXT: v_readfirstlane_b32 s40, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: v_readfirstlane_b32 s14, v3 +; VI-NEXT: v_readfirstlane_b32 s13, v4 +; VI-NEXT: v_readfirstlane_b32 s12, v5 +; VI-NEXT: v_readfirstlane_b32 s11, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s9, v8 +; VI-NEXT: v_readfirstlane_b32 s8, v9 +; VI-NEXT: v_readfirstlane_b32 s6, v10 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v11 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s41, s41, 3 +; VI-NEXT: s_addc_u32 s40, s40, 
0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s91, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s90, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s89, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s88, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s79, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: 
s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s78, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s77, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s76, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s75, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s74, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s73, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s72, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s63, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s62, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s29, s61, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s40, s60, 16 +; VI-NEXT: s_or_b32 s29, s29, s40 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s40, s59, 16 +; VI-NEXT: s_or_b32 s15, s15, s40 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s40, s58, 16 +; VI-NEXT: s_or_b32 s14, s14, s40 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s40, s57, 16 +; VI-NEXT: s_or_b32 s13, s13, s40 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s40, s56, 16 +; VI-NEXT: s_or_b32 s12, s12, s40 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s40, s47, 16 +; VI-NEXT: s_or_b32 s11, s11, s40 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s40, s46, 16 +; VI-NEXT: s_or_b32 s10, s10, s40 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s40, s45, 16 +; VI-NEXT: s_or_b32 s9, s9, s40 +; VI-NEXT: s_and_b32 s8, 0xffff, 
s8 +; VI-NEXT: s_lshl_b32 s40, s44, 16 +; VI-NEXT: s_or_b32 s8, s8, s40 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s40, s43, 16 +; VI-NEXT: s_or_b32 s6, s6, s40 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s40, s42, 16 +; VI-NEXT: s_or_b32 s7, s7, s40 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s15 +; VI-NEXT: v_mov_b32_e32 v17, s14 +; VI-NEXT: v_mov_b32_e32 v18, s13 +; VI-NEXT: v_mov_b32_e32 v19, s12 +; VI-NEXT: v_mov_b32_e32 v20, s11 +; VI-NEXT: v_mov_b32_e32 v21, s10 +; VI-NEXT: v_mov_b32_e32 v22, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: v_mov_b32_e32 v25, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; 
VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v13i64_to_v52i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: 
s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; 
GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: 
v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-TRUE16-LABEL: bitcast_v13i64_to_v52i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; 
GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; 
GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 
s3, s3, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s11 :: v_dual_mov_b32 v19, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s9 :: v_dual_mov_b32 v21, s8 +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v22, s7 :: v_dual_mov_b32 v23, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v13i64_to_v52i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 
s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: s_mov_b32 s78, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: 
s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 
+; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v25, s10 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <52 x i16> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <52 x i16> + br label %end + +end: + %phi 
= phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x i16> %phi +} + define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v52i16_to_v13i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 
v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; 
GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill 
-; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v62 -; GCN-NEXT: v_or_b32_e32 v1, v1, v63 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v59 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v58 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: 
v_or_b32_e32 v5, v5, v57 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v56 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v47 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v46 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v44 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v43 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v42 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, 
v12, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v26 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; 
GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v18, 
vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v63, v1 -; 
GCN-NEXT: v_or_b32_e32 v2, v61, v2 -; GCN-NEXT: v_or_b32_e32 v3, v59, v3 -; GCN-NEXT: v_or_b32_e32 v4, v58, v4 -; GCN-NEXT: v_or_b32_e32 v5, v57, v5 -; GCN-NEXT: v_or_b32_e32 v6, v56, v6 -; GCN-NEXT: v_or_b32_e32 v7, v47, v7 -; GCN-NEXT: v_or_b32_e32 v8, v46, v8 -; GCN-NEXT: v_or_b32_e32 v9, v44, v9 -; GCN-NEXT: v_or_b32_e32 v10, v43, v10 -; GCN-NEXT: v_or_b32_e32 v11, v42, v11 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v26, v17 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v26, v18 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v26, v19 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v26, v20 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v26, v21 -; GCN-NEXT: 
buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v26, v22 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v26, v23 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v26, v25 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 
offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52i16_to_v13i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], 
s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: 
v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: 
s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], 
s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v47 +; SI-NEXT: v_or_b32_e32 v6, v6, v38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: 
v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: 
v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v35 +; SI-NEXT: v_or_b32_e32 v11, v11, v45 +; SI-NEXT: v_or_b32_e32 v12, v12, v44 +; SI-NEXT: v_or_b32_e32 v13, v13, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v43 +; SI-NEXT: v_or_b32_e32 v15, v15, v42 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_or_b32_e32 v17, v17, v41 +; SI-NEXT: v_or_b32_e32 v18, v18, v40 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v20, v20, v63 +; SI-NEXT: v_or_b32_e32 v21, v21, v62 +; SI-NEXT: v_or_b32_e32 v22, v22, v61 +; SI-NEXT: v_or_b32_e32 v23, v23, v60 +; SI-NEXT: v_or_b32_e32 v24, v24, v59 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: 
v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; 
SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v36, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v35, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v44, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v43, v14 +; SI-NEXT: v_or_b32_e32 v15, v42, v15 +; SI-NEXT: v_or_b32_e32 v16, v33, v16 +; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 +; SI-NEXT: v_or_b32_e32 v19, v32, v19 +; SI-NEXT: v_or_b32_e32 v20, v63, v20 +; SI-NEXT: v_or_b32_e32 v21, v62, v21 +; SI-NEXT: v_or_b32_e32 v22, v61, v22 +; SI-NEXT: v_or_b32_e32 v23, v60, v23 +; SI-NEXT: v_or_b32_e32 v24, v59, v24 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: 
s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52i16_to_v13i64: ; VI: ; %bb.0: @@ -12105,7 +25565,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: 
s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v25, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -12186,9 +25646,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: .LBB42_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v57 @@ -12269,7 +25729,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v26, 3, v32 ; VI-NEXT: v_add_u16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12376,7 +25836,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -12500,9 +25960,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> 
%a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: .LBB42_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -12594,7 +26054,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -12623,7 +26083,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -12651,7 +26111,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12715,7 +26175,7 @@ define <13 x 
i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -12743,7 +26203,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -12763,591 +26223,1696 @@ end: ret <13 x i64> %phi } +define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52i16_to_v13i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; 
SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v20, 
v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v62 +; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 
+; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v57 +; SI-NEXT: v_mov_b32_e32 v57, v32 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; 
SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v43 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v43, v62 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v62 +; SI-NEXT: v_mov_b32_e32 v62, v43 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v44 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v43, v46 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v57, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v52i16_to_v13i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; 
VI-NEXT: v_mov_b32_e32 v39, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: 
s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: 
v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: 
v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, 
s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa 
v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v52i16_to_v13i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: 
v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, 
v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, 
vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v13i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <52 x i16> %a, splat (i16 3) + %a2 = bitcast <52 x i16> %a1 to <13 x i64> + br label %end + +cmp.false: + %a3 = bitcast <52 x i16> %a to <13 x i64> + br label %end + +end: + %phi = phi <13 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x i64> %phi +} + define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v13i64_to_v52f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr29 -; 
GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: 
killed $vgpr50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword 
v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v51 -; GCN-NEXT: v_mov_b32_e32 v51, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v53 -; GCN-NEXT: v_mov_b32_e32 v53, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v55 -; GCN-NEXT: v_mov_b32_e32 v55, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_mov_b32_e32 v41, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v52 -; GCN-NEXT: v_mov_b32_e32 v52, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v54 -; GCN-NEXT: v_mov_b32_e32 v54, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; 
GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; 
GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; 
GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded 
Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v27 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v62 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v60 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v58 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 
v22, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v31 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v63 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v61 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v59 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 
v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v55 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v53 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v51 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: 
v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v41 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_or_b32_e32 v53, v55, v54 -; GCN-NEXT: v_or_b32_e32 v54, v41, v40 -; GCN-NEXT: v_or_b32_e32 v55, v43, v42 -; GCN-NEXT: v_or_b32_e32 v40, v45, v44 -; GCN-NEXT: v_or_b32_e32 v41, v47, v46 -; GCN-NEXT: v_or_b32_e32 v42, v57, v56 -; GCN-NEXT: v_or_b32_e32 v43, v59, v58 -; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, 
v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v13i64_to_v52f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; 
SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; 
implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; SI-NEXT: 
v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 +; SI-NEXT: v_lshrrev_b32_e32 
v27, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; 
SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 
+; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: 
v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_mov_b32_e32 v55, v24 +; SI-NEXT: v_mov_b32_e32 v53, v25 +; SI-NEXT: v_mov_b32_e32 v51, v26 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, 
v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 
offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 
v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13i64_to_v52f16: ; VI: ; %bb.0: @@ -13385,7 +27950,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; 
VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: s_cbranch_execz .LBB44_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -13413,9 +27978,9 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow +; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 +; VI-NEXT: s_cbranch_execz .LBB44_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 ; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc @@ -13469,7 +28034,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB22_4: ; %end +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -13566,7 +28131,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: s_cbranch_execz .LBB44_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -13594,9 +28159,9 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: 
s_cbranch_execz .LBB44_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc @@ -13650,7 +28215,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -13694,7 +28259,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -13729,7 +28294,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13766,7 +28331,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v27, 16, v24 @@ -13794,9 +28359,9 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -13857,7 +28422,7 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -13904,633 +28469,1862 @@ end: ret <52 x half> %phi } +define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13i64_to_v52f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_readfirstlane_b32 s40, v1 +; SI-NEXT: v_readfirstlane_b32 s41, v2 +; SI-NEXT: v_readfirstlane_b32 s14, v3 +; SI-NEXT: v_readfirstlane_b32 s15, v4 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_readfirstlane_b32 s13, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s7, v9 +; SI-NEXT: v_readfirstlane_b32 s8, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: s_and_b64 s[4:5], vcc, 
exec +; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: 
s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s16, s4, 16 +; SI-NEXT: s_lshr_b32 s17, s5, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: s_lshr_b32 s43, s19, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s44, s20, 16 +; SI-NEXT: 
s_lshr_b32 s45, s21, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s46, s22, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s56, s24, 16 +; SI-NEXT: s_lshr_b32 s57, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s58, s26, 16 +; SI-NEXT: s_lshr_b32 s59, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s60, s28, 16 +; SI-NEXT: s_lshr_b32 s61, s29, 16 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_lshr_b32 s62, s40, 16 +; SI-NEXT: s_lshr_b32 s63, s41, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s72, s14, 16 +; SI-NEXT: s_lshr_b32 s73, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s74, s12, 16 +; SI-NEXT: s_lshr_b32 s75, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s76, s10, 16 +; SI-NEXT: s_lshr_b32 s77, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s78, s7, 16 +; SI-NEXT: s_lshr_b32 s79, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s88, s6, 16 +; SI-NEXT: s_lshr_b32 s89, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 
+; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v41, s5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s16 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: 
v_or_b32_e32 v43, v43, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v41, v41, v42 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v53, v53, v54 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_add_i32_e32 v39, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v33, v34, v33 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v33, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v31, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v31, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 
v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; 
implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v13i64_to_v52f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_readfirstlane_b32 s41, v0 +; VI-NEXT: v_readfirstlane_b32 s40, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: v_readfirstlane_b32 s14, v3 +; VI-NEXT: v_readfirstlane_b32 s13, v4 +; VI-NEXT: v_readfirstlane_b32 s12, v5 +; VI-NEXT: v_readfirstlane_b32 s11, v6 +; VI-NEXT: v_readfirstlane_b32 s10, v7 +; VI-NEXT: v_readfirstlane_b32 s9, v8 
+; VI-NEXT: v_readfirstlane_b32 s8, v9 +; VI-NEXT: v_readfirstlane_b32 s6, v10 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v11 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s41, s41, 3 +; VI-NEXT: s_addc_u32 s40, s40, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: 
s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s42, s7, 16 +; VI-NEXT: s_lshr_b32 s43, s6, 16 +; VI-NEXT: s_lshr_b32 s44, s8, 16 +; VI-NEXT: s_lshr_b32 s45, s9, 16 +; VI-NEXT: s_lshr_b32 s46, s10, 16 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 16 +; VI-NEXT: s_lshr_b32 s57, s13, 16 +; VI-NEXT: s_lshr_b32 s58, s14, 16 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s40, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 16 +; VI-NEXT: s_lshr_b32 s62, s29, 16 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s72, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s26, 16 +; VI-NEXT: s_lshr_b32 s74, s25, 16 +; VI-NEXT: s_lshr_b32 s75, s24, 16 +; VI-NEXT: s_lshr_b32 s76, s23, 16 +; VI-NEXT: s_lshr_b32 s77, s22, 16 +; VI-NEXT: s_lshr_b32 s78, s21, 16 +; VI-NEXT: s_lshr_b32 s79, s20, 16 +; VI-NEXT: s_lshr_b32 s88, s19, 16 +; VI-NEXT: s_lshr_b32 s89, s18, 16 +; VI-NEXT: s_lshr_b32 s90, s17, 16 +; VI-NEXT: s_lshr_b32 s91, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s91, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s90, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s89, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s88, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s79, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s78, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s77, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s76, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 
+; VI-NEXT: s_lshl_b32 s23, s75, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s74, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s73, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s72, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s63, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s62, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s29, s61, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s40, s60, 16 +; VI-NEXT: s_or_b32 s29, s29, s40 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s40, s59, 16 +; VI-NEXT: s_or_b32 s15, s15, s40 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s40, s58, 16 +; VI-NEXT: s_or_b32 s14, s14, s40 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s40, s57, 16 +; VI-NEXT: s_or_b32 s13, s13, s40 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s40, s56, 16 +; VI-NEXT: s_or_b32 s12, s12, s40 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s40, s47, 16 +; VI-NEXT: s_or_b32 s11, s11, s40 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s40, s46, 16 +; VI-NEXT: s_or_b32 s10, s10, s40 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s40, s45, 16 +; VI-NEXT: s_or_b32 s9, s9, s40 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s40, s44, 16 +; VI-NEXT: s_or_b32 s8, s8, s40 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s40, s43, 16 +; VI-NEXT: s_or_b32 s6, s6, s40 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s40, s42, 16 +; VI-NEXT: s_or_b32 s7, s7, s40 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; 
VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s15 +; VI-NEXT: v_mov_b32_e32 v17, s14 +; VI-NEXT: v_mov_b32_e32 v18, s13 +; VI-NEXT: v_mov_b32_e32 v19, s12 +; VI-NEXT: v_mov_b32_e32 v20, s11 +; VI-NEXT: v_mov_b32_e32 v21, s10 +; VI-NEXT: v_mov_b32_e32 v22, s9 +; VI-NEXT: v_mov_b32_e32 v23, s8 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: v_mov_b32_e32 v25, s7 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v13i64_to_v52f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v12 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: 
s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s42, s41, 16 +; GFX9-NEXT: s_lshr_b32 s43, s40, 16 +; GFX9-NEXT: s_lshr_b32 s44, s15, 16 +; GFX9-NEXT: s_lshr_b32 s45, s14, 16 +; GFX9-NEXT: s_lshr_b32 s46, s13, 16 +; GFX9-NEXT: s_lshr_b32 s47, s12, 16 +; GFX9-NEXT: s_lshr_b32 s56, s11, 16 +; GFX9-NEXT: s_lshr_b32 s57, s10, 16 +; GFX9-NEXT: s_lshr_b32 s58, s9, 16 +; GFX9-NEXT: s_lshr_b32 s59, s8, 16 +; GFX9-NEXT: s_lshr_b32 s60, s7, 16 +; GFX9-NEXT: s_lshr_b32 s61, s6, 16 +; GFX9-NEXT: s_lshr_b32 s62, s29, 16 +; GFX9-NEXT: s_lshr_b32 s63, s28, 16 +; GFX9-NEXT: s_lshr_b32 s72, s27, 16 +; GFX9-NEXT: s_lshr_b32 s73, s26, 16 +; GFX9-NEXT: s_lshr_b32 s74, s25, 16 +; GFX9-NEXT: s_lshr_b32 s75, s24, 16 +; GFX9-NEXT: s_lshr_b32 s76, s23, 16 +; GFX9-NEXT: s_lshr_b32 s77, s22, 16 +; GFX9-NEXT: s_lshr_b32 s78, s21, 16 +; GFX9-NEXT: s_lshr_b32 s79, s20, 16 +; GFX9-NEXT: s_lshr_b32 s88, s19, 16 +; GFX9-NEXT: s_lshr_b32 s89, s18, 16 +; GFX9-NEXT: s_lshr_b32 s90, s17, 16 +; GFX9-NEXT: s_lshr_b32 s91, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s77 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s21, s23, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s44 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s42 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; 
implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: ; implicit-def: $sgpr43 +; GFX9-NEXT: ; implicit-def: $sgpr42 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-TRUE16-LABEL: bitcast_v13i64_to_v52f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v7 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; 
GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s11 :: v_dual_mov_b32 v19, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s9 :: v_dual_mov_b32 v21, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s7 :: v_dual_mov_b32 v23, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s6 :: v_dual_mov_b32 v25, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 +; +; GFX11-FAKE16-LABEL: bitcast_v13i64_to_v52f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v7 +; GFX11-FAKE16-NEXT: s_mov_b32 s78, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s12, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; 
%cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s78 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 
3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s74 +; GFX11-FAKE16-NEXT: 
s_pack_ll_b32_b16 s16, s16, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: 
v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v25, s10 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <52 x half> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <52 x half> + br label %end + +end: + %phi = phi <52 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x half> %phi +} + define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v52f16_to_v13i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], 
s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v33 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v56 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v44 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_or_b32_e32 v1, v40, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; GCN-NEXT: v_or_b32_e32 v2, v54, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_or_b32_e32 v3, v52, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v50, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v45 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: v_or_b32_e32 v17, v48, v17 -; GCN-NEXT: v_or_b32_e32 v18, v38, v18 -; GCN-NEXT: v_or_b32_e32 v19, v36, v19 -; GCN-NEXT: v_or_b32_e32 v20, v34, v20 -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_or_b32_e32 v22, v33, v22 -; GCN-NEXT: v_or_b32_e32 v23, v35, v23 -; GCN-NEXT: v_or_b32_e32 v24, v37, v24 -; GCN-NEXT: v_or_b32_e32 v25, v39, v25 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; 
implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; 
GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v40 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v52 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: 
v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; 
GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 
offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, 
s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; 
GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: 
v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_or_b32_e32 v13, v15, v14 -; GCN-NEXT: v_or_b32_e32 v14, v17, v16 -; GCN-NEXT: v_or_b32_e32 v15, v19, v18 -; GCN-NEXT: v_or_b32_e32 v16, v21, v20 -; GCN-NEXT: v_or_b32_e32 v17, v23, v22 -; GCN-NEXT: v_or_b32_e32 v18, v25, v24 -; GCN-NEXT: v_or_b32_e32 v19, v27, v26 -; GCN-NEXT: v_or_b32_e32 v20, v29, v28 -; GCN-NEXT: v_or_b32_e32 v21, v31, v30 -; GCN-NEXT: v_or_b32_e32 v22, v33, v32 -; GCN-NEXT: v_or_b32_e32 v23, v35, v34 -; GCN-NEXT: v_or_b32_e32 v24, v37, v36 -; GCN-NEXT: v_or_b32_e32 v25, v39, v38 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword 
v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52f16_to_v13i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], 
s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v9 +; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, 
v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; 
kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v21, v56, v21 +; SI-NEXT: v_or_b32_e32 v22, v46, v22 +; SI-NEXT: v_or_b32_e32 v23, v44, v23 +; SI-NEXT: v_or_b32_e32 v24, v34, v24 +; 
SI-NEXT: v_or_b32_e32 v25, v32, v25 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 
; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 +; SI-NEXT: v_or_b32_e32 v20, v58, v20 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: 
v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 
0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 
; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, 
off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v59 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 
+; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v45 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52f16_to_v13i64: ; VI: ; %bb.0: @@ -14575,7 +30369,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v25, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -14656,9 +30450,9 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v57, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -14739,7 +30533,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; 
VI-NEXT: v_add_f16_e32 v26, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -14846,7 +30640,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -14970,9 +30764,9 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -15065,7 +30859,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword 
v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -15094,7 +30888,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -15122,7 +30916,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15186,7 +30980,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -15214,9 +31008,1229 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end -; 
GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <52 x half> %a, splat (half 0xH0200) + %a2 = bitcast <52 x half> %a1 to <13 x i64> + br label %end + +cmp.false: + %a3 = bitcast <52 x half> %a to <13 x i64> + br label %end + +end: + %phi = phi <13 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x i64> %phi +} + +define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52f16_to_v13i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, 
s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 +; SI-NEXT: 
v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], 
s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: 
v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v47, v11 +; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_or_b32_e32 v14, v63, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v17, v35, v17 +; SI-NEXT: v_or_b32_e32 v18, v33, v18 +; SI-NEXT: v_or_b32_e32 v19, v59, v19 +; SI-NEXT: v_or_b32_e32 v20, v27, v20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v37, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v22, 
v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; 
SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], 
s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: 
v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword 
v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v50, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v30 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v59 +; SI-NEXT: v_mov_b32_e32 v59, v31 +; SI-NEXT: v_mov_b32_e32 v48, v61 +; SI-NEXT: v_mov_b32_e32 v61, v26 +; SI-NEXT: v_mov_b32_e32 v49, v62 +; SI-NEXT: v_mov_b32_e32 v62, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v62 +; SI-NEXT: v_mov_b32_e32 v62, v49 +; 
SI-NEXT: v_mov_b32_e32 v26, v61 +; SI-NEXT: v_mov_b32_e32 v61, v48 +; SI-NEXT: v_mov_b32_e32 v31, v59 +; SI-NEXT: v_mov_b32_e32 v59, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v52f16_to_v13i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: 
s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: 
s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; 
VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v25, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v51, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_add_f16_e32 v17, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v39, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v38, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v37, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v36, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v35, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v34, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v24, v26, v24 +; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v52f16_to_v13i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: 
v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; 
GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: 
v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB47_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v13i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB47_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -15235,273 +32249,288 @@ end: } define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v13f64_to_v52i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; 
implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v32, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v35, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v53, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v55, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: 
v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_alignbit_b32 v27, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v28, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v29, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v30, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v32, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v35, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v53, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v54, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v55, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v40, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v47, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; 
GCN-NEXT: v_and_b32_e32 v57, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v40, v45, v40 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v44, v46, v44 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v55, v47, v55 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 40, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v43, v56, v43 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v54, v57, v54 -; GCN-NEXT: v_add_i32_e32 v57, 
vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v42, v58, v42 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v7, v7, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v8, v8, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v9, v9, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v10, v10, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v11, v11, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v12, v12, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x4c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v13, v13, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v14, v14, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x54, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v15, v15, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x58, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v16, v16, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v17, v17, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v18, v18, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x64, v0 -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: v_or_b32_e32 v20, v20, v39 -; GCN-NEXT: v_or_b32_e32 v21, v21, v29 -; GCN-NEXT: v_or_b32_e32 v22, v22, v38 -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: v_or_b32_e32 v24, v24, v37 -; GCN-NEXT: v_or_b32_e32 v25, v25, v27 -; GCN-NEXT: v_or_b32_e32 v26, 
v26, v36 -; GCN-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v13f64_to_v52i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; 
SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v39, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v49, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], 
v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_alignbit_b32 v27, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v28, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v29, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v30, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v32, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v39, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v49, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 
v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; 
SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; 
SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, 
v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13f64_to_v52i16: ; VI: ; %bb.0: @@ -15539,7 +32568,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -15567,9 +32596,9 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -15610,7 +32639,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -15707,7 +32736,7 @@ 
define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -15735,9 +32764,9 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -15778,7 +32807,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -15822,7 +32851,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -15837,7 +32866,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; 
GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15874,7 +32903,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -15902,9 +32931,9 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -15945,7 +32974,7 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -15992,470 +33021,1495 @@ end: ret <52 x i16> %phi } +define inreg <52 x i16> 
@bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13f64_to_v52i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_mov_b32_e32 v25, s16 +; SI-NEXT: v_mov_b32_e32 v26, s17 +; SI-NEXT: v_mov_b32_e32 v23, s18 +; SI-NEXT: v_mov_b32_e32 v24, s19 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v21, s22 +; SI-NEXT: v_mov_b32_e32 v22, s23 +; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v16, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v13, s28 +; SI-NEXT: v_mov_b32_e32 v14, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v39, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v54, v24, v23, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: 
v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v26 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v32, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v39, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v54, v24, v23, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: 
v_lshrrev_b32_e32 v48, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v26 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; SI-NEXT: buffer_store_dword 
v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v41 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: 
v_add_i32_e32 v14, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; 
SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v13f64_to_v52i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v21, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v30, s22 +; VI-NEXT: v_mov_b32_e32 v31, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v19, s26 +; VI-NEXT: v_mov_b32_e32 v20, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; 
VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 
v12, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v21, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; VI-NEXT: v_or_b32_sdwa v30, v30, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; VI-NEXT: v_or_b32_sdwa v31, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v24, v13 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v19, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v36 +; VI-NEXT: v_mov_b32_e32 v1, v37 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v30 +; VI-NEXT: v_mov_b32_e32 v7, v31 +; VI-NEXT: v_mov_b32_e32 v8, v32 +; VI-NEXT: v_mov_b32_e32 v9, v33 +; VI-NEXT: v_mov_b32_e32 v10, v34 +; VI-NEXT: v_mov_b32_e32 v11, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; 
implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: s_branch .LBB49_2 +; +; GFX9-LABEL: bitcast_v13f64_to_v52i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v21, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 +; GFX9-NEXT: v_mov_b32_e32 v30, s22 +; GFX9-NEXT: v_mov_b32_e32 v31, s23 +; GFX9-NEXT: v_mov_b32_e32 v23, s24 +; GFX9-NEXT: v_mov_b32_e32 v24, s25 +; GFX9-NEXT: v_mov_b32_e32 v19, s26 +; GFX9-NEXT: v_mov_b32_e32 v20, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v43, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v31 +; GFX9-NEXT: v_lshl_or_b32 v31, v42, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v12, v12, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v41, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v14, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v15, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v17 +; GFX9-NEXT: 
v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v16, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v19, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v20, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v21, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v38, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v36 +; GFX9-NEXT: v_mov_b32_e32 v1, v37 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: 
$vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: s_branch .LBB49_2 +; +; GFX11-TRUE16-LABEL: bitcast_v13f64_to_v52i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: .LBB49_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v65, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v64, 16, v18 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v66, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v53, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v52, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v55, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v50, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, 
v38, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 +; +; GFX11-FAKE16-LABEL: bitcast_v13f64_to_v52i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: .LBB49_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v64, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v66, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 
v14, v52, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v55, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v15, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <13 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <13 x double> %a1 to <52 x i16> + br label %end + +cmp.false: + %a3 = bitcast <13 x double> %a to <52 x i16> + br label %end + +end: + %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x i16> %phi +} + define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v52i16_to_v13f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded 
Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, 
v21 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v62 -; GCN-NEXT: v_or_b32_e32 v1, v1, v63 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v61 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v59 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v58 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v57 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v56 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v47 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v46 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v44 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v43 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v42 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; 
GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v41 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v32 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v60 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_or_b32_e32 v17, v17, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v26 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v26 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; 
implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: .LBB25_2: ; %Flow -; 
GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v41 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v45 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v0, v62, v0 -; GCN-NEXT: v_or_b32_e32 v1, v63, v1 -; GCN-NEXT: v_or_b32_e32 v2, v61, v2 -; GCN-NEXT: v_or_b32_e32 v3, v59, v3 -; GCN-NEXT: v_or_b32_e32 v4, v58, v4 -; GCN-NEXT: v_or_b32_e32 v5, v57, v5 -; GCN-NEXT: v_or_b32_e32 v6, v56, v6 -; GCN-NEXT: v_or_b32_e32 v7, v47, v7 -; GCN-NEXT: v_or_b32_e32 v8, v46, v8 -; GCN-NEXT: v_or_b32_e32 v9, v44, v9 -; GCN-NEXT: v_or_b32_e32 v10, v43, v10 -; GCN-NEXT: v_or_b32_e32 v11, v42, v11 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 
offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v26, v17 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v26, v18 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v26, v19 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v26, v20 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v26, v21 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v26, v22 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v26, v23 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v26, v24 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v26, v25 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; 
GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, 
off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52i16_to_v13f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, 
s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword 
v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], 
s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: 
v_or_b32_e32 v2, v2, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_or_b32_e32 v4, v4, v56 +; SI-NEXT: v_or_b32_e32 v5, v5, v47 +; SI-NEXT: v_or_b32_e32 v6, v6, v38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; 
implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v36 +; SI-NEXT: v_or_b32_e32 v9, v9, v46 +; SI-NEXT: v_or_b32_e32 v10, v10, v35 +; SI-NEXT: v_or_b32_e32 v11, v11, v45 +; SI-NEXT: v_or_b32_e32 v12, v12, v44 +; SI-NEXT: v_or_b32_e32 v13, v13, v34 +; SI-NEXT: v_or_b32_e32 v14, v14, v43 +; SI-NEXT: v_or_b32_e32 v15, v15, v42 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_or_b32_e32 v17, v17, v41 +; SI-NEXT: v_or_b32_e32 v18, v18, v40 +; SI-NEXT: v_or_b32_e32 v19, v19, v32 +; SI-NEXT: v_or_b32_e32 v20, v20, v63 +; SI-NEXT: v_or_b32_e32 v21, v21, v62 +; SI-NEXT: v_or_b32_e32 v22, v22, v61 +; SI-NEXT: v_or_b32_e32 v23, v23, v60 +; SI-NEXT: v_or_b32_e32 v24, v24, v59 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr37 +; 
SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 
offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v2, v57, v2 +; SI-NEXT: v_or_b32_e32 v3, v39, v3 +; SI-NEXT: v_or_b32_e32 v4, v56, v4 +; SI-NEXT: v_or_b32_e32 v5, v47, v5 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: 
v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v36, v8 +; SI-NEXT: v_or_b32_e32 v9, v46, v9 +; SI-NEXT: v_or_b32_e32 v10, v35, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v44, v12 +; SI-NEXT: v_or_b32_e32 v13, v34, v13 +; SI-NEXT: v_or_b32_e32 v14, v43, v14 +; SI-NEXT: v_or_b32_e32 v15, v42, v15 +; SI-NEXT: 
v_or_b32_e32 v16, v33, v16 +; SI-NEXT: v_or_b32_e32 v17, v41, v17 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 +; SI-NEXT: v_or_b32_e32 v19, v32, v19 +; SI-NEXT: v_or_b32_e32 v20, v63, v20 +; SI-NEXT: v_or_b32_e32 v21, v62, v21 +; SI-NEXT: v_or_b32_e32 v22, v61, v22 +; SI-NEXT: v_or_b32_e32 v23, v60, v23 +; SI-NEXT: v_or_b32_e32 v24, v59, v24 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v25 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52i16_to_v13f64: ; VI: ; %bb.0: @@ -16500,7 +34554,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v25, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -16581,9 +34635,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v57 @@ -16664,7 +34718,7 @@ define <13 x double> 
@bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v26, 3, v32 ; VI-NEXT: v_add_u16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -16771,7 +34825,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -16895,9 +34949,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -16989,7 +35043,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_u16 
v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -17018,7 +35072,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -17046,7 +35100,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -17110,7 +35164,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -17138,9 +35192,1125 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, 
v24, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <52 x i16> %a, splat (i16 3) + %a2 = bitcast <52 x i16> %a1 to <13 x double> + br label %end + +cmp.false: + %a3 = bitcast <52 x i16> %a to <13 x double> + br label %end + +end: + %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x double> %phi +} + +define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52i16_to_v13f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v29, 
16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; 
SI-NEXT: v_or_b32_e32 v22, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v8, v1, v62 +; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 
3 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v57 +; SI-NEXT: v_mov_b32_e32 v57, v32 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v41, off, 
s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v38 +; SI-NEXT: v_mov_b32_e32 v38, v35 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v47, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v43 +; SI-NEXT: v_mov_b32_e32 v45, v58 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v43, v62 +; SI-NEXT: v_mov_b32_e32 v62, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v62 +; SI-NEXT: v_mov_b32_e32 v62, v43 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v44 +; SI-NEXT: v_mov_b32_e32 v27, v58 +; SI-NEXT: v_mov_b32_e32 v58, v45 +; SI-NEXT: v_mov_b32_e32 v43, v46 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 
v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v57, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v52i16_to_v13f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: 
s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: 
v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa 
v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 
v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: 
s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v24, vcc, 
0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v52i16_to_v13f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 
+; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; 
GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 
v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, 
s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 
s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 
v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; 
GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v13f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: 
v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s15, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -17159,564 +36329,552 @@ end: } define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v13f64_to_v52f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword 
v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; kill: killed $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: 
killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; kill: killed $vgpr50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GCN-NEXT: 
v_lshrrev_b32_e32 v49, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 
; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v49 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v59, v51 -; GCN-NEXT: v_mov_b32_e32 v51, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v53 -; GCN-NEXT: v_mov_b32_e32 v53, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v55 -; GCN-NEXT: v_mov_b32_e32 v55, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 -; GCN-NEXT: v_mov_b32_e32 v41, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v52 -; GCN-NEXT: v_mov_b32_e32 v52, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v54 -; GCN-NEXT: v_mov_b32_e32 v54, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; 
GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, 
v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v30 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v28 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v27 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v62 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v35 
-; GCN-NEXT: v_cvt_f16_f32_e32 v19, v60 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v58 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v32 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v31 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v63 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v61 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 
-; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v59 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v56 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v55 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v53 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v51 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, 
vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v41 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: 
v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_or_b32_e32 v53, v55, v54 -; GCN-NEXT: v_or_b32_e32 v54, v41, v40 -; GCN-NEXT: v_or_b32_e32 v55, v43, v42 -; GCN-NEXT: v_or_b32_e32 v40, v45, v44 -; GCN-NEXT: v_or_b32_e32 v41, v47, v46 -; GCN-NEXT: v_or_b32_e32 v42, v57, v56 -; GCN-NEXT: v_or_b32_e32 v43, v59, v58 -; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword 
v38, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v13f64_to_v52f16: +; SI: ; %bb.0: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; 
SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v24 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: 
v_cvt_f32_f16_e32 v44, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v27 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, 
s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 
1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: 
v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_mov_b32_e32 v55, v24 +; SI-NEXT: v_mov_b32_e32 v53, v25 +; SI-NEXT: v_mov_b32_e32 v51, v26 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, 
v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v13f64_to_v52f16: ; VI: ; %bb.0: @@ -17754,7 +36912,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -17782,9 +36940,9 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: s_cbranch_execz .LBB52_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -17825,7 +36983,7 @@ define <52 x half> 
@bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 @@ -17922,7 +37080,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -17950,9 +37108,9 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: .LBB52_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: s_cbranch_execz .LBB52_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -17993,7 +37151,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v43, v0, s4 @@ -18037,7 +37195,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -18052,7 +37210,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18089,7 +37247,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -18117,9 +37275,9 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 ; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 @@ -18160,7 +37318,7 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1 ; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100 @@ -18207,633 +37365,1850 @@ end: ret <52 x half> %phi } +define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v13f64_to_v52f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; SI-NEXT: v_readfirstlane_b32 s14, v1 +; SI-NEXT: v_readfirstlane_b32 s15, v2 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_readfirstlane_b32 s13, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: v_readfirstlane_b32 s7, v10 +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: s_and_b64 s[40:41], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v12 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s40, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s40 +; SI-NEXT: s_lshr_b32 s40, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s40 +; SI-NEXT: s_lshr_b32 s40, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s40 +; SI-NEXT: s_lshr_b32 s40, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 +; SI-NEXT: s_lshr_b32 s40, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s40 +; SI-NEXT: s_lshr_b32 s40, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 +; SI-NEXT: s_lshr_b32 s40, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 +; SI-NEXT: s_lshr_b32 s40, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s40 +; SI-NEXT: s_lshr_b32 s40, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s40 +; SI-NEXT: s_lshr_b32 s40, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s40 +; SI-NEXT: s_lshr_b32 s40, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s40 +; SI-NEXT: s_lshr_b32 s40, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s40 +; SI-NEXT: s_lshr_b32 s40, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s40 +; SI-NEXT: s_lshr_b32 s40, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s40 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s40 +; SI-NEXT: s_lshr_b32 s40, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s40 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s40 +; SI-NEXT: s_lshr_b32 s40, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s40 +; SI-NEXT: s_lshr_b32 s40, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s40 +; SI-NEXT: 
s_lshr_b32 s40, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s40 +; SI-NEXT: s_lshr_b32 s40, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s40 +; SI-NEXT: s_lshr_b32 s40, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s40 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s40 +; SI-NEXT: s_lshr_b32 s40, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s40 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 +; SI-NEXT: s_lshr_b32 s40, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[54:55], s[18:19], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 +; SI-NEXT: v_add_f64 v[49:50], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[37:38], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[33:34], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[31:32], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[26:27], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: 
v_cvt_f32_f16_e32 v45, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v51, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v10 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; 
SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v13, v41, v13 +; SI-NEXT: buffer_store_dword v13, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v10, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v55 +; SI-NEXT: v_add_i32_e32 v55, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v53 +; SI-NEXT: v_add_i32_e32 v53, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v52 +; SI-NEXT: v_add_i32_e32 v51, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v50 +; SI-NEXT: v_add_i32_e32 v49, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v48 +; SI-NEXT: v_add_i32_e32 v39, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v38 +; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 
+; SI-NEXT: buffer_store_dword v10, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 +; SI-NEXT: v_add_i32_e32 v35, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v34 +; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v32 +; SI-NEXT: v_add_i32_e32 v31, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 +; SI-NEXT: v_add_i32_e32 v28, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 +; SI-NEXT: v_add_i32_e32 v26, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v27 +; SI-NEXT: v_add_i32_e32 v24, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 +; SI-NEXT: v_add_i32_e32 v22, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 
16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_add_i32_e32 v20, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v21 +; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v19 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: buffer_store_dword v10, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v45 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, 
v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; 
implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v13f64_to_v52f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v21, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v30, s22 +; VI-NEXT: v_mov_b32_e32 v31, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v19, s26 +; VI-NEXT: v_mov_b32_e32 v20, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v15, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, 
v5 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; VI-NEXT: 
v_lshrrev_b32_e32 v41, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v21, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; VI-NEXT: v_or_b32_sdwa v30, v30, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 +; VI-NEXT: v_or_b32_sdwa v31, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 
v13, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v19, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa 
v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v36 +; VI-NEXT: v_mov_b32_e32 v1, v37 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v30 +; VI-NEXT: v_mov_b32_e32 v7, v31 +; VI-NEXT: v_mov_b32_e32 v8, v32 +; VI-NEXT: v_mov_b32_e32 v9, v33 +; VI-NEXT: v_mov_b32_e32 v10, v34 +; VI-NEXT: v_mov_b32_e32 v11, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; 
VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: s_branch .LBB53_2 +; +; GFX9-LABEL: bitcast_v13f64_to_v52f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v21, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s20 +; GFX9-NEXT: v_mov_b32_e32 v14, s21 +; GFX9-NEXT: v_mov_b32_e32 v30, s22 +; GFX9-NEXT: v_mov_b32_e32 v31, s23 +; GFX9-NEXT: v_mov_b32_e32 v23, s24 +; GFX9-NEXT: v_mov_b32_e32 v24, s25 +; GFX9-NEXT: v_mov_b32_e32 v19, s26 +; GFX9-NEXT: v_mov_b32_e32 v20, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v15, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v43, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v31 +; GFX9-NEXT: v_lshl_or_b32 v31, v42, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v12, v12, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v41, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v14, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v15, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: 
v_lshl_or_b32 v26, v26, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v16, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v19, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v20, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v21, v49, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v48, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v38, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v36 +; GFX9-NEXT: v_mov_b32_e32 v1, v37 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; 
implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: s_branch .LBB53_2 +; +; GFX11-TRUE16-LABEL: bitcast_v13f64_to_v52f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: .LBB53_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v65, 16, v10 +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v10, v64, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v66, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v54, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v53, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v52, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v55, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, 
v50, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 +; +; GFX11-FAKE16-LABEL: bitcast_v13f64_to_v52f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; 
GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s20 :: v_dual_mov_b32 v10, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s26 :: v_dual_mov_b32 v12, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v64, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: .LBB53_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v64, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v66, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, 
v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v52, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v55, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v15, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v67, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <13 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <13 x double> %a1 to <52 x half> + br label %end + +cmp.false: + %a3 = bitcast <13 x double> %a to <52 x half> + br label %end + +end: + %phi = phi <52 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x half> %phi +} + define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v52f16_to_v13f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword 
v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 -; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v36 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v33 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v34 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v63 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v44 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GCN-NEXT: v_or_b32_e32 v0, v42, v0 -; GCN-NEXT: v_or_b32_e32 v1, v40, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; GCN-NEXT: v_or_b32_e32 v2, v54, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; GCN-NEXT: v_or_b32_e32 v3, v52, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; GCN-NEXT: v_or_b32_e32 v4, v50, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; 
GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v63 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v45 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v26, v12 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v26, v13 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v26, v14 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v26, v15 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v26, v16 -; GCN-NEXT: v_or_b32_e32 v17, v48, v17 -; GCN-NEXT: v_or_b32_e32 v18, v38, v18 -; GCN-NEXT: v_or_b32_e32 v19, v36, v19 -; GCN-NEXT: v_or_b32_e32 v20, v34, v20 -; GCN-NEXT: v_or_b32_e32 v21, v32, v21 -; GCN-NEXT: v_or_b32_e32 v22, v33, v22 -; GCN-NEXT: v_or_b32_e32 v23, v35, v23 -; GCN-NEXT: v_or_b32_e32 v24, v37, v24 -; GCN-NEXT: v_or_b32_e32 v25, v39, v25 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: 
$vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; kill: killed $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; 
implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v40 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v52 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v49 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: 
v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, 
s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 
offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_or_b32_e32 v13, v15, v14 -; GCN-NEXT: v_or_b32_e32 v14, v17, v16 -; GCN-NEXT: v_or_b32_e32 v15, v19, v18 -; GCN-NEXT: v_or_b32_e32 v16, v21, v20 -; GCN-NEXT: v_or_b32_e32 v17, v23, v22 -; GCN-NEXT: v_or_b32_e32 v18, v25, 
v24 -; GCN-NEXT: v_or_b32_e32 v19, v27, v26 -; GCN-NEXT: v_or_b32_e32 v20, v29, v28 -; GCN-NEXT: v_or_b32_e32 v21, v31, v30 -; GCN-NEXT: v_or_b32_e32 v22, v33, v32 -; GCN-NEXT: v_or_b32_e32 v23, v35, v34 -; GCN-NEXT: v_or_b32_e32 v24, v37, v36 -; GCN-NEXT: v_or_b32_e32 v25, v39, v38 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52f16_to_v13f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, 
s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: 
v_cvt_f16_f32_e32 v58, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: 
$vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v50, v4 +; SI-NEXT: v_or_b32_e32 v21, v56, v21 +; SI-NEXT: v_or_b32_e32 v22, v46, v22 +; SI-NEXT: v_or_b32_e32 v23, v44, v23 +; SI-NEXT: v_or_b32_e32 v24, v34, v24 +; SI-NEXT: v_or_b32_e32 v25, v32, v25 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; 
implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], 
s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 +; SI-NEXT: v_or_b32_e32 v20, v58, v20 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: 
v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, 
off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt 
vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 
v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 
v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v59 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v45 +; SI-NEXT: v_or_b32_e32 
v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v35 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], 
s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52f16_to_v13f64: ; VI: ; %bb.0: @@ -18878,7 +39253,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v25, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v25, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -18959,9 +39334,9 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v25, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v57, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -19042,7 +39417,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -19149,7 +39524,7 @@ define <13 x double> 
@bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -19273,9 +39648,9 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr26 ; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: ; kill: killed $vgpr26 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -19368,7 +39743,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -19397,7 +39772,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -19425,7 +39800,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19489,7 +39864,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -19517,9 +39892,1229 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <52 x half> %a, splat (half 0xH0200) + %a2 = bitcast <52 
x half> %a1 to <13 x double> + br label %end + +cmp.false: + %a3 = bitcast <52 x half> %a to <13 x double> + br label %end + +end: + %phi = phi <13 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x double> %phi +} + +define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52f16_to_v13f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: 
buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 
v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, 
off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: 
s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v9, v55, v9 +; SI-NEXT: v_or_b32_e32 v10, v54, v10 +; SI-NEXT: v_or_b32_e32 v11, v47, v11 +; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v13, v52, v13 +; SI-NEXT: v_or_b32_e32 v14, v63, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v17, v35, v17 +; SI-NEXT: v_or_b32_e32 v18, v33, v18 +; SI-NEXT: v_or_b32_e32 v19, v59, v19 +; SI-NEXT: v_or_b32_e32 v20, v27, v20 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v37, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v38, v25 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 
v24, v30 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: 
v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 
offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; 
SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v36 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v34 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, 
v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v50, v63 +; SI-NEXT: v_mov_b32_e32 v63, v58 +; SI-NEXT: v_mov_b32_e32 v58, v30 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v32, v59 +; SI-NEXT: v_mov_b32_e32 v59, v31 +; SI-NEXT: v_mov_b32_e32 v48, v61 +; SI-NEXT: v_mov_b32_e32 v61, v26 +; SI-NEXT: v_mov_b32_e32 v49, v62 +; SI-NEXT: v_mov_b32_e32 v62, v27 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v27, v62 +; SI-NEXT: v_mov_b32_e32 v62, v49 +; SI-NEXT: v_mov_b32_e32 v26, v61 +; SI-NEXT: v_mov_b32_e32 v61, v48 +; SI-NEXT: v_mov_b32_e32 v31, v59 +; SI-NEXT: v_mov_b32_e32 v59, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: 
v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: v_mov_b32_e32 v58, v63 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v52f16_to_v13f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v32, v11 +; VI-NEXT: v_mov_b32_e32 v33, v10 +; VI-NEXT: v_mov_b32_e32 v34, v9 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v36, v7 +; VI-NEXT: v_mov_b32_e32 v37, v6 +; VI-NEXT: v_mov_b32_e32 v38, v5 +; VI-NEXT: v_mov_b32_e32 v39, v4 +; VI-NEXT: v_mov_b32_e32 v48, v3 +; VI-NEXT: v_mov_b32_e32 v49, v2 +; VI-NEXT: v_mov_b32_e32 v50, v1 +; VI-NEXT: v_mov_b32_e32 v51, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: 
s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 
s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz 
.LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: 
v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v25, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v51, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v50, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v49, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v39, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v38, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v37, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v36, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v35, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v34, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v24, v26, v24 +; VI-NEXT: v_add_f16_sdwa v25, v32, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v52f16_to_v13f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v11 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; 
GFX9-NEXT: v_mov_b32_e32 v34, v9 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v36, v7 +; GFX9-NEXT: v_mov_b32_e32 v37, v6 +; GFX9-NEXT: v_mov_b32_e32 v38, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v4 +; GFX9-NEXT: v_mov_b32_e32 v48, v3 +; GFX9-NEXT: v_mov_b32_e32 v49, v2 +; GFX9-NEXT: v_mov_b32_e32 v50, v1 +; GFX9-NEXT: v_mov_b32_e32 v51, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v39 +; GFX9-NEXT: s_and_b64 
s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v51 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v25 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: 
.LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v7 :: v_dual_mov_b32 v33, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v3 :: v_dual_mov_b32 v37, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v39, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 
op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v39 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v16, 16, v17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 
op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB55_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v13f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v7 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s40, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s28, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s29, s41 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s15 :: v_dual_mov_b32 v17, s16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v39, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v38, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v37, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v36, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v35, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v34, 16, v50 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v33, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v32, 16, v48 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB55_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -19538,790 +41133,807 @@ end: } define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v52i16_to_v52f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 -; GCN-NEXT: 
buffer_load_dword v37, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: 
$vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; kill: killed $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; 
GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 
v48, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; 
GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; 
GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v14 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v18 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v51 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v52 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v53 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v54 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v55 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v6 -; 
GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v40 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v41 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v49 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v48 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: 
buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: 
v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_or_b32_e32 v53, v55, v54 -; GCN-NEXT: v_or_b32_e32 v54, v41, v40 -; GCN-NEXT: v_or_b32_e32 v55, v43, v42 -; GCN-NEXT: v_or_b32_e32 v40, v45, v44 -; GCN-NEXT: v_or_b32_e32 v41, v47, v46 -; GCN-NEXT: v_or_b32_e32 v42, v57, v56 -; GCN-NEXT: v_or_b32_e32 v43, v59, v58 -; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v31, s[0:3], 
0 offen -; GCN-NEXT: buffer_store_dword v54, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52i16_to_v52f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; 
SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; 
implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v56 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 
+; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 
v1, v14 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: 
v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v12 +; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 
+; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 
v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt 
vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; 
SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52i16_to_v52f16: ; VI: ; %bb.0: @@ -20361,7 +41973,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: 
s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v44, 3, v44 @@ -20415,7 +42027,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v28, 3, v28 ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: v_add_u16_e32 v27, 3, v27 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 ; VI-NEXT: v_or_b32_sdwa v0, v0, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20515,7 +42127,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v25, v44, v25, s6 @@ -20596,7 +42208,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v25 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v21, v40, v21, s4 @@ -20641,7 +42253,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] @@ -20669,7 +42281,7 @@ define <52 x half> 
@bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -20707,7 +42319,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v25, v68, v25, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v24, v67, v24, 0x5040100 @@ -20787,7 +42399,7 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v27, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v28, v1, 0x5040100 @@ -20833,503 +42445,2160 @@ end: ret <52 x half> %phi } +define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52i16_to_v52f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 
offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:316 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v54, 
v28 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: buffer_store_dword v31, 
off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v43 +; SI-NEXT: s_branch .LBB57_3 +; SI-NEXT: .LBB57_2: +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; 
SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; 
SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: .LBB57_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v58, v62 +; SI-NEXT: v_mov_b32_e32 v62, v32 +; SI-NEXT: v_mov_b32_e32 v32, v37 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v51, v53 +; SI-NEXT: v_mov_b32_e32 v53, v55 +; SI-NEXT: v_mov_b32_e32 v55, v41 +; SI-NEXT: v_mov_b32_e32 v41, v42 +; SI-NEXT: s_cbranch_vccnz .LBB57_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v43 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v42 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_5: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v52i16_to_v52f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: 
s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: 
s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_or_b32_sdwa v15, 
v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 
s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v52i16_to_v52f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 
v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v13, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v12, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v35, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v34, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v33, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v32, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v31, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v30, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v29, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v28, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: 
v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v26, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v11, v25, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v24, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v37, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v36, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v26 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v40, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v35, s27 +; GFX9-NEXT: v_mov_b32_e32 v34, s26 +; GFX9-NEXT: v_mov_b32_e32 v33, s25 +; GFX9-NEXT: v_mov_b32_e32 v32, s24 +; GFX9-NEXT: v_mov_b32_e32 v31, s23 +; GFX9-NEXT: v_mov_b32_e32 v30, s22 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: v_mov_b32_e32 v26, s18 +; GFX9-NEXT: v_mov_b32_e32 v37, s17 +; GFX9-NEXT: v_mov_b32_e32 v36, s16 +; GFX9-NEXT: v_mov_b32_e32 v38, s43 +; GFX9-NEXT: v_mov_b32_e32 v39, s42 +; GFX9-NEXT: v_mov_b32_e32 v48, s41 +; GFX9-NEXT: v_mov_b32_e32 v49, s40 +; GFX9-NEXT: v_mov_b32_e32 v50, s15 +; GFX9-NEXT: v_mov_b32_e32 v51, s14 +; GFX9-NEXT: v_mov_b32_e32 v52, s13 +; GFX9-NEXT: v_mov_b32_e32 v53, s12 +; GFX9-NEXT: v_mov_b32_e32 v54, s11 +; GFX9-NEXT: 
v_mov_b32_e32 v55, s10 +; GFX9-NEXT: v_mov_b32_e32 v40, s9 +; GFX9-NEXT: v_mov_b32_e32 v41, s8 +; GFX9-NEXT: v_mov_b32_e32 v42, s7 +; GFX9-NEXT: v_mov_b32_e32 v43, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v36, v43, 16, v36 +; GFX9-NEXT: v_lshl_or_b32 v37, v42, 16, v37 +; GFX9-NEXT: v_lshl_or_b32 v26, v41, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v27 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, 
v33 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v28, v55, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v30, v53, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v31, v52, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v32, v51, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v50, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v34, v49, 16, v34 +; GFX9-NEXT: v_lshl_or_b32 v35, v48, 16, v35 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v36 +; GFX9-NEXT: v_mov_b32_e32 v1, v37 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v52f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v10, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v9, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v11, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s15, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s14, s11 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s12, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s11, s7 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v35, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v52.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v53.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v65.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v67.l +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 
s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s10 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v25, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v51, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v36, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v23, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v24, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v38, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v34, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v35, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v48, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v23, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v50, 16, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v39, 16, v28 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v49, 16, v52 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v37, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v52f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 
0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s29, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v30, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v33, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v34, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s29 :: v_dual_mov_b32 v17, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s27 :: v_dual_mov_b32 v12, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s23 :: v_dual_mov_b32 v8, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s17 :: v_dual_mov_b32 v27, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v29, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v33, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s43 :: v_dual_mov_b32 v37, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s41 :: v_dual_mov_b32 v39, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s15 :: v_dual_mov_b32 v49, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_mov_b32 v51, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s11 :: v_dual_mov_b32 v53, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v55, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s6 :: v_dual_mov_b32 v65, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v68, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v67, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v65, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v54, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v64, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v55, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v53, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v50, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v52, 16, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v66, 16, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, 
v31 :: v_dual_and_b32 v4, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v49, 16, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v34, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v37, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <52 x i16> %a, splat (i16 3) + %a2 = bitcast <52 x i16> %a1 to <52 x half> + br label %end + +cmp.false: + %a3 = bitcast <52 x i16> %a to <52 x half> + br label %end + +end: + %phi = phi <52 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x half> %phi +} + define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v52f16_to_v52i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 -; 
GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v20 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v24 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v23, v30 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v63, v9 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v53, 
0x38000000, v53 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 
v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v63 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v56 -; GCN-NEXT: v_or_b32_e32 v12, v8, v12 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; GCN-NEXT: v_or_b32_e32 v13, v13, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v11 -; GCN-NEXT: v_or_b32_e32 v14, v14, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v10 -; GCN-NEXT: v_or_b32_e32 v15, v15, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v2 -; GCN-NEXT: v_or_b32_e32 v16, v16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v41, v22 -; GCN-NEXT: v_or_b32_e32 v40, v40, v36 -; GCN-NEXT: v_or_b32_e32 v54, v54, v42 -; GCN-NEXT: v_or_b32_e32 v52, v52, v55 -; GCN-NEXT: v_or_b32_e32 v8, v50, v53 -; GCN-NEXT: v_or_b32_e32 v20, v20, v51 -; GCN-NEXT: v_or_b32_e32 v21, v21, v49 -; GCN-NEXT: v_or_b32_e32 v29, v29, v34 -; GCN-NEXT: v_or_b32_e32 
v27, v27, v30 -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: v_or_b32_e32 v39, v23, v26 -; GCN-NEXT: v_or_b32_e32 v35, v32, v24 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_or_b32_e32 v56, v31, v33 -; GCN-NEXT: v_or_b32_e32 v18, v18, v46 -; GCN-NEXT: v_or_b32_e32 v17, v17, v47 -; GCN-NEXT: v_or_b32_e32 v19, v19, v58 -; GCN-NEXT: v_or_b32_e32 v7, v7, v59 -; GCN-NEXT: v_or_b32_e32 v6, v6, v60 -; GCN-NEXT: v_or_b32_e32 v5, v5, v62 -; GCN-NEXT: v_or_b32_e32 v4, v4, v43 -; GCN-NEXT: v_or_b32_e32 v3, v3, v44 -; GCN-NEXT: v_alignbit_b32 v44, v3, v22, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v36, 16 -; GCN-NEXT: v_alignbit_b32 v42, v5, v42, 16 -; GCN-NEXT: v_alignbit_b32 v55, v6, v55, 16 -; GCN-NEXT: v_alignbit_b32 v53, v7, v53, 16 -; GCN-NEXT: v_alignbit_b32 v51, v19, v51, 16 -; GCN-NEXT: v_alignbit_b32 v22, v17, v49, 16 -; GCN-NEXT: v_alignbit_b32 v23, v18, v34, 16 -; GCN-NEXT: v_alignbit_b32 v30, v16, v30, 16 -; GCN-NEXT: v_alignbit_b32 v28, v15, v28, 16 -; GCN-NEXT: v_alignbit_b32 v26, v14, v26, 16 -; GCN-NEXT: v_alignbit_b32 v24, v13, v24, 16 -; GCN-NEXT: v_alignbit_b32 v36, v12, v33, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v44 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v43 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v52, 0xffff, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 
16, v55 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v49, v49, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v40, v40, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v2, v4, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v54, v54, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v63 -; GCN-NEXT: v_or_b32_e32 v5, v5, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v52, v52, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v6, v6, v11 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v8, v8, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 64, v0 -; GCN-NEXT: 
v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v20, v20, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v19, v19, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v61 -; GCN-NEXT: v_or_b32_e32 v17, v17, v58 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v23, v29, v23 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_or_b32_e32 v18, v18, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v16, v16, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x64, v0 -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: v_or_b32_e32 v15, v15, v37 -; GCN-NEXT: v_or_b32_e32 v26, v39, v26 -; GCN-NEXT: v_or_b32_e32 v14, v14, v59 -; GCN-NEXT: v_or_b32_e32 v24, v35, v24 -; GCN-NEXT: v_or_b32_e32 v13, v13, v57 -; GCN-NEXT: v_or_b32_e32 v28, v56, v36 -; GCN-NEXT: v_or_b32_e32 v12, v12, v45 -; GCN-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v40, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v52f16_to_v52i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:88 +; SI-NEXT: v_cvt_f16_f32_e32 
v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v36 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 +; SI-NEXT: s_waitcnt vmcnt(11) +; 
SI-NEXT: v_cvt_f16_f32_e32 v63, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v53 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v54 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v31, v34 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v37, v22 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_mov_b32_e32 v49, v5 +; SI-NEXT: v_mov_b32_e32 v60, v7 +; SI-NEXT: v_mov_b32_e32 v62, v8 +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 
+; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, 
v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_or_b32_e32 v2, v34, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v33, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v34 +; SI-NEXT: v_or_b32_e32 v62, v35, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v60 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, 
v35 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v49 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v60, v34, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v49, v33, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v55 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v29 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v37 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v39 +; SI-NEXT: v_or_b32_e32 v22, v22, v29 +; SI-NEXT: v_or_b32_e32 v37, v33, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v59 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v58 +; SI-NEXT: v_or_b32_e32 v59, v28, v26 +; SI-NEXT: v_or_b32_e32 v39, v35, v55 +; SI-NEXT: v_or_b32_e32 v30, v30, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v33 +; SI-NEXT: 
v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 +; SI-NEXT: v_or_b32_e32 v58, v28, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v31 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v28 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_or_b32_e32 v31, v25, v57 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 +; SI-NEXT: v_or_b32_e32 v54, v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v48 +; SI-NEXT: v_or_b32_e32 v53, v3, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v61 +; SI-NEXT: v_or_b32_e32 v14, v14, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_or_b32_e32 v12, v12, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v18, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v21, v21, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v13 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_or_b32_e32 v6, v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v50 +; SI-NEXT: v_or_b32_e32 v36, v33, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v51 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 
+; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_or_b32_e32 v52, v25, v33 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v50 +; SI-NEXT: v_or_b32_e32 v51, v28, v25 +; SI-NEXT: v_alignbit_b32 v45, v51, v38, 16 +; SI-NEXT: v_alignbit_b32 v44, v52, v44, 16 +; SI-NEXT: v_alignbit_b32 v43, v6, v42, 16 +; SI-NEXT: v_alignbit_b32 v42, v4, v41, 16 +; SI-NEXT: v_alignbit_b32 v41, v15, v46, 16 +; SI-NEXT: v_alignbit_b32 v40, v21, v55, 16 +; SI-NEXT: v_alignbit_b32 v55, v23, v29, 16 +; SI-NEXT: v_alignbit_b32 v29, v18, v47, 16 +; SI-NEXT: v_alignbit_b32 v28, v12, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, v14, v26, 16 +; SI-NEXT: v_alignbit_b32 v26, v9, v56, 16 +; SI-NEXT: v_alignbit_b32 v25, v53, v24, 16 +; SI-NEXT: v_alignbit_b32 v24, v54, v57, 16 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v50 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v44 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v33, v33, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 +; SI-NEXT: v_add_i32_e32 v33, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v52f16_to_v52i16: ; VI: ; %bb.0: @@ -21369,7 +44638,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v44, 0x200, v44 @@ -21423,7 +44692,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 ; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 ; VI-NEXT: v_or_b32_sdwa v0, v0, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -21523,7 +44792,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v25, v44, v25, s6 @@ -21605,7 +44874,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v23 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v25 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v21, v40, v21, s4 @@ -21650,7 +44919,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] @@ -21678,7 +44947,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -21716,7 +44985,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; 
GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v25, v68, v25, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v24, v67, v24, 0x5040100 @@ -21796,7 +45065,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v23 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v27, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v28, v1, 0x5040100 @@ -21841,3 +45110,1387 @@ end: %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <52 x i16> %phi } + +define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v52f16_to_v52i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v43, off, 
s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s20 +; 
SI-NEXT: v_cvt_f16_f32_e32 v25, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s29 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v43 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v47 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v41 +; 
SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v53 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v51 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v53 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 
+; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v3, v3, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_or_b32_e32 v9, v9, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_or_b32_e32 v14, v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_or_b32_e32 v12, v12, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_or_b32_e32 v17, v17, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_or_b32_e32 v36, v36, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 
v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v32 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v34, v34, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 
+; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v48, v29, v48 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v50, v50, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_or_b32_e32 v19, v19, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v40 +; SI-NEXT: v_or_b32_e32 v24, v24, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v43 +; SI-NEXT: v_or_b32_e32 v26, v26, v45 +; SI-NEXT: v_or_b32_e32 v21, v21, v30 +; SI-NEXT: v_or_b32_e32 v20, v20, v41 +; SI-NEXT: v_or_b32_e32 v49, v49, v46 +; SI-NEXT: v_or_b32_e32 v37, v37, v55 +; SI-NEXT: v_or_b32_e32 v35, v35, v54 +; SI-NEXT: v_or_b32_e32 v33, v33, v47 +; SI-NEXT: v_or_b32_e32 v15, v15, v52 +; SI-NEXT: v_or_b32_e32 v13, v13, v51 +; SI-NEXT: v_or_b32_e32 v11, v11, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v28 +; SI-NEXT: v_or_b32_e32 v4, v4, v57 +; SI-NEXT: v_alignbit_b32 v44, v24, v43, 16 +; SI-NEXT: v_alignbit_b32 v43, v25, v45, 16 +; SI-NEXT: v_alignbit_b32 v42, v19, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v50, v41, 16 +; SI-NEXT: v_alignbit_b32 v41, v48, v46, 16 +; 
SI-NEXT: v_alignbit_b32 v40, v34, v55, 16 +; SI-NEXT: v_alignbit_b32 v55, v36, v54, 16 +; SI-NEXT: v_alignbit_b32 v54, v17, v47, 16 +; SI-NEXT: v_alignbit_b32 v53, v12, v52, 16 +; SI-NEXT: v_alignbit_b32 v52, v14, v51, 16 +; SI-NEXT: v_alignbit_b32 v51, v9, v56, 16 +; SI-NEXT: v_alignbit_b32 v29, v3, v28, 16 +; SI-NEXT: v_alignbit_b32 v28, v5, v57, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v27, v27, v44 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v42 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 +; SI-NEXT: 
buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v40 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v31 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v54 +; SI-NEXT: v_and_b32_e32 v17, 
0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 64, v0 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v52f16_to_v52i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v38, 0x200 +; VI-NEXT: v_add_f16_e32 v36, s16, v38 +; VI-NEXT: v_add_f16_e32 v43, s43, v38 +; VI-NEXT: v_add_f16_e32 v37, s17, v38 +; VI-NEXT: v_add_f16_e32 v42, s42, v38 +; VI-NEXT: v_add_f16_e32 v26, s18, v38 +; VI-NEXT: v_add_f16_e32 v41, s41, v38 +; VI-NEXT: v_add_f16_e32 v27, s19, v38 +; VI-NEXT: v_add_f16_e32 v40, s40, v38 +; VI-NEXT: v_add_f16_e32 v28, s20, v38 +; VI-NEXT: v_add_f16_e32 v55, s15, v38 +; VI-NEXT: 
v_add_f16_e32 v29, s21, v38 +; VI-NEXT: v_add_f16_e32 v54, s14, v38 +; VI-NEXT: v_add_f16_e32 v30, s22, v38 +; VI-NEXT: v_add_f16_e32 v53, s13, v38 +; VI-NEXT: v_add_f16_e32 v31, s23, v38 +; VI-NEXT: v_add_f16_e32 v52, s12, v38 +; VI-NEXT: v_add_f16_e32 v32, s24, v38 +; VI-NEXT: v_add_f16_e32 v51, s11, v38 +; VI-NEXT: v_add_f16_e32 v33, s25, v38 +; VI-NEXT: v_add_f16_e32 v50, s10, v38 +; VI-NEXT: v_add_f16_e32 v34, s26, v38 +; VI-NEXT: v_add_f16_e32 v49, s9, v38 +; VI-NEXT: v_add_f16_e32 v35, s27, v38 +; VI-NEXT: v_add_f16_e32 v48, s8, v38 +; VI-NEXT: v_add_f16_e32 v12, s28, v38 +; VI-NEXT: v_add_f16_e32 v39, s7, v38 +; VI-NEXT: v_add_f16_e32 v13, s29, v38 +; VI-NEXT: v_add_f16_e32 v38, s6, v38 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v38, s6 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: v_mov_b32_e32 v39, s7 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v48, s8 +; VI-NEXT: v_mov_b32_e32 v35, 
s27 +; VI-NEXT: v_mov_b32_e32 v49, s9 +; VI-NEXT: v_mov_b32_e32 v34, s26 +; VI-NEXT: v_mov_b32_e32 v50, s10 +; VI-NEXT: v_mov_b32_e32 v33, s25 +; VI-NEXT: v_mov_b32_e32 v51, s11 +; VI-NEXT: v_mov_b32_e32 v32, s24 +; VI-NEXT: v_mov_b32_e32 v52, s12 +; VI-NEXT: v_mov_b32_e32 v31, s23 +; VI-NEXT: v_mov_b32_e32 v53, s13 +; VI-NEXT: v_mov_b32_e32 v30, s22 +; VI-NEXT: v_mov_b32_e32 v54, s14 +; VI-NEXT: v_mov_b32_e32 v29, s21 +; VI-NEXT: v_mov_b32_e32 v55, s15 +; VI-NEXT: v_mov_b32_e32 v28, s20 +; VI-NEXT: v_mov_b32_e32 v40, s40 +; VI-NEXT: v_mov_b32_e32 v27, s19 +; VI-NEXT: v_mov_b32_e32 v41, s41 +; VI-NEXT: v_mov_b32_e32 v26, s18 +; VI-NEXT: v_mov_b32_e32 v42, s42 +; VI-NEXT: v_mov_b32_e32 v37, s17 +; VI-NEXT: v_mov_b32_e32 v43, s43 +; VI-NEXT: v_mov_b32_e32 v36, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v36, v36, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v37, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v28, v28, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v32, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v34, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v35, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v36 +; VI-NEXT: v_mov_b32_e32 v1, v37 +; VI-NEXT: v_mov_b32_e32 v2, v26 +; VI-NEXT: v_mov_b32_e32 v3, v27 +; VI-NEXT: v_mov_b32_e32 v4, v28 +; VI-NEXT: v_mov_b32_e32 v5, v29 +; VI-NEXT: v_mov_b32_e32 v6, v30 +; VI-NEXT: v_mov_b32_e32 v7, v31 +; VI-NEXT: v_mov_b32_e32 v8, v32 +; VI-NEXT: v_mov_b32_e32 v9, v33 +; VI-NEXT: v_mov_b32_e32 v10, v34 +; VI-NEXT: v_mov_b32_e32 v11, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v52f16_to_v52i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; 
GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v25, 16, v11 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v10, v24, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 +; GFX9-NEXT: 
v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 +; GFX9-NEXT: v_pk_add_f16 v13, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v12, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v35, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v34, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v33, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v32, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v31, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v30, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v29, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: 
v_pk_add_f16 v28, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 v27, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v26, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v37, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v36, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v35, s27 +; GFX9-NEXT: v_mov_b32_e32 v34, s26 +; GFX9-NEXT: v_mov_b32_e32 v33, s25 +; GFX9-NEXT: v_mov_b32_e32 v32, s24 +; GFX9-NEXT: v_mov_b32_e32 v31, s23 +; GFX9-NEXT: 
v_mov_b32_e32 v30, s22 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: v_mov_b32_e32 v26, s18 +; GFX9-NEXT: v_mov_b32_e32 v37, s17 +; GFX9-NEXT: v_mov_b32_e32 v36, s16 +; GFX9-NEXT: v_mov_b32_e32 v38, s43 +; GFX9-NEXT: v_mov_b32_e32 v39, s42 +; GFX9-NEXT: v_mov_b32_e32 v48, s41 +; GFX9-NEXT: v_mov_b32_e32 v49, s40 +; GFX9-NEXT: v_mov_b32_e32 v50, s15 +; GFX9-NEXT: v_mov_b32_e32 v51, s14 +; GFX9-NEXT: v_mov_b32_e32 v52, s13 +; GFX9-NEXT: v_mov_b32_e32 v53, s12 +; GFX9-NEXT: v_mov_b32_e32 v54, s11 +; GFX9-NEXT: v_mov_b32_e32 v55, s10 +; GFX9-NEXT: v_mov_b32_e32 v40, s9 +; GFX9-NEXT: v_mov_b32_e32 v41, s8 +; GFX9-NEXT: v_mov_b32_e32 v42, s7 +; GFX9-NEXT: v_mov_b32_e32 v43, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v36, v43, 16, v36 +; GFX9-NEXT: v_lshl_or_b32 v37, v42, 16, v37 +; GFX9-NEXT: v_lshl_or_b32 v26, v41, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v27 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: 
v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v28, v55, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v30, v53, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v31, v52, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v32, v51, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v50, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v34, v49, 16, v34 +; GFX9-NEXT: v_lshl_or_b32 v35, v48, 16, v35 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v36 +; GFX9-NEXT: v_mov_b32_e32 v1, v37 +; GFX9-NEXT: v_mov_b32_e32 v2, v26 +; GFX9-NEXT: v_mov_b32_e32 v3, v27 +; GFX9-NEXT: v_mov_b32_e32 v4, v28 +; GFX9-NEXT: v_mov_b32_e32 v5, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v30 +; GFX9-NEXT: v_mov_b32_e32 v7, v31 +; GFX9-NEXT: v_mov_b32_e32 v8, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-NEXT: v_mov_b32_e32 v10, v34 +; GFX9-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: 
bitcast_v52f16_to_v52i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v10, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 
v4, v8, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v9, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v11, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s15, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s14, s11 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s12, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s11, s7 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 
16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v52.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v53.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v65.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v67.l +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s16 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s10 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v25, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, 
v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v51, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v36, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v23, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v24, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v38, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v34, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v35, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v48, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v21, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v22, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v23, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v4, 16, v5 +; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v1, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v50, 16, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v39, 16, v28 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v49, 16, v52 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v37, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v26 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v52i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v30, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v33, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v32 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v65, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s29 :: v_dual_mov_b32 v17, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s27 :: v_dual_mov_b32 v12, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s23 :: v_dual_mov_b32 v8, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s17 :: v_dual_mov_b32 v27, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v29, s2 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v33, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, s43 :: v_dual_mov_b32 v37, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s41 :: v_dual_mov_b32 v39, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s15 :: v_dual_mov_b32 v49, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_mov_b32 v51, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s11 :: v_dual_mov_b32 v53, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v55, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s6 :: v_dual_mov_b32 v65, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s4 :: v_dual_mov_b32 v67, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v67, 16, v33 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v65, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v54, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v64, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v55, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v53, 16, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v50, 16, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 
:: v_dual_and_b32 v2, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v52, 16, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v66, 16, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v36, 16, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v49, 16, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v35, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v34, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v37, 16, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v26 :: v_dual_mov_b32 v3, v27 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v28 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <52 x half> %a, splat (half 0xH0200) + %a2 = bitcast <52 x half> %a1 to <52 x 
i16> + br label %end + +cmp.false: + %a3 = bitcast <52 x half> %a to <52 x i16> + br label %end + +end: + %phi = phi <52 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <52 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index cdbe26b309831..66242a3cf45d8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -1,52 +1,52 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <28 x float> @bitcast_v28i32_to_v28f32(<28 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i32_to_v28f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 
v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i32_to_v28f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: 
v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i32_to_v28f32: ; VI: ; %bb.0: @@ -188,47 +188,337 @@ end: ret <28 x float> %phi } +define inreg <28 x float> @bitcast_v28i32_to_v28f32_scalar(<28 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i32_to_v28f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; 
SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v28i32_to_v28f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, 
v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: 
.LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v28i32_to_v28f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: 
v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v28i32_to_v28f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; 
GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i32> %a, splat (i32 3) + %a2 = bitcast <28 x i32> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <28 x i32> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { -; GCN-LABEL: 
bitcast_v28f32_to_v28i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f32_to_v28i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 
1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f32_to_v28i32: ; VI: ; %bb.0: @@ -237,7 +527,7 @@ define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -267,7 +557,7 @@ define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -278,7 +568,7 @@ define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> 
%a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -308,7 +598,7 @@ define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -320,7 +610,7 @@ define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -336,7 +626,7 @@ define <28 x i32> @bitcast_v28f32_to_v28i32(<28 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -356,47 +646,323 @@ end: ret <28 x i32> %phi } +define inreg <28 x i32> @bitcast_v28f32_to_v28i32_scalar(<28 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f32_to_v28i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: 
v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, 
v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v28f32_to_v28i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; 
VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v28f32_to_v28i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; 
GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v28f32_to_v28i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: 
v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; 
GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <28 x float> %a1 to <28 x i32> + br label %end + +cmp.false: + %a3 = bitcast <28 x float> %a to <28 x i32> + br label %end + +end: + %phi = phi <28 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i32> %phi +} + define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i32_to_v14i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; 
GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i32_to_v14i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i32_to_v14i64: ; VI: ; 
%bb.0: @@ -405,7 +971,7 @@ define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 @@ -435,7 +1001,7 @@ define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -446,7 +1012,7 @@ define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 @@ -476,7 +1042,7 @@ define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -488,7 +1054,7 @@ define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 ; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 @@ -518,7 +1084,7 @@ define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x 
i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -538,71 +1104,361 @@ end: ret <14 x i64> %phi } -define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i64_to_v28i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; 
GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <14 x i64> @bitcast_v28i32_to_v14i64_scalar(<28 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i32_to_v14i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, 
vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 ; -; VI-LABEL: bitcast_v14i64_to_v28i32: +; VI-LABEL: bitcast_v28i32_to_v14i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 
+; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 
v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v28i32_to_v14i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: 
v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v28i32_to_v14i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 
+; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i32> %a, splat (i32 3) + %a2 = bitcast <28 x i32> %a1 to <14 x i64> + br label %end + +cmp.false: + %a3 = bitcast <28 x i32> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} + +define <28 x i32> 
@bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v14i64_to_v28i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14i64_to_v28i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 @@ -617,7 +1473,7 @@ define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -628,7 +1484,7 @@ define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 ; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc @@ -658,7 +1514,7 @@ define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; 
GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -670,7 +1526,7 @@ define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -707,7 +1563,7 @@ define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -727,47 +1583,344 @@ end: ret <28 x i32> %phi } +define inreg <28 x i32> @bitcast_v14i64_to_v28i32_scalar(<14 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i64_to_v28i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: 
v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v14i64_to_v28i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: 
v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; 
VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v14i64_to_v28i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, 
vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v14i64_to_v28i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: 
v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <28 x i32> + br label %end + +cmp.false: + %a3 = bitcast <14 x i64> %a to <28 x i32> + br label %end + +end: + %phi = phi <28 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i32> %phi +} + define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i32_to_v14f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, 
v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i32_to_v14f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 
v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i32_to_v14f64: ; VI: ; %bb.0: @@ -776,7 +1929,7 @@ define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 @@ -806,7 +1959,7 @@ define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -817,7 +1970,7 @@ define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; 
GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 @@ -847,7 +2000,7 @@ define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -859,7 +2012,7 @@ define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 ; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 @@ -889,7 +2042,7 @@ define <14 x double> @bitcast_v28i32_to_v14f64(<28 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -909,42 +2062,332 @@ end: ret <14 x double> %phi } -define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f64_to_v28i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 
v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <14 x double> @bitcast_v28i32_to_v14f64_scalar(<28 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i32_to_v14f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, 
vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 ; -; VI-LABEL: bitcast_v14f64_to_v28i32: +; VI-LABEL: bitcast_v28i32_to_v14f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; 
VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 
s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v28i32_to_v14f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; 
GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v28i32_to_v14f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; 
GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i32> %a, splat (i32 3) + %a2 = bitcast <28 x i32> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <28 x i32> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v14f64_to_v28i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f64_to_v28i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -960,7 +2403,7 @@ define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -971,7 +2414,7 @@ define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; 
GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -987,7 +2430,7 @@ define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -999,7 +2442,7 @@ define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -1015,7 +2458,7 @@ define <28 x i32> @bitcast_v14f64_to_v28i32(<14 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: .LBB10_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1035,312 +2478,569 @@ end: ret <28 x i32> %phi } +define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f64_to_v28i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; 
SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: v_mov_b32_e32 v15, v29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v14f64_to_v28i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, 
v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v29, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v14f64_to_v28i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: 
v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v29, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: v_mov_b32_e32 v15, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: 
.LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v14f64_to_v28i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], 
v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <14 x double> %a1 to <28 x i32> + br label %end + +cmp.false: + %a3 = bitcast <14 x double> %a to <28 x i32> + br label %end + +end: + %phi = phi <28 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i32> %phi +} + define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i32_to_v56i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, 
s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v48, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v50, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v52, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v41, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; 
GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 
3, v27 -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v48, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v50, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v52, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v41, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v60, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(1) -; 
GCN-NEXT: v_and_b32_e32 v61, 0xffff, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v41 -; GCN-NEXT: v_or_b32_e32 v5, v57, v45 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v41, v58, v56 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v59, v43 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v43, v60, v47 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v47, v61, v62 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v45, v45, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v56, v56, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v44, v58, v44 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v9, v9, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v11, v11, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; 
GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v13, v13, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v14, v14, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v15, v15, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v16, v16, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v17, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v19, v19, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v20, v20, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v22, v22, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v23, v23, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x68, v0 -; 
GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v24, v24, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x6c, v0 -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v29 -; GCN-NEXT: v_or_b32_e32 v28, v28, v34 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28i32_to_v56i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, 
s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v39, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v49, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v54, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt 
expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: 
v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v39, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v49, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v54, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28i32_to_v56i16: ; VI: ; %bb.0: @@ -1384,7 +3084,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -1414,9 +3114,9 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 
v46, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 @@ -1474,7 +3174,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 ; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 @@ -1585,7 +3285,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -1615,9 +3315,9 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow +; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 @@ -1675,7 +3375,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 @@ -1725,7 +3425,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 @@ -1755,7 +3455,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1794,7 +3494,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -1824,9 +3524,9 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v27, 3, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 @@ -1884,7 +3584,7 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 @@ -1933,522 +3633,1672 @@ end: ret <56 x i16> %phi } +define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i32_to_v56i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v1 +; SI-NEXT: v_readfirstlane_b32 s42, v2 +; SI-NEXT: v_readfirstlane_b32 s41, v3 +; SI-NEXT: v_readfirstlane_b32 s40, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s7, v13 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: 
v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s29, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s25, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s23, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s21, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s19, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s17, v14, 16 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_lshr_b32 s45, s8, 16 +; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: s_lshr_b32 s47, s12, 16 +; SI-NEXT: s_lshr_b32 s56, s14, 16 +; SI-NEXT: s_lshr_b32 s57, s40, 16 +; SI-NEXT: s_lshr_b32 s58, s42, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s60, s27, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_lshr_b32 s62, s23, 16 +; SI-NEXT: s_lshr_b32 s63, s21, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 16 +; SI-NEXT: s_lshr_b32 s73, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: 
s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s29, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s25, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s23, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s21, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s19, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s17, v14, 16 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_lshr_b32 s45, s8, 16 +; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: s_lshr_b32 s47, s12, 16 +; SI-NEXT: s_lshr_b32 s56, s14, 16 +; SI-NEXT: s_lshr_b32 s57, s40, 16 +; SI-NEXT: s_lshr_b32 s58, s42, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s60, s27, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_lshr_b32 s62, s23, 16 +; SI-NEXT: s_lshr_b32 s63, s21, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 16 +; SI-NEXT: s_lshr_b32 s73, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 
+; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v14, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; 
SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: 
v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: s_branch .LBB13_2 +; +; VI-LABEL: bitcast_v28i32_to_v56i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v28, s30, 0 +; 
VI-NEXT: v_writelane_b32 v28, s31, 1 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_writelane_b32 v28, s34, 2 +; VI-NEXT: v_readfirstlane_b32 s43, v0 +; VI-NEXT: v_readfirstlane_b32 s42, v1 +; VI-NEXT: v_readfirstlane_b32 s41, v2 +; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_readfirstlane_b32 s15, v4 +; VI-NEXT: v_readfirstlane_b32 s14, v5 +; VI-NEXT: v_readfirstlane_b32 s13, v6 +; VI-NEXT: v_readfirstlane_b32 s12, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s10, v9 +; VI-NEXT: v_readfirstlane_b32 s9, v10 +; VI-NEXT: v_readfirstlane_b32 s8, v11 +; VI-NEXT: v_readfirstlane_b32 s6, v12 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v13 +; VI-NEXT: v_writelane_b32 v28, s35, 3 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 
+; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s35, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: 
s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s34, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s31, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s30, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s91, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s90, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s89, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s88, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s79, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s78, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s77, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s76, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s75, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s74, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s29, s73, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s42, s72, 16 +; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s42, s63, 16 +; VI-NEXT: s_or_b32 s41, s41, s42 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s42, s62, 16 +; VI-NEXT: s_or_b32 s40, s40, s42 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s42, s61, 16 +; VI-NEXT: s_or_b32 s15, s15, s42 +; VI-NEXT: s_and_b32 s14, 0xffff, 
s14 +; VI-NEXT: s_lshl_b32 s42, s60, 16 +; VI-NEXT: s_or_b32 s14, s14, s42 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s42, s59, 16 +; VI-NEXT: s_or_b32 s13, s13, s42 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s42, s58, 16 +; VI-NEXT: s_or_b32 s12, s12, s42 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s42, s57, 16 +; VI-NEXT: s_or_b32 s11, s11, s42 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s42, s56, 16 +; VI-NEXT: s_or_b32 s10, s10, s42 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s42, s47, 16 +; VI-NEXT: s_or_b32 s9, s9, s42 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s42, s46, 16 +; VI-NEXT: s_or_b32 s8, s8, s42 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s42, s45, 16 +; VI-NEXT: s_or_b32 s6, s6, s42 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s42, s44, 16 +; VI-NEXT: s_or_b32 s7, s7, s42 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s41 +; VI-NEXT: v_mov_b32_e32 v17, s40 +; VI-NEXT: v_mov_b32_e32 v18, s15 +; VI-NEXT: v_mov_b32_e32 v19, s14 +; VI-NEXT: v_mov_b32_e32 v20, s13 +; VI-NEXT: v_mov_b32_e32 v21, s12 +; VI-NEXT: v_mov_b32_e32 v22, s11 +; VI-NEXT: v_mov_b32_e32 v23, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v25, s8 +; VI-NEXT: v_mov_b32_e32 v26, s6 +; VI-NEXT: v_mov_b32_e32 v27, s7 +; VI-NEXT: v_readlane_b32 s35, v28, 3 +; VI-NEXT: v_readlane_b32 s34, v28, 2 +; VI-NEXT: 
v_readlane_b32 s31, v28, 1 +; VI-NEXT: v_readlane_b32 s30, v28, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v28i32_to_v56i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: 
v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s43, s43, 3 +; GFX9-NEXT: s_add_i32 s42, s42, 3 +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, 
s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, 
s27, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s44 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: 
$sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-TRUE16-LABEL: bitcast_v28i32_to_v56i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, 
s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, 
s13 :: v_dual_mov_b32 v19, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s11 :: v_dual_mov_b32 v21, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s7 :: v_dual_mov_b32 v25, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 +; +; GFX11-FAKE16-LABEL: bitcast_v28i32_to_v56i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-FAKE16-NEXT: s_mov_b32 s90, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i32> %a, splat (i32 3) + %a2 = bitcast <28 x i32> %a1 to <56 x i16> + br label %end + +cmp.false: + %a3 = bitcast <28 x i32> %a to <56 x i16> + br label %end + +end: + %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x i16> %phi +} + define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v56i16_to_v28i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded 
Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; 
GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte 
Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v41 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v63 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v62 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v61 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v60 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; 
GCN-NEXT: v_or_b32_e32 v7, v7, v59 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v58 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v46 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 
v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: 
$vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: 
v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: 
v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v0, v41, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v63, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_or_b32_e32 v5, v61, v5 -; GCN-NEXT: v_or_b32_e32 v6, v60, v6 -; GCN-NEXT: v_or_b32_e32 v7, v59, v7 -; GCN-NEXT: v_or_b32_e32 v8, v58, v8 -; GCN-NEXT: v_or_b32_e32 v9, v56, v9 -; GCN-NEXT: v_or_b32_e32 v10, v47, v10 -; GCN-NEXT: v_or_b32_e32 v11, v46, v11 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v28, v12 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v28, v13 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v28, v17 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v28, v18 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v28, v19 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v28, v20 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v28, v21 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v28, v22 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v28, v23 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v28, v24 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v28, v25 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v28, v26 -; GCN-NEXT: buffer_load_dword 
v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v56i16_to_v28i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 
v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; 
implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v49 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt 
vmcnt(10) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v57 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v6, v6, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v45 +; SI-NEXT: v_or_b32_e32 v9, v9, v44 +; SI-NEXT: v_or_b32_e32 v10, v10, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v63 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v56 +; SI-NEXT: v_or_b32_e32 v18, v18, v47 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v20, v20, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v42 +; SI-NEXT: v_or_b32_e32 v24, v24, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: v_or_b32_e32 v27, v27, v60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: 
$vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v16, v49, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; 
SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; 
SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v34, v10 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_or_b32_e32 v12, v32, v12 +; SI-NEXT: v_or_b32_e32 v13, v63, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v50, v15 +; SI-NEXT: v_or_b32_e32 v17, v56, v17 +; SI-NEXT: v_or_b32_e32 v18, v47, v18 +; SI-NEXT: v_or_b32_e32 v19, v38, v19 +; SI-NEXT: v_or_b32_e32 v20, v36, v20 +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_or_b32_e32 v22, v43, v22 +; SI-NEXT: v_or_b32_e32 v23, v42, v23 +; SI-NEXT: v_or_b32_e32 v24, v41, v24 +; SI-NEXT: v_or_b32_e32 v25, v40, v25 +; SI-NEXT: v_or_b32_e32 v26, v62, v26 +; SI-NEXT: v_or_b32_e32 v27, v60, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56i16_to_v28i32: ; VI: ; %bb.0: @@ -2497,7 +5347,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v27, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, 
v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2584,9 +5434,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v27, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v59 @@ -2673,7 +5523,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v28, 3, v32 ; VI-NEXT: v_add_u16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -2788,7 +5638,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload @@ -2928,9 +5778,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: 
.LBB7_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload @@ -3032,7 +5882,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 ; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -3061,7 +5911,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -3091,7 +5941,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3159,7 +6009,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 
; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] @@ -3189,7 +6039,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -3209,640 +6059,1860 @@ end: ret <28 x i32> %phi } -define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v28i32_to_v56f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; 
implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GCN-NEXT: 
v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v49 -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v40 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_mov_b32_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v46 -; GCN-NEXT: v_mov_b32_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v56 -; GCN-NEXT: v_mov_b32_e32 v56, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v58 -; GCN-NEXT: v_mov_b32_e32 v58, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v60 -; GCN-NEXT: v_mov_b32_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v62 -; GCN-NEXT: v_mov_b32_e32 v62, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v57 -; GCN-NEXT: v_mov_b32_e32 v57, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v59 -; GCN-NEXT: v_mov_b32_e32 v59, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; 
GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v63 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 
-; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 
4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v39 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v37 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v53 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v13, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v38 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v36 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v21, v21, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v32 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v23, v23, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v25, v25, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v61 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v59 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v57 -; 
GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v47 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v45 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v43 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v42 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v46 -; GCN-NEXT: buffer_load_dword 
v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v56 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v58 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x68, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v60 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x6c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v62 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_or_b32_e32 v45, v47, v46 -; GCN-NEXT: v_or_b32_e32 v46, v57, v56 -; GCN-NEXT: v_or_b32_e32 v47, v59, v58 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen 
-; GCN-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56i16_to_v28i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v32, v26 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 +; SI-NEXT: 
v_lshlrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v13, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v14, 
v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v16, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; 
SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, 
v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 
+; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 
+; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v41 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mov_b32_e32 v41, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; 
SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v57, v46 +; SI-NEXT: v_mov_b32_e32 v46, v61 +; SI-NEXT: v_mov_b32_e32 v61, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v47 +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 +; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v46, v57 +; SI-NEXT: v_mov_b32_e32 v57, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte 
Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v56, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v28i32_to_v56f16: +; VI-LABEL: bitcast_v56i16_to_v28i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; 
VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 
16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; 
VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; 
VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB15_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 +; +; GFX9-LABEL: bitcast_v56i16_to_v28i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; 
GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 
s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: 
v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; 
GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 
v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 
vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, 
v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <28 x i32> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <28 x i32> + br label %end + +end: + %phi = phi <28 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i32> %phi +} + +define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v28i32_to_v56f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; 
kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: 
v_cvt_f32_f16_e32 v61, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 
+; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, 
off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: 
v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: 
v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 
+; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_mov_b32_e32 v29, v28 +; SI-NEXT: v_mov_b32_e32 v57, v25 +; SI-NEXT: v_mov_b32_e32 v47, v26 +; SI-NEXT: v_mov_b32_e32 v45, v27 +; SI-NEXT: v_mov_b32_e32 v43, v1 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, 
off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 
+; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 
offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v28i32_to_v56f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill @@ -3879,7 +7949,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_cbranch_execz .LBB16_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -3909,9 +7979,9 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 +; VI-NEXT: s_cbranch_execz .LBB16_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 @@ -3969,7 +8039,7 @@ define <56 x half> 
@bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB8_4: ; %end +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 ; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 @@ -4080,7 +8150,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -4110,9 +8180,9 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 +; GFX9-NEXT: s_cbranch_execz .LBB16_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 @@ -4170,7 +8240,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 @@ -4220,7 +8290,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 @@ -4250,7 +8320,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB8_2: ; %end +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4289,7 +8359,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -4319,9 +8389,9 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 @@ -4379,7 +8449,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end +; GFX11-FAKE16-NEXT: 
.LBB16_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 @@ -4428,704 +8498,2030 @@ end: ret <56 x half> %phi } +define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28i32_to_v56f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v1 +; SI-NEXT: v_readfirstlane_b32 s42, v2 +; SI-NEXT: v_readfirstlane_b32 s41, v3 +; SI-NEXT: v_readfirstlane_b32 s40, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_readfirstlane_b32 s7, v12 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: 
v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 
3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: s_lshr_b32 s45, s19, 16 +; SI-NEXT: s_lshr_b32 s46, s20, 16 +; SI-NEXT: s_lshr_b32 s47, s21, 16 +; SI-NEXT: s_lshr_b32 s56, s22, 16 +; SI-NEXT: s_lshr_b32 s57, s23, 16 +; SI-NEXT: s_lshr_b32 s58, s24, 16 +; SI-NEXT: s_lshr_b32 s59, s25, 16 +; SI-NEXT: s_lshr_b32 s60, s26, 16 +; SI-NEXT: s_lshr_b32 s61, s27, 16 +; SI-NEXT: s_lshr_b32 s62, s28, 16 +; SI-NEXT: s_lshr_b32 s63, s29, 16 +; SI-NEXT: s_lshr_b32 s72, s43, 16 +; SI-NEXT: s_lshr_b32 s73, s42, 16 +; SI-NEXT: s_lshr_b32 s74, s41, 16 +; SI-NEXT: s_lshr_b32 s75, s40, 16 +; SI-NEXT: s_lshr_b32 s76, s15, 16 +; SI-NEXT: s_lshr_b32 s77, s14, 16 +; SI-NEXT: s_lshr_b32 s78, s13, 16 +; SI-NEXT: s_lshr_b32 s79, s12, 16 +; SI-NEXT: s_lshr_b32 s88, s11, 16 +; SI-NEXT: s_lshr_b32 s89, s10, 16 +; SI-NEXT: s_lshr_b32 s90, s8, 16 +; SI-NEXT: s_lshr_b32 s91, s7, 16 +; SI-NEXT: s_lshr_b32 s92, s6, 16 +; SI-NEXT: s_lshr_b32 s93, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 
v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: 
v_or_b32_e32 v47, v47, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v47, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v45, v45, v46 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: buffer_store_dword v45, v47, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v45, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v43, v43, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v45, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v41, v41, v42 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v55, v40, v55 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v53, v54, v53 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_add_i32_e32 v51, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v32, v34, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 
v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v18, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; 
SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v28i32_to_v56f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v28, s30, 0 +; VI-NEXT: v_writelane_b32 v28, s31, 1 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_writelane_b32 v28, s34, 2 +; VI-NEXT: v_readfirstlane_b32 s43, v0 +; VI-NEXT: v_readfirstlane_b32 s42, v1 +; VI-NEXT: v_readfirstlane_b32 s41, v2 +; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_readfirstlane_b32 s15, v4 +; VI-NEXT: v_readfirstlane_b32 s14, v5 +; VI-NEXT: v_readfirstlane_b32 s13, v6 +; VI-NEXT: v_readfirstlane_b32 s12, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s10, v9 +; VI-NEXT: v_readfirstlane_b32 s9, v10 +; VI-NEXT: v_readfirstlane_b32 s8, v11 +; VI-NEXT: v_readfirstlane_b32 s6, v12 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v13 +; VI-NEXT: v_writelane_b32 v28, s35, 3 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: 
s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, 
s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s35, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s34, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s31, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s30, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s91, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s90, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s89, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s88, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s79, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s78, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s77, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 
+; VI-NEXT: s_lshl_b32 s26, s76, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s75, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s74, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s29, s73, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s42, s72, 16 +; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s42, s63, 16 +; VI-NEXT: s_or_b32 s41, s41, s42 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s42, s62, 16 +; VI-NEXT: s_or_b32 s40, s40, s42 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s42, s61, 16 +; VI-NEXT: s_or_b32 s15, s15, s42 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s42, s60, 16 +; VI-NEXT: s_or_b32 s14, s14, s42 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s42, s59, 16 +; VI-NEXT: s_or_b32 s13, s13, s42 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s42, s58, 16 +; VI-NEXT: s_or_b32 s12, s12, s42 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s42, s57, 16 +; VI-NEXT: s_or_b32 s11, s11, s42 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s42, s56, 16 +; VI-NEXT: s_or_b32 s10, s10, s42 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s42, s47, 16 +; VI-NEXT: s_or_b32 s9, s9, s42 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s42, s46, 16 +; VI-NEXT: s_or_b32 s8, s8, s42 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s42, s45, 16 +; VI-NEXT: s_or_b32 s6, s6, s42 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s42, s44, 16 +; VI-NEXT: s_or_b32 s7, s7, s42 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: 
v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s41 +; VI-NEXT: v_mov_b32_e32 v17, s40 +; VI-NEXT: v_mov_b32_e32 v18, s15 +; VI-NEXT: v_mov_b32_e32 v19, s14 +; VI-NEXT: v_mov_b32_e32 v20, s13 +; VI-NEXT: v_mov_b32_e32 v21, s12 +; VI-NEXT: v_mov_b32_e32 v22, s11 +; VI-NEXT: v_mov_b32_e32 v23, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v25, s8 +; VI-NEXT: v_mov_b32_e32 v26, s6 +; VI-NEXT: v_mov_b32_e32 v27, s7 +; VI-NEXT: v_readlane_b32 s35, v28, 3 +; VI-NEXT: v_readlane_b32 s34, v28, 2 +; VI-NEXT: v_readlane_b32 s31, v28, 1 +; VI-NEXT: v_readlane_b32 s30, v28, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: 
$sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v28i32_to_v56f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; 
GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s43, s43, 3 +; GFX9-NEXT: s_add_i32 s42, s42, 3 +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: 
s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s44 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; 
GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-TRUE16-LABEL: bitcast_v28i32_to_v56f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: 
v_readfirstlane_b32 s13, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; 
GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s42 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s7, s7, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s13 :: v_dual_mov_b32 v19, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s11 :: v_dual_mov_b32 v21, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s7 :: v_dual_mov_b32 v25, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: 
; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v28i32_to_v56f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-FAKE16-NEXT: s_mov_b32 s90, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; 
GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i32> %a, splat (i32 3) + %a2 = bitcast <28 x i32> %a1 to <56 x half> + br label %end + +cmp.false: + %a3 = bitcast <28 x i32> %a to <56 x half> + br label %end + +end: + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi +} + define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { -; GCN-LABEL: 
bitcast_v56f16_to_v28i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v49 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v34 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v31 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; GCN-NEXT: v_or_b32_e32 v0, v46, v0 -; GCN-NEXT: v_or_b32_e32 v1, v44, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; GCN-NEXT: v_or_b32_e32 v2, v42, v2 -; GCN-NEXT: 
v_lshlrev_b32_e32 v3, 16, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v51 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: v_or_b32_e32 v16, v55, v16 -; GCN-NEXT: v_or_b32_e32 v17, v52, v17 -; GCN-NEXT: v_or_b32_e32 v18, v50, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: v_or_b32_e32 v20, v38, v20 -; GCN-NEXT: v_or_b32_e32 v21, v36, v21 -; GCN-NEXT: v_or_b32_e32 v22, v34, v22 -; GCN-NEXT: v_or_b32_e32 v23, v33, v23 -; GCN-NEXT: v_or_b32_e32 v24, v35, v24 -; GCN-NEXT: v_or_b32_e32 v25, v37, v25 -; GCN-NEXT: v_or_b32_e32 v26, v39, v26 -; GCN-NEXT: v_or_b32_e32 v27, v49, v27 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; 
implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; 
GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v44 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; 
GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 
v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: 
buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v55 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v52 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v50 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v48 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v27, v38 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v36 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: 
v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_or_b32_e32 v15, v17, v16 -; 
GCN-NEXT: v_or_b32_e32 v16, v19, v18 -; GCN-NEXT: v_or_b32_e32 v17, v21, v20 -; GCN-NEXT: v_or_b32_e32 v18, v23, v22 -; GCN-NEXT: v_or_b32_e32 v19, v25, v24 -; GCN-NEXT: v_or_b32_e32 v20, v27, v26 -; GCN-NEXT: v_or_b32_e32 v21, v29, v28 -; GCN-NEXT: v_or_b32_e32 v22, v31, v30 -; GCN-NEXT: v_or_b32_e32 v23, v33, v32 -; GCN-NEXT: v_or_b32_e32 v24, v35, v34 -; GCN-NEXT: v_or_b32_e32 v25, v37, v36 -; GCN-NEXT: v_or_b32_e32 v26, v39, v38 -; GCN-NEXT: v_or_b32_e32 v27, v49, v48 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: 
s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v56f16_to_v28i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 
offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; 
implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v21, v60, v21 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v23, v48, v23 +; SI-NEXT: v_or_b32_e32 v24, v38, v24 +; SI-NEXT: v_or_b32_e32 v25, v36, v25 +; SI-NEXT: v_or_b32_e32 v26, v34, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 
offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 
16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 +; SI-NEXT: v_or_b32_e32 v20, v62, v20 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 
v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: 
v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, 
v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, 
s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56f16_to_v28i32: ; VI: ; %bb.0: @@ -5174,7 +10570,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v27, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -5261,9 +10657,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v27, 0x200 ; VI-NEXT: 
v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5350,7 +10746,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5465,7 +10861,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB18_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload @@ -5605,9 +11001,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB18_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload @@ -5710,7 +11106,7 @@ define <28 x i32> 
@bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 ; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: .LBB18_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5739,7 +11135,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -5769,7 +11165,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5837,7 +11233,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -5867,7 +11263,7 @@ define 
<28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -5887,47 +11283,1381 @@ end: ret <28 x i32> %phi } +define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56f16_to_v28i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, 
v27 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; 
SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_or_b32_e32 v13, v36, v13 +; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v17, v37, v17 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: 
v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v50, v27 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; 
SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; 
SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, 
v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, 
v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB19_4: +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v52, v37 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v29, v37 +; SI-NEXT: v_mov_b32_e32 v37, v52 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_branch 
.LBB19_2 +; +; VI-LABEL: bitcast_v56f16_to_v28i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, 
s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: 
v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: 
v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v27, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v26, v28, v26 +; 
VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v56f16_to_v28i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, 
v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; 
GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 
v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 
16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 
v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <28 x i32> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <28 x i32> + br label %end + +end: + %phi = phi <28 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i32> %phi +} + define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f32_to_v14i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 
1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f32_to_v14i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 
v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f32_to_v14i64: ; VI: ; %bb.0: @@ -5936,7 +12666,7 @@ define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -5966,7 +12696,7 @@ define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: .LBB20_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -5977,7 +12707,7 @@ define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -6007,7 +12737,7 @@ define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: .LBB20_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6019,7 +12749,7 @@ define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 
s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -6035,7 +12765,7 @@ define <14 x i64> @bitcast_v28f32_to_v14i64(<28 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: .LBB20_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6055,47 +12785,323 @@ end: ret <14 x i64> %phi } +define inreg <14 x i64> @bitcast_v28f32_to_v14i64_scalar(<28 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f32_to_v14i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: 
v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v28f32_to_v14i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: 
v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: 
v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v28f32_to_v14i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: 
v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v28f32_to_v14i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; 
GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <28 x float> %a1 to <14 x i64> + br label %end + +cmp.false: + %a3 = bitcast <28 x float> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} + define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i64_to_v28f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v14i64_to_v28f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; 
SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14i64_to_v28f32: ; VI: ; %bb.0: @@ -6104,7 +13110,7 @@ define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 ; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc @@ -6134,7 +13140,7 @@ define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: 
v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6145,7 +13151,7 @@ define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: s_cbranch_execz .LBB22_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 ; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc @@ -6175,7 +13181,7 @@ define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6187,7 +13193,7 @@ define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -6224,7 +13230,7 @@ define <28 x float> @bitcast_v14i64_to_v28f32(<14 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: .LBB22_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6244,47 +13250,344 @@ end: ret <28 x float> %phi } +define inreg 
<28 x float> @bitcast_v14i64_to_v28f32_scalar(<14 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i64_to_v28f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: 
v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v14i64_to_v28f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 
.LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v14i64_to_v28f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; 
GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: 
v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v14i64_to_v28f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB23_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: 
.LBB23_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <14 x i64> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f32_to_v14f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, 
v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f32_to_v14f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f32_to_v14f64: ; VI: ; %bb.0: @@ -6293,7 +13596,7 @@ define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -6323,7 +13626,7 @@ define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: .LBB24_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6334,7 +13637,7 @@ define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -6364,7 +13667,7 @@ define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: .LBB24_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6376,7 +13679,7 @@ define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -6392,7 +13695,7 @@ define <14 x double> @bitcast_v28f32_to_v14f64(<28 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 
1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end +; GFX11-NEXT: .LBB24_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6412,69 +13715,345 @@ end: ret <14 x double> %phi } -define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f64_to_v28f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <14 x double> @bitcast_v28f32_to_v14f64_scalar(<28 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f32_to_v14f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: 
v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: 
v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 ; -; VI-LABEL: bitcast_v14f64_to_v28f32: +; VI-LABEL: bitcast_v28f32_to_v14f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v14f64_to_v28f32: +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 
v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB25_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v28f32_to_v14f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; 
GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v28f32_to_v14f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: 
s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <28 x float> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <28 x float> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v14f64_to_v28f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: 
v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f64_to_v28f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f64_to_v28f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; 
GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -6490,7 +14069,7 @@ define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6502,7 +14081,7 @@ define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 ; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 @@ -6518,7 +14097,7 @@ define <28 x float> @bitcast_v14f64_to_v28f32(<14 x double> %a, i32 %b) { ; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: .LBB26_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6538,312 +14117,569 @@ end: ret <28 x float> %phi } +define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f64_to_v28f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: 
v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: v_mov_b32_e32 v15, v29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v14f64_to_v28f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; 
VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v29, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v14f64_to_v28f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v29, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB27_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: v_mov_b32_e32 
v15, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v14f64_to_v28f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB27_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: .LBB27_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 
v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <14 x double> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <14 x double> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f32_to_v56i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 
offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v48, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v50, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v52, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v41, v6, v5, 16 -; GCN-NEXT: 
v_alignbit_b32 v43, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, 
v27 -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v48, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v50, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v52, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v41, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v60, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(1) -; 
GCN-NEXT: v_and_b32_e32 v61, 0xffff, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v41 -; GCN-NEXT: v_or_b32_e32 v5, v57, v45 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v41, v58, v56 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v59, v43 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v43, v60, v47 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v47, v61, v62 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v45, v45, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v56, v56, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v44, v58, v44 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v9, v9, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v11, v11, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; 
GCN-NEXT: v_or_b32_e32 v12, v12, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v13, v13, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v14, v14, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v15, v15, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v16, v16, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v17, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v19, v19, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v20, v20, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v22, v22, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v23, v23, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x68, v0 -; 
GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v24, v24, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x6c, v0 -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v29 -; GCN-NEXT: v_or_b32_e32 v28, v28, v34 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v28f32_to_v56i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, 
s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v39, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v49, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v54, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt 
expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; 
SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v39, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v49, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v54, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v28f32_to_v56i16: ; VI: ; %bb.0: @@ -6887,7 +14723,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: s_cbranch_execz .LBB28_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -6917,9 +14753,9 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: 
.LBB14_2: ; %Flow +; VI-NEXT: .LBB28_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB28_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -6977,7 +14813,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: .LBB28_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 ; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 @@ -7088,7 +14924,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: s_cbranch_execz .LBB28_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -7118,9 +14954,9 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 ; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 @@ -7178,7 +15014,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; 
GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 @@ -7228,7 +15064,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -7244,7 +15080,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7283,7 +15119,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 @@ -7313,9 +15149,9 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz 
.LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 @@ -7359,7 +15195,7 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 @@ -7408,633 +15244,556 @@ end: ret <56 x i16> %phi } -define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v56i16_to_v28f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; GCN-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v15 -; GCN-NEXT: 
v_lshlrev_b32_e32 v58, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v41 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v63 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v62 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v61 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v60 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v59 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v58 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, 
v9, v56 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v46 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: 
buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; 
GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 
3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, 
v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v0, v41, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v63, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_or_b32_e32 v5, v61, v5 -; GCN-NEXT: v_or_b32_e32 v6, v60, v6 -; GCN-NEXT: v_or_b32_e32 v7, v59, v7 -; GCN-NEXT: v_or_b32_e32 v8, v58, v8 -; GCN-NEXT: v_or_b32_e32 v9, v56, v9 -; GCN-NEXT: v_or_b32_e32 v10, v47, v10 -; GCN-NEXT: v_or_b32_e32 v11, v46, v11 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v28, v12 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v28, v13 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v28, v17 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v28, v18 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v28, v19 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v28, v20 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v28, v21 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v28, v22 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v28, v23 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v28, v24 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v28, v25 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v28, v26 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: 
v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, 
s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f32_to_v56i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v29, s17 +; SI-NEXT: v_mov_b32_e32 v25, s18 +; SI-NEXT: v_mov_b32_e32 v23, s19 +; SI-NEXT: v_mov_b32_e32 v28, s20 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v26, s21 +; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v22, s23 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v19, s25 +; SI-NEXT: v_mov_b32_e32 v18, s26 +; SI-NEXT: v_mov_b32_e32 v17, s27 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v15, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v39, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v49, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v51, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v54, v22, v24, 16 +; SI-NEXT: v_alignbit_b32 v40, v26, v28, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v23, v25, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v29, v30, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v29 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: 
v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v39, v15, v16, 16 +; SI-NEXT: v_alignbit_b32 v49, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v51, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v54, v22, v24, 16 +; SI-NEXT: v_alignbit_b32 v40, v26, v28, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v23, v25, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v29, v30, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 
+; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v29 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_or_b32_e32 v30, v30, v44 +; SI-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v56 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v42 +; SI-NEXT: v_or_b32_e32 v25, v25, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v25, v29, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v40 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v46 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 +; SI-NEXT: 
v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_branch .LBB29_2 ; -; VI-LABEL: bitcast_v56i16_to_v28f32: +; VI-LABEL: bitcast_v28f32_to_v56i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v27 -; VI-NEXT: v_mov_b32_e32 v33, v26 -; VI-NEXT: v_mov_b32_e32 v34, v25 -; VI-NEXT: v_mov_b32_e32 v35, v24 -; VI-NEXT: v_mov_b32_e32 v36, v23 -; VI-NEXT: v_mov_b32_e32 v37, v22 -; VI-NEXT: v_mov_b32_e32 v38, v21 -; VI-NEXT: v_mov_b32_e32 v39, v20 -; VI-NEXT: v_mov_b32_e32 v48, v19 -; VI-NEXT: v_mov_b32_e32 v49, v18 -; VI-NEXT: v_mov_b32_e32 v50, v17 -; VI-NEXT: v_mov_b32_e32 v51, v16 -; VI-NEXT: v_mov_b32_e32 v52, v15 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v54, v13 -; VI-NEXT: v_mov_b32_e32 v55, v12 -; VI-NEXT: v_mov_b32_e32 v40, v11 -; VI-NEXT: v_mov_b32_e32 v41, v10 -; VI-NEXT: v_mov_b32_e32 v42, v9 -; VI-NEXT: v_mov_b32_e32 v43, v8 -; VI-NEXT: v_mov_b32_e32 v44, v7 -; VI-NEXT: v_mov_b32_e32 v45, v6 -; VI-NEXT: v_mov_b32_e32 v46, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v56, v3 -; VI-NEXT: v_mov_b32_e32 v57, v2 -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 
v17, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v26, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v27, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v24, s24 +; VI-NEXT: v_mov_b32_e32 v23, s25 +; VI-NEXT: v_mov_b32_e32 v22, s26 +; VI-NEXT: v_mov_b32_e32 v21, s27 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB29_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v27, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: 
v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; 
VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: 
v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v20, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_e32 v15, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; VI-NEXT: v_or_b32_sdwa v48, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 +; VI-NEXT: v_or_b32_sdwa v49, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v19, 
v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v28 +; VI-NEXT: v_mov_b32_e32 v1, v29 +; 
VI-NEXT: v_mov_b32_e32 v2, v30 +; VI-NEXT: v_mov_b32_e32 v3, v31 +; VI-NEXT: v_mov_b32_e32 v4, v32 +; VI-NEXT: v_mov_b32_e32 v5, v33 +; VI-NEXT: v_mov_b32_e32 v6, v34 +; VI-NEXT: v_mov_b32_e32 v7, v35 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v10, v38 +; VI-NEXT: v_mov_b32_e32 v11, v39 +; VI-NEXT: v_mov_b32_e32 v12, v48 +; VI-NEXT: v_mov_b32_e32 v13, v49 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr45 @@ -8049,266 +15808,219 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB15_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v27, 3 -; VI-NEXT: v_add_u16_e32 v0, 3, v59 -; VI-NEXT: v_add_u16_sdwa v1, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; 
VI-NEXT: v_add_u16_e32 v2, 3, v58 -; VI-NEXT: v_add_u16_sdwa v3, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_add_u16_e32 v2, 3, v57 -; VI-NEXT: v_add_u16_sdwa v3, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v3, 3, v56 -; VI-NEXT: v_add_u16_sdwa v4, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u16_e32 v4, 3, v47 -; VI-NEXT: v_add_u16_sdwa v5, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_add_u16_e32 v5, 3, v46 -; VI-NEXT: v_add_u16_sdwa v6, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_add_u16_e32 v6, 3, v45 -; VI-NEXT: v_add_u16_sdwa v7, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u16_e32 v7, 3, v44 -; VI-NEXT: v_add_u16_sdwa v8, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v43 -; VI-NEXT: v_add_u16_sdwa v9, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v42 -; VI-NEXT: v_add_u16_sdwa v10, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v41 -; VI-NEXT: v_add_u16_sdwa v11, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v40 -; VI-NEXT: v_add_u16_sdwa v12, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; 
VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v55 -; VI-NEXT: v_add_u16_sdwa v13, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v54 -; VI-NEXT: v_add_u16_sdwa v14, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v53 -; VI-NEXT: v_add_u16_sdwa v15, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: v_add_u16_e32 v15, 3, v52 -; VI-NEXT: v_add_u16_sdwa v16, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: v_add_u16_e32 v16, 3, v51 -; VI-NEXT: v_add_u16_sdwa v17, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: v_add_u16_e32 v17, 3, v50 -; VI-NEXT: v_add_u16_sdwa v18, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: v_add_u16_e32 v18, 3, v49 -; VI-NEXT: v_add_u16_sdwa v19, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: v_add_u16_e32 v19, 3, v48 -; VI-NEXT: v_add_u16_sdwa v20, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: v_add_u16_e32 v20, 3, v39 -; VI-NEXT: v_add_u16_sdwa v21, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: v_add_u16_e32 v21, 3, v38 -; VI-NEXT: v_add_u16_sdwa v22, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: v_add_u16_e32 v22, 3, v37 -; VI-NEXT: v_add_u16_sdwa v23, v37, v27 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: v_add_u16_e32 v23, 3, v36 -; VI-NEXT: v_add_u16_sdwa v24, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: v_add_u16_e32 v24, 3, v35 -; VI-NEXT: v_add_u16_sdwa v25, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: v_add_u16_e32 v25, 3, v34 -; VI-NEXT: v_add_u16_sdwa v26, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: v_add_u16_e32 v26, 3, v33 -; VI-NEXT: v_add_u16_sdwa v28, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v26, v28 -; VI-NEXT: v_add_u16_e32 v28, 3, v32 -; VI-NEXT: v_add_u16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB15_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: s_branch .LBB29_2 ; -; GFX9-LABEL: bitcast_v56i16_to_v28f32: +; GFX9-LABEL: bitcast_v28f32_to_v56i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v59, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v37, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v38, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX9-NEXT: v_mov_b32_e32 v39, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX9-NEXT: v_mov_b32_e32 v48, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; 
GFX9-NEXT: v_mov_b32_e32 v53, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v9 -; GFX9-NEXT: v_mov_b32_e32 v43, v8 -; GFX9-NEXT: v_mov_b32_e32 v44, v7 -; GFX9-NEXT: v_mov_b32_e32 v45, v6 -; GFX9-NEXT: v_mov_b32_e32 v46, v5 -; GFX9-NEXT: v_mov_b32_e32 v47, v4 -; GFX9-NEXT: v_mov_b32_e32 v56, v3 -; GFX9-NEXT: v_mov_b32_e32 v57, v2 -; GFX9-NEXT: v_mov_b32_e32 v58, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v26, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v27, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v21, s27 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 
16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 
+; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 +; 
GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v9, v37 +; GFX9-NEXT: v_mov_b32_e32 v10, v38 +; GFX9-NEXT: v_mov_b32_e32 v11, v39 +; GFX9-NEXT: v_mov_b32_e32 v12, v48 +; GFX9-NEXT: v_mov_b32_e32 v13, v49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr45 @@ -8323,1007 +16035,994 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; 
GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; 
kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB15_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 
offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: 
s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 
3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 
4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_branch .LBB29_2 ; -; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28f32: +; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v17, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s27 :: v_dual_mov_b32 v13, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: 
v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: .LBB29_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v67, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v65, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v66, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v69, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v64, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 ; -; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28f32: +; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 
0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v15, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 
v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-FAKE16-NEXT: .LBB29_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, 
v52, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v28 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <56 x i16> %a, splat (i16 3) - %a2 = bitcast <56 x i16> %a1 to <28 x float> + %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <28 x float> %a1 to <56 x i16> br label %end cmp.false: - %a3 = bitcast <56 x i16> %a to <28 x float> + %a3 = bitcast <28 x float> %a to <56 x i16> br label %end end: - %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <28 x float> %phi + %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x i16> %phi } -define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v28f32_to_v56f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 
offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; 
implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, 
v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; 
GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v40 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_mov_b32_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v46 -; GCN-NEXT: v_mov_b32_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v56 -; GCN-NEXT: v_mov_b32_e32 v56, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v58 -; GCN-NEXT: v_mov_b32_e32 v58, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v60 -; GCN-NEXT: v_mov_b32_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v62 -; GCN-NEXT: v_mov_b32_e32 v62, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v57 -; GCN-NEXT: v_mov_b32_e32 v57, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v59 -; GCN-NEXT: v_mov_b32_e32 v59, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; 
implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v63 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: 
v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 
-; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: 
v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v39 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v55 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v9, v37 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v38 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v36 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v21, v21, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v32 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v23, v23, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v25, v25, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v61 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v59 -; 
GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v57 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v47 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v45 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v43 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v42 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v44 -; GCN-NEXT: buffer_load_dword 
v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v46 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v56 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v58 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x68, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v60 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x6c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v62 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: 
v_or_b32_e32 v45, v47, v46 -; GCN-NEXT: v_or_b32_e32 v46, v57, v56 -; GCN-NEXT: v_or_b32_e32 v47, v59, v58 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword 
v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v56i16_to_v28f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:60 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; 
SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: 
killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v49 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: 
v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v57 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v6, v6, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v45 +; SI-NEXT: v_or_b32_e32 v9, v9, v44 +; SI-NEXT: v_or_b32_e32 v10, v10, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v63 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v56 +; SI-NEXT: v_or_b32_e32 v18, v18, v47 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v20, v20, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v42 +; SI-NEXT: v_or_b32_e32 v24, v24, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; 
SI-NEXT: v_or_b32_e32 v27, v27, v60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v16, v49, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; 
SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: 
v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v34, v10 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_or_b32_e32 v12, v32, v12 +; SI-NEXT: v_or_b32_e32 v13, v63, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v50, v15 +; SI-NEXT: v_or_b32_e32 v17, v56, v17 +; SI-NEXT: v_or_b32_e32 v18, v47, v18 +; SI-NEXT: v_or_b32_e32 v19, v38, v19 +; SI-NEXT: v_or_b32_e32 v20, v36, v20 +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_or_b32_e32 v22, v43, v22 +; SI-NEXT: v_or_b32_e32 v23, v42, v23 +; SI-NEXT: v_or_b32_e32 v24, v41, v24 +; SI-NEXT: v_or_b32_e32 v25, v40, v25 +; SI-NEXT: v_or_b32_e32 v26, v62, v26 +; SI-NEXT: v_or_b32_e32 v27, v60, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; 
SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v28f32_to_v56f16: +; VI-LABEL: bitcast_v56i16_to_v28f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v27 +; VI-NEXT: v_mov_b32_e32 v33, v26 +; VI-NEXT: v_mov_b32_e32 v34, v25 +; VI-NEXT: v_mov_b32_e32 v35, v24 +; VI-NEXT: v_mov_b32_e32 v36, v23 +; VI-NEXT: v_mov_b32_e32 v37, v22 +; VI-NEXT: v_mov_b32_e32 v38, v21 +; VI-NEXT: v_mov_b32_e32 v39, v20 +; VI-NEXT: v_mov_b32_e32 v48, v19 +; VI-NEXT: v_mov_b32_e32 v49, v18 +; VI-NEXT: v_mov_b32_e32 v50, v17 +; VI-NEXT: v_mov_b32_e32 v51, v16 +; VI-NEXT: v_mov_b32_e32 v52, v15 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v54, v13 +; VI-NEXT: v_mov_b32_e32 v55, v12 +; VI-NEXT: v_mov_b32_e32 v40, v11 +; VI-NEXT: v_mov_b32_e32 v41, v10 +; VI-NEXT: v_mov_b32_e32 v42, v9 +; VI-NEXT: v_mov_b32_e32 v43, v8 +; VI-NEXT: v_mov_b32_e32 v44, v7 +; VI-NEXT: v_mov_b32_e32 v45, v6 +; VI-NEXT: v_mov_b32_e32 v46, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: v_mov_b32_e32 v57, v2 +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_mov_b32_e32 v59, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB30_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v27, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v49, v18 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr45 @@ -9348,1342 +17047,8366 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, 
v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; 
VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: v_mov_b32_e32 v27, 3 +; VI-NEXT: v_add_u16_e32 v0, 3, v59 +; VI-NEXT: v_add_u16_sdwa v1, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v58 +; VI-NEXT: v_add_u16_sdwa v3, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v57 +; VI-NEXT: v_add_u16_sdwa v3, v57, v27 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v56 +; VI-NEXT: v_add_u16_sdwa v4, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v47 +; VI-NEXT: v_add_u16_sdwa v5, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v46 +; VI-NEXT: v_add_u16_sdwa v6, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v45 +; VI-NEXT: v_add_u16_sdwa v7, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v44 +; VI-NEXT: v_add_u16_sdwa v8, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v43 +; VI-NEXT: v_add_u16_sdwa v9, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v42 +; VI-NEXT: v_add_u16_sdwa v10, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v41 +; VI-NEXT: v_add_u16_sdwa v11, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v40 +; VI-NEXT: v_add_u16_sdwa v12, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v55 +; VI-NEXT: v_add_u16_sdwa v13, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v54 +; VI-NEXT: v_add_u16_sdwa 
v14, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v53 +; VI-NEXT: v_add_u16_sdwa v15, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_add_u16_e32 v15, 3, v52 +; VI-NEXT: v_add_u16_sdwa v16, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_u16_e32 v16, 3, v51 +; VI-NEXT: v_add_u16_sdwa v17, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_add_u16_e32 v17, 3, v50 +; VI-NEXT: v_add_u16_sdwa v18, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_u16_e32 v18, 3, v49 +; VI-NEXT: v_add_u16_sdwa v19, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_add_u16_e32 v19, 3, v48 +; VI-NEXT: v_add_u16_sdwa v20, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_u16_e32 v20, 3, v39 +; VI-NEXT: v_add_u16_sdwa v21, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_add_u16_e32 v21, 3, v38 +; VI-NEXT: v_add_u16_sdwa v22, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: v_add_u16_e32 v22, 3, v37 +; VI-NEXT: v_add_u16_sdwa v23, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: v_add_u16_e32 v23, 3, v36 +; VI-NEXT: v_add_u16_sdwa v24, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: 
v_add_u16_e32 v24, 3, v35 +; VI-NEXT: v_add_u16_sdwa v25, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; VI-NEXT: v_add_u16_e32 v25, 3, v34 +; VI-NEXT: v_add_u16_sdwa v26, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: v_add_u16_e32 v26, 3, v33 +; VI-NEXT: v_add_u16_sdwa v28, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v28 +; VI-NEXT: v_add_u16_e32 v28, 3, v32 +; VI-NEXT: v_add_u16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v28f32_to_v56f16: +; GFX9-LABEL: bitcast_v56i16_to_v28f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; 
GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v37, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v38, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX9-NEXT: v_mov_b32_e32 v39, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX9-NEXT: v_mov_b32_e32 v48, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: 
v_mov_b32_e32 v51, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v9 +; GFX9-NEXT: v_mov_b32_e32 v43, v8 +; GFX9-NEXT: v_mov_b32_e32 v44, v7 +; GFX9-NEXT: v_mov_b32_e32 v45, v6 +; GFX9-NEXT: v_mov_b32_e32 v46, v5 +; GFX9-NEXT: v_mov_b32_e32 v47, v4 +; GFX9-NEXT: v_mov_b32_e32 v56, v3 +; GFX9-NEXT: v_mov_b32_e32 v57, v2 +; GFX9-NEXT: v_mov_b32_e32 v58, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; 
GFX9-NEXT: .LBB16_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 -; GFX9-NEXT: 
v_perm_b32 v15, v48, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; 
GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; 
implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], 
s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 
+; GFX9-NEXT: .LBB30_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB30_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: 
v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] 
+; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB30_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, 
v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: 
v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + +define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56i16_to_v28f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 +; SI-NEXT: 
v_mov_b32_e32 v32, v26 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; 
SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v13, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v14, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v16, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: 
s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, 
s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; 
SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v24, 
vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v41 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mov_b32_e32 v41, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v57, v46 +; SI-NEXT: v_mov_b32_e32 v46, v61 +; SI-NEXT: v_mov_b32_e32 v61, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v47 +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 +; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v46, v57 +; SI-NEXT: v_mov_b32_e32 v57, v60 +; SI-NEXT: 
v_mov_b32_e32 v60, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v56, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v56i16_to_v28f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; 
VI-NEXT: v_mov_b32_e32 v39, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa 
v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 
s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 
0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v56i16_to_v28f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 
s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, 
s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 
0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 
3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, 
v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, 
s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB31_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB31_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + +define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v28f32_to_v56f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; 
SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, 
off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; 
SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, 
v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], 
s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; 
implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v27, 
1.0, v27 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; 
SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_mov_b32_e32 v29, v28 +; SI-NEXT: v_mov_b32_e32 v57, v25 +; SI-NEXT: v_mov_b32_e32 v47, v26 +; SI-NEXT: v_mov_b32_e32 v45, v27 +; SI-NEXT: v_mov_b32_e32 v43, v1 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; 
SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v28f32_to_v56f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: 
$vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB32_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_4 +; VI-NEXT: ; %bb.3: 
; %cmp.true +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 
v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB32_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v39 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28f32_to_v56f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB32_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 
16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB32_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; 
GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 
v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <28 x float> %a1 to <56 x half> + br label %end + +cmp.false: + %a3 = bitcast <28 x float> %a to <56 x half> + br label %end + +end: + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi +} + +define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v28f32_to_v56f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v1 +; SI-NEXT: v_readfirstlane_b32 s42, v2 +; SI-NEXT: v_readfirstlane_b32 s41, v3 +; SI-NEXT: v_readfirstlane_b32 s40, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_readfirstlane_b32 s7, 
v12 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 
s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s6 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s7 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s8 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: 
v_cvt_f32_f16_e32 v17, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_add_f32_e64 v14, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v36, s6, 1.0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e64 v10, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v31 +; SI-NEXT: v_add_f32_e64 v12, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v33, s7, 1.0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 +; SI-NEXT: 
v_add_f32_e64 v11, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s43, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s42, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s41, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s40, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s12, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 +; SI-NEXT: v_add_f32_e64 v48, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; 
SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 +; SI-NEXT: v_mov_b32_e32 v45, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: v_mov_b32_e32 v43, v34 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: 
v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v8, v34, v8 +; SI-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; SI-NEXT: v_add_i32_e32 v31, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v7 +; SI-NEXT: v_add_i32_e32 v31, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_add_i32_e32 v31, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v11 +; SI-NEXT: v_add_i32_e32 v31, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v48 +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v36 +; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v33 +; SI-NEXT: v_add_i32_e32 v12, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v27 +; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v25 +; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v19 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v59 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 
+; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v28f32_to_v56f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v20, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v26, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v27, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v24, s24 +; VI-NEXT: v_mov_b32_e32 v23, s25 +; VI-NEXT: v_mov_b32_e32 v22, s26 +; VI-NEXT: v_mov_b32_e32 v21, s27 +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 
+; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 
16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v20, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 
v15, 16, v48 +; VI-NEXT: v_or_b32_sdwa v48, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 +; VI-NEXT: v_or_b32_sdwa v49, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte 
Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v28 +; VI-NEXT: v_mov_b32_e32 v1, v29 +; VI-NEXT: v_mov_b32_e32 v2, v30 +; VI-NEXT: v_mov_b32_e32 v3, v31 +; VI-NEXT: v_mov_b32_e32 v4, v32 +; VI-NEXT: v_mov_b32_e32 v5, v33 +; VI-NEXT: v_mov_b32_e32 v6, v34 +; VI-NEXT: v_mov_b32_e32 v7, v35 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v10, v38 +; VI-NEXT: v_mov_b32_e32 v11, v39 +; VI-NEXT: v_mov_b32_e32 v12, v48 +; VI-NEXT: v_mov_b32_e32 v13, v49 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: 
$vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v28f32_to_v56f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v20, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v26, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v27, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v21, s27 +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], 
s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; 
GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v20 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, 
v26 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v14, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v19, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v20, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v50, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v9, v37 +; GFX9-NEXT: v_mov_b32_e32 v10, v38 +; GFX9-NEXT: v_mov_b32_e32 v11, v39 +; GFX9-NEXT: v_mov_b32_e32 v12, v48 +; GFX9-NEXT: v_mov_b32_e32 v13, v49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; 
implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_branch .LBB33_2 +; +; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v17, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v15, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s27 :: v_dual_mov_b32 v13, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 
v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-TRUE16-NEXT: .LBB33_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v67, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v65, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v66, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v35 :: 
v_dual_and_b32 v0, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v69, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v64, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v28 :: v_dual_mov_b32 v9, v29 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 +; +; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v27, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s2 :: v_dual_mov_b32 v25, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v23, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v21, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v19, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v11, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v15, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v13, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; 
GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: 
v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v36, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-FAKE16-NEXT: .LBB33_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, 
v54, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v3, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <28 x float> %a1 to <56 x half> + br label %end + +cmp.false: + %a3 = bitcast <28 x float> %a to <56 x half> + br label %end + +end: + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi +} + +define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v56f16_to_v28f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; 
SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; 
implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 +; SI-NEXT: ; kill: killed 
$vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v21, v60, v21 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v23, v48, v23 +; SI-NEXT: v_or_b32_e32 v24, v38, v24 +; SI-NEXT: v_or_b32_e32 v25, v36, v25 +; SI-NEXT: v_or_b32_e32 v26, v34, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 +; SI-NEXT: v_or_b32_e32 v20, v62, v20 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB34_2: ; 
%Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 +; SI-NEXT: v_cvt_f32_f16_e32 
v28, v33 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 
+; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: 
v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, 
s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v56f16_to_v28f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v27 +; VI-NEXT: v_mov_b32_e32 v33, v26 +; VI-NEXT: v_mov_b32_e32 v34, v25 +; VI-NEXT: v_mov_b32_e32 v35, v24 +; VI-NEXT: v_mov_b32_e32 v36, v23 +; VI-NEXT: v_mov_b32_e32 v37, v22 +; VI-NEXT: v_mov_b32_e32 v38, v21 +; VI-NEXT: v_mov_b32_e32 v39, v20 +; VI-NEXT: v_mov_b32_e32 v48, v19 +; VI-NEXT: v_mov_b32_e32 v49, v18 +; VI-NEXT: v_mov_b32_e32 v50, v17 +; VI-NEXT: v_mov_b32_e32 v51, v16 +; VI-NEXT: v_mov_b32_e32 v52, v15 +; VI-NEXT: 
v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v54, v13 +; VI-NEXT: v_mov_b32_e32 v55, v12 +; VI-NEXT: v_mov_b32_e32 v40, v11 +; VI-NEXT: v_mov_b32_e32 v41, v10 +; VI-NEXT: v_mov_b32_e32 v42, v9 +; VI-NEXT: v_mov_b32_e32 v43, v8 +; VI-NEXT: v_mov_b32_e32 v44, v7 +; VI-NEXT: v_mov_b32_e32 v45, v6 +; VI-NEXT: v_mov_b32_e32 v46, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: v_mov_b32_e32 v57, v2 +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_mov_b32_e32 v59, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v27, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: 
v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: 
; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: .LBB34_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v27, 0x200 +; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v59 +; VI-NEXT: v_add_f16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v57 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v3, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v56 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_f16_sdwa v4, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v47 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_add_f16_sdwa v5, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v46 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_add_f16_sdwa v6, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v45 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_add_f16_sdwa v7, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v44 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_f16_sdwa v8, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v43 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: 
v_add_f16_sdwa v9, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v42 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_f16_sdwa v10, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v41 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_add_f16_sdwa v11, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v40 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_add_f16_sdwa v12, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v12, v13, v12 +; VI-NEXT: v_add_f16_sdwa v13, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
VI-NEXT: v_add_f16_e32 v20, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v26, v28, v26 +; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB34_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v56f16_to_v28f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v37, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v38, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX9-NEXT: v_mov_b32_e32 v39, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX9-NEXT: v_mov_b32_e32 v48, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; 
GFX9-NEXT: v_mov_b32_e32 v50, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v9 +; GFX9-NEXT: v_mov_b32_e32 v43, v8 +; GFX9-NEXT: v_mov_b32_e32 v44, v7 +; GFX9-NEXT: v_mov_b32_e32 v45, v6 +; GFX9-NEXT: v_mov_b32_e32 v46, v5 +; GFX9-NEXT: v_mov_b32_e32 v47, v4 +; GFX9-NEXT: v_mov_b32_e32 v56, v3 +; GFX9-NEXT: v_mov_b32_e32 v57, v2 +; GFX9-NEXT: v_mov_b32_e32 v58, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v60, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: 
v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, 
v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; 
implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: .LBB34_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: s_movk_i32 s7, 0x200 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; 
GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 
4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB34_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 -; 
GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; 
GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-FAKE16-NEXT: 
v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; 
GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = 
icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + +define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56f16_to_v28f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 
v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], 
s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: 
v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_or_b32_e32 v13, v36, v13 +; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v17, v37, v17 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v50, v27 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: 
buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 
v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: 
buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; 
SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 
offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v52, v37 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v29, v37 +; SI-NEXT: v_mov_b32_e32 v37, v52 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v56f16_to_v28f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; 
VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; 
VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 
+; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v27, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v50, v27 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v26, v28, v26 +; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 
0x200, v32 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v56f16_to_v28f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, 
s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; 
GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; 
GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 
s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 
s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, 
s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, 
s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, 
v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <28 x float> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <28 x float> + br label %end + +end: + %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x float> %phi +} + +define <14 x double> @bitcast_v14i64_to_v14f64(<14 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v14i64_to_v14f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: 
v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14i64_to_v14f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; 
VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14i64_to_v14f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; 
GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14i64_to_v14f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: 
v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <14 x i64> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define inreg <14 x double> @bitcast_v14i64_to_v14f64_scalar(<14 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i64_to_v14f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: 
v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 
3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v14i64_to_v14f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: 
v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v14i64_to_v14f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 
+; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v14i64_to_v14f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB37_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: .LBB37_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; 
GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <14 x i64> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define <14 x i64> @bitcast_v14f64_to_v14i64(<14 x double> %a, 
i32 %b) { +; SI-LABEL: bitcast_v14f64_to_v14i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f64_to_v14i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 
v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f64_to_v14i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14f64_to_v14i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; 
GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <28 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <28 x float> %a1 to <56 x half> + %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <14 x double> %a1 to <14 x i64> br label %end cmp.false: - %a3 = bitcast <28 x float> %a to <56 x half> + %a3 = bitcast <14 x double> %a to <14 x i64> br label %end end: - %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <56 x half> %phi + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi } -define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v56f16_to_v28f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, 
s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte 
Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v49 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v34 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v31 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; GCN-NEXT: v_or_b32_e32 v0, v46, v0 -; GCN-NEXT: v_or_b32_e32 v1, v44, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; GCN-NEXT: v_or_b32_e32 v2, v42, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v51 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: v_or_b32_e32 v16, v55, v16 -; GCN-NEXT: v_or_b32_e32 v17, v52, v17 -; GCN-NEXT: v_or_b32_e32 v18, v50, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: v_or_b32_e32 v20, v38, v20 -; GCN-NEXT: v_or_b32_e32 v21, v36, v21 -; GCN-NEXT: v_or_b32_e32 v22, v34, v22 -; GCN-NEXT: v_or_b32_e32 v23, v33, v23 -; GCN-NEXT: v_or_b32_e32 v24, v35, v24 -; GCN-NEXT: v_or_b32_e32 v25, v37, v25 -; GCN-NEXT: v_or_b32_e32 v26, v39, v26 -; GCN-NEXT: v_or_b32_e32 v27, v49, v27 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: 
; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 
-; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v44 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 
v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v55 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v52 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v50 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v48 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v38 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v36 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_or_b32_e32 v15, v17, v16 -; GCN-NEXT: v_or_b32_e32 v16, v19, v18 -; GCN-NEXT: v_or_b32_e32 v17, v21, v20 -; GCN-NEXT: v_or_b32_e32 v18, v23, v22 -; GCN-NEXT: v_or_b32_e32 v19, v25, v24 -; GCN-NEXT: v_or_b32_e32 v20, v27, v26 -; GCN-NEXT: v_or_b32_e32 v21, v29, v28 -; GCN-NEXT: v_or_b32_e32 v22, v31, v30 -; GCN-NEXT: v_or_b32_e32 v23, v33, v32 -; GCN-NEXT: v_or_b32_e32 v24, v35, v34 -; GCN-NEXT: v_or_b32_e32 v25, v37, v36 -; GCN-NEXT: v_or_b32_e32 v26, v39, v38 -; GCN-NEXT: v_or_b32_e32 v27, v49, v48 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f64_to_v14i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; 
SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: v_mov_b32_e32 v15, v29 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 ; -; VI-LABEL: bitcast_v56f16_to_v28f32: +; VI-LABEL: bitcast_v14f64_to_v14i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte 
Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v27 -; VI-NEXT: v_mov_b32_e32 v33, v26 -; VI-NEXT: v_mov_b32_e32 v34, v25 -; VI-NEXT: v_mov_b32_e32 v35, v24 -; VI-NEXT: v_mov_b32_e32 v36, v23 -; VI-NEXT: v_mov_b32_e32 v37, v22 -; VI-NEXT: v_mov_b32_e32 v38, v21 -; VI-NEXT: v_mov_b32_e32 v39, v20 -; VI-NEXT: v_mov_b32_e32 v48, v19 -; VI-NEXT: v_mov_b32_e32 v49, v18 -; VI-NEXT: v_mov_b32_e32 v50, v17 -; VI-NEXT: v_mov_b32_e32 v51, v16 -; VI-NEXT: v_mov_b32_e32 v52, v15 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v54, v13 -; VI-NEXT: v_mov_b32_e32 v55, v12 -; VI-NEXT: v_mov_b32_e32 v40, v11 -; VI-NEXT: v_mov_b32_e32 v41, v10 -; VI-NEXT: v_mov_b32_e32 v42, v9 -; VI-NEXT: v_mov_b32_e32 v43, v8 -; VI-NEXT: v_mov_b32_e32 v44, v7 -; VI-NEXT: v_mov_b32_e32 v45, v6 -; VI-NEXT: v_mov_b32_e32 v46, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v56, v3 -; VI-NEXT: v_mov_b32_e32 v57, v2 -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v29, v1 +; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v27, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, 
v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 
1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v14, v28 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v14f64_to_v14i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v29, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 
v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v14, v28 +; GFX9-NEXT: v_mov_b32_e32 v15, v29 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v14f64_to_v14i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; 
GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <14 x double> %a1 to <14 x i64> + br label %end + +cmp.false: + %a3 = bitcast <14 x double> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} + +define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v14i64_to_v56i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v36, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v38, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v48, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: 
v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; 
SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v36, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v38, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v48, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 
0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 
offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14i64_to_v56i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr45 @@ -10708,477 +25431,346 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 
+; VI-NEXT: .LBB40_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 +; VI-NEXT: s_cbranch_execz .LBB40_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v27, 0x200 -; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v59 -; VI-NEXT: v_add_f16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v58 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 -; VI-NEXT: v_add_f16_sdwa v2, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v57 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_add_f16_sdwa v3, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v56 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_add_f16_sdwa v4, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v47 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_add_f16_sdwa v5, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v46 -; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: v_add_f16_sdwa v6, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v45 -; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: v_add_f16_sdwa v7, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v44 -; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_f16_sdwa v8, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v43 -; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_f16_sdwa v9, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; 
VI-NEXT: v_add_f16_e32 v10, 0x200, v42 -; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_f16_sdwa v10, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v41 -; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_f16_sdwa v11, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v40 -; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_f16_sdwa v12, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v13, 0x200, v55 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_f16_sdwa v13, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v54 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 -; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 -; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: v_add_f16_sdwa v15, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v52 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 -; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: 
v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 -; VI-NEXT: v_or_b32_e32 v20, v21, v20 -; VI-NEXT: v_add_f16_sdwa v21, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v38 -; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: v_add_f16_sdwa v22, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v23, 0x200, v37 -; VI-NEXT: v_or_b32_e32 v22, v23, v22 -; VI-NEXT: v_add_f16_sdwa v23, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v24, 0x200, v36 -; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: v_add_f16_sdwa v24, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v35 -; VI-NEXT: v_or_b32_e32 v24, v25, v24 -; VI-NEXT: v_add_f16_sdwa v25, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v26, 0x200, v34 -; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v26, v28, v26 -; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 -; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; 
VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 
+; VI-NEXT: .LBB40_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 
s[30:31] ; -; GFX9-LABEL: bitcast_v56f16_to_v28f32: +; GFX9-LABEL: bitcast_v14i64_to_v56i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v59, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte 
Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v37, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v38, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX9-NEXT: v_mov_b32_e32 v39, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX9-NEXT: v_mov_b32_e32 v48, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v13 -; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v9 -; GFX9-NEXT: v_mov_b32_e32 v43, v8 -; GFX9-NEXT: v_mov_b32_e32 v44, v7 -; GFX9-NEXT: v_mov_b32_e32 v45, v6 -; GFX9-NEXT: v_mov_b32_e32 v46, v5 -; GFX9-NEXT: v_mov_b32_e32 v47, v4 -; GFX9-NEXT: v_mov_b32_e32 v56, v3 -; GFX9-NEXT: v_mov_b32_e32 v57, v2 -; GFX9-NEXT: v_mov_b32_e32 v58, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; 
GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: ; 
implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, 
s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed 
$vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB17_2: ; %Flow +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, 
v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB40_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 +; GFX9-NEXT: s_cbranch_execz .LBB40_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 
4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: s_movk_i32 s7, 0x200 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: s_waitcnt 
vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: 
v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28f32: +; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56i16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -11186,989 +25778,2279 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 
0x200, v5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) 
| instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28f32: +; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: 
v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: 
v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, 
vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false -cmp.true: - %a1 = fadd <56 x half> %a, splat (half 0xH0200) - %a2 = bitcast <56 x half> %a1 to <28 x float> - br label %end - -cmp.false: - %a3 = bitcast <56 x half> %a to <28 x float> - br label %end - -end: - %phi = phi <28 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <28 x float> %phi -} - -define <14 x double> @bitcast_v14i64_to_v14f64(<14 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i64_to_v14f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, 
vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v14i64_to_v14f64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: 
v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: .LBB18_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v14i64_to_v14f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, 
v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: .LBB18_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v14i64_to_v14f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: 
v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - cmp.true: %a1 = add <14 x i64> %a, splat (i64 3) - %a2 = bitcast <14 x i64> %a1 to <14 x double> + %a2 = bitcast <14 x i64> %a1 to <56 x i16> br label %end cmp.false: - %a3 = bitcast <14 x i64> %a to <14 x double> + %a3 = bitcast <14 x i64> %a to <56 x i16> br label %end end: - %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <14 x double> %phi + %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x i16> %phi } -define <14 x i64> @bitcast_v14f64_to_v14i64(<14 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f64_to_v14i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 
1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i64_to_v56i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v1 +; SI-NEXT: v_readfirstlane_b32 s42, v2 +; SI-NEXT: v_readfirstlane_b32 s41, v3 +; SI-NEXT: v_readfirstlane_b32 s40, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v6 +; SI-NEXT: v_readfirstlane_b32 s13, v7 +; SI-NEXT: v_readfirstlane_b32 s12, v8 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v10 +; SI-NEXT: v_readfirstlane_b32 s9, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s7, v13 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: 
v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s29, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s25, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s23, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s21, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s19, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s17, v14, 16 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_lshr_b32 s45, s8, 16 +; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: s_lshr_b32 s47, s12, 16 +; SI-NEXT: s_lshr_b32 s56, s14, 16 +; SI-NEXT: s_lshr_b32 s57, s40, 16 +; SI-NEXT: s_lshr_b32 s58, s42, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s60, s27, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_lshr_b32 s62, s23, 16 +; SI-NEXT: s_lshr_b32 s63, s21, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 16 +; SI-NEXT: s_lshr_b32 s73, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s43, s43, 3 +; SI-NEXT: s_addc_u32 s42, s42, 0 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s40, s40, 0 +; SI-NEXT: s_add_u32 s15, s15, 3 +; SI-NEXT: s_addc_u32 s14, s14, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; 
SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s29, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s27, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s25, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s23, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s21, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s19, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s17, v14, 16 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_lshr_b32 s45, s8, 16 +; SI-NEXT: s_lshr_b32 s46, s10, 16 +; SI-NEXT: s_lshr_b32 s47, s12, 16 +; SI-NEXT: s_lshr_b32 s56, s14, 16 +; SI-NEXT: s_lshr_b32 s57, s40, 16 +; SI-NEXT: s_lshr_b32 s58, s42, 16 +; SI-NEXT: s_lshr_b32 s59, s29, 16 +; SI-NEXT: s_lshr_b32 s60, s27, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_lshr_b32 s62, s23, 16 +; SI-NEXT: s_lshr_b32 s63, s21, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 16 +; SI-NEXT: s_lshr_b32 s73, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, 
s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v14, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: 
s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s45, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, 
v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: s_branch .LBB41_2 ; -; VI-LABEL: bitcast_v14f64_to_v14i64: +; VI-LABEL: bitcast_v14i64_to_v56i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: 
v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: .LBB19_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v28, s30, 0 +; VI-NEXT: v_writelane_b32 v28, s31, 1 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_writelane_b32 v28, s34, 2 +; VI-NEXT: v_readfirstlane_b32 s43, v0 +; VI-NEXT: v_readfirstlane_b32 s42, v1 +; VI-NEXT: v_readfirstlane_b32 s41, v2 +; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_readfirstlane_b32 s15, v4 +; VI-NEXT: v_readfirstlane_b32 s14, v5 +; VI-NEXT: v_readfirstlane_b32 s13, v6 +; VI-NEXT: v_readfirstlane_b32 s12, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s10, v9 +; VI-NEXT: v_readfirstlane_b32 s9, v10 +; VI-NEXT: v_readfirstlane_b32 s8, v11 +; VI-NEXT: v_readfirstlane_b32 s6, v12 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v13 +; VI-NEXT: v_writelane_b32 v28, s35, 3 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: 
s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s41, s41, 3 +; VI-NEXT: s_addc_u32 s40, s40, 0 +; VI-NEXT: s_add_u32 s43, s43, 3 +; VI-NEXT: s_addc_u32 s42, s42, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: 
s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s35, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s34, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s31, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s30, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s91, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s90, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s89, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s88, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s79, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s78, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s77, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 
s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s76, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s75, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s74, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s29, s73, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s42, s72, 16 +; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s42, s63, 16 +; VI-NEXT: s_or_b32 s41, s41, s42 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s42, s62, 16 +; VI-NEXT: s_or_b32 s40, s40, s42 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s42, s61, 16 +; VI-NEXT: s_or_b32 s15, s15, s42 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s42, s60, 16 +; VI-NEXT: s_or_b32 s14, s14, s42 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s42, s59, 16 +; VI-NEXT: s_or_b32 s13, s13, s42 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s42, s58, 16 +; VI-NEXT: s_or_b32 s12, s12, s42 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s42, s57, 16 +; VI-NEXT: s_or_b32 s11, s11, s42 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s42, s56, 16 +; VI-NEXT: s_or_b32 s10, s10, s42 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s42, s47, 16 +; VI-NEXT: s_or_b32 s9, s9, s42 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s42, s46, 16 +; VI-NEXT: s_or_b32 s8, s8, s42 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s42, s45, 16 +; VI-NEXT: s_or_b32 s6, s6, s42 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s42, s44, 16 +; VI-NEXT: s_or_b32 s7, s7, s42 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 
+; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s41 +; VI-NEXT: v_mov_b32_e32 v17, s40 +; VI-NEXT: v_mov_b32_e32 v18, s15 +; VI-NEXT: v_mov_b32_e32 v19, s14 +; VI-NEXT: v_mov_b32_e32 v20, s13 +; VI-NEXT: v_mov_b32_e32 v21, s12 +; VI-NEXT: v_mov_b32_e32 v22, s11 +; VI-NEXT: v_mov_b32_e32 v23, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v25, s8 +; VI-NEXT: v_mov_b32_e32 v26, s6 +; VI-NEXT: v_mov_b32_e32 v27, s7 +; VI-NEXT: v_readlane_b32 s35, v28, 3 +; VI-NEXT: v_readlane_b32 s34, v28, 2 +; VI-NEXT: v_readlane_b32 s31, v28, 1 +; VI-NEXT: v_readlane_b32 s30, v28, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; 
implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: s_branch .LBB41_2 ; -; GFX9-LABEL: bitcast_v14f64_to_v14i64: +; GFX9-LABEL: bitcast_v14i64_to_v56i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, 
exec +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s42, s42, 3 +; GFX9-NEXT: s_addc_u32 s43, s43, 0 +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: 
s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 
s27, s29, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s44 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; 
GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: s_branch .LBB41_2 ; -; GFX11-LABEL: bitcast_v14f64_to_v14i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s12, s12, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, 
s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s46 
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s13 :: v_dual_mov_b32 v19, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s11 :: v_dual_mov_b32 v21, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s7 :: v_dual_mov_b32 v25, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-FAKE16-NEXT: s_mov_b32 s90, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s13, s13, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s12, s12, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; 
GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, 
s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <14 x double> %a1 to <14 x i64> + %a1 = add <14 x 
i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <56 x i16> br label %end cmp.false: - %a3 = bitcast <14 x double> %a to <14 x i64> + %a3 = bitcast <14 x i64> %a to <56 x i16> br label %end end: - %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <14 x i64> %phi + %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x i16> %phi } -define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i64_to_v56i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr43 -; 
GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v37, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v39, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v50, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v52, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v40, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: 
v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 
16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v37, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v39, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v50, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v52, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v40, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v43, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v60, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_and_b32_e32 v61, 0xffff, v5 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v40 -; GCN-NEXT: v_or_b32_e32 v5, v57, v45 -; GCN-NEXT: v_add_i32_e32 v4, 
vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v40, v58, v56 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v59, v43 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v43, v60, v47 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v47, v61, v62 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v45, v45, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v56, v56, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v44, v58, v44 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v9, v9, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v11, v11, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v12, v12, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: 
v_or_b32_e32 v13, v13, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v14, v14, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v15, v15, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v16, v16, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v17, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v18, v18, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v19, v19, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v20, v20, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v22, v22, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v23, v23, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v24, v24, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x6c, v0 -; GCN-NEXT: 
v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: v_or_b32_e32 v26, v26, v36 -; GCN-NEXT: v_or_b32_e32 v27, v27, v29 -; GCN-NEXT: v_or_b32_e32 v28, v28, v34 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v56i16_to_v14i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 +; SI-NEXT: 
v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; 
SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; 
kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v49 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, 
v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v57 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v6, v6, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v45 +; SI-NEXT: v_or_b32_e32 v9, v9, v44 +; SI-NEXT: v_or_b32_e32 v10, v10, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v63 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v56 +; SI-NEXT: v_or_b32_e32 v18, v18, v47 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v20, v20, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v42 +; SI-NEXT: v_or_b32_e32 v24, v24, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: v_or_b32_e32 v27, v27, v60 +; SI-NEXT: ; implicit-def: $vgpr57 
+; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v16, v49, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, 
vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; 
SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v34, v10 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_or_b32_e32 v12, v32, v12 +; SI-NEXT: v_or_b32_e32 v13, v63, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v50, v15 +; SI-NEXT: v_or_b32_e32 v17, v56, v17 +; SI-NEXT: v_or_b32_e32 v18, v47, v18 +; SI-NEXT: v_or_b32_e32 v19, v38, v19 +; SI-NEXT: v_or_b32_e32 v20, v36, v20 +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_or_b32_e32 v22, v43, v22 +; SI-NEXT: v_or_b32_e32 v23, v42, v23 +; SI-NEXT: v_or_b32_e32 v24, v41, v24 +; SI-NEXT: v_or_b32_e32 v25, v40, v25 +; SI-NEXT: v_or_b32_e32 v26, v62, v26 +; SI-NEXT: v_or_b32_e32 v27, v60, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, 
v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v14i64_to_v56i16: +; VI-LABEL: bitcast_v56i16_to_v14i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v27 +; VI-NEXT: v_mov_b32_e32 v33, v26 +; VI-NEXT: v_mov_b32_e32 v34, v25 +; VI-NEXT: v_mov_b32_e32 v35, v24 +; VI-NEXT: v_mov_b32_e32 v36, v23 +; VI-NEXT: v_mov_b32_e32 v37, v22 +; VI-NEXT: v_mov_b32_e32 v38, v21 +; VI-NEXT: v_mov_b32_e32 v39, v20 +; VI-NEXT: v_mov_b32_e32 v48, v19 +; VI-NEXT: v_mov_b32_e32 v49, v18 +; VI-NEXT: v_mov_b32_e32 v50, v17 +; VI-NEXT: v_mov_b32_e32 v51, v16 +; VI-NEXT: v_mov_b32_e32 v52, v15 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v54, v13 +; VI-NEXT: v_mov_b32_e32 v55, v12 +; VI-NEXT: v_mov_b32_e32 v40, v11 +; VI-NEXT: v_mov_b32_e32 v41, v10 +; VI-NEXT: v_mov_b32_e32 v42, v9 +; VI-NEXT: v_mov_b32_e32 v43, v8 +; VI-NEXT: v_mov_b32_e32 v44, v7 +; VI-NEXT: v_mov_b32_e32 v45, v6 +; VI-NEXT: v_mov_b32_e32 v46, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: v_mov_b32_e32 v57, v2 +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_mov_b32_e32 v59, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded 
Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; 
VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; 
VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: v_mov_b32_e32 v27, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, 
v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: 
v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; 
implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: .LBB42_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v27, 3 +; VI-NEXT: v_add_u16_e32 v0, 3, v59 +; VI-NEXT: v_add_u16_sdwa v1, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v58 +; VI-NEXT: v_add_u16_sdwa v3, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v57 +; VI-NEXT: v_add_u16_sdwa v3, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v56 +; VI-NEXT: v_add_u16_sdwa v4, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v47 +; VI-NEXT: v_add_u16_sdwa v5, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 
v5, 3, v46 +; VI-NEXT: v_add_u16_sdwa v6, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v45 +; VI-NEXT: v_add_u16_sdwa v7, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v44 +; VI-NEXT: v_add_u16_sdwa v8, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v43 +; VI-NEXT: v_add_u16_sdwa v9, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v42 +; VI-NEXT: v_add_u16_sdwa v10, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v41 +; VI-NEXT: v_add_u16_sdwa v11, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v40 +; VI-NEXT: v_add_u16_sdwa v12, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v55 +; VI-NEXT: v_add_u16_sdwa v13, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v54 +; VI-NEXT: v_add_u16_sdwa v14, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v53 +; VI-NEXT: v_add_u16_sdwa v15, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_add_u16_e32 v15, 3, v52 +; VI-NEXT: v_add_u16_sdwa v16, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; 
VI-NEXT: v_add_u16_e32 v16, 3, v51 +; VI-NEXT: v_add_u16_sdwa v17, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_add_u16_e32 v17, 3, v50 +; VI-NEXT: v_add_u16_sdwa v18, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_u16_e32 v18, 3, v49 +; VI-NEXT: v_add_u16_sdwa v19, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_add_u16_e32 v19, 3, v48 +; VI-NEXT: v_add_u16_sdwa v20, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_u16_e32 v20, 3, v39 +; VI-NEXT: v_add_u16_sdwa v21, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_add_u16_e32 v21, 3, v38 +; VI-NEXT: v_add_u16_sdwa v22, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: v_add_u16_e32 v22, 3, v37 +; VI-NEXT: v_add_u16_sdwa v23, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: v_add_u16_e32 v23, 3, v36 +; VI-NEXT: v_add_u16_sdwa v24, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: v_add_u16_e32 v24, 3, v35 +; VI-NEXT: v_add_u16_sdwa v25, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; VI-NEXT: v_add_u16_e32 v25, 3, v34 +; VI-NEXT: v_add_u16_sdwa v26, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: v_add_u16_e32 v26, 3, v33 +; VI-NEXT: v_add_u16_sdwa v28, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v28 +; VI-NEXT: v_add_u16_e32 v28, 3, v32 +; VI-NEXT: v_add_u16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; 
VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v14i64_to_v56i16: +; GFX9-LABEL: bitcast_v56i16_to_v14i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v37, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v38, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX9-NEXT: v_mov_b32_e32 v39, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX9-NEXT: v_mov_b32_e32 v48, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v9 +; GFX9-NEXT: v_mov_b32_e32 v43, v8 +; GFX9-NEXT: v_mov_b32_e32 v44, v7 +; GFX9-NEXT: v_mov_b32_e32 v45, v6 +; GFX9-NEXT: v_mov_b32_e32 v46, v5 +; GFX9-NEXT: v_mov_b32_e32 v47, v4 +; GFX9-NEXT: v_mov_b32_e32 v56, v3 +; GFX9-NEXT: v_mov_b32_e32 v57, v2 +; GFX9-NEXT: v_mov_b32_e32 v58, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 
v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr45 @@ -12188,150 +28070,207 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 ; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; 
GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: 
killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload 
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: 
s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; 
GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 -; 
GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56i16: +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14i64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -12339,855 +28278,2014 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56i16: +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14i64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, 
s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-FAKE16-NEXT: 
v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 
v3, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: 
v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; 
GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <14 x i64> %a, splat (i64 3) - %a2 = bitcast <14 x i64> %a1 to <56 x i16> + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <14 x i64> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} + +define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56i16_to_v14i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v32, v26 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; 
SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v13, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v14, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v16, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v46 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; 
SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v41 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mov_b32_e32 v41, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v57, v46 +; SI-NEXT: v_mov_b32_e32 v46, v61 +; SI-NEXT: v_mov_b32_e32 v61, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v47 +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 +; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v46, v57 +; SI-NEXT: v_mov_b32_e32 v57, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v56, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v56i16_to_v14i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, 
s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: 
v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, 
vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: 
v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 
s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa 
v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v56i16_to_v14i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; 
GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 
+; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: 
v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 
s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 
16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <14 x i64> br label %end cmp.false: - %a3 = bitcast <14 x i64> %a to <56 x i16> + %a3 = bitcast <56 x i16> %a to <14 x i64> br label %end end: - %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <56 x i16> %phi + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi } -define <14 x i64> 
@bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v56i16_to_v14i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: 
v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v57, off, 
s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v41 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v63 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v62 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v61 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v60 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v59 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v58 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v46 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, 
s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; 
GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: 
killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 
offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 
0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v0, v41, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v63, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; GCN-NEXT: v_or_b32_e32 v5, v61, v5 -; GCN-NEXT: v_or_b32_e32 v6, v60, v6 -; GCN-NEXT: v_or_b32_e32 v7, v59, v7 -; GCN-NEXT: v_or_b32_e32 v8, v58, v8 -; GCN-NEXT: v_or_b32_e32 v9, v56, v9 -; GCN-NEXT: v_or_b32_e32 v10, v47, v10 -; GCN-NEXT: v_or_b32_e32 v11, v46, v11 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v28, v12 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v28, v13 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v28, v17 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v28, v18 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v28, v19 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_or_b32_e32 v20, v28, v20 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v28, v21 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v28, v22 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v28, v23 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v28, v24 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v28, v25 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v28, v26 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; 
GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <56 x half> @bitcast_v14i64_to_v56f16(<14 x 
i64> %a, i32 %b) { +; SI-LABEL: bitcast_v14i64_to_v56f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, 
off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; 
implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 
v53, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v28 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: 
$vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: 
v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, 
v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_mov_b32_e32 v29, v28 +; SI-NEXT: 
v_mov_b32_e32 v57, v25 +; SI-NEXT: v_mov_b32_e32 v47, v26 +; SI-NEXT: v_mov_b32_e32 v45, v27 +; SI-NEXT: v_mov_b32_e32 v43, v1 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 
v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; 
SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, 
v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v56i16_to_v14i64: +; VI-LABEL: bitcast_v14i64_to_v56f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], 
s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v27 -; VI-NEXT: v_mov_b32_e32 v33, v26 -; VI-NEXT: v_mov_b32_e32 v34, v25 -; VI-NEXT: v_mov_b32_e32 v35, v24 -; VI-NEXT: v_mov_b32_e32 v36, v23 -; VI-NEXT: v_mov_b32_e32 v37, v22 -; VI-NEXT: v_mov_b32_e32 v38, v21 -; VI-NEXT: v_mov_b32_e32 v39, v20 -; VI-NEXT: v_mov_b32_e32 v48, v19 -; VI-NEXT: v_mov_b32_e32 v49, v18 -; VI-NEXT: v_mov_b32_e32 v50, v17 -; VI-NEXT: v_mov_b32_e32 v51, v16 -; VI-NEXT: v_mov_b32_e32 v52, v15 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v54, v13 -; VI-NEXT: v_mov_b32_e32 v55, v12 -; VI-NEXT: v_mov_b32_e32 v40, v11 -; VI-NEXT: v_mov_b32_e32 v41, v10 -; VI-NEXT: v_mov_b32_e32 v42, v9 -; VI-NEXT: v_mov_b32_e32 v43, v8 -; VI-NEXT: v_mov_b32_e32 v44, v7 -; VI-NEXT: v_mov_b32_e32 v45, v6 -; VI-NEXT: v_mov_b32_e32 v46, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v56, v3 -; VI-NEXT: v_mov_b32_e32 v57, v2 -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_mov_b32_e32 v59, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 -; 
VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v27, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, 
v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v33, v26 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr45 @@ -13212,476 +30310,346 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v27, 3 -; VI-NEXT: v_add_u16_e32 v0, 3, v59 -; VI-NEXT: v_add_u16_sdwa v1, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v58 -; VI-NEXT: v_add_u16_sdwa v3, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_add_u16_e32 v2, 3, v57 -; VI-NEXT: v_add_u16_sdwa v3, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v3, 3, v56 -; VI-NEXT: v_add_u16_sdwa v4, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u16_e32 v4, 3, v47 -; VI-NEXT: v_add_u16_sdwa v5, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_add_u16_e32 v5, 3, v46 -; VI-NEXT: v_add_u16_sdwa v6, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_add_u16_e32 v6, 3, v45 -; VI-NEXT: v_add_u16_sdwa v7, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u16_e32 v7, 3, v44 -; VI-NEXT: v_add_u16_sdwa v8, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v43 -; VI-NEXT: v_add_u16_sdwa v9, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v42 -; VI-NEXT: v_add_u16_sdwa v10, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v41 -; VI-NEXT: v_add_u16_sdwa v11, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v40 -; VI-NEXT: v_add_u16_sdwa v12, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v55 -; VI-NEXT: v_add_u16_sdwa v13, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v54 -; VI-NEXT: v_add_u16_sdwa v14, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
-; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v53 -; VI-NEXT: v_add_u16_sdwa v15, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: v_add_u16_e32 v15, 3, v52 -; VI-NEXT: v_add_u16_sdwa v16, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: v_add_u16_e32 v16, 3, v51 -; VI-NEXT: v_add_u16_sdwa v17, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: v_add_u16_e32 v17, 3, v50 -; VI-NEXT: v_add_u16_sdwa v18, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: v_add_u16_e32 v18, 3, v49 -; VI-NEXT: v_add_u16_sdwa v19, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: v_add_u16_e32 v19, 3, v48 -; VI-NEXT: v_add_u16_sdwa v20, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: v_add_u16_e32 v20, 3, v39 -; VI-NEXT: v_add_u16_sdwa v21, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: v_add_u16_e32 v21, 3, v38 -; VI-NEXT: v_add_u16_sdwa v22, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: v_add_u16_e32 v22, 3, v37 -; VI-NEXT: v_add_u16_sdwa v23, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: v_add_u16_e32 v23, 3, v36 -; VI-NEXT: v_add_u16_sdwa v24, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: v_add_u16_e32 v24, 3, v35 -; VI-NEXT: v_add_u16_sdwa v25, v35, v27 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: v_add_u16_e32 v25, 3, v34 -; VI-NEXT: v_add_u16_sdwa v26, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: v_add_u16_e32 v26, 3, v33 -; VI-NEXT: v_add_u16_sdwa v28, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v26, v28 -; VI-NEXT: v_add_u16_e32 v28, 3, v32 -; VI-NEXT: v_add_u16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 
16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB44_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB44_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: 
v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v56i16_to_v14i64: -; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v59, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; 
GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v37, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v38, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX9-NEXT: v_mov_b32_e32 v39, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX9-NEXT: v_mov_b32_e32 v48, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: 
v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v9 -; GFX9-NEXT: v_mov_b32_e32 v43, v8 -; GFX9-NEXT: v_mov_b32_e32 v44, v7 -; GFX9-NEXT: v_mov_b32_e32 v45, v6 -; GFX9-NEXT: v_mov_b32_e32 v46, v5 -; GFX9-NEXT: v_mov_b32_e32 v47, v4 -; GFX9-NEXT: v_mov_b32_e32 v56, v3 -; GFX9-NEXT: v_mov_b32_e32 v57, v2 -; GFX9-NEXT: v_mov_b32_e32 v58, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa 
v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: 
v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v31 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14i64_to_v56f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; 
GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; 
GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; 
implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB21_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; 
GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; 
implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB44_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: 
s_cbranch_execz .LBB44_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 
16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB44_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 +; 
GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14i64: +; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -13689,995 +30657,2637 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 
op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14i64: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v67, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: 
v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; 
GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-FAKE16-NEXT: 
v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 
v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, 
v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <56 x half> + br label %end + +cmp.false: + %a3 = bitcast <14 x i64> %a to <56 x half> + br label %end + +end: + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi +} + +define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14i64_to_v56f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_readfirstlane_b32 s42, v1 +; SI-NEXT: v_readfirstlane_b32 s43, v2 +; SI-NEXT: v_readfirstlane_b32 s40, v3 +; SI-NEXT: v_readfirstlane_b32 s41, v4 +; SI-NEXT: v_readfirstlane_b32 s14, v5 +; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_readfirstlane_b32 s10, v9 +; SI-NEXT: v_readfirstlane_b32 s11, v10 +; SI-NEXT: v_readfirstlane_b32 s7, v11 +; SI-NEXT: v_readfirstlane_b32 s8, v12 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: 
s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; 
SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s16, s4, 16 +; SI-NEXT: s_lshr_b32 s17, s5, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: s_lshr_b32 s45, s19, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s46, s20, 16 +; SI-NEXT: s_lshr_b32 s47, s21, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s56, s22, 16 +; SI-NEXT: s_lshr_b32 s57, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s58, s24, 16 +; SI-NEXT: s_lshr_b32 s59, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s60, s26, 16 +; SI-NEXT: s_lshr_b32 s61, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s62, s28, 16 +; SI-NEXT: s_lshr_b32 s63, s29, 16 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_lshr_b32 s72, s42, 16 +; SI-NEXT: s_lshr_b32 s73, s43, 16 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, 
s41, 0 +; SI-NEXT: s_lshr_b32 s74, s40, 16 +; SI-NEXT: s_lshr_b32 s75, s41, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s76, s14, 16 +; SI-NEXT: s_lshr_b32 s77, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s78, s12, 16 +; SI-NEXT: s_lshr_b32 s79, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s88, s10, 16 +; SI-NEXT: s_lshr_b32 s89, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s90, s7, 16 +; SI-NEXT: s_lshr_b32 s91, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 s92, s6, 16 +; SI-NEXT: s_lshr_b32 s93, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v45, s5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s93 +; 
SI-NEXT: v_cvt_f32_f16_e32 v2, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s16 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_or_b32_e32 v47, v47, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v47, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v45, v45, v46 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: 
buffer_store_dword v45, v47, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v45, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v43, v43, v44 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v45, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v41, v41, v42 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v55, v40, v55 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v53, v54, v53 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v37, v38, v37 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v37, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v35, v36, v35 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v35, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v35, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v32, v34, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: 
v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v18, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; 
SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; 
implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v14i64_to_v56f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v28, s30, 0 +; VI-NEXT: v_writelane_b32 v28, s31, 1 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_writelane_b32 v28, s34, 2 +; VI-NEXT: v_readfirstlane_b32 s43, v0 +; VI-NEXT: v_readfirstlane_b32 s42, v1 +; VI-NEXT: v_readfirstlane_b32 s41, v2 +; VI-NEXT: v_readfirstlane_b32 s40, v3 +; VI-NEXT: v_readfirstlane_b32 s15, v4 +; VI-NEXT: v_readfirstlane_b32 s14, v5 +; VI-NEXT: v_readfirstlane_b32 s13, v6 +; VI-NEXT: v_readfirstlane_b32 s12, v7 +; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s10, v9 +; VI-NEXT: v_readfirstlane_b32 s9, v10 +; VI-NEXT: v_readfirstlane_b32 s8, v11 +; VI-NEXT: v_readfirstlane_b32 s6, v12 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v13 +; VI-NEXT: v_writelane_b32 v28, s35, 3 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 16 +; VI-NEXT: s_lshr_b32 
s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s41, s41, 3 +; VI-NEXT: s_addc_u32 s40, s40, 0 +; VI-NEXT: s_add_u32 s43, s43, 3 +; VI-NEXT: s_addc_u32 s42, s42, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s44, s7, 16 +; VI-NEXT: s_lshr_b32 s45, s6, 16 +; VI-NEXT: s_lshr_b32 s46, s8, 16 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: s_lshr_b32 s56, s10, 16 +; VI-NEXT: s_lshr_b32 s57, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s12, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 16 +; VI-NEXT: s_lshr_b32 s61, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s40, 16 +; VI-NEXT: s_lshr_b32 s63, s41, 16 +; VI-NEXT: s_lshr_b32 s72, s42, 16 +; VI-NEXT: s_lshr_b32 s73, s43, 16 +; VI-NEXT: s_lshr_b32 s74, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 16 +; VI-NEXT: s_lshr_b32 s76, s27, 16 +; VI-NEXT: s_lshr_b32 s77, s26, 16 +; VI-NEXT: s_lshr_b32 s78, s25, 16 +; VI-NEXT: s_lshr_b32 
s79, s24, 16 +; VI-NEXT: s_lshr_b32 s88, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s22, 16 +; VI-NEXT: s_lshr_b32 s90, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 +; VI-NEXT: s_lshr_b32 s30, s19, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b32 s34, s17, 16 +; VI-NEXT: s_lshr_b32 s35, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s35, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s34, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s31, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s30, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s91, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s90, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s89, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s88, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s79, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s78, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s77, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s76, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s75, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s74, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s29, s73, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 
0xffff, s42 +; VI-NEXT: s_lshl_b32 s42, s72, 16 +; VI-NEXT: s_or_b32 s29, s29, s42 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s42, s63, 16 +; VI-NEXT: s_or_b32 s41, s41, s42 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s42, s62, 16 +; VI-NEXT: s_or_b32 s40, s40, s42 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s42, s61, 16 +; VI-NEXT: s_or_b32 s15, s15, s42 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s42, s60, 16 +; VI-NEXT: s_or_b32 s14, s14, s42 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s42, s59, 16 +; VI-NEXT: s_or_b32 s13, s13, s42 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s42, s58, 16 +; VI-NEXT: s_or_b32 s12, s12, s42 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s42, s57, 16 +; VI-NEXT: s_or_b32 s11, s11, s42 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s42, s56, 16 +; VI-NEXT: s_or_b32 s10, s10, s42 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s42, s47, 16 +; VI-NEXT: s_or_b32 s9, s9, s42 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s42, s46, 16 +; VI-NEXT: s_or_b32 s8, s8, s42 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s42, s45, 16 +; VI-NEXT: s_or_b32 s6, s6, s42 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s42, s44, 16 +; VI-NEXT: s_or_b32 s7, s7, s42 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s41 +; VI-NEXT: v_mov_b32_e32 
v17, s40 +; VI-NEXT: v_mov_b32_e32 v18, s15 +; VI-NEXT: v_mov_b32_e32 v19, s14 +; VI-NEXT: v_mov_b32_e32 v20, s13 +; VI-NEXT: v_mov_b32_e32 v21, s12 +; VI-NEXT: v_mov_b32_e32 v22, s11 +; VI-NEXT: v_mov_b32_e32 v23, s10 +; VI-NEXT: v_mov_b32_e32 v24, s9 +; VI-NEXT: v_mov_b32_e32 v25, s8 +; VI-NEXT: v_mov_b32_e32 v26, s6 +; VI-NEXT: v_mov_b32_e32 v27, s7 +; VI-NEXT: v_readlane_b32 s35, v28, 3 +; VI-NEXT: v_readlane_b32 s34, v28, 2 +; VI-NEXT: v_readlane_b32 s31, v28, 1 +; VI-NEXT: v_readlane_b32 s30, v28, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v14i64_to_v56f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; 
GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s42, s42, 3 +; GFX9-NEXT: s_addc_u32 s43, s43, 0 +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: 
s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s44, s43, 16 +; GFX9-NEXT: s_lshr_b32 s45, s42, 16 +; GFX9-NEXT: s_lshr_b32 s46, s41, 16 +; GFX9-NEXT: s_lshr_b32 s47, s40, 16 +; GFX9-NEXT: s_lshr_b32 s56, s15, 16 +; GFX9-NEXT: s_lshr_b32 s57, s14, 16 +; GFX9-NEXT: s_lshr_b32 s58, s13, 16 +; GFX9-NEXT: s_lshr_b32 s59, s12, 16 +; GFX9-NEXT: s_lshr_b32 s60, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s10, 16 +; GFX9-NEXT: s_lshr_b32 s62, s9, 16 +; GFX9-NEXT: s_lshr_b32 s63, s8, 16 +; GFX9-NEXT: s_lshr_b32 s72, s7, 16 +; GFX9-NEXT: s_lshr_b32 s73, s6, 16 +; GFX9-NEXT: s_lshr_b32 s74, s29, 16 +; GFX9-NEXT: s_lshr_b32 s75, s28, 16 +; GFX9-NEXT: s_lshr_b32 s76, s27, 16 +; GFX9-NEXT: s_lshr_b32 s77, s26, 16 +; GFX9-NEXT: s_lshr_b32 s78, s25, 16 +; GFX9-NEXT: s_lshr_b32 s79, s24, 16 +; GFX9-NEXT: s_lshr_b32 s88, s23, 16 +; GFX9-NEXT: s_lshr_b32 s89, s22, 16 +; GFX9-NEXT: s_lshr_b32 s90, s21, 16 +; GFX9-NEXT: s_lshr_b32 s91, s20, 16 +; GFX9-NEXT: s_lshr_b32 s92, s19, 16 +; GFX9-NEXT: s_lshr_b32 s93, s18, 16 +; GFX9-NEXT: s_lshr_b32 s94, s17, 16 +; GFX9-NEXT: s_lshr_b32 s95, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s94 
+; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s45 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s44 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: 
v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr45 +; GFX9-NEXT: ; implicit-def: $sgpr44 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v7 +; GFX11-TRUE16-NEXT: 
v_readfirstlane_b32 s6, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v9 +; GFX11-TRUE16-NEXT: s_mov_b32 s90, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s11, s11, 3 +; 
GFX11-TRUE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s12, s12, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s77, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s13 :: v_dual_mov_b32 v19, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s11 :: v_dual_mov_b32 v21, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s9 :: v_dual_mov_b32 v23, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s7 :: v_dual_mov_b32 v25, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v27, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 +; +; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v9 +; GFX11-FAKE16-NEXT: s_mov_b32 s90, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s14, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s90 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s13, s13, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s12, s12, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 
s40, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s60 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s13 :: v_dual_mov_b32 v27, s12 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr15 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <56 x i16> %a, splat (i16 3) - %a2 = bitcast <56 x i16> %a1 to <14 x i64> + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <56 x half> br label %end cmp.false: - %a3 = bitcast <56 x i16> %a to <14 x i64> + %a3 = bitcast <14 x i64> %a to <56 x half> br label %end end: - %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <14 x i64> %phi + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi } -define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v14i64_to_v56f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed 
$vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr42 -; 
GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, 
s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: 
buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v40 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_mov_b32_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v46 -; GCN-NEXT: v_mov_b32_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v56 -; GCN-NEXT: v_mov_b32_e32 v56, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v58 -; GCN-NEXT: v_mov_b32_e32 v58, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v60 -; GCN-NEXT: v_mov_b32_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v62 -; GCN-NEXT: v_mov_b32_e32 v62, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v10 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v55, v57 -; GCN-NEXT: v_mov_b32_e32 v57, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v59 -; GCN-NEXT: v_mov_b32_e32 v59, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v63 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, 
v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; GCN-NEXT: 
v_lshrrev_b32_e32 v60, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: 
v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v39 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v37 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v38 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v36 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v21, v21, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v32 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v23, v23, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v25, v25, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, 
s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v61 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v59 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v57 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v47 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v45 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v43 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v42 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v46 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v56 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v58 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x68, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v60 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x6c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v62 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_or_b32_e32 v45, v47, v46 -; GCN-NEXT: v_or_b32_e32 v46, v57, v56 -; GCN-NEXT: v_or_b32_e32 v47, v59, v58 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v39, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v40, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v56f16_to_v14i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; 
SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: 
$vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: 
v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v21, v60, v21 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v23, v48, v23 +; SI-NEXT: v_or_b32_e32 v24, v38, v24 +; SI-NEXT: v_or_b32_e32 v25, v36, v25 +; SI-NEXT: v_or_b32_e32 v26, v34, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword 
v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 +; SI-NEXT: v_or_b32_e32 v20, v62, v20 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 
offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 
v17, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 
v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: v_add_f32_e32 v22, 
0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v14i64_to_v56f16: +; VI-LABEL: bitcast_v56f16_to_v14i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v27 +; VI-NEXT: v_mov_b32_e32 v33, v26 +; VI-NEXT: v_mov_b32_e32 v34, v25 +; VI-NEXT: v_mov_b32_e32 v35, v24 +; VI-NEXT: v_mov_b32_e32 v36, v23 +; 
VI-NEXT: v_mov_b32_e32 v37, v22 +; VI-NEXT: v_mov_b32_e32 v38, v21 +; VI-NEXT: v_mov_b32_e32 v39, v20 +; VI-NEXT: v_mov_b32_e32 v48, v19 +; VI-NEXT: v_mov_b32_e32 v49, v18 +; VI-NEXT: v_mov_b32_e32 v50, v17 +; VI-NEXT: v_mov_b32_e32 v51, v16 +; VI-NEXT: v_mov_b32_e32 v52, v15 +; VI-NEXT: v_mov_b32_e32 v53, v14 +; VI-NEXT: v_mov_b32_e32 v54, v13 +; VI-NEXT: v_mov_b32_e32 v55, v12 +; VI-NEXT: v_mov_b32_e32 v40, v11 +; VI-NEXT: v_mov_b32_e32 v41, v10 +; VI-NEXT: v_mov_b32_e32 v42, v9 +; VI-NEXT: v_mov_b32_e32 v43, v8 +; VI-NEXT: v_mov_b32_e32 v44, v7 +; VI-NEXT: v_mov_b32_e32 v45, v6 +; VI-NEXT: v_mov_b32_e32 v46, v5 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: v_mov_b32_e32 v57, v2 +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_mov_b32_e32 v59, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v27, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v27 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: 
v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; 
VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB22_4: ; %end +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: .LBB46_2: ; %Flow +; VI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB46_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v27, 0x200 +; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v59 +; VI-NEXT: v_add_f16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v57 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v3, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v56 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_f16_sdwa v4, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v47 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_add_f16_sdwa v5, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v46 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_add_f16_sdwa v6, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v45 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_add_f16_sdwa v7, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v44 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_f16_sdwa v8, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v43 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_add_f16_sdwa v9, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v42 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; 
VI-NEXT: v_add_f16_sdwa v10, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v41 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_add_f16_sdwa v11, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v40 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_add_f16_sdwa v12, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v12, v13, v12 +; VI-NEXT: v_add_f16_sdwa v13, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v26, v28, v26 +; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; 
VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v14i64_to_v56f16: +; GFX9-LABEL: bitcast_v56f16_to_v14i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v37, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v38, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX9-NEXT: v_mov_b32_e32 v39, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX9-NEXT: v_mov_b32_e32 v48, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded 
Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v9 +; GFX9-NEXT: v_mov_b32_e32 v43, v8 +; GFX9-NEXT: v_mov_b32_e32 v44, v7 +; GFX9-NEXT: v_mov_b32_e32 v45, v6 +; GFX9-NEXT: v_mov_b32_e32 v46, v5 +; GFX9-NEXT: v_mov_b32_e32 v47, v4 +; GFX9-NEXT: v_mov_b32_e32 v56, v3 +; GFX9-NEXT: v_mov_b32_e32 v57, v2 +; GFX9-NEXT: v_mov_b32_e32 v58, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB46_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: 
v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr45 @@ -14697,150 +33307,208 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 
offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 ; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; 
GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, 
v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; 
GFX9-NEXT: s_movk_i32 s7, 0x200 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, 
v27, s6 +; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 
-; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56f16: +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14i64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -14848,1108 +33516,891 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: 
s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56f16: +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14i64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; 
GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <14 x i64> %a, splat (i64 3) - %a2 = bitcast <14 x i64> %a1 to <56 x half> - br label %end - -cmp.false: - %a3 = bitcast <14 x i64> %a to <56 x half> - br label %end - -end: - %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <56 x half> %phi -} - -define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v56f16_to_v14i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 -; 
GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v49 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v34 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v31 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; GCN-NEXT: v_or_b32_e32 v0, v46, v0 -; GCN-NEXT: v_or_b32_e32 v1, v44, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; GCN-NEXT: v_or_b32_e32 v2, v42, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v51 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: v_or_b32_e32 v16, v55, v16 -; GCN-NEXT: v_or_b32_e32 v17, v52, v17 -; GCN-NEXT: v_or_b32_e32 v18, v50, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: v_or_b32_e32 v20, v38, v20 -; GCN-NEXT: v_or_b32_e32 v21, v36, v21 -; GCN-NEXT: v_or_b32_e32 v22, v34, v22 -; GCN-NEXT: v_or_b32_e32 v23, v33, v23 -; GCN-NEXT: v_or_b32_e32 v24, v35, v24 -; GCN-NEXT: v_or_b32_e32 v25, v37, v25 -; GCN-NEXT: v_or_b32_e32 v26, v39, v26 -; GCN-NEXT: v_or_b32_e32 v27, v49, v27 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; 
GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: 
$vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v44 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, 
v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v55 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v52 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v50 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v48 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v38 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v36 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_or_b32_e32 v15, v17, v16 -; GCN-NEXT: v_or_b32_e32 v16, v19, v18 -; GCN-NEXT: v_or_b32_e32 v17, v21, v20 -; GCN-NEXT: v_or_b32_e32 v18, v23, v22 -; GCN-NEXT: v_or_b32_e32 v19, v25, v24 -; GCN-NEXT: v_or_b32_e32 v20, v27, v26 -; GCN-NEXT: v_or_b32_e32 v21, v29, v28 -; GCN-NEXT: v_or_b32_e32 v22, v31, v30 -; GCN-NEXT: v_or_b32_e32 v23, v33, v32 -; GCN-NEXT: v_or_b32_e32 v24, v35, v34 -; GCN-NEXT: v_or_b32_e32 v25, v37, v36 -; GCN-NEXT: v_or_b32_e32 v26, v39, v38 -; GCN-NEXT: v_or_b32_e32 v27, v49, v48 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v56f16_to_v14i64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte 
Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v27 -; VI-NEXT: v_mov_b32_e32 v33, v26 -; VI-NEXT: v_mov_b32_e32 v34, v25 -; VI-NEXT: v_mov_b32_e32 v35, v24 -; VI-NEXT: v_mov_b32_e32 v36, v23 -; VI-NEXT: v_mov_b32_e32 v37, v22 -; VI-NEXT: v_mov_b32_e32 v38, v21 -; VI-NEXT: v_mov_b32_e32 v39, v20 -; VI-NEXT: v_mov_b32_e32 v48, v19 -; VI-NEXT: v_mov_b32_e32 v49, v18 -; VI-NEXT: v_mov_b32_e32 v50, v17 -; VI-NEXT: v_mov_b32_e32 v51, v16 -; VI-NEXT: v_mov_b32_e32 v52, v15 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v54, v13 -; VI-NEXT: v_mov_b32_e32 v55, v12 -; VI-NEXT: v_mov_b32_e32 v40, v11 -; VI-NEXT: v_mov_b32_e32 v41, v10 -; VI-NEXT: v_mov_b32_e32 v42, v9 -; VI-NEXT: v_mov_b32_e32 v43, v8 -; VI-NEXT: v_mov_b32_e32 v44, v7 -; VI-NEXT: v_mov_b32_e32 v45, v6 -; VI-NEXT: v_mov_b32_e32 v46, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v56, v3 -; VI-NEXT: v_mov_b32_e32 v57, v2 -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <14 x i64> + 
br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} + +define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56f16_to_v14i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 +; 
SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; 
SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 
offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: 
v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_or_b32_e32 v13, v36, v13 +; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v17, v37, v17 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v50, v27 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v8, 
0x38000000, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; 
SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: 
v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded 
Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, 
v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v52, v37 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v29, v37 +; SI-NEXT: v_mov_b32_e32 v37, v52 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v56f16_to_v14i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: 
s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v27, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v27, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v27, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v27, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v27, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v27, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v27, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v27, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v27, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v27, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v27, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v27, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v27, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v27, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v27, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v27, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v27, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v27, v33 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v46, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v45, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v44, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v43, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v41, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v53, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v52, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v36, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v35, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v34, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v33, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: 
; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v27, 0x200 -; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v59 -; VI-NEXT: v_add_f16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v58 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: 
s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 
v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v1, v3, v2 -; VI-NEXT: v_add_f16_sdwa v2, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v57 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_add_f16_sdwa v3, v56, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v56 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 ; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_add_f16_sdwa v4, v47, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v47 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_add_f16_sdwa v5, v46, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v46 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 ; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: v_add_f16_sdwa v6, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v45 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 
s22, v13 ; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: v_add_f16_sdwa v7, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v44 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 ; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_f16_sdwa v8, v43, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v43 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 ; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_f16_sdwa v9, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, 0x200, v42 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_f16_sdwa v10, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v41 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 ; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_f16_sdwa v11, v40, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v40 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 ; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_f16_sdwa v12, v55, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: 
v_add_f16_e32 v13, 0x200, v55 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_f16_sdwa v13, v54, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v54 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v27, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 ; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 ; VI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -15988,171 +34439,1077 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v25, v26, v25 ; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v26, v28, v26 -; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 -; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB23_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded 
Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v56f16_to_v14i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill 
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v59, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v37, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v38, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX9-NEXT: v_mov_b32_e32 v39, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX9-NEXT: v_mov_b32_e32 v48, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; 
GFX9-NEXT: v_mov_b32_e32 v51, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v9 -; GFX9-NEXT: v_mov_b32_e32 v43, v8 -; GFX9-NEXT: v_mov_b32_e32 v44, v7 -; GFX9-NEXT: v_mov_b32_e32 v45, v6 -; GFX9-NEXT: v_mov_b32_e32 v46, v5 -; GFX9-NEXT: v_mov_b32_e32 v47, v4 -; GFX9-NEXT: v_mov_b32_e32 v56, v3 -; GFX9-NEXT: v_mov_b32_e32 v57, v2 -; GFX9-NEXT: v_mov_b32_e32 v58, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: v_or_b32_e32 v26, v28, v26 +; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v56f16_to_v14i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, 
s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 
v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, 
v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: 
v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 
v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB47_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch 
.LBB47_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: 
v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB47_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <14 x i64> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} + +define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v14f64_to_v56i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 
; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; 
SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v36, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v38, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v48, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 
1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_alignbit_b32 v29, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v32, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v34, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v36, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v38, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v48, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v51, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v40, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v44 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f64_to_v56i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 
+; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 
v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f64_to_v56i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr45 @@ -16172,208 +35529,136 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; 
GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB23_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: s_movk_i32 s7, 0x200 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; 
GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v9, 
v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: 
s_cbranch_execz .LBB48_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; 
GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, 
off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v53, 
v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14i64: +; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56i16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -16381,580 +35666,654 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 
op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; 
GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14i64: +; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 
0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: 
v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, 
v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <56 x half> %a, splat (half 0xH0200) - %a2 = bitcast <56 x half> %a1 to <14 x i64> - br label %end - -cmp.false: - %a3 = bitcast <56 x half> %a to <14 x i64> - br label %end - -end: - %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <14 x i64> %phi -} - -define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f64_to_v56i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded 
Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: 
v_alignbit_b32 v34, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v45, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_alignbit_b32 v46, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_alignbit_b32 v29, v28, v27, 16 -; GCN-NEXT: 
v_alignbit_b32 v30, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v32, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v48, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v45, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_alignbit_b32 v46, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 
v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v59, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v46, v57, v46 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v56, v58, v56 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v3, v3, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 24, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v4, v4, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v5, v5, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v6, v6, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v7, v7, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 40, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v8, v8, v43 -; GCN-NEXT: 
v_add_i32_e32 v43, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v9, v9, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v10, v10, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 52, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v11, v11, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v12, v12, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v13, v13, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v14, v14, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v15, v15, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v16, v16, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x4c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v17, v17, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x50, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v18, v18, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x54, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v19, v19, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x58, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v20, v20, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x5c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v21, v21, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v22, v22, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v23, v23, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x68, v0 -; GCN-NEXT: v_lshlrev_b32_e32 
v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v24, v24, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x6c, v0 -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: v_or_b32_e32 v26, v26, v49 -; GCN-NEXT: v_or_b32_e32 v27, v27, v29 -; GCN-NEXT: v_or_b32_e32 v28, v28, v39 -; GCN-NEXT: buffer_store_dword v46, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], 
s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v14f64_to_v56i16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: 
$vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: 
v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; 
VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB24_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <14 x double> %a1 to <56 x i16> + br label %end + +cmp.false: + %a3 = bitcast <14 x double> %a to <56 x i16> + br label %end + +end: + %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x i16> %phi +} + +define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f64_to_v56i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; 
SI-NEXT: v_mov_b32_e32 v27, s16 +; SI-NEXT: v_mov_b32_e32 v28, s17 +; SI-NEXT: v_mov_b32_e32 v23, s18 +; SI-NEXT: v_mov_b32_e32 v24, s19 +; SI-NEXT: v_mov_b32_e32 v25, s20 +; SI-NEXT: v_mov_b32_e32 v26, s21 +; SI-NEXT: v_mov_b32_e32 v21, s22 +; SI-NEXT: v_mov_b32_e32 v22, s23 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_mov_b32_e32 v17, s26 +; SI-NEXT: v_mov_b32_e32 v18, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v29, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v36, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v38, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v48, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v51, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v53, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; 
SI-NEXT: v_alignbit_b32 v44, v28, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v28 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_alignbit_b32 v29, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v36, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v38, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v48, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v51, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v53, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 +; 
SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v44, v28, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v28 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_or_b32_e32 v27, v27, v44 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v42 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 
16, v40 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v14f64_to_v56i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v19, s16 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v32, s20 +; VI-NEXT: v_mov_b32_e32 v33, s21 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v26, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; VI-NEXT: 
v_lshrrev_b32_e32 v39, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v37, 
16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v19, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v47 +; VI-NEXT: v_or_b32_sdwa v32, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; VI-NEXT: v_or_b32_sdwa v33, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, 
v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v48, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v49, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -16963,53 +36322,87 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, 
s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa 
v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v28 +; VI-NEXT: v_mov_b32_e32 v1, v29 +; VI-NEXT: v_mov_b32_e32 v2, v30 +; VI-NEXT: v_mov_b32_e32 v3, v31 +; VI-NEXT: v_mov_b32_e32 v4, v32 +; VI-NEXT: v_mov_b32_e32 v5, v33 +; VI-NEXT: v_mov_b32_e32 v6, v34 +; VI-NEXT: v_mov_b32_e32 v7, v35 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v10, v38 +; VI-NEXT: v_mov_b32_e32 v11, v39 +; VI-NEXT: v_mov_b32_e32 v12, v48 +; VI-NEXT: v_mov_b32_e32 v13, v49 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_branch .LBB49_2 ; -; GFX9-LABEL: bitcast_v14f64_to_v56i16: +; GFX9-LABEL: bitcast_v14f64_to_v56i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 
0, v28 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v32, s20 +; GFX9-NEXT: v_mov_b32_e32 v33, s21 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s23 +; GFX9-NEXT: v_mov_b32_e32 v23, s24 +; GFX9-NEXT: v_mov_b32_e32 v24, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -17018,77 +36411,38 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; 
implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; 
GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -17096,45 +36450,84 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 
v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v32, v47, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v33, v46, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 +; GFX9-NEXT: 
v_and_b32_e32 v15, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v17, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v18, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v40, 16, v0 ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -17143,62 +36536,418 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v50, 
v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v9, v37 +; GFX9-NEXT: v_mov_b32_e32 v10, v38 +; GFX9-NEXT: v_mov_b32_e32 v11, v39 +; GFX9-NEXT: v_mov_b32_e32 v12, v48 +; GFX9-NEXT: v_mov_b32_e32 v13, v49 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; 
implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_branch .LBB49_2 ; -; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56i16: +; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v16, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v14, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v13 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v66, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-TRUE16-NEXT: .LBB49_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 
0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v64, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_and_b32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v67, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v66, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v65, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 ; -; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56i16: +; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v67, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-FAKE16-NEXT: .LBB49_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 
v21, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 
v1, v31 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 @@ -17216,127 +36965,7 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; 
GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-FAKE16-NEXT: 
v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -17355,521 +36984,493 @@ end: } define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v56i16_to_v14f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v34, v26 -; GCN-NEXT: v_mov_b32_e32 v35, v24 -; GCN-NEXT: v_mov_b32_e32 v36, v22 -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; 
%bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v41 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v63 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v62 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v61 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v60 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v59 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v58 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v56 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v36 -; GCN-NEXT: v_or_b32_e32 v11, v11, v46 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v35 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v12, v13 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v40 -; GCN-NEXT: 
v_and_b32_e32 v21, 0xffff, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v28 -; GCN-NEXT: buffer_load_dword v28, 
off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; 
implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: 
$vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v34 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v57 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v0, v41, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v63, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v62, v4 -; 
GCN-NEXT: v_or_b32_e32 v5, v61, v5 -; GCN-NEXT: v_or_b32_e32 v6, v60, v6 -; GCN-NEXT: v_or_b32_e32 v7, v59, v7 -; GCN-NEXT: v_or_b32_e32 v8, v58, v8 -; GCN-NEXT: v_or_b32_e32 v9, v56, v9 -; GCN-NEXT: v_or_b32_e32 v10, v47, v10 -; GCN-NEXT: v_or_b32_e32 v11, v46, v11 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v28, v12 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v28, v13 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v28, v16 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v28, v17 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v28, v18 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v28, v19 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v28, v20 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v28, v21 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v22, v28, v22 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v28, v23 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v28, v24 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v28, v25 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v28, v26 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v28, v27 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: 
v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v56i16_to_v14f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 
offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 
offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; 
kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v0, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v51 +; SI-NEXT: v_or_b32_e32 v16, v16, v49 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: s_waitcnt 
vmcnt(14) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v57 +; SI-NEXT: v_or_b32_e32 v4, v4, v48 +; SI-NEXT: v_or_b32_e32 v5, v5, v39 +; SI-NEXT: v_or_b32_e32 v6, v6, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v45 +; SI-NEXT: v_or_b32_e32 v9, v9, v44 +; SI-NEXT: v_or_b32_e32 v10, v10, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v33 +; SI-NEXT: v_or_b32_e32 v12, v12, v32 +; SI-NEXT: v_or_b32_e32 v13, v13, v63 +; SI-NEXT: v_or_b32_e32 v14, v14, v61 +; SI-NEXT: v_or_b32_e32 v15, v15, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v56 
+; SI-NEXT: v_or_b32_e32 v18, v18, v47 +; SI-NEXT: v_or_b32_e32 v19, v19, v38 +; SI-NEXT: v_or_b32_e32 v20, v20, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v35 +; SI-NEXT: v_or_b32_e32 v22, v22, v43 +; SI-NEXT: v_or_b32_e32 v23, v23, v42 +; SI-NEXT: v_or_b32_e32 v24, v24, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: v_or_b32_e32 v26, v26, v62 +; SI-NEXT: v_or_b32_e32 v27, v27, v60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v2, v51, v2 +; SI-NEXT: v_or_b32_e32 v16, v49, v16 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; 
SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v3, v57, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 +; SI-NEXT: v_or_b32_e32 v6, v46, v6 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v8, v45, v8 +; SI-NEXT: v_or_b32_e32 v9, v44, v9 +; SI-NEXT: v_or_b32_e32 v10, v34, v10 +; SI-NEXT: v_or_b32_e32 v11, v33, v11 +; SI-NEXT: v_or_b32_e32 v12, v32, v12 +; SI-NEXT: v_or_b32_e32 v13, v63, v13 +; SI-NEXT: v_or_b32_e32 v14, v61, v14 +; SI-NEXT: v_or_b32_e32 v15, v50, v15 +; SI-NEXT: v_or_b32_e32 v17, v56, v17 +; SI-NEXT: v_or_b32_e32 v18, v47, v18 +; SI-NEXT: v_or_b32_e32 v19, v38, v19 +; SI-NEXT: v_or_b32_e32 v20, v36, v20 +; SI-NEXT: v_or_b32_e32 v21, v35, v21 +; SI-NEXT: v_or_b32_e32 v22, v43, v22 +; SI-NEXT: v_or_b32_e32 v23, v42, v23 +; SI-NEXT: v_or_b32_e32 v24, v41, v24 +; SI-NEXT: v_or_b32_e32 v25, v40, v25 +; SI-NEXT: v_or_b32_e32 v26, v62, v26 +; SI-NEXT: v_or_b32_e32 v27, v60, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; 
SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v27 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 
s[30:31] ; ; VI-LABEL: bitcast_v56i16_to_v14f64: ; VI: ; %bb.0: @@ -17918,7 +37519,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v27, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -18005,9 +37606,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v27, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v59 @@ -18094,7 +37695,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v28, 3, v32 ; VI-NEXT: v_add_u16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -18111,150 +37712,2547 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v56i16_to_v14f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v59, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 
v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v37, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v38, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 -; GFX9-NEXT: v_mov_b32_e32 v39, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 -; GFX9-NEXT: v_mov_b32_e32 v48, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; 
GFX9-NEXT: v_mov_b32_e32 v55, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v9 -; GFX9-NEXT: v_mov_b32_e32 v43, v8 -; GFX9-NEXT: v_mov_b32_e32 v44, v7 -; GFX9-NEXT: v_mov_b32_e32 v45, v6 -; GFX9-NEXT: v_mov_b32_e32 v46, v5 -; GFX9-NEXT: v_mov_b32_e32 v47, v4 -; GFX9-NEXT: v_mov_b32_e32 v56, v3 -; GFX9-NEXT: v_mov_b32_e32 v57, v2 -; GFX9-NEXT: v_mov_b32_e32 v58, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; 
GFX9-LABEL: bitcast_v56i16_to_v14f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, 
off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v37, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v38, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX9-NEXT: v_mov_b32_e32 v39, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GFX9-NEXT: v_mov_b32_e32 v48, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v9 +; GFX9-NEXT: v_mov_b32_e32 v43, v8 +; GFX9-NEXT: v_mov_b32_e32 v44, v7 +; GFX9-NEXT: v_mov_b32_e32 v45, v6 +; GFX9-NEXT: v_mov_b32_e32 v46, v5 +; GFX9-NEXT: v_mov_b32_e32 v47, v4 +; GFX9-NEXT: v_mov_b32_e32 v56, v3 +; GFX9-NEXT: v_mov_b32_e32 v57, v2 +; GFX9-NEXT: v_mov_b32_e32 v58, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; 
implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; 
GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; 
implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: .LBB50_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB50_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 +; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 +; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 +; 
GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB50_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 
offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56i16_to_v14f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 
offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v39, off, 
s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v32, v26 +; SI-NEXT: v_mov_b32_e32 v33, v24 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v36, v18 +; SI-NEXT: v_mov_b32_e32 v37, v16 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; 
SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v13, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v14, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v16, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v17, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 
s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, 
s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: 
v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, 
v31, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v44, v41 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mov_b32_e32 v41, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 +; SI-NEXT: v_mov_b32_e32 v36, v33 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v48 +; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v34, v28 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v57, v46 +; SI-NEXT: v_mov_b32_e32 v46, v61 +; SI-NEXT: v_mov_b32_e32 v61, v58 +; SI-NEXT: v_mov_b32_e32 v58, v47 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v31, v47 +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 +; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v46, v57 +; SI-NEXT: v_mov_b32_e32 
v57, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: v_mov_b32_e32 v34, v37 +; SI-NEXT: v_mov_b32_e32 v37, v48 +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v33 +; SI-NEXT: v_mov_b32_e32 v33, v36 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v56, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v56i16_to_v14f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: 
v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: 
s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: 
s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v56i16_to_v14f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; 
GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, 
v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v48, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 
0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 
3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, 
v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, 
s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v14f64_to_v56f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; 
SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, 
off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; 
SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; 
SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 +; SI-NEXT: v_mov_b32_e32 v29, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], 
s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; SI-NEXT: 
v_lshrrev_b32_e32 v51, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[54:55], v[1:2], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v11 +; SI-NEXT: 
v_lshrrev_b32_e32 v58, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: 
v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v47, v26 +; SI-NEXT: v_mov_b32_e32 v45, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, 
off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, 
v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 
offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; 
SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14f64_to_v56f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; 
VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB52_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v30, 
16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, 
v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14f64_to_v56f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr45 @@ -18274,207 +40272,136 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 
-; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB25_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte 
Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 -; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 -; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 -; GFX9-NEXT: v_perm_b32 v6, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v7, v60, v44, s6 -; GFX9-NEXT: v_perm_b32 v8, v63, v43, s6 -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, 
v19, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 
-; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB52_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[26:27], 
v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 
4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 
offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 +; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14f64: +; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -18482,890 +40409,843 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14f64: +; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v67, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 
16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: 
v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <56 x i16> %a, splat (i16 3) - %a2 = bitcast <56 x i16> %a1 to <14 x double> + %a1 = fadd <14 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <14 x double> %a1 to <56 x half> br label %end cmp.false: - %a3 = bitcast <56 x i16> %a to <14 x double> + %a3 = bitcast <14 x double> %a to <56 x half> br label %end end: - %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <14 x double> %phi + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi } -define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v14f64_to_v56f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; 
implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed 
$vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; kill: killed $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill 
-; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 
offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v40 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v44 -; GCN-NEXT: v_mov_b32_e32 v44, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v46 -; GCN-NEXT: v_mov_b32_e32 v46, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v56 -; GCN-NEXT: v_mov_b32_e32 v56, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v58 -; GCN-NEXT: v_mov_b32_e32 v58, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v60 -; GCN-NEXT: v_mov_b32_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v62 -; GCN-NEXT: v_mov_b32_e32 v62, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_mov_b32_e32 v43, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v45 -; GCN-NEXT: v_mov_b32_e32 v45, v9 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v53, v47 -; GCN-NEXT: v_mov_b32_e32 v47, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v57 -; GCN-NEXT: v_mov_b32_e32 v57, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v59 -; GCN-NEXT: v_mov_b32_e32 v59, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v63 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 
v51, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: 
v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v39 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v37 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v14, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v48 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v38 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v36 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v34 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v21, v21, v20 -; 
GCN-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v32 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v23, v23, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v25, v25, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v61 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v59 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v57 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; 
GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v47 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v45 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v43 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v42 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v44 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v46 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 
v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v56 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v58 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x68, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v60 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x6c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v62 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_or_b32_e32 v45, v47, v46 -; GCN-NEXT: v_or_b32_e32 v46, v57, v56 -; GCN-NEXT: v_or_b32_e32 v47, v59, v58 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, 
s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v14f64_to_v56f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; SI-NEXT: v_readfirstlane_b32 s40, v1 +; SI-NEXT: v_readfirstlane_b32 s41, v2 +; SI-NEXT: v_readfirstlane_b32 s14, v3 +; SI-NEXT: v_readfirstlane_b32 s15, v4 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_readfirstlane_b32 s13, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v9 +; SI-NEXT: v_readfirstlane_b32 s9, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: v_readfirstlane_b32 s7, v12 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: s_and_b64 s[42:43], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s42, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: s_lshr_b32 s42, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 +; SI-NEXT: s_lshr_b32 s42, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s42 +; SI-NEXT: s_lshr_b32 s42, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: s_lshr_b32 s42, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 +; SI-NEXT: s_lshr_b32 s42, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: s_lshr_b32 s42, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s42 +; SI-NEXT: s_lshr_b32 s42, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s42 +; SI-NEXT: s_lshr_b32 s42, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 +; SI-NEXT: s_lshr_b32 s42, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s42 +; SI-NEXT: s_lshr_b32 s42, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s42 +; SI-NEXT: s_lshr_b32 s42, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s42 +; SI-NEXT: s_lshr_b32 s42, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s42 +; SI-NEXT: s_lshr_b32 s42, s40, 16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v26, s42 +; SI-NEXT: s_lshr_b32 s42, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s42 +; SI-NEXT: s_lshr_b32 s42, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s42 +; SI-NEXT: s_lshr_b32 s42, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s42 +; SI-NEXT: s_lshr_b32 s42, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s42 +; SI-NEXT: s_lshr_b32 s42, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s42 +; SI-NEXT: s_lshr_b32 s42, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s42 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s42 +; SI-NEXT: s_lshr_b32 s42, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s42 +; SI-NEXT: s_lshr_b32 s42, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s42 +; SI-NEXT: s_lshr_b32 s42, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s42 +; SI-NEXT: s_lshr_b32 s42, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s42 +; SI-NEXT: s_lshr_b32 s42, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s42 +; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s42 +; SI-NEXT: s_lshr_b32 s42, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s6 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s9 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 +; SI-NEXT: 
v_cvt_f32_f16_e32 v50, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[42:43], s[18:19], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v42 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v42 +; SI-NEXT: v_add_f64 v[49:50], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[37:38], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 +; SI-NEXT: 
v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v44 +; SI-NEXT: v_add_f64 v[53:54], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[35:36], s[26:27], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v46 +; SI-NEXT: v_add_f64 v[30:31], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 +; 
SI-NEXT: v_cvt_f32_f16_e32 v47, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v61 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v49, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v6, v14, v6 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v43 +; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 
v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 +; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 +; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 +; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: 
v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 +; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; 
SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr18 +; 
SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: s_branch .LBB53_2 ; -; VI-LABEL: bitcast_v14f64_to_v56f16: +; VI-LABEL: bitcast_v14f64_to_v56f16_scalar: ; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: 
$vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: 
s_cbranch_execz .LBB26_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; VI-NEXT: .LBB26_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: 
v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v19, s16 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v15, s18 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v32, s20 +; VI-NEXT: v_mov_b32_e32 v33, s21 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v26, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB53_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: 
v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v19, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v15, v19 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v47 +; VI-NEXT: v_or_b32_sdwa v32, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 +; VI-NEXT: v_or_b32_sdwa v33, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v48, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v49 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v49, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 ; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -19374,53 +41254,87 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 
v30, 16, v30 -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v8, v8, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v28 +; VI-NEXT: v_mov_b32_e32 v1, v29 +; VI-NEXT: v_mov_b32_e32 v2, v30 +; VI-NEXT: v_mov_b32_e32 v3, v31 +; VI-NEXT: v_mov_b32_e32 v4, v32 +; VI-NEXT: v_mov_b32_e32 v5, v33 +; VI-NEXT: v_mov_b32_e32 v6, v34 +; VI-NEXT: v_mov_b32_e32 v7, v35 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v10, v38 +; VI-NEXT: v_mov_b32_e32 v11, v39 +; VI-NEXT: v_mov_b32_e32 v12, v48 +; VI-NEXT: v_mov_b32_e32 v13, v49 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; 
implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_branch .LBB53_2 ; -; GFX9-LABEL: bitcast_v14f64_to_v56f16: +; GFX9-LABEL: bitcast_v14f64_to_v56f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v32, s20 +; GFX9-NEXT: v_mov_b32_e32 v33, s21 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s23 +; GFX9-NEXT: v_mov_b32_e32 v23, s24 +; GFX9-NEXT: v_mov_b32_e32 v24, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v17, s28 +; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword 
v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -19429,77 +41343,38 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr28 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 @@ -19507,45 +41382,84 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 
v55, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v47, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v46, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v45, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v44, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v43, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v42, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v41, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v40, v7, s4 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v32, v47, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v33, v46, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v15, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v17, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v19 
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v18, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v40, 16, v0 ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -19554,62 +41468,418 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v8, v55, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v54, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v53, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v52, v11, s4 -; GFX9-NEXT: v_perm_b32 v12, v51, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v50, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v49, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v48, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v39, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v38, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v37, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v36, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v35, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v34, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v33, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v32, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v31, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v30, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v29, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; 
GFX9-NEXT: v_lshl_or_b32 v23, v53, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v52, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v51, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v50, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v9, v37 +; GFX9-NEXT: v_mov_b32_e32 v10, v38 +; GFX9-NEXT: v_mov_b32_e32 v11, v39 +; GFX9-NEXT: v_mov_b32_e32 v12, v48 +; GFX9-NEXT: v_mov_b32_e32 v13, v49 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; 
implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_branch .LBB53_2 ; -; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v16, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v14, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, 
vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; 
GFX11-TRUE16-NEXT: .LBB26_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v31, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-TRUE16-NEXT: .LBB53_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v64, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_and_b32 v1, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 
v30, v30, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v67, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v66, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v65, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 ; -; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56f16: +; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, s0 :: v_dual_mov_b32 v28, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, s2 :: v_dual_mov_b32 v26, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, s16 :: v_dual_mov_b32 v24, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, s18 :: v_dual_mov_b32 v22, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, s20 :: v_dual_mov_b32 v20, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s24 :: v_dual_mov_b32 v18, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s26 :: v_dual_mov_b32 v14, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, 
vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; 
GFX11-FAKE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX11-FAKE16-NEXT: 
.LBB53_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 
v18, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v64, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v52, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v38, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v67, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v65, 16, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v55, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v39, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v50, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v49, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v48, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v1, v31 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v32 :: v_dual_mov_b32 v5, v35 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr10 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 @@ -19627,127 +41897,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100 -; GFX11-FAKE16-NEXT: 
v_perm_b32 v15, v48, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -19766,703 +41916,685 @@ end: } define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v56f16_to_v14f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v52, 
off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v54 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v51 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v49 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v34 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, 
v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v31 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; GCN-NEXT: v_or_b32_e32 v0, v46, v0 -; GCN-NEXT: v_or_b32_e32 v1, v44, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; GCN-NEXT: v_or_b32_e32 v2, v42, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; 
GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v54 -; 
GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v51 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v28, v14 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v28, v15 -; GCN-NEXT: v_or_b32_e32 v16, v55, v16 -; GCN-NEXT: v_or_b32_e32 v17, v52, v17 -; GCN-NEXT: v_or_b32_e32 v18, v50, v18 -; GCN-NEXT: v_or_b32_e32 v19, v48, v19 -; GCN-NEXT: v_or_b32_e32 v20, v38, v20 -; GCN-NEXT: v_or_b32_e32 v21, v36, v21 -; GCN-NEXT: v_or_b32_e32 v22, v34, v22 -; GCN-NEXT: v_or_b32_e32 v23, v33, v23 -; GCN-NEXT: v_or_b32_e32 v24, v35, v24 -; GCN-NEXT: v_or_b32_e32 v25, v37, v25 -; GCN-NEXT: v_or_b32_e32 v26, v39, v26 -; GCN-NEXT: v_or_b32_e32 v27, v49, v27 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; 
kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; kill: killed $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 
-; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v44 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v41 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 
s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: 
v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v55 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v52 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v50 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v48 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v38 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v36 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: 
v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_or_b32_e32 v15, v17, v16 -; GCN-NEXT: v_or_b32_e32 v16, v19, v18 -; GCN-NEXT: v_or_b32_e32 v17, v21, v20 -; GCN-NEXT: v_or_b32_e32 v18, v23, v22 -; GCN-NEXT: v_or_b32_e32 v19, v25, v24 -; GCN-NEXT: v_or_b32_e32 v20, v27, v26 -; GCN-NEXT: v_or_b32_e32 v21, v29, v28 -; GCN-NEXT: v_or_b32_e32 v22, v31, v30 -; GCN-NEXT: v_or_b32_e32 v23, v33, v32 -; GCN-NEXT: v_or_b32_e32 v24, v35, v34 -; GCN-NEXT: v_or_b32_e32 v25, v37, v36 -; GCN-NEXT: v_or_b32_e32 v26, v39, v38 -; GCN-NEXT: v_or_b32_e32 v27, v49, v48 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; 
GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v56f16_to_v14f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v46, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; SI-NEXT: 
buffer_load_dword v51, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 
; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v33 +; 
SI-NEXT: v_cvt_f16_f32_e32 v33, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; 
implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v33 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v21, v60, v21 +; SI-NEXT: v_or_b32_e32 v22, v58, v22 +; SI-NEXT: v_or_b32_e32 v23, v48, v23 +; SI-NEXT: v_or_b32_e32 v24, v38, v24 +; SI-NEXT: v_or_b32_e32 v25, v36, v25 +; SI-NEXT: v_or_b32_e32 v26, v34, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: 
$vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v56 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v63 +; SI-NEXT: v_or_b32_e32 v20, v62, v20 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v33 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 
v7, v7 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 
0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: 
v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 
v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v63 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v57 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v35 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; 
SI-NEXT: v_cvt_f32_f16_e32 v27, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56f16_to_v14f64: ; VI: ; %bb.0: @@ -20511,7 +42643,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v27, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v27, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -20598,9 +42730,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v27, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -20687,7 +42819,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -20802,7 +42934,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload @@ -20942,9 +43074,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; kill: killed $vgpr28 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload @@ -21047,7 +43179,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 ; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -21076,7 +43208,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -21106,7 +43238,7 @@ 
define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -21174,7 +43306,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -21204,9 +43336,1343 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <14 x double> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <14 x double> + br label %end + +end: + %phi = phi <14 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x double> %phi +} + +define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a, i32 
inreg %b) { +; SI-LABEL: bitcast_v56f16_to_v14f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 +; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 
v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s16 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; 
SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v7, v47, v7 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_or_b32_e32 v8, v58, v8 +; SI-NEXT: v_mov_b32_e32 
v41, v60 +; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 +; SI-NEXT: v_or_b32_e32 v11, v45, v11 +; SI-NEXT: v_or_b32_e32 v12, v38, v12 +; SI-NEXT: v_or_b32_e32 v13, v36, v13 +; SI-NEXT: v_or_b32_e32 v14, v35, v14 +; SI-NEXT: v_or_b32_e32 v15, v32, v15 +; SI-NEXT: v_or_b32_e32 v17, v37, v17 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; 
SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v50, v27 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 +; SI-NEXT: 
v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, 
off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: 
v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], 
s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, 
s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: v_mov_b32_e32 v39, v11 +; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v52, v37 +; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v29, v37 +; SI-NEXT: v_mov_b32_e32 v37, v52 +; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v56f16_to_v14f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 
s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: v_mov_b32_e32 v32, v13 +; VI-NEXT: v_mov_b32_e32 v33, v12 +; VI-NEXT: v_mov_b32_e32 v34, v11 +; VI-NEXT: v_mov_b32_e32 v35, v10 +; VI-NEXT: v_mov_b32_e32 v36, v9 +; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_mov_b32_e32 v38, v7 +; VI-NEXT: v_mov_b32_e32 v39, v6 +; VI-NEXT: v_mov_b32_e32 v48, v5 +; VI-NEXT: v_mov_b32_e32 v49, v4 +; VI-NEXT: v_mov_b32_e32 v50, v3 +; VI-NEXT: v_mov_b32_e32 v51, v2 +; VI-NEXT: v_mov_b32_e32 v52, v1 +; VI-NEXT: v_mov_b32_e32 v53, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; 
VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v27, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v53, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v51, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v50, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v49, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v48, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v39, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v38, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v37, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v36, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v35, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v34, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v26, v28, v26 +; VI-NEXT: v_add_f16_sdwa v27, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: 
bitcast_v56f16_to_v14f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v13 +; GFX9-NEXT: v_mov_b32_e32 v33, v12 +; GFX9-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-NEXT: v_mov_b32_e32 v35, v10 +; GFX9-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-NEXT: v_mov_b32_e32 v37, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-NEXT: v_mov_b32_e32 v39, v6 +; GFX9-NEXT: v_mov_b32_e32 v48, v5 +; GFX9-NEXT: v_mov_b32_e32 v49, v4 +; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v51, v2 +; GFX9-NEXT: v_mov_b32_e32 v52, v1 +; GFX9-NEXT: v_mov_b32_e32 v53, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword 
v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v53 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 
+; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v26, 
0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v56, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v47, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v46, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v45, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v44, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v43, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v42, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v41, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v40, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v55, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v54, 16, v27 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v33, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v7 :: v_dual_mov_b32 v35, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v5 :: v_dual_mov_b32 v37, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_mov_b32 v49, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s59 :: v_dual_mov_b32 v5, s60 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s61 :: v_dual_mov_b32 v7, s62 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s42 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s40 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s18 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s14 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s22 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s10 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s24 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v21 +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v20, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v31, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s28 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 
op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB55_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff, v9 
+; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 
16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v48, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v38, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v36, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v35, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v34, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v33, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB55_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -21225,882 +44691,901 @@ end: } define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v56i16_to_v56f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100 -; 
GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr55 -; 
GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed 
$vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; kill: killed $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded 
Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; 
implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: 
v_add_i32_e32 v38, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, 
v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v6 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, 
v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v58 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; 
GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v42 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v43 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v44 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v45 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v5, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v41 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v7, v7, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v40 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v9, v9, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v11, v11, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v13, v13, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v15, v15, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v17, v17, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v19, v19, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v21, v21, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v23, v23, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: 
v_or_b32_e32 v25, v25, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x5c, v0 -; 
GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], 
s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_or_b32_e32 v45, v47, v46 -; GCN-NEXT: v_or_b32_e32 v46, v57, v56 -; GCN-NEXT: v_or_b32_e32 v47, v59, v58 -; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v45, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v41, s[0:3], 0 
offen -; GCN-NEXT: buffer_store_dword v47, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v56i16_to_v56f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:44 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; 
SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; 
kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: ; 
implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 
+; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; 
SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, 
v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v42 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 
offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 
offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56i16_to_v56f16: ; VI: ; %bb.0: @@ -22146,7 +45631,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v56, 3, v56 @@ -22204,7 +45689,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v30, 3, v30 ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: v_add_u16_e32 v29, 3, v29 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 ; VI-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -22318,7 +45803,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v27, v56, v27, s6 @@ -22405,7 +45890,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v19, v40, v19, s4 @@ -22456,7 +45941,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] @@ -22486,7 +45971,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -22526,7 +46011,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v27, v80, v27, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v26, v71, v26, 0x5040100 @@ -22612,7 +46097,7 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v26 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v27 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v29, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v30, v1, 0x5040100 @@ -22660,609 +46145,2378 @@ end: ret <56 x half> %phi } +define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56i16_to_v56f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, 
s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v19, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 +; SI-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v57 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v35 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v61 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s29 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 +; SI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v59 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_branch .LBB57_3 +; SI-NEXT: .LBB57_2: +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; 
implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; 
implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: .LBB57_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v36 +; SI-NEXT: v_mov_b32_e32 v36, v48 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v50, v52 +; SI-NEXT: v_mov_b32_e32 v52, v54 +; SI-NEXT: v_mov_b32_e32 v54, v40 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: s_cbranch_vccnz .LBB57_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v47 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 +; SI-NEXT: s_add_i32 s22, s22, 3 
+; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s25 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 +; SI-NEXT: s_add_i32 s17, s17, 3 +; 
SI-NEXT: v_cvt_f32_f16_e32 v38, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v46 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v63 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 
v14, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_5: ; %end +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
+; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v56i16_to_v56f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: 
v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 
+; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: 
v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v56i16_to_v56f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v49, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v48, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v39, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v38, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_u16 v37, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v36, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v35, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v34, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v33, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v32, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 
+; GFX9-NEXT: v_pk_add_u16 v31, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v30, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v13, v27, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v12, v26, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v25, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v24, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v29, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v49, s29 +; GFX9-NEXT: v_mov_b32_e32 v48, s28 +; GFX9-NEXT: v_mov_b32_e32 v39, s27 +; GFX9-NEXT: v_mov_b32_e32 v38, s26 +; GFX9-NEXT: v_mov_b32_e32 v37, s25 +; GFX9-NEXT: v_mov_b32_e32 v36, s24 +; GFX9-NEXT: v_mov_b32_e32 v35, s23 +; GFX9-NEXT: v_mov_b32_e32 v34, s22 +; 
GFX9-NEXT: v_mov_b32_e32 v33, s21 +; GFX9-NEXT: v_mov_b32_e32 v32, s20 +; GFX9-NEXT: v_mov_b32_e32 v31, s19 +; GFX9-NEXT: v_mov_b32_e32 v30, s18 +; GFX9-NEXT: v_mov_b32_e32 v29, s17 +; GFX9-NEXT: v_mov_b32_e32 v28, s16 +; GFX9-NEXT: v_mov_b32_e32 v50, s43 +; GFX9-NEXT: v_mov_b32_e32 v51, s42 +; GFX9-NEXT: v_mov_b32_e32 v52, s41 +; GFX9-NEXT: v_mov_b32_e32 v53, s40 +; GFX9-NEXT: v_mov_b32_e32 v54, s15 +; GFX9-NEXT: v_mov_b32_e32 v55, s14 +; GFX9-NEXT: v_mov_b32_e32 v40, s13 +; GFX9-NEXT: v_mov_b32_e32 v41, s12 +; GFX9-NEXT: v_mov_b32_e32 v42, s11 +; GFX9-NEXT: v_mov_b32_e32 v43, s10 +; GFX9-NEXT: v_mov_b32_e32 v44, s9 +; GFX9-NEXT: v_mov_b32_e32 v45, s8 +; GFX9-NEXT: v_mov_b32_e32 v46, s7 +; GFX9-NEXT: v_mov_b32_e32 v47, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v28, v47, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v46, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v30, v45, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v31, v44, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v32, v43, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v42, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v34, v41, 16, v34 +; GFX9-NEXT: v_lshl_or_b32 v35, v40, 16, v35 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v36, v55, 16, v36 +; GFX9-NEXT: v_lshl_or_b32 v37, v54, 16, v37 +; GFX9-NEXT: v_lshl_or_b32 v38, v53, 16, v38 +; GFX9-NEXT: v_lshl_or_b32 v39, v52, 16, v39 +; GFX9-NEXT: v_lshl_or_b32 v48, v51, 16, v48 +; GFX9-NEXT: v_lshl_or_b32 v49, v50, 16, v49 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v28 
+; GFX9-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v9, v37 +; GFX9-NEXT: v_mov_b32_e32 v10, v38 +; GFX9-NEXT: v_mov_b32_e32 v11, v39 +; GFX9-NEXT: v_mov_b32_e32 v12, v48 +; GFX9-NEXT: v_mov_b32_e32 v13, v49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v56f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v11, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v12, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v13, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v10, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v12, 16, v1 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v10, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v13, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s14, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s13, s9 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, s9, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: s_mov_b32 s9, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s10, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s9, s7 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 
v22, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v33, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 
v65.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v67.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v69.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v71.l +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s4 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s12 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v34, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v35, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v38, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v23, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v26, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v27, 16, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, 
v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v50, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v39, 16, v12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v49, 16, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v52, 16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v8.h +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v51, 16, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v53, 16, v54 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v5, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 
v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_mov_b32 v7, v37 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v56f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v33, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v34, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v35, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v30, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v37, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v36, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v36 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v10, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s21 :: v_dual_mov_b32 v29, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s19 :: v_dual_mov_b32 v34, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s17 :: v_dual_mov_b32 v36, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v30, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_mov_b32 
v32, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_mov_b32 v39, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s43 :: v_dual_mov_b32 v49, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s41 :: v_dual_mov_b32 v51, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s15 :: v_dual_mov_b32 v53, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s13 :: v_dual_mov_b32 v55, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s11 :: v_dual_mov_b32 v65, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s9 :: v_dual_mov_b32 v67, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s6 :: v_dual_mov_b32 v69, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v70, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v68, 16, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v66, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v71, 16, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v69, 16, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v64, 16, 
v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v67, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v65, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v55, 16, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v54, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v51, 16, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v49, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v39, 16, v51 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v38, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v36 :: 
v_dual_mov_b32 v7, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <56 x i16> %a, splat (i16 3) + %a2 = bitcast <56 x i16> %a1 to <56 x half> + br label %end + +cmp.false: + %a3 = bitcast <56 x i16> %a to <56 x half> + br label %end + +end: + %phi = phi <56 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x half> %phi +} + define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v56f16_to_v56i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, 
off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v46, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v4 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v8 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v11 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v12 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v15 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v24 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v28 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v29 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v30 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v41 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v52 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v49 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v48 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v34 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v38 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v61 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v62, v40 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 
v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v61 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v11 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v15 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v16 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v56 -; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v60 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: 
v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v58 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v58, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: 
v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v61, v32 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v63, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v32, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v33 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v22 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_or_b32_e32 v61, v13, v3 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; GCN-NEXT: v_or_b32_e32 v41, v2, v15 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v57, v62 -; GCN-NEXT: v_mov_b32_e32 v62, v18 -; GCN-NEXT: v_or_b32_e32 v15, v4, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v62 -; GCN-NEXT: v_or_b32_e32 v16, v8, v17 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte 
Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v19 -; GCN-NEXT: v_or_b32_e32 v17, v9, v3 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v31 -; GCN-NEXT: v_or_b32_e32 v18, v10, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GCN-NEXT: v_or_b32_e32 v19, v11, v4 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 -; GCN-NEXT: v_or_b32_e32 v20, v20, v8 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v36 -; GCN-NEXT: v_or_b32_e32 v21, v21, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; GCN-NEXT: v_or_b32_e32 v14, v14, v2 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v38 -; GCN-NEXT: v_or_b32_e32 v46, v46, v24 -; GCN-NEXT: v_or_b32_e32 v44, v44, v47 -; GCN-NEXT: v_or_b32_e32 v3, v42, v45 -; GCN-NEXT: v_or_b32_e32 v11, v40, v43 -; GCN-NEXT: v_or_b32_e32 v2, v54, v37 -; GCN-NEXT: v_or_b32_e32 v10, v52, v55 -; GCN-NEXT: v_or_b32_e32 v8, v50, v53 -; GCN-NEXT: v_or_b32_e32 v25, v35, v51 -; GCN-NEXT: v_or_b32_e32 v22, v56, v49 -; GCN-NEXT: v_or_b32_e32 v29, v29, v39 -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: v_or_b32_e32 v54, v59, v28 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v38, v23, v58 -; GCN-NEXT: v_or_b32_e32 v31, v34, v60 -; GCN-NEXT: v_or_b32_e32 v5, v5, v4 -; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_or_b32_e32 v6, v6, v13 -; GCN-NEXT: v_or_b32_e32 v1, v48, v26 -; GCN-NEXT: v_alignbit_b32 v56, v1, v24, 16 -; GCN-NEXT: v_alignbit_b32 v47, v6, v47, 16 -; GCN-NEXT: v_alignbit_b32 v45, v7, v45, 16 -; GCN-NEXT: v_alignbit_b32 v43, v5, v43, 16 -; GCN-NEXT: v_alignbit_b32 
v13, v14, v37, 16 -; GCN-NEXT: v_alignbit_b32 v4, v21, v55, 16 -; GCN-NEXT: v_alignbit_b32 v26, v20, v53, 16 -; GCN-NEXT: v_alignbit_b32 v9, v19, v51, 16 -; GCN-NEXT: v_alignbit_b32 v24, v18, v49, 16 -; GCN-NEXT: v_alignbit_b32 v23, v17, v39, 16 -; GCN-NEXT: v_alignbit_b32 v30, v16, v30, 16 -; GCN-NEXT: v_alignbit_b32 v28, v15, v28, 16 -; GCN-NEXT: v_alignbit_b32 v63, v41, v58, 16 -; GCN-NEXT: v_alignbit_b32 v59, v61, v60, 16 -; GCN-NEXT: v_mov_b32_e32 v60, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v31 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v56 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v47 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v45 -; GCN-NEXT: v_or_b32_e32 v42, v35, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v35 -; GCN-NEXT: v_or_b32_e32 v37, v1, v37 -; GCN-NEXT: v_mov_b32_e32 v55, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v43 -; GCN-NEXT: v_or_b32_e32 v43, v39, v49 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 24, v0 -; GCN-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v11 -; GCN-NEXT: v_or_b32_e32 v35, v6, v50 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_or_b32_e32 v3, v3, v51 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v55 -; GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v53 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v5, v5, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v2, v2, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v14, v14, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v4, v10, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v62 -; GCN-NEXT: v_or_b32_e32 v21, v21, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v8, v8, v26 -; 
GCN-NEXT: v_add_i32_e32 v26, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v20, v20, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_or_b32_e32 v9, v25, v9 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v19, v19, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v22, v22, v24 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v18, v18, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v23, v29, v23 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v17, v17, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v63 -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v54 -; 
GCN-NEXT: v_or_b32_e32 v16, v16, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v59 -; GCN-NEXT: v_or_b32_e32 v28, v48, v28 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v60 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_or_b32_e32 v15, v15, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x6c, v0 -; GCN-NEXT: v_or_b32_e32 v38, v38, v47 -; GCN-NEXT: v_or_b32_e32 v41, v41, v56 -; GCN-NEXT: v_or_b32_e32 v54, v54, v58 -; GCN-NEXT: v_or_b32_e32 v47, v59, v57 -; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v45, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v38, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v41, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v47, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v56f16_to_v56i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; SI-NEXT: 
buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v36 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v37 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v27, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v51 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v52 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v54 +; SI-NEXT: 
s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v41 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v48, v7 +; SI-NEXT: v_mov_b32_e32 v51, v9 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v54, v13 +; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v31, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v38 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 
v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: 
v_or_b32_e32 v7, v37, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v46 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v31 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v38, v46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v54 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v7, v31, v45 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v44 +; SI-NEXT: v_or_b32_e32 v41, v37, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v43 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v52 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v51 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; 
SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v54, v39, v43 +; SI-NEXT: v_or_b32_e32 v52, v38, v42 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v51, v25, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v48, v37, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v62 +; SI-NEXT: v_or_b32_e32 v57, v21, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 +; SI-NEXT: v_or_b32_e32 v62, v25, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v61, v29, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v35 +; SI-NEXT: v_or_b32_e32 v49, v21, v27 +; SI-NEXT: 
v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 +; SI-NEXT: v_or_b32_e32 v35, v25, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v21 +; SI-NEXT: v_or_b32_e32 v33, v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v53 +; SI-NEXT: v_or_b32_e32 v36, v21, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; SI-NEXT: v_or_b32_e32 v55, v7, v21 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v50 +; SI-NEXT: v_alignbit_b32 v26, v36, v26, 16 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 +; SI-NEXT: v_or_b32_e32 v53, v25, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v58 +; SI-NEXT: v_or_b32_e32 v59, v13, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_or_b32_e32 v15, v15, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_alignbit_b32 v29, v59, v28, 16 +; SI-NEXT: v_alignbit_b32 v28, v53, v27, 16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; 
SI-NEXT: v_alignbit_b32 v27, v55, v60, 16 +; SI-NEXT: v_or_b32_e32 v22, v21, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18 +; SI-NEXT: v_or_b32_e32 v24, v24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v17 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_alignbit_b32 v44, v10, v43, 16 +; SI-NEXT: v_alignbit_b32 v43, v19, v42, 16 +; SI-NEXT: v_alignbit_b32 v25, v22, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, v15, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v16, v37, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_or_b32_e32 v6, v6, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v21 +; SI-NEXT: v_alignbit_b32 v56, v3, v47, 16 +; SI-NEXT: v_alignbit_b32 v47, v6, v46, 16 +; SI-NEXT: v_alignbit_b32 v46, v5, v45, 16 +; SI-NEXT: v_alignbit_b32 v45, v1, v31, 16 +; SI-NEXT: v_alignbit_b32 v21, v24, v38, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v56 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword 
v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v56f16_to_v56i16: ; VI: ; %bb.0: @@ -23308,7 +48562,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v56, 0x200, v56 @@ -23366,7 +48620,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 ; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 ; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 ; VI-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -23480,7 +48734,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v27, v56, v27, s6 @@ -23568,7 +48822,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v25 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v19, v40, v19, s4 @@ 
-23619,7 +48873,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] @@ -23649,7 +48903,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23689,7 +48943,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v27, v80, v27, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v26, v71, v26, 0x5040100 @@ -23775,7 +49029,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v25 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v26 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v27 -; GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v29, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v30, v1, 0x5040100 @@ -23822,3 +49076,1521 @@ end: %phi = 
phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <56 x i16> %phi } + +define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v56f16_to_v56i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s20 +; 
SI-NEXT: v_cvt_f16_f32_e32 v27, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v45 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v57 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v29, v61 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v47, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v33, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v47 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v46 +; SI-NEXT: 
v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v43 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v44 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v41 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v53 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v54 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v54, 
0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v57, v54, v29 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 
v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v6, v6, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v16, v16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v15, v15, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v34, v34, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: 
v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_or_b32_e32 v37, v37, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v36, v36, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v51, v51, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v52, v52, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v21 +; SI-NEXT: 
v_or_b32_e32 v27, v27, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v54 +; SI-NEXT: v_or_b32_e32 v28, v28, v47 +; SI-NEXT: v_or_b32_e32 v26, v26, v46 +; SI-NEXT: v_or_b32_e32 v23, v23, v45 +; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_or_b32_e32 v50, v50, v43 +; SI-NEXT: v_or_b32_e32 v48, v48, v42 +; SI-NEXT: v_or_b32_e32 v38, v38, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v40 +; SI-NEXT: v_or_b32_e32 v18, v18, v55 +; SI-NEXT: v_or_b32_e32 v17, v17, v59 +; SI-NEXT: v_or_b32_e32 v12, v12, v53 +; SI-NEXT: v_or_b32_e32 v10, v10, v30 +; SI-NEXT: v_or_b32_e32 v8, v8, v60 +; SI-NEXT: v_alignbit_b32 v56, v25, v47, 16 +; SI-NEXT: v_alignbit_b32 v47, v27, v46, 16 +; SI-NEXT: v_alignbit_b32 v46, v20, v45, 16 +; SI-NEXT: v_alignbit_b32 v45, v52, v33, 16 +; SI-NEXT: v_alignbit_b32 v44, v51, v43, 16 +; SI-NEXT: v_alignbit_b32 v43, v36, v42, 16 +; SI-NEXT: v_alignbit_b32 v42, v37, v58, 16 +; SI-NEXT: v_alignbit_b32 v41, v34, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, v15, v55, 16 +; SI-NEXT: v_alignbit_b32 v55, v16, v59, 16 +; SI-NEXT: v_alignbit_b32 v54, v13, v53, 16 +; SI-NEXT: v_alignbit_b32 v53, v6, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v7, v60, 16 +; SI-NEXT: v_alignbit_b32 v29, v4, v29, 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v28, v28, v33 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 8, 
v0 +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v24, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v46 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: 
v_add_i32_e32 v20, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_or_b32_e32 v3, v3, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v3, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 +; 
SI-NEXT: v_or_b32_e32 v3, v3, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v54 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, 
v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v56f16_to_v56i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: 
v_mov_b32_e32 v50, 0x200 +; VI-NEXT: v_add_f16_e32 v28, s16, v50 +; VI-NEXT: v_add_f16_e32 v47, s43, v50 +; VI-NEXT: v_add_f16_e32 v29, s17, v50 +; VI-NEXT: v_add_f16_e32 v46, s42, v50 +; VI-NEXT: v_add_f16_e32 v30, s18, v50 +; VI-NEXT: v_add_f16_e32 v45, s41, v50 +; VI-NEXT: v_add_f16_e32 v31, s19, v50 +; VI-NEXT: v_add_f16_e32 v44, s40, v50 +; VI-NEXT: v_add_f16_e32 v32, s20, v50 +; VI-NEXT: v_add_f16_e32 v43, s15, v50 +; VI-NEXT: v_add_f16_e32 v33, s21, v50 +; VI-NEXT: v_add_f16_e32 v42, s14, v50 +; VI-NEXT: v_add_f16_e32 v34, s22, v50 +; VI-NEXT: v_add_f16_e32 v41, s13, v50 +; VI-NEXT: v_add_f16_e32 v35, s23, v50 +; VI-NEXT: v_add_f16_e32 v40, s12, v50 +; VI-NEXT: v_add_f16_e32 v36, s24, v50 +; VI-NEXT: v_add_f16_e32 v55, s11, v50 +; VI-NEXT: v_add_f16_e32 v37, s25, v50 +; VI-NEXT: v_add_f16_e32 v54, s10, v50 +; VI-NEXT: v_add_f16_e32 v38, s26, v50 +; VI-NEXT: v_add_f16_e32 v53, s9, v50 +; VI-NEXT: v_add_f16_e32 v39, s27, v50 +; VI-NEXT: v_add_f16_e32 v52, s8, v50 +; VI-NEXT: v_add_f16_e32 v48, s28, v50 +; VI-NEXT: v_add_f16_e32 v51, s7, v50 +; VI-NEXT: v_add_f16_e32 v49, s29, v50 +; VI-NEXT: v_add_f16_e32 v50, s6, v50 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_add_f16_e32 v10, 0x200, 
v10 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v50, s6 +; VI-NEXT: v_mov_b32_e32 v49, s29 +; VI-NEXT: v_mov_b32_e32 v51, s7 +; VI-NEXT: v_mov_b32_e32 v48, s28 +; VI-NEXT: v_mov_b32_e32 v52, s8 +; VI-NEXT: v_mov_b32_e32 v39, s27 +; VI-NEXT: v_mov_b32_e32 v53, s9 +; VI-NEXT: v_mov_b32_e32 v38, s26 +; VI-NEXT: v_mov_b32_e32 v54, s10 +; VI-NEXT: v_mov_b32_e32 v37, s25 +; VI-NEXT: v_mov_b32_e32 v55, s11 +; VI-NEXT: v_mov_b32_e32 v36, s24 +; VI-NEXT: v_mov_b32_e32 v40, s12 +; VI-NEXT: v_mov_b32_e32 v35, s23 +; VI-NEXT: v_mov_b32_e32 v41, s13 +; VI-NEXT: v_mov_b32_e32 v34, s22 +; VI-NEXT: v_mov_b32_e32 v42, s14 +; VI-NEXT: v_mov_b32_e32 v33, s21 +; VI-NEXT: v_mov_b32_e32 v43, s15 +; VI-NEXT: v_mov_b32_e32 v32, s20 +; VI-NEXT: v_mov_b32_e32 v44, s40 +; VI-NEXT: v_mov_b32_e32 v31, s19 +; VI-NEXT: v_mov_b32_e32 v45, s41 +; VI-NEXT: v_mov_b32_e32 v30, s18 +; VI-NEXT: v_mov_b32_e32 v46, s42 +; VI-NEXT: v_mov_b32_e32 v29, s17 +; VI-NEXT: v_mov_b32_e32 v47, s43 +; VI-NEXT: v_mov_b32_e32 v28, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v28, v28, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, 
v30, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v32, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v33, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v34, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v35, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_sdwa v36, v36, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v37, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v38, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v39, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v49, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v28 +; VI-NEXT: v_mov_b32_e32 v1, v29 +; VI-NEXT: v_mov_b32_e32 v2, v30 +; VI-NEXT: v_mov_b32_e32 v3, v31 +; VI-NEXT: v_mov_b32_e32 v4, v32 +; VI-NEXT: v_mov_b32_e32 v5, v33 +; VI-NEXT: v_mov_b32_e32 v6, v34 +; VI-NEXT: v_mov_b32_e32 v7, v35 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v9, v37 +; VI-NEXT: v_mov_b32_e32 v10, v38 +; VI-NEXT: v_mov_b32_e32 v11, v39 +; VI-NEXT: v_mov_b32_e32 v12, v48 +; VI-NEXT: v_mov_b32_e32 v13, v49 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v56f16_to_v56i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v27, 16, v13 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v12, v26, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v25, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v24, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v23, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v22, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v21, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, 
v20, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v18, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v14, 0x200 +; GFX9-NEXT: v_pk_add_f16 v49, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v48, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v39, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v38, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v37, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v36, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v35, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v34, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v33, s4, v14 op_sel_hi:[1,0] +; 
GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v32, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 v31, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v30, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v29, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v28, s4, v14 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v49, s29 +; GFX9-NEXT: v_mov_b32_e32 v48, s28 +; GFX9-NEXT: v_mov_b32_e32 v39, s27 +; GFX9-NEXT: v_mov_b32_e32 
v38, s26 +; GFX9-NEXT: v_mov_b32_e32 v37, s25 +; GFX9-NEXT: v_mov_b32_e32 v36, s24 +; GFX9-NEXT: v_mov_b32_e32 v35, s23 +; GFX9-NEXT: v_mov_b32_e32 v34, s22 +; GFX9-NEXT: v_mov_b32_e32 v33, s21 +; GFX9-NEXT: v_mov_b32_e32 v32, s20 +; GFX9-NEXT: v_mov_b32_e32 v31, s19 +; GFX9-NEXT: v_mov_b32_e32 v30, s18 +; GFX9-NEXT: v_mov_b32_e32 v29, s17 +; GFX9-NEXT: v_mov_b32_e32 v28, s16 +; GFX9-NEXT: v_mov_b32_e32 v50, s43 +; GFX9-NEXT: v_mov_b32_e32 v51, s42 +; GFX9-NEXT: v_mov_b32_e32 v52, s41 +; GFX9-NEXT: v_mov_b32_e32 v53, s40 +; GFX9-NEXT: v_mov_b32_e32 v54, s15 +; GFX9-NEXT: v_mov_b32_e32 v55, s14 +; GFX9-NEXT: v_mov_b32_e32 v40, s13 +; GFX9-NEXT: v_mov_b32_e32 v41, s12 +; GFX9-NEXT: v_mov_b32_e32 v42, s11 +; GFX9-NEXT: v_mov_b32_e32 v43, s10 +; GFX9-NEXT: v_mov_b32_e32 v44, s9 +; GFX9-NEXT: v_mov_b32_e32 v45, s8 +; GFX9-NEXT: v_mov_b32_e32 v46, s7 +; GFX9-NEXT: v_mov_b32_e32 v47, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v28, v47, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v46, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v30, v45, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v31, v44, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v32, v43, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v42, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v34, v41, 16, v34 +; GFX9-NEXT: v_lshl_or_b32 v35, v40, 16, v35 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v15, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v36, v55, 16, v36 +; GFX9-NEXT: v_lshl_or_b32 v37, v54, 16, v37 +; GFX9-NEXT: v_lshl_or_b32 v38, v53, 16, v38 +; GFX9-NEXT: v_lshl_or_b32 v39, v52, 16, v39 +; GFX9-NEXT: v_lshl_or_b32 v48, v51, 16, v48 +; GFX9-NEXT: v_lshl_or_b32 v49, v50, 16, v49 +; 
GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v28 +; GFX9-NEXT: v_mov_b32_e32 v1, v29 +; GFX9-NEXT: v_mov_b32_e32 v2, v30 +; GFX9-NEXT: v_mov_b32_e32 v3, v31 +; GFX9-NEXT: v_mov_b32_e32 v4, v32 +; GFX9-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-NEXT: v_mov_b32_e32 v6, v34 +; GFX9-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v9, v37 +; GFX9-NEXT: v_mov_b32_e32 v10, v38 +; GFX9-NEXT: v_mov_b32_e32 v11, v39 +; GFX9-NEXT: v_mov_b32_e32 v12, v48 +; GFX9-NEXT: v_mov_b32_e32 v13, v49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v56i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz 
.LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v11, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v12, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v13, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v10, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s22, s13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v12, 16, v1 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v10, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v13, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s14, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s13, s9 +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s10, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s9, s7 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s27 
op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v33, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v65.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v67.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v69.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v71.l +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s10 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s12 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v34, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v35, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v29, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v38, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v36, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v23, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v24, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v26, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v27, 16, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v37, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v50, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v39, 16, v12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v49, 16, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v52, 16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v8.h +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v51, 16, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v53, 16, 
v54 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v48, 16, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v30 :: v_dual_mov_b32 v5, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_mov_b32 v7, v37 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v56i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: 
s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v33, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v34, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v35, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v30, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: 
v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v36, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s29 :: v_dual_mov_b32 v14, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s27 :: v_dual_mov_b32 v16, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s25 :: v_dual_mov_b32 v10, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, s23 :: 
v_dual_mov_b32 v12, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s21 :: v_dual_mov_b32 v29, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v33, s19 :: v_dual_mov_b32 v34, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, s17 :: v_dual_mov_b32 v36, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v30, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_mov_b32 v32, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, s45 :: v_dual_mov_b32 v39, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, s43 :: v_dual_mov_b32 v49, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s41 :: v_dual_mov_b32 v51, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s15 :: v_dual_mov_b32 v53, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s13 :: v_dual_mov_b32 v55, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s11 :: v_dual_mov_b32 v65, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s9 :: v_dual_mov_b32 v67, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s6 :: v_dual_mov_b32 v69, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s4 :: v_dual_mov_b32 v71, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v70, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v68, 16, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v66, 16, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: 
v_lshl_or_b32 v24, v24, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v71, 16, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v69, 16, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v64, 16, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v67, 16, v36 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v65, 16, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v55, 16, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v54, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v52, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v51, 16, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v49, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v48, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v39, 16, v51 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v38, 16, v52 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v34 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_mov_b32 v7, v37 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v28 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <56 x half> %a, splat (half 0xH0200) + %a2 = bitcast <56 x half> %a1 to <56 x i16> + br label %end + +cmp.false: + %a3 = bitcast <56 x half> %a to <56 x i16> + br label %end + +end: + %phi = phi <56 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <56 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 2837f2b2bd7fa..b480e89dfcc30 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -1,54 +1,54 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <30 x float> 
@bitcast_v30i32_to_v30f32(<30 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v30i32_to_v30f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v30i32_to_v30f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB0_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v30i32_to_v30f32: ; VI: ; %bb.0: @@ -196,49 +196,354 @@ end: ret <30 x float> %phi } +define inreg <30 x float> @bitcast_v30i32_to_v30f32_scalar(<30 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30i32_to_v30f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 
v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 
v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v30i32_to_v30f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: 
v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v30i32_to_v30f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; 
GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: 
v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v30i32_to_v30f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_3: +; GFX11-NEXT: .LBB1_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 
3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <30 x i32> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v30f32_to_v30i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, 
v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v30f32_to_v30i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: 
v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB2_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v30f32_to_v30i32: ; VI: ; %bb.0: @@ -247,7 +552,7 @@ define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: s_cbranch_execz .LBB2_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 @@ -279,7 +584,7 @@ define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: .LBB2_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -290,7 +595,7 @@ define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 
@@ -322,7 +627,7 @@ define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: .LBB2_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -334,7 +639,7 @@ define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 ; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 @@ -351,7 +656,7 @@ define <30 x i32> @bitcast_v30f32_to_v30i32(<30 x float> %a, i32 %b) { ; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: .LBB2_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -371,49 +676,339 @@ end: ret <30 x i32> %phi } +define inreg <30 x i32> @bitcast_v30f32_to_v30i32_scalar(<30 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30f32_to_v30i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: 
v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB3_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; 
SI-NEXT: .LBB3_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_4: +; SI-NEXT: s_branch .LBB3_2 +; +; VI-LABEL: bitcast_v30f32_to_v30i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB3_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; 
VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB3_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_4: +; VI-NEXT: s_branch .LBB3_2 +; +; GFX9-LABEL: bitcast_v30f32_to_v30i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; 
GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB3_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: s_branch .LBB3_2 +; +; GFX11-LABEL: bitcast_v30f32_to_v30i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: 
v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB3_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: 
v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <30 x i32> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <30 x i32> + br label %end + +end: + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi +} + define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v30i32_to_v15i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 
v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v30i32_to_v15i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, 
vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v30i32_to_v15i64: ; VI: ; %bb.0: @@ -422,7 +1017,7 @@ define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 @@ -454,7 +1049,7 @@ define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: .LBB4_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -465,7 +1060,7 @@ define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 ; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 @@ -497,7 +1092,7 @@ define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: .LBB4_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -509,7 +1104,7 @@ define <15 x i64> 
@bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 ; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 @@ -541,7 +1136,7 @@ define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { ; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: .LBB4_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -561,63 +1156,368 @@ end: ret <15 x i64> %phi } -define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v15i64_to_v30i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; 
GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <15 x i64> @bitcast_v30i32_to_v15i64_scalar(<30 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30i32_to_v15i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; 
SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: s_branch .LBB5_2 ; -; VI-LABEL: bitcast_v15i64_to_v30i32: +; VI-LABEL: bitcast_v30i32_to_v15i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: ; 
%bb.1: ; %cmp.true +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v15, 
vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v30i32_to_v15i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 
v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-LABEL: bitcast_v30i32_to_v15i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: 
v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB5_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; 
GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <30 x i32> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v15i64_to_v30i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, 
vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15i64_to_v30i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 ; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 @@ -644,7 +1544,7 @@ define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -655,7 +1555,7 @@ define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; 
GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc @@ -687,7 +1587,7 @@ define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -699,7 +1599,7 @@ define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -739,7 +1639,7 @@ define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: .LBB6_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -759,282 +1659,450 @@ end: ret <30 x i32> %phi } -define <15 x double> @bitcast_v30i32_to_v15f64(<30 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v30i32_to_v15f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: 
v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <30 x i32> @bitcast_v15i64_to_v30i32_scalar(<15 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15i64_to_v30i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: 
v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, 
vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: s_branch .LBB7_2 ; -; VI-LABEL: bitcast_v30i32_to_v15f64: +; VI-LABEL: bitcast_v15i64_to_v30i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB4_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: 
.LBB7_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: .LBB4_2: ; %end -; VI-NEXT: 
s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: s_branch .LBB7_2 ; -; GFX9-LABEL: bitcast_v30i32_to_v15f64: +; GFX9-LABEL: bitcast_v15i64_to_v30i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB4_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 -; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 -; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 -; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 -; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 -; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: .LBB4_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 
v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: 
v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: s_branch .LBB7_2 ; -; GFX11-LABEL: bitcast_v30i32_to_v15f64: +; GFX11-LABEL: bitcast_v15i64_to_v30i32_scalar: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB4_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, 
v15 -; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-NEXT: .LBB4_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 
vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB7_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_3: +; GFX11-NEXT: .LBB7_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 
0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <30 x i32> %a, splat (i32 3) - %a2 = bitcast <30 x i32> %a1 to <15 x double> + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <30 x i32> br label %end cmp.false: - %a3 = bitcast <30 x i32> %a to <15 x double> + %a3 = bitcast <15 x i64> %a to <30 x i32> br label %end end: - %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x double> %phi + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi } -define <30 x i32> @bitcast_v15f64_to_v30i32(<15 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v15f64_to_v30i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 
v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define <15 x double> @bitcast_v30i32_to_v15f64(<30 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v30i32_to_v15f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; 
SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v15f64_to_v30i32: +; VI-LABEL: bitcast_v30i32_to_v15f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB8_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; 
VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB8_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v15f64_to_v30i32: +; GFX9-LABEL: bitcast_v30i32_to_v15f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 
3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB8_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v15f64_to_v30i32: +; GFX11-LABEL: bitcast_v30i32_to_v15f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, exec_lo @@ -1042,456 +2110,173 @@ define <30 x i32> @bitcast_v15f64_to_v30i32(<15 x double> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: 
v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB8_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <15 x double> %a1 to <30 x i32> + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x 
i32> %a1 to <15 x double> br label %end cmp.false: - %a3 = bitcast <15 x double> %a to <30 x i32> + %a3 = bitcast <30 x i32> %a to <15 x double> br label %end end: - %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <30 x i32> %phi + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi } -define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v30i32_to_v60i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr58 -; 
GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v48, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v53, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v44, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v46, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 
v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB6_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; 
GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v48, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v53, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v44, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v46, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB6_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v60 -; GCN-NEXT: v_or_b32_e32 v2, v2, v56 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 
v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v3, v3, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_or_b32_e32 v4, v4, v59 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v5, v5, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v6, v6, v58 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_or_b32_e32 v8, v8, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v9, v9, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v11, v11, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v12, v12, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v13, v13, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v14, v14, v43 -; 
GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v15, v15, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v16, v16, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v17, v17, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v18, v18, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v19, v19, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v20, v20, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v21, v21, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v22, v22, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v25, v25, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; 
GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v28, v28, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v29, v29, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v30, v30, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <15 x double> @bitcast_v30i32_to_v15f64_scalar(<30 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30i32_to_v15f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; 
SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: s_branch .LBB9_2 ; -; VI-LABEL: bitcast_v30i32_to_v60i16: +; VI-LABEL: bitcast_v30i32_to_v15f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; 
implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: 
v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB6_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB6_4 -; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 ; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 @@ -1505,7 +2290,7 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: 
v_add_u32_e32 v18, vcc, 3, v18 ; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 ; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 ; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 @@ -1522,197 +2307,51 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB6_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: 
v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword 
v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 ; -; GFX9-LABEL: bitcast_v30i32_to_v60i16: +; GFX9-LABEL: bitcast_v30i32_to_v15f64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 
-; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 
16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB6_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB6_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 ; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 ; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 @@ -1726,7 +2365,7 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 ; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 ; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 ; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 ; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 @@ -1743,989 +2382,842 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB6_4: ; %end +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v30i32_to_v15f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: 
v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB9_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_3: +; GFX11-NEXT: .LBB9_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: 
v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <30 x i32> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi +} + +define <30 x i32> @bitcast_v15f64_to_v30i32(<15 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v15f64_to_v30i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15f64_to_v30i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; 
VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15f64_to_v30i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; 
GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 
-; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60i16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v6, 3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: .LBB6_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: bitcast_v15f64_to_v30i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <30 x i32> + br label %end + +cmp.false: + %a3 = bitcast <15 x double> %a to <30 x i32> + br label %end + +end: + %phi = phi <30 
x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi +} + +define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15f64_to_v30i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v31, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 
+; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB11_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: v_mov_b32_e32 v17, v31 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: s_branch .LBB11_2 ; -; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60i16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: 
s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB6_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, 
v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: bitcast_v15f64_to_v30i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v31, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; 
VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB11_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB11_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: v_mov_b32_e32 v17, v31 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_4: +; VI-NEXT: s_branch .LBB11_2 +; +; GFX9-LABEL: bitcast_v15f64_to_v30i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; 
GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB11_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: v_mov_b32_e32 v17, v31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: s_branch .LBB11_2 +; +; GFX11-LABEL: bitcast_v15f64_to_v30i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; 
GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 
%cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <30 x i32> %a, splat (i32 3) - %a2 = bitcast <30 x i32> %a1 to <60 x i16> + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <30 x i32> br label %end cmp.false: - %a3 = bitcast <30 x i32> %a to <60 x i16> + %a3 = bitcast <15 x double> %a to <30 x i32> br label %end end: - %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <60 x i16> %phi + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi } -define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v60i16_to_v30i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], 
s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 
; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; 
GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v41 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v63 -; GCN-NEXT: 
v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v62 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v61 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v60 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v59 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v57 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v47 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v45 -; GCN-NEXT: 
v_and_b32_e32 v21, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; 
GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; 
implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: .LBB7_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload 
-; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, 
v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v41, v4 -; GCN-NEXT: v_or_b32_e32 v5, v63, v5 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_or_b32_e32 v7, v61, v7 -; GCN-NEXT: v_or_b32_e32 v8, v60, v8 -; GCN-NEXT: v_or_b32_e32 v9, v59, v9 -; GCN-NEXT: v_or_b32_e32 v10, v58, v10 -; GCN-NEXT: v_or_b32_e32 v11, v57, v11 -; GCN-NEXT: v_or_b32_e32 v12, v47, v12 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v30, v13 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v30, v14 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v30, v15 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, 
v17 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v30, v18 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v30, v19 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v30, v20 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v30, v21 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v30, v22 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v30, v23 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v30, v24 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v30, v25 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v30, v26 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v30, v27 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v30, v29 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, 
v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: .LBB7_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 
; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v30i32_to_v60i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v48, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v51, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v53, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 
v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v44, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB12_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v22, 
vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v48, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v51, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v53, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v44, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB12_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 
0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v56 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 
; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v60i16_to_v30i32: +; VI-LABEL: bitcast_v30i32_to_v60i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v33, 
v28 -; VI-NEXT: v_mov_b32_e32 v34, v27 -; VI-NEXT: v_mov_b32_e32 v35, v26 -; VI-NEXT: v_mov_b32_e32 v36, v25 -; VI-NEXT: v_mov_b32_e32 v37, v24 -; VI-NEXT: v_mov_b32_e32 v38, v23 -; VI-NEXT: v_mov_b32_e32 v39, v22 -; VI-NEXT: v_mov_b32_e32 v48, v21 -; VI-NEXT: v_mov_b32_e32 v49, v20 -; VI-NEXT: v_mov_b32_e32 v50, v19 -; VI-NEXT: v_mov_b32_e32 v51, v18 -; VI-NEXT: v_mov_b32_e32 v52, v17 -; VI-NEXT: v_mov_b32_e32 v53, v16 -; VI-NEXT: v_mov_b32_e32 v54, v15 -; VI-NEXT: v_mov_b32_e32 v55, v14 -; VI-NEXT: v_mov_b32_e32 v40, v13 -; VI-NEXT: v_mov_b32_e32 v41, v12 -; VI-NEXT: v_mov_b32_e32 v42, v11 -; VI-NEXT: v_mov_b32_e32 v43, v10 -; VI-NEXT: v_mov_b32_e32 v44, v9 -; VI-NEXT: v_mov_b32_e32 v45, v8 -; VI-NEXT: v_mov_b32_e32 v46, v7 -; VI-NEXT: v_mov_b32_e32 v47, v6 -; VI-NEXT: v_mov_b32_e32 v56, v5 -; VI-NEXT: v_mov_b32_e32 v57, v4 -; VI-NEXT: v_mov_b32_e32 v58, v3 -; VI-NEXT: v_mov_b32_e32 v59, v2 -; VI-NEXT: v_mov_b32_e32 v60, v1 -; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v29, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa 
v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -2754,268 +3246,199 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, 
v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: s_cbranch_execz .LBB12_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v29, 3 -; VI-NEXT: v_add_u16_e32 v0, 3, v61 -; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v60 -; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_add_u16_e32 v2, 3, v59 -; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v3, 3, v58 -; VI-NEXT: v_add_u16_sdwa v4, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u16_e32 v4, 3, v57 -; VI-NEXT: v_add_u16_sdwa v5, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_add_u16_e32 v5, 3, v56 -; VI-NEXT: v_add_u16_sdwa v6, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_add_u16_e32 v6, 3, v47 -; VI-NEXT: v_add_u16_sdwa v7, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u16_e32 v7, 3, v46 -; VI-NEXT: v_add_u16_sdwa v8, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v45 -; VI-NEXT: v_add_u16_sdwa v9, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v44 -; VI-NEXT: v_add_u16_sdwa v10, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v43 -; VI-NEXT: v_add_u16_sdwa v11, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v42 -; VI-NEXT: v_add_u16_sdwa v12, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v41 -; VI-NEXT: v_add_u16_sdwa v13, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v40 -; VI-NEXT: v_add_u16_sdwa v14, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v55 -; VI-NEXT: v_add_u16_sdwa v15, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: v_add_u16_e32 v15, 3, v54 -; VI-NEXT: v_add_u16_sdwa v16, v54, v29 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: v_add_u16_e32 v16, 3, v53 -; VI-NEXT: v_add_u16_sdwa v17, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: v_add_u16_e32 v17, 3, v52 -; VI-NEXT: v_add_u16_sdwa v18, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: v_add_u16_e32 v18, 3, v51 -; VI-NEXT: v_add_u16_sdwa v19, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: v_add_u16_e32 v19, 3, v50 -; VI-NEXT: v_add_u16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: v_add_u16_e32 v20, 3, v49 -; VI-NEXT: v_add_u16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: v_add_u16_e32 v21, 3, v48 -; VI-NEXT: v_add_u16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: v_add_u16_e32 v22, 3, v39 -; VI-NEXT: v_add_u16_sdwa v23, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: v_add_u16_e32 v23, 3, v38 -; VI-NEXT: v_add_u16_sdwa v24, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: v_add_u16_e32 v24, 3, v37 -; VI-NEXT: v_add_u16_sdwa v25, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: v_add_u16_e32 v25, 3, v36 -; VI-NEXT: v_add_u16_sdwa v26, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: v_add_u16_e32 v26, 3, v35 -; 
VI-NEXT: v_add_u16_sdwa v27, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: v_add_u16_e32 v27, 3, v34 -; VI-NEXT: v_add_u16_sdwa v28, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v27, v27, v28 -; VI-NEXT: v_add_u16_e32 v28, 3, v33 -; VI-NEXT: v_add_u16_sdwa v30, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v28, v28, v30 -; VI-NEXT: v_add_u16_e32 v30, 3, v32 -; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB7_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: 
s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v60i16_to_v30i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v48, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded 
Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: v_mov_b32_e32 v43, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_mov_b32_e32 v44, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v8 -; GFX9-NEXT: v_mov_b32_e32 v46, v7 -; GFX9-NEXT: v_mov_b32_e32 v47, v6 -; GFX9-NEXT: v_mov_b32_e32 v56, v5 -; GFX9-NEXT: v_mov_b32_e32 v57, v4 -; GFX9-NEXT: v_mov_b32_e32 v58, v3 -; GFX9-NEXT: v_mov_b32_e32 v59, v2 -; GFX9-NEXT: v_mov_b32_e32 v60, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, 
v60 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 
16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: 
v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v30i32_to_v60i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; 
GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -3037,233 +3460,162 @@ define <30 x i32> 
@bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded 
Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; 
GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB7_2: ; %Flow +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: 
.LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB7_4 +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 
-; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 
op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB7_4: ; %end +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 
; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: 
v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32: +; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60i16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -3271,2275 +3623,2044 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB7_2: ; %end +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB12_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30i32: +; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 
v22, v38, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 
op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB7_2: ; %end +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; 
GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <60 x i16> %a, splat (i16 3) - %a2 = bitcast <60 x i16> %a1 to <30 x i32> + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <60 x i16> br label %end cmp.false: - %a3 = bitcast <60 x i16> %a to <30 x i32> + %a3 = bitcast <30 x i32> %a to <60 x i16> br label %end end: - %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <30 x i32> %phi + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi } -define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v30i32_to_v60f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword 
v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed 
$vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; 
GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; GCN-NEXT: 
v_lshrrev_b32_e32 v56, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: 
buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v50 -; GCN-NEXT: v_mov_b32_e32 v50, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v63 -; GCN-NEXT: v_mov_b32_e32 v63, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; 
GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: .LBB8_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: 
v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword 
v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; 
GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: .LBB8_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v6, v46 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v42 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v41 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v54 -; GCN-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v52 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v51 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v49 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v37 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v35 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], 
s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v35, v30 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v33, v36, v33 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v37, v31 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v62 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v48, v34 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v49, v32 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v63 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte 
Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v47, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v56, v57, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v58, v59, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v15, 
s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30i32_to_v60i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_readfirstlane_b32 s45, v1 +; SI-NEXT: v_readfirstlane_b32 s44, v2 +; SI-NEXT: v_readfirstlane_b32 s43, v3 +; SI-NEXT: v_readfirstlane_b32 s42, v4 +; SI-NEXT: v_readfirstlane_b32 s41, v5 +; SI-NEXT: v_readfirstlane_b32 s40, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 
v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s29, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s27, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s25, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s23, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s21, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s19, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 16 +; SI-NEXT: s_lshr_b32 s46, s6, 16 +; SI-NEXT: s_lshr_b32 s47, s8, 16 +; SI-NEXT: s_lshr_b32 s56, s10, 16 +; SI-NEXT: s_lshr_b32 s57, s12, 16 +; SI-NEXT: s_lshr_b32 s58, s14, 16 +; SI-NEXT: s_lshr_b32 s59, s40, 16 +; SI-NEXT: s_lshr_b32 s60, s42, 16 +; SI-NEXT: s_lshr_b32 s61, s44, 16 +; SI-NEXT: s_lshr_b32 s62, s29, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; 
SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s29, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s27, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s25, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s23, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s21, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s19, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 16 +; SI-NEXT: s_lshr_b32 s46, s6, 16 +; SI-NEXT: s_lshr_b32 s47, s8, 16 +; SI-NEXT: s_lshr_b32 s56, s10, 16 +; SI-NEXT: s_lshr_b32 s57, s12, 16 +; SI-NEXT: s_lshr_b32 s58, s14, 16 +; SI-NEXT: s_lshr_b32 s59, s40, 16 +; SI-NEXT: s_lshr_b32 s60, s42, 16 +; SI-NEXT: 
s_lshr_b32 s61, s44, 16 +; SI-NEXT: s_lshr_b32 s62, s29, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s75, 16 +; SI-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; 
SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s44, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen 
+; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff 
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr72 +; 
SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: s_branch .LBB13_2 ; -; VI-LABEL: bitcast_v30i32_to_v60f16: +; VI-LABEL: bitcast_v30i32_to_v60i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; 
implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v30, s30, 0 +; VI-NEXT: v_writelane_b32 v30, s31, 1 +; VI-NEXT: v_writelane_b32 v30, s34, 2 +; VI-NEXT: v_writelane_b32 v30, s35, 3 +; VI-NEXT: v_writelane_b32 v30, s36, 4 +; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_writelane_b32 v30, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s45, v0 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s43, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s41, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s15, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s13, v8 +; VI-NEXT: 
v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s11, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s9, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s6, v14 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v15 +; VI-NEXT: v_writelane_b32 v30, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB8_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB8_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 -; VI-NEXT: v_add_u32_e32 
v26, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: 
v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB8_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 
16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_lshr_b32 
s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 
3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s39, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s38, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s37, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s36, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s35, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s34, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s31, 16 +; VI-NEXT: s_or_b32 s20, 
s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s30, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s91, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s90, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s89, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s88, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s79, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s78, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s29, s77, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s44, s76, 16 +; VI-NEXT: s_or_b32 s29, s29, s44 +; VI-NEXT: s_and_b32 s43, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s44, s75, 16 +; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s44, s74, 16 +; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s44, s73, 16 +; VI-NEXT: s_or_b32 s41, s41, s44 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s44, s72, 16 +; VI-NEXT: s_or_b32 s40, s40, s44 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s44, s63, 16 +; VI-NEXT: s_or_b32 s15, s15, s44 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s44, s62, 16 +; VI-NEXT: s_or_b32 s14, s14, s44 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s44, s61, 16 +; VI-NEXT: s_or_b32 s13, s13, s44 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s44, s60, 16 +; VI-NEXT: s_or_b32 s12, s12, s44 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s44, s59, 16 +; VI-NEXT: s_or_b32 s11, s11, s44 +; 
VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s44, s58, 16 +; VI-NEXT: s_or_b32 s10, s10, s44 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s44, s57, 16 +; VI-NEXT: s_or_b32 s9, s9, s44 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s44, s56, 16 +; VI-NEXT: s_or_b32 s8, s8, s44 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s44, s47, 16 +; VI-NEXT: s_or_b32 s6, s6, s44 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s43 +; VI-NEXT: v_mov_b32_e32 v17, s42 +; VI-NEXT: v_mov_b32_e32 v18, s41 +; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v20, s15 +; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: v_mov_b32_e32 v22, s13 +; VI-NEXT: v_mov_b32_e32 v23, s12 +; VI-NEXT: v_mov_b32_e32 v24, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v26, s9 +; VI-NEXT: v_mov_b32_e32 v27, s8 +; VI-NEXT: v_mov_b32_e32 v28, s6 +; VI-NEXT: v_mov_b32_e32 v29, s7 +; VI-NEXT: v_readlane_b32 s39, v30, 7 +; VI-NEXT: v_readlane_b32 s38, v30, 6 +; VI-NEXT: v_readlane_b32 s37, v30, 5 +; VI-NEXT: v_readlane_b32 s36, v30, 4 +; VI-NEXT: v_readlane_b32 s35, v30, 3 +; VI-NEXT: v_readlane_b32 s34, v30, 2 +; VI-NEXT: v_readlane_b32 s31, v30, 1 +; VI-NEXT: v_readlane_b32 s30, v30, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; 
VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB13_2 ; -; GFX9-LABEL: bitcast_v30i32_to_v60f16: +; GFX9-LABEL: bitcast_v30i32_to_v60i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, 
s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v30, s30, 0 +; GFX9-NEXT: v_writelane_b32 v30, s31, 1 +; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_writelane_b32 v30, s34, 2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: v_readfirstlane_b32 s44, v14 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s45, v15 +; GFX9-NEXT: v_writelane_b32 v30, s35, 3 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB8_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB8_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 -; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 -; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 -; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 -; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 -; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 -; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 -; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 -; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB8_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, 
off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; 
GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s45, s45, 3 +; GFX9-NEXT: s_add_i32 s44, s44, 3 +; GFX9-NEXT: s_add_i32 s43, s43, 3 +; GFX9-NEXT: s_add_i32 s42, s42, 3 +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: 
s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s34 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s31 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s30 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 +; 
GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: v_mov_b32_e32 v28, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_readlane_b32 s35, v30, 3 +; GFX9-NEXT: v_readlane_b32 s34, v30, 2 +; GFX9-NEXT: v_readlane_b32 s31, v30, 1 +; GFX9-NEXT: v_readlane_b32 s30, v30, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; 
GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB13_2 ; -; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60f16: +; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-TRUE16-NEXT: s_mov_b32 s94, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4 +; 
GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, 3 +; 
GFX11-TRUE16-NEXT: s_add_i32 s14, s14, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 
s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: .LBB13_3: ; %end ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; 
GFX11-TRUE16-NEXT: .LBB8_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s15, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s14, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v19, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s13 :: v_dual_mov_b32 v21, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s11 :: v_dual_mov_b32 v23, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s9 :: v_dual_mov_b32 v25, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v27, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB13_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB13_2 ; -; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60f16: +; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-FAKE16-NEXT: s_mov_b32 s94, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX11-FAKE16-NEXT: 
v_add_nc_u32_e32 v7, 3, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB8_4: ; %end -; 
GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 
+; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-FAKE16-NEXT: .LBB13_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; 
GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: .LBB13_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s15, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB13_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: s_branch .LBB13_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: %a1 = add <30 x i32> %a, splat (i32 3) - %a2 = bitcast <30 x i32> %a1 to <60 x half> + %a2 = bitcast <30 x i32> %a1 to <60 x i16> br label %end cmp.false: - %a3 = bitcast <30 x i32> %a to <60 x half> + %a3 = bitcast <30 x i32> %a to <60 x i16> br label %end end: - %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <60 x half> %phi + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi } -define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v60f16_to_v30i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:48 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v46 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v47 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v44 -; 
GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v45 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v60 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; GCN-NEXT: v_or_b32_e32 v0, v58, v0 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, 
s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 
v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 
offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v46, v18 -; GCN-NEXT: v_or_b32_e32 v19, v44, v19 -; GCN-NEXT: v_or_b32_e32 v20, v43, v20 -; GCN-NEXT: v_or_b32_e32 v21, v41, v21 -; GCN-NEXT: v_or_b32_e32 v22, v48, v22 -; GCN-NEXT: v_or_b32_e32 v23, v38, v23 -; GCN-NEXT: v_or_b32_e32 v24, v36, v24 -; GCN-NEXT: v_or_b32_e32 v25, v35, v25 -; GCN-NEXT: v_or_b32_e32 v26, v37, v26 -; 
GCN-NEXT: v_or_b32_e32 v27, v39, v27 -; GCN-NEXT: v_or_b32_e32 v28, v49, v28 -; GCN-NEXT: v_or_b32_e32 v29, v51, v29 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; 
kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: .LBB9_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v56 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: 
v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 
offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; 
GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: 
buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v46 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v44 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v55 
-; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_or_b32_e32 v17, v19, v18 -; GCN-NEXT: v_or_b32_e32 v18, v21, v20 -; GCN-NEXT: v_or_b32_e32 v19, v23, v22 -; GCN-NEXT: v_or_b32_e32 v20, v25, v24 -; GCN-NEXT: v_or_b32_e32 v21, v27, v26 -; GCN-NEXT: v_or_b32_e32 v22, v29, v28 -; GCN-NEXT: v_or_b32_e32 v23, v31, v30 -; GCN-NEXT: v_or_b32_e32 v24, v33, v32 -; GCN-NEXT: v_or_b32_e32 v25, v35, v34 -; GCN-NEXT: v_or_b32_e32 v26, v37, v36 -; GCN-NEXT: v_or_b32_e32 v27, v39, v38 -; GCN-NEXT: v_or_b32_e32 v28, v49, v48 -; GCN-NEXT: v_or_b32_e32 v29, v51, 
v50 -; GCN-NEXT: .LBB9_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v60i16_to_v30i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; 
implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: 
v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v56 +; SI-NEXT: v_or_b32_e32 v13, v13, v47 +; SI-NEXT: v_or_b32_e32 v14, v14, v46 +; SI-NEXT: v_or_b32_e32 v15, v15, v38 +; SI-NEXT: v_or_b32_e32 v16, v16, v45 +; SI-NEXT: v_or_b32_e32 v17, v17, v44 +; SI-NEXT: v_or_b32_e32 v19, v19, v42 +; SI-NEXT: v_or_b32_e32 v20, v20, v41 +; SI-NEXT: v_or_b32_e32 v21, v21, v40 +; SI-NEXT: v_or_b32_e32 v22, v22, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_or_b32_e32 v25, v25, v34 +; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v32 +; SI-NEXT: v_or_b32_e32 v28, v28, v63 +; SI-NEXT: v_or_b32_e32 v29, v29, v62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; 
SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB14_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 
offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v18, v43, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, 
vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, 
v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v56, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v46, v14 +; SI-NEXT: v_or_b32_e32 v15, v38, v15 +; SI-NEXT: v_or_b32_e32 v16, v45, v16 +; SI-NEXT: v_or_b32_e32 v17, v44, v17 +; SI-NEXT: v_or_b32_e32 v19, v42, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v40, v21 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_or_b32_e32 v23, v36, v23 +; SI-NEXT: v_or_b32_e32 v24, v35, v24 +; SI-NEXT: v_or_b32_e32 v25, v34, v25 +; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: v_or_b32_e32 v28, v63, v28 +; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 
v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: .LBB14_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, 
off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v60f16_to_v30i32: +; VI-LABEL: bitcast_v60i16_to_v30i32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -5590,7 +5711,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v29, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -5683,102 +5804,102 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB9_2: ; %Flow +; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB9_4 +; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v29, 0x200 -; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: 
v_add_f16_e32 v3, 0x200, v60 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 -; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_add_f16_sdwa v3, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v58 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_add_f16_sdwa v4, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v57 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_add_f16_sdwa v5, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v56 -; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: v_add_f16_sdwa v6, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v47 -; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: v_add_f16_sdwa v7, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v46 -; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_f16_sdwa v8, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v45 -; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_f16_sdwa v9, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, 0x200, v44 -; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_f16_sdwa v10, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v43 -; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_f16_sdwa v11, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v42 -; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_f16_sdwa v12, v41, 
v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v13, 0x200, v41 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_f16_sdwa v13, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v40 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 -; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 -; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: v_add_f16_sdwa v15, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v54 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: v_add_f16_sdwa v16, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v17, 0x200, v53 -; VI-NEXT: v_or_b32_e32 v16, v17, v16 -; VI-NEXT: v_add_f16_sdwa v17, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v18, 0x200, v52 -; VI-NEXT: v_or_b32_e32 v17, v18, v17 -; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 -; VI-NEXT: v_or_b32_e32 v18, v19, v18 -; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 -; VI-NEXT: v_or_b32_e32 v19, v20, v19 -; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 -; VI-NEXT: v_or_b32_e32 v20, v21, v20 -; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 -; VI-NEXT: v_or_b32_e32 v21, v22, v21 -; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 
v23, 0x200, v39 -; VI-NEXT: v_or_b32_e32 v22, v23, v22 -; VI-NEXT: v_add_f16_sdwa v23, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v24, 0x200, v38 -; VI-NEXT: v_or_b32_e32 v23, v24, v23 -; VI-NEXT: v_add_f16_sdwa v24, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v37 -; VI-NEXT: v_or_b32_e32 v24, v25, v24 -; VI-NEXT: v_add_f16_sdwa v25, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v26, 0x200, v36 -; VI-NEXT: v_or_b32_e32 v25, v26, v25 -; VI-NEXT: v_add_f16_sdwa v26, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v27, 0x200, v35 -; VI-NEXT: v_or_b32_e32 v26, v27, v26 -; VI-NEXT: v_add_f16_sdwa v27, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v28, 0x200, v34 -; VI-NEXT: v_or_b32_e32 v27, v28, v27 -; VI-NEXT: v_add_f16_sdwa v28, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v30, 0x200, v33 -; VI-NEXT: v_or_b32_e32 v28, v30, v28 -; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 +; VI-NEXT: v_mov_b32_e32 v29, 3 +; VI-NEXT: v_add_u16_e32 v0, 3, v61 +; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v60 +; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v59 +; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v58 +; VI-NEXT: v_add_u16_sdwa v4, v58, v29 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v57 +; VI-NEXT: v_add_u16_sdwa v5, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v56 +; VI-NEXT: v_add_u16_sdwa v6, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v47 +; VI-NEXT: v_add_u16_sdwa v7, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v46 +; VI-NEXT: v_add_u16_sdwa v8, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v45 +; VI-NEXT: v_add_u16_sdwa v9, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v44 +; VI-NEXT: v_add_u16_sdwa v10, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v43 +; VI-NEXT: v_add_u16_sdwa v11, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v42 +; VI-NEXT: v_add_u16_sdwa v12, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v41 +; VI-NEXT: v_add_u16_sdwa v13, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v40 +; VI-NEXT: v_add_u16_sdwa v14, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v55 +; VI-NEXT: 
v_add_u16_sdwa v15, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_add_u16_e32 v15, 3, v54 +; VI-NEXT: v_add_u16_sdwa v16, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_u16_e32 v16, 3, v53 +; VI-NEXT: v_add_u16_sdwa v17, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_add_u16_e32 v17, 3, v52 +; VI-NEXT: v_add_u16_sdwa v18, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_u16_e32 v18, 3, v51 +; VI-NEXT: v_add_u16_sdwa v19, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_add_u16_e32 v19, 3, v50 +; VI-NEXT: v_add_u16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_u16_e32 v20, 3, v49 +; VI-NEXT: v_add_u16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_add_u16_e32 v21, 3, v48 +; VI-NEXT: v_add_u16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: v_add_u16_e32 v22, 3, v39 +; VI-NEXT: v_add_u16_sdwa v23, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: v_add_u16_e32 v23, 3, v38 +; VI-NEXT: v_add_u16_sdwa v24, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: v_add_u16_e32 v24, 3, v37 +; VI-NEXT: v_add_u16_sdwa v25, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; 
VI-NEXT: v_add_u16_e32 v25, 3, v36 +; VI-NEXT: v_add_u16_sdwa v26, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: v_add_u16_e32 v26, 3, v35 +; VI-NEXT: v_add_u16_sdwa v27, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v27 +; VI-NEXT: v_add_u16_e32 v27, 3, v34 +; VI-NEXT: v_add_u16_sdwa v28, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v27, v28 +; VI-NEXT: v_add_u16_e32 v28, 3, v33 +; VI-NEXT: v_add_u16_sdwa v30, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v28, v30 +; VI-NEXT: v_add_u16_e32 v30, 3, v32 +; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB9_4: ; %end +; VI-NEXT: .LBB14_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -5797,7 +5918,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v60f16_to_v30i32: +; GFX9-LABEL: bitcast_v60i16_to_v30i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -5901,7 +6022,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; 
GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -6057,9 +6178,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB9_2: ; %Flow +; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB9_4 +; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload @@ -6079,7 +6200,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 @@ -6088,15 +6208,15 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -6129,50 +6249,50 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, 
v27, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v28, v28, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB9_4: ; %end +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB14_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -6193,7 +6313,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32: +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -6201,43 +6321,43 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB9_2: ; %end +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30i32: +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30i32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 @@ -6305,51 +6425,51 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB9_2: ; %end +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB14_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <60 x half> %a, splat (half 0xH0200) - %a2 = bitcast <60 x half> %a1 to <30 x i32> + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <30 x i32> br label %end cmp.false: - %a3 = bitcast <60 x half> %a to <30 x i32> + %a3 = bitcast <60 x i16> %a to <30 x i32> br label %end end: @@ -6357,1029 +6477,4967 @@ end: ret <30 x i32> %phi } -define <15 x i64> @bitcast_v30f32_to_v15i64(<30 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v30f32_to_v15i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, 
v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60i16_to_v30i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: 
v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false 
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v17, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: 
s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_or_b32_e32 v22, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v24, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v25, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v26, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v27, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v28, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 
s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, 
vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v29, 
vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v44 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v58 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: 
v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v31, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: v_mov_b32_e32 v62, v47 +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: v_mov_b32_e32 v56, v57 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v58, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: s_branch .LBB15_2 ; -; VI-LABEL: bitcast_v30f32_to_v15i64: +; VI-LABEL: bitcast_v60i16_to_v30i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; 
VI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB10_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB15_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: 
s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB15_3 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: 
v_add_u32_e32 v14, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 
3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, 
s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: 
v_add_u32_e32 v28, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB15_3: ; %end ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB15_2 ; -; GFX9-LABEL: bitcast_v30f32_to_v15i64: +; GFX9-LABEL: bitcast_v60i16_to_v30i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB10_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v30f32_to_v15i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB10_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 -; GFX11-NEXT: v_dual_add_f32 
v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB10_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: 
s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; 
GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB15_3 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v63, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 
16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, 
v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB15_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; 
GFX9-NEXT: s_branch .LBB15_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; 
GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 
v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB15_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB15_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB15_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 
s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 +; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB15_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB15_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB15_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <30 x float> %a1 to <15 x i64> + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <30 x i32> br label %end cmp.false: - %a3 = bitcast <30 x float> %a to <15 x i64> + %a3 = bitcast <60 x i16> %a to <30 x i32> br label 
%end end: - %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x i64> %phi + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi } -define <30 x float> @bitcast_v15i64_to_v30f32(<15 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v15i64_to_v30f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, 
v1, vcc -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { +; SI-LABEL: bitcast_v30i32_to_v60f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; 
SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; 
SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; 
implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: .LBB16_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 
3, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: 
v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: v_mov_b32_e32 v34, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v63, v25 +; SI-NEXT: 
v_mov_b32_e32 v59, v26 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; 
SI-NEXT: .LBB16_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, 
v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; 
SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, 
s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v15i64_to_v30f32: +; VI-LABEL: bitcast_v30i32_to_v60f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB16_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 ; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 ; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 ; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 ; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 ; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 ; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 ; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, 
v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 ; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: 
v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB16_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v15i64_to_v30f32: +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; 
VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-LABEL: bitcast_v30i32_to_v60f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; 
implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB16_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: 
v_add_co_u32_e32 v28, vcc, 3, v28 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: .LBB11_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v15i64_to_v30f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-NEXT: ; %bb.1: 
; %cmp.true -; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: .LBB11_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <15 x i64> %a, splat (i64 3) - %a2 = bitcast <15 x i64> %a1 to <30 x float> - br label %end - -cmp.false: - %a3 = bitcast <15 x i64> %a to <30 x float> - br label %end - -end: - %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <30 x float> %phi -} - -define <15 x double> @bitcast_v30f32_to_v15f64(<30 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v30f32_to_v15f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 
-; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v30f32_to_v15f64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB12_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; 
VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: .LBB12_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v30f32_to_v15f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB12_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: .LBB12_2: ; %end +; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: 
v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB16_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: bitcast_v30f32_to_v15f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB12_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 -; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 
1.0, v12 -; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: .LBB12_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <30 x float> %a1 to <15 x double> - br label %end - -cmp.false: - %a3 = bitcast <30 x float> %a to <15 x double> - br label %end - -end: - %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x double> %phi -} - -define <30 x float> @bitcast_v15f64_to_v30f32(<15 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v15f64_to_v30f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; 
GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v15f64_to_v30f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: .LBB13_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v15f64_to_v30f32: +; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB16_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <60 x half> + br label %end + +cmp.false: + %a3 = bitcast <30 x i32> %a to <60 x half> + br label %end + +end: + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi +} + +define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30i32_to_v60f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_readfirstlane_b32 s45, v1 +; SI-NEXT: v_readfirstlane_b32 s44, v2 +; SI-NEXT: v_readfirstlane_b32 s43, v3 +; SI-NEXT: v_readfirstlane_b32 s42, v4 +; SI-NEXT: v_readfirstlane_b32 s41, v5 +; SI-NEXT: v_readfirstlane_b32 s40, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], 
s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: 
s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, 
s18 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_lshr_b32 s5, s17, 16 +; SI-NEXT: s_lshr_b32 s46, s18, 16 +; SI-NEXT: s_lshr_b32 s47, s19, 16 +; SI-NEXT: s_lshr_b32 s56, s20, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_lshr_b32 s58, s22, 16 +; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_lshr_b32 s60, s24, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_lshr_b32 s62, s26, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s28, 16 +; SI-NEXT: s_lshr_b32 s73, s29, 16 +; SI-NEXT: s_lshr_b32 s74, s45, 16 +; SI-NEXT: s_lshr_b32 s75, s44, 16 +; SI-NEXT: s_lshr_b32 s76, s43, 16 +; SI-NEXT: s_lshr_b32 s77, s42, 16 +; SI-NEXT: s_lshr_b32 s78, s41, 16 +; SI-NEXT: s_lshr_b32 s79, s40, 16 +; SI-NEXT: s_lshr_b32 s88, s15, 16 +; SI-NEXT: s_lshr_b32 s89, s14, 16 +; SI-NEXT: s_lshr_b32 s90, s13, 16 +; 
SI-NEXT: s_lshr_b32 s91, s12, 16 +; SI-NEXT: s_lshr_b32 s92, s11, 16 +; SI-NEXT: s_lshr_b32 s93, s10, 16 +; SI-NEXT: s_lshr_b32 s94, s8, 16 +; SI-NEXT: s_lshr_b32 s95, s7, 16 +; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16 +; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi +; SI-NEXT: v_cvt_f32_f16_e32 v2, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v3, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s78 +; SI-NEXT: 
v_cvt_f32_f16_e32 v24, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s59 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_or_b32_e32 v59, v59, v60 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: buffer_store_dword v59, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v59, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v57, v57, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: buffer_store_dword v57, v59, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v57, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v47, v47, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_add_i32_e32 v47, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v45, v45, v46 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: buffer_store_dword v45, v47, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v45, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v43, v44, v43 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v45, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v41, v42, v41 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v55, v40, v55 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v53, v54, v53 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: 
buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v36, v38, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_or_b32_e32 v34, v36, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_add_i32_e32 v35, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v34, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, 
s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v22, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v17, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: 
v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, 
s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; 
implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v30i32_to_v60f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v30, s30, 0 +; VI-NEXT: v_writelane_b32 v30, s31, 1 +; VI-NEXT: v_writelane_b32 v30, s34, 2 +; VI-NEXT: v_writelane_b32 v30, s35, 3 +; VI-NEXT: v_writelane_b32 v30, s36, 4 +; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_writelane_b32 v30, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s45, v0 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s43, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s41, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s15, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s13, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s11, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s9, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: 
v_readfirstlane_b32 s6, v14 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v15 +; VI-NEXT: v_writelane_b32 v30, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s27, 
s27, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s39, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s38, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s37, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s36, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s35, 16 +; 
VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s34, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s31, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s30, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s91, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s90, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s89, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s88, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s79, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s78, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s29, s77, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s44, s76, 16 +; VI-NEXT: s_or_b32 s29, s29, s44 +; VI-NEXT: s_and_b32 s43, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s44, s75, 16 +; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s44, s74, 16 +; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s44, s73, 16 +; VI-NEXT: s_or_b32 s41, s41, s44 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s44, s72, 16 +; VI-NEXT: s_or_b32 s40, s40, s44 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s44, s63, 16 +; VI-NEXT: s_or_b32 s15, s15, s44 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s44, s62, 16 +; VI-NEXT: s_or_b32 s14, s14, s44 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s44, s61, 16 +; VI-NEXT: s_or_b32 
s13, s13, s44 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s44, s60, 16 +; VI-NEXT: s_or_b32 s12, s12, s44 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s44, s59, 16 +; VI-NEXT: s_or_b32 s11, s11, s44 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s44, s58, 16 +; VI-NEXT: s_or_b32 s10, s10, s44 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s44, s57, 16 +; VI-NEXT: s_or_b32 s9, s9, s44 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s44, s56, 16 +; VI-NEXT: s_or_b32 s8, s8, s44 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s44, s47, 16 +; VI-NEXT: s_or_b32 s6, s6, s44 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s43 +; VI-NEXT: v_mov_b32_e32 v17, s42 +; VI-NEXT: v_mov_b32_e32 v18, s41 +; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v20, s15 +; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: v_mov_b32_e32 v22, s13 +; VI-NEXT: v_mov_b32_e32 v23, s12 +; VI-NEXT: v_mov_b32_e32 v24, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v26, s9 +; VI-NEXT: v_mov_b32_e32 v27, s8 +; VI-NEXT: v_mov_b32_e32 v28, s6 +; VI-NEXT: v_mov_b32_e32 v29, s7 +; VI-NEXT: v_readlane_b32 s39, v30, 7 +; VI-NEXT: v_readlane_b32 s38, v30, 6 +; VI-NEXT: v_readlane_b32 s37, v30, 5 +; VI-NEXT: v_readlane_b32 s36, v30, 4 +; VI-NEXT: v_readlane_b32 s35, v30, 3 +; 
VI-NEXT: v_readlane_b32 s34, v30, 2 +; VI-NEXT: v_readlane_b32 s31, v30, 1 +; VI-NEXT: v_readlane_b32 s30, v30, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB17_4: +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v30i32_to_v60f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: 
v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: .LBB13_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v30, s30, 0 +; GFX9-NEXT: v_writelane_b32 v30, s31, 1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_writelane_b32 v30, s34, 2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: v_readfirstlane_b32 s44, v14 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s45, v15 +; GFX9-NEXT: v_writelane_b32 v30, s35, 3 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 
16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s45, s45, 3 +; GFX9-NEXT: s_add_i32 s44, s44, 3 +; GFX9-NEXT: s_add_i32 s43, s43, 3 +; GFX9-NEXT: s_add_i32 s42, s42, 3 +; GFX9-NEXT: s_add_i32 s41, s41, 3 +; GFX9-NEXT: s_add_i32 s40, s40, 3 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_add_i32 s11, s11, 3 +; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: s_add_i32 s9, s9, 3 +; GFX9-NEXT: s_add_i32 s8, s8, 3 +; GFX9-NEXT: s_add_i32 s7, s7, 3 +; GFX9-NEXT: s_add_i32 s6, s6, 3 +; GFX9-NEXT: s_add_i32 s29, s29, 3 +; GFX9-NEXT: s_add_i32 s28, s28, 3 +; GFX9-NEXT: s_add_i32 s27, s27, 3 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_add_i32 s25, s25, 3 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; 
GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s34 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s31 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s30 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 
s8, s8, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: v_mov_b32_e32 v28, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_readlane_b32 s35, v30, 3 +; GFX9-NEXT: v_readlane_b32 s34, v30, 2 +; GFX9-NEXT: v_readlane_b32 s31, v30, 1 +; GFX9-NEXT: v_readlane_b32 s30, v30, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; 
GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB17_2 ; -; GFX11-LABEL: bitcast_v15f64_to_v30f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB13_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: .LBB13_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-TRUE16-NEXT: s_mov_b32 s94, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 
s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-TRUE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3 +; 
GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: .LBB17_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s15, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s14, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v19, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s13 :: v_dual_mov_b32 v21, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s11 :: v_dual_mov_b32 v23, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s9 :: v_dual_mov_b32 v25, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v27, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB17_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB17_2 +; +; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-FAKE16-NEXT: s_mov_b32 s94, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 
16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-FAKE16-NEXT: .LBB17_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s14, s14, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s15, s15, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s13, s13, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s11, s11, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s9, s9, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s7, s7, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s29, s29, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s27, s27, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s25, s25, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s23, s23, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s21, s21, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s19, s19, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s17, s17, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 
+; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: .LBB17_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, 
s22, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s15, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s12 
:: v_dual_mov_b32 v27, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB17_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: s_branch .LBB17_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <15 x double> %a1 to <30 x float> + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <60 x half> br label %end cmp.false: - %a3 = bitcast <15 x double> %a to <30 x float> + %a3 = bitcast <30 x i32> %a to <60 x half> br label 
%end end: - %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <30 x float> %phi + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi } -define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v30f32_to_v60i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; 
implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v48, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v53, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v44, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v46, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: 
v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB14_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 
v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v48, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v53, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v44, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v46, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB14_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v60 -; GCN-NEXT: v_or_b32_e32 v2, v2, v56 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v3, v3, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, 
v59 -; GCN-NEXT: v_or_b32_e32 v4, v4, v59 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v5, v5, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v6, v6, v58 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_or_b32_e32 v8, v8, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v9, v9, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v11, v11, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v12, v12, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v13, v13, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v14, v14, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v15, v15, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, 
v16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v16, v16, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v17, v17, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v18, v18, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v19, v19, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v20, v20, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v21, v21, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v22, v22, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v24, v24, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v25, v25, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v28, v28, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v29, v29, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v30, v30, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v33, s[0:3], 
0 offen -; GCN-NEXT: buffer_store_dword v26, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v60f16_to_v30i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword 
v48, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: 
v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte 
Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed 
$vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v23, v52, v23 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_or_b32_e32 v26, v38, v26 +; SI-NEXT: v_or_b32_e32 v27, v36, v27 +; SI-NEXT: v_or_b32_e32 v28, v34, v28 +; SI-NEXT: 
v_or_b32_e32 v29, v32, v29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_or_b32_e32 v22, v62, v22 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB18_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: 
v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, 
s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 
0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: 
v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], 
s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 
+; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB18_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v30f32_to_v60i16: +; VI-LABEL: bitcast_v60f16_to_v30i32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v34, v27 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v36, v25 +; VI-NEXT: v_mov_b32_e32 v37, v24 +; VI-NEXT: v_mov_b32_e32 v38, v23 +; VI-NEXT: v_mov_b32_e32 v39, v22 +; VI-NEXT: v_mov_b32_e32 v48, v21 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v50, v19 +; VI-NEXT: v_mov_b32_e32 v51, v18 +; VI-NEXT: v_mov_b32_e32 v52, v17 +; VI-NEXT: v_mov_b32_e32 v53, v16 +; VI-NEXT: v_mov_b32_e32 v54, v15 +; 
VI-NEXT: v_mov_b32_e32 v55, v14 +; VI-NEXT: v_mov_b32_e32 v40, v13 +; VI-NEXT: v_mov_b32_e32 v41, v12 +; VI-NEXT: v_mov_b32_e32 v42, v11 +; VI-NEXT: v_mov_b32_e32 v43, v10 +; VI-NEXT: v_mov_b32_e32 v44, v9 +; VI-NEXT: v_mov_b32_e32 v45, v8 +; VI-NEXT: v_mov_b32_e32 v46, v7 +; VI-NEXT: v_mov_b32_e32 v47, v6 +; VI-NEXT: v_mov_b32_e32 v56, v5 +; VI-NEXT: v_mov_b32_e32 v57, v4 +; VI-NEXT: v_mov_b32_e32 v58, v3 +; VI-NEXT: v_mov_b32_e32 v59, v2 +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v29, 16 +; 
VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -7408,244 +11466,4613 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; 
VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB14_2: ; %Flow +; VI-NEXT: .LBB18_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB14_4 +; VI-NEXT: s_cbranch_execz .LBB18_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: 
v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB14_4: ; %end +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 +; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v3, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_f16_sdwa v4, v57, v29 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v57 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_add_f16_sdwa v5, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v56 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_add_f16_sdwa v6, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v47 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_add_f16_sdwa v7, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v46 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_f16_sdwa v8, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v45 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_add_f16_sdwa v9, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v44 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_f16_sdwa v10, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v43 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_add_f16_sdwa v11, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v42 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_add_f16_sdwa v12, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v41 +; VI-NEXT: v_or_b32_e32 v12, v13, v12 +; VI-NEXT: v_add_f16_sdwa v13, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v40 +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v14, 
v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v26, v27, v26 +; VI-NEXT: v_add_f16_sdwa v27, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: v_add_f16_sdwa v28, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v28, v30, v28 +; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB18_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; 
VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v30f32_to_v60i16: +; GFX9-LABEL: bitcast_v60f16_to_v30i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword 
v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; 
implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 
offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v48, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v19 +; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: v_mov_b32_e32 v43, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_mov_b32_e32 v44, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v8 +; GFX9-NEXT: v_mov_b32_e32 v46, v7 +; GFX9-NEXT: v_mov_b32_e32 v47, v6 +; GFX9-NEXT: v_mov_b32_e32 v56, v5 +; GFX9-NEXT: v_mov_b32_e32 v57, v4 +; GFX9-NEXT: v_mov_b32_e32 v58, v3 +; GFX9-NEXT: v_mov_b32_e32 v59, v2 +; GFX9-NEXT: v_mov_b32_e32 v60, v1 +; GFX9-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v22, 16, v39 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; 
GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; 
GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: .LBB18_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GFX9-NEXT: s_cbranch_execz .LBB18_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: s_movk_i32 s7, 0x200 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v28, v28, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB18_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; 
GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB18_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) 
| instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB18_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <30 x i32> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <30 x i32> + br label %end + +end: + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi +} + +define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60f16_to_v30i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, 
off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v24, off, s[0:3], 
s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_or_b32_e32 v10, v32, v10 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_lshlrev_b32_e32 
v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v14, v55, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v21, v51, v21 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v32, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_or_b32_e32 v9, v39, v9 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v54, v29 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: s_branch .LBB19_3 +; SI-NEXT: .LBB19_2: +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; 
SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v31, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: .LBB19_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v61, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: s_cbranch_vccnz .LBB19_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 
0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 
offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: 
s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, 
v57 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; 
SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 
v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, 
off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB19_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; 
+; VI-LABEL: bitcast_v60f16_to_v30i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: 
s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, 
v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, 
v14 +; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v26, v27, v26 +; VI-NEXT: v_add_f16_sdwa v27, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: v_add_f16_sdwa v28, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v28, v30, v28 +; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v60f16_to_v30i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 
+; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: 
v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB19_3 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; 
GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 
v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB19_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 
16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB19_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB19_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB19_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30i32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: 
v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 +; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, 
s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB19_3: ; %end +; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB19_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB19_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <30 x i32> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <30 x i32> + br label %end + +end: + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi +} + +define <15 x i64> @bitcast_v30f32_to_v15i64(<30 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v30f32_to_v15i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, 
v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v30f32_to_v15i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: 
.LBB20_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v30f32_to_v15i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v30f32_to_v15i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 
0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define inreg <15 x i64> @bitcast_v30f32_to_v15i64_scalar(<30 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30f32_to_v15i64_scalar: +; SI: ; %bb.0: +; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB21_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB21_3 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: 
v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB21_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_4: +; SI-NEXT: s_branch .LBB21_2 +; +; VI-LABEL: bitcast_v30f32_to_v15i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB21_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: .LBB21_2: ; 
%cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB21_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_4: +; VI-NEXT: s_branch .LBB21_2 +; +; GFX9-LABEL: bitcast_v30f32_to_v15i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: 
v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; 
GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB21_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: s_branch .LBB21_2 +; +; GFX11-LABEL: bitcast_v30f32_to_v15i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB21_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 
v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define <30 x float> @bitcast_v15i64_to_v30f32(<15 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v15i64_to_v30f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: 
v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15i64_to_v30f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; 
VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15i64_to_v30f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: 
v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v15i64_to_v30f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, 
v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x 
i64> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <15 x i64> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define inreg <30 x float> @bitcast_v15i64_to_v30f32_scalar(<15 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15i64_to_v30f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, 
v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: s_branch .LBB23_2 +; +; VI-LABEL: bitcast_v15i64_to_v30f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: 
v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 
+; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v15i64_to_v30f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; 
GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v15i64_to_v30f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 
v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB23_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: +; GFX11-NEXT: .LBB23_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 
v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <15 x i64> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define <15 x double> @bitcast_v30f32_to_v15f64(<30 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v30f32_to_v15f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; 
SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v30f32_to_v15f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; 
VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v30f32_to_v15f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 
v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v30f32_to_v15f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: 
v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi +} + +define inreg <15 x double> @bitcast_v30f32_to_v15f64_scalar(<30 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30f32_to_v15f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.1: ; 
%cmp.false +; SI-NEXT: s_cbranch_execnz .LBB25_3 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: .LBB25_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_branch .LBB25_2 +; +; VI-LABEL: bitcast_v30f32_to_v15f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: 
v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB25_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB25_3: ; %end +; 
VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_4: +; VI-NEXT: s_branch .LBB25_2 +; +; GFX9-LABEL: bitcast_v30f32_to_v15f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: 
v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB25_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: s_branch .LBB25_2 +; +; GFX11-LABEL: bitcast_v30f32_to_v15f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: 
v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB25_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <15 x double> + br label 
%end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi +} + +define <30 x float> @bitcast_v15f64_to_v30f32(<15 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v15f64_to_v30f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15f64_to_v30f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: 
v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15f64_to_v30f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v15f64_to_v30f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; 
GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <15 x double> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15f64_to_v30f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v31, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; 
SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: v_mov_b32_e32 v17, v31 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v15f64_to_v30f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: 
v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v31, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB27_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB27_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: v_mov_b32_e32 v17, v31 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_4: +; VI-NEXT: s_branch .LBB27_2 +; +; GFX9-LABEL: bitcast_v15f64_to_v30f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: 
v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB27_3: ; %end 
+; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: v_mov_b32_e32 v17, v31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: s_branch .LBB27_2 +; +; GFX11-LABEL: bitcast_v15f64_to_v30f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB27_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: .LBB27_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 
+; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <15 x double> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v30f32_to_v60i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v48, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v51, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v53, v12, v11, 
16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v44, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB28_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: 
v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v48, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v51, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v53, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v44, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB28_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; 
SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v56 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 
; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v30f32_to_v60i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; 
implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; 
VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB28_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 
16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB28_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: 
v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v30f32_to_v60i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: 
$vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 @@ -7665,9 +16092,9 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB14_2: ; %Flow +; GFX9-NEXT: .LBB28_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB14_4 +; GFX9-NEXT: s_cbranch_execz .LBB28_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 ; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 @@ -7729,7 +16156,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB14_4: ; %end +; GFX9-NEXT: .LBB28_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 @@ -7785,7 +16212,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 @@ -7802,7 +16229,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB14_2: ; %end +; GFX11-TRUE16-NEXT: .LBB28_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7843,7 +16270,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz 
.LBB28_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 @@ -7875,9 +16302,9 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB28_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 @@ -7924,7 +16351,7 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB14_4: ; %end +; GFX11-FAKE16-NEXT: .LBB28_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 @@ -7975,566 +16402,1819 @@ end: ret <60 x i16> %phi } +define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30f32_to_v60i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v28, s17 +; SI-NEXT: v_mov_b32_e32 v33, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v32, s19 +; SI-NEXT: v_mov_b32_e32 v29, s20 +; SI-NEXT: v_mov_b32_e32 v27, s21 +; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v24, s23 +; SI-NEXT: 
v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v19, s27 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v17, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v26, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v38, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v48, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v51, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v53, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v55, v21, v23, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v25, 16 +; SI-NEXT: v_alignbit_b32 v44, v27, v29, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 
v46, v32, v33, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v28, v30, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v21 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v27 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 
+; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v26, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v38, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v48, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v51, v17, v18, 16 +; SI-NEXT: v_alignbit_b32 v53, v19, v20, 16 +; SI-NEXT: v_alignbit_b32 v55, v21, v23, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v25, 16 +; SI-NEXT: v_alignbit_b32 v44, v27, v29, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v32, v33, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v28, v30, 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v21 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v27 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_or_b32_e32 v30, v30, v56 +; SI-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v60 +; SI-NEXT: 
v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v46 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 +; SI-NEXT: v_or_b32_e32 v28, v28, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v44 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v58 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v43 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v30f32_to_v60i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v19, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v28, s19 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v29, s20 +; VI-NEXT: v_mov_b32_e32 v27, s21 +; VI-NEXT: v_mov_b32_e32 v26, s22 +; VI-NEXT: 
v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v24, s24 +; VI-NEXT: v_mov_b32_e32 v23, s25 +; VI-NEXT: v_mov_b32_e32 v22, s26 +; VI-NEXT: v_mov_b32_e32 v21, s27 +; VI-NEXT: v_mov_b32_e32 v20, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB29_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; 
VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: 
v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: .LBB29_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v27, 
v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; VI-NEXT: v_or_b32_sdwa v48, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 +; VI-NEXT: v_or_b32_sdwa v49, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 +; VI-NEXT: v_or_b32_sdwa v50, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v51, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 +; VI-NEXT: v_or_b32_sdwa v52, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; VI-NEXT: v_or_b32_sdwa v53, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v28, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v29, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v32 +; VI-NEXT: v_mov_b32_e32 v1, v33 +; VI-NEXT: v_mov_b32_e32 v2, v34 +; VI-NEXT: v_mov_b32_e32 v3, v35 +; VI-NEXT: v_mov_b32_e32 v4, v36 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v6, v38 +; VI-NEXT: v_mov_b32_e32 v7, v39 +; VI-NEXT: v_mov_b32_e32 v8, v48 +; VI-NEXT: v_mov_b32_e32 v9, v49 +; VI-NEXT: v_mov_b32_e32 v10, v50 +; VI-NEXT: v_mov_b32_e32 v11, v51 +; VI-NEXT: v_mov_b32_e32 v12, v52 +; VI-NEXT: v_mov_b32_e32 v13, v53 +; VI-NEXT: v_mov_b32_e32 v14, v30 +; VI-NEXT: v_mov_b32_e32 v15, v31 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_4: +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; 
implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_branch .LBB29_2 +; +; GFX9-LABEL: bitcast_v30f32_to_v60i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v28, s19 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v29, s20 +; GFX9-NEXT: v_mov_b32_e32 v27, s21 +; GFX9-NEXT: v_mov_b32_e32 v26, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v21, s27 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 
+; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v37, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: .LBB29_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 
+; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v28, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, 
v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v33 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-NEXT: v_mov_b32_e32 v4, v36 +; GFX9-NEXT: v_mov_b32_e32 v5, v37 +; GFX9-NEXT: v_mov_b32_e32 v6, v38 +; GFX9-NEXT: v_mov_b32_e32 v7, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v9, v49 +; GFX9-NEXT: v_mov_b32_e32 v10, v50 +; GFX9-NEXT: v_mov_b32_e32 v11, v51 +; GFX9-NEXT: v_mov_b32_e32 v12, v52 +; GFX9-NEXT: v_mov_b32_e32 v13, v53 +; GFX9-NEXT: v_mov_b32_e32 v14, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_branch .LBB29_2 +; +; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v17, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v15, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, 
v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-TRUE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: .LBB29_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, 
v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v39, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v81, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v80, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v71, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v70, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v48 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v34 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB29_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB29_2 +; +; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB29_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB29_3 +; GFX11-FAKE16-NEXT: .LBB29_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: 
v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: .LBB29_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 
v13, v82, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v48 +; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v34 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB29_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: s_branch .LBB29_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <60 x i16> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <60 x i16> + br label %end + +end: + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 
x i16> %phi +} + define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v60i16_to_v30f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 
offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v18, off, 
s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v41 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v63 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v62 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v61 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v60 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v59 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v57 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v47 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, 
v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: 
killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; 
GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: .LBB15_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, 
v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v41, v4 -; GCN-NEXT: v_or_b32_e32 v5, v63, v5 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_or_b32_e32 v7, v61, v7 -; GCN-NEXT: v_or_b32_e32 v8, v60, v8 -; GCN-NEXT: v_or_b32_e32 v9, v59, v9 -; GCN-NEXT: v_or_b32_e32 v10, v58, v10 -; GCN-NEXT: v_or_b32_e32 v11, v57, v11 -; GCN-NEXT: v_or_b32_e32 v12, v47, v12 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v30, v13 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v30, v14 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v30, v15 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v30, v18 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v30, v19 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v30, v20 -; GCN-NEXT: buffer_load_dword v30, off, 
s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v30, v21 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v30, v22 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v30, v23 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v30, v24 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v30, v25 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v30, v26 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v30, v27 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v30, v29 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 
s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: .LBB15_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v60i16_to_v30f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 +; 
SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 
offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v30, off, 
s[0:3], s32 offset:92 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: 
$vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v56 +; SI-NEXT: v_or_b32_e32 v13, v13, v47 +; SI-NEXT: v_or_b32_e32 v14, v14, v46 +; SI-NEXT: 
v_or_b32_e32 v15, v15, v38 +; SI-NEXT: v_or_b32_e32 v16, v16, v45 +; SI-NEXT: v_or_b32_e32 v17, v17, v44 +; SI-NEXT: v_or_b32_e32 v19, v19, v42 +; SI-NEXT: v_or_b32_e32 v20, v20, v41 +; SI-NEXT: v_or_b32_e32 v21, v21, v40 +; SI-NEXT: v_or_b32_e32 v22, v22, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_or_b32_e32 v25, v25, v34 +; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v32 +; SI-NEXT: v_or_b32_e32 v28, v28, v63 +; SI-NEXT: v_or_b32_e32 v29, v29, v62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB30_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v18, v43, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 
v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v56, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v46, v14 +; SI-NEXT: v_or_b32_e32 v15, v38, v15 +; SI-NEXT: v_or_b32_e32 v16, v45, v16 +; SI-NEXT: v_or_b32_e32 v17, v44, v17 +; SI-NEXT: v_or_b32_e32 v19, v42, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, 
v40, v21 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_or_b32_e32 v23, v36, v23 +; SI-NEXT: v_or_b32_e32 v24, v35, v24 +; SI-NEXT: v_or_b32_e32 v25, v34, v25 +; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: v_or_b32_e32 v28, v63, v28 +; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: .LBB30_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, 
s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30f32: ; VI: ; %bb.0: @@ -8587,7 +18267,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v29, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -8680,9 +18360,9 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; 
VI-NEXT: .LBB15_2: ; %Flow +; VI-NEXT: .LBB30_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_4 +; VI-NEXT: s_cbranch_execz .LBB30_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v29, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v61 @@ -8775,7 +18455,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v30, 3, v32 ; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB15_4: ; %end +; VI-NEXT: .LBB30_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -8898,7 +18578,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_cbranch_execz .LBB30_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -9054,9 +18734,9 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB15_2: ; %Flow +; GFX9-NEXT: .LBB30_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], 
s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload @@ -9168,926 +18848,5135 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 ; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB15_4: ; %end +; GFX9-NEXT: .LBB30_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB30_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] 
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB30_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <60 x i16> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60i16_to_v30f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, 
s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; 
SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: 
s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v17, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: s_or_b32 s9, s9, s10 +; 
SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_or_b32_e32 v22, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v24, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v25, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v26, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v27, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v28, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: 
s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, 
v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, 
v0 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v44 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v58 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: 
v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v31, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: v_mov_b32_e32 v62, v47 +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: v_mov_b32_e32 v56, v57 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v58, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v60i16_to_v30f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 
+; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB31_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa 
v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB31_3 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_add_u32_e32 v1, 
vcc, 3, v55 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: 
s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 
16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB31_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_4: +; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB31_2 +; +; GFX9-LABEL: bitcast_v60i16_to_v30f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, 
s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: 
v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB31_3 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v63, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 
offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB31_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB31_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB31_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB31_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 
0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: 
s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB31_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB31_3 +; GFX11-FAKE16-NEXT: .LBB31_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, 
v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB31_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB31_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB31_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <60 x i16> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v30f32_to_v60f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: 
$vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded 
Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; 
implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; 
SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: 
v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, 
off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: .LBB32_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: 
v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: v_mov_b32_e32 v34, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v63, v25 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: .LBB32_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 
v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v30f32_to_v60f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; 
VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB32_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: 
v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; 
VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB32_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 
v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 
s[30:31] +; +; GFX9-LABEL: bitcast_v30f32_to_v60f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 
+; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB32_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_4 +; GFX9-NEXT: ; %bb.3: ; 
%cmp.true +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: 
v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 
:: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: .LBB32_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB32_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 
1.0, v28 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB32_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <60 x half> + br label %end + +cmp.false: + %a3 = bitcast <30 x float> %a to <60 x half> + br label %end + +end: + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi +} + +define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v30f32_to_v60f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_readfirstlane_b32 s45, v1 +; SI-NEXT: v_readfirstlane_b32 s44, v2 +; SI-NEXT: v_readfirstlane_b32 s43, v3 +; SI-NEXT: v_readfirstlane_b32 s42, v4 +; SI-NEXT: v_readfirstlane_b32 s41, v5 +; SI-NEXT: v_readfirstlane_b32 s40, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, 
v16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB33_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; 
SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s10 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, s11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, 
s12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s16 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v6, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_add_f32_e64 v3, s18, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_add_f32_e64 v5, s19, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_add_f32_e64 v7, s20, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_add_f32_e64 v16, s12, 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: 
v_add_f32_e64 v11, s22, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v16 +; SI-NEXT: v_add_f32_e64 v40, s6, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e64 v17, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s45, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s40, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_add_f32_e64 v19, s26, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; SI-NEXT: v_add_f32_e64 v48, s8, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v43 +; SI-NEXT: v_add_f32_e64 v15, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s41, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_add_f32_e64 v18, s13, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v46 +; SI-NEXT: v_add_f32_e64 v13, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s42, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_add_f32_e64 v37, s10, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 +; SI-NEXT: v_add_f32_e64 v29, s43, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v58 +; SI-NEXT: v_add_f32_e64 v9, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v31, s44, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v33, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v52, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v44, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v43, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v44, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; 
SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v4 +; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v4, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v10, v6 +; SI-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v4, v56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 +; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: 
v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 +; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v9 +; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v40 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v52 +; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 +; SI-NEXT: v_add_i32_e32 v6, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v48 +; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; 
SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 +; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 +; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 +; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 +; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: 
buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v20 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v62 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v61 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: 
buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; 
implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_branch .LBB33_2 +; +; VI-LABEL: bitcast_v30f32_to_v60f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v19, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v28, s19 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v29, s20 +; VI-NEXT: v_mov_b32_e32 v27, s21 +; VI-NEXT: v_mov_b32_e32 v26, s22 +; VI-NEXT: v_mov_b32_e32 v25, s23 +; VI-NEXT: v_mov_b32_e32 v24, s24 +; VI-NEXT: v_mov_b32_e32 v23, s25 +; VI-NEXT: v_mov_b32_e32 v22, s26 +; VI-NEXT: v_mov_b32_e32 v21, s27 +; VI-NEXT: v_mov_b32_e32 v20, s28 +; VI-NEXT: v_mov_b32_e32 v16, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB33_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; VI-NEXT: 
v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: s_cbranch_execnz .LBB33_3 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 
16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: .LBB33_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; VI-NEXT: v_or_b32_sdwa v48, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 +; VI-NEXT: v_or_b32_sdwa v49, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 +; VI-NEXT: v_or_b32_sdwa v50, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v51, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 +; VI-NEXT: v_or_b32_sdwa v52, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; VI-NEXT: v_or_b32_sdwa v53, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, 
v45 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v28, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v29, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v32 +; VI-NEXT: v_mov_b32_e32 v1, v33 +; VI-NEXT: v_mov_b32_e32 v2, v34 +; VI-NEXT: v_mov_b32_e32 v3, v35 +; VI-NEXT: v_mov_b32_e32 v4, v36 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v6, v38 +; VI-NEXT: v_mov_b32_e32 v7, v39 +; VI-NEXT: v_mov_b32_e32 v8, v48 +; VI-NEXT: v_mov_b32_e32 v9, v49 +; VI-NEXT: v_mov_b32_e32 v10, v50 +; VI-NEXT: v_mov_b32_e32 v11, v51 +; VI-NEXT: v_mov_b32_e32 v12, v52 +; VI-NEXT: v_mov_b32_e32 v13, v53 +; VI-NEXT: v_mov_b32_e32 v14, v30 +; VI-NEXT: v_mov_b32_e32 v15, v31 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_4: +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_branch .LBB33_2 +; +; GFX9-LABEL: bitcast_v30f32_to_v60f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v28, s19 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v29, s20 +; GFX9-NEXT: v_mov_b32_e32 v27, s21 +; GFX9-NEXT: v_mov_b32_e32 v26, s22 +; GFX9-NEXT: v_mov_b32_e32 v25, s23 +; GFX9-NEXT: v_mov_b32_e32 v24, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s25 +; GFX9-NEXT: v_mov_b32_e32 v22, s26 +; GFX9-NEXT: v_mov_b32_e32 v21, s27 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 +; GFX9-NEXT: v_mov_b32_e32 v16, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: s_cbranch_execnz .LBB33_3 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; 
GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: .LBB33_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 +; GFX9-NEXT: v_and_b32_e32 
v17, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v16, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; 
GFX9-NEXT: v_lshl_or_b32 v24, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v28, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v33 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-NEXT: v_mov_b32_e32 v4, v36 +; GFX9-NEXT: v_mov_b32_e32 v5, v37 +; GFX9-NEXT: v_mov_b32_e32 v6, v38 +; GFX9-NEXT: v_mov_b32_e32 v7, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v9, v49 +; GFX9-NEXT: v_mov_b32_e32 v10, v50 +; GFX9-NEXT: v_mov_b32_e32 v11, v51 +; GFX9-NEXT: v_mov_b32_e32 v12, v52 +; 
GFX9-NEXT: v_mov_b32_e32 v13, v53 +; GFX9-NEXT: v_mov_b32_e32 v14, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_branch .LBB33_2 ; -; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32: +; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 
v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB15_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v17, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v15, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-TRUE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-TRUE16-NEXT: .LBB33_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v19 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v39, 16, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v81, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v80, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v71, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v70, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 
0xffff, v10 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v48 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v34 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB33_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB33_2 ; -; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30f32: +; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 
0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 
op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB15_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v29, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v27, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v25, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v23, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v21, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v19, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v13, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB33_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB33_3 +; GFX11-FAKE16-NEXT: .LBB33_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-FAKE16-NEXT: v_dual_add_f32 
v19, 1.0, v19 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v29 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: .LBB33_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v21 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v48 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v34 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB33_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: s_branch .LBB33_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <60 x i16> %a, splat (i16 3) - %a2 = bitcast <60 x i16> %a1 to <30 x float> + %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <30 x float> %a1 to <60 x half> br label %end cmp.false: - %a3 = bitcast <60 x i16> %a to <30 x float> + %a3 = bitcast <30 x float> %a to <60 x half> br label %end end: - %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <30 x float> %phi + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi } -define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v30f32_to_v60f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: 
$vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr36 -; 
GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; GCN-NEXT: 
v_lshrrev_b32_e32 v45, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v3 
-; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v50 -; GCN-NEXT: v_mov_b32_e32 v50, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v63 -; GCN-NEXT: v_mov_b32_e32 v63, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 
v56, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: .LBB16_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GCN-NEXT: 
v_add_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte 
Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: .LBB16_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, 
v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v46 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v42 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v41 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v18, v54 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v52 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v51 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v49 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v37 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v35 
-; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v35, v30 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v33, v36, v33 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v37, v31 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v62 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v48, v34 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v49, v32 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: 
buffer_load_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v63 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v47, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v56, v57, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v58, v59, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v13, 
s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v60f16_to_v30f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, 
s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; 
SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, 
v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: 
; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; 
implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v23, v52, v23 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_or_b32_e32 v26, v38, v26 +; SI-NEXT: v_or_b32_e32 v27, v36, v27 +; SI-NEXT: v_or_b32_e32 v28, v34, v28 +; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 
+; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, 
v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; 
SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_or_b32_e32 v22, v62, v22 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB34_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; 
SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; 
SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; 
SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; 
SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 
+; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB34_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, 
s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v30f32_to_v60f16: +; VI-LABEL: bitcast_v60f16_to_v30f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 
offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v34, v27 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v36, v25 +; VI-NEXT: v_mov_b32_e32 v37, v24 +; VI-NEXT: v_mov_b32_e32 v38, v23 +; VI-NEXT: v_mov_b32_e32 v39, v22 +; VI-NEXT: v_mov_b32_e32 v48, v21 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v50, v19 +; VI-NEXT: v_mov_b32_e32 v51, v18 +; VI-NEXT: v_mov_b32_e32 v52, v17 +; VI-NEXT: v_mov_b32_e32 v53, v16 +; VI-NEXT: v_mov_b32_e32 v54, v15 +; VI-NEXT: v_mov_b32_e32 v55, v14 +; VI-NEXT: v_mov_b32_e32 v40, v13 +; VI-NEXT: v_mov_b32_e32 v41, v12 +; VI-NEXT: v_mov_b32_e32 v42, v11 +; VI-NEXT: v_mov_b32_e32 v43, v10 +; VI-NEXT: v_mov_b32_e32 v44, v9 +; VI-NEXT: v_mov_b32_e32 v45, v8 +; VI-NEXT: v_mov_b32_e32 v46, v7 +; VI-NEXT: v_mov_b32_e32 v47, v6 +; VI-NEXT: v_mov_b32_e32 v56, v5 +; VI-NEXT: v_mov_b32_e32 v57, v4 +; VI-NEXT: v_mov_b32_e32 v58, v3 +; VI-NEXT: v_mov_b32_e32 v59, v2 +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], 
s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v29, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: 
v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 
+; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; 
implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -10116,199 +24005,268 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB16_2: ; %Flow +; VI-NEXT: .LBB34_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB16_4 +; VI-NEXT: s_cbranch_execz .LBB34_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: 
v_add_f32_e32 v29, 1.0, v29 -; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: 
v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB16_4: ; %end +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 +; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v3, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_f16_sdwa v4, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v57 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_add_f16_sdwa v5, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v56 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_add_f16_sdwa v6, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v47 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_add_f16_sdwa v7, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v46 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; 
VI-NEXT: v_add_f16_sdwa v8, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v45 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_add_f16_sdwa v9, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v44 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_f16_sdwa v10, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v43 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_add_f16_sdwa v11, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v42 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_add_f16_sdwa v12, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v41 +; VI-NEXT: v_or_b32_e32 v12, v13, v12 +; VI-NEXT: v_add_f16_sdwa v13, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v40 +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v26, v27, v26 +; VI-NEXT: v_add_f16_sdwa v27, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: v_add_f16_sdwa v28, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v28, v30, v28 +; VI-NEXT: 
v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB34_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v30f32_to_v60f16: +; GFX9-LABEL: bitcast_v60f16_to_v30f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword 
v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v48, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v16 +; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: v_mov_b32_e32 v43, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_mov_b32_e32 v44, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v8 +; GFX9-NEXT: v_mov_b32_e32 v46, v7 +; GFX9-NEXT: v_mov_b32_e32 v47, v6 +; GFX9-NEXT: v_mov_b32_e32 v56, v5 +; GFX9-NEXT: v_mov_b32_e32 v57, v4 +; GFX9-NEXT: v_mov_b32_e32 v58, v3 +; GFX9-NEXT: v_mov_b32_e32 v59, v2 +; GFX9-NEXT: v_mov_b32_e32 v60, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: 
v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -10330,162 +24288,234 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte 
Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB16_2: ; %Flow +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; 
GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: .LBB34_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB16_4 +; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 -; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 -; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 -; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 -; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 -; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 -; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 -; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 -; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 -; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 -; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 -; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 -; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 -; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 -; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 -; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 -; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 -; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 -; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 -; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 -; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 -; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: 
v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB16_4: ; %end +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: s_movk_i32 s7, 0x200 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: s_waitcnt 
vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v28, v28, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB34_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, 
v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: 
v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60f16: +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -10493,1139 +24523,983 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-TRUE16-NEXT: .LBB16_2: ; %end +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 
0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB34_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60f16: +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30f32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB16_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 
0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = fadd <30 x float> %a, splat (float 1.000000e+00) - %a2 = bitcast <30 x float> %a1 to <60 x half> - br label %end - -cmp.false: - %a3 = bitcast <30 x float> %a to <60 x half> - br label %end - -end: - %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <60 x half> %phi -} - -define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v60f16_to_v30f32: -; 
GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GCN-NEXT: 
buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill 
-; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v46 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v47 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v44 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v45 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v62 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v60 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; GCN-NEXT: v_or_b32_e32 v0, v58, v0 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 
; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 
16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v47 -; 
GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v46, v18 -; GCN-NEXT: v_or_b32_e32 v19, v44, v19 -; GCN-NEXT: v_or_b32_e32 v20, v43, v20 -; GCN-NEXT: v_or_b32_e32 v21, v41, v21 -; GCN-NEXT: v_or_b32_e32 v22, v48, v22 -; GCN-NEXT: v_or_b32_e32 v23, v38, v23 -; GCN-NEXT: v_or_b32_e32 v24, v36, v24 -; GCN-NEXT: v_or_b32_e32 v25, v35, v25 -; GCN-NEXT: v_or_b32_e32 v26, v37, v26 -; GCN-NEXT: v_or_b32_e32 v27, v39, v27 -; GCN-NEXT: v_or_b32_e32 v28, v49, v28 -; GCN-NEXT: v_or_b32_e32 v29, v51, v29 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; 
GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: 
$vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: .LBB17_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v56 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; 
GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, 
s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v46 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v44 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 
v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; 
GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_or_b32_e32 v17, v19, v18 -; GCN-NEXT: v_or_b32_e32 v18, v21, v20 -; GCN-NEXT: v_or_b32_e32 v19, v23, v22 -; GCN-NEXT: v_or_b32_e32 v20, v25, v24 -; GCN-NEXT: v_or_b32_e32 v21, v27, v26 -; GCN-NEXT: v_or_b32_e32 v22, v29, v28 -; GCN-NEXT: v_or_b32_e32 v23, v31, v30 -; GCN-NEXT: v_or_b32_e32 v24, v33, v32 -; GCN-NEXT: v_or_b32_e32 v25, v35, v34 -; GCN-NEXT: v_or_b32_e32 v26, v37, v36 -; GCN-NEXT: v_or_b32_e32 v27, v39, v38 -; GCN-NEXT: v_or_b32_e32 v28, v49, v48 -; GCN-NEXT: v_or_b32_e32 v29, v51, v50 -; GCN-NEXT: .LBB17_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v60f16_to_v30f32: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v34, v27 -; VI-NEXT: 
v_mov_b32_e32 v35, v26 -; VI-NEXT: v_mov_b32_e32 v36, v25 -; VI-NEXT: v_mov_b32_e32 v37, v24 -; VI-NEXT: v_mov_b32_e32 v38, v23 -; VI-NEXT: v_mov_b32_e32 v39, v22 -; VI-NEXT: v_mov_b32_e32 v48, v21 -; VI-NEXT: v_mov_b32_e32 v49, v20 -; VI-NEXT: v_mov_b32_e32 v50, v19 -; VI-NEXT: v_mov_b32_e32 v51, v18 -; VI-NEXT: v_mov_b32_e32 v52, v17 -; VI-NEXT: v_mov_b32_e32 v53, v16 -; VI-NEXT: v_mov_b32_e32 v54, v15 -; VI-NEXT: v_mov_b32_e32 v55, v14 -; VI-NEXT: v_mov_b32_e32 v40, v13 -; VI-NEXT: v_mov_b32_e32 v41, v12 -; VI-NEXT: v_mov_b32_e32 v42, v11 -; VI-NEXT: v_mov_b32_e32 v43, v10 -; VI-NEXT: v_mov_b32_e32 v44, v9 -; VI-NEXT: v_mov_b32_e32 v45, v8 -; VI-NEXT: v_mov_b32_e32 v46, v7 -; VI-NEXT: v_mov_b32_e32 v47, v6 -; VI-NEXT: v_mov_b32_e32 v56, v5 -; VI-NEXT: v_mov_b32_e32 v57, v4 -; VI-NEXT: v_mov_b32_e32 v58, v3 -; VI-NEXT: v_mov_b32_e32 v59, v2 -; VI-NEXT: v_mov_b32_e32 v60, v1 -; VI-NEXT: v_mov_b32_e32 v61, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB34_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define inreg <30 x float> 
@bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60f16_to_v30f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 
offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB35_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_or_b32_e32 v10, v32, v10 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 
v5, v8, v5 +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v14, v55, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v21, v51, v21 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; 
SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v32, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_or_b32_e32 v9, v39, v9 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v54, v29 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: s_branch .LBB35_3 +; SI-NEXT: .LBB35_2: +; SI-NEXT: 
v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v31, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: .LBB35_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v61, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: s_cbranch_vccnz .LBB35_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 
0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 
16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 
v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 
v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB35_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v60f16_to_v30f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; 
VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB35_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v29, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa 
v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; 
VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB17_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v29, 0x200 -; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; 
VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: 
v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: 
v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v1, v3, v2 -; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_add_f16_sdwa v3, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v58 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 ; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_add_f16_sdwa v4, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v57 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_add_f16_sdwa v5, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 
0x200, v56 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 ; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: v_add_f16_sdwa v6, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v47 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 ; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: v_add_f16_sdwa v7, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v46 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 ; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_f16_sdwa v8, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v45 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 ; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_f16_sdwa v9, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, 0x200, v44 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_f16_sdwa v10, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v43 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 
v11, s26, v13 ; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_f16_sdwa v11, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v42 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 ; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_f16_sdwa v12, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v13, 0x200, v41 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_f16_sdwa v13, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v40 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 ; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 ; VI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -11674,422 +25548,2272 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB17_4: ; %end +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v60f16_to_v30f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, 
s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: 
v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB35_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, 
v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB35_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB35_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB35_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30f32_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB35_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: 
v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB35_3 +; GFX11-FAKE16-NEXT: .LBB35_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB35_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB35_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB35_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <30 x float> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <30 x float> + br label %end + +end: + %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x float> %phi +} + +define <15 x double> @bitcast_v15i64_to_v15f64(<15 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v15i64_to_v15f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz 
.LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: .LBB36_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15i64_to_v15f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, 
s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v60f16_to_v30f32: +; GFX9-LABEL: bitcast_v15i64_to_v15f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v48, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v19 -; GFX9-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: v_mov_b32_e32 v43, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_mov_b32_e32 v44, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v8 -; GFX9-NEXT: v_mov_b32_e32 v46, v7 -; GFX9-NEXT: v_mov_b32_e32 v47, v6 -; GFX9-NEXT: v_mov_b32_e32 v56, v5 -; GFX9-NEXT: v_mov_b32_e32 v57, v4 -; GFX9-NEXT: v_mov_b32_e32 v58, v3 -; GFX9-NEXT: v_mov_b32_e32 v59, v2 -; GFX9-NEXT: v_mov_b32_e32 v60, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte 
Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, 
vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v15i64_to_v15f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <15 x i64> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret 
<15 x double> %phi +} + +define inreg <15 x double> @bitcast_v15i64_to_v15f64_scalar(<15 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15i64_to_v15f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_addc_u32_e32 
v11, vcc, 0, v11, vcc +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v15i64_to_v15f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: 
v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v15i64_to_v15f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 
v20, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; 
GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; 
GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: 
.LBB17_2: ; %Flow +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: .LBB37_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v15i64_to_v15f64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: 
v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB37_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_3: +; GFX11-NEXT: .LBB37_4: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <15 x i64> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ 
%a3, %cmp.false ] + ret <15 x double> %phi +} + +define <15 x i64> @bitcast_v15f64_to_v15i64(<15 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v15f64_to_v15i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15f64_to_v15i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; 
VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15f64_to_v15i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB17_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: s_movk_i32 s7, 0x200 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 
v20, v20, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], 
s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v28, v28, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB17_4: ; %end +; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v15f64_to_v15i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; 
GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <15 x double> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15f64_to_v15i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v23, v9 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; 
SI-NEXT: v_mov_b32_e32 v21, v7 +; SI-NEXT: v_mov_b32_e32 v20, v6 +; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v31, v3 +; SI-NEXT: v_mov_b32_e32 v30, v2 +; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s23 +; SI-NEXT: v_mov_b32_e32 v8, s24 +; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: v_mov_b32_e32 v13, s29 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v16, v30 +; SI-NEXT: v_mov_b32_e32 v17, v31 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v15f64_to_v15i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v29, v15 +; VI-NEXT: 
v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 +; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 +; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 +; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v18, v4 +; VI-NEXT: v_mov_b32_e32 v31, v3 +; VI-NEXT: v_mov_b32_e32 v30, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v14, v0 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 +; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 +; VI-NEXT: s_cbranch_scc0 .LBB39_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: .LBB39_3: ; %end +; VI-NEXT: v_mov_b32_e32 v16, v30 +; VI-NEXT: v_mov_b32_e32 v17, v31 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_4: +; 
VI-NEXT: s_branch .LBB39_2 +; +; GFX9-LABEL: bitcast_v15f64_to_v15i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v29, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v14 +; GFX9-NEXT: v_mov_b32_e32 v27, v13 +; GFX9-NEXT: v_mov_b32_e32 v26, v12 +; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_mov_b32_e32 v24, v10 +; GFX9-NEXT: v_mov_b32_e32 v23, v9 +; GFX9-NEXT: v_mov_b32_e32 v22, v8 +; GFX9-NEXT: v_mov_b32_e32 v21, v7 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-NEXT: v_mov_b32_e32 v30, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: 
v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: .LBB39_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v16, v30 +; GFX9-NEXT: v_mov_b32_e32 v17, v31 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: s_branch .LBB39_2 +; +; GFX11-LABEL: bitcast_v15f64_to_v15i64_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v29, v11 +; GFX11-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v27, v9 +; GFX11-NEXT: v_dual_mov_b32 v26, v8 :: v_dual_mov_b32 v25, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_dual_mov_b32 v24, v6 :: v_dual_mov_b32 v23, v5 +; GFX11-NEXT: v_dual_mov_b32 v22, v4 :: v_dual_mov_b32 v21, v3 +; GFX11-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17 +; GFX11-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s21 +; GFX11-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23 +; GFX11-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s25 +; GFX11-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 +; GFX11-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v17, s29 +; GFX11-NEXT: v_mov_b32_e32 v16, s28 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB39_4 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: +; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], 
v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <15 x double> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v15i64_to_v60i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: 
v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v37, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v39, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v50, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v52, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v43, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: 
v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v37, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v39, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v50, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v52, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v43, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt 
expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v56 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15i64_to_v60i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; 
VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB40_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB40_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, 
v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB40_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 
16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15i64_to_v60i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], 
s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB40_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB40_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, 
v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB40_4: ; %end ; GFX9-NEXT: s_or_b64 exec, 
exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32: +; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60i16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -12097,835 +27821,2175 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, 
v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB17_2: ; %end +; GFX11-TRUE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30f32: +; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 
-; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB40_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB17_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: 
v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB40_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, 
v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false -cmp.true: - %a1 = fadd <60 x half> %a, splat (half 0xH0200) - %a2 = bitcast <60 x half> %a1 to <30 x float> - br label %end - -cmp.false: - %a3 = bitcast <60 x half> %a to <30 x float> - br label %end - -end: - %phi = phi <30 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <30 x float> %phi -} - -define <15 x double> @bitcast_v15i64_to_v15f64(<15 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v15i64_to_v15f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; 
GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; GCN-NEXT: .LBB18_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v15i64_to_v15f64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 
v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; VI-NEXT: .LBB18_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v15i64_to_v15f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 
0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc -; GFX9-NEXT: .LBB18_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v15i64_to_v15f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: 
v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-NEXT: .LBB18_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - cmp.true: %a1 = add <15 x i64> %a, splat (i64 3) - %a2 = bitcast <15 x i64> %a1 to <15 x double> + %a2 = bitcast <15 x i64> %a1 to <60 x i16> br label %end cmp.false: - %a3 = bitcast <15 x i64> %a to <15 x double> + %a3 = bitcast <15 x i64> %a to <60 x i16> br label %end end: - %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x double> %phi + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi } -define <15 x i64> @bitcast_v15f64_to_v15i64(<15 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v15f64_to_v15i64: -; GCN: ; 
%bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15i64_to_v60i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_readfirstlane_b32 s45, v1 +; SI-NEXT: v_readfirstlane_b32 s44, v2 +; SI-NEXT: v_readfirstlane_b32 s43, v3 +; SI-NEXT: v_readfirstlane_b32 s42, v4 +; SI-NEXT: v_readfirstlane_b32 s41, v5 +; SI-NEXT: v_readfirstlane_b32 s40, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v7 +; SI-NEXT: v_readfirstlane_b32 s14, v8 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: v_readfirstlane_b32 s12, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_readfirstlane_b32 s9, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; 
SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s29, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s27, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s25, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s23, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s21, v13, 16 +; SI-NEXT: v_alignbit_b32 v14, s19, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 16 +; SI-NEXT: s_lshr_b32 s46, s6, 16 +; SI-NEXT: s_lshr_b32 s47, s8, 16 +; SI-NEXT: s_lshr_b32 s56, s10, 16 +; SI-NEXT: s_lshr_b32 s57, s12, 16 +; SI-NEXT: s_lshr_b32 s58, s14, 16 +; SI-NEXT: s_lshr_b32 s59, s40, 16 +; SI-NEXT: s_lshr_b32 s60, s42, 16 +; SI-NEXT: s_lshr_b32 s61, s44, 16 +; SI-NEXT: s_lshr_b32 s62, s29, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; 
SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s45, s45, 3 +; SI-NEXT: s_addc_u32 s44, s44, 0 +; SI-NEXT: s_add_u32 s43, s43, 3 +; SI-NEXT: s_addc_u32 s42, s42, 0 +; SI-NEXT: s_add_u32 s41, s41, 3 +; SI-NEXT: s_addc_u32 s40, s40, 0 +; SI-NEXT: s_add_u32 s15, s15, 3 +; SI-NEXT: s_addc_u32 s14, s14, 0 +; SI-NEXT: s_add_u32 s13, s13, 3 +; SI-NEXT: s_addc_u32 s12, s12, 0 +; SI-NEXT: s_add_u32 s11, s11, 3 +; SI-NEXT: s_addc_u32 s10, s10, 0 +; SI-NEXT: s_add_u32 s9, s9, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s6, s6, 0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v7, s43 +; SI-NEXT: v_mov_b32_e32 v8, s45 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 +; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 +; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 +; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 +; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 +; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s29, v9, 16 +; SI-NEXT: v_alignbit_b32 v10, s27, v10, 16 +; SI-NEXT: v_alignbit_b32 v11, s25, v11, 16 +; SI-NEXT: v_alignbit_b32 v12, s23, v12, 16 +; SI-NEXT: v_alignbit_b32 v13, s21, v13, 16 
+; SI-NEXT: v_alignbit_b32 v14, s19, v14, 16 +; SI-NEXT: v_alignbit_b32 v15, s17, v15, 16 +; SI-NEXT: s_lshr_b32 s46, s6, 16 +; SI-NEXT: s_lshr_b32 s47, s8, 16 +; SI-NEXT: s_lshr_b32 s56, s10, 16 +; SI-NEXT: s_lshr_b32 s57, s12, 16 +; SI-NEXT: s_lshr_b32 s58, s14, 16 +; SI-NEXT: s_lshr_b32 s59, s40, 16 +; SI-NEXT: s_lshr_b32 s60, s42, 16 +; SI-NEXT: s_lshr_b32 s61, s44, 16 +; SI-NEXT: s_lshr_b32 s62, s29, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, s4, v15 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v16, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s19, 0xffff +; SI-NEXT: s_lshl_b32 s5, s75, 16 +; SI-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v13, s4, v13 +; SI-NEXT: s_and_b32 s4, s21, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 +; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: 
v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v12, s4, v12 +; SI-NEXT: s_and_b32 s4, s23, 0xffff +; SI-NEXT: s_lshl_b32 s5, s73, 16 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v10, s4, v10 +; SI-NEXT: s_and_b32 s4, s27, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v11, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v9, s4, v9 +; SI-NEXT: s_and_b32 s4, s29, 0xffff +; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s44, 0xffff +; SI-NEXT: s_lshl_b32 s5, s61, 16 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; SI-NEXT: v_or_b32_e32 v7, s4, v7 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v8, vcc, 64, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: v_or_b32_e32 v6, s4, v6 +; SI-NEXT: s_and_b32 s4, s40, 0xffff +; SI-NEXT: s_lshl_b32 s5, s59, 16 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s58, 16 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 +; 
SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s56, 16 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s9, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s47, 16 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s6, 0xffff +; SI-NEXT: s_lshl_b32 s5, s46, 16 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; 
SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: s_branch .LBB41_2 ; -; VI-LABEL: bitcast_v15f64_to_v15i64: +; VI-LABEL: bitcast_v15i64_to_v60i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: 
v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: .LBB19_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v30, s30, 0 +; VI-NEXT: v_writelane_b32 v30, s31, 1 +; VI-NEXT: v_writelane_b32 v30, s34, 2 +; VI-NEXT: v_writelane_b32 v30, s35, 3 +; VI-NEXT: v_writelane_b32 v30, s36, 4 +; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_writelane_b32 v30, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s45, v0 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s43, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s41, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s15, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s13, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s11, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s9, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s6, v14 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v15 +; VI-NEXT: v_writelane_b32 v30, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, 
s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s41, s41, 3 +; VI-NEXT: s_addc_u32 s40, s40, 0 +; VI-NEXT: s_add_u32 s43, s43, 3 +; VI-NEXT: s_addc_u32 s42, s42, 0 +; VI-NEXT: s_add_u32 s45, s45, 3 +; VI-NEXT: s_addc_u32 s44, s44, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 
16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s39, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s38, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s37, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s36, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s35, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s34, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s31, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s30, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s23, s91, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s90, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s89, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; 
VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s88, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s79, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s78, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s29, s77, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s44, s76, 16 +; VI-NEXT: s_or_b32 s29, s29, s44 +; VI-NEXT: s_and_b32 s43, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s44, s75, 16 +; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s44, s74, 16 +; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s44, s73, 16 +; VI-NEXT: s_or_b32 s41, s41, s44 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s44, s72, 16 +; VI-NEXT: s_or_b32 s40, s40, s44 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s44, s63, 16 +; VI-NEXT: s_or_b32 s15, s15, s44 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s44, s62, 16 +; VI-NEXT: s_or_b32 s14, s14, s44 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s44, s61, 16 +; VI-NEXT: s_or_b32 s13, s13, s44 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s44, s60, 16 +; VI-NEXT: s_or_b32 s12, s12, s44 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s44, s59, 16 +; VI-NEXT: s_or_b32 s11, s11, s44 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s44, s58, 16 +; VI-NEXT: s_or_b32 s10, s10, s44 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s44, s57, 16 +; VI-NEXT: s_or_b32 s9, s9, s44 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s44, s56, 16 +; VI-NEXT: s_or_b32 s8, s8, s44 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s44, s47, 16 +; VI-NEXT: s_or_b32 s6, s6, s44 +; VI-NEXT: s_and_b32 s7, 0xffff, 
s7 +; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s43 +; VI-NEXT: v_mov_b32_e32 v17, s42 +; VI-NEXT: v_mov_b32_e32 v18, s41 +; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v20, s15 +; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: v_mov_b32_e32 v22, s13 +; VI-NEXT: v_mov_b32_e32 v23, s12 +; VI-NEXT: v_mov_b32_e32 v24, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v26, s9 +; VI-NEXT: v_mov_b32_e32 v27, s8 +; VI-NEXT: v_mov_b32_e32 v28, s6 +; VI-NEXT: v_mov_b32_e32 v29, s7 +; VI-NEXT: v_readlane_b32 s39, v30, 7 +; VI-NEXT: v_readlane_b32 s38, v30, 6 +; VI-NEXT: v_readlane_b32 s37, v30, 5 +; VI-NEXT: v_readlane_b32 s36, v30, 4 +; VI-NEXT: v_readlane_b32 s35, v30, 3 +; VI-NEXT: v_readlane_b32 s34, v30, 2 +; VI-NEXT: v_readlane_b32 s31, v30, 1 +; VI-NEXT: v_readlane_b32 s30, v30, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; 
implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB41_2 ; -; GFX9-LABEL: bitcast_v15f64_to_v15i64: +; GFX9-LABEL: bitcast_v15i64_to_v60i16_scalar: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: .LBB19_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] 
-; -; GFX11-LABEL: bitcast_v15f64_to_v15i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB19_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-NEXT: .LBB19_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v30, s30, 0 +; GFX9-NEXT: v_writelane_b32 v30, s31, 1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_writelane_b32 v30, s34, 2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; 
GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: v_readfirstlane_b32 s44, v14 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s45, v15 +; GFX9-NEXT: v_writelane_b32 v30, s35, 3 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s44, s44, 3 +; GFX9-NEXT: s_addc_u32 s45, s45, 0 +; GFX9-NEXT: s_add_u32 s42, s42, 3 +; GFX9-NEXT: s_addc_u32 s43, s43, 0 +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; 
GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 
s5, s17, s34 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s31 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s19, s30 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: 
v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: v_mov_b32_e32 v28, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_readlane_b32 s35, v30, 3 +; GFX9-NEXT: v_readlane_b32 s34, v30, 2 +; GFX9-NEXT: v_readlane_b32 s31, v30, 1 +; GFX9-NEXT: v_readlane_b32 s30, v30, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB41_2 +; +; 
GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-TRUE16-NEXT: s_mov_b32 s94, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-TRUE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s12, s12, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s15, s15, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s14, s14, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, 
s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: .LBB41_3: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s15, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s14, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v19, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s13 :: v_dual_mov_b32 v21, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s11 :: v_dual_mov_b32 v23, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, s9 :: v_dual_mov_b32 v25, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v27, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4 +; GFX11-TRUE16-NEXT: 
s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB41_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB41_2 +; +; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: 
v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-FAKE16-NEXT: s_mov_b32 s94, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; 
GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-FAKE16-NEXT: .LBB41_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s15, s15, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s14, s14, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, 
s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: .LBB41_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s58 +; 
GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s15, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB41_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: s_branch .LBB41_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <15 x double> %a1 to <15 x i64> + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <60 x i16> br label %end cmp.false: - %a3 = bitcast <15 x double> %a to <15 x i64> + %a3 = bitcast <15 x i64> %a to <60 x i16> br label %end end: - %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x i64> %phi + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi } -define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v15i64_to_v60i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, 
s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: 
$vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v37, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v39, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v50, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v52, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v41, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v43, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v46, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v37, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v39, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v50, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v52, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v55, v10, v9, 16 -; GCN-NEXT: 
v_alignbit_b32 v41, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v43, v6, v5, 16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_alignbit_b32 v46, v4, v3, 16 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_alignbit_b32 v56, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v1, v1, v56 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v60 -; GCN-NEXT: v_or_b32_e32 v2, v2, v56 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 4, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v3, v3, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_or_b32_e32 v4, v4, v59 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v5, v5, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; 
GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v6, v6, v58 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 20, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v7, v7, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_or_b32_e32 v8, v8, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 28, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v9, v9, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v10, v10, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v11, v11, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v12, v12, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v13, v13, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v14, v14, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v15, v15, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v16, v16, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v17, v17, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 64, 
v0 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v18, v18, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v19, v19, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v20, v20, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v21, v21, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v22, v22, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v24, v24, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v25, v25, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v26, v26, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v28, v28, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 
16, v31 -; GCN-NEXT: v_or_b32_e32 v29, v29, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v30, v30, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v38, s[0:3], 0 offen 
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v60i16_to_v15i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, 
off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v8 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: 
; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; 
SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt 
vmcnt(5) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v56 +; SI-NEXT: v_or_b32_e32 v13, v13, v47 +; SI-NEXT: v_or_b32_e32 v14, v14, v46 +; SI-NEXT: v_or_b32_e32 v15, v15, v38 +; SI-NEXT: v_or_b32_e32 v16, v16, v45 +; SI-NEXT: v_or_b32_e32 v17, v17, v44 +; SI-NEXT: v_or_b32_e32 v19, v19, v42 +; SI-NEXT: v_or_b32_e32 v20, v20, v41 +; SI-NEXT: v_or_b32_e32 v21, v21, v40 +; SI-NEXT: v_or_b32_e32 v22, v22, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_or_b32_e32 v25, v25, v34 +; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v32 +; SI-NEXT: v_or_b32_e32 v28, v28, v63 +; SI-NEXT: v_or_b32_e32 v29, v29, v62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 
+; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB42_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v18, v43, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v11, vcc, 
3, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: 
v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v56, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v46, v14 +; SI-NEXT: v_or_b32_e32 v15, v38, v15 +; SI-NEXT: v_or_b32_e32 v16, v45, v16 +; SI-NEXT: v_or_b32_e32 v17, v44, v17 +; SI-NEXT: v_or_b32_e32 v19, v42, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v40, v21 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_or_b32_e32 v23, v36, v23 +; SI-NEXT: v_or_b32_e32 v24, v35, v24 +; SI-NEXT: v_or_b32_e32 v25, v34, v25 +; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: v_or_b32_e32 v28, v63, v28 +; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 
s6, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: .LBB42_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: 
s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v15i64_to_v60i16: +; VI-LABEL: bitcast_v60i16_to_v15i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v34, v27 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v36, v25 +; VI-NEXT: v_mov_b32_e32 v37, v24 +; VI-NEXT: v_mov_b32_e32 v38, v23 +; VI-NEXT: v_mov_b32_e32 v39, v22 +; VI-NEXT: v_mov_b32_e32 v48, v21 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v50, v19 +; VI-NEXT: v_mov_b32_e32 v51, v18 +; VI-NEXT: v_mov_b32_e32 v52, v17 +; VI-NEXT: v_mov_b32_e32 v53, v16 +; VI-NEXT: v_mov_b32_e32 v54, v15 +; VI-NEXT: v_mov_b32_e32 v55, v14 +; VI-NEXT: v_mov_b32_e32 v40, v13 +; VI-NEXT: v_mov_b32_e32 v41, v12 +; VI-NEXT: 
v_mov_b32_e32 v42, v11 +; VI-NEXT: v_mov_b32_e32 v43, v10 +; VI-NEXT: v_mov_b32_e32 v44, v9 +; VI-NEXT: v_mov_b32_e32 v45, v8 +; VI-NEXT: v_mov_b32_e32 v46, v7 +; VI-NEXT: v_mov_b32_e32 v47, v6 +; VI-NEXT: v_mov_b32_e32 v56, v5 +; VI-NEXT: v_mov_b32_e32 v57, v4 +; VI-NEXT: v_mov_b32_e32 v58, v3 +; VI-NEXT: v_mov_b32_e32 v59, v2 +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v29, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; 
VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -12954,199 +30018,268 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; 
VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB20_2: ; %Flow +; VI-NEXT: .LBB42_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; 
VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB20_4: ; %end +; VI-NEXT: v_mov_b32_e32 v29, 3 +; VI-NEXT: v_add_u16_e32 v0, 3, v61 +; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v60 +; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v59 +; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v58 +; VI-NEXT: v_add_u16_sdwa v4, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: 
v_add_u16_e32 v4, 3, v57 +; VI-NEXT: v_add_u16_sdwa v5, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v56 +; VI-NEXT: v_add_u16_sdwa v6, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v47 +; VI-NEXT: v_add_u16_sdwa v7, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v46 +; VI-NEXT: v_add_u16_sdwa v8, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v45 +; VI-NEXT: v_add_u16_sdwa v9, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v44 +; VI-NEXT: v_add_u16_sdwa v10, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v43 +; VI-NEXT: v_add_u16_sdwa v11, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v42 +; VI-NEXT: v_add_u16_sdwa v12, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v41 +; VI-NEXT: v_add_u16_sdwa v13, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v40 +; VI-NEXT: v_add_u16_sdwa v14, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v55 +; VI-NEXT: v_add_u16_sdwa v15, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, 
v14, v15 +; VI-NEXT: v_add_u16_e32 v15, 3, v54 +; VI-NEXT: v_add_u16_sdwa v16, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_u16_e32 v16, 3, v53 +; VI-NEXT: v_add_u16_sdwa v17, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_add_u16_e32 v17, 3, v52 +; VI-NEXT: v_add_u16_sdwa v18, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_u16_e32 v18, 3, v51 +; VI-NEXT: v_add_u16_sdwa v19, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_add_u16_e32 v19, 3, v50 +; VI-NEXT: v_add_u16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_u16_e32 v20, 3, v49 +; VI-NEXT: v_add_u16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_add_u16_e32 v21, 3, v48 +; VI-NEXT: v_add_u16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: v_add_u16_e32 v22, 3, v39 +; VI-NEXT: v_add_u16_sdwa v23, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: v_add_u16_e32 v23, 3, v38 +; VI-NEXT: v_add_u16_sdwa v24, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: v_add_u16_e32 v24, 3, v37 +; VI-NEXT: v_add_u16_sdwa v25, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; VI-NEXT: v_add_u16_e32 v25, 3, v36 +; VI-NEXT: v_add_u16_sdwa v26, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: v_add_u16_e32 v26, 3, v35 +; VI-NEXT: v_add_u16_sdwa v27, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v27 +; VI-NEXT: v_add_u16_e32 v27, 3, v34 +; VI-NEXT: v_add_u16_sdwa v28, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v27, v28 +; VI-NEXT: v_add_u16_e32 v28, 3, v33 +; VI-NEXT: v_add_u16_sdwa v30, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v28, v30 +; VI-NEXT: v_add_u16_e32 v30, 3, v32 +; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: 
v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v15i64_to_v60i16: +; GFX9-LABEL: bitcast_v60i16_to_v15i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, 
s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v48, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: v_mov_b32_e32 v43, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_mov_b32_e32 v44, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v8 +; GFX9-NEXT: v_mov_b32_e32 v46, v7 +; GFX9-NEXT: v_mov_b32_e32 v47, v6 +; GFX9-NEXT: v_mov_b32_e32 v56, v5 +; GFX9-NEXT: v_mov_b32_e32 v57, v4 +; GFX9-NEXT: v_mov_b32_e32 v58, v3 +; GFX9-NEXT: v_mov_b32_e32 v59, v2 +; GFX9-NEXT: v_mov_b32_e32 v60, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, 
s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v20, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -13168,162 +30301,233 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; 
GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB20_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, 
vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB20_4: ; %end +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: 
$vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; 
GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, 
s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], 
s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60i16: +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -13331,919 +30535,2163 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, 
vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB42_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60i16: +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15i64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; 
GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 
op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB42_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; 
GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <15 x i64> %a, splat (i64 3) - %a2 = bitcast <15 x i64> %a1 to <60 x i16> + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <60 x i16> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60i16_to_v15i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; 
SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v17, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: 
s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_or_b32_e32 v22, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v24, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v25, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v26, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v27, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v28, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, 
v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, 
v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v44 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v58 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v31, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: v_mov_b32_e32 v62, v47 +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: v_mov_b32_e32 v56, v57 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v58, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v60i16_to_v15i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 
+; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB43_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; 
VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; 
VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: 
s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB43_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB43_2 +; +; GFX9-LABEL: bitcast_v60i16_to_v15i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: 
s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; 
GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 
16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: .LBB43_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v63, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: 
v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB43_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; 
GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-TRUE16-NEXT: .LBB43_2: ; 
%cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB43_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: 
v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB43_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <15 x i64> br label %end cmp.false: - %a3 = bitcast <15 x i64> %a to <60 x i16> + %a3 = bitcast <60 x i16> %a to <15 x i64> br label %end end: - %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <60 x i16> %phi + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret 
<15 x i64> %phi } -define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v60i16_to_v15i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 
offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v18, off, 
s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v4, v4, v41 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v63 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v62 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v61 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v60 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v59 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v57 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v47 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v20, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, 
v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: 
killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; 
GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: .LBB21_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, 
v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v41, v4 -; GCN-NEXT: v_or_b32_e32 v5, v63, v5 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_or_b32_e32 v7, v61, v7 -; GCN-NEXT: v_or_b32_e32 v8, v60, v8 -; GCN-NEXT: v_or_b32_e32 v9, v59, v9 -; GCN-NEXT: v_or_b32_e32 v10, v58, v10 -; GCN-NEXT: v_or_b32_e32 v11, v57, v11 -; GCN-NEXT: v_or_b32_e32 v12, v47, v12 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v30, v13 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v30, v14 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v30, v15 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v30, v18 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v30, v19 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v30, v20 -; GCN-NEXT: buffer_load_dword v30, off, 
s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v30, v21 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v30, v22 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v30, v23 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v30, v24 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v30, v25 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v30, v26 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v30, v27 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v30, v29 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 
s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: .LBB21_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { +; SI-LABEL: bitcast_v15i64_to_v60f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr51 +; 
SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; 
SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 
16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; 
implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: .LBB44_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: 
v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; 
SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: v_mov_b32_e32 v34, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v63, v25 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: .LBB44_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 
offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v60i16_to_v15i64: +; VI-LABEL: bitcast_v15i64_to_v60f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v34, v27 -; VI-NEXT: v_mov_b32_e32 v35, v26 -; VI-NEXT: v_mov_b32_e32 v36, v25 -; VI-NEXT: v_mov_b32_e32 v37, v24 -; VI-NEXT: v_mov_b32_e32 v38, v23 -; VI-NEXT: v_mov_b32_e32 v39, v22 -; VI-NEXT: v_mov_b32_e32 v48, v21 -; VI-NEXT: v_mov_b32_e32 v49, v20 -; VI-NEXT: v_mov_b32_e32 v50, v19 -; VI-NEXT: v_mov_b32_e32 v51, v18 -; VI-NEXT: v_mov_b32_e32 v52, v17 -; VI-NEXT: v_mov_b32_e32 v53, v16 -; VI-NEXT: v_mov_b32_e32 v54, v15 -; VI-NEXT: v_mov_b32_e32 v55, v14 -; VI-NEXT: v_mov_b32_e32 v40, v13 -; VI-NEXT: v_mov_b32_e32 v41, v12 -; VI-NEXT: v_mov_b32_e32 v42, v11 -; VI-NEXT: v_mov_b32_e32 v43, v10 -; VI-NEXT: v_mov_b32_e32 v44, v9 -; VI-NEXT: v_mov_b32_e32 v45, v8 -; VI-NEXT: v_mov_b32_e32 v46, v7 -; VI-NEXT: v_mov_b32_e32 v47, v6 -; VI-NEXT: v_mov_b32_e32 v56, v5 -; VI-NEXT: v_mov_b32_e32 v57, v4 -; VI-NEXT: v_mov_b32_e32 v58, v3 -; VI-NEXT: v_mov_b32_e32 v59, v2 -; VI-NEXT: v_mov_b32_e32 v60, v1 -; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v29, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, 
v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, 
v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, 
s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -14272,268 +32720,199 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB21_2: ; %Flow +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB44_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; 
VI-NEXT: s_cbranch_execz .LBB44_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v29, 3 -; VI-NEXT: v_add_u16_e32 v0, 3, v61 -; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v60 -; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_add_u16_e32 v2, 3, v59 -; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v3, 3, v58 -; VI-NEXT: v_add_u16_sdwa v4, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u16_e32 v4, 3, v57 -; VI-NEXT: v_add_u16_sdwa v5, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_add_u16_e32 v5, 3, v56 -; VI-NEXT: v_add_u16_sdwa v6, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_add_u16_e32 v6, 3, v47 -; VI-NEXT: v_add_u16_sdwa v7, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u16_e32 v7, 3, v46 -; VI-NEXT: v_add_u16_sdwa v8, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v45 -; VI-NEXT: v_add_u16_sdwa v9, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v44 -; VI-NEXT: v_add_u16_sdwa v10, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v43 -; VI-NEXT: v_add_u16_sdwa v11, v43, v29 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v42 -; VI-NEXT: v_add_u16_sdwa v12, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v41 -; VI-NEXT: v_add_u16_sdwa v13, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v40 -; VI-NEXT: v_add_u16_sdwa v14, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v55 -; VI-NEXT: v_add_u16_sdwa v15, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: v_add_u16_e32 v15, 3, v54 -; VI-NEXT: v_add_u16_sdwa v16, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: v_add_u16_e32 v16, 3, v53 -; VI-NEXT: v_add_u16_sdwa v17, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: v_add_u16_e32 v17, 3, v52 -; VI-NEXT: v_add_u16_sdwa v18, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: v_add_u16_e32 v18, 3, v51 -; VI-NEXT: v_add_u16_sdwa v19, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: v_add_u16_e32 v19, 3, v50 -; VI-NEXT: v_add_u16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: v_add_u16_e32 v20, 3, v49 -; VI-NEXT: v_add_u16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: v_add_u16_e32 v21, 
3, v48 -; VI-NEXT: v_add_u16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: v_add_u16_e32 v22, 3, v39 -; VI-NEXT: v_add_u16_sdwa v23, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: v_add_u16_e32 v23, 3, v38 -; VI-NEXT: v_add_u16_sdwa v24, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: v_add_u16_e32 v24, 3, v37 -; VI-NEXT: v_add_u16_sdwa v25, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: v_add_u16_e32 v25, 3, v36 -; VI-NEXT: v_add_u16_sdwa v26, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: v_add_u16_e32 v26, 3, v35 -; VI-NEXT: v_add_u16_sdwa v27, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: v_add_u16_e32 v27, 3, v34 -; VI-NEXT: v_add_u16_sdwa v28, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v27, v27, v28 -; VI-NEXT: v_add_u16_e32 v28, 3, v33 -; VI-NEXT: v_add_u16_sdwa v30, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v28, v28, v30 -; VI-NEXT: v_add_u16_e32 v30, 3, v32 -; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; 
VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: 
v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB44_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 
+; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v60i16_to_v15i64: +; GFX9-LABEL: bitcast_v15i64_to_v60f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: 
buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v22 -; GFX9-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v48, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: 
v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: v_mov_b32_e32 v43, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_mov_b32_e32 v44, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v8 -; GFX9-NEXT: v_mov_b32_e32 v46, v7 -; GFX9-NEXT: v_mov_b32_e32 v47, v6 -; GFX9-NEXT: v_mov_b32_e32 v56, v5 -; GFX9-NEXT: v_mov_b32_e32 v57, v4 -; GFX9-NEXT: v_mov_b32_e32 v58, v3 -; GFX9-NEXT: v_mov_b32_e32 v59, v2 -; GFX9-NEXT: v_mov_b32_e32 v60, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v10, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -14543,1143 +32922,2753 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, 
s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; 
implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; 
implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB44_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB44_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, 
v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; 
GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_add_co_u32_e32 
v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB44_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], 
s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB21_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, 
off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-TRUE16-NEXT: 
v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB44_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-FAKE16-NEXT: 
v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 
v3, vcc_lo +; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB44_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <60 x half> + br label %end + +cmp.false: + %a3 = bitcast <15 x i64> %a to <60 x half> + br label %end + +end: + %phi = 
phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi +} + +define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15i64_to_v60f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s45, v2 +; SI-NEXT: v_readfirstlane_b32 s42, v3 +; SI-NEXT: v_readfirstlane_b32 s43, v4 +; SI-NEXT: v_readfirstlane_b32 s40, v5 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s7, v13 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s9, v16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, 
s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_lshr_b32 s4, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 +; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 +; SI-NEXT: s_lshr_b32 s4, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 +; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s4 +; SI-NEXT: s_lshr_b32 s4, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: s_lshr_b32 s4, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 +; SI-NEXT: s_lshr_b32 s4, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 +; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 +; SI-NEXT: s_lshr_b32 s4, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 +; SI-NEXT: s_lshr_b32 s4, s45, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 +; SI-NEXT: s_lshr_b32 s4, s44, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 +; SI-NEXT: s_lshr_b32 s4, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 +; SI-NEXT: s_lshr_b32 s4, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 +; SI-NEXT: s_lshr_b32 s4, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 +; SI-NEXT: s_lshr_b32 s4, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 +; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 +; SI-NEXT: s_lshr_b32 s4, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 +; SI-NEXT: s_lshr_b32 s4, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 +; SI-NEXT: s_lshr_b32 s4, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 +; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, 
s4 +; SI-NEXT: s_lshr_b32 s4, s20, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 +; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s16, 3 +; SI-NEXT: s_addc_u32 s5, s17, 0 +; SI-NEXT: s_lshr_b32 s16, s4, 16 +; SI-NEXT: s_lshr_b32 s17, s5, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_lshr_b32 s46, s18, 16 +; SI-NEXT: s_lshr_b32 
s47, s19, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_lshr_b32 s56, s20, 16 +; SI-NEXT: s_lshr_b32 s57, s21, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_lshr_b32 s58, s22, 16 +; SI-NEXT: s_lshr_b32 s59, s23, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_lshr_b32 s60, s24, 16 +; SI-NEXT: s_lshr_b32 s61, s25, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_lshr_b32 s62, s26, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_lshr_b32 s72, s28, 16 +; SI-NEXT: s_lshr_b32 s73, s29, 16 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 +; SI-NEXT: s_lshr_b32 s74, s44, 16 +; SI-NEXT: s_lshr_b32 s75, s45, 16 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_lshr_b32 s76, s42, 16 +; SI-NEXT: s_lshr_b32 s77, s43, 16 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_lshr_b32 s78, s40, 16 +; SI-NEXT: s_lshr_b32 s79, s41, 16 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_lshr_b32 s88, s14, 16 +; SI-NEXT: s_lshr_b32 s89, s15, 16 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_lshr_b32 s90, s12, 16 +; SI-NEXT: s_lshr_b32 s91, s13, 16 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_lshr_b32 s92, s10, 16 +; SI-NEXT: s_lshr_b32 s93, s11, 16 +; SI-NEXT: s_add_u32 s7, s7, 3 +; SI-NEXT: s_addc_u32 s8, s8, 0 +; SI-NEXT: s_lshr_b32 s94, s7, 16 +; SI-NEXT: s_lshr_b32 s95, s8, 16 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_lshr_b32 vcc_lo, s6, 16 +; SI-NEXT: s_lshr_b32 vcc_hi, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s7 +; 
SI-NEXT: v_cvt_f32_f16_e32 v12, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v47, s18 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v57, s5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi +; SI-NEXT: v_cvt_f32_f16_e32 v2, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v3, s95 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s94 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s93 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s92 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s91 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s90 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s89 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s88 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s79 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s78 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s77 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s76 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s75 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s74 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s73 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s72 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s63 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s62 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s61 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s60 +; SI-NEXT: v_cvt_f32_f16_e32 v53, 
s59 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s58 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s47 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 +; SI-NEXT: .LBB45_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_or_b32_e32 v59, v59, v60 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: buffer_store_dword v59, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v59, vcc, 4, v0 +; SI-NEXT: v_or_b32_e32 v57, v57, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: buffer_store_dword v57, v59, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v57, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v47, v47, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v47, vcc, 12, v0 +; SI-NEXT: v_or_b32_e32 v45, v45, v46 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: buffer_store_dword v45, v47, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v45, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v43, v44, v43 +; SI-NEXT: v_lshlrev_b32_e32 
v41, 16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: buffer_store_dword v43, v45, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v43, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v41, v42, v41 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v41, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v55, v40, v55 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v55, vcc, 28, v0 +; SI-NEXT: v_or_b32_e32 v53, v54, v53 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v53, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v51, v52, v51 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v51, vcc, 36, v0 +; SI-NEXT: v_or_b32_e32 v49, v50, v49 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: buffer_store_dword v49, v51, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v49, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v39, v48, v39 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: buffer_store_dword v39, v49, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 +; SI-NEXT: v_or_b32_e32 v36, v38, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 +; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; 
SI-NEXT: v_or_b32_e32 v34, v36, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_add_i32_e32 v35, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v34, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v32, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v30, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 
0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v22, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: buffer_store_dword v17, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v18 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v11, v14 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, 
v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], 
s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; 
implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v15i64_to_v60f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v30, s30, 0 +; VI-NEXT: v_writelane_b32 v30, s31, 1 +; VI-NEXT: v_writelane_b32 v30, s34, 2 +; VI-NEXT: v_writelane_b32 v30, s35, 3 +; VI-NEXT: v_writelane_b32 v30, s36, 4 +; VI-NEXT: v_writelane_b32 v30, s37, 5 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_writelane_b32 v30, s38, 6 +; VI-NEXT: v_readfirstlane_b32 s45, v0 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s43, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s41, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s15, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s13, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s11, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s9, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s6, v14 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s7, v15 +; VI-NEXT: v_writelane_b32 v30, s39, 7 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; 
VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_u32 s6, s6, 3 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_add_u32 s9, s9, 3 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: s_add_u32 s11, s11, 3 +; VI-NEXT: s_addc_u32 s10, s10, 0 +; VI-NEXT: s_add_u32 s13, s13, 3 +; VI-NEXT: s_addc_u32 s12, s12, 0 +; VI-NEXT: s_add_u32 s15, s15, 3 +; VI-NEXT: s_addc_u32 s14, s14, 0 +; VI-NEXT: s_add_u32 s41, s41, 3 +; VI-NEXT: s_addc_u32 s40, s40, 0 +; VI-NEXT: s_add_u32 s43, s43, 3 +; VI-NEXT: s_addc_u32 s42, s42, 0 +; VI-NEXT: s_add_u32 s45, s45, 3 +; VI-NEXT: s_addc_u32 s44, s44, 0 +; VI-NEXT: s_add_u32 s28, s28, 3 +; VI-NEXT: s_addc_u32 s29, s29, 0 +; VI-NEXT: s_add_u32 s26, s26, 3 +; VI-NEXT: s_addc_u32 s27, s27, 0 +; VI-NEXT: s_add_u32 s24, s24, 3 +; VI-NEXT: s_addc_u32 s25, s25, 0 +; VI-NEXT: s_add_u32 s22, s22, 3 +; VI-NEXT: s_addc_u32 s23, s23, 0 +; VI-NEXT: s_add_u32 s20, s20, 3 +; VI-NEXT: s_addc_u32 s21, s21, 0 +; VI-NEXT: s_add_u32 s18, s18, 3 +; VI-NEXT: s_addc_u32 s19, s19, 0 +; VI-NEXT: s_add_u32 s16, s16, 3 +; VI-NEXT: s_addc_u32 s17, s17, 0 +; VI-NEXT: s_lshr_b32 s46, s7, 16 +; 
VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: s_lshr_b32 s56, s8, 16 +; VI-NEXT: s_lshr_b32 s57, s9, 16 +; VI-NEXT: s_lshr_b32 s58, s10, 16 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: s_lshr_b32 s60, s12, 16 +; VI-NEXT: s_lshr_b32 s61, s13, 16 +; VI-NEXT: s_lshr_b32 s62, s14, 16 +; VI-NEXT: s_lshr_b32 s63, s15, 16 +; VI-NEXT: s_lshr_b32 s72, s40, 16 +; VI-NEXT: s_lshr_b32 s73, s41, 16 +; VI-NEXT: s_lshr_b32 s74, s42, 16 +; VI-NEXT: s_lshr_b32 s75, s43, 16 +; VI-NEXT: s_lshr_b32 s76, s44, 16 +; VI-NEXT: s_lshr_b32 s77, s45, 16 +; VI-NEXT: s_lshr_b32 s78, s29, 16 +; VI-NEXT: s_lshr_b32 s79, s28, 16 +; VI-NEXT: s_lshr_b32 s88, s27, 16 +; VI-NEXT: s_lshr_b32 s89, s26, 16 +; VI-NEXT: s_lshr_b32 s90, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 +; VI-NEXT: s_lshr_b32 s30, s23, 16 +; VI-NEXT: s_lshr_b32 s31, s22, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 16 +; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s38, s17, 16 +; VI-NEXT: s_lshr_b32 s39, s16, 16 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s39, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s38, 16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s37, 16 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s36, 16 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s19, s35, 16 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s20, s34, 16 +; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_and_b32 s20, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s21, s31, 16 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s21, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s22, s30, 16 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s22, 0xffff, s24 +; VI-NEXT: 
s_lshl_b32 s23, s91, 16 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s23, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s24, s90, 16 +; VI-NEXT: s_or_b32 s23, s23, s24 +; VI-NEXT: s_and_b32 s24, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s25, s89, 16 +; VI-NEXT: s_or_b32 s24, s24, s25 +; VI-NEXT: s_and_b32 s25, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s26, s88, 16 +; VI-NEXT: s_or_b32 s25, s25, s26 +; VI-NEXT: s_and_b32 s26, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s27, s79, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: s_and_b32 s27, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s28, s78, 16 +; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s28, 0xffff, s45 +; VI-NEXT: s_lshl_b32 s29, s77, 16 +; VI-NEXT: s_or_b32 s28, s28, s29 +; VI-NEXT: s_and_b32 s29, 0xffff, s44 +; VI-NEXT: s_lshl_b32 s44, s76, 16 +; VI-NEXT: s_or_b32 s29, s29, s44 +; VI-NEXT: s_and_b32 s43, 0xffff, s43 +; VI-NEXT: s_lshl_b32 s44, s75, 16 +; VI-NEXT: s_or_b32 s43, s43, s44 +; VI-NEXT: s_and_b32 s42, 0xffff, s42 +; VI-NEXT: s_lshl_b32 s44, s74, 16 +; VI-NEXT: s_or_b32 s42, s42, s44 +; VI-NEXT: s_and_b32 s41, 0xffff, s41 +; VI-NEXT: s_lshl_b32 s44, s73, 16 +; VI-NEXT: s_or_b32 s41, s41, s44 +; VI-NEXT: s_and_b32 s40, 0xffff, s40 +; VI-NEXT: s_lshl_b32 s44, s72, 16 +; VI-NEXT: s_or_b32 s40, s40, s44 +; VI-NEXT: s_and_b32 s15, 0xffff, s15 +; VI-NEXT: s_lshl_b32 s44, s63, 16 +; VI-NEXT: s_or_b32 s15, s15, s44 +; VI-NEXT: s_and_b32 s14, 0xffff, s14 +; VI-NEXT: s_lshl_b32 s44, s62, 16 +; VI-NEXT: s_or_b32 s14, s14, s44 +; VI-NEXT: s_and_b32 s13, 0xffff, s13 +; VI-NEXT: s_lshl_b32 s44, s61, 16 +; VI-NEXT: s_or_b32 s13, s13, s44 +; VI-NEXT: s_and_b32 s12, 0xffff, s12 +; VI-NEXT: s_lshl_b32 s44, s60, 16 +; VI-NEXT: s_or_b32 s12, s12, s44 +; VI-NEXT: s_and_b32 s11, 0xffff, s11 +; VI-NEXT: s_lshl_b32 s44, s59, 16 +; VI-NEXT: s_or_b32 s11, s11, s44 +; VI-NEXT: s_and_b32 s10, 0xffff, s10 +; VI-NEXT: s_lshl_b32 s44, s58, 16 +; VI-NEXT: s_or_b32 s10, s10, s44 +; VI-NEXT: s_and_b32 s9, 0xffff, s9 +; VI-NEXT: s_lshl_b32 s44, s57, 
16 +; VI-NEXT: s_or_b32 s9, s9, s44 +; VI-NEXT: s_and_b32 s8, 0xffff, s8 +; VI-NEXT: s_lshl_b32 s44, s56, 16 +; VI-NEXT: s_or_b32 s8, s8, s44 +; VI-NEXT: s_and_b32 s6, 0xffff, s6 +; VI-NEXT: s_lshl_b32 s44, s47, 16 +; VI-NEXT: s_or_b32 s6, s6, s44 +; VI-NEXT: s_and_b32 s7, 0xffff, s7 +; VI-NEXT: s_lshl_b32 s44, s46, 16 +; VI-NEXT: s_or_b32 s7, s7, s44 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s18 +; VI-NEXT: v_mov_b32_e32 v5, s19 +; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v7, s21 +; VI-NEXT: v_mov_b32_e32 v8, s22 +; VI-NEXT: v_mov_b32_e32 v9, s23 +; VI-NEXT: v_mov_b32_e32 v10, s24 +; VI-NEXT: v_mov_b32_e32 v11, s25 +; VI-NEXT: v_mov_b32_e32 v12, s26 +; VI-NEXT: v_mov_b32_e32 v13, s27 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 +; VI-NEXT: v_mov_b32_e32 v16, s43 +; VI-NEXT: v_mov_b32_e32 v17, s42 +; VI-NEXT: v_mov_b32_e32 v18, s41 +; VI-NEXT: v_mov_b32_e32 v19, s40 +; VI-NEXT: v_mov_b32_e32 v20, s15 +; VI-NEXT: v_mov_b32_e32 v21, s14 +; VI-NEXT: v_mov_b32_e32 v22, s13 +; VI-NEXT: v_mov_b32_e32 v23, s12 +; VI-NEXT: v_mov_b32_e32 v24, s11 +; VI-NEXT: v_mov_b32_e32 v25, s10 +; VI-NEXT: v_mov_b32_e32 v26, s9 +; VI-NEXT: v_mov_b32_e32 v27, s8 +; VI-NEXT: v_mov_b32_e32 v28, s6 +; VI-NEXT: v_mov_b32_e32 v29, s7 +; VI-NEXT: v_readlane_b32 s39, v30, 7 +; VI-NEXT: v_readlane_b32 s38, v30, 6 +; VI-NEXT: v_readlane_b32 s37, v30, 5 +; VI-NEXT: v_readlane_b32 s36, v30, 4 +; VI-NEXT: v_readlane_b32 s35, v30, 3 +; VI-NEXT: v_readlane_b32 s34, v30, 2 +; VI-NEXT: v_readlane_b32 s31, v30, 1 +; VI-NEXT: v_readlane_b32 s30, v30, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; 
implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v15i64_to_v60f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v30, s30, 0 +; GFX9-NEXT: v_writelane_b32 v30, s31, 1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_writelane_b32 v30, s34, 2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v4 +; GFX9-NEXT: v_readfirstlane_b32 s11, v5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v6 +; GFX9-NEXT: v_readfirstlane_b32 s13, v7 +; GFX9-NEXT: v_readfirstlane_b32 s14, v8 +; GFX9-NEXT: v_readfirstlane_b32 s15, v9 +; GFX9-NEXT: v_readfirstlane_b32 s40, v10 +; GFX9-NEXT: 
v_readfirstlane_b32 s41, v11 +; GFX9-NEXT: v_readfirstlane_b32 s42, v12 +; GFX9-NEXT: v_readfirstlane_b32 s43, v13 +; GFX9-NEXT: v_readfirstlane_b32 s44, v14 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_readfirstlane_b32 s45, v15 +; GFX9-NEXT: v_writelane_b32 v30, s35, 3 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_u32 s44, s44, 3 +; GFX9-NEXT: s_addc_u32 s45, s45, 0 +; GFX9-NEXT: s_add_u32 s42, s42, 3 +; GFX9-NEXT: s_addc_u32 s43, s43, 0 +; GFX9-NEXT: s_add_u32 s40, s40, 3 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_add_u32 s14, s14, 3 +; GFX9-NEXT: s_addc_u32 s15, s15, 0 +; GFX9-NEXT: s_add_u32 s12, s12, 3 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_add_u32 s10, s10, 3 +; GFX9-NEXT: s_addc_u32 
s11, s11, 0 +; GFX9-NEXT: s_add_u32 s8, s8, 3 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_add_u32 s6, s6, 3 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_add_u32 s28, s28, 3 +; GFX9-NEXT: s_addc_u32 s29, s29, 0 +; GFX9-NEXT: s_add_u32 s26, s26, 3 +; GFX9-NEXT: s_addc_u32 s27, s27, 0 +; GFX9-NEXT: s_add_u32 s24, s24, 3 +; GFX9-NEXT: s_addc_u32 s25, s25, 0 +; GFX9-NEXT: s_add_u32 s22, s22, 3 +; GFX9-NEXT: s_addc_u32 s23, s23, 0 +; GFX9-NEXT: s_add_u32 s20, s20, 3 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: s_add_u32 s18, s18, 3 +; GFX9-NEXT: s_addc_u32 s19, s19, 0 +; GFX9-NEXT: s_add_u32 s16, s16, 3 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: s_lshr_b32 s46, s45, 16 +; GFX9-NEXT: s_lshr_b32 s47, s44, 16 +; GFX9-NEXT: s_lshr_b32 s56, s43, 16 +; GFX9-NEXT: s_lshr_b32 s57, s42, 16 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 16 +; GFX9-NEXT: s_lshr_b32 s60, s15, 16 +; GFX9-NEXT: s_lshr_b32 s61, s14, 16 +; GFX9-NEXT: s_lshr_b32 s62, s13, 16 +; GFX9-NEXT: s_lshr_b32 s63, s12, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s73, s10, 16 +; GFX9-NEXT: s_lshr_b32 s74, s9, 16 +; GFX9-NEXT: s_lshr_b32 s75, s8, 16 +; GFX9-NEXT: s_lshr_b32 s76, s7, 16 +; GFX9-NEXT: s_lshr_b32 s77, s6, 16 +; GFX9-NEXT: s_lshr_b32 s78, s29, 16 +; GFX9-NEXT: s_lshr_b32 s79, s28, 16 +; GFX9-NEXT: s_lshr_b32 s88, s27, 16 +; GFX9-NEXT: s_lshr_b32 s89, s26, 16 +; GFX9-NEXT: s_lshr_b32 s90, s25, 16 +; GFX9-NEXT: s_lshr_b32 s91, s24, 16 +; GFX9-NEXT: s_lshr_b32 s92, s23, 16 +; GFX9-NEXT: s_lshr_b32 s93, s22, 16 +; GFX9-NEXT: s_lshr_b32 s94, s21, 16 +; GFX9-NEXT: s_lshr_b32 s95, s20, 16 +; GFX9-NEXT: s_lshr_b32 s30, s19, 16 +; GFX9-NEXT: s_lshr_b32 s31, s18, 16 +; GFX9-NEXT: s_lshr_b32 s34, s17, 16 +; GFX9-NEXT: s_lshr_b32 s35, s16, 16 +; GFX9-NEXT: .LBB45_3: ; %end +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s17, s34 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s18, s31 +; GFX9-NEXT: s_pack_ll_b32_b16 
s17, s19, s30 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s20, s95 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s21, s94 +; GFX9-NEXT: s_pack_ll_b32_b16 s20, s22, s93 +; GFX9-NEXT: s_pack_ll_b32_b16 s21, s23, s92 +; GFX9-NEXT: s_pack_ll_b32_b16 s22, s24, s91 +; GFX9-NEXT: s_pack_ll_b32_b16 s23, s25, s90 +; GFX9-NEXT: s_pack_ll_b32_b16 s24, s26, s89 +; GFX9-NEXT: s_pack_ll_b32_b16 s25, s27, s88 +; GFX9-NEXT: s_pack_ll_b32_b16 s26, s28, s79 +; GFX9-NEXT: s_pack_ll_b32_b16 s27, s29, s78 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s77 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s76 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s74 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s72 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s62 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s61 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s15, s60 +; GFX9-NEXT: s_pack_ll_b32_b16 s28, s40, s59 +; GFX9-NEXT: s_pack_ll_b32_b16 s29, s41, s58 +; GFX9-NEXT: s_pack_ll_b32_b16 s40, s42, s57 +; GFX9-NEXT: s_pack_ll_b32_b16 s41, s43, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s42, s44, s47 +; GFX9-NEXT: s_pack_ll_b32_b16 s43, s45, s46 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s19 +; GFX9-NEXT: v_mov_b32_e32 v6, s20 +; GFX9-NEXT: v_mov_b32_e32 v7, s21 +; GFX9-NEXT: v_mov_b32_e32 v8, s22 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_mov_b32_e32 v10, s24 +; GFX9-NEXT: v_mov_b32_e32 v11, s25 +; GFX9-NEXT: v_mov_b32_e32 v12, s26 +; GFX9-NEXT: v_mov_b32_e32 v13, s27 +; GFX9-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 
+; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s28 +; GFX9-NEXT: v_mov_b32_e32 v25, s29 +; GFX9-NEXT: v_mov_b32_e32 v26, s40 +; GFX9-NEXT: v_mov_b32_e32 v27, s41 +; GFX9-NEXT: v_mov_b32_e32 v28, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s43 +; GFX9-NEXT: v_readlane_b32 s35, v30, 3 +; GFX9-NEXT: v_readlane_b32 s34, v30, 2 +; GFX9-NEXT: v_readlane_b32 s31, v30, 1 +; GFX9-NEXT: v_readlane_b32 s30, v30, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: s_branch .LBB45_2 ; -; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64: +; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60f16_scalar: ; 
GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v3 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v4 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v5 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v6 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v7 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v10 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v11 +; GFX11-TRUE16-NEXT: s_mov_b32 s94, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: 
s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-TRUE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s4, s4, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s7, s7, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s9, s9, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s8, s8, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s11, s11, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s10, s10, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s13, s13, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s12, s12, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s15, s15, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s14, s14, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s4, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s6, 16 +; 
GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s7, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s8, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s9, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s10, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s11, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s12, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s57, s13, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s58, s14, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s59, s15, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-TRUE16-NEXT: .LBB45_3: ; %end ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 
op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB21_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s15, s59 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s14, s58 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s13, s57 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s12, s56 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s11, s47 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s10, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s9, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s40 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v19, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s13 :: v_dual_mov_b32 v21, s12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, s11 :: v_dual_mov_b32 v23, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 
v24, s9 :: v_dual_mov_b32 v25, s8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v27, s5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s6 :: v_dual_mov_b32 v29, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB45_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr76_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr41_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr40_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB45_2 ; -; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15i64: +; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; 
GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 
v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB21_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s8, v4 +; 
GFX11-FAKE16-NEXT: v_readfirstlane_b32 s9, v5 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s10, v6 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s11, v7 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s12, v8 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v9 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s15, v10 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s14, v11 +; GFX11-FAKE16-NEXT: s_mov_b32 s94, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s40, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s94 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz 
.LBB45_3 +; GFX11-FAKE16-NEXT: .LBB45_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_u32 s15, s15, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s14, s14, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s12, s12, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s13, s13, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s10, s10, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s8, s8, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s9, s9, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s6, s6, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s7, s7, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s4, s4, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s28, s28, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s29, s29, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s26, s26, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s27, s27, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s24, s24, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s25, s25, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s22, s22, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s23, s23, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s20, s20, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s21, s21, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s18, s18, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s19, s19, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s16, s16, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s17, s17, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s14, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s15, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s12, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s11, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s10, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s9, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s47, s8, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s56, s7, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s57, s6, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s58, s5, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s59, s4, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s60, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s61, 
s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s62, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s63, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s72, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s73, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s74, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s75, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s76, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s77, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s78, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s79, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s88, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s89, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s90, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s91, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s92, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s93, s0, 16 +; GFX11-FAKE16-NEXT: .LBB45_3: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s93 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s92 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s91 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s90 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s16, s89 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s17, s88 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s18, s79 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s19, s19, s78 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s20, s20, s77 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s21, s21, s76 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s22, s22, s75 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s23, s23, s74 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s73 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s72 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s63 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s62 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s61 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s60 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s4, s59 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s58 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s57 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, 
s7, s56 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s47 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s9, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s10, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s11, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s12, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s13, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s15, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s14, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v5, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v7, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v9, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v11, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s4 :: v_dual_mov_b32 v19, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s8 :: v_dual_mov_b32 v23, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s10 :: v_dual_mov_b32 v25, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB45_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr93 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr92 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr91 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr90 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr89 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr88 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr79 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr78 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr77 
+; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr76 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr75 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr61 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr43 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr40 +; GFX11-FAKE16-NEXT: s_branch .LBB45_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <60 x i16> %a, splat (i16 3) - %a2 = bitcast <60 x i16> %a1 to <15 x i64> + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <60 x half> br label %end cmp.false: - %a3 = bitcast <60 x i16> %a to <15 x i64> + %a3 = bitcast <15 x i64> %a to <60 x half> br label %end end: - %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x i64> %phi + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi } -define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { -; GCN-LABEL: bitcast_v15i64_to_v60f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; 
GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; kill: killed $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: 
killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; kill: killed $vgpr36 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(4) -; 
GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, 
off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 -; GCN-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v50 -; GCN-NEXT: v_mov_b32_e32 v50, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v61 -; GCN-NEXT: v_mov_b32_e32 v61, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v63 -; GCN-NEXT: v_mov_b32_e32 v63, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; 
GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: .LBB22_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; 
GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded 
Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded 
Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: .LBB22_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v46 -; GCN-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v42 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v41 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v54 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:172 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v52 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v51 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v49 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v37 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v35 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v35, v30 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v33, v36, v33 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v37, v31 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v62 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v48, v34 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v49, v32 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v63 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v47, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v56, v57, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x70, v0 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v58, v59, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, 
v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, 
s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v60f16_to_v15i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 +; 
SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; 
kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: 
killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v23, v52, v23 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_or_b32_e32 v26, v38, v26 +; SI-NEXT: v_or_b32_e32 v27, v36, v27 +; SI-NEXT: v_or_b32_e32 v28, v34, v28 +; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; 
SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_or_b32_e32 v22, v62, v22 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB46_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: 
v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: 
v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; 
SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: 
v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; 
SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 +; 
SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB46_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v15i64_to_v60f16: +; VI-LABEL: bitcast_v60f16_to_v15i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v34, v27 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v36, v25 +; VI-NEXT: v_mov_b32_e32 v37, v24 +; VI-NEXT: v_mov_b32_e32 v38, v23 +; VI-NEXT: v_mov_b32_e32 v39, v22 +; VI-NEXT: v_mov_b32_e32 v48, v21 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v50, v19 +; VI-NEXT: v_mov_b32_e32 v51, v18 +; VI-NEXT: v_mov_b32_e32 v52, v17 +; VI-NEXT: v_mov_b32_e32 v53, v16 +; VI-NEXT: v_mov_b32_e32 v54, v15 +; VI-NEXT: v_mov_b32_e32 v55, v14 +; VI-NEXT: v_mov_b32_e32 v40, v13 +; VI-NEXT: v_mov_b32_e32 v41, v12 +; VI-NEXT: v_mov_b32_e32 v42, v11 +; VI-NEXT: v_mov_b32_e32 v43, v10 +; VI-NEXT: v_mov_b32_e32 v44, v9 +; VI-NEXT: v_mov_b32_e32 v45, v8 +; VI-NEXT: v_mov_b32_e32 v46, v7 +; VI-NEXT: v_mov_b32_e32 v47, v6 +; VI-NEXT: v_mov_b32_e32 v56, v5 +; VI-NEXT: v_mov_b32_e32 v57, v4 +; VI-NEXT: v_mov_b32_e32 v58, v3 +; VI-NEXT: v_mov_b32_e32 v59, v2 +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], 
s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v29, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 
+; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v40, v13 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -15708,199 +35697,268 @@ define <60 x 
half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB22_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 -; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc -; VI-NEXT: 
v_add_u32_e32 v26, vcc, 3, v26 -; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc -; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 -; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc -; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 -; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 -; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc -; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 -; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 -; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc -; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 -; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: 
v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB22_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; 
VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 +; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_f16_sdwa v3, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_f16_sdwa v4, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v57 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_add_f16_sdwa v5, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v56 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_add_f16_sdwa v6, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v47 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_add_f16_sdwa v7, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v46 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_f16_sdwa v8, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v45 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_add_f16_sdwa v9, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v44 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: 
v_add_f16_sdwa v10, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v43 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_add_f16_sdwa v11, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v42 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_add_f16_sdwa v12, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v41 +; VI-NEXT: v_or_b32_e32 v12, v13, v12 +; VI-NEXT: v_add_f16_sdwa v13, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v40 +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v52 +; VI-NEXT: v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
VI-NEXT: v_add_f16_e32 v21, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v26, v27, v26 +; VI-NEXT: v_add_f16_sdwa v27, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: v_add_f16_sdwa v28, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v28, v30, v28 +; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB46_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v15i64_to_v60f16: +; GFX9-LABEL: bitcast_v60f16_to_v15i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, 
s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v48, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 
+; GFX9-NEXT: v_mov_b32_e32 v43, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_mov_b32_e32 v44, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v8 +; GFX9-NEXT: v_mov_b32_e32 v46, v7 +; GFX9-NEXT: v_mov_b32_e32 v47, v6 +; GFX9-NEXT: v_mov_b32_e32 v56, v5 +; GFX9-NEXT: v_mov_b32_e32 v57, v4 +; GFX9-NEXT: v_mov_b32_e32 v58, v3 +; GFX9-NEXT: v_mov_b32_e32 v59, v2 +; GFX9-NEXT: v_mov_b32_e32 v60, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 
offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB46_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; 
GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -15922,162 +35980,234 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB22_2: ; %Flow +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: 
$vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 -; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc -; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc -; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 -; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc -; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc -; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 -; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc -; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 -; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, 
v11, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB22_4: ; %end +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, 
s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: s_movk_i32 s7, 0x200 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] 
+; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_f16 v28, v28, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; 
GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; 
GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60f16: +; GFX11-TRUE16-LABEL: 
bitcast_v60f16_to_v15i64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -16085,1185 +36215,983 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 
16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; 
GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, 
v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 
v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB46_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 
0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <15 x i64> %a, splat (i64 3) - %a2 = bitcast <15 x i64> %a1 to <60 x half> - br label %end - -cmp.false: - %a3 = bitcast <15 x i64> %a to <60 x half> - br label %end - -end: - %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <60 x half> %phi -} - -define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v60f16_to_v15i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword 
v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 
offset:68 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v46 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v47 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v44 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v45 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v60 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB23_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; GCN-NEXT: v_or_b32_e32 v0, v58, v0 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 
v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 
4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v46, v18 -; GCN-NEXT: 
v_or_b32_e32 v19, v44, v19 -; GCN-NEXT: v_or_b32_e32 v20, v43, v20 -; GCN-NEXT: v_or_b32_e32 v21, v41, v21 -; GCN-NEXT: v_or_b32_e32 v22, v48, v22 -; GCN-NEXT: v_or_b32_e32 v23, v38, v23 -; GCN-NEXT: v_or_b32_e32 v24, v36, v24 -; GCN-NEXT: v_or_b32_e32 v25, v35, v25 -; GCN-NEXT: v_or_b32_e32 v26, v37, v26 -; GCN-NEXT: v_or_b32_e32 v27, v39, v27 -; GCN-NEXT: v_or_b32_e32 v28, v49, v28 -; GCN-NEXT: v_or_b32_e32 v29, v51, v29 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 
-; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: .LBB23_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; 
GCN-NEXT: s_cbranch_execz .LBB23_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v56 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v46 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v44 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, 
v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: 
v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_or_b32_e32 v17, v19, v18 -; GCN-NEXT: v_or_b32_e32 v18, v21, v20 -; GCN-NEXT: v_or_b32_e32 v19, v23, v22 -; GCN-NEXT: v_or_b32_e32 v20, v25, v24 -; GCN-NEXT: v_or_b32_e32 
v21, v27, v26 -; GCN-NEXT: v_or_b32_e32 v22, v29, v28 -; GCN-NEXT: v_or_b32_e32 v23, v31, v30 -; GCN-NEXT: v_or_b32_e32 v24, v33, v32 -; GCN-NEXT: v_or_b32_e32 v25, v35, v34 -; GCN-NEXT: v_or_b32_e32 v26, v37, v36 -; GCN-NEXT: v_or_b32_e32 v27, v39, v38 -; GCN-NEXT: v_or_b32_e32 v28, v49, v48 -; GCN-NEXT: v_or_b32_e32 v29, v51, v50 -; GCN-NEXT: .LBB23_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v60f16_to_v15i64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v34, v27 -; VI-NEXT: v_mov_b32_e32 v35, v26 -; VI-NEXT: v_mov_b32_e32 v36, v25 -; VI-NEXT: v_mov_b32_e32 v37, v24 -; VI-NEXT: v_mov_b32_e32 v38, v23 -; VI-NEXT: v_mov_b32_e32 v39, v22 -; VI-NEXT: v_mov_b32_e32 v48, v21 -; VI-NEXT: v_mov_b32_e32 v49, v20 -; VI-NEXT: v_mov_b32_e32 v50, v19 -; VI-NEXT: v_mov_b32_e32 v51, v18 -; VI-NEXT: v_mov_b32_e32 v52, v17 -; VI-NEXT: v_mov_b32_e32 v53, v16 -; VI-NEXT: v_mov_b32_e32 v54, v15 -; VI-NEXT: v_mov_b32_e32 v55, v14 -; VI-NEXT: v_mov_b32_e32 v40, v13 -; VI-NEXT: v_mov_b32_e32 v41, v12 -; VI-NEXT: v_mov_b32_e32 v42, v11 -; VI-NEXT: v_mov_b32_e32 v43, v10 -; VI-NEXT: v_mov_b32_e32 v44, v9 -; VI-NEXT: v_mov_b32_e32 v45, v8 -; VI-NEXT: v_mov_b32_e32 v46, v7 -; VI-NEXT: v_mov_b32_e32 v47, 
v6 -; VI-NEXT: v_mov_b32_e32 v56, v5 -; VI-NEXT: v_mov_b32_e32 v57, v4 -; VI-NEXT: v_mov_b32_e32 v58, v3 -; VI-NEXT: v_mov_b32_e32 v59, v2 -; VI-NEXT: v_mov_b32_e32 v60, v1 -; VI-NEXT: v_mov_b32_e32 v61, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v29, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v51, v18 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 
-; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB23_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v29, 0x200 -; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 
0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB46_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60f16_to_v15i64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, 
s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB47_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_or_b32_e32 v10, v32, v10 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: 
v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v14, v55, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v21, v51, v21 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v32, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_or_b32_e32 v9, v39, v9 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v54, v29 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: s_branch .LBB47_3 +; SI-NEXT: .LBB47_2: +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: 
v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v31, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: .LBB47_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v61, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: s_cbranch_vccnz .LBB47_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; 
SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: 
v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: 
v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: 
v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, 
v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB47_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v60f16_to_v15i64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 
16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v1, v3, v2 -; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_add_f16_sdwa v3, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v58 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 ; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: v_add_f16_sdwa v4, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v57 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_add_f16_sdwa v5, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v56 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 ; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: v_add_f16_sdwa v6, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v47 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 ; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: v_add_f16_sdwa v7, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v46 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 ; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_f16_sdwa v8, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v45 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 ; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_f16_sdwa v9, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, 0x200, v44 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 ; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_f16_sdwa v10, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v43 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 ; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_f16_sdwa v11, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v42 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 ; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_f16_sdwa v12, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v13, 0x200, v41 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_f16_sdwa v13, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v40 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: 
v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 ; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 ; VI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -17312,916 +37240,3416 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v60f16_to_v15i64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; 
GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB47_3 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 
0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB47_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB47_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15i64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, 
vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: 
v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 
0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; 
GFX11-TRUE16-NEXT: .LBB47_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15i64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: 
v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_3 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 
v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB47_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: 
.LBB47_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} + +define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v15f64_to_v60i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte 
Folded Spill +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v37, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v39, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v50, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v52, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v43, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt 
expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB48_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; SI-NEXT: v_alignbit_b32 v32, v28, v27, 16 +; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; SI-NEXT: v_alignbit_b32 v34, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v35, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v37, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v39, v16, v15, 
16 +; SI-NEXT: v_alignbit_b32 v50, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v52, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v55, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v41, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v43, v6, v5, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 +; SI-NEXT: .LBB48_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v56 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15f64_to_v60i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; 
implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 
1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 
offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: 
v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15f64_to_v60i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: 
; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 
16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, 
s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], 
v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 +; 
GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_2: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; 
%cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB48_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: 
v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <60 x i16> + br label %end + +cmp.false: + %a3 = bitcast <15 x double> %a to <60 x i16> + br label %end + +end: + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi +} + +define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15f64_to_v60i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_mov_b32_e32 v27, s16 +; SI-NEXT: v_mov_b32_e32 v28, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v30, s19 +; SI-NEXT: v_mov_b32_e32 v25, s20 +; SI-NEXT: v_mov_b32_e32 v26, s21 +; SI-NEXT: v_mov_b32_e32 v23, s22 +; SI-NEXT: v_mov_b32_e32 v24, s23 +; SI-NEXT: v_mov_b32_e32 v21, s24 +; SI-NEXT: v_mov_b32_e32 v22, s25 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, 
off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v31, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v37, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v39, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v55, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v43, v26, v25, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v30, v29, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v28, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v22 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; SI-NEXT: s_waitcnt 
expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_alignbit_b32 v31, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 +; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 +; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v37, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v39, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v55, v22, v21, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v23, 16 +; SI-NEXT: v_alignbit_b32 v43, v26, v25, 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_alignbit_b32 v46, v30, v29, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_alignbit_b32 v56, v28, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: 
v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v22 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v56 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v46 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v43 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: 
v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v41 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v55 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v44 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 52, v0 +; SI-NEXT: 
buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; 
SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 +; 
SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v15f64_to_v60i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v29, s18 +; VI-NEXT: v_mov_b32_e32 v30, s19 +; VI-NEXT: v_mov_b32_e32 v27, s20 +; VI-NEXT: v_mov_b32_e32 v28, s21 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; VI-NEXT: v_mov_b32_e32 v26, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v20, s29 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB49_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_add_f64 
v[25:26], v[25:26], 1.0 +; VI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; VI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: .LBB49_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v30, 
v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; VI-NEXT: v_or_b32_sdwa v48, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 +; VI-NEXT: v_or_b32_sdwa v49, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 +; VI-NEXT: v_or_b32_sdwa v50, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v51, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 +; VI-NEXT: v_or_b32_sdwa v52, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; VI-NEXT: v_or_b32_sdwa v53, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 +; VI-NEXT: v_or_b32_sdwa v30, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v28, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v29, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v32 +; VI-NEXT: v_mov_b32_e32 v1, v33 +; VI-NEXT: v_mov_b32_e32 v2, v34 +; VI-NEXT: v_mov_b32_e32 v3, v35 +; VI-NEXT: v_mov_b32_e32 v4, v36 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v6, v38 +; VI-NEXT: v_mov_b32_e32 v7, v39 +; VI-NEXT: v_mov_b32_e32 v8, v48 +; VI-NEXT: v_mov_b32_e32 v9, v49 +; VI-NEXT: v_mov_b32_e32 v10, v50 +; VI-NEXT: v_mov_b32_e32 v11, v51 +; VI-NEXT: v_mov_b32_e32 v12, v52 +; VI-NEXT: v_mov_b32_e32 v13, v53 +; VI-NEXT: v_mov_b32_e32 v14, v30 +; VI-NEXT: v_mov_b32_e32 v15, v31 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_4: +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; 
implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_branch .LBB49_2 ; -; GFX9-LABEL: bitcast_v60f16_to_v15i64: +; GFX9-LABEL: bitcast_v15f64_to_v60i16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v48, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: v_mov_b32_e32 v43, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_mov_b32_e32 v44, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v8 -; GFX9-NEXT: v_mov_b32_e32 v46, v7 -; GFX9-NEXT: v_mov_b32_e32 v47, v6 -; GFX9-NEXT: v_mov_b32_e32 v56, v5 -; GFX9-NEXT: v_mov_b32_e32 v57, v4 -; 
GFX9-NEXT: v_mov_b32_e32 v58, v3 -; GFX9-NEXT: v_mov_b32_e32 v59, v2 -; GFX9-NEXT: v_mov_b32_e32 v60, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v19, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; 
implicit-def: $vgpr39 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v17, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v29, s18 +; GFX9-NEXT: v_mov_b32_e32 v30, s19 +; GFX9-NEXT: v_mov_b32_e32 v27, s20 +; GFX9-NEXT: v_mov_b32_e32 v28, s21 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s23 +; GFX9-NEXT: v_mov_b32_e32 v23, s24 +; GFX9-NEXT: v_mov_b32_e32 v24, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v20, s29 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, 
v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: .LBB49_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; 
GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v30, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v0 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v28, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v33 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-NEXT: v_mov_b32_e32 v4, v36 +; GFX9-NEXT: v_mov_b32_e32 v5, v37 +; GFX9-NEXT: v_mov_b32_e32 v6, v38 +; GFX9-NEXT: v_mov_b32_e32 v7, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v9, v49 +; GFX9-NEXT: v_mov_b32_e32 v10, v50 +; GFX9-NEXT: v_mov_b32_e32 v11, v51 +; GFX9-NEXT: v_mov_b32_e32 v12, v52 +; GFX9-NEXT: v_mov_b32_e32 v13, v53 +; GFX9-NEXT: v_mov_b32_e32 v14, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_4: ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; 
GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB23_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: s_movk_i32 s7, 0x200 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s7 op_sel_hi:[1,0] -; 
GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, v15, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, v16, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, v17, s7 op_sel_hi:[1,0] -; GFX9-NEXT: 
v_pk_add_f16 v18, v18, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v20, v20, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, v21, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, v22, s7 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v23, v23, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v24, v24, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v25, v25, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v26, v26, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v27, v27, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_f16 v28, v28, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] -; 
GFX9-NEXT: .LBB23_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; 
implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_branch .LBB49_2 ; -; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15i64: +; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, 
v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB23_2: ; %end -; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, s0 :: v_dual_mov_b32 v30, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, s2 :: v_dual_mov_b32 v28, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s18 :: v_dual_mov_b32 v24, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s20 :: v_dual_mov_b32 v22, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v20, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v18, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-TRUE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; 
GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11-TRUE16-NEXT: .LBB49_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v12, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v81, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v80, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v70, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v48, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v71, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_and_b32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v83, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, 
v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v0, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v36 :: v_dual_mov_b32 v1, v37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v9, v33 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB49_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB49_2 ; -; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15i64: +; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 
-; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB23_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v31, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v29, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v27, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v23, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB49_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB49_3 +; GFX11-FAKE16-NEXT: .LBB49_2: ; %cmp.true +; GFX11-FAKE16-NEXT: 
v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 
16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: .LBB49_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v82, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 
0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v34 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB49_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: s_branch .LBB49_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <60 x half> %a, splat (half 0xH0200) - %a2 = bitcast <60 x half> %a1 to <15 x i64> + %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <60 x i16> br label %end cmp.false: - %a3 = bitcast <60 x half> %a to <15 x i64> + %a3 = bitcast <15 x double> %a to <60 x i16> br label %end end: 
- %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x i64> %phi + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi } -define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v15f64_to_v60i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: 
$vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v37, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v39, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v48, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v49, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v52, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v55, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; GCN-NEXT: 
s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB24_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 -; GCN-NEXT: v_alignbit_b32 v32, v28, v27, 16 -; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; GCN-NEXT: v_alignbit_b32 v34, v24, v23, 16 -; GCN-NEXT: v_alignbit_b32 v35, v22, v21, 16 -; GCN-NEXT: v_alignbit_b32 v36, v20, v19, 16 -; GCN-NEXT: v_alignbit_b32 v37, v18, v17, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; GCN-NEXT: v_alignbit_b32 v39, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v48, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v49, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v52, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v55, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; GCN-NEXT: v_lshrrev_b32_e32 
v50, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v16 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v12 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; GCN-NEXT: .LBB24_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 
0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v1, v1, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v60 -; GCN-NEXT: v_or_b32_e32 v2, v2, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v3, v3, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 8, v0 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; GCN-NEXT: v_or_b32_e32 v4, v4, v59 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_or_b32_e32 v5, v5, v55 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v6, v6, v58 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 20, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 24, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; GCN-NEXT: v_or_b32_e32 v8, v8, v57 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 28, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; GCN-NEXT: v_or_b32_e32 v9, v9, v49 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 32, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v10, v10, v56 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 36, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v11, v11, v48 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 40, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v12, v12, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GCN-NEXT: v_or_b32_e32 v13, v13, v39 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v14, v14, v46 -; GCN-NEXT: v_add_i32_e32 v46, vcc, 52, v0 -; 
GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_or_b32_e32 v15, v15, v38 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 56, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; GCN-NEXT: v_or_b32_e32 v16, v16, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; GCN-NEXT: v_or_b32_e32 v17, v17, v37 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_or_b32_e32 v18, v18, v43 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x44, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v19, v19, v36 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x48, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v20, v20, v42 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x4c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; GCN-NEXT: v_or_b32_e32 v21, v21, v35 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x50, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v22, v22, v40 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x54, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v23, v23, v34 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x58, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v24, v24, v54 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x5c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v25, v25, v33 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x60, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v26, v26, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x64, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v27, v27, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x68, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GCN-NEXT: v_or_b32_e32 v28, v28, v51 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x6c, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v29, v29, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x70, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: 
v_or_b32_e32 v30, v30, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { +; SI-LABEL: bitcast_v60i16_to_v15f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, 
off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:40 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 
offset:108 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; 
kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v0, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v43 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v20, 
0xffff, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v60 +; SI-NEXT: v_or_b32_e32 v2, v2, v53 +; SI-NEXT: v_or_b32_e32 v3, v3, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v59 +; SI-NEXT: v_or_b32_e32 v6, v6, v50 +; SI-NEXT: v_or_b32_e32 v7, v7, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: v_or_b32_e32 v9, v9, v58 +; SI-NEXT: v_or_b32_e32 v10, v10, v57 +; SI-NEXT: v_or_b32_e32 v11, v11, v39 +; SI-NEXT: v_or_b32_e32 v12, v12, v56 +; SI-NEXT: v_or_b32_e32 v13, v13, v47 +; SI-NEXT: v_or_b32_e32 v14, v14, v46 +; SI-NEXT: v_or_b32_e32 v15, v15, v38 +; SI-NEXT: v_or_b32_e32 v16, v16, v45 +; SI-NEXT: v_or_b32_e32 v17, v17, v44 +; SI-NEXT: v_or_b32_e32 v19, v19, v42 +; SI-NEXT: v_or_b32_e32 v20, v20, v41 +; SI-NEXT: v_or_b32_e32 v21, v21, v40 +; SI-NEXT: v_or_b32_e32 v22, v22, v37 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 +; SI-NEXT: v_or_b32_e32 v24, v24, v35 +; SI-NEXT: v_or_b32_e32 v25, v25, v34 +; SI-NEXT: v_or_b32_e32 v26, v26, v33 +; SI-NEXT: v_or_b32_e32 v27, v27, v32 +; SI-NEXT: v_or_b32_e32 v28, v28, v63 +; SI-NEXT: v_or_b32_e32 v29, v29, v62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; 
SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: .LBB50_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v18, v43, v18 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: s_waitcnt 
vmcnt(7) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: 
v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v3, v52, v3 +; SI-NEXT: v_or_b32_e32 v4, v51, v4 +; SI-NEXT: v_or_b32_e32 v5, v59, v5 +; SI-NEXT: v_or_b32_e32 v6, v50, v6 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 +; SI-NEXT: v_or_b32_e32 v9, v58, v9 +; SI-NEXT: v_or_b32_e32 v10, v57, v10 +; SI-NEXT: v_or_b32_e32 v11, v39, v11 +; SI-NEXT: v_or_b32_e32 v12, v56, v12 +; SI-NEXT: v_or_b32_e32 v13, v47, v13 +; SI-NEXT: v_or_b32_e32 v14, v46, v14 +; SI-NEXT: v_or_b32_e32 v15, v38, v15 +; SI-NEXT: v_or_b32_e32 v16, v45, v16 +; SI-NEXT: v_or_b32_e32 v17, v44, v17 +; SI-NEXT: v_or_b32_e32 v19, v42, v19 +; SI-NEXT: v_or_b32_e32 v20, v41, v20 +; SI-NEXT: v_or_b32_e32 v21, v40, v21 +; SI-NEXT: v_or_b32_e32 v22, v37, v22 +; SI-NEXT: v_or_b32_e32 v23, v36, v23 +; SI-NEXT: v_or_b32_e32 v24, v35, v24 +; SI-NEXT: v_or_b32_e32 v25, v34, v25 +; SI-NEXT: v_or_b32_e32 v26, v33, v26 +; SI-NEXT: v_or_b32_e32 v27, v32, v27 +; SI-NEXT: v_or_b32_e32 v28, v63, v28 +; SI-NEXT: v_or_b32_e32 v29, v62, v29 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v22, 
vcc, s6, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; SI-NEXT: .LBB50_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: bitcast_v15f64_to_v60i16: +; VI-LABEL: bitcast_v60i16_to_v15f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v29 +; VI-NEXT: v_mov_b32_e32 v33, v28 +; VI-NEXT: v_mov_b32_e32 v34, v27 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v36, v25 +; VI-NEXT: v_mov_b32_e32 v37, v24 +; VI-NEXT: v_mov_b32_e32 v38, v23 +; VI-NEXT: v_mov_b32_e32 v39, v22 +; VI-NEXT: v_mov_b32_e32 v48, v21 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v50, v19 +; VI-NEXT: v_mov_b32_e32 v51, v18 +; VI-NEXT: v_mov_b32_e32 v52, v17 +; VI-NEXT: v_mov_b32_e32 v53, v16 +; VI-NEXT: v_mov_b32_e32 v54, v15 +; VI-NEXT: v_mov_b32_e32 v55, v14 +; VI-NEXT: v_mov_b32_e32 v40, v13 +; VI-NEXT: v_mov_b32_e32 v41, v12 +; VI-NEXT: v_mov_b32_e32 v42, v11 +; VI-NEXT: v_mov_b32_e32 v43, v10 +; VI-NEXT: v_mov_b32_e32 v44, v9 +; VI-NEXT: v_mov_b32_e32 v45, v8 +; VI-NEXT: v_mov_b32_e32 v46, v7 +; VI-NEXT: 
v_mov_b32_e32 v47, v6 +; VI-NEXT: v_mov_b32_e32 v56, v5 +; VI-NEXT: v_mov_b32_e32 v57, v4 +; VI-NEXT: v_mov_b32_e32 v58, v3 +; VI-NEXT: v_mov_b32_e32 v59, v2 +; VI-NEXT: v_mov_b32_e32 v60, v1 +; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v29, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v34, v27 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -18250,184 +40678,268 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 
16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: .LBB50_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: s_cbranch_execz .LBB50_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, 
v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: v_mov_b32_e32 v29, 3 +; VI-NEXT: v_add_u16_e32 v0, 3, v61 +; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v60 +; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v59 +; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v58 +; VI-NEXT: v_add_u16_sdwa v4, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v57 +; VI-NEXT: v_add_u16_sdwa v5, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v56 +; VI-NEXT: v_add_u16_sdwa v6, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v47 +; VI-NEXT: v_add_u16_sdwa v7, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v46 +; VI-NEXT: v_add_u16_sdwa v8, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v45 +; VI-NEXT: v_add_u16_sdwa v9, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v44 +; VI-NEXT: v_add_u16_sdwa v10, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v43 +; VI-NEXT: v_add_u16_sdwa v11, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v42 +; VI-NEXT: v_add_u16_sdwa v12, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v41 +; VI-NEXT: v_add_u16_sdwa v13, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v40 +; VI-NEXT: v_add_u16_sdwa v14, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v55 +; VI-NEXT: v_add_u16_sdwa v15, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: v_add_u16_e32 v15, 3, v54 +; VI-NEXT: v_add_u16_sdwa v16, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_add_u16_e32 v16, 3, v53 +; VI-NEXT: v_add_u16_sdwa v17, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: v_add_u16_e32 v17, 3, v52 +; VI-NEXT: v_add_u16_sdwa v18, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: v_add_u16_e32 v18, 3, v51 +; VI-NEXT: v_add_u16_sdwa v19, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: v_add_u16_e32 v19, 3, v50 +; VI-NEXT: v_add_u16_sdwa v20, v50, v29 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: v_add_u16_e32 v20, 3, v49 +; VI-NEXT: v_add_u16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: v_add_u16_e32 v21, 3, v48 +; VI-NEXT: v_add_u16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: v_add_u16_e32 v22, 3, v39 +; VI-NEXT: v_add_u16_sdwa v23, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: v_add_u16_e32 v23, 3, v38 +; VI-NEXT: v_add_u16_sdwa v24, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: v_add_u16_e32 v24, 3, v37 +; VI-NEXT: v_add_u16_sdwa v25, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; VI-NEXT: v_add_u16_e32 v25, 3, v36 +; VI-NEXT: v_add_u16_sdwa v26, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: v_add_u16_e32 v26, 3, v35 +; VI-NEXT: v_add_u16_sdwa v27, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v27 +; VI-NEXT: v_add_u16_e32 v27, 3, v34 +; VI-NEXT: v_add_u16_sdwa v28, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v27, v28 +; VI-NEXT: v_add_u16_e32 v28, 3, v33 +; VI-NEXT: v_add_u16_sdwa v30, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v28, v30 +; VI-NEXT: v_add_u16_e32 v30, 3, v32 +; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB50_4: ; %end ; VI-NEXT: 
s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword 
v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 
offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: bitcast_v15f64_to_v60i16: +; GFX9-LABEL: bitcast_v60i16_to_v15f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v61, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_mov_b32_e32 v48, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 
+; GFX9-NEXT: v_mov_b32_e32 v49, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: v_mov_b32_e32 v50, v19 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 +; GFX9-NEXT: v_mov_b32_e32 v51, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: v_mov_b32_e32 v52, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 +; GFX9-NEXT: v_mov_b32_e32 v53, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 +; GFX9-NEXT: v_mov_b32_e32 v54, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: v_mov_b32_e32 v55, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_mov_b32_e32 v40, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 +; GFX9-NEXT: v_mov_b32_e32 v41, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 +; GFX9-NEXT: v_mov_b32_e32 v42, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: v_mov_b32_e32 v43, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 +; GFX9-NEXT: v_mov_b32_e32 v44, v9 +; GFX9-NEXT: v_mov_b32_e32 v45, v8 +; GFX9-NEXT: v_mov_b32_e32 v46, v7 +; GFX9-NEXT: v_mov_b32_e32 v47, v6 +; 
GFX9-NEXT: v_mov_b32_e32 v56, v5 +; GFX9-NEXT: v_mov_b32_e32 v57, v4 +; GFX9-NEXT: v_mov_b32_e32 v58, v3 +; GFX9-NEXT: v_mov_b32_e32 v59, v2 +; GFX9-NEXT: v_mov_b32_e32 v60, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded 
Spill +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: 
v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -18445,151 +40957,237 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 
v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: 
; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: .LBB50_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 +; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 +; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 +; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 +; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 +; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 +; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; 
GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, 
off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB50_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte 
Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60i16: +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -18597,873 +41195,2138 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-TRUE16-NEXT: 
v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60i16: +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15f64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; 
GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB24_2: ; 
%Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4 -; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: 
.LBB24_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; 
GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-FAKE16-NEXT: 
s_cbranch_execz .LBB50_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB50_2: ; %end +; GFX11-FAKE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = fadd <15 x double> %a, splat (double 1.000000e+00) - %a2 = bitcast <15 x double> %a1 to <60 x i16> + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <15 x double> br label %end cmp.false: - %a3 = bitcast <15 x double> %a to <60 x i16> + %a3 = bitcast <60 x i16> %a to <15 x double> br label %end end: - %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <60 x i16> %phi + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi } -define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v60i16_to_v15f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v37, v20 -; GCN-NEXT: v_mov_b32_e32 v38, v18 -; GCN-NEXT: v_mov_b32_e32 v39, v16 -; GCN-NEXT: v_mov_b32_e32 v48, v14 -; GCN-NEXT: v_mov_b32_e32 v49, v12 -; GCN-NEXT: v_mov_b32_e32 v50, v10 -; GCN-NEXT: v_mov_b32_e32 v51, v8 -; GCN-NEXT: v_mov_b32_e32 v52, v6 -; GCN-NEXT: v_mov_b32_e32 v53, v4 -; GCN-NEXT: v_mov_b32_e32 v54, v2 -; GCN-NEXT: v_mov_b32_e32 v55, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v8, 
off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v3 -; GCN-NEXT: 
v_lshlrev_b32_e32 v33, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; GCN-NEXT: v_or_b32_e32 v0, v0, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v42 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 -; GCN-NEXT: v_or_b32_e32 v2, v2, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; GCN-NEXT: v_or_b32_e32 v3, v3, v32 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 -; GCN-NEXT: 
v_or_b32_e32 v4, v4, v41 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 -; GCN-NEXT: v_or_b32_e32 v5, v5, v63 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 -; GCN-NEXT: v_or_b32_e32 v6, v6, v62 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 -; GCN-NEXT: v_or_b32_e32 v7, v7, v61 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 -; GCN-NEXT: v_or_b32_e32 v8, v8, v60 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 -; GCN-NEXT: v_or_b32_e32 v9, v9, v59 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 -; GCN-NEXT: v_or_b32_e32 v10, v10, v58 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v11, v11, v57 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v12, v12, v47 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v14, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v44 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v35 -; GCN-NEXT: 
v_and_b32_e32 v18, 0xffff, v46 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v45 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v43 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v40 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v16, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v17, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v18, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v19, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_or_b32_e32 v20, v20, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v21, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v22, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v23, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v24, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v25, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v26, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v27, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v28, v30 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v29, v30 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: 
killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; 
GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: .LBB25_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v44 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v43 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v40 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v56 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v0, v34, v0 -; GCN-NEXT: v_or_b32_e32 v1, v42, v1 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_or_b32_e32 v3, v32, v3 -; GCN-NEXT: v_or_b32_e32 v4, v41, v4 -; GCN-NEXT: v_or_b32_e32 v5, v63, v5 -; GCN-NEXT: v_or_b32_e32 v6, v62, v6 -; GCN-NEXT: v_or_b32_e32 v7, v61, v7 -; GCN-NEXT: v_or_b32_e32 v8, v60, v8 -; GCN-NEXT: v_or_b32_e32 v9, v59, v9 -; GCN-NEXT: v_or_b32_e32 v10, v58, v10 -; GCN-NEXT: v_or_b32_e32 v11, v57, v11 -; GCN-NEXT: v_or_b32_e32 v12, v47, v12 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v30, v13 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v30, v14 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v30, v15 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: 
buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v18, v30, v18 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v19, v30, v19 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v20, v30, v20 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v21, v30, v21 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v22, v30, v22 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v23, v30, v23 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v24, v30, v24 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v25, v30, v25 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v26, v30, v26 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v27, v30, v27 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v28, v30, v28 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v29, v30, v29 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 -; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 -; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 -; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 -; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 -; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 -; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 -; GCN-NEXT: .LBB25_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; 
GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60i16_to_v15f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, 
off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v30, v28 +; 
SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v36, v20 +; SI-NEXT: v_mov_b32_e32 v37, v18 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; 
SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v7, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v9, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v10, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v11, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v12, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v14, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v17, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v19, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 
s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v28 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_or_b32_e32 v22, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v23, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_or_b32_e32 v24, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v25, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v26, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v27, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v28, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: 
s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: 
v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v44 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v58 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v57, v56 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v62, v60 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v31, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v60 +; SI-NEXT: v_mov_b32_e32 v60, v62 +; SI-NEXT: v_mov_b32_e32 v62, v47 +; SI-NEXT: v_mov_b32_e32 v47, v56 +; SI-NEXT: v_mov_b32_e32 v56, v57 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v58, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: s_branch .LBB51_2 ; -; VI-LABEL: bitcast_v60i16_to_v15f64: +; VI-LABEL: bitcast_v60i16_to_v15f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, 
s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB51_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 
0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 
16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, 
s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_and_b32 s4, s16, 0xffff +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s17, 0xffff +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s5, s16, s5 +; VI-NEXT: s_and_b32 s16, s18, 0xffff +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s16, s17, s16 +; VI-NEXT: s_and_b32 s17, s19, 0xffff +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s17, s18, s17 +; VI-NEXT: s_and_b32 s18, s20, 0xffff +; 
VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s15, s15, s18 +; VI-NEXT: s_and_b32 s18, s21, 0xffff +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s14, s14, s18 +; VI-NEXT: s_and_b32 s18, s22, 0xffff +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s13, s13, s18 +; VI-NEXT: s_and_b32 s18, s23, 0xffff +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s12, s12, s18 +; VI-NEXT: s_and_b32 s18, s24, 0xffff +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s11, s11, s18 +; VI-NEXT: s_and_b32 s18, s25, 0xffff +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s10, s10, s18 +; VI-NEXT: s_and_b32 s18, s26, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s9, s9, s18 +; VI-NEXT: s_and_b32 s18, s27, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s8, s8, s18 +; VI-NEXT: s_and_b32 s18, s28, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s7, s7, s18 +; VI-NEXT: s_and_b32 s18, s29, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: s_or_b32 s6, s6, s18 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: 
v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: .LBB51_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB51_2 +; +; GFX9-LABEL: bitcast_v60i16_to_v15f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 
+; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; 
GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v0, v63, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: 
v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB51_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, 
s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB51_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 
v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: 
s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: .LBB51_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB51_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB51_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3 +; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, s4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, s5, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, s6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: 
v_pk_add_u16 v17, s18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: .LBB51_3: ; %end +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB51_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB51_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <60 x i16> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi +} + +define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { +; SI-LABEL: bitcast_v15f64_to_v60f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; 
SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr47 +; 
SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; kill: killed $vgpr58 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 
+; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; SI-NEXT: 
v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v55, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: 
buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v54 +; SI-NEXT: 
v_cvt_f32_f16_e32 v32, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v60 +; SI-NEXT: v_mov_b32_e32 v60, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v62 +; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v59 +; SI-NEXT: v_mov_b32_e32 v59, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v61 +; SI-NEXT: v_mov_b32_e32 v61, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v63 +; SI-NEXT: v_mov_b32_e32 v63, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v38 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v48 +; SI-NEXT: v_mov_b32_e32 v48, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: .LBB52_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_f64 v[32:33], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: 
v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; SI-NEXT: v_add_f64 v[49:50], v[3:4], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 
v59, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v56 +; SI-NEXT: 
v_cvt_f32_f16_e32 v32, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: .LBB52_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, 
v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15f64_to_v60f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], 
s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v29 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v34, v27 -; VI-NEXT: v_mov_b32_e32 v35, v26 -; VI-NEXT: v_mov_b32_e32 v36, v25 -; VI-NEXT: v_mov_b32_e32 v37, v24 -; VI-NEXT: v_mov_b32_e32 v38, v23 -; VI-NEXT: v_mov_b32_e32 v39, v22 -; VI-NEXT: v_mov_b32_e32 v48, v21 -; VI-NEXT: v_mov_b32_e32 v49, v20 -; VI-NEXT: v_mov_b32_e32 v50, v19 -; VI-NEXT: v_mov_b32_e32 v51, v18 -; VI-NEXT: v_mov_b32_e32 v52, v17 -; VI-NEXT: v_mov_b32_e32 v53, v16 -; VI-NEXT: v_mov_b32_e32 v54, v15 -; VI-NEXT: v_mov_b32_e32 v55, v14 -; VI-NEXT: v_mov_b32_e32 v40, v13 -; VI-NEXT: v_mov_b32_e32 v41, v12 -; VI-NEXT: v_mov_b32_e32 v42, v11 -; VI-NEXT: v_mov_b32_e32 v43, v10 -; VI-NEXT: v_mov_b32_e32 v44, v9 -; VI-NEXT: v_mov_b32_e32 v45, v8 -; VI-NEXT: v_mov_b32_e32 v46, v7 -; VI-NEXT: v_mov_b32_e32 v47, v6 -; VI-NEXT: v_mov_b32_e32 v56, v5 
-; VI-NEXT: v_mov_b32_e32 v57, v4 -; VI-NEXT: v_mov_b32_e32 v58, v3 -; VI-NEXT: v_mov_b32_e32 v59, v2 -; VI-NEXT: v_mov_b32_e32 v60, v1 -; VI-NEXT: v_mov_b32_e32 v61, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_mov_b32_e32 v29, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v1, v29, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v29, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v29, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v4, v29, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v5, v29, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v6, v29, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v7, v29, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v8, v29, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v9, v29, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v10, v29, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v11, v29, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: 
v_lshlrev_b32_sdwa v12, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v13, v29, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v14, v29, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v15, v29, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v16, v29, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v17, v29, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v18, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v19, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v20, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v21, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v22, v29, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v23, v29, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v24, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v25, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v26, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v27, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v28, v29, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v47, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v46, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v45, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v44, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v43, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v41, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v40, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v55, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v54, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v38, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v36, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v34, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, 
off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 @@ -19492,268 +43355,184 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: 
v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB52_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: s_cbranch_execz .LBB52_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v29, 3 -; VI-NEXT: v_add_u16_e32 v0, 3, v61 -; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v60 -; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_add_u16_e32 v2, 3, v59 -; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v3, 3, v58 -; VI-NEXT: v_add_u16_sdwa v4, v58, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v4 -; VI-NEXT: v_add_u16_e32 v4, 3, v57 -; VI-NEXT: v_add_u16_sdwa v5, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v4, v5 -; VI-NEXT: v_add_u16_e32 v5, 3, v56 -; VI-NEXT: v_add_u16_sdwa v6, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v5, v6 -; VI-NEXT: v_add_u16_e32 v6, 3, v47 -; VI-NEXT: v_add_u16_sdwa v7, v47, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v6, v7 -; VI-NEXT: v_add_u16_e32 v7, 3, v46 -; VI-NEXT: v_add_u16_sdwa v8, v46, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v45 -; VI-NEXT: v_add_u16_sdwa v9, v45, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v8, 
v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v44 -; VI-NEXT: v_add_u16_sdwa v10, v44, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v9, v10 -; VI-NEXT: v_add_u16_e32 v10, 3, v43 -; VI-NEXT: v_add_u16_sdwa v11, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v10, v11 -; VI-NEXT: v_add_u16_e32 v11, 3, v42 -; VI-NEXT: v_add_u16_sdwa v12, v42, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v11, v12 -; VI-NEXT: v_add_u16_e32 v12, 3, v41 -; VI-NEXT: v_add_u16_sdwa v13, v41, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v12, v13 -; VI-NEXT: v_add_u16_e32 v13, 3, v40 -; VI-NEXT: v_add_u16_sdwa v14, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v55 -; VI-NEXT: v_add_u16_sdwa v15, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v14, v15 -; VI-NEXT: v_add_u16_e32 v15, 3, v54 -; VI-NEXT: v_add_u16_sdwa v16, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: v_add_u16_e32 v16, 3, v53 -; VI-NEXT: v_add_u16_sdwa v17, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v16, v16, v17 -; VI-NEXT: v_add_u16_e32 v17, 3, v52 -; VI-NEXT: v_add_u16_sdwa v18, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v18 -; VI-NEXT: v_add_u16_e32 v18, 3, v51 -; VI-NEXT: v_add_u16_sdwa v19, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v18, v19 -; VI-NEXT: v_add_u16_e32 v19, 3, v50 -; VI-NEXT: v_add_u16_sdwa v20, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: v_add_u16_e32 v20, 3, v49 -; VI-NEXT: v_add_u16_sdwa v21, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v20, v21 -; VI-NEXT: v_add_u16_e32 v21, 3, v48 -; VI-NEXT: v_add_u16_sdwa v22, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: v_add_u16_e32 v22, 3, v39 -; VI-NEXT: v_add_u16_sdwa v23, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v22, v23 -; VI-NEXT: v_add_u16_e32 v23, 3, v38 -; VI-NEXT: v_add_u16_sdwa v24, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v23, v24 -; VI-NEXT: v_add_u16_e32 v24, 3, v37 -; VI-NEXT: v_add_u16_sdwa v25, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: v_add_u16_e32 v25, 3, v36 -; VI-NEXT: v_add_u16_sdwa v26, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: v_add_u16_e32 v26, 3, v35 -; VI-NEXT: v_add_u16_sdwa v27, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: v_add_u16_e32 v27, 3, v34 -; VI-NEXT: v_add_u16_sdwa v28, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v27, v27, v28 -; VI-NEXT: v_add_u16_e32 v28, 3, v33 -; VI-NEXT: v_add_u16_sdwa v30, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v28, v28, v30 -; VI-NEXT: v_add_u16_e32 v30, 3, v32 -; VI-NEXT: v_add_u16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; 
VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; VI-NEXT: .LBB52_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword 
v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v60i16_to_v15f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 
offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 -; GFX9-NEXT: v_mov_b32_e32 
v39, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 -; GFX9-NEXT: v_mov_b32_e32 v48, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GFX9-NEXT: v_mov_b32_e32 v50, v19 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v49 -; GFX9-NEXT: v_mov_b32_e32 v51, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 -; GFX9-NEXT: v_mov_b32_e32 v52, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v51 -; GFX9-NEXT: v_mov_b32_e32 v53, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v52 -; GFX9-NEXT: v_mov_b32_e32 v54, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 -; GFX9-NEXT: v_mov_b32_e32 v55, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_mov_b32_e32 v40, v13 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v55 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v40 -; GFX9-NEXT: v_mov_b32_e32 v42, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: v_mov_b32_e32 v43, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v42 -; GFX9-NEXT: v_mov_b32_e32 v44, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v8 -; GFX9-NEXT: v_mov_b32_e32 v46, v7 -; GFX9-NEXT: v_mov_b32_e32 v47, v6 -; GFX9-NEXT: v_mov_b32_e32 v56, v5 -; GFX9-NEXT: v_mov_b32_e32 v57, v4 -; GFX9-NEXT: v_mov_b32_e32 v58, v3 -; GFX9-NEXT: v_mov_b32_e32 v59, v2 -; GFX9-NEXT: v_mov_b32_e32 v60, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v42 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v41 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v55 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: 
v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, 
off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 
+; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15f64_to_v60f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 @@ -19775,233 +43554,147 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 
4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 
-; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB25_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB25_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload 
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 -; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 -; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 -; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 -; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 -; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v10, v10, v43, s6 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_perm_b32 v11, v11, v42, s6 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_perm_b32 v12, v12, v41, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_perm_b32 v13, v13, v40, s6 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_perm_b32 v14, v14, v55, s6 -; GFX9-NEXT: s_waitcnt 
vmcnt(9) -; GFX9-NEXT: v_perm_b32 v15, v15, v54, s6 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_perm_b32 v16, v16, v53, s6 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_perm_b32 v17, v17, v52, s6 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_perm_b32 v18, v18, v51, s6 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_perm_b32 v19, v19, v50, s6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_perm_b32 v20, v20, v49, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 
16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB52_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 +; GFX9-NEXT: .LBB52_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 +; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 +; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 +; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 +; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 +; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 +; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 +; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 +; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 +; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 +; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 +; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 +; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 +; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 +; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 +; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 +; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 +; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 +; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 +; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 +; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 +; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 +; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 +; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 +; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64: +; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo @@ -20009,845 +43702,773 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: 
v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB25_2: ; %end +; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 
v[10:11], v[10:11], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15f64: +; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v81, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100 +; GFX11-FAKE16-NEXT: ; implicit-def: 
$vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, 
v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB52_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; 
GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: .LBB25_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], 
v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 +; GFX11-FAKE16-NEXT: .LBB52_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false cmp.true: - %a1 = add <60 x i16> %a, splat (i16 3) - %a2 = bitcast <60 x i16> %a1 to <15 x double> + %a1 = fadd <15 x double> %a, 
splat (double 1.000000e+00) + %a2 = bitcast <15 x double> %a1 to <60 x half> br label %end cmp.false: - %a3 = bitcast <60 x i16> %a to <15 x double> + %a3 = bitcast <15 x double> %a to <60 x half> br label %end end: - %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <15 x double> %phi + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi } -define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { -; GCN-LABEL: bitcast_v15f64_to_v60f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; 
GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; kill: killed $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; kill: killed $vgpr38 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GCN-NEXT: 
v_lshrrev_b32_e32 v48, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v9 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v8 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v7 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v3 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 
offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: 
buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v56 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v42, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v58 -; GCN-NEXT: v_mov_b32_e32 v58, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v60 -; GCN-NEXT: v_mov_b32_e32 v60, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v62 -; GCN-NEXT: v_mov_b32_e32 v62, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v63 -; GCN-NEXT: v_mov_b32_e32 v63, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: .LBB26_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; 
GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 
-; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: 
v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v56 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 
-; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 -; GCN-NEXT: .LBB26_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v45 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v44 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v43 -; GCN-NEXT: 
buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v42 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v41 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v54 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v53 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v51 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v49 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], 
s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v37 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 56, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v35 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v35, v30 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 60, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_or_b32_e32 v33, v37, v33 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 64, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v38, v32 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x44, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded 
Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v39, v36 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x48, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v48, v34 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x4c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_or_b32_e32 v31, v49, v31 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x50, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v63 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v58 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v60 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x5c, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v62 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v47, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v56, v57, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x70, v0 -; GCN-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v58, v59, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v45, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v46, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v15f64_to_v60f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; SI-NEXT: v_readfirstlane_b32 s42, v1 +; SI-NEXT: v_readfirstlane_b32 s43, v2 +; SI-NEXT: 
v_readfirstlane_b32 s40, v3 +; SI-NEXT: v_readfirstlane_b32 s41, v4 +; SI-NEXT: v_readfirstlane_b32 s14, v5 +; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_readfirstlane_b32 s10, v9 +; SI-NEXT: v_readfirstlane_b32 s11, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_and_b64 s[44:45], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; 
%cmp.false +; SI-NEXT: s_lshr_b32 s44, s5, 16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 +; SI-NEXT: s_lshr_b32 s44, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s44 +; SI-NEXT: s_lshr_b32 s44, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s44 +; SI-NEXT: s_lshr_b32 s44, s6, 16 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, s44 +; SI-NEXT: s_lshr_b32 s44, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 +; SI-NEXT: s_lshr_b32 s44, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 +; SI-NEXT: s_lshr_b32 s44, s11, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s44 +; SI-NEXT: s_lshr_b32 s44, s10, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 +; SI-NEXT: s_lshr_b32 s44, s13, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 +; SI-NEXT: s_lshr_b32 s44, s12, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s44 +; SI-NEXT: s_lshr_b32 s44, s15, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s44 +; SI-NEXT: s_lshr_b32 s44, s14, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s44 +; SI-NEXT: s_lshr_b32 s44, s41, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s44 +; SI-NEXT: s_lshr_b32 s44, s40, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s44 +; SI-NEXT: s_lshr_b32 s44, s43, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s44 +; SI-NEXT: s_lshr_b32 s44, s42, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s44 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s44 +; SI-NEXT: s_lshr_b32 s44, s28, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s44 +; SI-NEXT: s_lshr_b32 s44, s27, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s44 +; SI-NEXT: s_lshr_b32 s44, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s44 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s44 +; SI-NEXT: s_lshr_b32 s44, s24, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s44 +; SI-NEXT: s_lshr_b32 s44, s23, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s44 +; SI-NEXT: s_lshr_b32 s44, s22, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s44 +; SI-NEXT: s_lshr_b32 s44, s21, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s44 +; SI-NEXT: s_lshr_b32 s44, s20, 16 +; SI-NEXT: 
v_cvt_f32_f16_e32 v44, s44 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s44 +; SI-NEXT: s_lshr_b32 s44, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s44 +; SI-NEXT: s_lshr_b32 s44, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s44 +; SI-NEXT: s_lshr_b32 s44, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s10 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v62, s13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s16 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_add_f64 v[57:58], s[18:19], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_add_f64 v[41:42], s[20:21], 1.0 +; 
SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v58 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v41 +; SI-NEXT: v_add_f64 v[53:54], s[22:23], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_add_f64 v[20:21], s[12:13], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_add_f64 v[37:38], s[26:27], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v15 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 +; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[34:35], s[28:29], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_add_f64 v[30:31], s[42:43], 1.0 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, 
v30 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v44 +; SI-NEXT: v_add_f64 v[49:50], s[24:25], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v56 +; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 +; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: 
v_cvt_f32_f16_e32 v61, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 +; SI-NEXT: v_mov_b32_e32 v59, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: v_mov_b32_e32 v13, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v46, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v60, v6 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v58 +; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 +; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 +; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v42 +; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v54 +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 +; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v50 +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 +; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v9, v6 +; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 +; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 +; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 +; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 +; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 
v3, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; 
SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], 
s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: 
$vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_branch .LBB53_2 ; -; VI-LABEL: bitcast_v15f64_to_v60f16: +; VI-LABEL: bitcast_v15f64_to_v60f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v17, s16 +; VI-NEXT: v_mov_b32_e32 v18, s17 +; VI-NEXT: v_mov_b32_e32 v29, s18 +; VI-NEXT: v_mov_b32_e32 v30, s19 +; VI-NEXT: v_mov_b32_e32 v27, s20 +; VI-NEXT: v_mov_b32_e32 v28, s21 +; VI-NEXT: v_mov_b32_e32 v25, s22 +; 
VI-NEXT: v_mov_b32_e32 v26, s23 +; VI-NEXT: v_mov_b32_e32 v23, s24 +; VI-NEXT: v_mov_b32_e32 v24, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v19, s28 +; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -20860,145 +44481,141 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc 
-; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_scc0 .LBB53_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: 
v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB26_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; 
VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; VI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; VI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; VI-NEXT: 
v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: .LBB26_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; VI-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v4, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v10, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v11, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v53, 
16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: .LBB53_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; VI-NEXT: v_or_b32_sdwa v35, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; VI-NEXT: v_or_b32_sdwa v36, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 +; VI-NEXT: v_or_b32_sdwa v38, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; VI-NEXT: v_or_b32_sdwa v39, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; VI-NEXT: v_or_b32_sdwa v48, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 +; 
VI-NEXT: v_or_b32_sdwa v49, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 +; VI-NEXT: v_or_b32_sdwa v50, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; VI-NEXT: v_or_b32_sdwa v51, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 +; VI-NEXT: v_or_b32_sdwa v52, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; VI-NEXT: v_or_b32_sdwa v53, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 +; VI-NEXT: v_or_b32_sdwa v30, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v46 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v44 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v41 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -21011,49 +44628,81 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: 
v_lshlrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_or_b32_sdwa v12, v12, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v14, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v21, v21, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v24, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v55 +; VI-NEXT: v_or_b32_sdwa v28, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v54 +; VI-NEXT: v_or_b32_sdwa v29, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v32 +; VI-NEXT: v_mov_b32_e32 v1, v33 +; VI-NEXT: v_mov_b32_e32 v2, v34 +; VI-NEXT: v_mov_b32_e32 v3, v35 +; VI-NEXT: v_mov_b32_e32 v4, v36 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v6, v38 +; VI-NEXT: v_mov_b32_e32 v7, v39 +; VI-NEXT: v_mov_b32_e32 v8, v48 +; VI-NEXT: v_mov_b32_e32 v9, v49 +; VI-NEXT: v_mov_b32_e32 v10, v50 +; VI-NEXT: v_mov_b32_e32 v11, v51 +; VI-NEXT: v_mov_b32_e32 v12, v52 +; VI-NEXT: v_mov_b32_e32 v13, v53 +; VI-NEXT: v_mov_b32_e32 v14, v30 +; VI-NEXT: v_mov_b32_e32 v15, v31 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_4: +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: 
$vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_branch .LBB53_2 ; -; GFX9-LABEL: bitcast_v15f64_to_v60f16: +; GFX9-LABEL: bitcast_v15f64_to_v60f16_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_mov_b32_e32 v17, s16 +; GFX9-NEXT: v_mov_b32_e32 v18, s17 +; GFX9-NEXT: v_mov_b32_e32 v29, s18 +; GFX9-NEXT: v_mov_b32_e32 v30, s19 +; GFX9-NEXT: v_mov_b32_e32 v27, s20 +; GFX9-NEXT: v_mov_b32_e32 v28, s21 +; GFX9-NEXT: v_mov_b32_e32 v25, s22 +; GFX9-NEXT: v_mov_b32_e32 v26, s23 +; GFX9-NEXT: v_mov_b32_e32 v23, s24 +; GFX9-NEXT: v_mov_b32_e32 v24, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v19, s28 +; GFX9-NEXT: v_mov_b32_e32 v20, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -21066,81 +44715,40 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; 
implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB26_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, 
v17 +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -21149,51 +44757,100 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX9-NEXT: 
v_add_f64 v[29:30], v[29:30], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: .LBB26_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v59, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v58, v1, s4 -; GFX9-NEXT: v_perm_b32 v2, v57, v2, s4 -; GFX9-NEXT: v_perm_b32 v3, v56, v3, s4 -; GFX9-NEXT: v_perm_b32 v4, v47, v4, s4 -; GFX9-NEXT: v_perm_b32 v5, v46, v5, s4 -; GFX9-NEXT: v_perm_b32 v6, v45, v6, s4 -; GFX9-NEXT: v_perm_b32 v7, v44, v7, s4 -; GFX9-NEXT: v_perm_b32 v8, v43, v8, s4 -; GFX9-NEXT: v_perm_b32 v9, v42, v9, s4 -; GFX9-NEXT: v_perm_b32 v10, v41, v10, s4 -; GFX9-NEXT: v_perm_b32 v11, v40, v11, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: .LBB53_3: ; %end +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 +; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v30, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; 
GFX9-NEXT: v_lshl_or_b32 v21, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v40, 16, v0 ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -21206,62 +44863,438 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v12, v55, v12, s4 -; GFX9-NEXT: v_perm_b32 v13, v54, v13, s4 -; GFX9-NEXT: v_perm_b32 v14, v53, v14, s4 -; GFX9-NEXT: v_perm_b32 v15, v52, v15, s4 -; GFX9-NEXT: v_perm_b32 v16, v51, v16, s4 -; GFX9-NEXT: v_perm_b32 v17, v50, v17, s4 -; GFX9-NEXT: v_perm_b32 v18, v49, v18, s4 -; GFX9-NEXT: v_perm_b32 v19, v48, v19, s4 -; GFX9-NEXT: v_perm_b32 v20, v39, v20, s4 -; GFX9-NEXT: v_perm_b32 v21, v38, v21, s4 -; GFX9-NEXT: v_perm_b32 v22, v37, v22, s4 -; GFX9-NEXT: v_perm_b32 v23, v36, v23, s4 -; GFX9-NEXT: v_perm_b32 v24, v35, v24, s4 -; GFX9-NEXT: v_perm_b32 v25, v34, v25, s4 -; GFX9-NEXT: v_perm_b32 v26, v33, v26, s4 -; GFX9-NEXT: v_perm_b32 v27, v32, v27, s4 -; GFX9-NEXT: v_perm_b32 v28, v31, v28, s4 -; GFX9-NEXT: v_perm_b32 v29, v30, v29, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; 
GFX9-NEXT: v_lshl_or_b32 v28, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v54, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v33 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-NEXT: v_mov_b32_e32 v4, v36 +; GFX9-NEXT: v_mov_b32_e32 v5, v37 +; GFX9-NEXT: v_mov_b32_e32 v6, v38 +; GFX9-NEXT: v_mov_b32_e32 v7, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v9, v49 +; GFX9-NEXT: v_mov_b32_e32 v10, v50 +; GFX9-NEXT: v_mov_b32_e32 v11, v51 +; GFX9-NEXT: v_mov_b32_e32 v12, v52 +; GFX9-NEXT: v_mov_b32_e32 v13, v53 +; GFX9-NEXT: v_mov_b32_e32 v14, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_branch .LBB53_2 ; -; GFX11-TRUE16-LABEL: 
bitcast_v15f64_to_v60f16: +; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, s0 :: v_dual_mov_b32 v30, s1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v27, s2 :: v_dual_mov_b32 v28, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, s18 :: v_dual_mov_b32 v24, s19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v21, s20 :: v_dual_mov_b32 v22, s21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v20, s23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v13, s24 :: v_dual_mov_b32 v14, s25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, s26 :: v_dual_mov_b32 v18, s27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v15, s28 :: v_dual_mov_b32 v16, s29 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, 
v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-TRUE16-NEXT: .LBB53_2: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end -; GFX11-TRUE16-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-TRUE16-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 
16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX11-TRUE16-NEXT: .LBB53_3: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v35, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v34, 16, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v12, 16, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v82, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v81, 16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v80, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v70, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v37, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v48, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v71, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_and_b32 v1, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v83, 16, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v49, 16, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v36, 16, v29 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v38, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v39, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v0, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v33, 16, v22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v51, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v36 :: v_dual_mov_b32 v1, v37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v9, v33 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_mov_b32 v11, v35 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB53_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB53_2 ; -; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60f16: +; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s0 :: v_dual_mov_b32 v31, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v28, s2 :: v_dual_mov_b32 v29, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v27, s17 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 
v25, s19 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v23, s21 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, s24 :: v_dual_mov_b32 v19, s25 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB53_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB53_3 +; GFX11-FAKE16-NEXT: .LBB53_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-FAKE16-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GFX11-FAKE16-NEXT: .LBB53_3: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v20 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v48, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v19 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v68, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v37, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v39, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v82, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v33, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v35, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v12, 16, v18 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v69, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v66, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v38, 16, v28 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v32, 16, v22 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v67, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v65, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v64, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v49, 16, v26 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v53, 16, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v83, 16, v24 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v55, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v81, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v80, 16, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v70, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v54, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v52, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, 
v51, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v50, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v32 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v34 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB53_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr13 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 @@ -21278,136 +45311,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31 -; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr30 -; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow -; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4 -; 
GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9 -; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v67, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0 -; GFX11-FAKE16-NEXT: .LBB26_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 
0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: s_branch .LBB53_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -21426,756 +45330,757 @@ end: } define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v60f16_to_v15f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], 
s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt expcnt(3) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 -; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v44, 
off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v9 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 -; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt 
expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v37 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v46 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v47 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v44 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v45 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v43, 
v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v60 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; GCN-NEXT: v_or_b32_e32 v0, v58, v0 -; GCN-NEXT: v_or_b32_e32 v1, v56, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, 
s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: buffer_load_dword v17, 
off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v16, v30, v16 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v17, v30, v17 -; GCN-NEXT: v_or_b32_e32 v18, v46, v18 -; GCN-NEXT: v_or_b32_e32 v19, v44, v19 -; GCN-NEXT: v_or_b32_e32 v20, v43, v20 -; GCN-NEXT: v_or_b32_e32 v21, v41, v21 -; GCN-NEXT: v_or_b32_e32 v22, v48, v22 -; GCN-NEXT: v_or_b32_e32 v23, v38, v23 -; GCN-NEXT: v_or_b32_e32 v24, v36, v24 -; GCN-NEXT: v_or_b32_e32 v25, v35, v25 -; GCN-NEXT: v_or_b32_e32 v26, v37, v26 -; GCN-NEXT: v_or_b32_e32 v27, v39, v27 -; GCN-NEXT: v_or_b32_e32 v28, v49, v28 -; GCN-NEXT: v_or_b32_e32 v29, v51, v29 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr30 -; 
GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed 
$vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; kill: killed $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: .LBB27_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v59 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v57 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v56 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v6, v5 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_or_b32_e32 v7, v8, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; 
GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_or_b32_e32 v9, v10, v9 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_or_b32_e32 v11, v12, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, 
s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_or_b32_e32 v13, v14, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; 
GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v46 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v44 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_or_b32_e32 v17, v19, v18 -; GCN-NEXT: v_or_b32_e32 v18, v21, v20 -; GCN-NEXT: v_or_b32_e32 v19, v23, v22 -; GCN-NEXT: v_or_b32_e32 v20, v25, v24 -; GCN-NEXT: v_or_b32_e32 v21, v27, v26 -; GCN-NEXT: v_or_b32_e32 v22, v29, v28 -; GCN-NEXT: v_or_b32_e32 v23, v31, v30 -; GCN-NEXT: v_or_b32_e32 v24, v33, v32 -; GCN-NEXT: v_or_b32_e32 v25, v35, v34 -; GCN-NEXT: v_or_b32_e32 v26, v37, v36 -; GCN-NEXT: v_or_b32_e32 v27, v39, v38 -; GCN-NEXT: v_or_b32_e32 v28, v49, v48 -; GCN-NEXT: v_or_b32_e32 v29, v51, v50 -; GCN-NEXT: .LBB27_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword 
v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v60f16_to_v15f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; 
SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v41 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc 
+; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: 
; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v23, v52, v23 +; SI-NEXT: v_or_b32_e32 v24, v50, v24 +; SI-NEXT: v_or_b32_e32 v25, v48, v25 +; SI-NEXT: v_or_b32_e32 v26, v38, v26 +; SI-NEXT: v_or_b32_e32 v27, v36, v27 +; SI-NEXT: v_or_b32_e32 v28, v34, v28 +; SI-NEXT: v_or_b32_e32 v29, v32, v29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr61 +; 
SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], 
s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_or_b32_e32 v22, v62, v22 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; 
implicit-def: $vgpr62 +; SI-NEXT: .LBB54_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v39 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 
v31, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: 
v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: 
v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: 
v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v62 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v51 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v49 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v48 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v37 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v35 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 
v27, v29, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v34 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB54_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60f16_to_v15f64: ; VI: ; %bb.0: @@ -22228,7 +46133,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; VI-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_mov_b32_e32 v29, 16 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v29, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -22321,9 +46226,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: .LBB54_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: s_cbranch_execz .LBB54_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v29, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -22416,7 +46321,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 ; VI-NEXT: v_or_b32_e32 v29, v30, v29 -; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: .LBB54_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -22539,7 +46444,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -22695,9 +46600,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; kill: killed $vgpr30 -; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: .LBB54_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload @@ -22810,7 +46715,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 ; GFX9-NEXT: v_pk_add_f16 v29, v29, s7 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: .LBB54_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -22839,7 +46744,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -22871,7 
+46776,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB27_2: ; %end +; GFX11-TRUE16-NEXT: .LBB54_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -22943,7 +46848,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] @@ -22975,9 +46880,1484 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: .LBB27_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: .LBB54_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <15 x double> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <15 x double> + br label %end + +end: + %phi = phi <15 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x double> %phi +} + +define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg 
%a, i32 inreg %b) { +; SI-LABEL: bitcast_v60f16_to_v15f64_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; SI-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 
v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB55_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_or_b32_e32 v10, v32, v10 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_mov_b32_e32 v57, v39 
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v2, v11, v2 +; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_or_b32_e32 v11, v35, v11 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_or_b32_e32 v14, v55, v14 +; SI-NEXT: v_or_b32_e32 v15, v61, v15 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v21, v51, v21 +; SI-NEXT: v_or_b32_e32 v22, v30, v22 +; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: 
buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v17, v32, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_or_b32_e32 v9, v39, v9 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v54, v29 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: s_branch .LBB55_3 +; SI-NEXT: .LBB55_2: +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: 
v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v47, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v39 +; SI-NEXT: v_mov_b32_e32 v56, v34 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v60, v63 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v31, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: .LBB55_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v61, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: s_cbranch_vccnz .LBB55_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: 
v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, 
v12, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 
v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v26, v24 +; SI-NEXT: 
buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v28, v26 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: .LBB55_5: ; %end +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v60f16_to_v15f64_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_mov_b32_e32 v32, v15 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: 
v_mov_b32_e32 v34, v13 +; VI-NEXT: v_mov_b32_e32 v35, v12 +; VI-NEXT: v_mov_b32_e32 v36, v11 +; VI-NEXT: v_mov_b32_e32 v37, v10 +; VI-NEXT: v_mov_b32_e32 v38, v9 +; VI-NEXT: v_mov_b32_e32 v39, v8 +; VI-NEXT: v_mov_b32_e32 v48, v7 +; VI-NEXT: v_mov_b32_e32 v49, v6 +; VI-NEXT: v_mov_b32_e32 v50, v5 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v52, v3 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v54, v1 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s44, s42, 16 +; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s5, s5, s44 +; VI-NEXT: s_and_b32 s44, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s45, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s44, s44, s45 +; VI-NEXT: s_and_b32 s45, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s46, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s45, s45, s46 +; VI-NEXT: s_and_b32 s46, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s47, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s46, s46, s47 +; VI-NEXT: s_and_b32 s47, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s56, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s47, s47, s56 +; VI-NEXT: s_and_b32 s56, 0xffff, s22 +; VI-NEXT: s_lshl_b32 s57, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s56, s56, s57 +; VI-NEXT: s_and_b32 s57, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s58, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s57, s57, s58 +; VI-NEXT: s_and_b32 s58, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s59, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s58, s58, s59 +; VI-NEXT: s_and_b32 s59, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s60, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s59, s59, s60 +; VI-NEXT: s_and_b32 s60, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s61, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s60, s60, s61 +; VI-NEXT: s_and_b32 s61, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s62, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s61, s61, s62 +; VI-NEXT: s_and_b32 s62, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s63, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s62, s62, s63 +; VI-NEXT: s_and_b32 s63, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s72, s6, 16 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s44 +; VI-NEXT: v_mov_b32_e32 v3, s45 +; VI-NEXT: v_mov_b32_e32 v4, s46 +; VI-NEXT: v_mov_b32_e32 v5, s47 +; VI-NEXT: v_mov_b32_e32 v6, s56 +; VI-NEXT: v_mov_b32_e32 v7, s57 +; VI-NEXT: v_mov_b32_e32 v8, s58 +; VI-NEXT: v_mov_b32_e32 v9, s59 +; VI-NEXT: v_mov_b32_e32 v10, s60 +; VI-NEXT: v_mov_b32_e32 
v11, s61 +; VI-NEXT: v_mov_b32_e32 v12, s62 +; VI-NEXT: v_mov_b32_e32 v13, s63 +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v13, 0x200 +; VI-NEXT: v_mov_b32_e32 v0, s43 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s16, v13 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v13 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v2, s41 +; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s18, v13 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, s40 +; VI-NEXT: v_add_f16_sdwa v3, v3, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s19, v13 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_add_f16_sdwa v4, v4, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s20, v13 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_add_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s21, v13 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_add_f16_sdwa v6, v6, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s22, v13 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_add_f16_sdwa v7, v7, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s23, v13 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_add_f16_sdwa v8, v8, v13 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, s24, v13 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_add_f16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, s25, v13 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_add_f16_sdwa v10, v10, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, s26, v13 +; VI-NEXT: v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_add_f16_sdwa v11, v11, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, s27, v13 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_add_f16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, s28, v13 +; VI-NEXT: v_or_b32_e32 v12, v14, v12 +; VI-NEXT: v_mov_b32_e32 v14, s6 +; VI-NEXT: v_add_f16_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, s29, v13 +; VI-NEXT: v_mov_b32_e32 v29, 0x200 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_f16_sdwa v14, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v55 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: v_add_f16_sdwa v15, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_f16_sdwa v16, v53, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v53 +; VI-NEXT: v_or_b32_e32 v16, v17, v16 +; VI-NEXT: v_add_f16_sdwa v17, v52, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v52 +; VI-NEXT: 
v_or_b32_e32 v17, v18, v17 +; VI-NEXT: v_add_f16_sdwa v18, v51, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v51 +; VI-NEXT: v_or_b32_e32 v18, v19, v18 +; VI-NEXT: v_add_f16_sdwa v19, v50, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v50 +; VI-NEXT: v_or_b32_e32 v19, v20, v19 +; VI-NEXT: v_add_f16_sdwa v20, v49, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v49 +; VI-NEXT: v_or_b32_e32 v20, v21, v20 +; VI-NEXT: v_add_f16_sdwa v21, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v21, v22, v21 +; VI-NEXT: v_add_f16_sdwa v22, v39, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v39 +; VI-NEXT: v_or_b32_e32 v22, v23, v22 +; VI-NEXT: v_add_f16_sdwa v23, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v23, v24, v23 +; VI-NEXT: v_add_f16_sdwa v24, v37, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v37 +; VI-NEXT: v_or_b32_e32 v24, v25, v24 +; VI-NEXT: v_add_f16_sdwa v25, v36, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v36 +; VI-NEXT: v_or_b32_e32 v25, v26, v25 +; VI-NEXT: v_add_f16_sdwa v26, v35, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v35 +; VI-NEXT: v_or_b32_e32 v26, v27, v26 +; VI-NEXT: v_add_f16_sdwa v27, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v34 +; VI-NEXT: v_or_b32_e32 v27, v28, v27 +; VI-NEXT: v_add_f16_sdwa v28, v33, v29 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v33 +; VI-NEXT: v_or_b32_e32 v28, v30, v28 +; VI-NEXT: v_add_f16_sdwa v29, v32, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v29, v30, v29 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v60f16_to_v15f64_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v15 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v12 +; GFX9-NEXT: v_mov_b32_e32 v36, v11 +; GFX9-NEXT: v_mov_b32_e32 v37, v10 +; GFX9-NEXT: v_mov_b32_e32 v38, v9 +; GFX9-NEXT: v_mov_b32_e32 v39, v8 +; GFX9-NEXT: v_mov_b32_e32 v48, v7 +; GFX9-NEXT: v_mov_b32_e32 v49, v6 +; GFX9-NEXT: v_mov_b32_e32 v50, v5 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v52, v3 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v54, v1 +; GFX9-NEXT: v_mov_b32_e32 v55, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s40, s29, 16 +; GFX9-NEXT: s_lshr_b32 s41, s28, 16 +; GFX9-NEXT: s_lshr_b32 s42, s27, 16 +; GFX9-NEXT: s_lshr_b32 s43, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s16, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s17, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s18, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s19, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s20, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s21, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s22, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s23, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s14, s24, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 
s15, s25, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s16, s26, s43 +; GFX9-NEXT: s_pack_ll_b32_b16 s17, s27, s42 +; GFX9-NEXT: s_pack_ll_b32_b16 s18, s28, s41 +; GFX9-NEXT: s_pack_ll_b32_b16 s19, s29, s40 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v55 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GFX9-NEXT: v_lshl_or_b32 v22, v47, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 
v0, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mov_b32_e32 v12, s18 +; GFX9-NEXT: v_mov_b32_e32 v13, s19 +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v58, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v57, 16, v20 +; GFX9-NEXT: v_lshl_or_b32 v21, v56, 16, v21 +; GFX9-NEXT: 
v_lshl_or_b32 v22, v47, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v23, v46, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v45, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v25, v44, 16, v25 +; GFX9-NEXT: v_lshl_or_b32 v26, v43, 16, v26 +; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 +; GFX9-NEXT: v_pk_add_f16 v0, s6, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s7, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s8, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s9, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s10, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s11, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s12, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s13, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s14, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s15, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s16, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s17, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s18, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s19, v13 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s4 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_f16 v29, v29, s4 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB55_3: ; %end +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_branch .LBB55_2 +; +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15f64_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v33, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v9 :: v_dual_mov_b32 v35, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v7 :: v_dual_mov_b32 v37, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v5 :: v_dual_mov_b32 v39, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v3 :: v_dual_mov_b32 v49, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v1 :: v_dual_mov_b32 v51, v0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_mov_b32 s75, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v37.h +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s75, s75, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, s75 :: v_dual_and_b32 v9, 0xffff, v37 +; GFX11-TRUE16-NEXT: s_mov_b32 s47, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s57, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s63, s20 +; GFX11-TRUE16-NEXT: s_mov_b32 s73, s22 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s47, s47, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s57, 
s57, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s63, s63, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s73, s73, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v38.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_and_b32 v7, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX11-TRUE16-NEXT: s_mov_b32 s59, s16 +; GFX11-TRUE16-NEXT: s_mov_b32 s61, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s59, s59, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s61, s61, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s63 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_mov_b32 s77, s26 +; GFX11-TRUE16-NEXT: s_mov_b32 s79, s28 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s77, s77, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s79, s79, s5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s61 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v35.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, s79 :: v_dual_and_b32 v3, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s59 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v36.h +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s77 :: v_dual_and_b32 v1, 0xffff, v36 +; GFX11-TRUE16-NEXT: s_mov_b32 s56, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s58, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s60, s17 +; GFX11-TRUE16-NEXT: s_mov_b32 s62, s19 +; GFX11-TRUE16-NEXT: s_mov_b32 s72, s21 +; GFX11-TRUE16-NEXT: s_mov_b32 s74, s23 +; GFX11-TRUE16-NEXT: s_mov_b32 s76, s25 +; GFX11-TRUE16-NEXT: s_mov_b32 s78, s27 +; GFX11-TRUE16-NEXT: s_mov_b32 s88, s29 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s56, s56, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s58, s58, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s60, s60, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s62, s62, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s72, s72, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s74, s74, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s76, s76, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s78, s78, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s88, s88, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s56 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s58 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s60 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, s62 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s72 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s74 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, s76 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s78 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, s88 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 
s1, s1, s44 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s2 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s43 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s42 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s16 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s41 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s40 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v51.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s15 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s20 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v50.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v16, 16, v17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s22 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v18, 16, v19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v49.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 
0xffff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v48.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v38.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v37.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s24 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s25 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v22, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v24, 16, v25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v26, 16, v27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v28, 16, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v36.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v35.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s27 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s28 +; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v27, 16, v28 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v29, 16, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v31, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v34, 16, v32 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s29 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: .LBB55_3: ; %end +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB55_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-TRUE16-NEXT: s_branch .LBB55_2 +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15f64_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 
16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v70, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff, v11 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s1, 16 +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, 0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s16, s27, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s17, s28, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s18, s29, s40 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB55_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 
v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, s17 :: v_dual_mov_b32 v17, s18 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s15 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB55_3 +; GFX11-FAKE16-NEXT: .LBB55_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v51, 16, v71 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v50, 16, v70 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v49, 16, v69 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v48, 16, v68 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v39, 16, v67 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v38, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v37, 16, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v36, 16, v64 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v35, 16, v55 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v34, 16, v54 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v33, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v32, 16, v52 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, s4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, s5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, s6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: 
v_pk_add_f16 v11, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: .LBB55_3: ; %end ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB55_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: s_branch .LBB55_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -22996,974 +48376,996 @@ end: } define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v60i16_to_v60f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 
offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 -; GCN-NEXT: 
buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:24 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr57 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr56 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; 
GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: 
$vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: ; kill: killed $vgpr47 -; GCN-NEXT: ; implicit-def: $vgpr47 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: 
v_cvt_f32_f16_e32 v57, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; 
GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v39 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 
; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr28 -; 
GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr58 -; GCN-NEXT: ; implicit-def: $vgpr59 -; GCN-NEXT: ; implicit-def: $vgpr60 -; GCN-NEXT: ; implicit-def: $vgpr61 -; GCN-NEXT: ; implicit-def: $vgpr62 -; GCN-NEXT: ; implicit-def: $vgpr63 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: .LBB28_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; 
GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 -; GCN-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 
3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v2 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:360 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, 
v26 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 -; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v39 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: .LBB28_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, 
v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v10, v11, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], 
s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v12, v13, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v14, v15, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v16, v17, v16 -; GCN-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v18, v19, v18 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v20, v21, v20 -; GCN-NEXT: 
v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v22, v23, v22 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 44, v0 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v25, v24 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v26, v27, v26 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 52, v0 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_or_b32_e32 v28, v29, v28 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 56, v0 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_or_b32_e32 v30, v31, v30 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v32, v33, v32 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 64, v0 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GCN-NEXT: v_or_b32_e32 v34, v35, v34 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x44, v0 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; GCN-NEXT: v_or_b32_e32 v36, v37, v36 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x48, v0 -; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; GCN-NEXT: 
v_or_b32_e32 v38, v39, v38 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x4c, v0 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; GCN-NEXT: v_or_b32_e32 v48, v49, v48 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_or_b32_e32 v50, v51, v50 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v52, v53, v52 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x58, v0 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_or_b32_e32 v54, v55, v54 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x5c, v0 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_or_b32_e32 v40, v41, v40 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_or_b32_e32 v42, v43, v42 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v44, v45, v44 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x68, v0 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 -; GCN-NEXT: v_or_b32_e32 v46, v47, v46 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 0x6c, v0 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_or_b32_e32 v56, v57, v56 -; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x70, v0 -; 
GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_or_b32_e32 v58, v59, v58 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v45, s[0:3], 0 
offen -; GCN-NEXT: buffer_store_dword v46, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v60i16_to_v60f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], 
s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; SI-NEXT: 
buffer_load_dword v48, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; 
kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: 
killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: ; implicit-def: $vgpr25 +; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: 
; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB56_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, 
v16 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_i32_e32 v32, vcc, 
3, v32 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 +; SI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: .LBB56_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 
44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte 
Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v60f16: ; VI: ; %bb.0: @@ -24014,7 +49416,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v30, 3, v30 @@ -24076,7 +49478,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v32, 3, v32 ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: v_add_u16_e32 v31, 3, v31 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_or_b32_sdwa v0, v0, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -24202,7 +49604,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v29, v59, v29, s6 @@ -24295,7 +49697,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v28 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v29 -; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: .LBB56_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v18, v40, v18, s4 @@ -24351,7 +49753,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] @@ -24383,7 +49785,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: .LBB28_2: ; %end +; GFX11-TRUE16-NEXT: .LBB56_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -24425,7 +49827,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 
0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 @@ -24482,73 +49884,2020 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v20 +; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-FAKE16-NEXT: .LBB56_2: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v34, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v36, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v50, v11, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v52, v13, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v54, v15, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v16, v55, v16, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v17, v64, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v18, v65, v18, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v20, v67, v20, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v21, v68, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v22, v69, v22, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v24, v71, v24, 0x5040100 +; 
GFX11-FAKE16-NEXT: v_perm_b32 v25, v80, v25, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v26, v81, v26, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v27, v82, v27, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <60 x i16> %a, splat (i16 3) + %a2 = bitcast <60 x i16> %a1 to <60 x half> + br label %end + +cmp.false: + %a3 = bitcast <60 x i16> %a to <60 x half> + br label %end + +end: + %phi = phi <60 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x half> %phi +} + +define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60i16_to_v60f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:80 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB57_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v48 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 +; SI-NEXT: 
v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v50 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 
offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v27 +; SI-NEXT: s_branch .LBB57_3 +; SI-NEXT: .LBB57_2: +; SI-NEXT: ; implicit-def: 
$vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v32, v50 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; 
SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: .LBB57_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v49, v50 +; SI-NEXT: v_mov_b32_e32 v50, v52 +; SI-NEXT: 
v_mov_b32_e32 v52, v54 +; SI-NEXT: v_mov_b32_e32 v54, v40 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v46, v56 +; SI-NEXT: v_mov_b32_e32 v56, v31 +; SI-NEXT: s_cbranch_vccnz .LBB57_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v35 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 +; 
SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v34 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 
4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: .LBB57_5: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 
v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: 
s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; 
SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 
4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; 
SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 
offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v60i16_to_v60f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: 
s_cbranch_execnz .LBB57_3 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s41, s41, 3 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s40, s40, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s25, s25, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s27, s27, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s29, s29, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 
v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: .LBB57_3: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: s_and_b32 s4, 0xffff, s16 +; VI-NEXT: s_lshl_b32 s5, s43, 16 +; VI-NEXT: v_or_b32_sdwa v30, v0, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, 0xffff, s17 +; VI-NEXT: s_lshl_b32 s16, s42, 16 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: s_or_b32 s5, s5, s16 +; VI-NEXT: s_and_b32 s16, 0xffff, s18 +; VI-NEXT: s_lshl_b32 s17, s41, 16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s17, 0xffff, s19 +; VI-NEXT: s_lshl_b32 s18, s40, 16 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: s_or_b32 s17, s17, s18 +; VI-NEXT: s_and_b32 s18, 0xffff, s20 +; VI-NEXT: s_lshl_b32 s15, s15, 16 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: s_or_b32 s15, s18, s15 +; VI-NEXT: s_and_b32 s18, 0xffff, s21 +; VI-NEXT: s_lshl_b32 s14, s14, 16 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: s_or_b32 s14, s18, s14 +; VI-NEXT: s_and_b32 s18, 
0xffff, s22 +; VI-NEXT: s_lshl_b32 s13, s13, 16 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: s_or_b32 s13, s18, s13 +; VI-NEXT: s_and_b32 s18, 0xffff, s23 +; VI-NEXT: s_lshl_b32 s12, s12, 16 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: s_or_b32 s12, s18, s12 +; VI-NEXT: s_and_b32 s18, 0xffff, s24 +; VI-NEXT: s_lshl_b32 s11, s11, 16 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: s_or_b32 s11, s18, s11 +; VI-NEXT: s_and_b32 s18, 0xffff, s25 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: s_or_b32 s10, s18, s10 +; VI-NEXT: s_and_b32 s18, 0xffff, s26 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: s_or_b32 s9, s18, s9 +; VI-NEXT: s_and_b32 s18, 0xffff, s27 +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: s_or_b32 s8, s18, s8 +; VI-NEXT: s_and_b32 s18, 0xffff, s28 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: s_or_b32 s7, s18, s7 +; VI-NEXT: s_and_b32 s18, 0xffff, s29 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: 
s_or_b32 s6, s18, s6 +; VI-NEXT: v_or_b32_sdwa v28, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s16 +; VI-NEXT: v_mov_b32_e32 v3, s17 +; VI-NEXT: v_mov_b32_e32 v4, s15 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s11 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v12, s7 +; VI-NEXT: v_mov_b32_e32 v13, s6 +; VI-NEXT: v_mov_b32_e32 v14, v30 +; VI-NEXT: v_mov_b32_e32 v15, v31 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_4: +; VI-NEXT: s_branch .LBB57_2 +; +; GFX9-LABEL: bitcast_v60i16_to_v60f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v8 
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_pk_add_u16 v30, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_u16 v31, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_u16 v51, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_u16 v50, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: 
v_pk_add_u16 v49, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_u16 v48, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_u16 v39, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_u16 v38, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_u16 v37, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_u16 v36, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_u16 v35, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_u16 v34, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_lshl_or_b32 v15, v29, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v14, v28, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v13, v27, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v12, v26, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v55, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v54, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v25, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v24, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v22, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v21, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 
v4, v20, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v19, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v16, 16, v0 +; GFX9-NEXT: v_pk_add_u16 v33, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 
v19, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: s_branch .LBB57_5 +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v30, s29 +; GFX9-NEXT: v_mov_b32_e32 v31, s28 +; GFX9-NEXT: v_mov_b32_e32 v51, s27 +; GFX9-NEXT: v_mov_b32_e32 v50, s26 +; GFX9-NEXT: v_mov_b32_e32 v49, s25 +; GFX9-NEXT: v_mov_b32_e32 v48, s24 +; GFX9-NEXT: v_mov_b32_e32 v39, s23 +; GFX9-NEXT: v_mov_b32_e32 v38, s22 +; GFX9-NEXT: v_mov_b32_e32 v37, s21 +; GFX9-NEXT: v_mov_b32_e32 v36, s20 +; GFX9-NEXT: v_mov_b32_e32 v35, s19 +; GFX9-NEXT: v_mov_b32_e32 v34, s18 +; GFX9-NEXT: v_mov_b32_e32 v33, s17 +; GFX9-NEXT: v_mov_b32_e32 v32, s16 +; GFX9-NEXT: v_mov_b32_e32 v53, s43 +; GFX9-NEXT: v_mov_b32_e32 v52, s42 +; GFX9-NEXT: v_mov_b32_e32 v40, s41 +; GFX9-NEXT: v_mov_b32_e32 v41, s40 +; GFX9-NEXT: v_mov_b32_e32 v42, s15 +; GFX9-NEXT: v_mov_b32_e32 v43, s14 +; GFX9-NEXT: v_mov_b32_e32 v44, s13 +; GFX9-NEXT: v_mov_b32_e32 v45, s12 +; GFX9-NEXT: v_mov_b32_e32 v46, s11 +; GFX9-NEXT: v_mov_b32_e32 v47, s10 +; GFX9-NEXT: v_mov_b32_e32 v56, s9 +; GFX9-NEXT: v_mov_b32_e32 v57, s8 +; GFX9-NEXT: v_mov_b32_e32 v58, s7 +; GFX9-NEXT: v_mov_b32_e32 v59, s6 +; GFX9-NEXT: .LBB57_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; 
GFX9-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v32, v59, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v58, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v34, v57, 16, v34 +; GFX9-NEXT: v_lshl_or_b32 v35, v56, 16, v35 +; GFX9-NEXT: v_lshl_or_b32 v36, v47, 16, v36 +; GFX9-NEXT: v_lshl_or_b32 v37, v46, 16, v37 +; GFX9-NEXT: v_lshl_or_b32 v38, v45, 16, v38 +; GFX9-NEXT: v_lshl_or_b32 v39, v44, 16, v39 +; GFX9-NEXT: v_lshl_or_b32 v48, v43, 16, v48 +; GFX9-NEXT: v_lshl_or_b32 v49, v42, 16, v49 +; GFX9-NEXT: v_lshl_or_b32 v50, v41, 16, v50 +; GFX9-NEXT: v_lshl_or_b32 v51, v40, 16, v51 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: 
v_lshl_or_b32 v53, v53, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v31, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v33 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 +; GFX9-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-NEXT: v_mov_b32_e32 v4, v36 +; GFX9-NEXT: v_mov_b32_e32 v5, v37 +; GFX9-NEXT: v_mov_b32_e32 v6, v38 +; GFX9-NEXT: v_mov_b32_e32 v7, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v9, v49 +; GFX9-NEXT: v_mov_b32_e32 v10, v50 +; GFX9-NEXT: v_mov_b32_e32 v11, v51 +; GFX9-NEXT: v_mov_b32_e32 v12, v52 +; GFX9-NEXT: v_mov_b32_e32 v13, v53 +; GFX9-NEXT: v_mov_b32_e32 v14, v30 +; 
GFX9-NEXT: v_mov_b32_e32 v15, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v60f16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-TRUE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v14, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v12, 16, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 
v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v13, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v14, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v12, 16, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v14, 16, v1 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v13, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v15, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v12, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s15, s11 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v12, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v13, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v15, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s14, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s11, s9 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s19 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s9, 
3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s10, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s9, s7 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, s27, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, s26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: v_pk_add_u16 v24, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v37, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v36, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v64.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v65.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v67.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v69.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v71.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v81.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v83.l +; GFX11-TRUE16-NEXT: s_branch .LBB57_5 +; GFX11-TRUE16-NEXT: .LBB57_3: +; GFX11-TRUE16-NEXT: s_branch .LBB57_2 +; GFX11-TRUE16-NEXT: .LBB57_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s9 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s13 +; GFX11-TRUE16-NEXT: .LBB57_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v38, 16, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v39, 16, v64 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v49, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v35, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v33, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v48, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v54, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v55, 16, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v50, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v52, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v28, 16, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, 
v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v27, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_and_b32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v51, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, 
v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v36 :: v_dual_mov_b32 v1, v37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_mov_b32 v8, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, v35 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v60f16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: 
s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-FAKE16-NEXT: .LBB57_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v28, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: 
s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29 -; GFX11-FAKE16-NEXT: .LBB28_2: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v3, v34, v3, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v36, v5, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v9, v48, v9, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v50, v11, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v13, v52, v13, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v15, v54, v15, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v16, v55, v16, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v17, v64, 
v17, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v18, v65, v18, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v20, v67, v20, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v21, v68, v21, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v22, v69, v22, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v24, v71, v24, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v25, v80, v25, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v26, v81, v26, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v27, v82, v27, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 -; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, s27, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, s26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s25, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v34, s15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v35, s14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v30, s13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, s12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, s11, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v33, s10, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v49, s9, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v48, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v39, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v38, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v37, 
s3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v36, s7, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v35 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: s_branch .LBB57_5 +; GFX11-FAKE16-NEXT: .LBB57_3: +; GFX11-FAKE16-NEXT: s_branch .LBB57_2 +; GFX11-FAKE16-NEXT: .LBB57_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s25 
:: v_dual_mov_b32 v14, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v33, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v36, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v38, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v39, s1 :: v_dual_mov_b32 v48, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s45 :: v_dual_mov_b32 v51, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s43 :: v_dual_mov_b32 v53, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s40 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s15 :: v_dual_mov_b32 v65, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s13 :: v_dual_mov_b32 v67, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s11 :: v_dual_mov_b32 v69, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s9 :: v_dual_mov_b32 v71, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s6 :: v_dual_mov_b32 v81, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s5 +; GFX11-FAKE16-NEXT: .LBB57_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v70, 16, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v84, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v82, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v80, 16, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v81, 16, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v80, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v83, 16, v48 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v71, 16, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v69, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v68, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v66, 16, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v67, 16, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v65, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v64, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v53, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v15 +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v52, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v30 :: v_dual_mov_b32 v7, v31 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v34 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -24568,611 +51917,633 @@ end: } define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v60f16_to_v60i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 
offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword 
v57, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v4 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v5 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v13 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v14 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v16 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v17 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v20 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v22 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v27 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v28 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; 
GCN-NEXT: v_cvt_f16_f32_e32 v27, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v30 -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:116 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v51 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v54, v39 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v62, v61 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v50 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v61, v27 -; GCN-NEXT: v_mov_b32_e32 v63, v2 -; GCN-NEXT: v_mov_b32_e32 v45, v1 -; GCN-NEXT: v_mov_b32_e32 v56, v8 -; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v52, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; GCN-NEXT: v_or_b32_e32 v48, v48, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v54 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_or_b32_e32 v54, v50, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_or_b32_e32 v5, v5, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v14 -; GCN-NEXT: v_or_b32_e32 v13, v13, 
v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v20 -; GCN-NEXT: v_or_b32_e32 v19, v19, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v22 -; GCN-NEXT: v_or_b32_e32 v21, v21, v50 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v1 -; GCN-NEXT: v_or_b32_e32 v7, v7, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; GCN-NEXT: v_add_f32_e32 
v16, 0x38000000, v16 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 -; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 -; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v34 -; GCN-NEXT: v_or_b32_e32 v33, v33, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 -; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 -; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v36 -; GCN-NEXT: v_or_b32_e32 v35, v35, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v38 -; GCN-NEXT: v_or_b32_e32 v37, v37, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v60 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v59 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_cvt_f32_f16_e32 v43, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v27 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v59, v63 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v61 -; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v62 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 -; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; GCN-NEXT: v_add_f32_e32 v57, 
0x38000000, v57 -; GCN-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; GCN-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47 -; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 -; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 -; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 -; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: 
v_cvt_f16_f32_e32 v42, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_or_b32_e32 v1, v1, v50 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, v43, v40 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v45, v45, v3 -; GCN-NEXT: v_or_b32_e32 v43, v46, v2 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v43, v57, v56 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: v_or_b32_e32 v63, v59, v58 -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: v_or_b32_e32 v61, v60, v25 -; GCN-NEXT: v_or_b32_e32 v55, v55, v47 -; GCN-NEXT: v_or_b32_e32 v51, v51, v44 -; GCN-NEXT: v_or_b32_e32 v30, v30, v41 -; GCN-NEXT: v_or_b32_e32 v28, v28, v53 -; GCN-NEXT: v_or_b32_e32 v42, v42, v24 -; GCN-NEXT: v_or_b32_e32 v49, v49, v29 -; GCN-NEXT: v_or_b32_e32 v39, v39, v27 -; GCN-NEXT: v_alignbit_b32 v60, v37, v50, 16 -; GCN-NEXT: v_alignbit_b32 v59, v35, v40, 16 -; GCN-NEXT: v_alignbit_b32 v3, v33, v3, 16 -; GCN-NEXT: 
v_alignbit_b32 v1, v31, v2, 16 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v56, v15, v56, 16 -; GCN-NEXT: v_alignbit_b32 v2, v11, v58, 16 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: v_alignbit_b32 v26, v7, v26, 16 -; GCN-NEXT: v_alignbit_b32 v25, v21, v25, 16 -; GCN-NEXT: v_alignbit_b32 v47, v19, v47, 16 -; GCN-NEXT: v_alignbit_b32 v44, v17, v44, 16 -; GCN-NEXT: v_alignbit_b32 v41, v13, v41, 16 -; GCN-NEXT: v_alignbit_b32 v53, v9, v53, 16 -; GCN-NEXT: v_alignbit_b32 v24, v5, v24, 16 -; GCN-NEXT: v_alignbit_b32 v29, v54, v29, 16 -; GCN-NEXT: v_alignbit_b32 v62, v48, v27, 16 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v60 -; GCN-NEXT: v_or_b32_e32 v57, v1, v50 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v38 -; GCN-NEXT: v_or_b32_e32 v37, v1, v37 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 4, v0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v59 -; GCN-NEXT: v_or_b32_e32 v46, v1, v50 -; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v36 -; GCN-NEXT: v_or_b32_e32 v35, v1, v35 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 12, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v43, v1, v3 -; GCN-NEXT: v_add_i32_e32 v58, vcc, 16, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v34 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_add_i32_e32 v3, 
vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v2, v33, v2 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GCN-NEXT: v_or_b32_e32 v31, v31, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v56 -; GCN-NEXT: v_or_b32_e32 v34, v34, v50 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 32, v0 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_or_b32_e32 v15, v15, v16 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v63 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v27 -; GCN-NEXT: v_or_b32_e32 v40, v40, v45 -; GCN-NEXT: v_add_i32_e32 v45, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_or_b32_e32 v11, v11, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_or_b32_e32 v23, v23, v26 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 52, v0 
-; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v61 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_or_b32_e32 v25, v56, v25 -; GCN-NEXT: v_add_i32_e32 v56, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_or_b32_e32 v21, v21, v22 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 -; GCN-NEXT: v_or_b32_e32 v55, v55, v47 -; GCN-NEXT: v_add_i32_e32 v47, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v19, v19, v20 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_or_b32_e32 v51, v51, v44 -; GCN-NEXT: v_add_i32_e32 v44, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_or_b32_e32 v17, v17, v18 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_or_b32_e32 v30, v30, v41 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x50, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_or_b32_e32 v13, v13, v14 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x54, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_or_b32_e32 v28, v28, v53 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_or_b32_e32 v9, v9, v10 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_or_b32_e32 v24, v42, v24 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x60, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; 
GCN-NEXT: v_or_b32_e32 v5, v5, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_or_b32_e32 v29, v49, v29 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v54, v4 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v62 -; GCN-NEXT: v_or_b32_e32 v27, v39, v27 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GCN-NEXT: v_or_b32_e32 v48, v48, v52 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v57, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v37, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v35, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v43, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v51, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v41, s[0:3], 0 offen -; GCN-NEXT: 
buffer_store_dword v13, v14, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v29, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: 
bitcast_v60f16_to_v60i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; 
SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; SI-NEXT: 
v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v45 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:104 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v50 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v56 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 +; SI-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:116 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v59 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v35 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v9, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v46, v21 +; SI-NEXT: v_mov_b32_e32 v47, v17 +; SI-NEXT: v_mov_b32_e32 v56, v4 +; SI-NEXT: v_mov_b32_e32 v58, v5 +; SI-NEXT: v_mov_b32_e32 v59, v6 +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v4, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 
v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v8, v8, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v31, v31, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_or_b32_e32 v63, v5, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 
+; SI-NEXT: v_or_b32_e32 v11, v11, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v15, v15, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_or_b32_e32 v18, v18, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v22, v22, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; SI-NEXT: v_or_b32_e32 v7, v7, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; SI-NEXT: v_or_b32_e32 v14, v14, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v37, v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v48 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v59 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v48, v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 
v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v52, v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v55, v34, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v4, v35, v34 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; 
SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v29, v29, v27 +; SI-NEXT: v_or_b32_e32 v39, v39, v30 +; SI-NEXT: v_or_b32_e32 v33, v33, v32 +; SI-NEXT: v_or_b32_e32 v50, v50, v28 +; SI-NEXT: v_alignbit_b32 v60, v55, v34, 16 +; SI-NEXT: v_alignbit_b32 v27, v22, v27, 16 +; SI-NEXT: v_alignbit_b32 v30, v63, v30, 16 +; SI-NEXT: v_alignbit_b32 v32, v31, v32, 16 +; SI-NEXT: v_alignbit_b32 v28, v8, v28, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v4 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_or_b32_e32 v4, v35, v1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_alignbit_b32 v1, v52, v1, 16 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_or_b32_e32 v59, v35, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_alignbit_b32 v10, v48, v10, 16 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_or_b32_e32 v58, v45, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v47 +; SI-NEXT: v_alignbit_b32 v13, v37, v13, 16 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v56, v35, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v51 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v46 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_or_b32_e32 v46, v45, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v47, v51, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v61 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v43 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_or_b32_e32 v61, v43, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v54 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v54 +; SI-NEXT: v_or_b32_e32 v54, v51, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v41 +; SI-NEXT: v_alignbit_b32 v42, v11, v42, 16 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: 
v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v51 +; SI-NEXT: v_or_b32_e32 v36, v36, v41 +; SI-NEXT: v_alignbit_b32 v51, v25, v35, 16 +; SI-NEXT: v_alignbit_b32 v41, v2, v41, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v4 +; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_or_b32_e32 v4, v44, v43 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v14, v17, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v7, v21, 16 +; SI-NEXT: v_alignbit_b32 v44, v18, v45, 16 +; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v34, v34, v35 +; SI-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 +; SI-NEXT: v_or_b32_e32 v34, v34, v35 +; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: 
buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60f16_to_v60i16: ; VI: ; %bb.0: @@ -25223,7 +52594,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 @@ -25285,7 +52656,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 ; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 ; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_or_b32_sdwa v0, v0, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25411,7 +52782,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v29, v59, v29, s6 @@ -25505,7 +52876,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v27 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v58, 16, v28 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v29 -; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: .LBB58_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v18, v40, v18, s4 @@ -25561,7 +52932,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] @@ -25593,7 +52964,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: .LBB29_2: ; %end +; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -25635,7 +53006,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100 @@ -25727,7 +53098,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29 -; 
GFX11-FAKE16-NEXT: .LBB29_2: ; %end +; GFX11-FAKE16-NEXT: .LBB58_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100 ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100 @@ -25776,3 +53147,1659 @@ end: %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <60 x i16> %phi } + +define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v60f16_to_v60i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; 
SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 +; SI-NEXT: v_cvt_f16_f32_e32 
v32, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v45 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v46 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v47 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v23, v57 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v60 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v61 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v47, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s19 +; 
SI-NEXT: v_cvt_f16_f32_e32 v46, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cbranch_scc0 .LBB59_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB59_3 +; SI-NEXT: .LBB59_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: .LBB59_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v61, v14 +; SI-NEXT: v_mov_b32_e32 v63, v15 +; SI-NEXT: v_mov_b32_e32 v15, v18 +; SI-NEXT: v_mov_b32_e32 v18, v22 +; SI-NEXT: v_mov_b32_e32 v22, v33 +; SI-NEXT: v_mov_b32_e32 v33, v11 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_mov_b32_e32 v8, v5 +; SI-NEXT: v_mov_b32_e32 v5, v42 +; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: s_cbranch_vccnz .LBB59_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: 
v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_or_b32_e32 v62, v1, v39 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_or_b32_e32 v6, v6, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_or_b32_e32 v16, v16, v39 +; SI-NEXT: 
v_lshlrev_b32_e32 v39, 16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v20, v20, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v35 +; SI-NEXT: v_or_b32_e32 v34, v34, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v38 +; SI-NEXT: v_or_b32_e32 v37, v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v48, v39, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v52 +; SI-NEXT: v_or_b32_e32 v51, v39, v50 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v39 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v50 +; SI-NEXT: 
v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_or_b32_e32 v44, v39, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v47 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_or_b32_e32 v47, v50, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v58 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_or_b32_e32 v46, v50, v53 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v14, v58, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v63 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v14, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v63, v58, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: 
v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v43, v43, v50 +; SI-NEXT: v_or_b32_e32 v28, v28, v57 +; SI-NEXT: v_or_b32_e32 v5, v5, v42 +; SI-NEXT: v_or_b32_e32 v8, v8, v41 +; SI-NEXT: v_or_b32_e32 v11, v11, v40 +; SI-NEXT: v_or_b32_e32 v33, v33, v55 +; SI-NEXT: v_or_b32_e32 v22, v22, v54 +; SI-NEXT: v_or_b32_e32 v18, v18, v25 +; SI-NEXT: v_or_b32_e32 v15, v15, v24 +; SI-NEXT: v_or_b32_e32 v61, v58, v19 +; SI-NEXT: v_or_b32_e32 v1, v36, v14 +; SI-NEXT: v_alignbit_b32 v60, v44, v39, 16 +; SI-NEXT: v_alignbit_b32 v59, v29, v53, 16 +; SI-NEXT: v_alignbit_b32 v58, v26, v50, 16 +; SI-NEXT: v_alignbit_b32 v57, v51, v57, 16 +; SI-NEXT: v_alignbit_b32 v56, v48, v56, 16 +; SI-NEXT: v_alignbit_b32 v42, v37, v42, 16 +; SI-NEXT: v_alignbit_b32 v41, v34, v41, 16 +; SI-NEXT: v_alignbit_b32 v40, v31, v40, 16 +; SI-NEXT: v_alignbit_b32 v55, v20, v55, 16 +; SI-NEXT: v_alignbit_b32 v54, v16, v54, 16 +; SI-NEXT: v_alignbit_b32 v25, v12, v25, 16 +; SI-NEXT: v_alignbit_b32 v24, v9, v24, 16 +; SI-NEXT: v_alignbit_b32 v23, v6, v23, 16 +; SI-NEXT: v_alignbit_b32 v19, v3, v19, 16 +; SI-NEXT: v_alignbit_b32 v36, v62, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: .LBB59_5: ; %end +; SI-NEXT: 
v_and_b32_e32 v39, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v60 +; SI-NEXT: v_or_b32_e32 v39, v39, v50 +; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v45 +; SI-NEXT: v_or_b32_e32 v39, v39, v50 +; SI-NEXT: v_add_i32_e32 v50, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v59 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v39, v39, v50 +; SI-NEXT: v_add_i32_e32 v50, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v58 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v57 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: 
v_lshlrev_b32_e32 v27, 16, v56 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v42 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v5, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v5, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_and_b32_e32 v5, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v54 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_or_b32_e32 v5, v5, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v60f16_to_v60i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: s_lshr_b32 s7, s28, 16 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: s_lshr_b32 s10, s25, 16 +; VI-NEXT: s_lshr_b32 s11, s24, 16 +; VI-NEXT: s_lshr_b32 s12, s23, 16 +; VI-NEXT: s_lshr_b32 s13, s22, 16 +; VI-NEXT: s_lshr_b32 s14, s21, 16 +; VI-NEXT: s_lshr_b32 s15, s20, 16 +; VI-NEXT: s_lshr_b32 s40, s19, 16 +; VI-NEXT: s_lshr_b32 s41, s18, 16 +; VI-NEXT: s_lshr_b32 s42, s17, 16 +; VI-NEXT: s_lshr_b32 s43, s16, 16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, 
v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB59_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_4 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v54, 0x200 +; VI-NEXT: v_add_f16_e32 v32, s16, v54 +; VI-NEXT: v_add_f16_e32 v59, s43, v54 +; VI-NEXT: v_add_f16_e32 v33, s17, v54 +; VI-NEXT: v_add_f16_e32 v58, s42, v54 +; VI-NEXT: v_add_f16_e32 v34, s18, v54 +; VI-NEXT: v_add_f16_e32 v57, s41, v54 +; VI-NEXT: v_add_f16_e32 v35, s19, v54 +; VI-NEXT: v_add_f16_e32 v56, s40, v54 +; VI-NEXT: v_add_f16_e32 v36, s20, v54 +; VI-NEXT: v_add_f16_e32 v47, s15, v54 +; VI-NEXT: v_add_f16_e32 v37, 
s21, v54 +; VI-NEXT: v_add_f16_e32 v46, s14, v54 +; VI-NEXT: v_add_f16_e32 v38, s22, v54 +; VI-NEXT: v_add_f16_e32 v45, s13, v54 +; VI-NEXT: v_add_f16_e32 v39, s23, v54 +; VI-NEXT: v_add_f16_e32 v44, s12, v54 +; VI-NEXT: v_add_f16_e32 v48, s24, v54 +; VI-NEXT: v_add_f16_e32 v43, s11, v54 +; VI-NEXT: v_add_f16_e32 v49, s25, v54 +; VI-NEXT: v_add_f16_e32 v42, s10, v54 +; VI-NEXT: v_add_f16_e32 v50, s26, v54 +; VI-NEXT: v_add_f16_e32 v41, s9, v54 +; VI-NEXT: v_add_f16_e32 v51, s27, v54 +; VI-NEXT: v_add_f16_e32 v40, s8, v54 +; VI-NEXT: v_add_f16_e32 v52, s28, v54 +; VI-NEXT: v_add_f16_e32 v55, s7, v54 +; VI-NEXT: v_add_f16_e32 v53, s29, v54 +; VI-NEXT: v_add_f16_e32 v54, s6, v54 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_add_f16_e32 v29, 
0x200, v29 +; VI-NEXT: s_branch .LBB59_5 +; VI-NEXT: .LBB59_3: +; VI-NEXT: s_branch .LBB59_2 +; VI-NEXT: .LBB59_4: +; VI-NEXT: v_mov_b32_e32 v54, s6 +; VI-NEXT: v_mov_b32_e32 v53, s29 +; VI-NEXT: v_mov_b32_e32 v55, s7 +; VI-NEXT: v_mov_b32_e32 v52, s28 +; VI-NEXT: v_mov_b32_e32 v40, s8 +; VI-NEXT: v_mov_b32_e32 v51, s27 +; VI-NEXT: v_mov_b32_e32 v41, s9 +; VI-NEXT: v_mov_b32_e32 v50, s26 +; VI-NEXT: v_mov_b32_e32 v42, s10 +; VI-NEXT: v_mov_b32_e32 v49, s25 +; VI-NEXT: v_mov_b32_e32 v43, s11 +; VI-NEXT: v_mov_b32_e32 v48, s24 +; VI-NEXT: v_mov_b32_e32 v44, s12 +; VI-NEXT: v_mov_b32_e32 v39, s23 +; VI-NEXT: v_mov_b32_e32 v45, s13 +; VI-NEXT: v_mov_b32_e32 v38, s22 +; VI-NEXT: v_mov_b32_e32 v46, s14 +; VI-NEXT: v_mov_b32_e32 v37, s21 +; VI-NEXT: v_mov_b32_e32 v47, s15 +; VI-NEXT: v_mov_b32_e32 v36, s20 +; VI-NEXT: v_mov_b32_e32 v56, s40 +; VI-NEXT: v_mov_b32_e32 v35, s19 +; VI-NEXT: v_mov_b32_e32 v57, s41 +; VI-NEXT: v_mov_b32_e32 v34, s18 +; VI-NEXT: v_mov_b32_e32 v58, s42 +; VI-NEXT: v_mov_b32_e32 v33, s17 +; VI-NEXT: v_mov_b32_e32 v59, s43 +; VI-NEXT: v_mov_b32_e32 v32, s16 +; VI-NEXT: .LBB59_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; VI-NEXT: v_or_b32_sdwa v32, v32, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v33, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v34, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v35, v56 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v36, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v37, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v38, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v39, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v48, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v49, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v50, v50, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v51, v51, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_or_b32_sdwa 
v30, v0, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_or_b32_sdwa v22, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_or_b32_sdwa v23, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; VI-NEXT: v_or_b32_sdwa v24, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v27 +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; VI-NEXT: 
v_or_b32_sdwa v27, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: v_or_b32_sdwa v52, v52, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v53, v53, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v0, v32 +; VI-NEXT: v_mov_b32_e32 v1, v33 +; VI-NEXT: v_mov_b32_e32 v2, v34 +; VI-NEXT: v_mov_b32_e32 v3, v35 +; VI-NEXT: v_mov_b32_e32 v4, v36 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v6, v38 +; VI-NEXT: v_mov_b32_e32 v7, v39 +; VI-NEXT: v_mov_b32_e32 v8, v48 +; VI-NEXT: v_mov_b32_e32 v9, v49 +; VI-NEXT: v_mov_b32_e32 v10, v50 +; VI-NEXT: v_mov_b32_e32 v11, v51 +; VI-NEXT: v_mov_b32_e32 v12, v52 +; VI-NEXT: v_mov_b32_e32 v13, v53 +; VI-NEXT: v_mov_b32_e32 v14, v30 +; VI-NEXT: v_mov_b32_e32 v15, v31 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v60f16_to_v60i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_lshr_b32 s43, s29, 16 +; GFX9-NEXT: s_lshr_b32 s42, s28, 16 +; GFX9-NEXT: s_lshr_b32 s41, s27, 16 +; GFX9-NEXT: s_lshr_b32 s40, s26, 16 +; GFX9-NEXT: s_lshr_b32 s15, s25, 16 +; GFX9-NEXT: s_lshr_b32 s14, s24, 16 +; GFX9-NEXT: s_lshr_b32 s13, s23, 16 +; GFX9-NEXT: s_lshr_b32 s12, s22, 16 +; GFX9-NEXT: s_lshr_b32 s11, s21, 16 +; GFX9-NEXT: s_lshr_b32 s10, s20, 16 +; GFX9-NEXT: s_lshr_b32 s9, s19, 16 +; GFX9-NEXT: s_lshr_b32 s8, s18, 16 +; GFX9-NEXT: s_lshr_b32 s7, s17, 16 +; GFX9-NEXT: s_lshr_b32 s6, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, 
v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NEXT: 
v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v29, 16, v15 +; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_lshl_or_b32 v14, v28, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v13, v27, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v12, v26, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v55, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v10, v54, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v9, v25, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v8, v24, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v22, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v21, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v19, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v17, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v16, 16, v0 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s4 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s4 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s29, s43 +; GFX9-NEXT: v_mov_b32_e32 v16, 0x200 +; GFX9-NEXT: v_pk_add_f16 v30, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s28, s42 +; GFX9-NEXT: v_pk_add_f16 v31, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s27, s41 +; GFX9-NEXT: v_pk_add_f16 v51, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s26, s40 +; GFX9-NEXT: v_pk_add_f16 v50, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s25, s15 +; GFX9-NEXT: v_pk_add_f16 v49, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s24, s14 +; GFX9-NEXT: v_pk_add_f16 v48, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s23, s13 +; GFX9-NEXT: v_pk_add_f16 v39, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s22, s12 +; GFX9-NEXT: v_pk_add_f16 v38, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s21, s11 +; GFX9-NEXT: v_pk_add_f16 v37, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s20, s10 +; GFX9-NEXT: v_pk_add_f16 v36, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s19, s9 +; GFX9-NEXT: v_pk_add_f16 v35, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s8 +; GFX9-NEXT: v_pk_add_f16 v34, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s17, s7 +; GFX9-NEXT: v_pk_add_f16 v33, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s6 +; GFX9-NEXT: v_pk_add_f16 v32, s4, v16 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v39 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: s_branch .LBB59_5 +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v30, s29 +; GFX9-NEXT: v_mov_b32_e32 v31, s28 +; GFX9-NEXT: v_mov_b32_e32 v51, s27 +; GFX9-NEXT: v_mov_b32_e32 v50, s26 +; GFX9-NEXT: v_mov_b32_e32 v49, s25 +; GFX9-NEXT: v_mov_b32_e32 v48, s24 +; GFX9-NEXT: v_mov_b32_e32 v39, s23 +; GFX9-NEXT: v_mov_b32_e32 v38, s22 +; GFX9-NEXT: v_mov_b32_e32 v37, s21 +; GFX9-NEXT: v_mov_b32_e32 v36, s20 +; GFX9-NEXT: v_mov_b32_e32 v35, s19 +; GFX9-NEXT: v_mov_b32_e32 v34, s18 +; GFX9-NEXT: v_mov_b32_e32 v33, s17 +; GFX9-NEXT: v_mov_b32_e32 v32, s16 +; GFX9-NEXT: v_mov_b32_e32 v53, s43 +; GFX9-NEXT: v_mov_b32_e32 v52, s42 +; GFX9-NEXT: v_mov_b32_e32 v40, s41 +; GFX9-NEXT: v_mov_b32_e32 v41, s40 +; GFX9-NEXT: v_mov_b32_e32 v42, s15 +; GFX9-NEXT: v_mov_b32_e32 v43, s14 +; GFX9-NEXT: v_mov_b32_e32 v44, s13 +; GFX9-NEXT: v_mov_b32_e32 v45, s12 +; GFX9-NEXT: v_mov_b32_e32 v46, s11 +; GFX9-NEXT: v_mov_b32_e32 v47, s10 +; GFX9-NEXT: 
v_mov_b32_e32 v56, s9 +; GFX9-NEXT: v_mov_b32_e32 v57, s8 +; GFX9-NEXT: v_mov_b32_e32 v58, s7 +; GFX9-NEXT: v_mov_b32_e32 v59, s6 +; GFX9-NEXT: .LBB59_5: ; %end +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff, v51 +; GFX9-NEXT: v_lshl_or_b32 v32, v59, 16, v32 +; GFX9-NEXT: v_lshl_or_b32 v33, v58, 16, v33 +; GFX9-NEXT: v_lshl_or_b32 v34, v57, 16, v34 +; GFX9-NEXT: v_lshl_or_b32 v35, v56, 16, v35 +; GFX9-NEXT: v_lshl_or_b32 v36, v47, 16, v36 +; GFX9-NEXT: v_lshl_or_b32 v37, v46, 16, v37 +; GFX9-NEXT: v_lshl_or_b32 v38, v45, 16, v38 +; GFX9-NEXT: v_lshl_or_b32 v39, v44, 16, v39 +; GFX9-NEXT: v_lshl_or_b32 v48, v43, 16, v48 +; GFX9-NEXT: v_lshl_or_b32 v49, v42, 16, v49 +; GFX9-NEXT: v_lshl_or_b32 v50, v41, 16, v50 +; GFX9-NEXT: v_lshl_or_b32 v51, v40, 16, v51 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v16, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v31 +; GFX9-NEXT: v_lshl_or_b32 v31, v17, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v16, v18, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v17, v19, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v18, v20, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v19, v21, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v20, v22, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v21, v23, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v22, v24, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v23, v25, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v24, v54, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NEXT: v_lshl_or_b32 v25, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mov_b32_e32 v1, v33 +; GFX9-NEXT: v_mov_b32_e32 v2, v34 
+; GFX9-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-NEXT: v_mov_b32_e32 v4, v36 +; GFX9-NEXT: v_mov_b32_e32 v5, v37 +; GFX9-NEXT: v_mov_b32_e32 v6, v38 +; GFX9-NEXT: v_mov_b32_e32 v7, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v48 +; GFX9-NEXT: v_mov_b32_e32 v9, v49 +; GFX9-NEXT: v_mov_b32_e32 v10, v50 +; GFX9-NEXT: v_mov_b32_e32 v11, v51 +; GFX9-NEXT: v_mov_b32_e32 v12, v52 +; GFX9-NEXT: v_mov_b32_e32 v13, v53 +; GFX9-NEXT: v_mov_b32_e32 v14, v30 +; GFX9-NEXT: v_mov_b32_e32 v15, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v60i16_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v14, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v12, 16, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v13, 16, v10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v14, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v12, 16, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s24, s15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v14, 16, v1 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v13, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v15, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v12, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s23, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s15, s11 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, s11 
op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v12, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v13, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v15, 16, v0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s14, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s11, s9 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s19 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s18 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s10, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s9, s7 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s17 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s16 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, 
s7, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s40 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s12 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, s27 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, s26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v37, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v36, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v65.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v67.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v69.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v71.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v81.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v83.l +; GFX11-TRUE16-NEXT: s_branch .LBB59_5 +; GFX11-TRUE16-NEXT: .LBB59_3: +; GFX11-TRUE16-NEXT: s_branch .LBB59_2 +; GFX11-TRUE16-NEXT: .LBB59_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, s29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, s27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, s2 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, s45 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, s44 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, s43 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, s42 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, s41 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, s40 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, s13 +; GFX11-TRUE16-NEXT: .LBB59_5: ; %end +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v36, v38, 16, v37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v37, v39, 16, v64 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v39, v49, 16, v31 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v35, 16, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v35, v29, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v49, v33, 16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v38, v48, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v33, v54, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v48, v55, 16, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v50, 16, v14 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v52, 16, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v34, 16, v23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v34, v28, 16, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v25, 16, v17 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v27, 16, v18 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v19, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v20, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v34 :: v_dual_and_b32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v23, 16, v5 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v6, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v30 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v32, v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v51, 16, v13 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v26, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v8, 16, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v36 :: v_dual_mov_b32 v1, v37 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v48 :: v_dual_mov_b32 v5, v49 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_mov_b32 v8, v32 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, v35 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v60i16_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, 
v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s45, s29, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s44, s28, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s43, s27, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s42, s26, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s41, s25, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s40, s24, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s15, s23, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s22, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s21, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s20, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s19, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s18, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s17, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s16, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s0, 16 +; GFX11-FAKE16-NEXT: s_mov_b32 s46, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s46 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v28, 16, v10 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v27, 16, v9 +; GFX11-FAKE16-NEXT: 
v_lshl_or_b32 v8, v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v3, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v18, 16, v0 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s29, s29, s45 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s28, s28, s44 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s27, s27, s43 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s26, s26, s42 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s25, s25, s41 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s24, s24, s40 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s15, s23, s15 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s14, s22, s14 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s13, s21, s13 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; 
GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, s27 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, s26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, s25 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v34, 0x200, s15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v35, 0x200, s14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v30, 0x200, s13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, s12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, s11 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v33, 0x200, s10 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v49, 0x200, s9 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v48, 0x200, s0 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v39, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v38, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v37, 0x200, s3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v36, 0x200, s7 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v38 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v36 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v49 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v35 +; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v64, 16, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; GFX11-FAKE16-NEXT: s_branch .LBB59_5 +; GFX11-FAKE16-NEXT: .LBB59_3: +; GFX11-FAKE16-NEXT: s_branch .LBB59_2 +; GFX11-FAKE16-NEXT: .LBB59_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v15, s29 :: v_dual_mov_b32 v16, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, s27 :: v_dual_mov_b32 v12, s26 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s25 :: v_dual_mov_b32 v14, s24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s22 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s20 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s19 :: v_dual_mov_b32 v33, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, s17 :: v_dual_mov_b32 v36, s16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, s3 :: v_dual_mov_b32 v38, s2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v39, s1 :: v_dual_mov_b32 v48, s0 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, s45 :: v_dual_mov_b32 v51, s44 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s43 :: v_dual_mov_b32 v53, s42 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s40 +; 
GFX11-FAKE16-NEXT: v_dual_mov_b32 v64, s15 :: v_dual_mov_b32 v65, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v66, s13 :: v_dual_mov_b32 v67, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, s11 :: v_dual_mov_b32 v69, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, s9 :: v_dual_mov_b32 v71, s7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v80, s6 :: v_dual_mov_b32 v81, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v82, s4 :: v_dual_mov_b32 v83, s5 +; GFX11-FAKE16-NEXT: .LBB59_5: ; %end +; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v49, v70, 16, v49 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v84, 0xffff, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v19, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v49 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v82, 16, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v39, v80, 16, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v23, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v22, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v39 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v38, v81, 16, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: 
v_and_b32_e32 v80, 0xffff, v30 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v27, 16, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v37 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v83, 16, v48 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v38 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v48, v71, 16, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v71, 0xffff, v31 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v69, 16, v33 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v68, 16, v32 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v66, 16, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v67, 16, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v65, 16, v35 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v35, v64, 16, v66 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v55, 16, v14 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v53, 16, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff, v15 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v35 :: v_dual_and_b32 v4, 0xffff, v11 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v54, 16, v13 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v52, 16, v17 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v51, 16, v16 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v50, 16, v53 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v48 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v30 :: 
v_dual_mov_b32 v7, v31 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, v32 :: v_dual_mov_b32 v9, v33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v34 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <60 x half> %a, splat (half 0xH0200) + %a2 = bitcast <60 x half> %a1 to <60 x i16> + br label %end + +cmp.false: + %a3 = bitcast <60 x half> %a to <60 x i16> + br label %end + +end: + %phi = phi <60 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <60 x i16> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 4ae7c88e7eb45..f888f4f3b1407 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -1,27 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define <3 x float> @bitcast_v3i32_to_v3f32(<3 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i32_to_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: 
s_cbranch_execz .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: .LBB0_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i32_to_v3f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v3f32: ; VI: ; %bb.0: @@ -85,22 +84,116 @@ end: ret <3 x float> %phi } +define inreg <3 x float> @bitcast_v3i32_to_v3f32_scalar(<3 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i32_to_v3f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB1_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB1_3 +; SI-NEXT: .LBB1_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: .LBB1_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB1_4: +; SI-NEXT: s_branch .LBB1_2 +; +; VI-LABEL: bitcast_v3i32_to_v3f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB1_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB1_3 +; VI-NEXT: .LBB1_2: ; %cmp.true +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, 
s16, 3 +; VI-NEXT: .LBB1_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB1_4: +; VI-NEXT: s_branch .LBB1_2 +; +; GFX9-LABEL: bitcast_v3i32_to_v3f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-NEXT: .LBB1_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB1_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB1_4: +; GFX9-NEXT: s_branch .LBB1_2 +; +; GFX11-LABEL: bitcast_v3i32_to_v3f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_3 +; GFX11-NEXT: .LBB1_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB1_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB1_4: +; GFX11-NEXT: s_branch .LBB1_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + define <3 x i32> 
@bitcast_v3f32_to_v3i32(<3 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f32_to_v3i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB1_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: .LBB1_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f32_to_v3i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; %bb.2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v3i32: ; VI: ; %bb.0: @@ -163,58 +256,156 @@ end: ret <3 x i32> %phi } +define inreg <3 x i32> @bitcast_v3f32_to_v3i32_scalar(<3 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f32_to_v3i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB3_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB3_3: +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; 
VI-LABEL: bitcast_v3f32_to_v3i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB3_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: +; VI-NEXT: s_branch .LBB3_2 +; VI-NEXT: .LBB3_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v3i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_branch .LBB3_2 +; GFX9-NEXT: .LBB3_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v3i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: +; GFX11-NEXT: s_branch .LBB3_2 +; GFX11-NEXT: .LBB3_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 
s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i32_to_v12i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB2_4 -; GCN-NEXT: .LBB2_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB2_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB2_2 -; GCN-NEXT: .LBB2_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; GCN-NEXT: 
v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i32_to_v12i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB4_4 +; SI-NEXT: .LBB4_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB4_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB4_2 +; SI-NEXT: .LBB4_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, 
v4 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v12i8: ; VI: ; %bb.0: @@ -234,7 +425,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: s_cbranch_execz .LBB4_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -245,9 +436,9 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; VI-NEXT: .LBB2_2: ; %Flow +; VI-NEXT: .LBB4_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB2_4 +; VI-NEXT: s_cbranch_execz .LBB4_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 @@ -261,7 +452,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; VI-NEXT: .LBB2_4: ; %end +; VI-NEXT: .LBB4_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v13 ; VI-NEXT: v_mov_b32_e32 v4, v14 @@ -285,7 +476,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -296,9 +487,9 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB2_2: ; %Flow +; GFX9-NEXT: .LBB4_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: s_cbranch_execz .LBB4_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 ; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 @@ -312,7 +503,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB2_4: ; %end +; GFX9-NEXT: .LBB4_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v13 ; GFX9-NEXT: v_mov_b32_e32 v4, v14 @@ -342,7 +533,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v11 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 @@ -355,7 +546,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v11 -; GFX11-TRUE16-NEXT: .LBB2_4: ; %end +; GFX11-TRUE16-NEXT: .LBB4_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l ; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h @@ -384,7 +575,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -395,9 +586,9 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB2_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB4_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB4_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14 @@ -412,7 +603,7 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB2_4: ; %end +; GFX11-FAKE16-NEXT: .LBB4_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 @@ -434,103 +625,391 @@ end: ret <12 x i8> %phi } +define inreg <12 x i8> @bitcast_v3i32_to_v12i8_scalar(<3 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i32_to_v12i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB5_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s4, 
v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB5_3 +; SI-NEXT: .LBB5_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s4, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB5_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB5_2 +; +; VI-LABEL: bitcast_v3i32_to_v12i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB5_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s19, s16, 8 +; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s13, s17, 16 +; VI-NEXT: 
s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s15, s16, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB5_3 +; VI-NEXT: .LBB5_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_lshr_b32 s19, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s13, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s15, s16, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: .LBB5_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: v_mov_b32_e32 v2, s15 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s11 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB5_4: +; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB5_2 +; +; GFX9-LABEL: bitcast_v3i32_to_v12i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s19, s16, 8 +; GFX9-NEXT: s_lshr_b32 s10, s18, 16 +; GFX9-NEXT: s_lshr_b32 s11, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s13, s17, 16 +; GFX9-NEXT: s_lshr_b32 s14, s17, 8 +; GFX9-NEXT: s_lshr_b32 s15, s16, 16 +; 
GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB5_3 +; GFX9-NEXT: .LBB5_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_lshr_b32 s19, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_lshr_b32 s10, s18, 16 +; GFX9-NEXT: s_lshr_b32 s11, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s13, s17, 16 +; GFX9-NEXT: s_lshr_b32 s14, s17, 8 +; GFX9-NEXT: s_lshr_b32 s15, s16, 16 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: .LBB5_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v10, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB5_4: +; GFX9-NEXT: ; implicit-def: $sgpr19 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB5_2 +; +; GFX11-TRUE16-LABEL: bitcast_v3i32_to_v12i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, 
s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-TRUE16-NEXT: .LBB5_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: .LBB5_3: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s6 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-NEXT: .LBB5_4: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr3_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB5_2 +; +; GFX11-FAKE16-LABEL: bitcast_v3i32_to_v12i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB5_4 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s14 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-FAKE16-NEXT: .LBB5_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s1, s1, 3 +; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: .LBB5_3: ; %end +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s12 :: v_dual_mov_b32 v3, s4 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s11 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v11, s6 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] 
+; GFX11-FAKE16-NEXT: .LBB5_4: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr3 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB5_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i8_to_v3i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB3_4 -; GCN-NEXT: .LBB3_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB3_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v6, 
0xff, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v6, v6, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v5, v7, v8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB3_2 -; GCN-NEXT: .LBB3_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v0, v12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v15, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 
-; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x300, v6 -; GCN-NEXT: v_or_b32_e32 v5, v7, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i8_to_v3i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB6_4 +; SI-NEXT: .LBB6_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB6_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 +; SI-NEXT: v_lshlrev_b32_e32 
v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB6_2 +; SI-NEXT: .LBB6_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, 
v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v3i32: ; VI: ; %bb.0: @@ -547,14 +1026,14 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: s_cbranch_execnz .LBB6_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB3_4 -; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB6_4 +; VI-NEXT: .LBB6_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB3_3: ; %cmp.false +; VI-NEXT: .LBB6_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -577,8 +1056,8 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB3_2 -; VI-NEXT: .LBB3_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: .LBB6_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v13 ; VI-NEXT: v_add_u16_e32 v1, 3, v14 ; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -619,14 +1098,14 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> 
%a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: s_cbranch_execnz .LBB6_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB3_4 -; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB6_4 +; GFX9-NEXT: .LBB6_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB3_3: ; %cmp.false +; GFX9-NEXT: .LBB6_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -649,8 +1128,8 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: .LBB3_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: .LBB6_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -693,14 +1172,14 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB3_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB6_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB3_4 -; GFX11-TRUE16-NEXT: .LBB3_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB6_4 
+; GFX11-TRUE16-NEXT: .LBB6_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB3_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB6_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h @@ -737,8 +1216,8 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB3_2 -; GFX11-TRUE16-NEXT: .LBB3_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-TRUE16-NEXT: .LBB6_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 @@ -792,14 +1271,14 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB3_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB6_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB3_4 -; GFX11-FAKE16-NEXT: .LBB3_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB6_4 +; GFX11-FAKE16-NEXT: .LBB6_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB3_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB6_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -835,8 +1314,8 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: ; 
implicit-def: $vgpr5 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB3_2 -; GFX11-FAKE16-NEXT: .LBB3_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-FAKE16-NEXT: .LBB6_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v13, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v14, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -890,53 +1369,403 @@ end: ret <3 x i32> %phi } +define inreg <3 x i32> @bitcast_v12i8_to_v3i32_scalar(<12 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i8_to_v3i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB7_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_cbranch_execnz .LBB7_3 +; SI-NEXT: .LBB7_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, 
s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: .LBB7_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB7_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; SI-NEXT: s_branch .LBB7_2 +; +; VI-LABEL: bitcast_v12i8_to_v3i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB7_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff 
+; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_cbranch_execnz .LBB7_3 +; VI-NEXT: .LBB7_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: .LBB7_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: 
v_mov_b32_e32 v2, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB7_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; VI-NEXT: s_branch .LBB7_2 +; +; GFX9-LABEL: bitcast_v12i8_to_v3i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-NEXT: .LBB7_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, 
s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: .LBB7_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB7_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; GFX9-NEXT: s_branch .LBB7_2 +; +; GFX11-LABEL: bitcast_v12i8_to_v3i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s8 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_lshl_b32 s6, s17, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s5, s16, 0xff +; GFX11-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s8, s9 +; GFX11-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s21, 8 +; 
GFX11-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s8, s9 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB7_3 +; GFX11-NEXT: .LBB7_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s19, 8 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-NEXT: s_and_b32 s5, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s23, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s5 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_or_b32 s6, s2, s3 +; GFX11-NEXT: .LBB7_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: 
v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; GFX11-NEXT: s_branch .LBB7_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + define <6 x bfloat> @bitcast_v3i32_to_v6bf16(<3 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i32_to_v6bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v1 -; GCN-NEXT: v_mov_b32_e32 v6, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB4_4 -; GCN-NEXT: .LBB4_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB4_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB4_2 -; GCN-NEXT: .LBB4_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, 
vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i32_to_v6bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB8_4 +; SI-NEXT: .LBB8_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB8_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB8_2 +; SI-NEXT: .LBB8_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v6bf16: ; VI: ; %bb.0: @@ -1000,63 +1829,179 @@ end: ret <6 x bfloat> %phi } +define inreg <6 x bfloat> @bitcast_v3i32_to_v6bf16_scalar(<3 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i32_to_v6bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB9_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB9_3 +; SI-NEXT: .LBB9_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s16, 16 +; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB9_4: +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB9_2 +; +; VI-LABEL: bitcast_v3i32_to_v6bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB9_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; 
VI-NEXT: s_cbranch_execnz .LBB9_3 +; VI-NEXT: .LBB9_2: ; %cmp.true +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB9_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB9_4: +; VI-NEXT: s_branch .LBB9_2 +; +; GFX9-LABEL: bitcast_v3i32_to_v6bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB9_3 +; GFX9-NEXT: .LBB9_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB9_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB9_4: +; GFX9-NEXT: s_branch .LBB9_2 +; +; GFX11-LABEL: bitcast_v3i32_to_v6bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_3 +; GFX11-NEXT: .LBB9_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB9_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_branch .LBB9_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast 
<3 x i32> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v6bf16_to_v3i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB5_4 -; GCN-NEXT: .LBB5_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB5_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB5_2 -; GCN-NEXT: .LBB5_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; 
GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v3, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6bf16_to_v3i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB10_4 +; SI-NEXT: .LBB10_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB10_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v7, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB10_2 +; SI-NEXT: .LBB10_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: 
v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6bf16_to_v3i32: ; VI: ; %bb.0: @@ -1065,7 +2010,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -1122,7 +2067,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1133,7 +2078,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, 
v3 @@ -1182,7 +2127,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: v_perm_b32 v0, v3, v0, s7 -; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: .LBB10_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1194,7 +2139,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1252,7 +2197,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX11-TRUE16-NEXT: .LBB5_2: ; %end +; GFX11-TRUE16-NEXT: .LBB10_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1264,7 +2209,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1317,7 +2262,7 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB5_2: ; %end +; GFX11-FAKE16-NEXT: .LBB10_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -1337,59 +2282,352 @@ end: ret <3 x i32> %phi } +define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6bf16_to_v3i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 +; SI-NEXT: s_cbranch_scc0 .LBB11_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: s_cbranch_execnz .LBB11_3 +; SI-NEXT: .LBB11_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: 
.LBB11_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB11_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_branch .LBB11_2 +; +; VI-LABEL: bitcast_v6bf16_to_v3i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB11_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; 
VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: +; VI-NEXT: s_branch .LBB11_2 +; VI-NEXT: .LBB11_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v3i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; 
GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_branch .LBB11_2 
+; GFX9-NEXT: .LBB11_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v3i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %cmp.true +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 
v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v4, 16, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: +; GFX11-NEXT: s_branch .LBB11_2 +; GFX11-NEXT: .LBB11_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + define <6 x half> @bitcast_v3i32_to_v6f16(<3 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i32_to_v6f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v1 -; GCN-NEXT: v_mov_b32_e32 v6, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB6_4 -; GCN-NEXT: .LBB6_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB6_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB6_2 -; GCN-NEXT: .LBB6_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v7 -; GCN-NEXT: 
v_add_i32_e32 v5, vcc, 3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i32_to_v6f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB12_4 +; SI-NEXT: .LBB12_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB12_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB12_2 +; SI-NEXT: .LBB12_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v8 +; SI-NEXT: 
v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v6f16: ; VI: ; %bb.0: @@ -1453,78 +2691,194 @@ end: ret <6 x half> %phi } -define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f16_to_v3i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB7_4 -; GCN-NEXT: .LBB7_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB7_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_or_b32_e32 v0, v7, v0 -; GCN-NEXT: v_or_b32_e32 v1, v6, v1 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB7_2 -; GCN-NEXT: .LBB7_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 -; 
GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v4, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <6 x half> @bitcast_v3i32_to_v6f16_scalar(<3 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i32_to_v6f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; 
SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB13_2 ; -; VI-LABEL: bitcast_v6f16_to_v3i32: +; VI-LABEL: bitcast_v3i32_to_v6f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: .LBB13_2: ; %cmp.true +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB13_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_4: +; VI-NEXT: s_branch .LBB13_2 +; +; GFX9-LABEL: bitcast_v3i32_to_v6f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: .LBB13_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB13_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_4: +; GFX9-NEXT: s_branch .LBB13_2 +; +; GFX11-LABEL: bitcast_v3i32_to_v6f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; 
GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: s_branch .LBB13_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} + +define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) { +; SI-LABEL: bitcast_v6f16_to_v3i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB14_4 +; SI-NEXT: .LBB14_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB14_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: 
v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB14_2 +; SI-NEXT: .LBB14_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f16_to_v3i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 0x200 ; VI-NEXT: v_add_f16_sdwa v4, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1536,7 +2890,7 @@ define <3 x i32> @bitcast_v6f16_to_v3i32(<6 
x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 -; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1588,40 +2942,182 @@ end: ret <3 x i32> %phi } +define inreg <3 x i32> @bitcast_v6f16_to_v3i32_scalar(<6 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f16_to_v3i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB15_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_cbranch_execnz .LBB15_3 +; SI-NEXT: .LBB15_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: .LBB15_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB15_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_branch .LBB15_2 +; +; VI-LABEL: bitcast_v6f16_to_v3i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB15_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB15_4 +; VI-NEXT: .LBB15_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v3, v1 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_sdwa v3, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB15_3: +; VI-NEXT: s_branch .LBB15_2 +; VI-NEXT: .LBB15_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v3i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB15_4 +; GFX9-NEXT: .LBB15_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, 
s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_branch .LBB15_2 +; GFX9-NEXT: .LBB15_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v3i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB15_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_4 +; GFX11-NEXT: .LBB15_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB15_3: +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + define <6 x i16> @bitcast_v3i32_to_v6i16(<3 x i32> %a, i32 %b) { -; GCN-LABEL: bitcast_v3i32_to_v6i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; 
GCN-NEXT: s_cbranch_execnz .LBB8_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB8_4 -; GCN-NEXT: .LBB8_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB8_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v5, s4, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB8_2 -; GCN-NEXT: .LBB8_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v5, s4, v4, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3i32_to_v6i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB16_4 +; SI-NEXT: .LBB16_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB16_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB16_2 +; SI-NEXT: .LBB16_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, 
s4, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3i32_to_v6i16: ; VI: ; %bb.0: @@ -1685,56 +3181,165 @@ end: ret <6 x i16> %phi } +define inreg <6 x i16> @bitcast_v3i32_to_v6i16_scalar(<3 x i32> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3i32_to_v6i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB17_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB17_3 +; SI-NEXT: .LBB17_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s4, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: .LBB17_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB17_4: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB17_2 +; +; VI-LABEL: bitcast_v3i32_to_v6i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB17_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB17_3 +; VI-NEXT: .LBB17_2: ; %cmp.true +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: .LBB17_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; 
VI-NEXT: .LBB17_4: +; VI-NEXT: s_branch .LBB17_2 +; +; GFX9-LABEL: bitcast_v3i32_to_v6i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB17_3 +; GFX9-NEXT: .LBB17_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: .LBB17_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB17_4: +; GFX9-NEXT: s_branch .LBB17_2 +; +; GFX11-LABEL: bitcast_v3i32_to_v6i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB17_4 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB17_3 +; GFX11-NEXT: .LBB17_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: .LBB17_3: ; %end +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_4: +; GFX11-NEXT: s_branch .LBB17_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i16_to_v3i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB9_4 -; GCN-NEXT: .LBB9_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB9_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v6 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB9_2 -; GCN-NEXT: .LBB9_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v6, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i16_to_v3i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB18_4 +; SI-NEXT: .LBB18_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB18_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB18_2 +; SI-NEXT: .LBB18_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v3i32: ; VI: ; %bb.0: @@ -1743,7 +3348,7 @@ define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: 
s_cbranch_execz .LBB9_2 +; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 3 ; VI-NEXT: v_add_u16_e32 v3, 3, v2 @@ -1755,7 +3360,7 @@ define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v3, 3, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1806,58 +3411,188 @@ end: ret <3 x i32> %phi } +define inreg <3 x i32> @bitcast_v6i16_to_v3i32_scalar(<6 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i16_to_v3i32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_cbranch_execnz .LBB19_3 +; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: 
.LBB19_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; SI-NEXT: s_branch .LBB19_2 +; +; VI-LABEL: bitcast_v6i16_to_v3i32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB19_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB19_3 +; VI-NEXT: .LBB19_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB19_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB19_4: +; VI-NEXT: s_branch .LBB19_2 +; +; GFX9-LABEL: bitcast_v6i16_to_v3i32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB19_4 +; GFX9-NEXT: .LBB19_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_branch .LBB19_2 +; GFX9-NEXT: .LBB19_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v3i32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: +; GFX11-NEXT: s_branch .LBB19_2 +; GFX11-NEXT: .LBB19_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f32_to_v12i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v4, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB10_4 -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; 
GCN-NEXT: .LBB10_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 8 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB10_2 -; GCN-NEXT: .LBB10_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; GCN-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f32_to_v12i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB20_4 +; SI-NEXT: .LBB20_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: 
.LBB20_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB20_2 +; SI-NEXT: .LBB20_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v12i8: ; VI: ; %bb.0: @@ -1877,7 +3612,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: s_cbranch_execz .LBB20_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -1888,9 +3623,9 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; VI-NEXT: .LBB10_2: ; %Flow +; VI-NEXT: .LBB20_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB10_4 +; VI-NEXT: s_cbranch_execz .LBB20_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; 
VI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -1904,7 +3639,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; VI-NEXT: .LBB10_4: ; %end +; VI-NEXT: .LBB20_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v13 ; VI-NEXT: v_mov_b32_e32 v4, v14 @@ -1928,7 +3663,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: s_cbranch_execz .LBB20_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -1939,9 +3674,9 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB10_2: ; %Flow +; GFX9-NEXT: .LBB20_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB10_4 +; GFX9-NEXT: s_cbranch_execz .LBB20_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 ; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -1955,7 +3690,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB10_4: ; %end +; GFX9-NEXT: .LBB20_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v13 ; GFX9-NEXT: v_mov_b32_e32 v4, v14 @@ -1985,7 +3720,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v11 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: 
s_cbranch_execz .LBB10_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v11, 1.0, v11 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 1.0, v10 @@ -1997,7 +3732,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v11 -; GFX11-TRUE16-NEXT: .LBB10_4: ; %end +; GFX11-TRUE16-NEXT: .LBB20_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h @@ -2026,7 +3761,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -2037,9 +3772,9 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB10_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB10_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 1.0, v8 ; GFX11-FAKE16-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v13, 1.0, v13 @@ -2053,7 +3788,7 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v2, 16, v13 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB10_4: ; %end +; GFX11-FAKE16-NEXT: .LBB20_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 @@ -2075,103 +3810,411 @@ end: ret <12 x i8> %phi } +define inreg <12 x i8> @bitcast_v3f32_to_v12i8_scalar(<3 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f32_to_v12i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB21_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v11, s4, v0, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v0, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v0, 8 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s17, 24 +; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_cbranch_execnz .LBB21_4 +; SI-NEXT: .LBB21_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v8, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB21_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; 
implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB21_2 +; SI-NEXT: .LBB21_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v12i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB21_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s19, s16, 8 +; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s13, s17, 16 +; VI-NEXT: s_lshr_b32 s15, s17, 8 +; VI-NEXT: s_lshr_b32 s14, s16, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB21_4 +; VI-NEXT: .LBB21_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v8, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v14, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v13, s16, 1.0 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; VI-NEXT: s_branch .LBB21_5 +; VI-NEXT: .LBB21_3: +; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB21_2 +; VI-NEXT: .LBB21_4: +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: v_mov_b32_e32 v14, s17 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v1, 
s19 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v5, s15 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v9, s11 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB21_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v13 +; VI-NEXT: v_mov_b32_e32 v4, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v12i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s19, s16, 8 +; GFX9-NEXT: s_lshr_b32 s10, s18, 16 +; GFX9-NEXT: s_lshr_b32 s11, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s13, s17, 16 +; GFX9-NEXT: s_lshr_b32 s15, s17, 8 +; GFX9-NEXT: s_lshr_b32 s14, s16, 16 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v8, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v14, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v13, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: s_branch .LBB21_5 +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: ; implicit-def: $sgpr19 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; 
GFX9-NEXT: s_branch .LBB21_2 +; GFX9-NEXT: .LBB21_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v14, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v10, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB21_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v3f32_to_v12i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-TRUE16-NEXT: .LBB21_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, s2, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, s1, 1.0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, s0, 1.0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, 
v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[8:9] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB21_5 +; GFX11-TRUE16-NEXT: .LBB21_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB21_2 +; GFX11-TRUE16-NEXT: .LBB21_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s4 +; GFX11-TRUE16-NEXT: .LBB21_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v3f32_to_v12i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB21_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; 
GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB21_4 +; GFX11-FAKE16-NEXT: .LBB21_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, s2, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v14, s1, 1.0 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, s0, 1.0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB21_5 +; GFX11-FAKE16-NEXT: .LBB21_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB21_2 +; GFX11-FAKE16-NEXT: .LBB21_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s14 +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v2, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v10, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB21_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i8_to_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB11_4 -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB11_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v8 
-; GCN-NEXT: v_and_b32_e32 v8, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v6, v6, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v5, v7, v8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v1, v2, v3 -; GCN-NEXT: v_or_b32_e32 v2, v4, v5 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB11_2 -; GCN-NEXT: .LBB11_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NEXT: v_or_b32_e32 v0, v12, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v15, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v6, v9, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; 
GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x300, v6 -; GCN-NEXT: v_or_b32_e32 v5, v7, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v2, v5, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i8_to_v3f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB22_4 +; SI-NEXT: .LBB22_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB22_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 +; SI-NEXT: v_lshlrev_b32_e32 
v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB22_2 +; SI-NEXT: .LBB22_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 
0x300, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v3f32: ; VI: ; %bb.0: @@ -2188,14 +4231,14 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: s_cbranch_execnz .LBB22_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB11_4 -; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB22_4 +; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB11_3: ; %cmp.false +; VI-NEXT: .LBB22_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2218,8 +4261,8 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB11_2 -; VI-NEXT: .LBB11_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: .LBB22_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v13 ; VI-NEXT: v_add_u16_e32 v1, 3, v14 ; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2260,14 +4303,14 @@ define <3 x float> 
@bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: s_cbranch_execnz .LBB22_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB11_4 -; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB22_4 +; GFX9-NEXT: .LBB22_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB11_3: ; %cmp.false +; GFX9-NEXT: .LBB22_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2290,8 +4333,8 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB11_2 -; GFX9-NEXT: .LBB11_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: .LBB22_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -2334,14 +4377,14 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-TRUE16-NEXT: 
.LBB11_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_4 +; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB11_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h @@ -2378,8 +4421,8 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-TRUE16-NEXT: .LBB11_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 @@ -2433,14 +4476,14 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_4 -; GFX11-FAKE16-NEXT: .LBB11_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_4 +; GFX11-FAKE16-NEXT: .LBB22_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB11_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -2476,8 +4519,8 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x 
i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 -; GFX11-FAKE16-NEXT: .LBB11_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-FAKE16-NEXT: .LBB22_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v13, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v14, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -2531,67 +4574,417 @@ end: ret <3 x float> %phi } -define <6 x bfloat> @bitcast_v3f32_to_v6bf16(<3 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f32_to_v6bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v1 -; GCN-NEXT: v_mov_b32_e32 v6, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB12_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB12_4 -; GCN-NEXT: .LBB12_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB12_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB12_2 -; GCN-NEXT: .LBB12_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v6 -; GCN-NEXT: 
v_add_f32_e32 v1, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +define inreg <3 x float> @bitcast_v12i8_to_v3f32_scalar(<12 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i8_to_v3f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB23_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_cbranch_execnz .LBB23_3 +; SI-NEXT: .LBB23_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: 
s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB23_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; SI-NEXT: s_branch .LBB23_2 ; -; VI-LABEL: bitcast_v3f32_to_v6bf16: +; VI-LABEL: bitcast_v12i8_to_v3f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB23_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff 
+; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_cbranch_execnz .LBB23_3 +; VI-NEXT: .LBB23_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: 
s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: .LBB23_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB23_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; VI-NEXT: s_branch .LBB23_2 +; +; GFX9-LABEL: bitcast_v12i8_to_v3f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_cbranch_execnz .LBB23_3 +; GFX9-NEXT: .LBB23_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; 
GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: .LBB23_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB23_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; GFX9-NEXT: s_branch .LBB23_2 +; +; GFX11-LABEL: bitcast_v12i8_to_v3f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: 
s_or_b32 s5, s6, s8 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_lshl_b32 s6, s17, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s5, s16, 0xff +; GFX11-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s8, s9 +; GFX11-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s8, s9 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 +; GFX11-NEXT: .LBB23_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s19, 8 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-NEXT: s_and_b32 s5, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s23, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s5 +; 
GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_or_b32 s6, s2, s3 +; GFX11-NEXT: .LBB23_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; GFX11-NEXT: s_branch .LBB23_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + +define <6 x bfloat> @bitcast_v3f32_to_v6bf16(<3 x float> %a, i32 %b) { +; SI-LABEL: bitcast_v3f32_to_v6bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB24_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB24_4 +; SI-NEXT: .LBB24_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB24_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; 
SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB24_2 +; SI-NEXT: .LBB24_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v0, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v6bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v3f32_to_v6bf16: @@ -2640,63 +5033,184 @@ end: ret <6 x bfloat> %phi } +define inreg <6 x bfloat> @bitcast_v3f32_to_v6bf16_scalar(<3 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f32_to_v6bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB25_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s6, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s7, s18, 16 +; SI-NEXT: s_and_b32 s8, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s9, s17, 16 +; SI-NEXT: s_and_b32 s10, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, 
s16, 16 +; SI-NEXT: s_cbranch_execnz .LBB25_4 +; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB25_3: +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: s_branch .LBB25_2 +; SI-NEXT: .LBB25_4: +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v6bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB25_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: +; VI-NEXT: s_branch .LBB25_2 +; VI-NEXT: .LBB25_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v6bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %cmp.true +; 
GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: +; GFX9-NEXT: s_branch .LBB25_2 +; GFX9-NEXT: .LBB25_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v6bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: +; GFX11-NEXT: s_branch .LBB25_2 +; GFX11-NEXT: .LBB25_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v6bf16_to_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 
1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB13_4 -; GCN-NEXT: .LBB13_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB13_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; GCN-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB13_2 -; GCN-NEXT: .LBB13_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v3, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6bf16_to_v3f32: +; 
SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB26_4 +; SI-NEXT: .LBB26_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB26_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v7, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB26_2 +; SI-NEXT: .LBB26_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; 
SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6bf16_to_v3f32: ; VI: ; %bb.0: @@ -2705,7 +5219,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -2762,7 +5276,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2773,7 +5287,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: s_cbranch_execz .LBB26_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -2822,7 +5336,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; GFX9-NEXT: v_perm_b32 v0, v3, v0, s7 -; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: .LBB26_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2834,7 +5348,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2892,7 +5406,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 -; GFX11-TRUE16-NEXT: .LBB13_2: ; %end +; GFX11-TRUE16-NEXT: .LBB26_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2904,7 +5418,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2957,7 +5471,7 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB13_2: ; %end +; GFX11-FAKE16-NEXT: .LBB26_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -2977,59 +5491,352 @@ end: ret <3 x float> %phi } +define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6bf16_to_v3f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 +; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_branch .LBB27_2 +; +; VI-LABEL: bitcast_v6bf16_to_v3f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB27_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB27_4 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; 
VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 
0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB27_3: +; VI-NEXT: s_branch .LBB27_2 +; VI-NEXT: .LBB27_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v3f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB27_4 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB27_3: +; GFX9-NEXT: s_branch .LBB27_2 +; GFX9-NEXT: .LBB27_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v3f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB27_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB27_4 
+; GFX11-NEXT: .LBB27_2: ; %cmp.true +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: 
v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_and_b32 v1, 0xffff, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v2, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v4, 16, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB27_3: +; GFX11-NEXT: s_branch .LBB27_2 +; GFX11-NEXT: .LBB27_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + define <6 x half> @bitcast_v3f32_to_v6f16(<3 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f32_to_v6f16: -; GCN: ; 
%bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v1 -; GCN-NEXT: v_mov_b32_e32 v6, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB14_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB14_4 -; GCN-NEXT: .LBB14_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB14_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v6 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB14_2 -; GCN-NEXT: .LBB14_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v6 -; GCN-NEXT: v_add_f32_e32 v3, 1.0, v7 -; GCN-NEXT: v_add_f32_e32 v5, 1.0, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f32_to_v6f16: +; SI: ; %bb.0: +; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB28_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB28_4 +; SI-NEXT: .LBB28_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB28_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB28_2 +; SI-NEXT: .LBB28_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6f16: ; VI: ; %bb.0: @@ -3092,69 +5899,189 @@ end: ret <6 x half> %phi } +define inreg <6 x half> 
@bitcast_v3f32_to_v6f16_scalar(<3 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f32_to_v6f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB29_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s4, s18, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_lshr_b32 s4, s16, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: s_cbranch_execnz .LBB29_3 +; SI-NEXT: .LBB29_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s18, 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: .LBB29_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB29_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB29_2 +; +; VI-LABEL: bitcast_v3f32_to_v6f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB29_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: +; VI-NEXT: s_branch .LBB29_2 +; VI-NEXT: .LBB29_4: +; VI-NEXT: 
v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v6f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: +; GFX9-NEXT: s_branch .LBB29_2 +; GFX9-NEXT: .LBB29_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v6f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: +; GFX11-NEXT: s_branch .LBB29_2 +; GFX11-NEXT: .LBB29_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true 
], [ %a3, %cmp.false ] + ret <6 x half> %phi +} + define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f16_to_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB15_4 -; GCN-NEXT: .LBB15_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB15_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_or_b32_e32 v0, v7, v0 -; GCN-NEXT: v_or_b32_e32 v1, v6, v1 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB15_2 -; GCN-NEXT: .LBB15_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v3, 
0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v4, v2 -; GCN-NEXT: v_or_b32_e32 v2, v3, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f16_to_v3f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB30_4 +; SI-NEXT: .LBB30_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB30_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB30_2 +; SI-NEXT: .LBB30_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v3f32: ; VI: ; %bb.0: @@ -3163,7 +6090,7 @@ define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: s_cbranch_execz .LBB30_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 0x200 ; VI-NEXT: v_add_f16_sdwa v4, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3175,7 +6102,7 @@ define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 -; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: .LBB30_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3227,40 +6154,182 @@ end: ret <3 x float> %phi } +define inreg <3 x float> @bitcast_v6f16_to_v3f32_scalar(<6 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f16_to_v3f32_scalar: +; SI: ; %bb.0: +; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v8, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB31_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_cbranch_execnz .LBB31_3 +; SI-NEXT: .LBB31_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: .LBB31_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB31_4: +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_branch .LBB31_2 +; +; VI-LABEL: bitcast_v6f16_to_v3f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB31_3 +; VI-NEXT: ; %bb.1: ; 
%cmp.false +; VI-NEXT: s_cbranch_execnz .LBB31_4 +; VI-NEXT: .LBB31_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s17, v0 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v1, v3, v1 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_add_f16_sdwa v3, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB31_3: +; VI-NEXT: s_branch .LBB31_2 +; VI-NEXT: .LBB31_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v3f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB31_4 +; GFX9-NEXT: .LBB31_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB31_3: +; GFX9-NEXT: s_branch .LBB31_2 +; GFX9-NEXT: .LBB31_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v3f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB31_4 +; GFX11-NEXT: .LBB31_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: +; GFX11-NEXT: s_branch .LBB31_2 +; GFX11-NEXT: .LBB31_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + define <6 x i16> @bitcast_v3f32_to_v6i16(<3 x float> %a, i32 %b) { -; GCN-LABEL: bitcast_v3f32_to_v6i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB16_4 -; GCN-NEXT: .LBB16_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB16_3: ; %cmp.false -; GCN-NEXT: v_alignbit_b32 v5, s4, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB16_2 -; GCN-NEXT: .LBB16_4: ; %cmp.true -; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_alignbit_b32 v5, s4, v4, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v3f32_to_v6i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB32_4 +; SI-NEXT: .LBB32_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB32_3: ; %cmp.false +; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB32_2 +; SI-NEXT: .LBB32_4: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6i16: ; VI: ; %bb.0: @@ -3323,56 +6392,168 @@ end: ret <6 x i16> %phi } +define inreg <6 x i16> @bitcast_v3f32_to_v6i16_scalar(<3 x float> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v3f32_to_v6i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
s_cmp_lg_u32 s19, 0 +; SI-NEXT: s_cbranch_scc0 .LBB33_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v5, s4, v0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 +; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_cbranch_execnz .LBB33_4 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_3: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB33_2 +; SI-NEXT: .LBB33_4: +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v6i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB33_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB33_4 +; VI-NEXT: .LBB33_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v2, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB33_3: +; VI-NEXT: s_branch .LBB33_2 +; VI-NEXT: .LBB33_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v6i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB33_4 +; GFX9-NEXT: .LBB33_2: ; %cmp.true +; GFX9-NEXT: v_add_f32_e64 v2, s18, 1.0 
+; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB33_3: +; GFX9-NEXT: s_branch .LBB33_2 +; GFX9-NEXT: .LBB33_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v6i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB33_4 +; GFX11-NEXT: .LBB33_2: ; %cmp.true +; GFX11-NEXT: v_add_f32_e64 v2, s2, 1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: +; GFX11-NEXT: s_branch .LBB33_2 +; GFX11-NEXT: .LBB33_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i16_to_v3f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v2 -; GCN-NEXT: v_mov_b32_e32 v7, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB17_4 -; GCN-NEXT: .LBB17_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB17_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v6 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB17_2 -; GCN-NEXT: .LBB17_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v6, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v1 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i16_to_v3f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB34_4 +; SI-NEXT: .LBB34_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB34_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB34_2 +; SI-NEXT: .LBB34_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v0, v9, v0 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v3f32: ; VI: ; %bb.0: @@ -3381,7 +6562,7 @@ define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 3 ; VI-NEXT: v_add_u16_e32 v3, 3, v2 @@ -3393,7 +6574,7 @@ define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v3, 3, v0 
; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3444,111 +6625,241 @@ end: ret <3 x float> %phi } +define inreg <3 x float> @bitcast_v6i16_to_v3f32_scalar(<6 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i16_to_v3f32_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB35_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_cbranch_execnz .LBB35_3 +; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: .LBB35_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB35_4: +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6 +; SI-NEXT: s_branch .LBB35_2 +; +; VI-LABEL: bitcast_v6i16_to_v3f32_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB35_4 +; 
VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: .LBB35_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB35_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_4: +; VI-NEXT: s_branch .LBB35_2 +; +; GFX9-LABEL: bitcast_v6i16_to_v3f32_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: +; GFX9-NEXT: s_branch .LBB35_2 +; GFX9-NEXT: .LBB35_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v3f32_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %cmp.true +; GFX11-NEXT: 
v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: +; GFX11-NEXT: s_branch .LBB35_2 +; GFX11-NEXT: .LBB35_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i8_to_v6bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v11 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v17 -; GCN-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GCN-NEXT: v_or_b32_e32 v11, v1, v0 -; GCN-NEXT: v_or_b32_e32 v13, v14, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_or_b32_e32 v3, v15, v4 -; GCN-NEXT: v_or_b32_e32 v12, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v16, v8 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB18_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB18_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v9 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v10 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GCN-NEXT: v_or_b32_e32 v4, v17, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v3 -; GCN-NEXT: v_or_b32_e32 v3, v16, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_or_b32_e32 v5, v15, v6 -; GCN-NEXT: 
v_add_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_or_b32_e32 v1, v14, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GCN-NEXT: .LBB18_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v11 -; GCN-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NEXT: v_mov_b32_e32 v2, v7 -; GCN-NEXT: v_mov_b32_e32 v4, v12 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i8_to_v6bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: 
v_lshlrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v3, v7, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; SI-NEXT: v_or_b32_e32 v11, v4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v5, v16, v2 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: .LBB36_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB36_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: 
v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v8 +; SI-NEXT: .LBB36_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v2, v12 +; SI-NEXT: v_mov_b32_e32 v4, v11 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v6bf16: ; VI: ; %bb.0: @@ -3565,14 +6876,14 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB18_3 +; VI-NEXT: s_cbranch_execnz .LBB36_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB18_4 -; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB36_4 +; VI-NEXT: .LBB36_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB18_3: ; %cmp.false +; VI-NEXT: .LBB36_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD @@ -3595,8 +6906,8 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB18_2 -; VI-NEXT: .LBB18_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: .LBB36_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v13 ; VI-NEXT: v_add_u16_e32 v1, 3, v14 ; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3637,14 +6948,14 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB18_3 +; GFX9-NEXT: s_cbranch_execnz .LBB36_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB18_4 -; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB36_4 +; GFX9-NEXT: .LBB36_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB18_3: ; %cmp.false +; GFX9-NEXT: .LBB36_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3667,8 +6978,8 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB18_2 -; GFX9-NEXT: .LBB18_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: .LBB36_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 ; GFX9-NEXT: 
v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -3713,14 +7024,14 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_4 -; GFX11-TRUE16-NEXT: .LBB18_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_4 +; GFX11-TRUE16-NEXT: .LBB36_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB18_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB36_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h @@ -3758,8 +7069,8 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-TRUE16-NEXT: .LBB18_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-TRUE16-NEXT: .LBB36_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 @@ -3813,14 +7124,14 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_4 -; GFX11-FAKE16-NEXT: .LBB18_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_4 +; GFX11-FAKE16-NEXT: .LBB36_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB18_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB36_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -3856,8 +7167,8 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB18_2 -; GFX11-FAKE16-NEXT: .LBB18_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-FAKE16-NEXT: .LBB36_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v13, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v14, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -3911,88 +7222,449 @@ end: ret <6 x bfloat> %phi } +define inreg <6 x bfloat> @bitcast_v12i8_to_v6bf16_scalar(<12 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i8_to_v6bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s17, 24 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s8, s4, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_or_b32 s9, s5, s4 +; 
SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s25, 24 +; SI-NEXT: s_or_b32 s10, s5, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_or_b32 s11, s5, s4 +; SI-NEXT: s_cbranch_execnz .LBB37_3 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s18, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s5, 16 +; SI-NEXT: s_and_b32 s11, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s4, 16 +; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: 
v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB37_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: s_branch .LBB37_2 +; +; VI-LABEL: bitcast_v12i8_to_v6bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB37_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_cbranch_execnz .LBB37_3 +; VI-NEXT: .LBB37_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 
0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: .LBB37_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB37_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB37_2 +; +; GFX9-LABEL: bitcast_v12i8_to_v6bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: 
s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_cbranch_execnz .LBB37_3 +; GFX9-NEXT: .LBB37_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: .LBB37_3: ; %end 
+; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB37_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB37_2 +; +; GFX11-LABEL: bitcast_v12i8_to_v6bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB37_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_lshl_b32 s6, s17, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s5, s16, 0xff +; GFX11-NEXT: s_and_b32 s7, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s9 +; GFX11-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s9 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB37_3 +; GFX11-NEXT: .LBB37_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 
s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s19, 8 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-NEXT: s_and_b32 s5, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s23, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s5 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_or_b32 s6, s2, s3 +; GFX11-NEXT: .LBB37_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB37_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB37_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v6bf16_to_v12i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v1 -; 
GCN-NEXT: v_mul_f32_e32 v15, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB19_4 -; GCN-NEXT: .LBB19_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB19_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; GCN-NEXT: v_alignbit_b32 v0, v0, v15, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v14, 16 -; GCN-NEXT: v_alignbit_b32 v8, v10, v12, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB19_2 -; GCN-NEXT: .LBB19_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; GCN-NEXT: v_and_b32_e32 v1, 
0xffff0000, v17 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v6, v2, 16 -; GCN-NEXT: v_alignbit_b32 v8, v10, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v11 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6bf16_to_v12i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v4 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 
+; SI-NEXT: s_cbranch_execnz .LBB38_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB38_4 +; SI-NEXT: .LBB38_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB38_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v15, 16 +; SI-NEXT: v_alignbit_b32 v8, v10, v13, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB38_2 +; SI-NEXT: .LBB38_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16 +; SI-NEXT: v_alignbit_b32 
v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6bf16_to_v12i8: ; VI: ; %bb.0: @@ -4012,7 +7684,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v13 @@ -4023,9 +7695,9 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; VI-NEXT: .LBB19_2: ; %Flow +; VI-NEXT: .LBB38_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB19_4 +; VI-NEXT: s_cbranch_execz .LBB38_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -4092,7 +7764,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; VI-NEXT: .LBB19_4: ; %end +; VI-NEXT: .LBB38_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v15 ; VI-NEXT: v_mov_b32_e32 v4, v16 @@ -4117,7 +7789,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; 
%bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -4128,9 +7800,9 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB19_2: ; %Flow +; GFX9-NEXT: .LBB38_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: s_cbranch_execz .LBB38_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -4192,7 +7864,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX9-NEXT: .LBB19_4: ; %end +; GFX9-NEXT: .LBB38_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v13 ; GFX9-NEXT: v_mov_b32_e32 v4, v14 @@ -4216,7 +7888,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v3 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v13 @@ -4228,9 +7900,9 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v13.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v13.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v10.l -; GFX11-TRUE16-NEXT: .LBB19_2: ; %Flow +; GFX11-TRUE16-NEXT: .LBB38_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB19_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; 
%cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v12 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v13 @@ -4295,7 +7967,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; GFX11-TRUE16-NEXT: .LBB19_4: ; %end +; GFX11-TRUE16-NEXT: .LBB38_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.h @@ -4325,7 +7997,7 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -4336,9 +8008,9 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB19_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB38_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB19_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB38_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v14 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v8 @@ -4384,30 +8056,588 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; 
GFX11-FAKE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v3, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v3, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v6, v7, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v4, v5, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-FAKE16-NEXT: .LBB38_4: ; %end +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> 
%phi +} + +define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6bf16_to_v12i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 +; SI-NEXT: s_cbranch_scc0 .LBB39_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_alignbit_b32 v0, v0, v17, 16 +; SI-NEXT: v_alignbit_b32 v4, v6, v15, 16 +; SI-NEXT: v_alignbit_b32 v8, v10, v13, 16 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: s_cbranch_execnz .LBB39_3 +; SI-NEXT: .LBB39_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 +; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16 +; SI-NEXT: 
v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v11 +; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB39_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB39_2 +; +; VI-LABEL: bitcast_v6bf16_to_v12i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB39_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s19, s16, 8 +; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s13, s17, 16 +; VI-NEXT: s_lshr_b32 s15, s17, 8 +; VI-NEXT: s_lshr_b32 s14, s16, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB39_4 +; VI-NEXT: .LBB39_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 
0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v8, v0, v1, 16 +; VI-NEXT: v_mov_b32_e32 v9, 0x7fc07fc0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v15 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, 
v15 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; VI-NEXT: s_branch .LBB39_5 +; VI-NEXT: .LBB39_3: +; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB39_2 +; VI-NEXT: .LBB39_4: +; VI-NEXT: v_mov_b32_e32 v14, s16 +; VI-NEXT: v_mov_b32_e32 v15, s17 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: v_mov_b32_e32 v2, s14 +; VI-NEXT: v_mov_b32_e32 v5, s15 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v13, s11 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB39_5: ; %end +; VI-NEXT: v_mov_b32_e32 v0, v14 +; VI-NEXT: v_mov_b32_e32 v4, v15 +; VI-NEXT: v_mov_b32_e32 v9, v13 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v12i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s19, s17, 16 +; GFX9-NEXT: s_lshr_b32 s15, s18, 16 +; GFX9-NEXT: s_lshr_b32 s14, s18, 8 +; GFX9-NEXT: s_lshr_b32 s10, s17, 24 +; GFX9-NEXT: s_lshr_b32 s11, s17, 8 +; GFX9-NEXT: s_lshr_b32 s13, s16, 16 +; GFX9-NEXT: s_lshr_b32 s12, s16, 8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; 
GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v3 +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v3 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, 
v5, v3 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v11, v4, 16, v3 +; GFX9-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, v13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr19 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB39_2 +; GFX9-NEXT: .LBB39_4: +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v6, s19 +; GFX9-NEXT: v_mov_b32_e32 v10, s15 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v12i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB39_3 +; 
GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-TRUE16-NEXT: .LBB39_2: ; %cmp.true +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s1, 0, s2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v14, 0x7fc07fc0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v10, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v2, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v0, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v11 :: v_dual_add_nc_u32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v8, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v0, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v3, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v8 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v6, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v5, 16, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v9, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-TRUE16-NEXT: s_branch .LBB39_5 +; GFX11-TRUE16-NEXT: .LBB39_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB39_2 +; GFX11-TRUE16-NEXT: .LBB39_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s6 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v11.l, s4 +; GFX11-TRUE16-NEXT: .LBB39_5: ; %end +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v12i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB39_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB39_4 +; GFX11-FAKE16-NEXT: .LBB39_2: ; %cmp.true +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s3, 0, s0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s1, 0, s2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: 
v_add_f32_e64 v8, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v0, v8, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v11, vcc_lo -; GFX11-FAKE16-NEXT: v_perm_b32 v1, v6, v7, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v2, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v0, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v0, 
v11, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v9 :: v_dual_add_nc_u32 v3, v3, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v13 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_perm_b32 v11, v4, v5, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v6, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v7, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 ; GFX11-FAKE16-NEXT: 
v_lshrrev_b32_e32 v9, 8, v11 ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-FAKE16-NEXT: .LBB19_4: ; %end -; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-NEXT: .LBB39_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB39_2 +; GFX11-FAKE16-NEXT: .LBB39_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v11, s6 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -4427,97 +8657,97 @@ end: } define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i8_to_v6f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v9 -; 
GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v11 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_2 -; GCN-NEXT: ; %bb.1: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v14 -; GCN-NEXT: v_or_b32_e32 v2, v2, v15 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v4, v4, v16 -; GCN-NEXT: v_or_b32_e32 v5, v5, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: .LBB20_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB20_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v10 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v13 -; 
GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v0, v17, v0 -; GCN-NEXT: v_or_b32_e32 v1, v16, v1 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_or_b32_e32 v4, v15, v4 -; GCN-NEXT: v_or_b32_e32 v2, v14, v2 -; GCN-NEXT: v_or_b32_e32 v5, v12, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v0 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GCN-NEXT: .LBB20_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v11 -; GCN-NEXT: v_mov_b32_e32 v4, v9 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i8_to_v6f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v4, v4, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_or_b32_e32 v4, v4, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: .LBB40_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB40_4 +; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v11, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v16, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, 
v15, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v12, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: .LBB40_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, v9 +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v6f16: ; VI: ; %bb.0: @@ -4534,14 +8764,14 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB20_3 +; VI-NEXT: s_cbranch_execnz .LBB40_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB20_4 -; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB40_4 +; VI-NEXT: .LBB40_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB20_3: ; %cmp.false +; VI-NEXT: .LBB40_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4564,8 +8794,8 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB20_2 -; VI-NEXT: .LBB20_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: .LBB40_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v13 ; VI-NEXT: v_add_u16_e32 
v1, 3, v14 ; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4606,14 +8836,14 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB20_3 +; GFX9-NEXT: s_cbranch_execnz .LBB40_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB20_4 -; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB40_4 +; GFX9-NEXT: .LBB40_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB20_3: ; %cmp.false +; GFX9-NEXT: .LBB40_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4636,8 +8866,8 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB20_2 -; GFX9-NEXT: .LBB20_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: .LBB40_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -4682,14 +8912,14 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB20_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_3 
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB20_4 -; GFX11-TRUE16-NEXT: .LBB20_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_4 +; GFX11-TRUE16-NEXT: .LBB40_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB20_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB40_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h @@ -4727,8 +8957,8 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-TRUE16-NEXT: .LBB20_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-TRUE16-NEXT: .LBB40_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 @@ -4782,14 +9012,14 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_4 -; GFX11-FAKE16-NEXT: .LBB20_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_4 +; GFX11-FAKE16-NEXT: .LBB40_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB20_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB40_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v13 ; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -4825,8 +9055,8 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2 -; GFX11-FAKE16-NEXT: .LBB20_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-FAKE16-NEXT: .LBB40_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v13, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v14, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -4880,90 +9110,437 @@ end: ret <6 x half> %phi } +define inreg <6 x half> @bitcast_v12i8_to_v6f16_scalar(<12 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i8_to_v6f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB41_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s19, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s27, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: s_cbranch_execnz .LBB41_3 +; SI-NEXT: .LBB41_2: ; %cmp.true +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, 
s27, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s23, 8 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s8, s21, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s18, 0xff +; SI-NEXT: s_lshl_b32 s9, s19, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s16, 0xff +; SI-NEXT: s_lshl_b32 s10, s17, 8 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: .LBB41_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB41_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB41_2 +; +; VI-LABEL: bitcast_v12i8_to_v6f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB41_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; 
VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: .LBB41_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: 
s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: .LBB41_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB41_2 +; +; GFX9-LABEL: bitcast_v12i8_to_v6f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: .LBB41_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, 
s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: .LBB41_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB41_2 +; +; GFX11-LABEL: bitcast_v12i8_to_v6f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB41_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_lshl_b32 s6, s17, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s5, s16, 0xff +; GFX11-NEXT: s_and_b32 s7, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, 
s19, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s9 +; GFX11-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s9 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB41_3 +; GFX11-NEXT: .LBB41_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s19, 8 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-NEXT: s_and_b32 s5, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s23, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s5 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: 
s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_or_b32 s6, s2, s3 +; GFX11-NEXT: .LBB41_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB41_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} + define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f16_to_v12i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB21_4 -; GCN-NEXT: .LBB21_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB21_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 
v1, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v14, v0 -; GCN-NEXT: v_or_b32_e32 v4, v13, v1 -; GCN-NEXT: v_or_b32_e32 v8, v12, v2 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; GCN-NEXT: v_bfe_u32 v11, v10, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB21_2 -; GCN-NEXT: .LBB21_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v12 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v4, v2, v4 -; GCN-NEXT: v_or_b32_e32 v8, v3, v5 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; GCN-NEXT: v_bfe_u32 v11, v10, 8, 8 -; GCN-NEXT: 
s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f16_to_v12i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v4 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB42_4 +; SI-NEXT: .LBB42_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB42_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: v_or_b32_e32 v4, v13, v1 +; SI-NEXT: v_or_b32_e32 v8, v12, v7 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB42_2 +; SI-NEXT: .LBB42_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 
v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v1, v2 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v12i8: ; VI: ; %bb.0: @@ -4992,7 +9569,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 ; VI-NEXT: ; %bb.2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: s_cbranch_execz .LBB42_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 0x200 ; VI-NEXT: v_add_f16_sdwa v6, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5014,7 +9591,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; VI-NEXT: .LBB21_4: ; 
%end +; VI-NEXT: .LBB42_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v13 ; VI-NEXT: v_mov_b32_e32 v4, v14 @@ -5038,7 +9615,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: s_cbranch_execz .LBB42_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v13 @@ -5049,9 +9626,9 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: .LBB42_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] @@ -5067,7 +9644,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: .LBB42_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v15 ; GFX9-NEXT: v_mov_b32_e32 v4, v16 @@ -5098,7 +9675,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v12 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] @@ -5112,7 +9689,7 @@ 
define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v12 -; GFX11-TRUE16-NEXT: .LBB21_4: ; %end +; GFX11-TRUE16-NEXT: .LBB42_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.h @@ -5141,7 +9718,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v13 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v13 @@ -5152,9 +9729,9 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; GFX11-FAKE16-NEXT: .LBB21_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB42_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB42_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] @@ -5171,7 +9748,7 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v16 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v15 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; GFX11-FAKE16-NEXT: .LBB21_4: ; %end +; GFX11-FAKE16-NEXT: .LBB42_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v15 ; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v16 @@ -5194,115 +9771,462 @@ end: ret <12 x i8> %phi } +define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f16_to_v12i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_or_b32_e32 v0, v14, v0 +; SI-NEXT: v_or_b32_e32 v4, v13, v1 +; SI-NEXT: v_or_b32_e32 v8, v12, v7 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 +; SI-NEXT: s_cbranch_execnz .LBB43_3 +; SI-NEXT: .LBB43_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v8, v1, v2 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 +; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB43_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_branch .LBB43_2 +; +; VI-LABEL: bitcast_v6f16_to_v12i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB43_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s19, s16, 16 +; VI-NEXT: s_lshr_b32 s14, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s13, s17, 24 +; VI-NEXT: s_lshr_b32 s15, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s12, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB43_4 +; VI-NEXT: .LBB43_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_e32 v6, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_f16_e32 v13, s17, v1 +; VI-NEXT: v_add_f16_e32 v2, s4, v1 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v12, v13, v0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, s16, v1 +; VI-NEXT: v_add_f16_e32 v10, s4, v1 
+; VI-NEXT: v_or_b32_e32 v11, v0, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; VI-NEXT: v_add_f16_e32 v8, s18, v1 +; VI-NEXT: v_or_b32_e32 v14, v8, v3 +; VI-NEXT: v_mov_b32_e32 v15, 0x7e007e00 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[14:15] +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v14 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: v_mov_b32_e32 v4, v13 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_3: +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB43_2 +; VI-NEXT: .LBB43_4: +; VI-NEXT: v_mov_b32_e32 v2, s19 +; VI-NEXT: v_mov_b32_e32 v6, s15 +; VI-NEXT: v_mov_b32_e32 v10, s14 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v7, s13 +; VI-NEXT: v_mov_b32_e32 v9, s11 +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: v_mov_b32_e32 v5, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v12i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s19, s16, 8 +; GFX9-NEXT: s_lshr_b32 s10, s18, 16 +; GFX9-NEXT: s_lshr_b32 s11, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s13, s17, 16 +; GFX9-NEXT: s_lshr_b32 s15, s17, 8 +; GFX9-NEXT: s_lshr_b32 s14, s16, 16 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: 
.LBB43_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v9, 0x7e007e00 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; GFX9-NEXT: s_branch .LBB43_5 +; GFX9-NEXT: .LBB43_3: +; GFX9-NEXT: ; implicit-def: $sgpr19 +; GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB43_2 +; GFX9-NEXT: .LBB43_4: +; GFX9-NEXT: v_mov_b32_e32 v14, s16 +; GFX9-NEXT: v_mov_b32_e32 v15, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-NEXT: v_mov_b32_e32 v10, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB43_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-NEXT: v_mov_b32_e32 v4, v15 +; GFX9-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v6f16_to_v12i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 
.LBB43_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, 0x7e007e00 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[8:9], 24, v[3:4] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB43_5 +; GFX11-TRUE16-NEXT: .LBB43_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB43_2 +; GFX11-TRUE16-NEXT: .LBB43_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 +; GFX11-TRUE16-NEXT: .LBB43_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v6f16_to_v12i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_4 +; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, s1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, s0 
op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, s2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, 0x7e007e00 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; GFX11-FAKE16-NEXT: s_branch .LBB43_5 +; GFX11-FAKE16-NEXT: .LBB43_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB43_2 +; GFX11-FAKE16-NEXT: .LBB43_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v15, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s9 :: v_dual_mov_b32 v10, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB43_5: ; %end +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v14 :: v_dual_mov_b32 v9, v13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v15 +; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { -; GCN-LABEL: bitcast_v12i8_to_v6i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v14, v4 -; GCN-NEXT: v_mov_b32_e32 v15, v2 -; GCN-NEXT: v_mov_b32_e32 v13, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 24, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB22_4 -; GCN-NEXT: .LBB22_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB22_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v14 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v15 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v13 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 -; GCN-NEXT: v_or_b32_e32 v0, v0, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v3, v18 -; GCN-NEXT: 
v_or_b32_e32 v4, v4, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v7, v1 -; GCN-NEXT: v_or_b32_e32 v6, v12, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_or_b32_e32 v2, v0, v1 -; GCN-NEXT: v_or_b32_e32 v0, v3, v6 -; GCN-NEXT: v_or_b32_e32 v4, v4, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v2, v6, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr15 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB22_2 -; GCN-NEXT: .LBB22_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 -; GCN-NEXT: s_movk_i32 s6, 0x300 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v15 -; GCN-NEXT: s_mov_b32 s7, 0x3000000 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v14 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GCN-NEXT: v_or_b32_e32 v0, v18, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v4, v17, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; GCN-NEXT: v_or_b32_e32 v1, v12, v1 -; 
GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_or_b32_e32 v3, v7, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x300, v4 -; GCN-NEXT: v_or_b32_e32 v5, v9, v5 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v3 -; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v12i8_to_v6i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_mov_b32_e32 v15, v2 +; SI-NEXT: v_mov_b32_e32 v13, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v11 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB44_4 +; SI-NEXT: .LBB44_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB44_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v12, v1 +; SI-NEXT: v_or_b32_e32 v2, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v4, v4, v18 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v4, v4, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB44_2 +; SI-NEXT: .LBB44_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v15 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_or_b32_e32 v1, v16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v12i8_to_v6i16: ; VI: ; %bb.0: @@ -5319,14 +10243,14 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB22_3 +; VI-NEXT: s_cbranch_execnz .LBB44_3 ; VI-NEXT: ; %bb.1: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB22_4 -; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_cbranch_execnz .LBB44_4 +; VI-NEXT: .LBB44_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB22_3: ; %cmp.false +; VI-NEXT: .LBB44_3: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5349,8 +10273,8 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 
x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB22_2 -; VI-NEXT: .LBB22_4: ; %cmp.true +; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: .LBB44_4: ; %cmp.true ; VI-NEXT: v_add_u16_e32 v0, 3, v13 ; VI-NEXT: v_add_u16_e32 v1, 3, v14 ; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5391,14 +10315,14 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB22_3 +; GFX9-NEXT: s_cbranch_execnz .LBB44_3 ; GFX9-NEXT: ; %bb.1: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB22_4 -; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: s_cbranch_execnz .LBB44_4 +; GFX9-NEXT: .LBB44_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] -; GFX9-NEXT: .LBB22_3: ; %cmp.false +; GFX9-NEXT: .LBB44_3: ; %cmp.false ; GFX9-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -5421,8 +10345,8 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB22_2 -; GFX9-NEXT: .LBB22_4: ; %cmp.true +; GFX9-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-NEXT: .LBB44_4: ; %cmp.true ; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 ; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ 
-5467,14 +10391,14 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_3 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_4 -; GFX11-TRUE16-NEXT: .LBB22_2: ; %end +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_4 +; GFX11-TRUE16-NEXT: .LBB44_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false +; GFX11-TRUE16-NEXT: .LBB44_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h @@ -5512,8 +10436,8 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 @@ -5567,14 +10491,14 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_3 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_3 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_4 -; GFX11-FAKE16-NEXT: 
.LBB22_2: ; %end +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_4 +; GFX11-FAKE16-NEXT: .LBB44_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; GFX11-FAKE16-NEXT: .LBB22_3: ; %cmp.false +; GFX11-FAKE16-NEXT: .LBB44_3: ; %cmp.false ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v4 @@ -5610,8 +10534,8 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2 -; GFX11-FAKE16-NEXT: .LBB22_4: ; %cmp.true +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-FAKE16-NEXT: .LBB44_4: ; %cmp.true ; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v13, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v1, v14, 3 ; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v4, 3 @@ -5665,89 +10589,454 @@ end: ret <6 x i16> %phi } +define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v12i8_to_v6i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s28, 0 +; SI-NEXT: s_cbranch_scc0 .LBB45_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_or_b32 s4, s6, s4 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s8, s17, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_or_b32 s6, s6, s4 +; SI-NEXT: 
s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s8, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s8 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s27, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s10, s9, s8 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 16 +; SI-NEXT: s_or_b32 s8, s4, s10 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s10, s10, 16 +; SI-NEXT: s_cbranch_execnz .LBB45_3 +; SI-NEXT: .LBB45_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x3000000 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s8, s4, 0x3000000 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 16 +; SI-NEXT: s_lshr_b32 s9, s7, 16 +; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: .LBB45_3: ; 
%end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: s_branch .LBB45_2 +; +; VI-LABEL: bitcast_v12i8_to_v6i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s28, 0 +; VI-NEXT: s_cbranch_scc0 .LBB45_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_cbranch_execnz .LBB45_3 +; VI-NEXT: .LBB45_2: ; %cmp.true +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: 
s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s20, 0xff +; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: .LBB45_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB45_4: +; VI-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; VI-NEXT: s_branch .LBB45_2 +; +; GFX9-LABEL: bitcast_v12i8_to_v6i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; 
GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_cbranch_execnz .LBB45_3 +; GFX9-NEXT: .LBB45_2: ; %cmp.true +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: 
.LBB45_3: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB45_4: +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX9-NEXT: s_branch .LBB45_2 +; +; GFX11-LABEL: bitcast_v12i8_to_v6i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s24, 0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB45_4 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s1, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s3, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_or_b32 s5, s6, s7 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_lshl_b32 s6, s17, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s5, s16, 0xff +; GFX11-NEXT: s_and_b32 s7, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s9 +; GFX11-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-NEXT: s_and_b32 s10, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s23, 8 +; GFX11-NEXT: s_or_b32 s7, s7, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 16 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s6, s7, s9 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX11-NEXT: s_cbranch_vccnz .LBB45_3 +; GFX11-NEXT: .LBB45_2: ; %cmp.true +; GFX11-NEXT: s_add_i32 s0, s0, 3 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; 
GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_add_i32 s16, s16, 3 +; GFX11-NEXT: s_or_b32 s4, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s17, 8 +; GFX11-NEXT: s_add_i32 s18, s18, 3 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s19, 8 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s22, s22, 3 +; GFX11-NEXT: s_or_b32 s1, s2, s1 +; GFX11-NEXT: s_and_b32 s2, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s21, 8 +; GFX11-NEXT: s_and_b32 s5, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s23, 8 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s6, s5 +; GFX11-NEXT: s_addk_i32 s0, 0x300 +; GFX11-NEXT: s_addk_i32 s1, 0x300 +; GFX11-NEXT: s_addk_i32 s2, 0x300 +; GFX11-NEXT: s_addk_i32 s3, 0x300 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s5, s0, s1 +; GFX11-NEXT: s_or_b32 s6, s2, s3 +; GFX11-NEXT: .LBB45_3: ; %end +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB45_4: +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX11-NEXT: s_branch .LBB45_2 + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i16_to_v12i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v5 -; GCN-NEXT: v_mov_b32_e32 v12, v4 -; GCN-NEXT: 
v_mov_b32_e32 v16, v3 -; GCN-NEXT: v_mov_b32_e32 v13, v2 -; GCN-NEXT: v_mov_b32_e32 v14, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v15 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB23_4 -; GCN-NEXT: .LBB23_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB23_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-NEXT: v_bfe_u32 v7, v16, 8, 8 -; GCN-NEXT: v_or_b32_e32 v0, v0, v17 -; GCN-NEXT: v_or_b32_e32 v4, v1, v18 -; GCN-NEXT: v_or_b32_e32 v8, v2, v19 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; GCN-NEXT: v_bfe_u32 v11, v15, 8, 8 -; GCN-NEXT: ; implicit-def: $vgpr14 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr13 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz 
.LBB23_2 -; GCN-NEXT: .LBB23_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v14 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v13 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v12 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v0, v17, v0 -; GCN-NEXT: v_or_b32_e32 v1, v18, v1 -; GCN-NEXT: v_or_b32_e32 v2, v19, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v2 -; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i16_to_v12i8: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v15, v5 +; SI-NEXT: v_mov_b32_e32 v16, v3 +; SI-NEXT: v_mov_b32_e32 v12, v4 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; 
SI-NEXT: s_cbranch_execnz .LBB46_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB46_4 +; SI-NEXT: .LBB46_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB46_3: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 +; SI-NEXT: v_or_b32_e32 v4, v1, v18 +; SI-NEXT: v_or_b32_e32 v8, v6, v17 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; SI-NEXT: v_bfe_u32 v7, v16, 8, 8 +; SI-NEXT: v_bfe_u32 v11, v15, 8, 8 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB46_2 +; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_or_b32_e32 v1, v17, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 
+; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v12i8: ; VI: ; %bb.0: @@ -5767,7 +11056,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 @@ -5780,9 +11069,9 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v8, v2 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: .LBB46_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: s_cbranch_execz .LBB46_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 3 ; VI-NEXT: v_add_u16_sdwa v6, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -5803,7 +11092,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v0 ; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 -; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: .LBB46_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v16 ; VI-NEXT: v_mov_b32_e32 v1, v15 @@ -5829,7 +11118,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: s_cbranch_execz .LBB46_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -5840,9 +11129,9 @@ define <12 x i8> 
@bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: .LBB46_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -5856,7 +11145,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: .LBB46_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v13 ; GFX9-NEXT: v_mov_b32_e32 v4, v14 @@ -5886,7 +11175,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v11 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_4 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -5899,7 +11188,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v11 -; GFX11-TRUE16-NEXT: .LBB23_4: ; %end +; GFX11-TRUE16-NEXT: .LBB46_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h @@ -5928,7 +11217,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -5939,9 +11228,9 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB23_2: ; %Flow +; GFX11-FAKE16-NEXT: .LBB46_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_4 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB46_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -5956,7 +11245,7 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v14 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v13 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 -; GFX11-FAKE16-NEXT: .LBB23_4: ; %end +; GFX11-FAKE16-NEXT: .LBB46_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 @@ -5978,80 +11267,422 @@ end: ret <12 x i8> %phi } +define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i16_to_v12i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB47_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 
+; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: v_alignbit_b32 v3, s7, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s7, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 8 +; SI-NEXT: s_lshr_b32 s9, s7, 8 +; SI-NEXT: s_lshr_b32 s12, s8, 8 +; SI-NEXT: s_and_b32 s10, s19, 0xffff +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_bfe_u32 s11, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s14, s21, 0x80008 +; SI-NEXT: s_cbranch_execnz .LBB47_3 +; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_alignbit_b32 v3, s7, v0, 24 +; SI-NEXT: v_alignbit_b32 v2, s7, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, s7, v0, 8 +; SI-NEXT: s_lshr_b32 s11, s7, 24 +; SI-NEXT: s_lshr_b32 s10, s7, 16 +; SI-NEXT: s_lshr_b32 s9, s7, 8 +; SI-NEXT: s_lshr_b32 s14, s8, 24 +; SI-NEXT: s_lshr_b32 s13, s8, 16 +; SI-NEXT: s_lshr_b32 s12, s8, 8 +; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v11, s14 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; 
implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: s_branch .LBB47_2 +; +; VI-LABEL: bitcast_v6i16_to_v12i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB47_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s19, s16, 8 +; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s13, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s15, s16, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB47_3 +; VI-NEXT: .LBB47_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s17, 3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s17, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s5, s18, 3 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s4, 0x30000 +; VI-NEXT: s_lshr_b32 s19, s16, 8 +; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s11, s18, 8 +; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s13, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s15, s16, 16 +; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; VI-NEXT: .LBB47_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: v_mov_b32_e32 v2, s15 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: 
v_mov_b32_e32 v4, s17 +; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: v_mov_b32_e32 v6, s13 +; VI-NEXT: v_mov_b32_e32 v7, s12 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v9, s11 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB47_4: +; VI-NEXT: ; implicit-def: $sgpr19 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: s_branch .LBB47_2 +; +; GFX9-LABEL: bitcast_v6i16_to_v12i8_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_lshr_b32 s19, s16, 8 +; GFX9-NEXT: s_lshr_b32 s10, s18, 16 +; GFX9-NEXT: s_lshr_b32 s11, s18, 8 +; GFX9-NEXT: s_lshr_b32 s12, s17, 24 +; GFX9-NEXT: s_lshr_b32 s13, s17, 16 +; GFX9-NEXT: s_lshr_b32 s15, s17, 8 +; GFX9-NEXT: s_lshr_b32 s14, s16, 16 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; GFX9-NEXT: s_cbranch_execnz .LBB47_4 +; GFX9-NEXT: .LBB47_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v8, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: s_branch .LBB47_5 +; GFX9-NEXT: .LBB47_3: +; GFX9-NEXT: ; implicit-def: $sgpr19 +; 
GFX9-NEXT: ; implicit-def: $sgpr14 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr15 +; GFX9-NEXT: ; implicit-def: $sgpr13 +; GFX9-NEXT: ; implicit-def: $sgpr12 +; GFX9-NEXT: ; implicit-def: $sgpr11 +; GFX9-NEXT: ; implicit-def: $sgpr10 +; GFX9-NEXT: ; implicit-def: $sgpr6 +; GFX9-NEXT: s_branch .LBB47_2 +; GFX9-NEXT: .LBB47_4: +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v14, s17 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v10, s10 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: .LBB47_5: ; %end +; GFX9-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: bitcast_v6i16_to_v12i8_scalar: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0 +; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s1, 8 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s0, 8 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-TRUE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, s0, 3 op_sel_hi:[1,0] +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[12:13], 24, v[8:9] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX11-TRUE16-NEXT: s_branch .LBB47_5 +; GFX11-TRUE16-NEXT: .LBB47_3: +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr11_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr9_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $sgpr6_lo16 +; GFX11-TRUE16-NEXT: s_branch .LBB47_2 +; GFX11-TRUE16-NEXT: .LBB47_4: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, s4 +; GFX11-TRUE16-NEXT: .LBB47_5: ; %end +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l +; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: bitcast_v6i16_to_v12i8_scalar: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0 +; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB47_3 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s10, s1, 24 +; GFX11-FAKE16-NEXT: s_lshr_b32 s11, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s13, s1, 8 +; GFX11-FAKE16-NEXT: s_lshr_b32 s12, s0, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s14, s0, 8 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 +; GFX11-FAKE16-NEXT: s_lshr_b64 s[4:5], s[0:1], 24 +; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB47_4 +; GFX11-FAKE16-NEXT: .LBB47_2: ; %cmp.true +; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, s2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, s1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, s0, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX11-FAKE16-NEXT: s_branch .LBB47_5 +; GFX11-FAKE16-NEXT: .LBB47_3: +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr14 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr12 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr4 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr13 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr11 +; GFX11-FAKE16-NEXT: ; 
implicit-def: $sgpr10 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr9 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr8 +; GFX11-FAKE16-NEXT: ; implicit-def: $sgpr6 +; GFX11-FAKE16-NEXT: s_branch .LBB47_2 +; GFX11-FAKE16-NEXT: .LBB47_4: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v13, s0 :: v_dual_mov_b32 v14, s1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v7, s10 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v10, s8 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-FAKE16-NEXT: .LBB47_5: ; %end +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v13 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v14 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v6bf16_to_v6f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB24_3 -; 
GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB24_4 -; GCN-NEXT: .LBB24_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB24_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB24_2 -; GCN-NEXT: .LBB24_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6bf16_to_v6f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB48_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB48_4 +; SI-NEXT: .LBB48_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB48_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB48_2 +; SI-NEXT: .LBB48_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 
+; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6bf16_to_v6f16: ; VI: ; %bb.0: @@ -6060,7 +11691,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -6117,7 +11748,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v5, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: .LBB48_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6128,7 +11759,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; 
GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -6177,7 +11808,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v5, v2, s6 ; GFX9-NEXT: v_perm_b32 v1, v4, v1, s6 ; GFX9-NEXT: v_perm_b32 v0, v3, v0, s6 -; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: .LBB48_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6189,7 +11820,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -6245,7 +11876,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 -; GFX11-TRUE16-NEXT: .LBB24_2: ; %end +; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6257,7 +11888,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6312,7 +11943,7 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB24_2: ; %end +; GFX11-FAKE16-NEXT: .LBB48_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6332,74 +11963,386 @@ end: ret <6 x half> %phi } +define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6bf16_to_v6f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s21 +; SI-NEXT: s_cbranch_scc0 .LBB49_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_cbranch_execnz .LBB49_3 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; 
SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB49_2 +; +; VI-LABEL: bitcast_v6bf16_to_v6f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB49_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, 
v4, vcc +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 +; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_3: +; VI-NEXT: s_branch .LBB49_2 +; VI-NEXT: .LBB49_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v6f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 
+; GFX9-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v2 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v2 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX9-NEXT: 
v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_3: +; GFX9-NEXT: s_branch .LBB49_2 +; GFX9-NEXT: .LBB49_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v6f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB49_4 +; GFX11-NEXT: .LBB49_2: ; %cmp.true +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 +; GFX11-NEXT: v_add_f32_e64 v2, 0x40c00000, s0 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_pack_lh_b32_b16 s4, 0, s2 +; GFX11-NEXT: v_add_f32_e64 v4, 0x40c00000, s1 +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v7, 
0x40c00000, s2 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v2 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s3 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v10, v10, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v8, vcc_lo +; GFX11-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v9 :: v_dual_add_nc_u32 v1, v1, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v10 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, v8, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v9, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v9 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; 
GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v6 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v2, v4, 16, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: +; GFX11-NEXT: s_branch .LBB49_2 +; GFX11-NEXT: .LBB49_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} + define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f16_to_v6bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v5 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; 
implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB25_4 -; GCN-NEXT: .LBB25_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB25_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB25_2 -; GCN-NEXT: .LBB25_4: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GCN-NEXT: 
v_lshlrev_b32_e32 v4, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f16_to_v6bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB50_4 +; SI-NEXT: .LBB50_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB50_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB50_2 +; SI-NEXT: .LBB50_4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v6bf16: ; VI: ; %bb.0: @@ -6408,7 +12351,7 @@ define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 0x200 ; VI-NEXT: v_add_f16_e32 v3, 0x200, v0 @@ -6420,7 +12363,7 @@ define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v6, v2 ; VI-NEXT: v_or_b32_e32 v1, v5, v1 ; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6472,70 +12415,219 @@ end: ret <6 x bfloat> %phi } +define inreg <6 x bfloat> @bitcast_v6f16_to_v6bf16_scalar(<6 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f16_to_v6bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, 
s21 +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB51_2 +; +; VI-LABEL: bitcast_v6f16_to_v6bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB51_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB51_4 +; VI-NEXT: .LBB51_2: ; 
%cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_add_f16_e32 v3, s16, v0 +; VI-NEXT: v_add_f16_sdwa v4, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v5, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v3, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB51_3: +; VI-NEXT: s_branch .LBB51_2 +; VI-NEXT: .LBB51_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v6bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB51_4 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB51_3: +; GFX9-NEXT: s_branch .LBB51_2 +; GFX9-NEXT: .LBB51_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v6bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB51_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB51_4 +; GFX11-NEXT: .LBB51_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB51_3: +; GFX11-NEXT: s_branch .LBB51_2 +; GFX11-NEXT: .LBB51_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { -; GCN-LABEL: bitcast_v6bf16_to_v6i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v5 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB26_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz 
.LBB26_4 -; GCN-NEXT: .LBB26_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB26_3: ; %cmp.false -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr6 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB26_2 -; GCN-NEXT: .LBB26_4: ; %cmp.true -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v4 -; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NEXT: v_alignbit_b32 v0, v7, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v6, 16 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6bf16_to_v6i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; SI-NEXT: 
v_mul_f32_e32 v6, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB52_4 +; SI-NEXT: .LBB52_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB52_3: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB52_2 +; SI-NEXT: .LBB52_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v4, v5, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6bf16_to_v6i16: ; VI: ; %bb.0: @@ -6544,7 +12636,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -6601,7 +12693,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_alignbit_b32 v2, v2, v5, 16 ; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6612,7 +12704,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -6661,7 +12753,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v5, v2, s6 ; GFX9-NEXT: v_perm_b32 v1, v4, v1, s6 ; GFX9-NEXT: v_perm_b32 v0, v3, v0, s6 -; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6673,7 +12765,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -6733,7 +12825,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v4 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v5, 16, v3 -; GFX11-TRUE16-NEXT: .LBB26_2: ; %end +; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6745,7 +12837,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6800,7 +12892,7 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 -; GFX11-FAKE16-NEXT: .LBB26_2: ; %end +; GFX11-FAKE16-NEXT: .LBB52_2: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -6820,59 +12912,349 @@ end: ret <6 x i16> %phi } +define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6bf16_to_v6i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s21 +; SI-NEXT: s_cbranch_scc0 .LBB53_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: s_cbranch_execnz .LBB53_3 +; SI-NEXT: .LBB53_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v4, v5, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB53_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB53_2 +; +; VI-LABEL: bitcast_v6bf16_to_v6i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: 
s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB53_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB53_4 +; VI-NEXT: .LBB53_2: ; %cmp.true +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc 
+; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 +; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_3: +; VI-NEXT: s_branch .LBB53_2 +; VI-NEXT: .LBB53_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v6i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %cmp.true +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s17 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v1, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 
+; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v2 +; GFX9-NEXT: s_pack_lh_b32_b16 s4, 0, s18 +; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v2 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v0, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v2, v2, v6, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, v6, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX9-NEXT: v_and_or_b32 v0, v3, v6, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: +; GFX9-NEXT: s_branch .LBB53_2 +; GFX9-NEXT: .LBB53_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v6i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB53_4 +; 
GFX11-NEXT: .LBB53_2: ; %cmp.true +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 +; GFX11-NEXT: s_pack_lh_b32_b16 s3, 0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_add_f32_e64 v3, 0x40c00000, s3 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, 0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-NEXT: v_add_f32_e64 v6, 0x40c00000, s2 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0 +; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3 +; GFX11-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add_f32_e64 v8, 0x40c00000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v9 :: v_dual_add_nc_u32 v4, 0x7fff, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v9, v10, v6 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v11 :: v_dual_add_nc_u32 v7, v7, v8 +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v4, v5 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v3, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: +; GFX11-NEXT: s_branch .LBB53_2 +; GFX11-NEXT: .LBB53_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i16_to_v6bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v9, v4 -; GCN-NEXT: v_mov_b32_e32 v7, v2 -; GCN-NEXT: v_mov_b32_e32 v8, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; 
implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB27_4 -; GCN-NEXT: .LBB27_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB27_3: ; %cmp.false -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB27_2 -; GCN-NEXT: .LBB27_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_or_b32_e32 v0, v5, v0 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v4 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i16_to_v6bf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v4 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB54_4 +; SI-NEXT: .LBB54_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB54_3: ; %cmp.false +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB54_2 +; SI-NEXT: .LBB54_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v6bf16: ; VI: ; %bb.0: @@ -6881,7 +13263,7 @@ define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB27_2 +; 
VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 3 ; VI-NEXT: v_add_u16_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -6893,7 +13275,7 @@ define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_or_b32_e32 v1, v1, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -6944,50 +13326,192 @@ end: ret <6 x bfloat> %phi } +define inreg <6 x bfloat> @bitcast_v6i16_to_v6bf16_scalar(<6 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i16_to_v6bf16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB55_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_lshl_b32 s9, s19, 16 +; SI-NEXT: s_lshl_b32 s11, s20, 16 +; SI-NEXT: s_lshl_b32 s10, s21, 16 +; SI-NEXT: s_cbranch_execnz .LBB55_3 +; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s9, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s8, s5, 16 +; SI-NEXT: s_and_b32 s10, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s4, 16 +; SI-NEXT: .LBB55_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, s6 
+; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s11 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB55_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: s_branch .LBB55_2 +; +; VI-LABEL: bitcast_v6i16_to_v6bf16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB55_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: .LBB55_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB55_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_4: +; VI-NEXT: s_branch .LBB55_2 +; +; GFX9-LABEL: bitcast_v6i16_to_v6bf16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; 
GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: +; GFX9-NEXT: s_branch .LBB55_2 +; GFX9-NEXT: .LBB55_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v6bf16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: +; GFX11-NEXT: s_branch .LBB55_2 +; GFX11-NEXT: .LBB55_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) { -; GCN-LABEL: bitcast_v6f16_to_v6i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_or_b32_e32 v4, v4, v6 -; GCN-NEXT: v_or_b32_e32 v2, v2, v7 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: .LBB28_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6f16_to_v6i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB56_2 +; SI-NEXT: ; %bb.1: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB56_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f16_to_v6i16: ; VI: ; %bb.0: @@ -6996,7 +13520,7 @@ define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v4, 0x200 ; VI-NEXT: v_add_f16_e32 v3, 0x200, v0 @@ -7008,7 +13532,7 @@ define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v6, v2 ; VI-NEXT: v_or_b32_e32 v1, v5, v1 ; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: .LBB56_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7060,62 +13584,200 @@ end: ret <6 x i16> %phi } +define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6f16_to_v6i16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, 
s16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB57_4: +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v6f16_to_v6i16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB57_3 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB57_4 +; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s17, 16 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v6, s4 
+; VI-NEXT: v_add_f16_e32 v3, s16, v0 +; VI-NEXT: v_add_f16_sdwa v4, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v5, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v3, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB57_3: +; VI-NEXT: s_branch .LBB57_2 +; VI-NEXT: .LBB57_4: +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v6i16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB57_4 +; GFX9-NEXT: .LBB57_2: ; %cmp.true +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v2, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB57_3: +; GFX9-NEXT: s_branch .LBB57_2 +; GFX9-NEXT: .LBB57_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v6i16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB57_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB57_4 +; GFX11-NEXT: .LBB57_2: ; 
%cmp.true +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB57_3: +; GFX11-NEXT: s_branch .LBB57_2 +; GFX11-NEXT: .LBB57_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) { -; GCN-LABEL: bitcast_v6i16_to_v6f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v5 -; GCN-NEXT: v_mov_b32_e32 v7, v4 -; GCN-NEXT: v_mov_b32_e32 v8, v3 -; GCN-NEXT: v_mov_b32_e32 v9, v2 -; GCN-NEXT: v_mov_b32_e32 v10, v1 -; GCN-NEXT: v_mov_b32_e32 v11, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: ; implicit-def: $vgpr3 -; GCN-NEXT: ; implicit-def: $vgpr4 -; GCN-NEXT: ; implicit-def: $vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB29_4 -; GCN-NEXT: .LBB29_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB29_3: ; %cmp.false -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v9 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v3, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v12 -; GCN-NEXT: ; implicit-def: $vgpr12 -; GCN-NEXT: ; implicit-def: $vgpr7 -; GCN-NEXT: ; implicit-def: $vgpr8 -; GCN-NEXT: ; implicit-def: $vgpr9 -; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB29_2 -; GCN-NEXT: .LBB29_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v12 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v7 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v8 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bitcast_v6i16_to_v6f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v1 +; SI-NEXT: v_mov_b32_e32 v11, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_3 +; SI-NEXT: ; %bb.1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB58_4 +; SI-NEXT: .LBB58_2: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB58_3: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, v11 +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB58_2 +; SI-NEXT: .LBB58_4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6i16_to_v6f16: ; VI: ; %bb.0: @@ -7124,7 +13786,7 @@ define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v3, 3 ; VI-NEXT: v_add_u16_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7136,7 +13798,7 @@ define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_or_b32_e32 v1, v1, v5 ; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: .LBB58_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -7186,3 +13848,130 @@ end: %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] ret <6 x half> 
%phi } + +define inreg <6 x half> @bitcast_v6i16_to_v6f16_scalar(<6 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v6i16_to_v6f16_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s22, 0 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s21 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_branch .LBB59_2 +; +; VI-LABEL: bitcast_v6i16_to_v6f16_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_cmp_lg_u32 s19, 0 +; VI-NEXT: s_cbranch_scc0 .LBB59_4 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_cbranch_execnz .LBB59_3 +; VI-NEXT: .LBB59_2: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s16, 3 +; VI-NEXT: s_add_i32 s7, s17, 3 +; VI-NEXT: s_add_i32 s9, s18, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 +; VI-NEXT: s_and_b32 s8, s18, 0xffff0000 +; VI-NEXT: s_and_b32 s9, s9, 0xffff +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; 
VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s8, 0x30000 +; VI-NEXT: s_add_i32 s17, s6, 0x30000 +; VI-NEXT: s_add_i32 s16, s4, 0x30000 +; VI-NEXT: .LBB59_3: ; %end +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB59_4: +; VI-NEXT: s_branch .LBB59_2 +; +; GFX9-LABEL: bitcast_v6i16_to_v6f16_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_cbranch_execnz .LBB59_4 +; GFX9-NEXT: .LBB59_2: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB59_3: +; GFX9-NEXT: s_branch .LBB59_2 +; GFX9-NEXT: .LBB59_4: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v6f16_scalar: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB59_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_vccnz .LBB59_4 +; GFX11-NEXT: .LBB59_2: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v2, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB59_3: +; GFX11-NEXT: s_branch .LBB59_2 +; GFX11-NEXT: .LBB59_4: +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll index 01a1e6b73ac6a..2b48cf0f41c88 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll @@ -1,25 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_kernel void @bitcast_i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; GCN-LABEL: bitcast_i8ptr_v16i8ptr: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NEXT: s_endpgm +; SI-LABEL: 
bitcast_i8ptr_v16i8ptr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm ; ; VI-LABEL: bitcast_i8ptr_v16i8ptr: ; VI: ; %bb.0: ; %entry diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll index 51afa79674a80..338dd9dedd37e 100644 --- a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -27,11 +27,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: s_cmp_lg_u32 s6, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -42,11 +40,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 { ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: s_cmp_lg_u32 s6, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll index 155042c5fc3c3..8ed8d905c5512 100644 --- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -21,21 +21,20 
@@ define amdgpu_kernel void @s_ashr_v2i16(ptr addrspace(1) %out, i32, <2 x i16> %l ; ; VI-LABEL: s_ashr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x30 +; VI-NEXT: s_load_dword s6, s[4:5], 0x38 +; VI-NEXT: s_load_dword s7, s[4:5], 0x30 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: s_load_dword s4, s[4:5], 0x38 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s5, s6, 16 -; VI-NEXT: s_sext_i32_i16 s6, s6 -; VI-NEXT: s_ashr_i32 s7, s4, 16 -; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_ashr_i32 s5, s5, s7 -; VI-NEXT: s_ashr_i32 s4, s6, s4 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: s_ashr_i32 s5, s7, 16 +; VI-NEXT: s_ashr_i32 s4, s5, s4 +; VI-NEXT: s_sext_i32_i16 s5, s7 +; VI-NEXT: s_ashr_i32 s5, s5, s6 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 9775a37276dfd..cbceb0885e8db 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -875,7 +875,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv @@ -980,7 +979,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 
-; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv @@ -4282,7 +4280,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv @@ -4387,7 +4384,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB10_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv @@ -6691,7 +6687,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 -; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv @@ -6796,7 +6791,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 -; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_and_rtn_b32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv @@ -8052,7 +8046,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv @@ -8157,7 +8150,6 @@ 
define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB17_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv @@ -9412,7 +9404,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv @@ -9517,7 +9508,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB19_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv @@ -10772,7 +10762,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 -; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv @@ -10877,7 +10866,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 -; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_max_rtn_i32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv @@ -12600,7 +12588,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 -; 
GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv @@ -12705,7 +12692,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 -; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_min_rtn_i32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv @@ -14428,7 +14414,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv @@ -14533,7 +14518,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB27_2 ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv @@ -16243,7 +16227,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: ; %bb.1: ; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s6 -; GFX1064_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1064_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv @@ -16348,7 +16331,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: ; %bb.1: ; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s6 -; GFX1164_DPP-NEXT: s_mov_b32 s3, s6 ; GFX1164_DPP-NEXT: ds_min_rtn_u32 v0, v0, v4 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv diff --git 
a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index a9358dc4a51d8..ab078be5c13a3 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -52,7 +52,6 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s2, s2, 0xffff ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_lshr_b32 s2, s2, 16 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -80,11 +79,9 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_brev_b32 s2, s2 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; GFX11-GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-GISEL-TRUE16-NEXT: s_endpgm @@ -96,11 +93,9 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) # ; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_brev_b32 s2, s2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s2, 
s2, 16 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-GISEL-FAKE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index bc8e21e03251d..a1aef8ddf6bba 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -1004,7 +1004,6 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) { ; ; VI-LABEL: ps_mesa_inreg_i16: ; VI: ; %bb.0: -; VI-NEXT: s_and_b32 s0, 0xffff, s0 ; VI-NEXT: s_add_i32 s0, s0, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_short v[0:1], v0 @@ -1012,9 +1011,8 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) { ; ; GFX11-LABEL: ps_mesa_inreg_i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off ; GFX11-NEXT: s_endpgm @@ -1156,20 +1154,20 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s0, 16 -; VI-NEXT: s_lshr_b32 s1, s0, 24 -; VI-NEXT: s_add_i32 s2, s2, s2 +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: s_lshr_b32 s2, s0, 24 ; VI-NEXT: s_bfe_u32 s3, s0, 0x80008 +; VI-NEXT: s_add_i32 s2, s2, s2 ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_and_b32 s2, s2, 0xff -; VI-NEXT: s_add_i32 s3, s3, s3 +; VI-NEXT: s_lshl_b32 s2, s2, 8 +; VI-NEXT: s_and_b32 s1, s1, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_lshl_b32 s1, s1, 24 -; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_add_i32 s3, s3, s3 ; VI-NEXT: s_or_b32 s1, s1, s2 ; VI-NEXT: s_and_b32 s0, s0, 0xff ; VI-NEXT: 
s_lshl_b32 s2, s3, 8 ; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_lshl_b32 s1, s1, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -1240,8 +1238,8 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: s_bfe_u32 s2, s0, 0x80008 ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_add_i32 s1, s1, s1 ; VI-NEXT: s_add_i32 s2, s2, s2 +; VI-NEXT: s_add_i32 s1, s1, s1 ; VI-NEXT: s_and_b32 s0, s0, 0xff ; VI-NEXT: s_lshl_b32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v2, s1 @@ -1318,22 +1316,21 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v0, 4 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s0, 16 -; VI-NEXT: s_lshr_b32 s2, s0, 24 -; VI-NEXT: s_add_i32 s3, s3, s3 +; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: s_lshr_b32 s3, s0, 24 ; VI-NEXT: s_bfe_u32 s4, s0, 0x80008 +; VI-NEXT: s_add_i32 s3, s3, s3 ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: s_and_b32 s3, s3, 0xff -; VI-NEXT: s_add_i32 s4, s4, s4 +; VI-NEXT: s_lshl_b32 s3, s3, 8 +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_lshl_b32 s2, s2, 24 -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_add_i32 s4, s4, s4 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: s_and_b32 s0, s0, 0xff ; VI-NEXT: s_lshl_b32 s3, s4, 8 -; VI-NEXT: s_add_i32 s1, s1, s1 ; VI-NEXT: s_or_b32 s0, s0, s3 +; VI-NEXT: s_add_i32 s1, s1, s1 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: s_or_b32 s0, s0, s2 @@ -1430,37 +1427,37 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s1, 16 -; VI-NEXT: s_lshr_b32 s2, s1, 24 +; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: s_lshr_b32 s3, s0, 24 +; VI-NEXT: s_lshr_b32 s4, s1, 16 +; 
VI-NEXT: s_lshr_b32 s5, s1, 24 +; VI-NEXT: s_bfe_u32 s6, s0, 0x80008 +; VI-NEXT: s_bfe_u32 s7, s1, 0x80008 +; VI-NEXT: s_add_i32 s5, s5, s5 +; VI-NEXT: s_add_i32 s4, s4, s4 ; VI-NEXT: s_add_i32 s3, s3, s3 -; VI-NEXT: s_bfe_u32 s6, s1, 0x80008 ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: s_and_b32 s3, s3, 0xff -; VI-NEXT: s_add_i32 s6, s6, s6 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_lshl_b32 s2, s2, 24 -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_lshr_b32 s5, s0, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_and_b32 s1, s1, 0xff -; VI-NEXT: s_lshl_b32 s3, s6, 8 -; VI-NEXT: s_lshr_b32 s4, s0, 24 -; VI-NEXT: s_add_i32 s5, s5, s5 -; VI-NEXT: s_or_b32 s1, s1, s3 -; VI-NEXT: s_bfe_u32 s7, s0, 0x80008 -; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_and_b32 s3, s5, 0xff ; VI-NEXT: s_add_i32 s7, s7, s7 +; VI-NEXT: s_lshl_b32 s3, s3, 8 +; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_or_b32 s1, s1, s2 -; VI-NEXT: s_lshl_b32 s2, s4, 24 -; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_add_i32 s6, s6, s6 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_lshl_b32 s5, s7, 8 ; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: s_lshl_b32 s3, s7, 8 +; VI-NEXT: s_lshl_b32 s3, s6, 8 +; VI-NEXT: s_or_b32 s1, s1, s5 ; VI-NEXT: s_or_b32 s0, s0, s3 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s1, s1, s4 ; VI-NEXT: s_or_b32 s0, s0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1599,69 +1596,69 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_lshr_b32 s4, s3, 24 +; VI-NEXT: s_lshr_b32 s4, s0, 16 +; 
VI-NEXT: s_lshr_b32 s5, s0, 24 +; VI-NEXT: s_lshr_b32 s6, s1, 16 +; VI-NEXT: s_lshr_b32 s7, s1, 24 +; VI-NEXT: s_lshr_b32 s8, s2, 16 +; VI-NEXT: s_lshr_b32 s9, s2, 24 +; VI-NEXT: s_lshr_b32 s10, s3, 16 +; VI-NEXT: s_lshr_b32 s11, s3, 24 +; VI-NEXT: s_bfe_u32 s12, s0, 0x80008 +; VI-NEXT: s_bfe_u32 s13, s1, 0x80008 +; VI-NEXT: s_bfe_u32 s14, s2, 0x80008 +; VI-NEXT: s_bfe_u32 s15, s3, 0x80008 +; VI-NEXT: s_add_i32 s11, s11, s11 +; VI-NEXT: s_add_i32 s10, s10, s10 +; VI-NEXT: s_add_i32 s9, s9, s9 +; VI-NEXT: s_add_i32 s8, s8, s8 +; VI-NEXT: s_add_i32 s7, s7, s7 +; VI-NEXT: s_add_i32 s6, s6, s6 ; VI-NEXT: s_add_i32 s5, s5, s5 -; VI-NEXT: s_bfe_u32 s12, s3, 0x80008 ; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: s_and_b32 s5, s5, 0xff -; VI-NEXT: s_add_i32 s12, s12, s12 +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_and_b32 s10, s10, 0xff ; VI-NEXT: s_add_i32 s3, s3, s3 -; VI-NEXT: s_lshl_b32 s4, s4, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s7, s2, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s3, s3, 0xff -; VI-NEXT: s_lshl_b32 s5, s12, 8 -; VI-NEXT: s_lshr_b32 s6, s2, 24 -; VI-NEXT: s_add_i32 s7, s7, s7 -; VI-NEXT: s_or_b32 s3, s3, s5 -; VI-NEXT: s_bfe_u32 s13, s2, 0x80008 -; VI-NEXT: s_add_i32 s6, s6, s6 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_and_b32 s5, s7, 0xff -; VI-NEXT: s_add_i32 s13, s13, s13 +; VI-NEXT: s_add_i32 s15, s15, s15 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_and_b32 s8, s8, 0xff ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: s_or_b32 s3, s3, s4 -; VI-NEXT: s_lshl_b32 s4, s6, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s9, s1, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s2, s2, 0xff -; VI-NEXT: s_lshl_b32 s5, s13, 8 -; VI-NEXT: s_lshr_b32 s8, s1, 24 -; VI-NEXT: s_add_i32 s9, s9, s9 -; VI-NEXT: s_or_b32 s2, s2, s5 -; VI-NEXT: s_bfe_u32 s14, s1, 0x80008 -; VI-NEXT: s_add_i32 s8, s8, s8 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_and_b32 s5, s9, 0xff ; VI-NEXT: s_add_i32 s14, 
s14, s14 +; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_and_b32 s6, s6, 0xff ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_or_b32 s2, s2, s4 -; VI-NEXT: s_lshl_b32 s4, s8, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_lshr_b32 s11, s0, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s1, s1, 0xff -; VI-NEXT: s_lshl_b32 s5, s14, 8 -; VI-NEXT: s_lshr_b32 s10, s0, 24 -; VI-NEXT: s_add_i32 s11, s11, s11 -; VI-NEXT: s_or_b32 s1, s1, s5 -; VI-NEXT: s_bfe_u32 s15, s0, 0x80008 -; VI-NEXT: s_add_i32 s10, s10, s10 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_and_b32 s5, s11, 0xff -; VI-NEXT: s_add_i32 s15, s15, s15 +; VI-NEXT: s_add_i32 s13, s13, s13 +; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_or_b32 s1, s1, s4 -; VI-NEXT: s_lshl_b32 s4, s10, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s12, s12, s12 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_lshl_b32 s11, s15, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_lshl_b32 s9, s14, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_lshl_b32 s7, s13, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: s_lshl_b32 s5, s15, 8 +; VI-NEXT: s_lshl_b32 s5, s12, 8 +; VI-NEXT: s_or_b32 s3, s3, s11 +; VI-NEXT: s_or_b32 s2, s2, s9 +; VI-NEXT: s_or_b32 s1, s1, s7 ; VI-NEXT: s_or_b32 s0, s0, s5 +; VI-NEXT: s_lshl_b32 s10, s10, 16 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s3, s3, s10 +; VI-NEXT: s_or_b32 s2, s2, s8 +; VI-NEXT: s_or_b32 s1, s1, s6 ; VI-NEXT: s_or_b32 s0, s0, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1905,138 +1902,138 @@ define amdgpu_kernel 
void @amd_kernel_v32i8(<32 x i8> %arg0) { ; VI-NEXT: v_mov_b32_e32 v4, 16 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s9, s3, 16 -; VI-NEXT: s_lshr_b32 s8, s3, 24 +; VI-NEXT: s_lshr_b32 s8, s4, 16 +; VI-NEXT: s_lshr_b32 s9, s4, 24 +; VI-NEXT: s_lshr_b32 s10, s5, 16 +; VI-NEXT: s_lshr_b32 s11, s5, 24 +; VI-NEXT: s_lshr_b32 s12, s6, 16 +; VI-NEXT: s_lshr_b32 s13, s6, 24 +; VI-NEXT: s_lshr_b32 s14, s7, 16 +; VI-NEXT: s_lshr_b32 s15, s7, 24 +; VI-NEXT: s_bfe_u32 s24, s4, 0x80008 +; VI-NEXT: s_bfe_u32 s25, s5, 0x80008 +; VI-NEXT: s_bfe_u32 s26, s6, 0x80008 +; VI-NEXT: s_bfe_u32 s27, s7, 0x80008 +; VI-NEXT: s_add_i32 s15, s15, s15 +; VI-NEXT: s_add_i32 s14, s14, s14 +; VI-NEXT: s_add_i32 s13, s13, s13 +; VI-NEXT: s_add_i32 s12, s12, s12 +; VI-NEXT: s_add_i32 s11, s11, s11 +; VI-NEXT: s_add_i32 s10, s10, s10 ; VI-NEXT: s_add_i32 s9, s9, s9 -; VI-NEXT: s_bfe_u32 s24, s3, 0x80008 ; VI-NEXT: s_add_i32 s8, s8, s8 -; VI-NEXT: s_and_b32 s9, s9, 0xff +; VI-NEXT: s_lshr_b32 s16, s0, 16 +; VI-NEXT: s_lshr_b32 s17, s0, 24 +; VI-NEXT: s_lshr_b32 s18, s1, 16 +; VI-NEXT: s_lshr_b32 s19, s1, 24 +; VI-NEXT: s_lshr_b32 s20, s2, 16 +; VI-NEXT: s_lshr_b32 s21, s2, 24 +; VI-NEXT: s_lshr_b32 s22, s3, 16 +; VI-NEXT: s_lshr_b32 s23, s3, 24 +; VI-NEXT: s_lshl_b32 s15, s15, 8 +; VI-NEXT: s_and_b32 s14, s14, 0xff +; VI-NEXT: s_add_i32 s7, s7, s7 +; VI-NEXT: s_add_i32 s27, s27, s27 +; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_add_i32 s6, s6, s6 +; VI-NEXT: s_add_i32 s26, s26, s26 +; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_add_i32 s5, s5, s5 +; VI-NEXT: s_add_i32 s25, s25, s25 +; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_add_i32 s4, s4, s4 ; VI-NEXT: s_add_i32 s24, s24, s24 -; VI-NEXT: s_add_i32 s3, s3, s3 -; VI-NEXT: s_lshl_b32 s8, s8, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s11, s2, 16 +; VI-NEXT: s_bfe_u32 s28, s0, 
0x80008 +; VI-NEXT: s_bfe_u32 s29, s1, 0x80008 +; VI-NEXT: s_bfe_u32 s30, s2, 0x80008 +; VI-NEXT: s_bfe_u32 s31, s3, 0x80008 +; VI-NEXT: s_add_i32 s23, s23, s23 +; VI-NEXT: s_add_i32 s22, s22, s22 +; VI-NEXT: s_add_i32 s21, s21, s21 +; VI-NEXT: s_add_i32 s20, s20, s20 +; VI-NEXT: s_add_i32 s19, s19, s19 +; VI-NEXT: s_add_i32 s18, s18, s18 +; VI-NEXT: s_add_i32 s17, s17, s17 +; VI-NEXT: s_add_i32 s16, s16, s16 +; VI-NEXT: s_or_b32 s14, s14, s15 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s15, s27, 8 +; VI-NEXT: s_or_b32 s12, s12, s13 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_lshl_b32 s13, s26, 8 +; VI-NEXT: s_or_b32 s10, s10, s11 +; VI-NEXT: s_and_b32 s5, s5, 0xff +; VI-NEXT: s_lshl_b32 s11, s25, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_lshl_b32 s9, s24, 8 -; VI-NEXT: s_lshr_b32 s10, s2, 24 -; VI-NEXT: s_add_i32 s11, s11, s11 -; VI-NEXT: s_or_b32 s3, s3, s9 -; VI-NEXT: s_bfe_u32 s25, s2, 0x80008 -; VI-NEXT: s_add_i32 s10, s10, s10 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_and_b32 s9, s11, 0xff -; VI-NEXT: s_add_i32 s25, s25, s25 +; VI-NEXT: s_lshl_b32 s23, s23, 8 +; VI-NEXT: s_and_b32 s22, s22, 0xff +; VI-NEXT: s_add_i32 s3, s3, s3 +; VI-NEXT: s_add_i32 s31, s31, s31 +; VI-NEXT: s_lshl_b32 s21, s21, 8 +; VI-NEXT: s_and_b32 s20, s20, 0xff ; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: s_or_b32 s3, s3, s8 -; VI-NEXT: s_lshl_b32 s8, s10, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s13, s1, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s2, s2, 0xff -; VI-NEXT: s_lshl_b32 s9, s25, 8 -; VI-NEXT: s_lshr_b32 s12, s1, 24 -; VI-NEXT: s_add_i32 s13, s13, s13 -; VI-NEXT: s_or_b32 s2, s2, s9 -; VI-NEXT: s_bfe_u32 s26, s1, 0x80008 -; VI-NEXT: s_add_i32 s12, s12, s12 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_and_b32 s9, s13, 0xff -; VI-NEXT: s_add_i32 s26, s26, s26 +; VI-NEXT: s_add_i32 s30, s30, s30 +; VI-NEXT: s_lshl_b32 s19, s19, 8 +; 
VI-NEXT: s_and_b32 s18, s18, 0xff ; VI-NEXT: s_add_i32 s1, s1, s1 -; VI-NEXT: s_or_b32 s2, s2, s8 -; VI-NEXT: s_lshl_b32 s8, s12, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s15, s0, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s1, s1, 0xff -; VI-NEXT: s_lshl_b32 s9, s26, 8 -; VI-NEXT: s_lshr_b32 s14, s0, 24 -; VI-NEXT: s_add_i32 s15, s15, s15 -; VI-NEXT: s_or_b32 s1, s1, s9 -; VI-NEXT: s_bfe_u32 s27, s0, 0x80008 -; VI-NEXT: s_add_i32 s14, s14, s14 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_and_b32 s9, s15, 0xff -; VI-NEXT: s_add_i32 s27, s27, s27 +; VI-NEXT: s_add_i32 s29, s29, s29 +; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_and_b32 s16, s16, 0xff ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: s_or_b32 s1, s1, s8 -; VI-NEXT: s_lshl_b32 s8, s14, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s17, s7, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: s_lshl_b32 s9, s27, 8 -; VI-NEXT: s_lshr_b32 s16, s7, 24 -; VI-NEXT: s_add_i32 s17, s17, s17 -; VI-NEXT: s_or_b32 s0, s0, s9 -; VI-NEXT: s_bfe_u32 s28, s7, 0x80008 -; VI-NEXT: s_add_i32 s16, s16, s16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_and_b32 s9, s17, 0xff ; VI-NEXT: s_add_i32 s28, s28, s28 -; VI-NEXT: s_add_i32 s7, s7, s7 -; VI-NEXT: s_or_b32 s0, s0, s8 -; VI-NEXT: s_lshl_b32 s8, s16, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s19, s6, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s7, s7, 0xff -; VI-NEXT: s_lshl_b32 s9, s28, 8 -; VI-NEXT: s_lshr_b32 s18, s6, 24 -; VI-NEXT: s_add_i32 s19, s19, s19 -; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: s_bfe_u32 s29, s6, 0x80008 -; VI-NEXT: s_add_i32 s18, s18, s18 +; VI-NEXT: s_or_b32 s7, s7, s15 +; VI-NEXT: s_or_b32 s6, s6, s13 +; VI-NEXT: s_or_b32 s5, s5, s11 +; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_or_b32 s22, s22, s23 +; VI-NEXT: s_and_b32 s3, s3, 0xff +; VI-NEXT: s_lshl_b32 s23, s31, 8 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_and_b32 s2, 
s2, 0xff +; VI-NEXT: s_lshl_b32 s21, s30, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: s_lshl_b32 s19, s29, 8 +; VI-NEXT: s_or_b32 s16, s16, s17 +; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_lshl_b32 s17, s28, 8 +; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s9, s19, 0xff -; VI-NEXT: s_add_i32 s29, s29, s29 -; VI-NEXT: s_add_i32 s6, s6, s6 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_lshl_b32 s8, s18, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s21, s5, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s6, s6, 0xff -; VI-NEXT: s_lshl_b32 s9, s29, 8 -; VI-NEXT: s_lshr_b32 s20, s5, 24 -; VI-NEXT: s_add_i32 s21, s21, s21 -; VI-NEXT: s_or_b32 s6, s6, s9 -; VI-NEXT: s_bfe_u32 s30, s5, 0x80008 -; VI-NEXT: s_add_i32 s20, s20, s20 +; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_and_b32 s9, s21, 0xff -; VI-NEXT: s_add_i32 s30, s30, s30 -; VI-NEXT: s_add_i32 s5, s5, s5 -; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: s_lshl_b32 s8, s20, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_lshr_b32 s23, s4, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s5, s5, 0xff -; VI-NEXT: s_lshl_b32 s9, s30, 8 -; VI-NEXT: s_lshr_b32 s22, s4, 24 -; VI-NEXT: s_add_i32 s23, s23, s23 -; VI-NEXT: s_or_b32 s5, s5, s9 -; VI-NEXT: s_bfe_u32 s31, s4, 0x80008 -; VI-NEXT: s_add_i32 s22, s22, s22 +; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_and_b32 s9, s23, 0xff -; VI-NEXT: s_add_i32 s31, s31, s31 -; VI-NEXT: s_add_i32 s4, s4, s4 -; VI-NEXT: s_or_b32 s5, s5, s8 -; VI-NEXT: s_lshl_b32 s8, s22, 24 -; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_and_b32 s4, s4, 0xff -; VI-NEXT: s_lshl_b32 s9, s31, 8 -; VI-NEXT: s_or_b32 s4, s4, s9 +; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s3, s3, s23 +; VI-NEXT: s_or_b32 s2, s2, s21 +; VI-NEXT: 
s_or_b32 s1, s1, s19 +; VI-NEXT: s_or_b32 s0, s0, s17 +; VI-NEXT: s_or_b32 s7, s7, s14 +; VI-NEXT: s_or_b32 s6, s6, s12 +; VI-NEXT: s_or_b32 s5, s5, s10 ; VI-NEXT: s_or_b32 s4, s4, s8 +; VI-NEXT: s_lshl_b32 s22, s22, 16 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_lshl_b32 s20, s20, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshl_b32 s18, s18, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_lshl_b32 s16, s16, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: s_or_b32 s3, s3, s22 +; VI-NEXT: s_or_b32 s2, s2, s20 +; VI-NEXT: s_or_b32 s1, s1, s18 +; VI-NEXT: s_or_b32 s0, s0, s16 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index f712421083e6b..df35a4e4bcc75 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -1,7 +1,7 @@ ; RUN: opt -S -mtriple=amdgcn-- -codegenprepare < %s | FileCheck -check-prefix=OPT %s ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -codegenprepare < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; This particular case will actually be worse in terms of code size ; from sinking into both. 
@@ -116,21 +116,15 @@ ret: ; OPT: store ; OPT: ret -; For GFX8: since i16 is legal type, we cannot sink lshr into .LBBs. - ; GCN-LABEL: {{^}}sink_ubfe_i16: ; GCN-NOT: lshr -; VI: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x2c -; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004 ; GCN: s_cbranch_scc{{[0-1]}} ; GCN: ; %bb.1: -; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 -; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7f +; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 ; GCN: .LBB2_2: -; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 -; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff +; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 ; GCN: buffer_store_short ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll index fcb871cedd0cb..ae8080cf9f06a 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -6,9 +7,23 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; i32 compares ; -------------------------------------------------------------------------------- -; GCN-LABEL: {{^}}commute_eq_64_i32: -; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}} define amdgpu_kernel void @commute_eq_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_eq_64_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_cmp_eq_u32_e32 vcc, 64, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -19,9 +34,23 @@ define amdgpu_kernel void @commute_eq_64_i32(ptr addrspace(1) %out, ptr addrspac ret void } -; GCN-LABEL: {{^}}commute_ne_64_i32: -; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}} define amdgpu_kernel void @commute_ne_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ne_64_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 64, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -33,10 +62,24 @@ define amdgpu_kernel void @commute_ne_64_i32(ptr addrspace(1) %out, ptr addrspac } ; FIXME: Why isn't this being folded as a constant? 
-; GCN-LABEL: {{^}}commute_ne_litk_i32: -; GCN: s_movk_i32 [[K:s[0-9]+]], 0x3039 -; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}} define amdgpu_kernel void @commute_ne_litk_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ne_litk_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_movk_i32 s4, 0x3039 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, s4, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -47,9 +90,23 @@ define amdgpu_kernel void @commute_ne_litk_i32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ugt_64_i32: -; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}} define amdgpu_kernel void @commute_ugt_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ugt_64_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 64, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in 
= getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -60,9 +117,23 @@ define amdgpu_kernel void @commute_ugt_64_i32(ptr addrspace(1) %out, ptr addrspa ret void } -; GCN-LABEL: {{^}}commute_uge_64_i32: -; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}} define amdgpu_kernel void @commute_uge_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_uge_64_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 63, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -73,9 +144,23 @@ define amdgpu_kernel void @commute_uge_64_i32(ptr addrspace(1) %out, ptr addrspa ret void } -; GCN-LABEL: {{^}}commute_ult_64_i32: -; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} define amdgpu_kernel void @commute_ult_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ult_64_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 
-1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -86,9 +171,23 @@ define amdgpu_kernel void @commute_ult_64_i32(ptr addrspace(1) %out, ptr addrspa ret void } -; GCN-LABEL: {{^}}commute_ule_63_i32: -; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} define amdgpu_kernel void @commute_ule_63_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ule_63_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -99,10 +198,24 @@ define amdgpu_kernel void @commute_ule_63_i32(ptr addrspace(1) %out, ptr addrspa ret void } -; GCN-LABEL: {{^}}commute_ule_64_i32: -; GCN: s_movk_i32 [[K:s[0-9]+]], 0x41{{$}} -; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}} define amdgpu_kernel void @commute_ule_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ule_64_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: 
buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_movk_i32 s4, 0x41 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s4, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -113,9 +226,23 @@ define amdgpu_kernel void @commute_ule_64_i32(ptr addrspace(1) %out, ptr addrspa ret void } -; GCN-LABEL: {{^}}commute_sgt_neg1_i32: -; GCN: v_ashrrev_i32_e32 v2, 31, v2 define amdgpu_kernel void @commute_sgt_neg1_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_sgt_neg1_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_not_b32_e32 v2, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -126,9 +253,23 @@ define amdgpu_kernel void @commute_sgt_neg1_i32(ptr addrspace(1) %out, ptr addrs ret void } -; GCN-LABEL: {{^}}commute_sge_neg2_i32: -; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}} define amdgpu_kernel void @commute_sge_neg2_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_sge_neg2_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: 
s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -3, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -139,9 +280,23 @@ define amdgpu_kernel void @commute_sge_neg2_i32(ptr addrspace(1) %out, ptr addrs ret void } -; GCN-LABEL: {{^}}commute_slt_neg16_i32: -; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}} define amdgpu_kernel void @commute_slt_neg16_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_slt_neg16_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, -16, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -152,9 +307,23 @@ define amdgpu_kernel void @commute_slt_neg16_i32(ptr addrspace(1) %out, ptr addr ret void } -; GCN-LABEL: {{^}}commute_sle_5_i32: -; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}} define amdgpu_kernel void @commute_sle_5_i32(ptr addrspace(1) %out, ptr addrspace(1) 
%in) #1 { +; GCN-LABEL: commute_sle_5_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 6, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -169,9 +338,24 @@ define amdgpu_kernel void @commute_sle_5_i32(ptr addrspace(1) %out, ptr addrspac ; i64 compares ; -------------------------------------------------------------------------------- -; GCN-LABEL: {{^}}commute_eq_64_i64: -; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_eq_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_eq_64_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 64, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, 
i32 %tid @@ -182,9 +366,24 @@ define amdgpu_kernel void @commute_eq_64_i64(ptr addrspace(1) %out, ptr addrspac ret void } -; GCN-LABEL: {{^}}commute_ne_64_i64: -; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ne_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ne_64_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 64, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -195,9 +394,24 @@ define amdgpu_kernel void @commute_ne_64_i64(ptr addrspace(1) %out, ptr addrspac ret void } -; GCN-LABEL: {{^}}commute_ugt_64_i64: -; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ugt_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ugt_64_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lt_u64_e32 vcc, 64, v[3:4] +; GCN-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -208,9 +422,24 @@ define amdgpu_kernel void @commute_ugt_64_i64(ptr addrspace(1) %out, ptr addrspa ret void } -; GCN-LABEL: {{^}}commute_uge_64_i64: -; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_uge_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_uge_64_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -221,9 +450,24 @@ define amdgpu_kernel void @commute_uge_64_i64(ptr addrspace(1) %out, ptr addrspa ret void } -; GCN-LABEL: {{^}}commute_ult_64_i64: -; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ult_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ult_64_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; 
GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -234,9 +478,24 @@ define amdgpu_kernel void @commute_ult_64_i64(ptr addrspace(1) %out, ptr addrspa ret void } -; GCN-LABEL: {{^}}commute_ule_63_i64: -; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ule_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ule_63_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -249,10 +508,25 @@ define amdgpu_kernel void @commute_ule_63_i64(ptr addrspace(1) %out, ptr addrspa ; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm -; GCN-LABEL: {{^}}commute_ule_64_i64: -; GCN: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x41 -; GCN: v_cmp_gt_u64_e32 vcc, [[K]], 
v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ule_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ule_64_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[4:5], 0x41 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -263,9 +537,24 @@ define amdgpu_kernel void @commute_ule_64_i64(ptr addrspace(1) %out, ptr addrspa ret void } -; GCN-LABEL: {{^}}commute_sgt_neg1_i64: -; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_sgt_neg1_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 
@llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -276,9 +565,24 @@ define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrs ret void } -; GCN-LABEL: {{^}}commute_sge_neg2_i64: -; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_sge_neg2_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_sge_neg2_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -3, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -289,9 +593,24 @@ define amdgpu_kernel void @commute_sge_neg2_i64(ptr addrspace(1) %out, ptr addrs ret void } -; GCN-LABEL: {{^}}commute_slt_neg16_i64: -; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_slt_neg16_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_slt_neg16_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, -16, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -302,9 +621,24 @@ define amdgpu_kernel void @commute_slt_neg16_i64(ptr addrspace(1) %out, ptr addr ret void } -; GCN-LABEL: {{^}}commute_sle_5_i64: -; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_sle_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_sle_5_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 6, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -319,10 +653,23 @@ define amdgpu_kernel void @commute_sle_5_i64(ptr addrspace(1) %out, ptr addrspac ; f32 compares ; -------------------------------------------------------------------------------- - -; GCN-LABEL: {{^}}commute_oeq_2.0_f32: -; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}} define amdgpu_kernel void @commute_oeq_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_oeq_2.0_f32: +; 
GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 2.0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -333,10 +680,23 @@ define amdgpu_kernel void @commute_oeq_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } - -; GCN-LABEL: {{^}}commute_ogt_2.0_f32: -; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}} define amdgpu_kernel void @commute_ogt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ogt_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -347,9 +707,23 @@ define amdgpu_kernel void @commute_ogt_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_oge_2.0_f32: -; GCN: 
v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}} define amdgpu_kernel void @commute_oge_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_oge_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_le_f32_e32 vcc, 2.0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -360,9 +734,23 @@ define amdgpu_kernel void @commute_oge_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_olt_2.0_f32: -; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}} define amdgpu_kernel void @commute_olt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_olt_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid 
@@ -373,9 +761,23 @@ define amdgpu_kernel void @commute_olt_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ole_2.0_f32: -; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}} define amdgpu_kernel void @commute_ole_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ole_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ge_f32_e32 vcc, 2.0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -386,9 +788,23 @@ define amdgpu_kernel void @commute_ole_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_one_2.0_f32: -; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}} define amdgpu_kernel void @commute_one_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_one_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lg_f32_e32 vcc, 2.0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call 
i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -399,9 +815,23 @@ define amdgpu_kernel void @commute_one_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ord_2.0_f32: -; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] define amdgpu_kernel void @commute_ord_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ord_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -412,9 +842,23 @@ define amdgpu_kernel void @commute_ord_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ueq_2.0_f32: -; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}} define amdgpu_kernel void @commute_ueq_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ueq_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_cmp_nlg_f32_e32 vcc, 2.0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -425,9 +869,23 @@ define amdgpu_kernel void @commute_ueq_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ugt_2.0_f32: -; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}} define amdgpu_kernel void @commute_ugt_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ugt_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_nge_f32_e32 vcc, 2.0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -438,9 +896,23 @@ define amdgpu_kernel void @commute_ugt_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_uge_2.0_f32: -; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}} define amdgpu_kernel void @commute_uge_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_uge_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: 
s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 2.0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -451,9 +923,23 @@ define amdgpu_kernel void @commute_uge_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ult_2.0_f32: -; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}} define amdgpu_kernel void @commute_ult_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ult_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_nle_f32_e32 vcc, 2.0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -464,9 +950,23 @@ define amdgpu_kernel void @commute_ult_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ule_2.0_f32: -; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}} define amdgpu_kernel void @commute_ule_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ule_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: 
s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 2.0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -477,9 +977,23 @@ define amdgpu_kernel void @commute_ule_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_une_2.0_f32: -; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}} define amdgpu_kernel void @commute_une_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_une_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_neq_f32_e32 vcc, 2.0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -490,9 +1004,23 @@ define amdgpu_kernel void @commute_une_2.0_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_uno_2.0_f32: -; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] define amdgpu_kernel void 
@commute_uno_2.0_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_uno_2.0_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -507,10 +1035,24 @@ define amdgpu_kernel void @commute_uno_2.0_f32(ptr addrspace(1) %out, ptr addrsp ; f64 compares ; -------------------------------------------------------------------------------- - -; GCN-LABEL: {{^}}commute_oeq_2.0_f64: -; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_oeq_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_oeq_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_eq_f64_e32 vcc, 2.0, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr 
addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -521,10 +1063,24 @@ define amdgpu_kernel void @commute_oeq_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } - -; GCN-LABEL: {{^}}commute_ogt_2.0_f64: -; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ogt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ogt_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lt_f64_e32 vcc, 2.0, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -535,9 +1091,24 @@ define amdgpu_kernel void @commute_ogt_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_oge_2.0_f64: -; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_oge_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_oge_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] 
+; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_le_f64_e32 vcc, 2.0, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -548,9 +1119,24 @@ define amdgpu_kernel void @commute_oge_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_olt_2.0_f64: -; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_olt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_olt_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_gt_f64_e32 vcc, 2.0, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -561,9 +1147,24 @@ define amdgpu_kernel void @commute_olt_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ole_2.0_f64: -; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ole_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ole_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; 
GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ge_f64_e32 vcc, 2.0, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -574,9 +1175,24 @@ define amdgpu_kernel void @commute_ole_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_one_2.0_f64: -; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_one_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_one_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_lg_f64_e32 vcc, 2.0, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -587,9 +1203,24 @@ define amdgpu_kernel void @commute_one_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ord_2.0_f64: -; GCN: v_cmp_o_f64_e32 vcc, 
[[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] define amdgpu_kernel void @commute_ord_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ord_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_o_f64_e32 vcc, v[3:4], v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -600,9 +1231,24 @@ define amdgpu_kernel void @commute_ord_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ueq_2.0_f64: -; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ueq_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ueq_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_nlg_f64_e32 vcc, 2.0, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in 
= getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -613,9 +1259,24 @@ define amdgpu_kernel void @commute_ueq_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ugt_2.0_f64: -; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ugt_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ugt_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_nge_f64_e32 vcc, 2.0, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -626,9 +1287,24 @@ define amdgpu_kernel void @commute_ugt_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_uge_2.0_f64: -; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_uge_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_uge_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; 
GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ngt_f64_e32 vcc, 2.0, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -639,9 +1315,24 @@ define amdgpu_kernel void @commute_uge_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ult_2.0_f64: -; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ult_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ult_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_nle_f64_e32 vcc, 2.0, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -652,9 +1343,24 @@ define amdgpu_kernel void @commute_ult_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_ule_2.0_f64: -; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ule_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_ule_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 
0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, 2.0, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -665,9 +1371,24 @@ define amdgpu_kernel void @commute_ule_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}commute_une_2.0_f64: -; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_une_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_une_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_neq_f64_e32 vcc, 2.0, v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -678,9 +1399,24 @@ define amdgpu_kernel void @commute_une_2.0_f64(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: 
{{^}}commute_uno_2.0_f64: -; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; GCN-LABEL: commute_uno_2.0_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[3:4], v[3:4] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -696,12 +1432,27 @@ define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrsp ; Without commuting the frame index in the pre-regalloc run of ; SIShrinkInstructions, this was using the VOP3 compare. 
-; GCN-LABEL: {{^}}commute_frameindex: -; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} - -; GCN: s_mov_b32 [[FI:s[0-9]+]], 0{{$}} -; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}} define amdgpu_kernel void @commute_frameindex(ptr addrspace(1) nocapture %out) #0 { +; GCN-LABEL: commute_frameindex: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; GCN-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN-NEXT: s_add_u32 s12, s12, s11 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm entry: %stack0 = alloca i32, addrspace(5) %ptr0 = load volatile ptr addrspace(5), ptr addrspace(1) poison diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 00f74f50a4b8b..52c90817dddd1 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1623,15 +1623,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: v_mov_b32_e32 v1, 0xffff ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_min_u32_e32 v2, 32, v2 -; VI-NEXT: v_add_u32_e32 v2, vcc, -16, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -1669,11 +1664,7 @@ define 
amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo +; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1684,11 +1675,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-GISEL-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v2, -16, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v2, -16 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm @@ -1700,13 +1690,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v2, -16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, 
s[0:1] ; GFX11-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %valptr diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 92ece0d007fe2..773369b7a5beb 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -652,14 +652,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v1 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v3 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 +; GFX9-GISEL-NEXT: v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -760,16 +759,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 32 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v2 -; GFX9-GISEL-NEXT: 
v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 +; GFX9-GISEL-NEXT: v_cmp_ne_u16_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index f0c278a67c8bc..7f83fc571bf29 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -1402,15 +1402,10 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; VI-NEXT: v_mov_b32_e32 v1, 0xffff ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -1448,10 +1443,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1 -; GFX10-NEXT: v_cmp_ne_u32_sdwa vcc_lo, v1, v0 src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo +; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1463,9 +1455,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 
v2, 0x10000, v1 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-GISEL-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 3c45596fba14b..6b1551a88df5c 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -629,9 +629,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 32 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-GISEL-NEXT: v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -731,9 +730,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 32 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-GISEL-NEXT: v_cmp_ne_u16_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 @@ -1460,13 
+1458,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 0xff ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x100, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -1503,14 +1496,14 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v3 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -1557,19 +1550,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_readfirstlane_b32 s2, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s3, v0 -; VI-NEXT: s_lshl_b32 s2, s2, 8 -; 
VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_or_b32 s3, s2, 0x10000 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_ff1_i32_b32 s3, s3 -; VI-NEXT: s_cmp_lg_u32 s2, 0 -; VI-NEXT: s_cselect_b32 s2, s3, 0xffff +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1611,9 +1597,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v2 -; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 +; GFX9-GISEL-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index 07a7d8d20c439..7262724064918 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -647,12 +647,7 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) { ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: s_xor_b32 s2, s2, 0x8000 -; VI-NEXT: s_xor_b32 s3, s3, 0x8000 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_lshl_b32 s3, s3, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 +; VI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -731,11 +726,9 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr 
addrspace(1) %out, i32 %arg) ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: s_xor_b32 s3, s3, 0x8000 -; VI-NEXT: s_xor_b32 s2, s2, 0x8000 ; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_add_f16_e64 v1, s2, 2.0 -; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_sub_f16_e64 v1, 2.0, s2 +; VI-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.path.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll similarity index 93% rename from llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.path.ll rename to llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll index 5a5e39489d888..e5815e96fbe33 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.path.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll @@ -12,6 +12,20 @@ define <2 x half> @v_test_cvt_v2f32_v2f16(<2 x float> %src) { ret <2 x half> %res } +define half @fptrunc_v2f32_v2f16_then_extract(<2 x float> %src) { +; GFX950-LABEL: fptrunc_v2f32_v2f16_then_extract: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1 +; GFX950-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX950-NEXT: s_setpc_b64 s[30:31] + %vec_half = fptrunc <2 x float> %src to <2 x half> + %first = extractelement <2 x half> %vec_half, i64 1 + %second = extractelement <2 x half> %vec_half, i64 0 + %res = fadd half %first, %second + ret half %res +} + define <2 x half> @v_test_cvt_v2f64_v2f16(<2 x double> %src) { ; GFX950-SDAG-LABEL: v_test_cvt_v2f64_v2f16: ; GFX950-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll 
b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll index 92d0a05f35732..b2ebf2e33e29f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll @@ -23,10 +23,10 @@ ;. define amdgpu_kernel void @k0() #0 { ; CHECK-LABEL: @k0() #0 -; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3), align 2, !alias.scope !1, !noalias !4 -; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2), align 4, !alias.scope !8, !noalias !9 -; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 16, !alias.scope !10, !noalias !11 -; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 16, !alias.scope !12, !noalias !13 +; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3), align 2, !alias.scope !2, !noalias !5 +; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2), align 4, !alias.scope !9, !noalias !10 +; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 16, !alias.scope !11, !noalias !12 +; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 16, !alias.scope !13, !noalias !14 ; CHECK-NEXT: ret void store i8 1, ptr addrspace(3) @lds.size.1.align.1, align 1 @@ -41,9 +41,9 @@ define amdgpu_kernel void @k0() #0 { define amdgpu_kernel void @k1() #0 { ; CHECK-LABEL: @k1() #1 -; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds 
(%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2), align 4, !alias.scope !14, !noalias !17 -; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1), align 16, !alias.scope !20, !noalias !21 -; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 16, !alias.scope !22, !noalias !23 +; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2), align 4, !alias.scope !15, !noalias !18 +; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1), align 16, !alias.scope !21, !noalias !22 +; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 16, !alias.scope !23, !noalias !24 ; CHECK-NEXT: ret void ; store i8 2, ptr addrspace(3) @lds.size.2.align.2, align 2 @@ -83,8 +83,8 @@ define amdgpu_kernel void @calls_f0() { define void @f0() { ; CHECK-LABEL: define void @f0() -; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.module.lds.t, ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 8, !noalias !24 -; CHECK-NEXT: store i8 8, ptr addrspace(3) @llvm.amdgcn.module.lds, align 8, !noalias !29 +; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.module.lds.t, ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 8, !noalias !25 +; CHECK-NEXT: store i8 8, ptr addrspace(3) @llvm.amdgcn.module.lds, align 8, !noalias !30 ; CHECK-NEXT: ret void store i8 1, ptr addrspace(3) @lds.size.1.align.1, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll index 70142fa4b5b29..37ae05bfab86f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll +++ 
b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll @@ -16,10 +16,10 @@ ;. define amdgpu_kernel void @k0() { ; CHECK-LABEL: @k0( -; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3), align 2, !alias.scope !1, !noalias !4 -; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2), align 4, !alias.scope !8, !noalias !9 -; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 16, !alias.scope !10, !noalias !11 -; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 16, !alias.scope !12, !noalias !13 +; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3), align 2, !alias.scope !2, !noalias !5 +; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2), align 4, !alias.scope !9, !noalias !10 +; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 16, !alias.scope !11, !noalias !12 +; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 16, !alias.scope !13, !noalias !14 ; CHECK-NEXT: ret void store i8 1, ptr addrspace(3) @lds.size.1.align.1, align 1 @@ -34,9 +34,9 @@ define amdgpu_kernel void @k0() { define amdgpu_kernel void @k1() { ; CHECK-LABEL: @k1( -; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2), align 4, !alias.scope !14, !noalias !17 -; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds 
(%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1), align 16, !alias.scope !20, !noalias !21 -; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 16, !alias.scope !22, !noalias !23 +; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2), align 4, !alias.scope !15, !noalias !18 +; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1), align 16, !alias.scope !21, !noalias !22 +; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 16, !alias.scope !23, !noalias !24 ; CHECK-NEXT: ret void ; store i8 2, ptr addrspace(3) @lds.size.2.align.2, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll index 4ab05c2923fdb..dbab9e520f989 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -34,19 +34,20 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce ; GCN-NEXT: s_endpgm ; CHECK-LABEL: define protected amdgpu_kernel void @test( ; CHECK-SAME: ptr addrspace(1) captures(none) [[PTR_COERCE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, align 4, !alias.scope !1, !noalias !4 -; CHECK-NEXT: tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef align 1 dereferenceable(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TEST_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), ptr addrspace(3) noundef align 1 dereferenceable(3) @llvm.amdgcn.kernel.test.lds, i64 3, i1 false), !alias.scope !6, !noalias !7 -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds 
([[LLVM_AMDGCN_KERNEL_TEST_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), align 4, !alias.scope !4, !noalias !1 +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, align 4, !alias.scope [[META2:![0-9]+]], !noalias [[META5:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef align 1 dereferenceable(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TEST_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), ptr addrspace(3) noundef align 1 dereferenceable(3) @llvm.amdgcn.kernel.test.lds, i64 3, i1 false), !alias.scope [[META7:![0-9]+]], !noalias [[META8:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TEST_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), align 4, !alias.scope [[META5]], !noalias [[META2]] ; CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 3 -; CHECK-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, align 4, !alias.scope !1, !noalias !4 -; CHECK-NEXT: tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef align 1 dereferenceable(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TEST_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), ptr addrspace(3) noundef align 1 dereferenceable(3) @llvm.amdgcn.kernel.test.lds, i64 3, i1 false), !alias.scope !6, !noalias !7 -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TEST_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), align 4, !alias.scope !4, !noalias !1 +; CHECK-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, align 4, !alias.scope [[META2]], !noalias [[META5]] +; CHECK-NEXT: tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef align 1 dereferenceable(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TEST_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, 
i32 0, i32 2), ptr addrspace(3) noundef align 1 dereferenceable(3) @llvm.amdgcn.kernel.test.lds, i64 3, i1 false), !alias.scope [[META7]], !noalias [[META8]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_TEST_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.test.lds, i32 0, i32 2), align 4, !alias.scope [[META5]], !noalias [[META2]] ; CHECK-NEXT: [[CMP_I_I19:%.*]] = icmp eq i8 [[TMP1]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[CMP_I_I19]], [[CMP_I_I]] ; CHECK-NEXT: [[FROMBOOL8:%.*]] = zext i1 [[TMP2]] to i8 ; CHECK-NEXT: store i8 [[FROMBOOL8]], ptr addrspace(1) [[PTR_COERCE]], align 1 ; CHECK-NEXT: ret void +; entry: store i8 3, ptr addrspace(3) @_f1, align 1 tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef align 1 dereferenceable(3) @_f2, ptr addrspace(3) noundef align 1 dereferenceable(3) @_f1, i64 3, i1 false) @@ -63,17 +64,15 @@ entry: } declare void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #1 - ;. ; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="7" } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ;. -; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 1} -; CHECK: [[META1:![0-9]+]] = !{!2} -; CHECK: [[META2:![0-9]+]] = distinct !{!2, !3} -; CHECK: [[META3:![0-9]+]] = distinct !{!3} -; CHECK: [[META4:![0-9]+]] = !{!5} -; CHECK: [[META5:![0-9]+]] = distinct !{!5, !3} -; CHECK: [[META6:![0-9]+]] = !{!5, !2} -; CHECK: [[META7:![0-9]+]] = !{} +; CHECK: [[META2]] = !{[[META3:![0-9]+]]} +; CHECK: [[META3]] = distinct !{[[META3]], [[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]]} +; CHECK: [[META5]] = !{[[META6:![0-9]+]]} +; CHECK: [[META6]] = distinct !{[[META6]], [[META4]]} +; CHECK: [[META7]] = !{[[META6]], [[META3]]} +; CHECK: [[META8]] = !{} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll index 154c798a44f93..7437ce347d1a5 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll @@ -9,12 +9,12 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(ptr addrs ; CHECK-LABEL: define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa( ; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: bb: -; CHECK-NEXT: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, align 16, !tbaa [[TBAA1:![0-9]+]], !noalias !6 +; CHECK-NEXT: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, align 16, !tbaa [[TBAA2:![0-9]+]], !noalias [[META7:![0-9]+]] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 [[I]] -; CHECK-NEXT: [[VAL_A:%.*]] = load i32, ptr addrspace(3) [[GEP_A]], align 4, !tbaa [[TBAA1]], !noalias !6 -; CHECK-NEXT: store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_PREEXISTING_AA_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), align 16, !tbaa [[TBAA1]], !noalias !11 +; CHECK-NEXT: [[VAL_A:%.*]] = load i32, ptr addrspace(3) [[GEP_A]], align 4, !tbaa [[TBAA2]], !noalias [[META7]] +; CHECK-NEXT: store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_PREEXISTING_AA_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), align 16, !tbaa [[TBAA2]], !noalias [[META12:![0-9]+]] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds 
([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_PREEXISTING_AA_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), i32 0, i32 [[I]] -; CHECK-NEXT: [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !tbaa [[TBAA1]], !noalias !11 +; CHECK-NEXT: [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !tbaa [[TBAA2]], !noalias [[META12]] ; CHECK-NEXT: [[VAL:%.*]] = add i32 [[VAL_A]], [[VAL_B]] ; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[ARG]], align 4 ; CHECK-NEXT: ret void @@ -42,17 +42,18 @@ bb: !8 = !{!"omnipotent char", !9, i64 0} !9 = !{!"Simple C++ TBAA"} -; CHECK:!0 = !{i32 0, i32 1} -; CHECK:!1 = !{!2, !3, i64 0} -; CHECK:!2 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !3, i64 0} -; CHECK:!3 = !{!"int", !4, i64 0} -; CHECK:!4 = !{!"omnipotent char", !5, i64 0} -; CHECK:!5 = !{!"Simple C++ TBAA"} -; CHECK:!6 = !{!7, !9} -; CHECK:!7 = distinct !{!7, !8} -; CHECK:!8 = distinct !{!8} -; CHECK:!9 = distinct !{!9, !10} -; CHECK:!10 = distinct !{!10} -; CHECK:!11 = !{!12, !13} -; CHECK:!12 = distinct !{!12, !8} -; CHECK:!13 = distinct !{!13, !10} +; CHECK: !0 = !{i32 0, i32 1} +; CHECK: !1 = !{i32 1, !"amdgpu.lowered_lds", i32 1} +; CHECK: !2 = !{!3, !4, i64 0} +; CHECK: !3 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !4, i64 0} +; CHECK: !4 = !{!"int", !5, i64 0} +; CHECK: !5 = !{!"omnipotent char", !6, i64 0} +; CHECK: !6 = !{!"Simple C++ TBAA"} +; CHECK: !7 = !{!8, !10} +; CHECK: !8 = distinct !{!8, !9} +; CHECK: !9 = distinct !{!9} +; CHECK: !10 = distinct !{!10, !11} +; CHECK: !11 = distinct !{!11} +; CHECK: !12 = !{!13, !14} +; CHECK: !13 = distinct !{!13, !9} +; CHECK: !14 = distinct !{!14, !11} diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll index 24c1bfb8d50f0..927ef687bc8d9 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll @@ -14,12 
+14,12 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x2(ptr addrspace(1) %arg, i ; CHECK-LABEL: define amdgpu_kernel void @no_clobber_ds_load_stores_x2( ; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: bb: -; CHECK-NEXT: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds, align 16, !alias.scope !1, !noalias !4 +; CHECK-NEXT: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds, align 16, !alias.scope !2, !noalias !5 ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds, i32 0, i32 [[I]] -; CHECK-NEXT: [[VAL_A:%.*]] = load i32, ptr addrspace(3) [[GEP_A]], align 4, !alias.scope !1, !noalias !4 -; CHECK-NEXT: store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds, i32 0, i32 1), align 16, !alias.scope !4, !noalias !1 +; CHECK-NEXT: [[VAL_A:%.*]] = load i32, ptr addrspace(3) [[GEP_A]], align 4, !alias.scope !2, !noalias !5 +; CHECK-NEXT: store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds, i32 0, i32 1), align 16, !alias.scope !5, !noalias !2 ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2.lds, i32 0, i32 1), i32 0, i32 [[I]] -; CHECK-NEXT: [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !alias.scope !4, !noalias !1 +; CHECK-NEXT: [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !alias.scope !5, !noalias !2 ; CHECK-NEXT: [[VAL:%.*]] = add i32 [[VAL_A]], [[VAL_B]] ; CHECK-NEXT: store i32 [[VAL]], ptr 
addrspace(1) [[ARG]], align 4 ; CHECK-NEXT: ret void @@ -58,15 +58,15 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i ; CHECK-LABEL: define amdgpu_kernel void @no_clobber_ds_load_stores_x3( ; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: bb: -; CHECK-NEXT: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, align 16, !alias.scope !6, !noalias !9 +; CHECK-NEXT: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, align 16, !alias.scope !7, !noalias !10 ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 [[I]] -; CHECK-NEXT: [[VAL_A:%.*]] = load i32, ptr addrspace(3) [[GEP_A]], align 4, !alias.scope !6, !noalias !9 -; CHECK-NEXT: store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X3_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 1), align 16, !alias.scope !12, !noalias !13 +; CHECK-NEXT: [[VAL_A:%.*]] = load i32, ptr addrspace(3) [[GEP_A]], align 4, !alias.scope !7, !noalias !10 +; CHECK-NEXT: store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X3_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 1), align 16, !alias.scope !13, !noalias !14 ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X3_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 1), i32 0, i32 [[I]] -; CHECK-NEXT: [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !alias.scope !12, !noalias !13 -; CHECK-NEXT: store i32 3, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X3_LDS_T]], ptr 
addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 2), align 16, !alias.scope !14, !noalias !15 +; CHECK-NEXT: [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !alias.scope !13, !noalias !14 +; CHECK-NEXT: store i32 3, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X3_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 2), align 16, !alias.scope !15, !noalias !16 ; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X3_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x3.lds, i32 0, i32 2), i32 0, i32 [[I]] -; CHECK-NEXT: [[VAL_C:%.*]] = load i32, ptr addrspace(3) [[GEP_C]], align 4, !alias.scope !14, !noalias !15 +; CHECK-NEXT: [[VAL_C:%.*]] = load i32, ptr addrspace(3) [[GEP_C]], align 4, !alias.scope !15, !noalias !16 ; CHECK-NEXT: [[VAL_1:%.*]] = add i32 [[VAL_A]], [[VAL_B]] ; CHECK-NEXT: [[VAL:%.*]] = add i32 [[VAL_1]], [[VAL_C]] ; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[ARG]], align 4 @@ -111,18 +111,19 @@ bb: } ; CHECK: !0 = !{i32 0, i32 1} -; CHECK: !1 = !{!2} -; CHECK: !2 = distinct !{!2, !3} -; CHECK: !3 = distinct !{!3} -; CHECK: !4 = !{!5} -; CHECK: !5 = distinct !{!5, !3} -; CHECK: !6 = !{!7} -; CHECK: !7 = distinct !{!7, !8} -; CHECK: !8 = distinct !{!8} -; CHECK: !9 = !{!10, !11} -; CHECK: !10 = distinct !{!10, !8} -; CHECK: !11 = distinct !{!11, !8} -; CHECK: !12 = !{!10} -; CHECK: !13 = !{!7, !11} -; CHECK: !14 = !{!11} -; CHECK: !15 = !{!7, !10} +; CHECK: !1 = !{i32 1, !"amdgpu.lowered_lds", i32 1} +; CHECK: !2 = !{!3} +; CHECK: !3 = distinct !{!3, !4} +; CHECK: !4 = distinct !{!4} +; CHECK: !5 = !{!6} +; CHECK: !6 = distinct !{!6, !4} +; CHECK: !7 = !{!8} +; CHECK: !8 = distinct !{!8, !9} +; CHECK: !9 = distinct !{!9} +; CHECK: !10 = !{!11, !12} +; CHECK: !11 = distinct !{!11, !9} +; CHECK: !12 = 
distinct !{!12, !9} +; CHECK: !13 = !{!11} +; CHECK: !14 = !{!8, !12} +; CHECK: !15 = !{!12} +; CHECK: !16 = !{!8, !11} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll index 4fcad258d4a74..9edaa72fa55bb 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-all-indirect-accesses.ll @@ -9,7 +9,8 @@ @B = external addrspace(3) global [0 x i32] define amdgpu_kernel void @kernel_0() { -; CHECK-LABEL: define amdgpu_kernel void @kernel_0() #0 !llvm.amdgcn.lds.kernel.id !1 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_0 +; CHECK-SAME: () #[[ATTR0:[0-9]+]] {{.*}}.amdgcn.lds.kernel.id [[META2:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_0.lds) ] ; CHECK-NEXT: call void @call_store_A() ; CHECK-NEXT: ret void @@ -19,7 +20,8 @@ define amdgpu_kernel void @kernel_0() { } define amdgpu_kernel void @kernel_1() { -; CHECK-LABEL: define amdgpu_kernel void @kernel_1() !llvm.amdgcn.lds.kernel.id !2 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_1 +; CHECK-SAME: () {{.*}}.amdgcn.lds.kernel.id [[META3:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_1.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() ; CHECK-NEXT: ret void @@ -29,7 +31,8 @@ define amdgpu_kernel void @kernel_1() { } define amdgpu_kernel void @kernel_2() { -; CHECK-LABEL: define amdgpu_kernel void @kernel_2() #0 !llvm.amdgcn.lds.kernel.id !3 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_2 +; CHECK-SAME: () #[[ATTR0]] {{.*}}.amdgcn.lds.kernel.id [[META4:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_2.lds) ] ; CHECK-NEXT: call void @store_A() ; CHECK-NEXT: ret void @@ -39,7 +42,8 @@ define amdgpu_kernel void @kernel_2() { } define 
amdgpu_kernel void @kernel_3() { -; CHECK-LABEL: define amdgpu_kernel void @kernel_3() !llvm.amdgcn.lds.kernel.id !4 { +; CHECK-LABEL: define amdgpu_kernel void @kernel_3 +; CHECK-SAME: () {{.*}}.amdgcn.lds.kernel.id [[META5:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_3.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() ; CHECK-NEXT: ret void diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll index a553375cb51e0..a98e170a68b8a 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll @@ -13,22 +13,12 @@ @dynamic_shared8 = external addrspace(3) global [0 x i64], align 8 ; CHECK: %llvm.amdgcn.module.lds.t = type { i32 } -; CHECK: @dynamic_kernel_only = external addrspace(3) global [0 x double] -; CHECK: @dynamic_shared8 = external addrspace(3) global [0 x i64], align 8 -; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol !0 -; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" ; Alignment of these must be the maximum of the alignment of the reachable symbols -; CHECK: @llvm.amdgcn.expect_align1.dynlds = external addrspace(3) global [0 x i8], align 1, !absolute_symbol !0 -; CHECK: @llvm.amdgcn.expect_align2.dynlds = external addrspace(3) global [0 x i8], align 2, !absolute_symbol !0 -; CHECK: @llvm.amdgcn.expect_align4.dynlds = external addrspace(3) global [0 x i8], align 4, !absolute_symbol !1 -; CHECK: @llvm.amdgcn.expect_align8.dynlds = external addrspace(3) global [0 x i8], align 8, !absolute_symbol !0 ; Align 4 and symbol at 
address [4 5) as module.lds is reachable -; CHECK: @llvm.amdgcn.expect_max_of_2_and_4.dynlds = external addrspace(3) global [0 x i8], align 4, !absolute_symbol !1 ; Builds a lookup table out of the newly created (suffixed .dynlds) variables in kernel.id order -; CHECK: @llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant [5 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align1.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align2.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds to i32)] @@ -113,7 +103,8 @@ define void @use_shared8() #0 { ; The kernels are annotated with kernel.id and llvm.donothing use of the corresponding variable define amdgpu_kernel void @expect_align1() { -; CHECK-LABEL: define amdgpu_kernel void @expect_align1() !llvm.amdgcn.lds.kernel.id !2 { +; CHECK-LABEL: define amdgpu_kernel void @expect_align1( +; CHECK-SAME: ) {{.*}}.amdgcn.lds.kernel.id [[META3:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align1.dynlds) ] ; CHECK-NEXT: call void @use_shared1() ; CHECK-NEXT: ret void @@ -123,7 +114,8 @@ define amdgpu_kernel void @expect_align1() { } define amdgpu_kernel void @expect_align2() { -; CHECK-LABEL: define amdgpu_kernel void @expect_align2() !llvm.amdgcn.lds.kernel.id !3 { +; CHECK-LABEL: define amdgpu_kernel void @expect_align2( +; CHECK-SAME: ) {{.*}}.amdgcn.lds.kernel.id [[META4:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align2.dynlds) ] ; CHECK-NEXT: call void @use_shared2() ; CHECK-NEXT: ret void @@ -134,7 +126,7 @@ define amdgpu_kernel void @expect_align2() { define amdgpu_kernel void @expect_align4() { ; CHECK-LABEL: define amdgpu_kernel void @expect_align4( -; CHECK-SAME: ) 
#[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id !4 { +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {{.*}}.amdgcn.lds.kernel.id [[META5:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds) ] ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: call void @use_shared4() @@ -146,7 +138,8 @@ define amdgpu_kernel void @expect_align4() { ; Use dynamic_shared directly too. define amdgpu_kernel void @expect_align8() { -; CHECK-LABEL: define amdgpu_kernel void @expect_align8() !llvm.amdgcn.lds.kernel.id !5 { +; CHECK-LABEL: define amdgpu_kernel void @expect_align8( +; CHECK-SAME: ) {{.*}}.amdgcn.lds.kernel.id [[META6:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds) ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9 ; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 8 @@ -162,7 +155,7 @@ define amdgpu_kernel void @expect_align8() { ; Note: use_shared4 uses module.lds so this will allocate at offset 4 define amdgpu_kernel void @expect_max_of_2_and_4() { ; CHECK-LABEL: define amdgpu_kernel void @expect_max_of_2_and_4( -; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id !6 { +; CHECK-SAME: ) #[[ATTR1]] {{.*}}.amdgcn.lds.kernel.id [[META7:![0-9]+]] { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds) ] ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: call void @use_shared2() @@ -178,10 +171,8 @@ define amdgpu_kernel void @expect_max_of_2_and_4() { attributes #0 = { noinline } ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) -; CHECK: declare void @llvm.donothing() #2 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) 
-; CHECK: declare noundef i32 @llvm.amdgcn.lds.kernel.id() #3 ; CHECK: attributes #0 = { noinline } ; CHECK: attributes #1 = { "amdgpu-lds-size"="4,4" } @@ -190,8 +181,8 @@ attributes #0 = { noinline } ; CHECK: !0 = !{i32 0, i32 1} ; CHECK: !1 = !{i32 4, i32 5} -; CHECK: !2 = !{i32 0} -; CHECK: !3 = !{i32 1} -; CHECK: !4 = !{i32 2} -; CHECK: !5 = !{i32 3} -; CHECK: !6 = !{i32 4} +; CHECK: !3 = !{i32 0} +; CHECK: !4 = !{i32 1} +; CHECK: !5 = !{i32 2} +; CHECK: !6 = !{i32 3} +; CHECK: !7 = !{i32 4} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index 2a7553ae5d92b..4aa92ce85adef 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -195,7 +195,7 @@ define amdgpu_kernel void @k01() { define amdgpu_kernel void @k23() { ; OPT-LABEL: @k23( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]] +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]] ; OPT-NEXT: call void @f2() ; OPT-NEXT: call void @f3() ; OPT-NEXT: ret void @@ -245,12 +245,12 @@ define amdgpu_kernel void @k23() { ; Access and allocate three variables define amdgpu_kernel void @k123() { ; OPT-LABEL: @k123( -; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope [[META11:![0-9]+]], !noalias [[META14:![0-9]+]] +; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope [[META12:![0-9]+]], !noalias [[META15:![0-9]+]] ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; OPT-NEXT: call void @f1() -; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr 
inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META14]], !noalias [[META11]] +; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META15]], !noalias [[META12]] ; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 8 -; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META14]], !noalias [[META11]] +; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META15]], !noalias [[META12]] ; OPT-NEXT: call void @f2() ; OPT-NEXT: ret void ; @@ -312,18 +312,19 @@ define amdgpu_kernel void @k123() { ; OPT: !1 = !{i32 4, i32 5} ; OPT: !2 = !{i32 8, i32 9} ; OPT: !3 = !{i32 1, !"amdhsa_code_object_version", i32 500} -; OPT: !4 = !{i32 1} -; OPT: !5 = !{!6} -; OPT: !6 = distinct !{!6, !7} -; OPT: !7 = distinct !{!7} -; OPT: !8 = !{!9} -; OPT: !9 = distinct !{!9, !7} -; OPT: !10 = !{i32 0} -; OPT: !11 = !{!12} -; OPT: !12 = distinct !{!12, !13} -; OPT: !13 = distinct !{!13} -; OPT: !14 = !{!15} -; OPT: !15 = distinct !{!15, !13} +; OPT: !4 = !{i32 1, !"amdgpu.lowered_lds", i32 1} +; OPT: !5 = !{i32 1} +; OPT: !6 = !{!7} +; OPT: !7 = distinct !{!7, !8} +; OPT: !8 = distinct !{!8} +; OPT: !9 = !{!10} +; OPT: !10 = distinct !{!10, !8} +; OPT: !11 = !{i32 0} +; OPT: !12 = !{!13} +; OPT: !13 = distinct !{!13, !14} +; OPT: !14 = distinct !{!14} +; OPT: !15 = !{!16} +; OPT: !16 = distinct !{!16, !14} attributes #0 = { "amdgpu-lds-size"="8" } attributes #1 = { "amdgpu-lds-size"="16" } diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 
5a9259efc0cc8..9a93b1d8b5909 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -20,13 +20,13 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s2, 0xffff -; VI-NEXT: s_lshr_b32 s2, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_lshr_b32 s2, s2, s5 -; VI-NEXT: s_lshr_b32 s3, s4, s3 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_lshr_b32 s4, s3, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_lshr_b32 s4, s5, s4 +; VI-NEXT: s_lshr_b32 s2, s2, s3 +; VI-NEXT: s_lshl_b32 s3, s4, 16 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index d2f4f54cefe78..05ffaf62ff1e0 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -454,15 +454,15 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; VI-LABEL: s_test_imin_sle_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[8:9], 0x28 -; VI-NEXT: s_load_dword s3, s[8:9], 0x4c +; VI-NEXT: s_load_dword s2, s[8:9], 0x4c +; VI-NEXT: s_load_dword s3, s[8:9], 0x28 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 -; VI-NEXT: s_min_i32 s2, s2, s3 +; VI-NEXT: s_min_i32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -472,14 +472,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; ; GFX9-LABEL: s_test_imin_sle_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s2, s[8:9], 
0x28 -; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x4c +; GFX9-NEXT: s_load_dword s3, s[8:9], 0x28 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i8 s2, s2 ; GFX9-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-NEXT: s_min_i32 s2, s2, s3 +; GFX9-NEXT: s_min_i32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -487,14 +487,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX10-LABEL: s_test_imin_sle_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28 -; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x4c +; GFX10-NEXT: s_load_dword s3, s[8:9], 0x28 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i8 s2, s2 ; GFX10-NEXT: s_sext_i32_i8 s3, s3 -; GFX10-NEXT: s_min_i32 s2, s2, s3 +; GFX10-NEXT: s_min_i32 s2, s3, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -502,15 +502,15 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; GFX11-LABEL: s_test_imin_sle_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28 -; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x28 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sext_i32_i8 s2, s2 ; GFX11-NEXT: s_sext_i32_i8 s3, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_i32 s2, s2, s3 +; GFX11-NEXT: s_min_i32 s2, s3, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] ; 
GFX11-NEXT: s_endpgm @@ -614,30 +614,32 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI-LABEL: s_test_imin_sle_v4i8: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x28 -; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c +; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 24 -; VI-NEXT: s_bfe_i32 s5, s2, 0x80010 -; VI-NEXT: s_bfe_i32 s6, s2, 0x80008 -; VI-NEXT: s_sext_i32_i8 s2, s2 -; VI-NEXT: s_ashr_i32 s7, s3, 24 -; VI-NEXT: s_bfe_i32 s8, s3, 0x80010 -; VI-NEXT: s_bfe_i32 s9, s3, 0x80008 +; VI-NEXT: s_ashr_i32 s6, s3, 24 +; VI-NEXT: s_min_i32 s4, s4, s6 +; VI-NEXT: s_bfe_i32 s6, s3, 0x80010 +; VI-NEXT: s_bfe_i32 s8, s2, 0x80010 +; VI-NEXT: s_min_i32 s6, s8, s6 +; VI-NEXT: s_sext_i32_i16 s5, s2 +; VI-NEXT: s_sext_i32_i16 s7, s3 +; VI-NEXT: s_lshl_b32 s4, s4, 8 +; VI-NEXT: s_and_b32 s6, s6, 0xff +; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_ashr_i32 s6, s7, 8 +; VI-NEXT: s_ashr_i32 s5, s5, 8 ; VI-NEXT: s_sext_i32_i8 s3, s3 +; VI-NEXT: s_sext_i32_i8 s2, s2 +; VI-NEXT: s_min_i32 s5, s5, s6 ; VI-NEXT: s_min_i32 s2, s2, s3 -; VI-NEXT: s_min_i32 s3, s6, s9 -; VI-NEXT: s_min_i32 s5, s5, s8 -; VI-NEXT: s_min_i32 s4, s4, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xff -; VI-NEXT: s_lshl_b32 s3, s3, 8 +; VI-NEXT: s_lshl_b32 s5, s5, 8 ; VI-NEXT: s_and_b32 s2, s2, 0xff -; VI-NEXT: s_lshl_b32 s4, s4, 24 -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_or_b32 s2, s2, s5 +; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_or_b32 s2, s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -649,36 +651,35 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX9-LABEL: s_test_imin_sle_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c ; GFX9-NEXT: 
s_load_dword s2, s[8:9], 0x28 +; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshr_b32 s8, s3, 16 -; GFX9-NEXT: s_ashr_i32 s9, s3, 24 -; GFX9-NEXT: s_ashr_i32 s6, s2, 24 -; GFX9-NEXT: s_bfe_i32 s8, s8, 0x80000 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_bfe_i32 s5, s5, 0x80000 +; GFX9-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NEXT: s_sext_i32_i16 s7, s3 -; GFX9-NEXT: v_min_i16_e32 v1, s6, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: s_lshr_b32 s7, s7, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_min_i16_e32 v2, s5, v2 -; GFX9-NEXT: s_lshr_b32 s4, s4, 8 -; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80000 -; GFX9-NEXT: v_min_i16_e32 v2, s4, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX9-NEXT: v_min_i16_e32 v3, s2, v3 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_ashr_i32 s7, s7, 8 +; GFX9-NEXT: s_ashr_i32 s5, s5, 8 +; GFX9-NEXT: s_ashr_i32 s4, s2, 24 +; GFX9-NEXT: s_ashr_i32 s6, s3, 24 +; GFX9-NEXT: s_min_i32 s5, s5, s7 +; GFX9-NEXT: s_sext_i32_i8 s7, s3 +; GFX9-NEXT: s_sext_i32_i8 s8, s2 +; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80010 +; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX9-NEXT: s_min_i32 s7, s8, s7 +; GFX9-NEXT: s_min_i32 s4, s4, s6 +; GFX9-NEXT: s_min_i32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_or_b32 s5, s7, s5 +; GFX9-NEXT: 
s_or_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s2, s5, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -688,111 +689,70 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28 ; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s4, s2 +; GFX10-NEXT: s_sext_i32_i16 s5, s2 ; GFX10-NEXT: s_sext_i32_i16 s7, s3 -; GFX10-NEXT: s_ashr_i32 s6, s2, 24 -; GFX10-NEXT: s_ashr_i32 s9, s3, 24 -; GFX10-NEXT: s_lshr_b32 s4, s4, 8 -; GFX10-NEXT: s_lshr_b32 s7, s7, 8 -; GFX10-NEXT: v_min_i16 v0, s6, s9 -; GFX10-NEXT: v_min_i16 v1, s4, s7 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_lshr_b32 s8, s3, 16 -; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80000 -; GFX10-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX10-NEXT: s_bfe_i32 s4, s8, 0x80000 -; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX10-NEXT: v_min_i16 v2, s5, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX10-NEXT: v_min_i16 v3, s2, s3 -; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: s_ashr_i32 s4, s2, 24 +; GFX10-NEXT: s_ashr_i32 s6, s3, 24 +; GFX10-NEXT: s_sext_i32_i8 s8, s3 +; GFX10-NEXT: s_sext_i32_i8 s9, s2 +; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80010 +; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX10-NEXT: s_ashr_i32 s7, s7, 8 +; GFX10-NEXT: s_ashr_i32 s5, s5, 8 +; GFX10-NEXT: 
s_min_i32 s8, s9, s8 +; GFX10-NEXT: s_min_i32 s4, s4, s6 +; GFX10-NEXT: s_min_i32 s2, s2, s3 +; GFX10-NEXT: s_min_i32 s3, s5, s7 +; GFX10-NEXT: s_and_b32 s5, s8, 0xff +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_or_b32 s3, s5, s3 +; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_test_imin_sle_v4i8: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x28 -; GFX11-TRUE16-NEXT: s_load_b32 s1, s[4:5], 0x4c -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_sext_i32_i16 s2, s0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11-TRUE16-NEXT: s_sext_i32_i16 s7, s1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 16 -; GFX11-TRUE16-NEXT: s_ashr_i32 s6, s0, 24 -; GFX11-TRUE16-NEXT: s_ashr_i32 s9, s1, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 8 -; GFX11-TRUE16-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX11-TRUE16-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s7, 8 -; GFX11-TRUE16-NEXT: s_bfe_i32 s8, s8, 0x80000 -; GFX11-TRUE16-NEXT: s_bfe_i32 s1, s1, 0x80000 -; GFX11-TRUE16-NEXT: v_min_i16 v0.l, s6, s9 -; GFX11-TRUE16-NEXT: v_min_i16 v1.l, s3, s8 -; GFX11-TRUE16-NEXT: v_min_i16 v2.l, s2, s7 -; GFX11-TRUE16-NEXT: v_min_i16 v3.l, s0, s1 -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v2, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_test_imin_sle_v4i8: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x28 -; GFX11-FAKE16-NEXT: s_load_b32 s1, s[4:5], 0x4c -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_sext_i32_i16 s2, s0 -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11-FAKE16-NEXT: s_sext_i32_i16 s7, s1 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s1, 16 -; GFX11-FAKE16-NEXT: s_ashr_i32 s6, s0, 24 -; GFX11-FAKE16-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX11-FAKE16-NEXT: s_ashr_i32 s9, s1, 24 -; GFX11-FAKE16-NEXT: s_bfe_i32 s1, s1, 0x80000 -; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 8 -; GFX11-FAKE16-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s7, 8 -; GFX11-FAKE16-NEXT: s_bfe_i32 s8, s8, 0x80000 -; GFX11-FAKE16-NEXT: v_min_i16 v0, s6, s9 -; GFX11-FAKE16-NEXT: v_min_i16 v1, s0, s1 -; GFX11-FAKE16-NEXT: v_min_i16 v2, s3, s8 -; GFX11-FAKE16-NEXT: v_min_i16 v3, s2, s7 -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; 
GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_test_imin_sle_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sext_i32_i16 s5, s2 +; GFX11-NEXT: s_sext_i32_i16 s7, s3 +; GFX11-NEXT: s_ashr_i32 s4, s2, 24 +; GFX11-NEXT: s_ashr_i32 s6, s3, 24 +; GFX11-NEXT: s_sext_i32_i8 s8, s3 +; GFX11-NEXT: s_sext_i32_i8 s9, s2 +; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80010 +; GFX11-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX11-NEXT: s_ashr_i32 s7, s7, 8 +; GFX11-NEXT: s_ashr_i32 s5, s5, 8 +; GFX11-NEXT: s_min_i32 s8, s9, s8 +; GFX11-NEXT: s_min_i32 s4, s4, s6 +; GFX11-NEXT: s_min_i32 s2, s2, s3 +; GFX11-NEXT: s_min_i32 s3, s5, s7 +; GFX11-NEXT: s_and_b32 s5, s8, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s3, s5, s3 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %cmp = icmp sle <4 x i8> %a, %b %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b store <4 x i8> %val, ptr addrspace(1) %out @@ -860,11 +820,11 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s4, s2, 16 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_ashr_i32 s5, s3, 16 +; VI-NEXT: s_ashr_i32 s4, s3, 16 +; VI-NEXT: s_ashr_i32 s5, s2, 16 ; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_min_i32 s4, s4, s5 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_min_i32 s4, s5, s4 ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: s_lshl_b32 s3, s4, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff @@ -977,24 +937,24 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s6, s1, 16 -; VI-NEXT: s_sext_i32_i16 s1, s1 -; VI-NEXT: s_ashr_i32 s8, s3, 16 +; VI-NEXT: s_ashr_i32 s6, s3, 16 +; VI-NEXT: s_ashr_i32 s7, s1, 16 ; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_ashr_i32 s7, s0, 16 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_ashr_i32 s9, s2, 16 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_min_i32 s6, s6, s8 +; VI-NEXT: s_sext_i32_i16 s1, s1 +; VI-NEXT: s_min_i32 s6, s7, s6 ; VI-NEXT: s_min_i32 s1, s1, s3 -; VI-NEXT: s_min_i32 s7, s7, s9 -; VI-NEXT: s_min_i32 s0, s0, s2 -; VI-NEXT: s_lshl_b32 s2, s6, 16 +; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s1, s1, s2 -; VI-NEXT: s_lshl_b32 s2, s7, 16 +; VI-NEXT: s_or_b32 s1, s1, s6 +; VI-NEXT: s_ashr_i32 s3, s2, 16 +; VI-NEXT: s_ashr_i32 s6, s0, 16 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_min_i32 s3, s6, s3 +; VI-NEXT: s_min_i32 s0, s0, s2 +; VI-NEXT: s_lshl_b32 s3, s3, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_or_b32 s0, s0, s3 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2768,19 +2728,22 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: flat_load_ushort v4, v[0:1] -; VI-NEXT: flat_load_ushort v5, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_ushort v4, v[2:3] +; VI-NEXT: flat_load_ushort v5, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_and_b32_e32 v6, 0xffff, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; VI-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v7, v6 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; VI-NEXT: flat_store_short v[0:1], v4 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: flat_store_byte v[2:3], v0 @@ -2794,7 +2757,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -2810,29 +2773,54 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; GFX10-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_lt_u32_sdwa vcc_lo, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: 
global_store_byte v0, v2, s[2:3] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_umin_ult_i16_multi_use: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_u16 v1, v0, s[4:5] -; GFX11-NEXT: global_load_u16 v2, v0, s[6:7] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_umin_ult_i16_multi_use: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v0, s[6:7] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v0, s[4:5] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_u32_e32 vcc_lo, v4, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_umin_ult_i16_multi_use: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] +; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, 
s[4:5] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_lt_u32_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-FAKE16-NEXT: s_endpgm %a = load i16, ptr addrspace(1) %aptr, align 2 %b = load i16, ptr addrspace(1) %bptr, align 2 %cmp = icmp ult i16 %a, %b @@ -3197,38 +3185,38 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s10, s3, 16 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_lshr_b32 s11, s2, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_lshr_b32 s12, s1, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_lshr_b32 s13, s0, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_lshr_b32 s14, s7, 16 -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_lshr_b32 s15, s6, 16 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshr_b32 s16, s5, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshr_b32 s17, s4, 16 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_min_u32 s0, s0, s4 -; VI-NEXT: s_min_u32 s4, s13, s17 -; VI-NEXT: s_min_u32 s1, s1, s5 -; VI-NEXT: s_min_u32 s5, s12, s16 -; VI-NEXT: s_min_u32 s2, s2, s6 -; VI-NEXT: s_min_u32 s6, s11, s15 +; VI-NEXT: s_and_b32 s10, s7, 0xffff +; VI-NEXT: s_and_b32 s11, s3, 0xffff +; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b32 s3, s3, 16 ; VI-NEXT: s_min_u32 s3, s3, s7 -; VI-NEXT: s_min_u32 s7, s10, s14 -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; 
VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_or_b32 s3, s3, s7 -; VI-NEXT: s_or_b32 s2, s2, s6 -; VI-NEXT: s_or_b32 s1, s1, s5 -; VI-NEXT: s_or_b32 s0, s0, s4 +; VI-NEXT: s_min_u32 s10, s11, s10 +; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_or_b32 s3, s10, s3 +; VI-NEXT: s_and_b32 s7, s6, 0xffff +; VI-NEXT: s_and_b32 s10, s2, 0xffff +; VI-NEXT: s_lshr_b32 s6, s6, 16 +; VI-NEXT: s_lshr_b32 s2, s2, 16 +; VI-NEXT: s_min_u32 s2, s2, s6 +; VI-NEXT: s_min_u32 s7, s10, s7 +; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_or_b32 s2, s7, s2 +; VI-NEXT: s_and_b32 s6, s5, 0xffff +; VI-NEXT: s_and_b32 s7, s1, 0xffff +; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: s_lshr_b32 s1, s1, 16 +; VI-NEXT: s_min_u32 s1, s1, s5 +; VI-NEXT: s_min_u32 s6, s7, s6 +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_or_b32 s1, s6, s1 +; VI-NEXT: s_and_b32 s5, s4, 0xffff +; VI-NEXT: s_and_b32 s6, s0, 0xffff +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_lshr_b32 s0, s0, 16 +; VI-NEXT: s_min_u32 s0, s0, s4 +; VI-NEXT: s_min_u32 s5, s6, s5 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_or_b32 s0, s5, s0 ; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -3571,9 +3559,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s3, s2 -; VI-NEXT: s_ashr_i32 s2, s2, 16 -; VI-NEXT: s_min_i32 s2, s3, s2 +; VI-NEXT: s_ashr_i32 s3, s2, 16 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -3586,9 +3574,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s3, s2 -; 
GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_min_i32 s2, s3, s2 +; GFX9-NEXT: s_ashr_i32 s3, s2, 16 +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: s_min_i32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -3600,9 +3588,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s3, s2 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_min_i32 s2, s3, s2 +; GFX10-NEXT: s_ashr_i32 s3, s2, 16 +; GFX10-NEXT: s_sext_i32_i16 s2, s2 +; GFX10-NEXT: s_min_i32 s2, s2, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -3614,10 +3602,10 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s3, s2 -; GFX11-NEXT: s_ashr_i32 s2, s2, 16 +; GFX11-NEXT: s_ashr_i32 s3, s2, 16 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_i32 s2, s3, s2 +; GFX11-NEXT: s_min_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/mmra.ll b/llvm/test/CodeGen/AMDGPU/mmra.ll index 444997858bf7a..3e88b93125101 100644 --- a/llvm/test/CodeGen/AMDGPU/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/mmra.ll @@ -14,12 +14,12 @@ define void @fence_loads(ptr %ptr) { ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; CHECK-NEXT: ATOMIC_FENCE 5, 1, mmra !0 - ; CHECK-NEXT: 
[[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !1 - ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr, align 4) + ; CHECK-NEXT: ATOMIC_FENCE 5, 1, mmra !1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !2 + ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (load acquire (s8) from %ir.ptr, align 4) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !2 - ; CHECK-NEXT: FLAT_STORE_BYTE [[COPY3]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (store release (s8) into %ir.ptr, align 4) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !3 + ; CHECK-NEXT: FLAT_STORE_BYTE [[COPY3]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr, mmra !3 :: (store release (s8) into %ir.ptr, align 4) ; CHECK-NEXT: SI_RETURN fence release, !mmra !0 %ld = load atomic i8, ptr %ptr acquire, align 4, !mmra !2 @@ -37,8 +37,8 @@ define void @atomicrmw_acq(ptr %ptr) { ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !1 - ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE killed [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !2 + ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE killed [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (load acquire (s8) from %ir.ptr) ; CHECK-NEXT: SI_RETURN 
%old.2 = atomicrmw add ptr %ptr, i8 0 acquire, !mmra !2 ret void @@ -69,8 +69,8 @@ define void @atomicrmw_rel(ptr %ptr) { ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 255 ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_2]], implicit $exec ; CHECK-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[V_LSHLREV_B32_e64_1]], implicit $exec - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !2 - ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (load (s32) from %ir.AlignedAddr) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !3 + ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr, mmra !3 :: (load (s32) from %ir.AlignedAddr) ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: @@ -83,9 +83,9 @@ define void @atomicrmw_rel(ptr %ptr) { ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_2]], %subreg.sub0, [[PHI1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !2 - ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY4]], killed [[COPY6]], 0, 1, implicit $exec, implicit $flat_scr, mmra !2 :: (load store release monotonic (s32) on %ir.AlignedAddr) - ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec, mmra !2 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !3 + ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY4]], killed [[COPY6]], 0, 1, implicit $exec, implicit $flat_scr, mmra !3 
:: (load store release monotonic (s32) on %ir.AlignedAddr) + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec, mmra !3 ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 @@ -125,8 +125,8 @@ define void @cmpxchg(ptr %ptr) { ; CHECK-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 killed [[V_LSHLREV_B32_e64_1]], implicit $exec ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; CHECK-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_3]], implicit $exec - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !1 - ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load (s32) from %ir.AlignedAddr) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !2 + ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (load (s32) from %ir.AlignedAddr) ; CHECK-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[FLAT_LOAD_DWORD]], [[V_NOT_B32_e32_]], implicit $exec ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF @@ -141,8 +141,8 @@ define void @cmpxchg(ptr %ptr) { ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_OR_B32_e64_]], %subreg.sub0, [[PHI2]], %subreg.sub1 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !1 - ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN 
[[COPY4]], killed [[COPY6]], 0, 1, implicit $exec, implicit $flat_scr, mmra !1 :: (load store acquire acquire (s32) on %ir.AlignedAddr) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !2 + ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY4]], killed [[COPY6]], 0, 1, implicit $exec, implicit $flat_scr, mmra !2 :: (load store acquire acquire (s32) on %ir.AlignedAddr) ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI2]], implicit $exec ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF @@ -248,8 +248,8 @@ define void @atomicrmw_rel_deepcopy(ptr %ptr) { ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 255 ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_2]], implicit $exec ; CHECK-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[V_LSHLREV_B32_e64_1]], implicit $exec - ; CHECK-NEXT: [[COPY31:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !0 - ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY31]], 0, 0, implicit $exec, implicit $flat_scr, mmra !0 :: (load (s32) from %ir.AlignedAddr) + ; CHECK-NEXT: [[COPY31:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !1 + ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY31]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load (s32) from %ir.AlignedAddr) ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: @@ -262,9 +262,9 @@ define void @atomicrmw_rel_deepcopy(ptr %ptr) { ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_2]], %subreg.sub0, [[PHI1]], %subreg.sub1 - ; CHECK-NEXT: 
[[COPY32:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !0 - ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY30]], killed [[COPY32]], 0, 1, implicit $exec, implicit $flat_scr, mmra !0 :: (load store release monotonic (s32) on %ir.AlignedAddr) - ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec, mmra !0 + ; CHECK-NEXT: [[COPY32:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !1 + ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY30]], killed [[COPY32]], 0, 1, implicit $exec, implicit $flat_scr, mmra !1 :: (load store release monotonic (s32) on %ir.AlignedAddr) + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec, mmra !1 ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll new file mode 100644 index 0000000000000..1c2d07c2f7af5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s + +; In this test, V_CNDMASK_B32_e64 gets converted to V_CNDMASK_B32_e32, +; but the expected conversion to SDWA does not occur. This led to a +; compilation error, because the use of $vcc in the resulting +; instruction must be fixed to $vcc_lo for wave32 which only happened +; after the full conversion to SDWA. 
+ +define void @quux(i32 %arg, i1 %arg1, i1 %arg2) { +; CHECK-LABEL: quux: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v1, 1, v1 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %bb3 +; CHECK-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; CHECK-NEXT: v_mul_u32_u24_e32 v1, 5, v1 +; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:3 +; CHECK-NEXT: v_mov_b32_e32 v2, 0xffff +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_and_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; CHECK-NEXT: v_mov_b32_e32 v1, 24 +; CHECK-NEXT: v_mov_b32_e32 v2, 0xff +; CHECK-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; CHECK-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; CHECK-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; CHECK-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; CHECK-NEXT: .LBB0_2: ; %bb9 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: global_store_byte v[2:3], v1, off +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb: + br i1 %arg1, label %bb9, label %bb3 + +bb3: ; preds = %bb + %call = tail call i32 @llvm.amdgcn.workitem.id.x() + %mul = mul i32 %call, 5 + %zext = zext i32 %mul to i64 + %getelementptr = getelementptr i8, ptr addrspace(1) null, i64 %zext + %getelementptr4 = getelementptr i8, ptr addrspace(1) %getelementptr, i64 4 + %load = load i8, ptr addrspace(1) %getelementptr4, align 1 + %getelementptr5 = getelementptr i8, ptr addrspace(1) %getelementptr, i64 3 + %load6 = load i8, ptr addrspace(1) %getelementptr5, 
align 1 + %insertelement = insertelement <5 x i8> poison, i8 %load, i64 4 + %select = select i1 %arg2, <5 x i8> %insertelement, <5 x i8> + %insertelement7 = insertelement <5 x i8> %select, i8 %load6, i64 0 + %icmp = icmp ult i32 0, %arg + %select8 = select i1 %icmp, <5 x i8> zeroinitializer, <5 x i8> %insertelement7 + %shufflevector = shufflevector <5 x i8> zeroinitializer, <5 x i8> %select8, <5 x i32> + br label %bb9 + +bb9: ; preds = %bb3, %bb + %phi = phi <5 x i8> [ %shufflevector, %bb3 ], [ zeroinitializer, %bb ] + %extractelement = extractelement <5 x i8> %phi, i64 0 + store i8 %extractelement, ptr addrspace(1) null, align 1 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir index 4b45c54a3b83d..aef392749498a 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir @@ -230,3 +230,92 @@ body: | $vgpr0 = COPY %3 SI_RETURN implicit $vgpr0 ... 
+ +--- +name: cndmask-not-converted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cndmask-not-converted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr8_sgpr9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 + ; CHECK-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 0, implicit-def $scc + ; CHECK-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[S_CSELECT_B32_]], implicit-def dead $scc + ; CHECK-NEXT: $vcc_lo = COPY [[S_AND_B32_]] + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc_lo + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: [[V_MUL_U32_U24_e64_:%[0-9]+]]:vgpr_32 = V_MUL_U32_U24_e64 [[COPY1]](s32), 5, 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MUL_U32_U24_e64_]], %subreg.sub0, killed [[V_MOV_B32_e32_1]], %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE]], 3, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[GLOBAL_LOAD_USHORT]], 255, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_sdwa:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[V_MOV_B32_e32_2]], 0, [[GLOBAL_LOAD_USHORT]], 0, 6, 0, 6, 0, implicit $exec + ; 
CHECK-NEXT: S_CMP_EQ_U32 [[COPY2]].sub0, 0, implicit-def $scc + ; CHECK-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + ; CHECK-NEXT: $vcc_lo = COPY [[S_CSELECT_B32_1]] + ; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 0, killed [[V_AND_B32_sdwa]], implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 24, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_MOV_B32_e32_3]], 0, [[V_CNDMASK_B32_e32_]], 0, 1, 0, 6, 6, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_CNDMASK_B32_e32_]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 255, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, [[V_CNDMASK_B32_e32_]], 0, [[V_MOV_B32_e32_4]], 0, 6, 0, 5, 6, implicit $exec + ; CHECK-NEXT: [[V_OR_B32_sdwa:%[0-9]+]]:vgpr_32 = V_OR_B32_sdwa 0, [[V_AND_B32_sdwa1]], 0, [[V_LSHRREV_B32_sdwa]], 0, 5, 0, 6, 6, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, [[V_OR_B32_sdwa]], %bb.1 + ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], [[PHI]], 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $vgpr0, $sgpr8_sgpr9 + + %0:sgpr_64 = COPY $sgpr8_sgpr9 + %1:vgpr_32 = COPY $vgpr0 + %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 0, 0 + S_BITCMP1_B32 %2.sub1, 0, implicit-def $scc + %3:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %5:sreg_32 = S_AND_B32 $exec_lo, %3, implicit-def dead $scc + $vcc_lo = COPY %5 + S_CBRANCH_VCCNZ %bb.2, implicit $vcc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + %6:sreg_64 = COPY %2 + 
%7:vgpr_32 = V_MUL_U32_U24_e64 %1(s32), 5, 0, implicit $exec + %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %9:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, killed %8, %subreg.sub1 + %10:vgpr_32 = GLOBAL_LOAD_USHORT %9, 3, 0, implicit $exec + %11:vgpr_32 = V_AND_B32_e64 %10, 255, implicit $exec + %12:vgpr_32 = V_AND_B32_e64 65535, killed %11, implicit $exec + S_CMP_EQ_U32 %6.sub0, 0, implicit-def $scc + %13:sreg_32_xm0_xexec = S_CSELECT_B32 -1, 0, implicit $scc + %14:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, killed %12, %13, implicit $exec + %15:vgpr_32 = V_LSHRREV_B32_e64 24, %14, implicit $exec + %16:vgpr_32 = V_LSHLREV_B16_e64 8, %15, implicit $exec + %17:vgpr_32 = V_LSHRREV_B32_e64 16, %14, implicit $exec + %18:vgpr_32 = V_AND_B32_e64 %17, 255, implicit $exec + %19:vgpr_32 = V_OR_B32_e64 killed %18, killed %16, implicit $exec + %20:vgpr_32 = V_LSHLREV_B32_e64 16, killed %19, implicit $exec + + bb.2: + %21:vgpr_32 = PHI %4, %bb.0, %20, %bb.1 + %22:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + GLOBAL_STORE_BYTE killed %22, %21, 0, 0, implicit $exec + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index d999945948101..38e45042b5ee4 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1787,15 +1787,14 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; NOSDWA-NEXT: flat_load_dword v1, v[0:1] ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 -; NOSDWA-NEXT: s_waitcnt vmcnt(1) -; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) -; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, v1, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v3, v4 -; NOSDWA-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; NOSDWA-NEXT: v_or_b32_e32 v2, v1, v2 +; NOSDWA-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v1 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2 ; NOSDWA-NEXT: s_endpgm @@ -1813,9 +1812,9 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX89-NEXT: flat_load_dword v2, v[2:3] ; GFX89-NEXT: v_mov_b32_e32 v0, s0 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_add_u32_sdwa v3, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX89-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX89-NEXT: v_or_b32_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_add_u32_e32 v3, vcc, v1, v2 +; GFX89-NEXT: v_add_u32_sdwa v1, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX89-NEXT: v_or_b32_sdwa v2, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX89-NEXT: flat_store_dword v[0:1], v2 ; GFX89-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll index 4e3dccb975fe8..cc07ee4ee4780 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -521,13 +521,10 @@ define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(ptr addrspace(1) %o ; FUNC-LABEL: {{^}}s_sext_in_reg_i1_i16: ; GCN: s_load_dword [[VAL:s[0-9]+]] -; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000 -; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] -; SI: buffer_store_short [[VBFE]] +; GCN: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000 +; GCN: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] +; GCN: buffer_store_short [[VBFE]] -; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 -; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} -; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 define amdgpu_kernel void @s_sext_in_reg_i1_i16(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 { %ld = load i32, ptr addrspace(4) %ptr %in = trunc i32 %ld to i16 @@ -622,9 +619,7 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(ptr addrspace(1) %out, i16 % ; SI: v_mov_b32_e32 [[VSEXT:v[0-9]+]], [[SSEXT]] ; SI: buffer_store_short [[VSEXT]] -; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}} -; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} -; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}} +; GFX89: s_bfe_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000 define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(ptr addrspace(1) %out, i16 %in) #0 { %shl = shl i16 %in, 8 %sext = ashr i16 %shl, 8 diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 4b616e836f916..1c5c16d886251 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -27,9 +27,9 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; 
VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_lshl_b32 s0, s0, s1 +; VI-NEXT: s_lshr_b32 s0, s3, 16 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_lshl_b32 s0, s1, s0 ; VI-NEXT: s_lshl_b32 s1, s2, s3 ; VI-NEXT: s_lshl_b32 s0, s0, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index fe47663b11028..6ca8f490ff165 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -22,17 +22,17 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: s_sub_i32 s4, 0, s2 +; VI-NEXT: s_sub_i32 s3, 0, s2 +; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: s_ashr_i32 s5, s2, 16 +; VI-NEXT: s_sub_i32 s4, 0, s4 +; VI-NEXT: s_sext_i32_i16 s3, s3 ; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_sub_i32 s3, 0, s3 ; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_max_i32 s2, s2, s4 -; VI-NEXT: s_max_i32 s3, s5, s3 +; VI-NEXT: s_max_i32 s2, s2, s3 +; VI-NEXT: s_max_i32 s4, s5, s4 ; VI-NEXT: s_add_i32 s2, s2, 2 -; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_lshl_b32 s3, s4, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_or_b32 s2, s3, s2 ; VI-NEXT: s_add_i32 s2, s2, 0x20000 @@ -171,17 +171,17 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val) ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: s_sub_i32 s4, 0, s2 +; VI-NEXT: s_sub_i32 s3, 0, s2 +; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: s_ashr_i32 s5, s2, 16 +; VI-NEXT: s_sub_i32 s4, 0, s4 +; VI-NEXT: s_sext_i32_i16 s3, s3 ; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_sub_i32 s3, 0, s3 ; VI-NEXT: 
s_sext_i32_i16 s4, s4 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_max_i32 s2, s2, s4 -; VI-NEXT: s_max_i32 s3, s5, s3 +; VI-NEXT: s_max_i32 s2, s2, s3 +; VI-NEXT: s_max_i32 s4, s5, s4 ; VI-NEXT: s_add_i32 s2, s2, 2 -; VI-NEXT: s_lshl_b32 s3, s3, 16 +; VI-NEXT: s_lshl_b32 s3, s4, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_or_b32 s2, s3, s2 ; VI-NEXT: s_add_i32 s2, s2, 0x20000 @@ -331,31 +331,31 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_sub_i32 s6, 0, s3 -; VI-NEXT: s_sub_i32 s7, 0, s2 -; VI-NEXT: s_sub_i32 s5, 0, s5 -; VI-NEXT: s_sub_i32 s4, 0, s4 +; VI-NEXT: s_lshr_b32 s7, s2, 16 +; VI-NEXT: s_sub_i32 s7, 0, s7 +; VI-NEXT: s_sub_i32 s4, 0, s3 +; VI-NEXT: s_lshr_b32 s6, s3, 16 ; VI-NEXT: s_ashr_i32 s8, s2, 16 -; VI-NEXT: s_ashr_i32 s9, s3, 16 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_sext_i32_i16 s3, s3 ; VI-NEXT: s_sext_i32_i16 s7, s7 -; VI-NEXT: s_sext_i32_i16 s6, s6 +; VI-NEXT: s_sub_i32 s5, 0, s2 +; VI-NEXT: s_sub_i32 s6, 0, s6 +; VI-NEXT: s_max_i32 s7, s8, s7 +; VI-NEXT: s_ashr_i32 s8, s3, 16 ; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: s_sext_i32_i16 s3, s3 +; VI-NEXT: s_sext_i32_i16 s6, s6 ; VI-NEXT: s_sext_i32_i16 s5, s5 -; VI-NEXT: s_max_i32 s3, s3, s6 -; VI-NEXT: s_max_i32 s2, s2, s7 -; VI-NEXT: s_max_i32 s5, s9, s5 -; VI-NEXT: s_max_i32 s4, s8, s4 -; VI-NEXT: s_add_i32 s2, s2, 2 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_max_i32 s3, s3, s4 +; VI-NEXT: s_max_i32 s6, s8, s6 +; VI-NEXT: s_max_i32 s2, s2, s5 ; VI-NEXT: s_add_i32 s3, s3, 2 -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s4, s6, 16 ; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_add_i32 s2, s2, 2 +; VI-NEXT: s_or_b32 s3, s4, s3 +; VI-NEXT: s_lshl_b32 s4, s7, 16 ; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: 
s_or_b32 s3, s5, s3 ; VI-NEXT: s_or_b32 s2, s4, s2 ; VI-NEXT: s_add_i32 s3, s3, 0x20000 ; VI-NEXT: s_add_i32 s2, s2, 0x20000 @@ -560,23 +560,23 @@ define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_ashr_i32 s0, s4, 16 -; VI-NEXT: s_sext_i32_i16 s1, s4 -; VI-NEXT: s_ashr_i32 s2, s5, 16 +; VI-NEXT: s_ashr_i32 s0, s5, 16 +; VI-NEXT: s_ashr_i32 s1, s4, 16 ; VI-NEXT: s_sext_i32_i16 s3, s5 -; VI-NEXT: s_max_i32 s4, s0, s2 -; VI-NEXT: s_max_i32 s5, s1, s3 -; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_max_i32 s2, s1, s0 +; VI-NEXT: s_max_i32 s5, s4, s3 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_min_i32 s0, s0, s2 -; VI-NEXT: s_min_i32 s1, s1, s3 -; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_min_i32 s0, s1, s0 +; VI-NEXT: s_min_i32 s1, s4, s3 +; VI-NEXT: s_or_b32 s2, s5, s2 ; VI-NEXT: s_lshl_b32 s0, s0, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff ; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -661,12 +661,12 @@ define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_max_i32_sdwa v6, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI-NEXT: v_max_i32_sdwa v7, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_min_i32_sdwa v8, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI-NEXT: v_min_i32_sdwa v4, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_max_i32_sdwa v6, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_i32_sdwa v7, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-NEXT: v_min_i32_sdwa v8, sext(v4), sext(v5) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_i32_sdwa v4, sext(v4), sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[2:3], v4 @@ -748,37 +748,37 @@ define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_ashr_i32 s0, s5, 16 -; VI-NEXT: s_ashr_i32 s1, s4, 16 -; VI-NEXT: s_sext_i32_i16 s2, s5 -; VI-NEXT: s_sext_i32_i16 s3, s4 -; VI-NEXT: s_ashr_i32 s4, s7, 16 -; VI-NEXT: s_ashr_i32 s5, s6, 16 -; VI-NEXT: s_sext_i32_i16 s7, s7 +; VI-NEXT: s_ashr_i32 s0, s7, 16 +; VI-NEXT: s_ashr_i32 s1, s5, 16 +; VI-NEXT: s_sext_i32_i16 s3, s7 +; VI-NEXT: s_sext_i32_i16 s5, s5 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_max_i32 s2, s1, s0 +; VI-NEXT: s_max_i32 s7, s5, s3 +; VI-NEXT: s_lshl_b32 s2, s2, 16 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_or_b32 s2, s7, s2 +; VI-NEXT: s_ashr_i32 s7, s6, 16 +; VI-NEXT: s_ashr_i32 s8, s4, 16 ; VI-NEXT: s_sext_i32_i16 s6, s6 -; VI-NEXT: s_max_i32 s8, 
s1, s5 -; VI-NEXT: s_max_i32 s9, s0, s4 -; VI-NEXT: s_max_i32 s10, s3, s6 -; VI-NEXT: s_max_i32 s11, s2, s7 -; VI-NEXT: s_min_i32 s0, s0, s4 -; VI-NEXT: s_min_i32 s2, s2, s7 +; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: s_min_i32 s0, s1, s0 +; VI-NEXT: s_min_i32 s1, s5, s3 +; VI-NEXT: s_max_i32 s9, s8, s7 +; VI-NEXT: s_max_i32 s10, s4, s6 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 -; VI-NEXT: s_and_b32 s11, s11, 0xffff -; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_and_b32 s10, s10, 0xffff -; VI-NEXT: s_min_i32 s1, s1, s5 -; VI-NEXT: s_min_i32 s3, s3, s6 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_or_b32 s9, s11, s9 -; VI-NEXT: s_or_b32 s8, s10, s8 -; VI-NEXT: s_or_b32 s0, s2, s0 +; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_min_i32 s1, s8, s7 +; VI-NEXT: s_min_i32 s2, s4, s6 +; VI-NEXT: s_or_b32 s9, s10, s9 ; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_and_b32 s2, s3, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: v_mov_b32_e32 v4, s9 ; VI-NEXT: s_or_b32 s1, s2, s1 ; VI-NEXT: v_mov_b32_e32 v6, s1 ; VI-NEXT: v_mov_b32_e32 v7, s0 @@ -899,42 +899,34 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_readfirstlane_b32 s0, v4 -; VI-NEXT: v_readfirstlane_b32 s1, v5 -; VI-NEXT: s_ashr_i32 s3, s0, 16 -; VI-NEXT: s_ashr_i32 s5, s1, 16 -; VI-NEXT: s_cmp_gt_i32 s3, s5 -; VI-NEXT: s_sext_i32_i16 s2, s0 -; VI-NEXT: s_sext_i32_i16 s4, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] -; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; VI-NEXT: s_cselect_b32 s0, s3, s5 -; VI-NEXT: s_cselect_b32 s3, s5, s3 -; VI-NEXT: s_lshl_b32 s5, s0, 
16 -; VI-NEXT: s_cmp_gt_i32 s2, s4 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; VI-NEXT: s_cselect_b32 s0, s2, s4 -; VI-NEXT: s_cselect_b32 s1, s4, s2 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v4 -; VI-NEXT: s_lshl_b32 s2, s3, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s5 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: s_or_b32 s1, s1, s2 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: v_and_b32_e32 v4, 3, v4 -; VI-NEXT: v_mov_b32_e32 v6, s1 -; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_ashrrev_i32_e32 v10, 16, v4 +; VI-NEXT: v_ashrrev_i32_e32 v11, 16, v5 +; VI-NEXT: v_bfe_i32 v6, v4, 0, 16 +; VI-NEXT: v_bfe_i32 v7, v5, 0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, v10, v11 +; VI-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], v6, v7 +; VI-NEXT: v_cndmask_b32_e64 v6, v5, v4, s[0:1] +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; VI-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v5, 1, v5 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: flat_store_dword v[0:1], v6 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dword v[2:3], v6 +; VI-NEXT: v_or_b32_e32 v0, v9, v5 +; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 3, v0 +; VI-NEXT: flat_store_dword v[2:3], v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_byte v[0:1], v4 +; VI-NEXT: flat_store_byte v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; @@ -1020,23 
+1012,23 @@ define amdgpu_kernel void @u_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: s_lshr_b32 s0, s4, 16 -; VI-NEXT: s_lshr_b32 s2, s5, 16 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_and_b32 s1, s4, 0xffff -; VI-NEXT: s_and_b32 s3, s5, 0xffff -; VI-NEXT: s_max_u32 s5, s0, s2 -; VI-NEXT: s_max_u32 s4, s1, s3 +; VI-NEXT: s_lshr_b32 s3, s5, 16 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_and_b32 s0, s5, 0xffff +; VI-NEXT: s_max_u32 s5, s4, s3 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_max_u32 s2, s1, s0 ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_min_u32 s0, s0, s2 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_min_u32 s1, s1, s3 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_min_u32 s0, s1, s0 +; VI-NEXT: s_min_u32 s1, s4, s3 +; VI-NEXT: s_or_b32 s2, s2, s5 +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 67c51286de216..68ed7cecd8ff7 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -187,15 +187,14 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s0, v1 -; VI-NEXT: v_readfirstlane_b32 s1, v0 -; VI-NEXT: s_ashr_i32 s2, s1, 16 -; VI-NEXT: s_sext_i32_i16 s1, s1 +; VI-NEXT: v_readfirstlane_b32 s0, v0 +; VI-NEXT: v_readfirstlane_b32 s1, v1 +; VI-NEXT: s_lshr_b32 s2, s1, 16 ; VI-NEXT: s_ashr_i32 s3, s0, 16 ; 
VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_ashr_i32 s0, s1, s0 -; VI-NEXT: s_ashr_i32 s1, s2, s3 -; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_ashr_i32 s2, s3, s2 +; VI-NEXT: s_ashr_i32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s1, s2, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff ; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -294,28 +293,26 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s4, v2 -; VI-NEXT: v_readfirstlane_b32 s5, v3 -; VI-NEXT: v_readfirstlane_b32 s6, v0 -; VI-NEXT: v_readfirstlane_b32 s7, v1 -; VI-NEXT: s_ashr_i32 s8, s7, 16 -; VI-NEXT: s_sext_i32_i16 s7, s7 +; VI-NEXT: v_readfirstlane_b32 s4, v0 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: v_readfirstlane_b32 s6, v1 +; VI-NEXT: v_readfirstlane_b32 s7, v3 +; VI-NEXT: s_lshr_b32 s8, s7, 16 ; VI-NEXT: s_ashr_i32 s9, s6, 16 ; VI-NEXT: s_sext_i32_i16 s6, s6 -; VI-NEXT: s_ashr_i32 s10, s5, 16 -; VI-NEXT: s_sext_i32_i16 s5, s5 +; VI-NEXT: s_lshr_b32 s10, s5, 16 ; VI-NEXT: s_ashr_i32 s11, s4, 16 ; VI-NEXT: s_sext_i32_i16 s4, s4 -; VI-NEXT: s_ashr_i32 s4, s6, s4 -; VI-NEXT: s_ashr_i32 s6, s9, s11 -; VI-NEXT: s_ashr_i32 s5, s7, s5 -; VI-NEXT: s_ashr_i32 s7, s8, s10 +; VI-NEXT: s_ashr_i32 s8, s9, s8 +; VI-NEXT: s_ashr_i32 s6, s6, s7 +; VI-NEXT: s_ashr_i32 s7, s11, s10 +; VI-NEXT: s_ashr_i32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s8, 16 +; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_or_b32 s4, s4, s7 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll 
index 44e403854217e..42bd2ff8797a1 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -116,23 +116,21 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: s_load_dword s3, s[4:5], 0x0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_load_dword s6, s[2:3], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_sub_i32 s2, s2, s3 -; VI-NEXT: s_sub_i32 s0, s0, s1 -; VI-NEXT: s_and_b32 s1, s2, 0xffff -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_sub_i32 s5, s6, s4 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_lshr_b32 s6, s6, 16 +; VI-NEXT: s_sub_i32 s4, s6, s4 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_sub_v2i16: @@ -230,9 +228,9 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_sub_i32 s0, s0, s1 +; VI-NEXT: s_lshr_b32 s0, s3, 16 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_sub_i32 s0, s1, s0 ; VI-NEXT: s_sub_i32 s1, s2, s3 ; VI-NEXT: s_lshl_b32 s0, s0, 16 ; VI-NEXT: s_and_b32 s1, s1, 0xffff diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll 
b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 86fc0ace2c43f..6ab3022a91cd7 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -162,10 +162,11 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 -; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 -; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 +; SDAG-VI-NEXT: s_sext_i32_i16 s2, s2 +; SDAG-VI-NEXT: s_sext_i32_i16 s3, s3 +; SDAG-VI-NEXT: v_med3_i32 v1, s2, 0, v0 +; SDAG-VI-NEXT: v_med3_i32 v0, s3, 0, v0 +; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -689,12 +690,12 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16 -; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 -; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 -; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 -; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 +; SDAG-VI-NEXT: s_ashr_i32 s3, s2, 16 +; SDAG-VI-NEXT: s_sext_i32_i16 s2, s2 +; SDAG-VI-NEXT: v_med3_i32 v1, s2, 0, v0 +; SDAG-VI-NEXT: v_med3_i32 v0, s3, 0, v0 +; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 ; SDAG-VI-NEXT: flat_store_dword v[0:1], v2 diff --git 
a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index d41720e19c217..77d1e6c2593c1 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -317,7 +317,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s0, 0xffffff00 +; VI-NEXT: s_and_b32 s1, s0, 0xff00 ; VI-NEXT: s_add_i32 s0, s0, 12 ; VI-NEXT: s_or_b32 s0, s0, 4 ; VI-NEXT: s_and_b32 s0, s0, 0xff diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll index af50e09f509a3..c77828aa5606f 100644 --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI %s ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s ; R600: {{^}}s_mad_zext_i32_to_i64: @@ -53,7 +53,8 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i ; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}} ; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}} -; GCN: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]] +; VI: s_cmp_eq_u32 [[MASK_B]], [[MASK_A]] +; SI: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]] ; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]] ; GCN: buffer_store_short [[RESULT]] 
diff --git a/llvm/test/CodeGen/ARM/nofpclass.ll b/llvm/test/CodeGen/ARM/nofpclass.ll new file mode 100644 index 0000000000000..aaeb6c11fa598 --- /dev/null +++ b/llvm/test/CodeGen/ARM/nofpclass.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=armv8-unknown-none-eabi < %s | FileCheck %s --check-prefixes=CHECK,HARD +; RUN: llc -mtriple=armv8-unknown-none-eabi -mattr=+soft-float < %s | FileCheck %s --check-prefixes=CHECK,SOFT + +define nofpclass(nan inf) half @f1(half returned nofpclass(nan inf) %x) { +; CHECK-LABEL: f1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + ret half %x +} + +define noundef half @f2(half nofpclass(nan) %a) { +; HARD-LABEL: f2: +; HARD: @ %bb.0: @ %entry +; HARD-NEXT: vmov.f32 s0, #1.000000e+00 +; HARD-NEXT: vmov s2, r0 +; HARD-NEXT: vcvtb.f32.f16 s2, s2 +; HARD-NEXT: vadd.f32 s0, s2, s0 +; HARD-NEXT: vcvtb.f16.f32 s0, s0 +; HARD-NEXT: vmov r0, s0 +; HARD-NEXT: bx lr +; +; SOFT-LABEL: f2: +; SOFT: @ %bb.0: @ %entry +; SOFT-NEXT: .save {r11, lr} +; SOFT-NEXT: push {r11, lr} +; SOFT-NEXT: uxth r0, r0 +; SOFT-NEXT: bl __aeabi_h2f +; SOFT-NEXT: mov r1, #1065353216 +; SOFT-NEXT: bl __aeabi_fadd +; SOFT-NEXT: bl __aeabi_f2h +; SOFT-NEXT: pop {r11, pc} +entry: + %0 = fadd half %a, 0xH3C00 + ret half %0 +} diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/disable-opt-cs.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/disable-opt-cs.ll index 8bf242fdbec67..3f2ec9a85b2a0 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/disable-opt-cs.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/disable-opt-cs.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: ; Disable shader optimizations ; CHECK: ; Shader Flags for Module Functions -; CHECK: ; Function main : 0x00000000 +; CHECK: ; Function main : 0x00000001 ; The test source in this file generated from the following command: ; clang -cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -O0 -o - <&1 %s | FileCheck %s +; RUN: 
llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC + +; Check that when the dx.nativelowprec module flag is not specified, the +; module-level shader flag UseNativeLowPrecision is not set, and the +; MinimumPrecision feature flag is set due to the presence of half and i16 +; without native low precision. target triple = "dxil-pc-shadermodel6.7-library" @@ -6,25 +12,33 @@ target triple = "dxil-pc-shadermodel6.7-library" ;CHECK-NEXT: ; Shader Flags Value: 0x00000020 ;CHECK-NEXT: ; ;CHECK-NEXT: ; Note: shader requires additional functionality: +;CHECK-NEXT: ; Minimum-precision data types ;CHECK-NEXT: ; Note: extra DXIL module flags: -;CHECK-NEXT: ; Low-precision data types +;CHECK-NEXT: ; Low-precision data types present ;CHECK-NEXT: ; ;CHECK-NEXT: ; Shader Flags for Module Functions ;CHECK-LABEL: ; Function add_i16 : 0x00000020 -define i16 @add_i16(i16 %a, i16 %b) { +define i16 @add_i16(i16 %a, i16 %b) "hlsl.export" { %sum = add i16 %a, %b ret i16 %sum } ;CHECK-LABEL: ; Function add_i32 : 0x00000000 -define i32 @add_i32(i32 %a, i32 %b) { +define i32 @add_i32(i32 %a, i32 %b) "hlsl.export" { %sum = add i32 %a, %b ret i32 %sum } ;CHECK-LABEL: ; Function add_half : 0x00000020 -define half @add_half(half %a, half %b) { +define half @add_half(half %a, half %b) "hlsl.export" { %sum = fadd half %a, %b ret half %sum } + +; DXC: - Name: SFI0 +; DXC-NEXT: Size: 8 +; DXC-NEXT: Flags: +; DXC: MinimumPrecision: true +; DXC: NativeLowPrecision: false +; DXC: ... 
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/max-64-uavs-array-valver1.5.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/max-64-uavs-array-valver1.5.ll index af6001be1f610..6079071919dbc 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/max-64-uavs-array-valver1.5.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/max-64-uavs-array-valver1.5.ll @@ -13,7 +13,7 @@ target triple = "dxil-pc-shadermodel6.7-library" ; CHECK: UAVs at every shader stage ; CHECK-NOT: 64 UAV slots -; CHECK: Function test : 0x00000000 +; CHECK: Function test : 0x00010000 define void @test() "hlsl.export" { ; RWBuffer Buf : register(u0, space0) %buf0 = call target("dx.TypedBuffer", float, 1, 0, 1) diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/max-64-uavs-array-valver1.6.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/max-64-uavs-array-valver1.6.ll index 7e1d73b31b35b..4f1a1e7b3da53 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/max-64-uavs-array-valver1.6.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/max-64-uavs-array-valver1.6.ll @@ -14,7 +14,7 @@ target triple = "dxil-pc-shadermodel6.7-library" ; CHECK: UAVs at every shader stage ; CHECK: 64 UAV slots -; CHECK: Function test : 0x00000000 +; CHECK: Function test : 0x00018000 define void @test() "hlsl.export" { ; RWBuffer Buf : register(u0, space0) %buf0 = call target("dx.TypedBuffer", float, 1, 0, 1) diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/max-64-uavs.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/max-64-uavs.ll index a97fe5d45d00a..e0d4ac737704e 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/max-64-uavs.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/max-64-uavs.ll @@ -13,7 +13,7 @@ target triple = "dxil-pc-shadermodel6.7-library" ; CHECK: 64 UAV slots ; Note: 64 UAV slots does not get set per-function -; CHECK: Function test : 0x00000000 +; CHECK: Function test : 0x00008000 define void @test() "hlsl.export" { ; RWBuffer Buf : register(u0, space0) %buf0 = call target("dx.TypedBuffer", float, 1, 0, 1) diff --git 
a/llvm/test/CodeGen/DirectX/ShaderFlags/res-may-not-alias-sm6.7.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/res-may-not-alias-sm6.7.ll index 934319557a11f..c0ea6b4c1e8f3 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/res-may-not-alias-sm6.7.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/res-may-not-alias-sm6.7.ll @@ -2,8 +2,8 @@ ; This test checks to ensure the behavior of the DXIL shader flag analysis ; for the flag ResMayNotAlias is correct when the DXIL Version is >= 1.7 and the -; DXIL Validator Version < 1.8. The ResMayNotAlias flag (0x20000000) should be -; set on all functions if there are one or more UAVs present globally in the +; DXIL Validator Version < 1.8. The ResMayNotAlias module flag (0x20000000) +; should be set if there are one or more UAVs present globally in the ; module. target triple = "dxil-pc-shadermodel6.7-library" @@ -19,7 +19,7 @@ target triple = "dxil-pc-shadermodel6.7-library" ; CHECK: Any UAV may not alias any other UAV ; -; CHECK: Function loadUAV : 0x200000000 +; CHECK: Function loadUAV : 0x200010000 define float @loadUAV() #0 { %res = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) @@ -29,7 +29,7 @@ define float @loadUAV() #0 { ret float %val } -; CHECK: Function loadSRV : 0x200000010 +; CHECK: Function loadSRV : 0x200010010 define float @loadSRV() #0 { %res = tail call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/uavs-at-every-stage-lib-valver1.7.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/uavs-at-every-stage-lib-valver1.7.ll index 552f513095fa5..bd1de58732183 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/uavs-at-every-stage-lib-valver1.7.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/uavs-at-every-stage-lib-valver1.7.ll @@ -12,7 +12,7 @@ target triple = "dxil-pc-shadermodel6.5-library" ; CHECK: Note: shader requires additional 
functionality: ; CHECK: UAVs at every shader stage -; CHECK: Function test : 0x00000000 +; CHECK: Function test : 0x00010000 define void @test() "hlsl.export" { ; RWBuffer Buf : register(u0, space0) %buf0 = call target("dx.TypedBuffer", float, 1, 0, 1) diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/uavs-at-every-stage-vs.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/uavs-at-every-stage-vs.ll index d3f556b62ed0c..bdb07b41d1559 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/uavs-at-every-stage-vs.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/uavs-at-every-stage-vs.ll @@ -13,7 +13,7 @@ target triple = "dxil-pc-shadermodel6.5-vertex" ; CHECK: Note: shader requires additional functionality: ; CHECK: UAVs at every shader stage -; CHECK: Function VSMain : 0x00000000 +; CHECK: Function VSMain : 0x00010000 define void @VSMain() { ; RWBuffer Buf : register(u0, space0) %buf0 = call target("dx.TypedBuffer", float, 1, 0, 1) diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/use-native-low-precision-0.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/use-native-low-precision-0.ll index c537a01482f39..2e68fe375a42c 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/use-native-low-precision-0.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/use-native-low-precision-0.ll @@ -1,7 +1,9 @@ ; RUN: opt -S --passes="print-dx-shader-flags" 2>&1 %s | FileCheck %s +; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC ; Check that when the dx.nativelowprec module flag is set to 0, the module-level -; shader flag UseNativeLowPrecision is not set +; shader flag UseNativeLowPrecision is not set, and the MinimumPrecision feature +; flag is set due to the presence of half and i16 without native low precision. 
target triple = "dxil-pc-shadermodel6.7-library" @@ -9,29 +11,37 @@ target triple = "dxil-pc-shadermodel6.7-library" ;CHECK-NEXT: ; Shader Flags Value: 0x00000020 ;CHECK-NEXT: ; ;CHECK-NEXT: ; Note: shader requires additional functionality: +;CHECK-NEXT: ; Minimum-precision data types ;CHECK-NEXT: ; Note: extra DXIL module flags: -;CHECK-NEXT: ; Low-precision data types -;CHECK-NOT: ; Native 16bit types enabled +;CHECK-NEXT: ; Low-precision data types present +;CHECK-NOT: ; Enable native low-precision data types ;CHECK-NEXT: ; ;CHECK-NEXT: ; Shader Flags for Module Functions ;CHECK-LABEL: ; Function add_i16 : 0x00000020 -define i16 @add_i16(i16 %a, i16 %b) { +define i16 @add_i16(i16 %a, i16 %b) "hlsl.export" { %sum = add i16 %a, %b ret i16 %sum } ;CHECK-LABEL: ; Function add_i32 : 0x00000000 -define i32 @add_i32(i32 %a, i32 %b) { +define i32 @add_i32(i32 %a, i32 %b) "hlsl.export" { %sum = add i32 %a, %b ret i32 %sum } ;CHECK-LABEL: ; Function add_half : 0x00000020 -define half @add_half(half %a, half %b) { +define half @add_half(half %a, half %b) "hlsl.export" { %sum = fadd half %a, %b ret half %sum } !llvm.module.flags = !{!0} !0 = !{i32 1, !"dx.nativelowprec", i32 0} + +; DXC: - Name: SFI0 +; DXC-NEXT: Size: 8 +; DXC-NEXT: Flags: +; DXC: MinimumPrecision: true +; DXC: NativeLowPrecision: false +; DXC: ... 
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/use-native-low-precision-1.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/use-native-low-precision-1.ll index 07c4b9064d666..ba844b182fd11 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/use-native-low-precision-1.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/use-native-low-precision-1.ll @@ -1,4 +1,9 @@ ; RUN: opt -S --passes="print-dx-shader-flags" 2>&1 %s | FileCheck %s +; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC + +; Check that when the dx.nativelowprec module flag is set to 1, the module-level +; shader flag UseNativeLowPrecision is set, and the NativeLowPrecision feature +; flag is set target triple = "dxil-pc-shadermodel6.7-library" @@ -6,32 +11,37 @@ target triple = "dxil-pc-shadermodel6.7-library" ;CHECK-NEXT: ; Shader Flags Value: 0x00800020 ;CHECK-NEXT: ; ;CHECK-NEXT: ; Note: shader requires additional functionality: +;CHECK-NEXT: ; Native low-precision data types ;CHECK-NEXT: ; Note: extra DXIL module flags: -;CHECK-NEXT: ; Low-precision data types -;CHECK-NEXT: ; Use native low precision +;CHECK-NEXT: ; Low-precision data types present +;CHECK-NEXT: ; Enable native low-precision data types ;CHECK-NEXT: ; ;CHECK-NEXT: ; Shader Flags for Module Functions ;CHECK-LABEL: ; Function add_i16 : 0x00800020 -define i16 @add_i16(i16 %a, i16 %b) { +define i16 @add_i16(i16 %a, i16 %b) "hlsl.export" { %sum = add i16 %a, %b ret i16 %sum } -; NOTE: The flag for native low precision (0x80000) is set for every function -; in the module regardless of whether or not the function uses low precision -; data types (flag 0x20). 
This matches the behavior in DXC ;CHECK-LABEL: ; Function add_i32 : 0x00800000 -define i32 @add_i32(i32 %a, i32 %b) { +define i32 @add_i32(i32 %a, i32 %b) "hlsl.export" { %sum = add i32 %a, %b ret i32 %sum } ;CHECK-LABEL: ; Function add_half : 0x00800020 -define half @add_half(half %a, half %b) { +define half @add_half(half %a, half %b) "hlsl.export" { %sum = fadd half %a, %b ret half %sum } !llvm.module.flags = !{!0} !0 = !{i32 1, !"dx.nativelowprec", i32 1} + +; DXC: - Name: SFI0 +; DXC-NEXT: Size: 8 +; DXC-NEXT: Flags: +; DXC: MinimumPrecision: false +; DXC: NativeLowPrecision: true +; DXC: ... diff --git a/llvm/test/CodeGen/DirectX/llc-pipeline.ll b/llvm/test/CodeGen/DirectX/llc-pipeline.ll index 06aa6adbf0fb7..088040a491bdc 100644 --- a/llvm/test/CodeGen/DirectX/llc-pipeline.ll +++ b/llvm/test/CodeGen/DirectX/llc-pipeline.ll @@ -16,12 +16,13 @@ ; CHECK-NEXT: DXIL Finalize Linkage ; CHECK-NEXT: DXIL Resource Binding Analysis ; CHECK-NEXT: DXIL Resource Implicit Binding +; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: DXIL Resource Access ; CHECK-NEXT: DXIL Intrinsic Expansion ; CHECK-NEXT: DXIL CBuffer Access ; CHECK-NEXT: DXIL Data Scalarization ; CHECK-NEXT: DXIL Array Flattener ; CHECK-NEXT: FunctionPass Manager -; CHECK-NEXT: DXIL Resource Access ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Scalarize vector operations ; CHECK-NEXT: DXIL Forward Handle Accesses diff --git a/llvm/test/CodeGen/Hexagon/fminmax-v67.ll b/llvm/test/CodeGen/Hexagon/fminmax-v67.ll index d5b803c6c8926..ba4fcb5afdba3 100644 --- a/llvm/test/CodeGen/Hexagon/fminmax-v67.ll +++ b/llvm/test/CodeGen/Hexagon/fminmax-v67.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: t1 -; CHECK: dfmax +; CHECK: call fmax define dso_local double @t1(double %a, double %b) local_unnamed_addr { entry: @@ -11,7 +11,7 @@ entry: } ; CHECK-LABEL: t2 -; CHECK: dfmin +; CHECK: call fmin define dso_local double @t2(double %a, double %b) local_unnamed_addr { entry: @@ -20,7 +20,7 @@ entry: } ; CHECK-LABEL: t3 -; CHECK: 
sfmax +; CHECK: call fmaxf define dso_local float @t3(float %a, float %b) local_unnamed_addr { entry: @@ -29,7 +29,7 @@ entry: } ; CHECK-LABEL: t4 -; CHECK: sfmin +; CHECK: call fminf define dso_local float @t4(float %a, float %b) local_unnamed_addr { entry: @@ -37,6 +37,43 @@ entry: ret float %0 } +; CHECK-LABEL: t1num +; CHECK: dfmax + +define dso_local double @t1num(double %a, double %b) local_unnamed_addr { +entry: + %0 = tail call double @llvm.maximumnum.f64(double %a, double %b) + ret double %0 +} + +; CHECK-LABEL: t2num +; CHECK: dfmin + +define dso_local double @t2num(double %a, double %b) local_unnamed_addr { +entry: + %0 = tail call double @llvm.minimumnum.f64(double %a, double %b) + ret double %0 +} + +; CHECK-LABEL: t3num +; CHECK: sfmax + +define dso_local float @t3num(float %a, float %b) local_unnamed_addr { +entry: + %0 = tail call float @llvm.maximumnum.f32(float %a, float %b) + ret float %0 +} + +; CHECK-LABEL: t4num +; CHECK: sfmin + +define dso_local float @t4num(float %a, float %b) local_unnamed_addr { +entry: + %0 = tail call float @llvm.minimumnum.f32(float %a, float %b) + ret float %0 +} + + declare double @llvm.minnum.f64(double, double) #1 declare double @llvm.maxnum.f64(double, double) #1 declare float @llvm.maxnum.f32(float, float) #1 diff --git a/llvm/test/CodeGen/Hexagon/fminmax.ll b/llvm/test/CodeGen/Hexagon/fminmax.ll index a581bd3b21868..2aae79e6b9bf3 100644 --- a/llvm/test/CodeGen/Hexagon/fminmax.ll +++ b/llvm/test/CodeGen/Hexagon/fminmax.ll @@ -3,22 +3,55 @@ target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" target triple = "hexagon" -; CHECK-LABEL: minimum -; CHECK: sfmin -define float @minimum(float %x, float %y) #0 { +; CHECK-LABEL: cfminf +; CHECK: call fminf +define float @cfminf(float %x, float %y) #0 { entry: %call = tail call float @fminf(float %x, float %y) #1 ret float %call } -; CHECK-LABEL: maximum -; 
CHECK: sfmax -define float @maximum(float %x, float %y) #0 { +; CHECK-LABEL: cfmaxf +; CHECK: call fmaxf +define float @cfmaxf(float %x, float %y) #0 { entry: %call = tail call float @fmaxf(float %x, float %y) #1 ret float %call } +; CHECK-LABEL: minnum +; CHECK: call fminf +define float @minnum(float %x, float %y) #0 { +entry: + %call = tail call float @llvm.minnum.f32(float %x, float %y) #1 + ret float %call +} + +; CHECK-LABEL: maxnum +; CHECK: call fmaxf +define float @maxnum(float %x, float %y) #0 { +entry: + %call = tail call float @llvm.maxnum.f32(float %x, float %y) #1 + ret float %call +} + +; CHECK-LABEL: minimumnum +; CHECK: sfmin +define float @minimumnum(float %x, float %y) #0 { +entry: + %call = tail call float @llvm.minimumnum.f32(float %x, float %y) #1 + ret float %call +} + +; CHECK-LABEL: maximumnum +; CHECK: sfmax +define float @maximumnum(float %x, float %y) #0 { +entry: + %call = tail call float @llvm.maximumnum.f32(float %x, float %y) #1 + ret float %call +} + + declare float @fminf(float, float) #0 declare float @fmaxf(float, float) #0 diff --git a/llvm/test/CodeGen/Mips/nofpclass.ll b/llvm/test/CodeGen/Mips/nofpclass.ll new file mode 100644 index 0000000000000..b9737fe1175b9 --- /dev/null +++ b/llvm/test/CodeGen/Mips/nofpclass.ll @@ -0,0 +1,224 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=mipsisa32r6-linux-gnu < %s | FileCheck %s --check-prefix=MIPS32R6 +; RUN: llc --mtriple=mipsisa64r6-linux-gnu < %s | FileCheck %s --check-prefix=MIPS64R6 + +define float @f(float nofpclass(nan) %a, float nofpclass(nan) %b) { +; MIPS32R6-LABEL: f: +; MIPS32R6: # %bb.0: # %entry +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f0, $f12, $f14 +; +; MIPS64R6-LABEL: f: +; MIPS64R6: # %bb.0: # %entry +; MIPS64R6-NEXT: jr $ra +; MIPS64R6-NEXT: max.s $f0, $f12, $f13 +entry: + %cond = tail call float @llvm.maximumnum.f32(float %a, float %b) + ret float %cond +} + +define {float, float} 
@m({float, float} nofpclass(nan) %a0, {float, float} nofpclass(nan) %a1) { +; MIPS32R6-LABEL: m: +; MIPS32R6: # %bb.0: # %entry +; MIPS32R6-NEXT: mtc1 $6, $f0 +; MIPS32R6-NEXT: max.s $f0, $f12, $f0 +; MIPS32R6-NEXT: mtc1 $7, $f1 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f2, $f14, $f1 +; +; MIPS64R6-LABEL: m: +; MIPS64R6: # %bb.0: # %entry +; MIPS64R6-NEXT: max.s $f0, $f12, $f14 +; MIPS64R6-NEXT: jr $ra +; MIPS64R6-NEXT: max.s $f2, $f13, $f15 +entry: + %a0f0 = extractvalue {float, float} %a0, 0 + %a0f1 = extractvalue {float, float} %a0, 1 + %a1f0 = extractvalue {float, float} %a1, 0 + %a1f1 = extractvalue {float, float} %a1, 1 + %max0 = tail call float @llvm.maximumnum.f32(float %a0f0, float %a1f0) + %max1 = tail call float @llvm.maximumnum.f32(float %a0f1, float %a1f1) + %ret0 = insertvalue {float, float} poison, float %max0, 0 + %ret1 = insertvalue {float, float} %ret0, float %max1, 1 + ret {float, float} %ret1 +} + +define [2 x float] @mA([2 x float] nofpclass(nan) %a0, [2 x float] nofpclass(nan) %a1) { +; MIPS32R6-LABEL: mA: +; MIPS32R6: # %bb.0: # %entry +; MIPS32R6-NEXT: mtc1 $6, $f0 +; MIPS32R6-NEXT: max.s $f0, $f12, $f0 +; MIPS32R6-NEXT: mtc1 $7, $f1 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f2, $f14, $f1 +; +; MIPS64R6-LABEL: mA: +; MIPS64R6: # %bb.0: # %entry +; MIPS64R6-NEXT: max.s $f0, $f12, $f14 +; MIPS64R6-NEXT: jr $ra +; MIPS64R6-NEXT: max.s $f2, $f13, $f15 +entry: + %a0f0 = extractvalue [2 x float] %a0, 0 + %a0f1 = extractvalue [2 x float] %a0, 1 + %a1f0 = extractvalue [2 x float] %a1, 0 + %a1f1 = extractvalue [2 x float] %a1, 1 + %max0 = tail call float @llvm.maximumnum.f32(float %a0f0, float %a1f0) + %max1 = tail call float @llvm.maximumnum.f32(float %a0f1, float %a1f1) + %ret0 = insertvalue [2 x float] poison, float %max0, 0 + %ret1 = insertvalue [2 x float] %ret0, float %max1, 1 + ret [2 x float] %ret1 +} + +define float @fS(float nofpclass(snan) %a, float nofpclass(snan) %b) { +; MIPS32R6-LABEL: fS: +; MIPS32R6: # %bb.0: # %entry 
+; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f0, $f12, $f14 +; +; MIPS64R6-LABEL: fS: +; MIPS64R6: # %bb.0: # %entry +; MIPS64R6-NEXT: jr $ra +; MIPS64R6-NEXT: max.s $f0, $f12, $f13 +entry: + %cond = tail call float @llvm.maximumnum.f32(float %a, float %b) + ret float %cond +} + +define {float, float} @mS({float, float} nofpclass(snan) %a0, {float, float} nofpclass(snan) %a1) { +; MIPS32R6-LABEL: mS: +; MIPS32R6: # %bb.0: # %entry +; MIPS32R6-NEXT: mtc1 $6, $f0 +; MIPS32R6-NEXT: max.s $f0, $f12, $f0 +; MIPS32R6-NEXT: mtc1 $7, $f1 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f2, $f14, $f1 +; +; MIPS64R6-LABEL: mS: +; MIPS64R6: # %bb.0: # %entry +; MIPS64R6-NEXT: max.s $f0, $f12, $f14 +; MIPS64R6-NEXT: jr $ra +; MIPS64R6-NEXT: max.s $f2, $f13, $f15 +entry: + %a0f0 = extractvalue {float, float} %a0, 0 + %a0f1 = extractvalue {float, float} %a0, 1 + %a1f0 = extractvalue {float, float} %a1, 0 + %a1f1 = extractvalue {float, float} %a1, 1 + %max0 = tail call float @llvm.maximumnum.f32(float %a0f0, float %a1f0) + %max1 = tail call float @llvm.maximumnum.f32(float %a0f1, float %a1f1) + %ret0 = insertvalue {float, float} poison, float %max0, 0 + %ret1 = insertvalue {float, float} %ret0, float %max1, 1 + ret {float, float} %ret1 +} + +define [2 x float] @mAS([2 x float] nofpclass(snan) %a0, [2 x float] nofpclass(snan) %a1) { +; MIPS32R6-LABEL: mAS: +; MIPS32R6: # %bb.0: # %entry +; MIPS32R6-NEXT: mtc1 $6, $f0 +; MIPS32R6-NEXT: max.s $f0, $f12, $f0 +; MIPS32R6-NEXT: mtc1 $7, $f1 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f2, $f14, $f1 +; +; MIPS64R6-LABEL: mAS: +; MIPS64R6: # %bb.0: # %entry +; MIPS64R6-NEXT: max.s $f0, $f12, $f14 +; MIPS64R6-NEXT: jr $ra +; MIPS64R6-NEXT: max.s $f2, $f13, $f15 +entry: + %a0f0 = extractvalue [2 x float] %a0, 0 + %a0f1 = extractvalue [2 x float] %a0, 1 + %a1f0 = extractvalue [2 x float] %a1, 0 + %a1f1 = extractvalue [2 x float] %a1, 1 + %max0 = tail call float @llvm.maximumnum.f32(float %a0f0, float %a1f0) + %max1 = tail call 
float @llvm.maximumnum.f32(float %a0f1, float %a1f1) + %ret0 = insertvalue [2 x float] poison, float %max0, 0 + %ret1 = insertvalue [2 x float] %ret0, float %max1, 1 + ret [2 x float] %ret1 +} + +define float @fQ(float nofpclass(qnan) %a, float nofpclass(qnan) %b) { +; MIPS32R6-LABEL: fQ: +; MIPS32R6: # %bb.0: # %entry +; MIPS32R6-NEXT: min.s $f0, $f14, $f14 +; MIPS32R6-NEXT: min.s $f1, $f12, $f12 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f0, $f1, $f0 +; +; MIPS64R6-LABEL: fQ: +; MIPS64R6: # %bb.0: # %entry +; MIPS64R6-NEXT: min.s $f0, $f13, $f13 +; MIPS64R6-NEXT: min.s $f1, $f12, $f12 +; MIPS64R6-NEXT: jr $ra +; MIPS64R6-NEXT: max.s $f0, $f1, $f0 +entry: + %cond = tail call float @llvm.maximumnum.f32(float %a, float %b) + ret float %cond +} + +define {float, float} @mQ({float, float} nofpclass(qnan) %a0, {float, float} nofpclass(qnan) %a1) { +; MIPS32R6-LABEL: mQ: +; MIPS32R6: # %bb.0: # %entry +; MIPS32R6-NEXT: min.s $f0, $f12, $f12 +; MIPS32R6-NEXT: mtc1 $6, $f1 +; MIPS32R6-NEXT: min.s $f1, $f1, $f1 +; MIPS32R6-NEXT: max.s $f0, $f0, $f1 +; MIPS32R6-NEXT: min.s $f1, $f14, $f14 +; MIPS32R6-NEXT: mtc1 $7, $f2 +; MIPS32R6-NEXT: min.s $f2, $f2, $f2 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f2, $f1, $f2 +; +; MIPS64R6-LABEL: mQ: +; MIPS64R6: # %bb.0: # %entry +; MIPS64R6-NEXT: min.s $f0, $f14, $f14 +; MIPS64R6-NEXT: min.s $f1, $f12, $f12 +; MIPS64R6-NEXT: max.s $f0, $f1, $f0 +; MIPS64R6-NEXT: min.s $f1, $f15, $f15 +; MIPS64R6-NEXT: min.s $f2, $f13, $f13 +; MIPS64R6-NEXT: jr $ra +; MIPS64R6-NEXT: max.s $f2, $f2, $f1 +entry: + %a0f0 = extractvalue {float, float} %a0, 0 + %a0f1 = extractvalue {float, float} %a0, 1 + %a1f0 = extractvalue {float, float} %a1, 0 + %a1f1 = extractvalue {float, float} %a1, 1 + %max0 = tail call float @llvm.maximumnum.f32(float %a0f0, float %a1f0) + %max1 = tail call float @llvm.maximumnum.f32(float %a0f1, float %a1f1) + %ret0 = insertvalue {float, float} poison, float %max0, 0 + %ret1 = insertvalue {float, float} %ret0, float 
%max1, 1 + ret {float, float} %ret1 +} + +define [2 x float] @mAQ([2 x float] nofpclass(qnan) %a0, [2 x float] nofpclass(qnan) %a1) { +; MIPS32R6-LABEL: mAQ: +; MIPS32R6: # %bb.0: # %entry +; MIPS32R6-NEXT: min.s $f0, $f12, $f12 +; MIPS32R6-NEXT: mtc1 $6, $f1 +; MIPS32R6-NEXT: min.s $f1, $f1, $f1 +; MIPS32R6-NEXT: max.s $f0, $f0, $f1 +; MIPS32R6-NEXT: min.s $f1, $f14, $f14 +; MIPS32R6-NEXT: mtc1 $7, $f2 +; MIPS32R6-NEXT: min.s $f2, $f2, $f2 +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f2, $f1, $f2 +; +; MIPS64R6-LABEL: mAQ: +; MIPS64R6: # %bb.0: # %entry +; MIPS64R6-NEXT: min.s $f0, $f14, $f14 +; MIPS64R6-NEXT: min.s $f1, $f12, $f12 +; MIPS64R6-NEXT: max.s $f0, $f1, $f0 +; MIPS64R6-NEXT: min.s $f1, $f15, $f15 +; MIPS64R6-NEXT: min.s $f2, $f13, $f13 +; MIPS64R6-NEXT: jr $ra +; MIPS64R6-NEXT: max.s $f2, $f2, $f1 +entry: + %a0f0 = extractvalue [2 x float] %a0, 0 + %a0f1 = extractvalue [2 x float] %a0, 1 + %a1f0 = extractvalue [2 x float] %a1, 0 + %a1f1 = extractvalue [2 x float] %a1, 1 + %max0 = tail call float @llvm.maximumnum.f32(float %a0f0, float %a1f0) + %max1 = tail call float @llvm.maximumnum.f32(float %a0f1, float %a1f1) + %ret0 = insertvalue [2 x float] poison, float %max0, 0 + %ret1 = insertvalue [2 x float] %ret0, float %max1, 1 + ret [2 x float] %ret1 +} diff --git a/llvm/test/CodeGen/Mips/private-global-prefix.ll b/llvm/test/CodeGen/Mips/private-global-prefix.ll new file mode 100644 index 0000000000000..e60b9c1d7eba5 --- /dev/null +++ b/llvm/test/CodeGen/Mips/private-global-prefix.ll @@ -0,0 +1,24 @@ +; RUN: llc -mtriple=mipsel-w64-windows-gnu < %s | FileCheck %s -check-prefix=MIPSEL + +define void @f() { +; MIPSEL-LABEL: f: +; MIPSEL: # %bb.0: # %entry +; MIPSEL-NEXT: addiu $sp, $sp, -24 +; MIPSEL-NEXT: sw $ra, 20($sp) +; MIPSEL-NEXT: jal LeaveFoo +; MIPSEL-NEXT: nop +; MIPSEL-NEXT: jal LocalBar +; MIPSEL-NEXT: nop +; MIPSEL-NEXT: lw $ra, 20($sp) +; MIPSEL-NEXT: jr $ra +; MIPSEL-NEXT: addiu $sp, $sp, 24 + +entry: + call void @LeaveFoo() + call void 
@LocalBar() + ret void +} + +declare void @LeaveFoo() +declare void @LocalBar() + diff --git a/llvm/test/CodeGen/Mips/qnan.ll b/llvm/test/CodeGen/Mips/qnan.ll new file mode 100644 index 0000000000000..e5b4aa1b42ee7 --- /dev/null +++ b/llvm/test/CodeGen/Mips/qnan.ll @@ -0,0 +1,14 @@ +; RUN: llc -O3 -mcpu=mips32r2 -mtriple=mips-linux-gnu < %s -o - | FileCheck %s -check-prefixes=MIPS_Legacy +; RUN: llc -O3 -mcpu=mips32r2 -mtriple=mips-linux-gnu -mattr=+nan2008 < %s -o - | FileCheck %s -check-prefixes=MIPS_NaN2008 + +define dso_local float @nan(float noundef %a, float noundef %b) local_unnamed_addr #0 { +; MIPS_Legacy: $CPI0_0: +; MIPS_Legacy-NEXT: .4byte 0x7fa00000 # float NaN + +; MIPS_NaN2008: $CPI0_0: +; MIPS_NaN2008-NEXT: .4byte 0x7fc00000 # float NaN + +entry: + %0 = tail call float @llvm.minimum.f32(float %a, float %b) + ret float %0 +} diff --git a/llvm/test/CodeGen/Mips/unreachable.ll b/llvm/test/CodeGen/Mips/unreachable.ll new file mode 100644 index 0000000000000..30087411f7b89 --- /dev/null +++ b/llvm/test/CodeGen/Mips/unreachable.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=mipsel-windows-gnu < %s | FileCheck %s -check-prefix=MIPSEL + +define void @unreachable() { +; MIPSEL-LABEL: unreachable: +; MIPSEL: # %bb.0: # %entry +; MIPSEL-NEXT: .insn +; MIPSEL-NEXT: nop + +entry: + unreachable +} + diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll index def2575deb042..9acbb7984638a 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll @@ -288,3 +288,85 @@ define <2 x bfloat> @cvt_bf16x2_ue8m0x2(i16 %in) { %val = call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 %in) ret <2 x bfloat> %val } + +define i16 @cvt_rn_sf_e2m1x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_sf_e2m1x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; 
CHECK-NEXT: .reg .b32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_sf_e2m1x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_sf_e2m1x2_f32_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b8 %e2m1x2_out; +; CHECK-NEXT: cvt.rn.satfinite.e2m1x2.f32 %e2m1x2_out, %f1, %f2; +; CHECK-NEXT: cvt.u16.u8 %rs1, %e2m1x2_out; +; CHECK-NEXT: } +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rn_relu_sf_e2m1x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_relu_sf_e2m1x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_sf_e2m1x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_relu_sf_e2m1x2_f32_param_1]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b8 %e2m1x2_out; +; CHECK-NEXT: cvt.rn.satfinite.relu.e2m1x2.f32 %e2m1x2_out, %f1, %f2; +; CHECK-NEXT: cvt.u16.u8 %rs1, %e2m1x2_out; +; CHECK-NEXT: } +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float %f1, float %f2) + ret i16 %val +} + +define <2 x half> @cvt_rn_f16x2_e2m1x2(i16 %in) { +; CHECK-LABEL: cvt_rn_f16x2_e2m1x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [cvt_rn_f16x2_e2m1x2_param_0]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b8 %e2m1x2_in; +; CHECK-NEXT: cvt.u8.u16 %e2m1x2_in, %rs1; +; CHECK-NEXT: cvt.rn.f16x2.e2m1x2 %r1, %e2m1x2_in; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 %in) + ret <2 x half> %val +} + +define <2 x half> 
@cvt_rn_relu_f16x2_e2m1x2(i16 %in) { +; CHECK-LABEL: cvt_rn_relu_f16x2_e2m1x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [cvt_rn_relu_f16x2_e2m1x2_param_0]; +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b8 %e2m1x2_in; +; CHECK-NEXT: cvt.u8.u16 %e2m1x2_in, %rs1; +; CHECK-NEXT: cvt.rn.relu.f16x2.e2m1x2 %r1, %e2m1x2_in; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 %in) + ret <2 x half> %val +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll new file mode 100644 index 0000000000000..1e6b04635edd5 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1), ptr addrspace(3), i32, i64, i1, i16) + +define void @cp_async_bulk_s2g_bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i16 %mask) { +; CHECK-PTX64-LABEL: cp_async_bulk_s2g_bytemask( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, 
[cp_async_bulk_s2g_bytemask_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_bytemask_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_bytemask_param_2]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_bytemask_param_3]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_s2g_bytemask_param_4]; +; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%rd2], %r1, %rd3, %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%rd2], %r1, %rs1; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_s2g_bytemask( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_bytemask_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_bytemask_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_s2g_bytemask_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_bytemask_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_s2g_bytemask_param_4]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%r1], %r2, %rd2, %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%r1], %r2, %rs1; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 1, i16 %mask) + tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 0, i16 %mask) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll 
index 77694ac82459a..d7f2a5df5547e 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll @@ -66,8 +66,8 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32 ; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_0]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_1]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_2]; -; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_param_3]; +; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1; ; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd3; ; CHECK-PTX64-NEXT: ret; ; @@ -80,11 +80,11 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32 ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_0]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_s2g_param_2]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_3]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; - tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 0, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 0) tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 1) ret void } diff --git a/llvm/test/CodeGen/NVPTX/shift-opt.ll 
b/llvm/test/CodeGen/NVPTX/shift-opt.ll new file mode 100644 index 0000000000000..65bcbb8e67156 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/shift-opt.ll @@ -0,0 +1,182 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s + +; Fold: srl (or (x, shl(zext(y),c1)),c1) -> or(srl(x,c1), zext(y)) +; c1 <= leadingzeros(zext(y)) +define i64 @test_or(i64 %x, i32 %y) { +; CHECK-LABEL: test_or( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_or_param_0]; +; CHECK-NEXT: ld.param.b32 %rd2, [test_or_param_1]; +; CHECK-NEXT: shr.u64 %rd3, %rd1, 5; +; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %ext = zext i32 %y to i64 + %shl = shl i64 %ext, 5 + %or = or i64 %x, %shl + %srl = lshr i64 %or, 5 + ret i64 %srl +} + +; Fold: srl (xor (x, shl(zext(y),c1)),c1) -> xor(srl(x,c1), zext(y)) +; c1 <= leadingzeros(zext(y)) +define i64 @test_xor(i64 %x, i32 %y) { +; CHECK-LABEL: test_xor( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_xor_param_0]; +; CHECK-NEXT: ld.param.b32 %rd2, [test_xor_param_1]; +; CHECK-NEXT: shr.u64 %rd3, %rd1, 5; +; CHECK-NEXT: xor.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %ext = zext i32 %y to i64 + %shl = shl i64 %ext, 5 + %or = xor i64 %x, %shl + %srl = lshr i64 %or, 5 + ret i64 %srl +} + +; Fold: srl (and (x, shl(zext(y),c1)),c1) -> and(srl(x,c1), zext(y)) +; c1 <= leadingzeros(zext(y)) +define i64 @test_and(i64 %x, i32 %y) { +; CHECK-LABEL: test_and( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_and_param_0]; +; CHECK-NEXT: ld.param.b32 %rd2, [test_and_param_1]; +; CHECK-NEXT: shr.u64 %rd3, %rd1, 5; +; CHECK-NEXT: 
and.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %ext = zext i32 %y to i64 + %shl = shl i64 %ext, 5 + %or = and i64 %x, %shl + %srl = lshr i64 %or, 5 + ret i64 %srl +} + +; Fold: srl (or (x, shl(zext(y),c1)),c1) -> or(srl(x,c1), zext(y)) +; c1 <= leadingzeros(zext(y)) +; x, y - vectors +define <2 x i16> @test_vec(<2 x i16> %x, <2 x i8> %y) { +; CHECK-LABEL: test_vec( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_vec_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_vec_param_1]; +; CHECK-NEXT: and.b32 %r3, %r2, 16711935; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: shr.u16 %rs3, %rs2, 5; +; CHECK-NEXT: shr.u16 %rs4, %rs1, 5; +; CHECK-NEXT: mov.b32 %r4, {%rs4, %rs3}; +; CHECK-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; + %ext = zext <2 x i8> %y to <2 x i16> + %shl = shl <2 x i16> %ext, splat(i16 5) + %or = or <2 x i16> %x, %shl + %srl = lshr <2 x i16> %or, splat(i16 5) + ret <2 x i16> %srl +} + +; Do not fold: srl (or (x, shl(zext(y),c1)),c1) -> or(srl(x,c1), zext(y)) +; Reason: c1 > leadingzeros(zext(y)). 
+define i64 @test_negative_c(i64 %x, i32 %y) { +; CHECK-LABEL: test_negative_c( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_negative_c_param_0]; +; CHECK-NEXT: ld.param.b32 %rd2, [test_negative_c_param_1]; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 33; +; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3; +; CHECK-NEXT: shr.u64 %rd5, %rd4, 33; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd5; +; CHECK-NEXT: ret; + %ext = zext i32 %y to i64 + %shl = shl i64 %ext, 33 + %or = or i64 %x, %shl + %srl = lshr i64 %or, 33 + ret i64 %srl +} + +declare void @use(i64) + +; Do not fold: srl (or (x, shl(zext(y),c1)),c1) -> or(srl(x,c1), zext(y)) +; Reason: multiple usage of "or" +define i64 @test_negative_use_lop(i64 %x, i32 %y) { +; CHECK-LABEL: test_negative_use_lop( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_negative_use_lop_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_negative_use_lop_param_1]; +; CHECK-NEXT: mul.wide.u32 %rd2, %r1, 32; +; CHECK-NEXT: or.b64 %rd3, %rd1, %rd2; +; CHECK-NEXT: shr.u64 %rd4, %rd3, 5; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: st.param.b64 [param0], %rd3; +; CHECK-NEXT: call.uni +; CHECK-NEXT: use, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %ext = zext i32 %y to i64 + %shl = shl i64 %ext, 5 + %or = or i64 %x, %shl + %srl = lshr i64 %or, 5 + call void @use(i64 %or) + ret i64 %srl +} + +; Do not fold: srl (or (x, shl(zext(y),c1)),c1) -> or(srl(x,c1), zext(y)) +; Reason: multiple usage of "shl" +define i64 @test_negative_use_shl(i64 %x, i32 %y) { +; CHECK-LABEL: test_negative_use_shl( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.b64 %rd1, [test_negative_use_shl_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_negative_use_shl_param_1]; +; CHECK-NEXT: mul.wide.u32 %rd2, %r1, 32; +; CHECK-NEXT: or.b64 %rd3, %rd1, %rd2; +; CHECK-NEXT: shr.u64 %rd4, %rd3, 5; +; CHECK-NEXT: { // callseq 1, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: st.param.b64 [param0], %rd2; +; CHECK-NEXT: call.uni +; CHECK-NEXT: use, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-NEXT: } // callseq 1 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %ext = zext i32 %y to i64 + %shl = shl i64 %ext, 5 + %or = or i64 %x, %shl + %srl = lshr i64 %or, 5 + call void @use(i64 %shl) + ret i64 %srl +} diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 7ee912a2006fd..ab73a85bfd7b1 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -9,6 +9,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+zba,+zbb,+zbs %s -o - | FileCheck --check-prefixes=CHECK,RV32COMBINEINTOB %s ; RUN: llc -mtriple=riscv32 -mattr=+f %s -o - | FileCheck --check-prefixes=CHECK,RV32F %s ; RUN: llc -mtriple=riscv32 -mattr=+d %s -o - | FileCheck --check-prefixes=CHECK,RV32D %s +; RUN: llc -mtriple=riscv32 -mattr=+q %s -o - | FileCheck --check-prefixes=CHECK,RV32Q %s ; RUN: llc -mtriple=riscv32 -mattr=+c %s -o - | FileCheck --check-prefixes=CHECK,RV32C %s ; RUN: llc -mtriple=riscv32 -mattr=+c,+f %s -o - | FileCheck --check-prefixes=CHECK,RV32CF %s ; RUN: llc -mtriple=riscv32 -mattr=+c,+d %s -o - | FileCheck --check-prefixes=CHECK,RV32CD %s @@ -104,6 +105,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisls %s -o - | FileCheck --check-prefix=RV32XQCISLS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisync %s -o - | FileCheck --check-prefix=RV32XQCISYNC %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesperf %s -o - | FileCheck --check-prefix=RV32XANDESPERF %s +; RUN: llc -mtriple=riscv32 -mattr=+xandesvdot %s -o - | 
FileCheck --check-prefix=RV32XANDESVDOT %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesvpackfph %s -o - | FileCheck --check-prefix=RV32XANDESVPACKFPH %s ; RUN: llc -mtriple=riscv32 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s ; RUN: llc -mtriple=riscv32 -mattr=+zalrsc %s -o - | FileCheck --check-prefix=RV32ZALRSC %s @@ -181,6 +183,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+zba,+zbb,+zbs %s -o - | FileCheck --check-prefixes=CHECK,RV64COMBINEINTOB %s ; RUN: llc -mtriple=riscv64 -mattr=+f %s -o - | FileCheck --check-prefixes=CHECK,RV64F %s ; RUN: llc -mtriple=riscv64 -mattr=+d %s -o - | FileCheck --check-prefixes=CHECK,RV64D %s +; RUN: llc -mtriple=riscv64 -mattr=+q %s -o - | FileCheck --check-prefixes=CHECK,RV64Q %s ; RUN: llc -mtriple=riscv64 -mattr=+c %s -o - | FileCheck --check-prefixes=CHECK,RV64C %s ; RUN: llc -mtriple=riscv64 -mattr=+c,+f %s -o - | FileCheck --check-prefixes=CHECK,RV64CF %s ; RUN: llc -mtriple=riscv64 -mattr=+c,+d %s -o - | FileCheck --check-prefixes=CHECK,RV64CD %s @@ -255,6 +258,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+xtheadsync %s -o - | FileCheck --check-prefix=RV64XTHEADSYNC %s ; RUN: llc -mtriple=riscv64 -mattr=+xtheadvdot %s -o - | FileCheck --check-prefixes=CHECK,RV64XTHEADVDOT %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesperf %s -o - | FileCheck --check-prefix=RV64XANDESPERF %s +; RUN: llc -mtriple=riscv64 -mattr=+xandesvdot %s -o - | FileCheck --check-prefix=RV64XANDESVDOT %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesvpackfph %s -o - | FileCheck --check-prefix=RV64XANDESVPACKFPH %s ; RUN: llc -mtriple=riscv64 -mattr=+za64rs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZA64RS %s ; RUN: llc -mtriple=riscv64 -mattr=+za128rs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZA128RS %s @@ -354,6 +358,7 @@ ; RV32COMBINEINTOB: .attribute 5, "rv32i2p1_b1p0_zba1p0_zbb1p0_zbs1p0" ; RV32F: .attribute 5, "rv32i2p1_f2p2_zicsr2p0" ; RV32D: .attribute 5, "rv32i2p1_f2p2_d2p2_zicsr2p0" +; RV32Q: .attribute 5, 
"rv32i2p1_f2p2_d2p2_q2p2_zicsr2p0" ; RV32C: .attribute 5, "rv32i2p1_c2p0_zca1p0" ; RV32CF: .attribute 5, "rv32i2p1_f2p2_c2p0_zicsr2p0_zca1p0_zcf1p0" ; RV32CD: .attribute 5, "rv32i2p1_f2p2_d2p2_c2p0_zicsr2p0_zca1p0_zcd1p0_zcf1p0" @@ -449,6 +454,7 @@ ; RV32XQCISLS: .attribute 5, "rv32i2p1_xqcisls0p2" ; RV32XQCISYNC: attribute 5, "rv32i2p1_zca1p0_xqcisync0p3" ; RV32XANDESPERF: .attribute 5, "rv32i2p1_xandesperf5p0" +; RV32XANDESVDOT: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvdot5p0" ; RV32XANDESVPACKFPH: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfhmin1p0_zvl32b1p0_xandesvpackfph5p0" ; RV32ZAAMO: .attribute 5, "rv32i2p1_zaamo1p0" ; RV32ZALRSC: .attribute 5, "rv32i2p1_zalrsc1p0" @@ -523,6 +529,7 @@ ; RV64COMBINEINTOB: .attribute 5, "rv64i2p1_b1p0_zba1p0_zbb1p0_zbs1p0" ; RV64F: .attribute 5, "rv64i2p1_f2p2_zicsr2p0" ; RV64D: .attribute 5, "rv64i2p1_f2p2_d2p2_zicsr2p0" +; RV64Q: .attribute 5, "rv64i2p1_f2p2_d2p2_q2p2_zicsr2p0" ; RV64C: .attribute 5, "rv64i2p1_c2p0_zca1p0" ; RV64CF: .attribute 5, "rv64i2p1_f2p2_c2p0_zicsr2p0_zca1p0" ; RV64CD: .attribute 5, "rv64i2p1_f2p2_d2p2_c2p0_zicsr2p0_zca1p0_zcd1p0" @@ -601,6 +608,7 @@ ; RV64XTHEADSYNC: .attribute 5, "rv64i2p1_xtheadsync1p0" ; RV64XTHEADVDOT: .attribute 5, "rv64i2p1_f2p2_d2p2_v1p0_zicsr2p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xtheadvdot1p0" ; RV64XANDESPERF: .attribute 5, "rv64i2p1_xandesperf5p0" +; RV64XANDESVDOT: .attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvdot5p0" ; RV64XANDESVPACKFPH: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfhmin1p0_zvl32b1p0_xandesvpackfph5p0" ; RV64ZTSO: .attribute 5, "rv64i2p1_ztso1p0" ; RV64ZAAMO: .attribute 5, "rv64i2p1_zaamo1p0" diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll index 3a7d31253b05d..8b9d602dcde83 100644 --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -262,20 
+262,33 @@ define i32 @test_cttz_i32(i32 %a) nounwind { ; RV64I-NEXT: sext.w a1, a0 ; RV64I-NEXT: beqz a1, .LBB2_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: negw a1, a0 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 6 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a3, a0, 10 +; RV64I-NEXT: slli a4, a0, 12 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 18 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 4 +; RV64I-NEXT: subw a4, a0, a4 +; RV64I-NEXT: add a1, a4, a1 +; RV64I-NEXT: slli a4, a0, 14 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 23 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a0, a0, 27 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srliw a0, a0, 27 ; RV64I-NEXT: lui a1, %hi(.LCPI2_0) ; RV64I-NEXT: addi a1, a1, %lo(.LCPI2_0) ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB2_2: ; RV64I-NEXT: li a0, 32 @@ -730,20 +743,33 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { ; ; RV64I-LABEL: test_cttz_i32_zero_undef: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: negw a1, a0 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 6 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a3, a0, 10 +; RV64I-NEXT: slli a4, a0, 12 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 18 +; RV64I-NEXT: subw a2, a2, 
a4 +; RV64I-NEXT: slli a4, a0, 4 +; RV64I-NEXT: subw a4, a0, a4 +; RV64I-NEXT: add a1, a4, a1 +; RV64I-NEXT: slli a4, a0, 14 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 23 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a0, a0, 27 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srliw a0, a0, 27 ; RV64I-NEXT: lui a1, %hi(.LCPI6_0) ; RV64I-NEXT: addi a1, a1, %lo(.LCPI6_0) ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV32M-LABEL: test_cttz_i32_zero_undef: diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll index 03a6a6b1c4b7d..33907e10730a7 100644 --- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll +++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll @@ -162,27 +162,38 @@ define i64 @ctz_dereferencing_pointer_zext(ptr %b) nounwind { ; ; RV64I-LABEL: ctz_dereferencing_pointer_zext: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw s0, 0(a0) -; RV64I-NEXT: neg a0, s0 -; RV64I-NEXT: and a0, s0, a0 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 27 -; RV64I-NEXT: lui a1, %hi(.LCPI1_0) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI1_0) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: seqz a1, s0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a1, 6 +; RV64I-NEXT: slli a3, a1, 8 +; RV64I-NEXT: slli a4, a1, 10 +; RV64I-NEXT: slli a5, a1, 12 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a1, 16 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 18 +; RV64I-NEXT: subw a3, a3, a5 +; 
RV64I-NEXT: slli a5, a1, 4 +; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: add a2, a5, a2 +; RV64I-NEXT: slli a5, a1, 14 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 23 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a1, a1, 27 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: lui a2, %hi(.LCPI1_0) +; RV64I-NEXT: addi a2, a2, %lo(.LCPI1_0) +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: and a0, a1, a0 ; RV64I-NEXT: andi a0, a0, 31 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -237,27 +248,37 @@ define signext i32 @ctz1(i32 signext %x) nounwind { ; ; RV64I-LABEL: ctz1: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: neg a0, a0 -; RV64I-NEXT: and a0, s0, a0 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 27 -; RV64I-NEXT: lui a1, %hi(.LCPI2_0) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI2_0) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: seqz a1, s0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a1, 6 +; RV64I-NEXT: slli a3, a1, 8 +; RV64I-NEXT: slli a4, a1, 10 +; RV64I-NEXT: slli a5, a1, 12 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a1, 16 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 18 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a5, a1, 4 +; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: add a2, a5, a2 +; RV64I-NEXT: slli a5, a1, 14 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 23 +; RV64I-NEXT: subw a3, 
a3, a5 +; RV64I-NEXT: slli a1, a1, 27 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: lui a2, %hi(.LCPI2_0) +; RV64I-NEXT: addi a2, a2, %lo(.LCPI2_0) +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: and a0, a1, a0 ; RV64I-NEXT: andi a0, a0, 31 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -310,27 +331,37 @@ define signext i32 @ctz1_flipped(i32 signext %x) nounwind { ; ; RV64I-LABEL: ctz1_flipped: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: neg a0, a0 -; RV64I-NEXT: and a0, s0, a0 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 27 -; RV64I-NEXT: lui a1, %hi(.LCPI3_0) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI3_0) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: seqz a1, s0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a1, 6 +; RV64I-NEXT: slli a3, a1, 8 +; RV64I-NEXT: slli a4, a1, 10 +; RV64I-NEXT: slli a5, a1, 12 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a1, 16 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 18 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a5, a1, 4 +; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: add a2, a5, a2 +; RV64I-NEXT: slli a5, a1, 14 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 23 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a1, a1, 27 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: lui a2, 
%hi(.LCPI3_0) +; RV64I-NEXT: addi a2, a2, %lo(.LCPI3_0) +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: and a0, a1, a0 ; RV64I-NEXT: andi a0, a0, 31 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -381,20 +412,33 @@ define signext i32 @ctz2(i32 signext %x) nounwind { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: beqz a0, .LBB4_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: negw a1, a0 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 6 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a3, a0, 10 +; RV64I-NEXT: slli a4, a0, 12 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 18 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 4 +; RV64I-NEXT: subw a4, a0, a4 +; RV64I-NEXT: add a1, a4, a1 +; RV64I-NEXT: slli a4, a0, 14 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 23 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a0, a0, 27 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srliw a0, a0, 27 ; RV64I-NEXT: lui a1, %hi(.LCPI4_0) ; RV64I-NEXT: addi a1, a1, %lo(.LCPI4_0) ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB4_2: ; RV64I-NEXT: li a0, 32 @@ -446,20 +490,33 @@ define signext i32 @ctz3(i32 signext %x) nounwind { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: beqz a0, .LBB5_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: 
neg a1, a0 +; RV64I-NEXT: negw a1, a0 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 6 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a3, a0, 10 +; RV64I-NEXT: slli a4, a0, 12 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 18 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 4 +; RV64I-NEXT: subw a4, a0, a4 +; RV64I-NEXT: add a1, a4, a1 +; RV64I-NEXT: slli a4, a0, 14 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 23 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a0, a0, 27 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srliw a0, a0, 27 ; RV64I-NEXT: lui a1, %hi(.LCPI5_0) ; RV64I-NEXT: addi a1, a1, %lo(.LCPI5_0) ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB5_2: ; RV64I-NEXT: li a0, 32 @@ -767,27 +824,37 @@ define signext i32 @ctz5(i32 signext %x) nounwind { ; ; RV64I-LABEL: ctz5: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: neg a0, a0 -; RV64I-NEXT: and a0, s0, a0 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 27 -; RV64I-NEXT: lui a1, %hi(.LCPI8_0) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI8_0) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: seqz a1, s0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a1, 6 +; RV64I-NEXT: slli a3, a1, 8 +; RV64I-NEXT: slli a4, a1, 10 +; RV64I-NEXT: slli a5, a1, 12 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli 
a3, a1, 16 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 18 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a5, a1, 4 +; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: add a2, a5, a2 +; RV64I-NEXT: slli a5, a1, 14 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 23 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a1, a1, 27 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: lui a2, %hi(.LCPI8_0) +; RV64I-NEXT: addi a2, a2, %lo(.LCPI8_0) +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: and a0, a1, a0 ; RV64I-NEXT: andi a0, a0, 31 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -840,27 +907,37 @@ define signext i32 @ctz6(i32 signext %x) nounwind { ; ; RV64I-LABEL: ctz6: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: neg a0, a0 -; RV64I-NEXT: and a0, s0, a0 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 27 -; RV64I-NEXT: lui a1, %hi(.LCPI9_0) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI9_0) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: seqz a1, s0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a1, 6 +; RV64I-NEXT: slli a3, a1, 8 +; RV64I-NEXT: slli a4, a1, 10 +; RV64I-NEXT: slli a5, a1, 12 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a1, 16 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 18 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a5, a1, 4 +; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: add a2, a5, a2 +; 
RV64I-NEXT: slli a5, a1, 14 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 23 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a1, a1, 27 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: lui a2, %hi(.LCPI9_0) +; RV64I-NEXT: addi a2, a2, %lo(.LCPI9_0) +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: and a0, a1, a0 ; RV64I-NEXT: andi a0, a0, 31 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret @@ -918,28 +995,39 @@ define signext i32 @globalVar() nounwind { ; ; RV64I-LABEL: globalVar: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: lui a0, %hi(global_x) -; RV64I-NEXT: lw s0, %lo(global_x)(a0) -; RV64I-NEXT: neg a0, s0 -; RV64I-NEXT: and a0, s0, a0 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 27 -; RV64I-NEXT: lui a1, %hi(.LCPI10_0) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI10_0) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: seqz a1, s0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: lw a0, %lo(global_x)(a0) +; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a1, 6 +; RV64I-NEXT: slli a3, a1, 8 +; RV64I-NEXT: slli a4, a1, 10 +; RV64I-NEXT: slli a5, a1, 12 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a1, 16 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 18 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a5, a1, 4 +; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: add a2, a5, a2 +; RV64I-NEXT: slli a5, a1, 14 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 23 +; RV64I-NEXT: subw a3, 
a3, a5 +; RV64I-NEXT: slli a1, a1, 27 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: lui a2, %hi(.LCPI10_0) +; RV64I-NEXT: addi a2, a2, %lo(.LCPI10_0) +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: and a0, a1, a0 ; RV64I-NEXT: andi a0, a0, 31 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/double-calling-conv.ll b/llvm/test/CodeGen/RISCV/double-calling-conv.ll index 798eac64e9fc2..1a01fceca75a5 100644 --- a/llvm/test/CodeGen/RISCV/double-calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/double-calling-conv.ll @@ -165,10 +165,10 @@ define double @callee_double_stack(i64 %a, i64 %b, i64 %c, i64 %d, double %e, do ; ; RV32IZFINXZDINX-LABEL: callee_double_stack: ; RV32IZFINXZDINX: # %bb.0: -; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: lw a2, 0(sp) ; RV32IZFINXZDINX-NEXT: lw a3, 4(sp) +; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) +; RV32IZFINXZDINX-NEXT: lw a2, 0(sp) ; RV32IZFINXZDINX-NEXT: fadd.d a0, a2, a0 ; RV32IZFINXZDINX-NEXT: ret %1 = fadd double %e, %f diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll index 03ab83ece8ce7..0716650374d0d 100644 --- a/llvm/test/CodeGen/RISCV/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/double-convert.ll @@ -734,38 +734,42 @@ define i64 @fcvt_l_d_sat(double %a) nounwind { ; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s2, 0(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI12_0) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI12_0)(a2) +; RV32IZFINXZDINX-NEXT: addi 
a2, a2, %lo(.LCPI12_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) ; RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: mv s0, a0 +; RV32IZFINXZDINX-NEXT: fle.d s2, a4, s0 ; RV32IZFINXZDINX-NEXT: call __fixdfdi -; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI12_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI12_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI12_0)(a2) -; RV32IZFINXZDINX-NEXT: fle.d a3, a2, s0 -; RV32IZFINXZDINX-NEXT: lui a4, 524288 +; RV32IZFINXZDINX-NEXT: lui a3, 524288 ; RV32IZFINXZDINX-NEXT: lui a2, 524288 -; RV32IZFINXZDINX-NEXT: beqz a3, .LBB12_2 +; RV32IZFINXZDINX-NEXT: beqz s2, .LBB12_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: # %start ; RV32IZFINXZDINX-NEXT: mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB12_2: # %start ; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI12_1) -; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI12_1)(a1) -; RV32IZFINXZDINX-NEXT: lw a7, %lo(.LCPI12_1+4)(a1) -; RV32IZFINXZDINX-NEXT: flt.d a1, a6, s0 +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI12_1)(a1) +; RV32IZFINXZDINX-NEXT: addi a1, a1, %lo(.LCPI12_1) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a1) +; RV32IZFINXZDINX-NEXT: flt.d a1, a4, s0 ; RV32IZFINXZDINX-NEXT: beqz a1, .LBB12_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 +; RV32IZFINXZDINX-NEXT: addi a2, a3, -1 ; RV32IZFINXZDINX-NEXT: .LBB12_4: # %start -; RV32IZFINXZDINX-NEXT: feq.d a4, s0, s0 +; RV32IZFINXZDINX-NEXT: feq.d a3, s0, s0 +; RV32IZFINXZDINX-NEXT: neg a4, a1 +; RV32IZFINXZDINX-NEXT: neg a1, s2 ; RV32IZFINXZDINX-NEXT: neg a3, a3 -; RV32IZFINXZDINX-NEXT: neg a5, a1 -; RV32IZFINXZDINX-NEXT: neg a4, a4 +; RV32IZFINXZDINX-NEXT: and a0, a1, a0 +; RV32IZFINXZDINX-NEXT: and a1, a3, a2 +; RV32IZFINXZDINX-NEXT: or a0, a4, a0 ; RV32IZFINXZDINX-NEXT: and a0, a3, a0 -; RV32IZFINXZDINX-NEXT: and a1, a4, a2 -; RV32IZFINXZDINX-NEXT: or a0, a5, a0 -; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 
4(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; @@ -986,12 +990,13 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind { ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero ; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI14_0) ; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI14_0+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI14_0)(a4) +; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI14_0)(a4) +; RV32IZFINXZDINX-NEXT: addi a3, a4, %lo(.LCPI14_0) +; RV32IZFINXZDINX-NEXT: lw a7, 4(a3) ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 +; RV32IZFINXZDINX-NEXT: flt.d a2, a6, s0 ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: or a0, a2, a0 ; RV32IZFINXZDINX-NEXT: or a1, a2, a1 @@ -1653,17 +1658,19 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind { ; RV32IZFINXZDINX-LABEL: fcvt_w_s_sat_i16: ; RV32IZFINXZDINX: # %bb.0: # %start ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI26_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI26_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI26_0)(a2) -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI26_1) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI26_1+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI26_1)(a4) -; RV32IZFINXZDINX-NEXT: fmax.d a2, a0, a2 -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: neg a0, a0 -; RV32IZFINXZDINX-NEXT: fmin.d a2, a2, a4 -; RV32IZFINXZDINX-NEXT: fcvt.w.d a1, a2, rtz -; RV32IZFINXZDINX-NEXT: and a0, a0, a1 +; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI26_1) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI26_0)(a2) +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI26_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) +; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI26_1)(a3) +; RV32IZFINXZDINX-NEXT: addi a3, a3, %lo(.LCPI26_1) +; RV32IZFINXZDINX-NEXT: lw a3, 4(a3) +; RV32IZFINXZDINX-NEXT: feq.d a6, 
a0, a0 +; RV32IZFINXZDINX-NEXT: fmax.d a0, a0, a4 +; RV32IZFINXZDINX-NEXT: fmin.d a0, a0, a2 +; RV32IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rtz +; RV32IZFINXZDINX-NEXT: neg a1, a6 +; RV32IZFINXZDINX-NEXT: and a0, a1, a0 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i16: @@ -1850,11 +1857,12 @@ define zeroext i16 @fcvt_wu_s_sat_i16(double %a) nounwind { ; RV32IZFINXZDINX-LABEL: fcvt_wu_s_sat_i16: ; RV32IZFINXZDINX: # %bb.0: # %start ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI28_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI28_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI28_0)(a2) -; RV32IZFINXZDINX-NEXT: fcvt.d.w a4, zero -; RV32IZFINXZDINX-NEXT: fmax.d a0, a0, a4 -; RV32IZFINXZDINX-NEXT: fmin.d a0, a0, a2 +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI28_0)(a2) +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI28_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) +; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero +; RV32IZFINXZDINX-NEXT: fmax.d a0, a0, a2 +; RV32IZFINXZDINX-NEXT: fmin.d a0, a0, a4 ; RV32IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rtz ; RV32IZFINXZDINX-NEXT: ret ; @@ -2028,17 +2036,19 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind { ; RV32IZFINXZDINX-LABEL: fcvt_w_s_sat_i8: ; RV32IZFINXZDINX: # %bb.0: # %start ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI30_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI30_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI30_0)(a2) -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI30_1) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI30_1+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI30_1)(a4) -; RV32IZFINXZDINX-NEXT: fmax.d a2, a0, a2 -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: neg a0, a0 -; RV32IZFINXZDINX-NEXT: fmin.d a2, a2, a4 -; RV32IZFINXZDINX-NEXT: fcvt.w.d a1, a2, rtz -; RV32IZFINXZDINX-NEXT: and a0, a0, a1 +; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI30_1) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI30_0)(a2) +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI30_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) +; 
RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI30_1)(a3) +; RV32IZFINXZDINX-NEXT: addi a3, a3, %lo(.LCPI30_1) +; RV32IZFINXZDINX-NEXT: lw a3, 4(a3) +; RV32IZFINXZDINX-NEXT: feq.d a6, a0, a0 +; RV32IZFINXZDINX-NEXT: fmax.d a0, a0, a4 +; RV32IZFINXZDINX-NEXT: fmin.d a0, a0, a2 +; RV32IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rtz +; RV32IZFINXZDINX-NEXT: neg a1, a6 +; RV32IZFINXZDINX-NEXT: and a0, a1, a0 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i8: @@ -2224,11 +2234,12 @@ define zeroext i8 @fcvt_wu_s_sat_i8(double %a) nounwind { ; RV32IZFINXZDINX-LABEL: fcvt_wu_s_sat_i8: ; RV32IZFINXZDINX: # %bb.0: # %start ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI32_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI32_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI32_0)(a2) -; RV32IZFINXZDINX-NEXT: fcvt.d.w a4, zero -; RV32IZFINXZDINX-NEXT: fmax.d a0, a0, a4 -; RV32IZFINXZDINX-NEXT: fmin.d a0, a0, a2 +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI32_0)(a2) +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI32_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) +; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero +; RV32IZFINXZDINX-NEXT: fmax.d a0, a0, a2 +; RV32IZFINXZDINX-NEXT: fmin.d a0, a0, a4 ; RV32IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rtz ; RV32IZFINXZDINX-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/double-imm.ll b/llvm/test/CodeGen/RISCV/double-imm.ll index 5350a28005738..d134f8ff16149 100644 --- a/llvm/test/CodeGen/RISCV/double-imm.ll +++ b/llvm/test/CodeGen/RISCV/double-imm.ll @@ -55,9 +55,10 @@ define double @double_imm_op(double %a) nounwind { ; CHECKRV32ZDINX-LABEL: double_imm_op: ; CHECKRV32ZDINX: # %bb.0: ; CHECKRV32ZDINX-NEXT: lui a2, %hi(.LCPI1_0) -; CHECKRV32ZDINX-NEXT: lw a3, %lo(.LCPI1_0+4)(a2) -; CHECKRV32ZDINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) -; CHECKRV32ZDINX-NEXT: fadd.d a0, a0, a2 +; CHECKRV32ZDINX-NEXT: lw a4, %lo(.LCPI1_0)(a2) +; CHECKRV32ZDINX-NEXT: addi a2, a2, %lo(.LCPI1_0) +; CHECKRV32ZDINX-NEXT: lw a5, 4(a2) +; CHECKRV32ZDINX-NEXT: fadd.d a0, a0, a4 ; CHECKRV32ZDINX-NEXT: ret 
; ; CHECKRV64ZDINX-LABEL: double_imm_op: diff --git a/llvm/test/CodeGen/RISCV/double-mem.ll b/llvm/test/CodeGen/RISCV/double-mem.ll index dba9489e7511d..0d02ce127c325 100644 --- a/llvm/test/CodeGen/RISCV/double-mem.ll +++ b/llvm/test/CodeGen/RISCV/double-mem.ll @@ -7,6 +7,8 @@ ; RUN: -target-abi=ilp32 | FileCheck -check-prefixes=RV32IZFINXZDINX %s ; RUN: llc -mtriple=riscv64 -mattr=+zdinx -verify-machineinstrs < %s \ ; RUN: -target-abi=lp64 | FileCheck -check-prefixes=RV64IZFINXZDINX %s +; RUN: llc -mtriple=riscv32 -mattr=+zdinx,+zilsd -verify-machineinstrs < %s \ +; RUN: -target-abi=ilp32 | FileCheck -check-prefixes=RV32IZFINXZDINXZILSD %s define dso_local double @fld(ptr %a) nounwind { ; CHECKIFD-LABEL: fld: @@ -18,9 +20,9 @@ define dso_local double @fld(ptr %a) nounwind { ; ; RV32IZFINXZDINX-LABEL: fld: ; RV32IZFINXZDINX: # %bb.0: -; RV32IZFINXZDINX-NEXT: lw a2, 0(a0) ; RV32IZFINXZDINX-NEXT: lw a3, 4(a0) ; RV32IZFINXZDINX-NEXT: lw a1, 28(a0) +; RV32IZFINXZDINX-NEXT: lw a2, 0(a0) ; RV32IZFINXZDINX-NEXT: lw a0, 24(a0) ; RV32IZFINXZDINX-NEXT: fadd.d a0, a2, a0 ; RV32IZFINXZDINX-NEXT: ret @@ -31,6 +33,13 @@ define dso_local double @fld(ptr %a) nounwind { ; RV64IZFINXZDINX-NEXT: ld a0, 24(a0) ; RV64IZFINXZDINX-NEXT: fadd.d a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret +; +; RV32IZFINXZDINXZILSD-LABEL: fld: +; RV32IZFINXZDINXZILSD: # %bb.0: +; RV32IZFINXZDINXZILSD-NEXT: ld a2, 0(a0) +; RV32IZFINXZDINXZILSD-NEXT: ld a0, 24(a0) +; RV32IZFINXZDINXZILSD-NEXT: fadd.d a0, a2, a0 +; RV32IZFINXZDINXZILSD-NEXT: ret %1 = load double, ptr %a %2 = getelementptr double, ptr %a, i32 3 %3 = load double, ptr %2 @@ -67,6 +76,17 @@ define dso_local void @fsd(ptr %a, double %b, double %c) nounwind { ; RV64IZFINXZDINX-NEXT: sd a1, 0(a0) ; RV64IZFINXZDINX-NEXT: sd a1, 64(a0) ; RV64IZFINXZDINX-NEXT: ret +; +; RV32IZFINXZDINXZILSD-LABEL: fsd: +; RV32IZFINXZDINXZILSD: # %bb.0: +; RV32IZFINXZDINXZILSD-NEXT: mv a5, a4 +; RV32IZFINXZDINXZILSD-NEXT: mv a7, a2 +; RV32IZFINXZDINXZILSD-NEXT: mv a4, a3 +; 
RV32IZFINXZDINXZILSD-NEXT: mv a6, a1 +; RV32IZFINXZDINXZILSD-NEXT: fadd.d a2, a6, a4 +; RV32IZFINXZDINXZILSD-NEXT: sd a2, 0(a0) +; RV32IZFINXZDINXZILSD-NEXT: sd a2, 64(a0) +; RV32IZFINXZDINXZILSD-NEXT: ret ; Use %b and %c in an FP op to ensure floating point registers are used, even ; for the soft float ABI %1 = fadd double %b, %c @@ -95,13 +115,13 @@ define dso_local double @fld_fsd_global(double %a, double %b) nounwind { ; RV32IZFINXZDINX: # %bb.0: ; RV32IZFINXZDINX-NEXT: lui a4, %hi(G) ; RV32IZFINXZDINX-NEXT: fadd.d a0, a0, a2 -; RV32IZFINXZDINX-NEXT: lw a2, %lo(G)(a4) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(G+4)(a4) +; RV32IZFINXZDINX-NEXT: lw zero, %lo(G)(a4) +; RV32IZFINXZDINX-NEXT: lw zero, %lo(G+4)(a4) ; RV32IZFINXZDINX-NEXT: addi a2, a4, %lo(G) ; RV32IZFINXZDINX-NEXT: sw a0, %lo(G)(a4) ; RV32IZFINXZDINX-NEXT: sw a1, %lo(G+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, 72(a2) -; RV32IZFINXZDINX-NEXT: lw a5, 76(a2) +; RV32IZFINXZDINX-NEXT: lw zero, 72(a2) +; RV32IZFINXZDINX-NEXT: lw zero, 76(a2) ; RV32IZFINXZDINX-NEXT: sw a0, 72(a2) ; RV32IZFINXZDINX-NEXT: sw a1, 76(a2) ; RV32IZFINXZDINX-NEXT: ret @@ -116,6 +136,17 @@ define dso_local double @fld_fsd_global(double %a, double %b) nounwind { ; RV64IZFINXZDINX-NEXT: ld zero, 72(a2) ; RV64IZFINXZDINX-NEXT: sd a0, 72(a2) ; RV64IZFINXZDINX-NEXT: ret +; +; RV32IZFINXZDINXZILSD-LABEL: fld_fsd_global: +; RV32IZFINXZDINXZILSD: # %bb.0: +; RV32IZFINXZDINXZILSD-NEXT: lui a4, %hi(G) +; RV32IZFINXZDINXZILSD-NEXT: fadd.d a0, a0, a2 +; RV32IZFINXZDINXZILSD-NEXT: ld a2, %lo(G)(a4) +; RV32IZFINXZDINXZILSD-NEXT: addi a2, a4, %lo(G) +; RV32IZFINXZDINXZILSD-NEXT: sd a0, %lo(G)(a4) +; RV32IZFINXZDINXZILSD-NEXT: ld a4, 72(a2) +; RV32IZFINXZDINXZILSD-NEXT: sd a0, 72(a2) +; RV32IZFINXZDINXZILSD-NEXT: ret ; Use %a and %b in an FP op to ensure floating point registers are used, even ; for the soft float ABI %1 = fadd double %a, %b @@ -164,6 +195,14 @@ define dso_local double @fld_fsd_constant(double %a) nounwind { ; RV64IZFINXZDINX-NEXT: fadd.d 
a0, a0, a2 ; RV64IZFINXZDINX-NEXT: sd a0, -273(a1) ; RV64IZFINXZDINX-NEXT: ret +; +; RV32IZFINXZDINXZILSD-LABEL: fld_fsd_constant: +; RV32IZFINXZDINXZILSD: # %bb.0: +; RV32IZFINXZDINXZILSD-NEXT: lui a2, 912092 +; RV32IZFINXZDINXZILSD-NEXT: ld a4, -273(a2) +; RV32IZFINXZDINXZILSD-NEXT: fadd.d a0, a0, a4 +; RV32IZFINXZDINXZILSD-NEXT: sd a0, -273(a2) +; RV32IZFINXZDINXZILSD-NEXT: ret %1 = inttoptr i32 3735928559 to ptr %2 = load volatile double, ptr %1 %3 = fadd double %a, %2 @@ -214,8 +253,8 @@ define dso_local double @fld_stack(double %a) nounwind { ; RV32IZFINXZDINX-NEXT: mv s0, a0 ; RV32IZFINXZDINX-NEXT: addi a0, sp, 8 ; RV32IZFINXZDINX-NEXT: call notdead -; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) +; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: fadd.d a0, a0, s0 ; RV32IZFINXZDINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -237,6 +276,24 @@ define dso_local double @fld_stack(double %a) nounwind { ; RV64IZFINXZDINX-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64IZFINXZDINX-NEXT: addi sp, sp, 32 ; RV64IZFINXZDINX-NEXT: ret +; +; RV32IZFINXZDINXZILSD-LABEL: fld_stack: +; RV32IZFINXZDINXZILSD: # %bb.0: +; RV32IZFINXZDINXZILSD-NEXT: addi sp, sp, -32 +; RV32IZFINXZDINXZILSD-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IZFINXZDINXZILSD-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IZFINXZDINXZILSD-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IZFINXZDINXZILSD-NEXT: mv s1, a1 +; RV32IZFINXZDINXZILSD-NEXT: mv s0, a0 +; RV32IZFINXZDINXZILSD-NEXT: addi a0, sp, 8 +; RV32IZFINXZDINXZILSD-NEXT: call notdead +; RV32IZFINXZDINXZILSD-NEXT: ld a0, 8(sp) +; RV32IZFINXZDINXZILSD-NEXT: fadd.d a0, a0, s0 +; RV32IZFINXZDINXZILSD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IZFINXZDINXZILSD-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IZFINXZDINXZILSD-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IZFINXZDINXZILSD-NEXT: addi sp, sp, 32 +; 
RV32IZFINXZDINXZILSD-NEXT: ret %1 = alloca double, align 8 call void @notdead(ptr %1) %2 = load double, ptr %1 @@ -293,6 +350,18 @@ define dso_local void @fsd_stack(double %a, double %b) nounwind { ; RV64IZFINXZDINX-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV64IZFINXZDINX-NEXT: ret +; +; RV32IZFINXZDINXZILSD-LABEL: fsd_stack: +; RV32IZFINXZDINXZILSD: # %bb.0: +; RV32IZFINXZDINXZILSD-NEXT: addi sp, sp, -16 +; RV32IZFINXZDINXZILSD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IZFINXZDINXZILSD-NEXT: fadd.d a0, a0, a2 +; RV32IZFINXZDINXZILSD-NEXT: sd a0, 0(sp) +; RV32IZFINXZDINXZILSD-NEXT: mv a0, sp +; RV32IZFINXZDINXZILSD-NEXT: call notdead +; RV32IZFINXZDINXZILSD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IZFINXZDINXZILSD-NEXT: addi sp, sp, 16 +; RV32IZFINXZDINXZILSD-NEXT: ret %1 = fadd double %a, %b ; force store from FPR64 %2 = alloca double, align 8 store double %1, ptr %2 @@ -321,6 +390,14 @@ define dso_local void @fsd_trunc(ptr %a, double %b) nounwind noinline optnone { ; RV64IZFINXZDINX-NEXT: fcvt.s.d a1, a1 ; RV64IZFINXZDINX-NEXT: sw a1, 0(a0) ; RV64IZFINXZDINX-NEXT: ret +; +; RV32IZFINXZDINXZILSD-LABEL: fsd_trunc: +; RV32IZFINXZDINXZILSD: # %bb.0: +; RV32IZFINXZDINXZILSD-NEXT: mv a3, a2 +; RV32IZFINXZDINXZILSD-NEXT: mv a2, a1 +; RV32IZFINXZDINXZILSD-NEXT: fcvt.s.d a1, a2 +; RV32IZFINXZDINXZILSD-NEXT: sw a1, 0(a0) +; RV32IZFINXZDINXZILSD-NEXT: ret %1 = fptrunc double %b to float store float %1, ptr %a, align 4 ret void diff --git a/llvm/test/CodeGen/RISCV/double-previous-failure.ll b/llvm/test/CodeGen/RISCV/double-previous-failure.ll index c5a7ee79364c6..c0993faa9584a 100644 --- a/llvm/test/CodeGen/RISCV/double-previous-failure.ll +++ b/llvm/test/CodeGen/RISCV/double-previous-failure.ll @@ -51,15 +51,17 @@ define i32 @main() nounwind { ; RV32IZFINXZDINX-NEXT: li a0, 0 ; RV32IZFINXZDINX-NEXT: call test ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI1_0+4)(a2) -; 
RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI1_0)(a2) -; RV32IZFINXZDINX-NEXT: flt.d a2, a0, a2 +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI1_0)(a2) +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI1_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) +; RV32IZFINXZDINX-NEXT: flt.d a2, a0, a4 ; RV32IZFINXZDINX-NEXT: bnez a2, .LBB1_3 ; RV32IZFINXZDINX-NEXT: # %bb.1: # %entry ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI1_1) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI1_1+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI1_1)(a2) -; RV32IZFINXZDINX-NEXT: flt.d a0, a2, a0 +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI1_1)(a2) +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI1_1) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) +; RV32IZFINXZDINX-NEXT: flt.d a0, a4, a0 ; RV32IZFINXZDINX-NEXT: bnez a0, .LBB1_3 ; RV32IZFINXZDINX-NEXT: # %bb.2: # %if.end ; RV32IZFINXZDINX-NEXT: call exit diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll index cd87f2d2301d7..e685d21cc0928 100644 --- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll @@ -100,31 +100,33 @@ define i64 @test_floor_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: call __fixdfdi ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI1_0) -; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI1_1) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI1_0)(a2) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI1_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI1_1)(a3) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI1_1+4)(a3) -; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 -; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 -; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI1_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) +; RV32IZFINXZDINX-NEXT: fle.d a3, a4, s0 ; RV32IZFINXZDINX-NEXT: lui a4, 524288 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: neg a5, a6 -; RV32IZFINXZDINX-NEXT: and a0, a5, a0 -; RV32IZFINXZDINX-NEXT: neg a5, a3 -; 
RV32IZFINXZDINX-NEXT: or a0, a5, a0 -; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; RV32IZFINXZDINX-NEXT: beqz a6, .LBB1_2 +; RV32IZFINXZDINX-NEXT: lui a2, 524288 +; RV32IZFINXZDINX-NEXT: beqz a3, .LBB1_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: -; RV32IZFINXZDINX-NEXT: mv a5, a1 +; RV32IZFINXZDINX-NEXT: mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB1_2: -; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: beqz a3, .LBB1_4 +; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI1_1) +; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI1_1)(a1) +; RV32IZFINXZDINX-NEXT: addi a1, a1, %lo(.LCPI1_1) +; RV32IZFINXZDINX-NEXT: lw a7, 4(a1) +; RV32IZFINXZDINX-NEXT: flt.d a1, a6, s0 +; RV32IZFINXZDINX-NEXT: beqz a1, .LBB1_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a5, a4, -1 +; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB1_4: -; RV32IZFINXZDINX-NEXT: and a1, a2, a5 +; RV32IZFINXZDINX-NEXT: feq.d a4, s0, s0 +; RV32IZFINXZDINX-NEXT: neg a3, a3 +; RV32IZFINXZDINX-NEXT: neg a5, a1 +; RV32IZFINXZDINX-NEXT: neg a4, a4 +; RV32IZFINXZDINX-NEXT: and a0, a3, a0 +; RV32IZFINXZDINX-NEXT: and a1, a4, a2 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 +; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -217,25 +219,29 @@ define i64 @test_floor_ui64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: call floor +; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI3_0) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI3_0)(a2) +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI3_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) ; RV32IZFINXZDINX-NEXT: mv s0, a0 ; RV32IZFINXZDINX-NEXT: mv s1, a1 +; RV32IZFINXZDINX-NEXT: flt.d 
a0, a4, s0 +; RV32IZFINXZDINX-NEXT: neg s2, a0 +; RV32IZFINXZDINX-NEXT: mv a0, s0 ; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI3_0) ; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI3_0+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI3_0)(a4) ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 -; RV32IZFINXZDINX-NEXT: or a1, a2, a1 +; RV32IZFINXZDINX-NEXT: or a0, s2, a0 +; RV32IZFINXZDINX-NEXT: or a1, s2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; @@ -344,31 +350,33 @@ define i64 @test_ceil_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: call __fixdfdi ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI5_0) -; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI5_1) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI5_0)(a2) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI5_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI5_1)(a3) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI5_1+4)(a3) -; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 -; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 -; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI5_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) +; RV32IZFINXZDINX-NEXT: fle.d a3, a4, s0 ; RV32IZFINXZDINX-NEXT: lui a4, 524288 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: neg a5, a6 -; RV32IZFINXZDINX-NEXT: and a0, a5, a0 -; RV32IZFINXZDINX-NEXT: neg a5, a3 -; RV32IZFINXZDINX-NEXT: or a0, a5, a0 -; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; 
RV32IZFINXZDINX-NEXT: beqz a6, .LBB5_2 +; RV32IZFINXZDINX-NEXT: lui a2, 524288 +; RV32IZFINXZDINX-NEXT: beqz a3, .LBB5_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: -; RV32IZFINXZDINX-NEXT: mv a5, a1 +; RV32IZFINXZDINX-NEXT: mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB5_2: -; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: beqz a3, .LBB5_4 +; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI5_1) +; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI5_1)(a1) +; RV32IZFINXZDINX-NEXT: addi a1, a1, %lo(.LCPI5_1) +; RV32IZFINXZDINX-NEXT: lw a7, 4(a1) +; RV32IZFINXZDINX-NEXT: flt.d a1, a6, s0 +; RV32IZFINXZDINX-NEXT: beqz a1, .LBB5_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a5, a4, -1 +; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB5_4: -; RV32IZFINXZDINX-NEXT: and a1, a2, a5 +; RV32IZFINXZDINX-NEXT: feq.d a4, s0, s0 +; RV32IZFINXZDINX-NEXT: neg a3, a3 +; RV32IZFINXZDINX-NEXT: neg a5, a1 +; RV32IZFINXZDINX-NEXT: neg a4, a4 +; RV32IZFINXZDINX-NEXT: and a0, a3, a0 +; RV32IZFINXZDINX-NEXT: and a1, a4, a2 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 +; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -461,25 +469,29 @@ define i64 @test_ceil_ui64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: call ceil +; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI7_0) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI7_0)(a2) +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI7_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) ; RV32IZFINXZDINX-NEXT: mv s0, a0 ; RV32IZFINXZDINX-NEXT: mv s1, a1 +; RV32IZFINXZDINX-NEXT: flt.d a0, a4, s0 +; RV32IZFINXZDINX-NEXT: neg s2, a0 +; RV32IZFINXZDINX-NEXT: mv a0, s0 
; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI7_0) ; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI7_0+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI7_0)(a4) ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 -; RV32IZFINXZDINX-NEXT: or a1, a2, a1 +; RV32IZFINXZDINX-NEXT: or a0, s2, a0 +; RV32IZFINXZDINX-NEXT: or a1, s2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; @@ -588,31 +600,33 @@ define i64 @test_trunc_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: call __fixdfdi ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI9_0) -; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI9_1) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI9_0)(a2) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI9_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI9_1)(a3) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI9_1+4)(a3) -; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 -; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 -; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI9_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) +; RV32IZFINXZDINX-NEXT: fle.d a3, a4, s0 ; RV32IZFINXZDINX-NEXT: lui a4, 524288 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: neg a5, a6 -; RV32IZFINXZDINX-NEXT: and a0, a5, a0 -; RV32IZFINXZDINX-NEXT: neg a5, a3 -; RV32IZFINXZDINX-NEXT: or a0, a5, a0 -; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; RV32IZFINXZDINX-NEXT: beqz a6, .LBB9_2 +; RV32IZFINXZDINX-NEXT: lui a2, 524288 +; 
RV32IZFINXZDINX-NEXT: beqz a3, .LBB9_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: -; RV32IZFINXZDINX-NEXT: mv a5, a1 +; RV32IZFINXZDINX-NEXT: mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB9_2: -; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: beqz a3, .LBB9_4 +; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI9_1) +; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI9_1)(a1) +; RV32IZFINXZDINX-NEXT: addi a1, a1, %lo(.LCPI9_1) +; RV32IZFINXZDINX-NEXT: lw a7, 4(a1) +; RV32IZFINXZDINX-NEXT: flt.d a1, a6, s0 +; RV32IZFINXZDINX-NEXT: beqz a1, .LBB9_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a5, a4, -1 +; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB9_4: -; RV32IZFINXZDINX-NEXT: and a1, a2, a5 +; RV32IZFINXZDINX-NEXT: feq.d a4, s0, s0 +; RV32IZFINXZDINX-NEXT: neg a3, a3 +; RV32IZFINXZDINX-NEXT: neg a5, a1 +; RV32IZFINXZDINX-NEXT: neg a4, a4 +; RV32IZFINXZDINX-NEXT: and a0, a3, a0 +; RV32IZFINXZDINX-NEXT: and a1, a4, a2 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 +; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -705,25 +719,29 @@ define i64 @test_trunc_ui64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: call trunc +; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI11_0) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI11_0)(a2) +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI11_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) ; RV32IZFINXZDINX-NEXT: mv s0, a0 ; RV32IZFINXZDINX-NEXT: mv s1, a1 +; RV32IZFINXZDINX-NEXT: flt.d a0, a4, s0 +; RV32IZFINXZDINX-NEXT: neg s2, a0 +; RV32IZFINXZDINX-NEXT: mv a0, s0 ; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w 
a2, zero -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI11_0) ; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI11_0+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI11_0)(a4) ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 -; RV32IZFINXZDINX-NEXT: or a1, a2, a1 +; RV32IZFINXZDINX-NEXT: or a0, s2, a0 +; RV32IZFINXZDINX-NEXT: or a1, s2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; @@ -832,31 +850,33 @@ define i64 @test_round_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: call __fixdfdi ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI13_0) -; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI13_1) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI13_0)(a2) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI13_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI13_1)(a3) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI13_1+4)(a3) -; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 -; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 -; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI13_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) +; RV32IZFINXZDINX-NEXT: fle.d a3, a4, s0 ; RV32IZFINXZDINX-NEXT: lui a4, 524288 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: neg a5, a6 -; RV32IZFINXZDINX-NEXT: and a0, a5, a0 -; RV32IZFINXZDINX-NEXT: neg a5, a3 -; RV32IZFINXZDINX-NEXT: or a0, a5, a0 -; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; RV32IZFINXZDINX-NEXT: beqz a6, .LBB13_2 +; RV32IZFINXZDINX-NEXT: lui a2, 524288 +; RV32IZFINXZDINX-NEXT: beqz a3, .LBB13_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: -; 
RV32IZFINXZDINX-NEXT: mv a5, a1 +; RV32IZFINXZDINX-NEXT: mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB13_2: -; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: beqz a3, .LBB13_4 +; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI13_1) +; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI13_1)(a1) +; RV32IZFINXZDINX-NEXT: addi a1, a1, %lo(.LCPI13_1) +; RV32IZFINXZDINX-NEXT: lw a7, 4(a1) +; RV32IZFINXZDINX-NEXT: flt.d a1, a6, s0 +; RV32IZFINXZDINX-NEXT: beqz a1, .LBB13_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a5, a4, -1 +; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB13_4: -; RV32IZFINXZDINX-NEXT: and a1, a2, a5 +; RV32IZFINXZDINX-NEXT: feq.d a4, s0, s0 +; RV32IZFINXZDINX-NEXT: neg a3, a3 +; RV32IZFINXZDINX-NEXT: neg a5, a1 +; RV32IZFINXZDINX-NEXT: neg a4, a4 +; RV32IZFINXZDINX-NEXT: and a0, a3, a0 +; RV32IZFINXZDINX-NEXT: and a1, a4, a2 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 +; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -949,25 +969,29 @@ define i64 @test_round_ui64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: call round +; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI15_0) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI15_0)(a2) +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI15_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) ; RV32IZFINXZDINX-NEXT: mv s0, a0 ; RV32IZFINXZDINX-NEXT: mv s1, a1 +; RV32IZFINXZDINX-NEXT: flt.d a0, a4, s0 +; RV32IZFINXZDINX-NEXT: neg s2, a0 +; RV32IZFINXZDINX-NEXT: mv a0, s0 ; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI15_0) ; 
RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI15_0+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI15_0)(a4) ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 -; RV32IZFINXZDINX-NEXT: or a1, a2, a1 +; RV32IZFINXZDINX-NEXT: or a0, s2, a0 +; RV32IZFINXZDINX-NEXT: or a1, s2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; @@ -1076,31 +1100,33 @@ define i64 @test_roundeven_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: call __fixdfdi ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI17_0) -; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI17_1) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI17_0)(a2) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI17_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI17_1)(a3) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI17_1+4)(a3) -; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 -; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 -; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI17_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) +; RV32IZFINXZDINX-NEXT: fle.d a3, a4, s0 ; RV32IZFINXZDINX-NEXT: lui a4, 524288 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: neg a5, a6 -; RV32IZFINXZDINX-NEXT: and a0, a5, a0 -; RV32IZFINXZDINX-NEXT: neg a5, a3 -; RV32IZFINXZDINX-NEXT: or a0, a5, a0 -; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; RV32IZFINXZDINX-NEXT: beqz a6, .LBB17_2 +; RV32IZFINXZDINX-NEXT: lui a2, 524288 +; RV32IZFINXZDINX-NEXT: beqz a3, .LBB17_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: -; RV32IZFINXZDINX-NEXT: mv a5, a1 +; RV32IZFINXZDINX-NEXT: 
mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB17_2: -; RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: beqz a3, .LBB17_4 +; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI17_1) +; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI17_1)(a1) +; RV32IZFINXZDINX-NEXT: addi a1, a1, %lo(.LCPI17_1) +; RV32IZFINXZDINX-NEXT: lw a7, 4(a1) +; RV32IZFINXZDINX-NEXT: flt.d a1, a6, s0 +; RV32IZFINXZDINX-NEXT: beqz a1, .LBB17_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a5, a4, -1 +; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB17_4: -; RV32IZFINXZDINX-NEXT: and a1, a2, a5 +; RV32IZFINXZDINX-NEXT: feq.d a4, s0, s0 +; RV32IZFINXZDINX-NEXT: neg a3, a3 +; RV32IZFINXZDINX-NEXT: neg a5, a1 +; RV32IZFINXZDINX-NEXT: neg a4, a4 +; RV32IZFINXZDINX-NEXT: and a0, a3, a0 +; RV32IZFINXZDINX-NEXT: and a1, a4, a2 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 +; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1193,25 +1219,29 @@ define i64 @test_roundeven_ui64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: call roundeven +; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI19_0) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI19_0)(a2) +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI19_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) ; RV32IZFINXZDINX-NEXT: mv s0, a0 ; RV32IZFINXZDINX-NEXT: mv s1, a1 +; RV32IZFINXZDINX-NEXT: flt.d a0, a4, s0 +; RV32IZFINXZDINX-NEXT: neg s2, a0 +; RV32IZFINXZDINX-NEXT: mv a0, s0 ; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI19_0) ; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 -; 
RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI19_0+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI19_0)(a4) ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 -; RV32IZFINXZDINX-NEXT: or a1, a2, a1 +; RV32IZFINXZDINX-NEXT: or a0, s2, a0 +; RV32IZFINXZDINX-NEXT: or a1, s2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; @@ -1320,31 +1350,33 @@ define i64 @test_rint_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: mv s1, a1 ; RV32IZFINXZDINX-NEXT: call __fixdfdi ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI21_0) -; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI21_1) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI21_0)(a2) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI21_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI21_1)(a3) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI21_1+4)(a3) -; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 -; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 -; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI21_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) +; RV32IZFINXZDINX-NEXT: fle.d a3, a4, s0 ; RV32IZFINXZDINX-NEXT: lui a4, 524288 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: neg a5, a6 -; RV32IZFINXZDINX-NEXT: and a0, a5, a0 -; RV32IZFINXZDINX-NEXT: neg a5, a3 -; RV32IZFINXZDINX-NEXT: or a0, a5, a0 -; RV32IZFINXZDINX-NEXT: lui a5, 524288 -; RV32IZFINXZDINX-NEXT: beqz a6, .LBB21_2 +; RV32IZFINXZDINX-NEXT: lui a2, 524288 +; RV32IZFINXZDINX-NEXT: beqz a3, .LBB21_2 ; RV32IZFINXZDINX-NEXT: # %bb.1: -; RV32IZFINXZDINX-NEXT: mv a5, a1 +; RV32IZFINXZDINX-NEXT: mv a2, a1 ; RV32IZFINXZDINX-NEXT: .LBB21_2: -; 
RV32IZFINXZDINX-NEXT: and a0, a2, a0 -; RV32IZFINXZDINX-NEXT: beqz a3, .LBB21_4 +; RV32IZFINXZDINX-NEXT: lui a1, %hi(.LCPI21_1) +; RV32IZFINXZDINX-NEXT: lw a6, %lo(.LCPI21_1)(a1) +; RV32IZFINXZDINX-NEXT: addi a1, a1, %lo(.LCPI21_1) +; RV32IZFINXZDINX-NEXT: lw a7, 4(a1) +; RV32IZFINXZDINX-NEXT: flt.d a1, a6, s0 +; RV32IZFINXZDINX-NEXT: beqz a1, .LBB21_4 ; RV32IZFINXZDINX-NEXT: # %bb.3: -; RV32IZFINXZDINX-NEXT: addi a5, a4, -1 +; RV32IZFINXZDINX-NEXT: addi a2, a4, -1 ; RV32IZFINXZDINX-NEXT: .LBB21_4: -; RV32IZFINXZDINX-NEXT: and a1, a2, a5 +; RV32IZFINXZDINX-NEXT: feq.d a4, s0, s0 +; RV32IZFINXZDINX-NEXT: neg a3, a3 +; RV32IZFINXZDINX-NEXT: neg a5, a1 +; RV32IZFINXZDINX-NEXT: neg a4, a4 +; RV32IZFINXZDINX-NEXT: and a0, a3, a0 +; RV32IZFINXZDINX-NEXT: and a1, a4, a2 +; RV32IZFINXZDINX-NEXT: or a0, a5, a0 +; RV32IZFINXZDINX-NEXT: and a0, a4, a0 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1437,25 +1469,29 @@ define i64 @test_rint_ui64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: call rint +; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI23_0) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI23_0)(a2) +; RV32IZFINXZDINX-NEXT: addi a2, a2, %lo(.LCPI23_0) +; RV32IZFINXZDINX-NEXT: lw a5, 4(a2) ; RV32IZFINXZDINX-NEXT: mv s0, a0 ; RV32IZFINXZDINX-NEXT: mv s1, a1 +; RV32IZFINXZDINX-NEXT: flt.d a0, a4, s0 +; RV32IZFINXZDINX-NEXT: neg s2, a0 +; RV32IZFINXZDINX-NEXT: mv a0, s0 ; RV32IZFINXZDINX-NEXT: call __fixunsdfdi ; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI23_0) ; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0 -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI23_0+4)(a4) -; 
RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI23_0)(a4) ; RV32IZFINXZDINX-NEXT: neg a2, a2 ; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: and a1, a2, a1 -; RV32IZFINXZDINX-NEXT: flt.d a2, a4, s0 -; RV32IZFINXZDINX-NEXT: neg a2, a2 -; RV32IZFINXZDINX-NEXT: or a0, a2, a0 -; RV32IZFINXZDINX-NEXT: or a1, a2, a1 +; RV32IZFINXZDINX-NEXT: or a0, s2, a0 +; RV32IZFINXZDINX-NEXT: or a1, s2, a1 ; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index cdbb6e6425189..00311bab50836 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -74,6 +74,7 @@ ; CHECK-NEXT: predictable-select-expensive - Prefer likely predicted branches over selects. ; CHECK-NEXT: prefer-vsetvli-over-read-vlenb - Prefer vsetvli over read vlenb CSR to calculate VLEN. ; CHECK-NEXT: prefer-w-inst - Prefer instructions with W suffix. +; CHECK-NEXT: q - 'Q' (Quad-Precision Floating-Point). ; CHECK-NEXT: relax - Enable Linker relaxation.. ; CHECK-NEXT: reserve-x1 - Reserve X1. ; CHECK-NEXT: reserve-x10 - Reserve X10. @@ -171,6 +172,7 @@ ; CHECK-NEXT: ventana-veyron - Ventana Veyron-Series processors. ; CHECK-NEXT: vxrm-pipeline-flush - VXRM writes causes pipeline flush. ; CHECK-NEXT: xandesperf - 'XAndesPerf' (Andes Performance Extension). +; CHECK-NEXT: xandesvdot - 'XAndesVDot' (Andes Vector Dot Product Extension). ; CHECK-NEXT: xandesvpackfph - 'XAndesVPackFPH' (Andes Vector Packed FP16 Extension). ; CHECK-NEXT: xcvalu - 'XCValu' (CORE-V ALU Operations). ; CHECK-NEXT: xcvbi - 'XCVbi' (CORE-V Immediate Branching). 
diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore-zilsd.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore-zilsd.ll new file mode 100644 index 0000000000000..e34c5272ebaeb --- /dev/null +++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore-zilsd.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+zdinx,+zilsd -verify-machineinstrs \ +; RUN: -code-model=medium < %s | FileCheck %s + +@g_0 = global double 0.0 + +define double @load_g_0() nounwind { +; CHECK-LABEL: load_g_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: .Lpcrel_hi0: +; CHECK-NEXT: auipc a0, %pcrel_hi(g_0) +; CHECK-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi0)(a0) +; CHECK-NEXT: ret +entry: + %0 = load double, ptr @g_0 + ret double %0 +} + +define void @store_g_0() nounwind { +; CHECK-LABEL: store_g_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: .Lpcrel_hi1: +; CHECK-NEXT: auipc a0, %pcrel_hi(g_0) +; CHECK-NEXT: fcvt.d.w a2, zero +; CHECK-NEXT: sd a2, %pcrel_lo(.Lpcrel_hi1)(a0) +; CHECK-NEXT: ret +entry: + store double 0.0, ptr @g_0 + ret void +} diff --git a/llvm/test/CodeGen/RISCV/make-compressible-zilsd.mir b/llvm/test/CodeGen/RISCV/make-compressible-zilsd.mir new file mode 100644 index 0000000000000..c5ac599d8d53f --- /dev/null +++ b/llvm/test/CodeGen/RISCV/make-compressible-zilsd.mir @@ -0,0 +1,299 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -o - %s -mtriple=riscv32 -mattr=+zilsd,+zclsd,+zdinx -simplify-mir \ +# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32 %s +--- | + define void @store_common_value_double(ptr %a, ptr %b, ptr %c, i32 %d, double %e, double %f) #0 { + entry: + store double %f, ptr %a, align 8 + store double %f, ptr %b, align 8 + store double %f, ptr %c, align 8 + ret void + } + + define void @store_common_ptr_double(double %a, double %b, double %d, ptr %p) #0 { + entry: + store volatile double %a, ptr %p, align 8 + 
store volatile double %b, ptr %p, align 8 + store volatile double %b, ptr %p, align 8 + ret void + } + + define void @load_common_ptr_double(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, ptr %g) #0 { + entry: + %0 = load double, ptr %g, align 8 + %arrayidx1 = getelementptr inbounds { double, double, i32 }, ptr %g, i32 0, i32 1 + %1 = load double, ptr %arrayidx1, align 8 + %arrayidx2 = getelementptr inbounds { double, double, i32 }, ptr %g, i32 0, i32 2 + %2 = load i32, ptr %arrayidx2, align 8 + tail call void @load_common_ptr_double_1(double %0, double %1, i32 %2) + ret void + } + + declare void @load_common_ptr_double_1(double, double, double) #0 + + define void @store_large_offset_double(ptr %p, i32 %dummy, double %a, double %b, double %c) #0 { + entry: + %0 = getelementptr inbounds double, ptr %p, i32 100 + store volatile double %a, ptr %0, align 8 + %1 = getelementptr inbounds double, ptr %p, i32 101 + store volatile double %b, ptr %1, align 8 + %2 = getelementptr inbounds double, ptr %p, i32 102 + store volatile double %b, ptr %2, align 8 + ret void + } + + define void @load_large_offset_double(i32 %a, i32 %b, i32 %c, i32 %d, ptr %p) #0 { + entry: + %arrayidx = getelementptr inbounds { [102 x double], i32 }, ptr %p, i32 0, i32 0, i32 100 + %0 = load double, ptr %arrayidx, align 8 + %arrayidx1 = getelementptr inbounds { [102 x double], i32 }, ptr %p, i32 0, i32 0, i32 101 + %1 = load double, ptr %arrayidx1, align 8 + %arrayidx2 = getelementptr inbounds { [102 x double], i32 }, ptr %p, i32 0, i32 1 + %2 = load i32, ptr %arrayidx2, align 8 + tail call void @load_large_offset_double_1(double %0, double %1, i32 %2) + ret void + } + + declare void @load_large_offset_double_1(double, double) #0 + + define void @store_common_value_double_no_opt(ptr %a, i32 %b, double %c, double %d, double %e) #0 { + entry: + store double %e, ptr %a, align 8 + ret void + } + + define void @store_common_ptr_double_no_opt(double %a, i32 %b, i32 %c, i32 %d, i32 %e, ptr %p) #0 { + entry: 
+ store volatile double %a, ptr %p, align 8 + ret void + } + + define double @load_common_ptr_double_no_opt(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, ptr %g) #0 { + entry: + %0 = load double, ptr %g, align 8 + ret double %0 + } + + define void @store_large_offset_double_no_opt(ptr %p, double %a, double %b) #0 { + entry: + %0 = getelementptr inbounds double, ptr %p, i32 100 + store volatile double %a, ptr %0, align 8 + %1 = getelementptr inbounds double, ptr %p, i32 101 + store volatile double %b, ptr %1, align 8 + ret void + } + + define { double, double } @load_large_offset_double_no_opt(ptr %p) #0 { + entry: + %arrayidx = getelementptr inbounds double, ptr %p, i32 100 + %0 = load double, ptr %arrayidx, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %p, i32 101 + %1 = load double, ptr %arrayidx1, align 8 + %2 = insertvalue { double, double } undef, double %0, 0 + %3 = insertvalue { double, double } %2, double %1, 1 + ret { double, double } %3 + } + + attributes #0 = { minsize "target-features"="+zilsd,+zdinx" } +... +--- +name: store_common_value_double +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11, $x12, $x16, $x17 + + ; RV32-LABEL: name: store_common_value_double + ; RV32: liveins: $x10, $x11, $x12, $x16, $x17 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: $x14 = ADDI $x16, 0 + ; RV32-NEXT: $x15 = ADDI $x17, 0 + ; RV32-NEXT: SD_RV32 $x14_x15, killed renamable $x10, 0 :: (store (s64) into %ir.a) + ; RV32-NEXT: SD_RV32 $x14_x15, killed renamable $x11, 0 :: (store (s64) into %ir.b) + ; RV32-NEXT: SD_RV32 killed $x14_x15, killed renamable $x12, 0 :: (store (s64) into %ir.c) + ; RV32-NEXT: PseudoRET + SD_RV32 renamable $x16_x17, killed renamable $x10, 0 :: (store (s64) into %ir.a) + SD_RV32 renamable $x16_x17, killed renamable $x11, 0 :: (store (s64) into %ir.b) + SD_RV32 killed renamable $x16_x17, killed renamable $x12, 0 :: (store (s64) into %ir.c) + PseudoRET + +... 
+--- +name: store_common_ptr_double +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11, $x12, $x13, $x14, $x15, $x16 + + ; RV32-LABEL: name: store_common_ptr_double + ; RV32: liveins: $x10, $x11, $x12, $x13, $x14, $x15, $x16 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: $x14 = ADDI $x16, 0 + ; RV32-NEXT: SD_RV32 killed renamable $x10_x11, $x14, 0 :: (volatile store (s64) into %ir.p) + ; RV32-NEXT: SD_RV32 renamable $x12_x13, $x14, 0 :: (volatile store (s64) into %ir.p) + ; RV32-NEXT: SD_RV32 killed renamable $x12_x13, killed $x14, 0 :: (volatile store (s64) into %ir.p) + ; RV32-NEXT: PseudoRET + SD_RV32 killed renamable $x10_x11, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + SD_RV32 renamable $x12_x13, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + SD_RV32 killed renamable $x12_x13, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) + PseudoRET + +... +--- +name: load_common_ptr_double +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x16 + + ; RV32-LABEL: name: load_common_ptr_double + ; RV32: liveins: $x16 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: $x15 = ADDI $x16, 0 + ; RV32-NEXT: renamable $x10_x11 = LD_RV32 $x15, 0 :: (load (s64) from %ir.g) + ; RV32-NEXT: renamable $x12_x13 = LD_RV32 $x15, 8 :: (load (s64) from %ir.arrayidx1) + ; RV32-NEXT: renamable $x14 = LW killed $x15, 16 :: (load (s32) from %ir.arrayidx2, align 8) + ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, csr_ilp32_lp64, implicit $x2, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14 + renamable $x10_x11 = LD_RV32 renamable $x16, 0 :: (load (s64) from %ir.g) + renamable $x12_x13 = LD_RV32 renamable $x16, 8 :: (load (s64) from %ir.arrayidx1) + renamable $x14 = LW killed renamable $x16, 16 :: (load (s32) from %ir.arrayidx2, align 8) + PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, csr_ilp32_lp64, implicit $x2, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14 + 
+... +--- +name: store_large_offset_double +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x12, $x13, $x14, $x15 + + ; RV32-LABEL: name: store_large_offset_double + ; RV32: liveins: $x10, $x12, $x13, $x14, $x15 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: $x11 = ADDI $x10, 768 + ; RV32-NEXT: SD_RV32 killed renamable $x12_x13, $x11, 32 :: (volatile store (s64) into %ir.0) + ; RV32-NEXT: SD_RV32 renamable $x14_x15, $x11, 40 :: (volatile store (s64) into %ir.1) + ; RV32-NEXT: SD_RV32 killed renamable $x14_x15, killed $x11, 48 :: (volatile store (s64) into %ir.2) + ; RV32-NEXT: PseudoRET + SD_RV32 killed renamable $x12_x13, renamable $x10, 800 :: (volatile store (s64) into %ir.0) + SD_RV32 renamable $x14_x15, renamable $x10, 808 :: (volatile store (s64) into %ir.1) + SD_RV32 killed renamable $x14_x15, killed renamable $x10, 816 :: (volatile store (s64) into %ir.2) + PseudoRET + +... +--- +name: load_large_offset_double +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x14 + + ; RV32-LABEL: name: load_large_offset_double + ; RV32: liveins: $x14 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: $x15 = ADDI $x14, 768 + ; RV32-NEXT: renamable $x10_x11 = LD_RV32 $x15, 32 :: (load (s64) from %ir.arrayidx) + ; RV32-NEXT: renamable $x12_x13 = LD_RV32 $x15, 40 :: (load (s64) from %ir.arrayidx1) + ; RV32-NEXT: renamable $x14 = LW killed $x15, 48 :: (load (s32) from %ir.arrayidx2, align 8) + ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, csr_ilp32_lp64, implicit $x2, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14 + renamable $x10_x11 = LD_RV32 renamable $x14, 800 :: (load (s64) from %ir.arrayidx) + renamable $x12_x13 = LD_RV32 renamable $x14, 808 :: (load (s64) from %ir.arrayidx1) + renamable $x14 = LW killed renamable $x14, 816 :: (load (s32) from %ir.arrayidx2, align 8) + PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, csr_ilp32_lp64, implicit $x2, implicit $x10, implicit $x11, implicit $x12, 
implicit $x13, implicit $x14 +... +--- +name: store_common_value_double_no_opt +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x16, $x17 + + ; RV32-LABEL: name: store_common_value_double_no_opt + ; RV32: liveins: $x10, $x16, $x17 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: SD_RV32 killed renamable $x16_x17, killed renamable $x10, 0 :: (store (s64) into %ir.a) + ; RV32-NEXT: PseudoRET + SD_RV32 killed renamable $x16_x17, killed renamable $x10, 0 :: (store (s64) into %ir.a) + PseudoRET + +... +--- +name: store_common_ptr_double_no_opt +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11, $x16 + + ; RV32-LABEL: name: store_common_ptr_double_no_opt + ; RV32: liveins: $x10, $x11, $x16 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: SD_RV32 killed renamable $x10_x11, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32-NEXT: PseudoRET + SD_RV32 killed renamable $x10_x11, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) + PseudoRET + +... +--- +name: load_common_ptr_double_no_opt +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x16 + + ; RV32-LABEL: name: load_common_ptr_double_no_opt + ; RV32: liveins: $x16 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: renamable $x10_x11 = LD_RV32 killed renamable $x16, 0 :: (load (s64) from %ir.g) + ; RV32-NEXT: PseudoRET implicit $x10, implicit $x11 + renamable $x10_x11 = LD_RV32 killed renamable $x16, 0 :: (load (s64) from %ir.g) + PseudoRET implicit $x10, implicit $x11 + +... 
+--- +name: store_large_offset_double_no_opt +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11, $x12, $x13, $x14 + + ; RV32-LABEL: name: store_large_offset_double_no_opt + ; RV32: liveins: $x10, $x11, $x12, $x13, $x14 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: $x15 = ADDI $x14, 0 + ; RV32-NEXT: $x17 = ADDI $x12, 0 + ; RV32-NEXT: $x16 = ADDI $x11, 0 + ; RV32-NEXT: SD_RV32 killed renamable $x16_x17, renamable $x10, 800 :: (volatile store (s64) into %ir.0) + ; RV32-NEXT: $x14 = ADDI $x13, 0 + ; RV32-NEXT: SD_RV32 killed renamable $x14_x15, killed renamable $x10, 808 :: (volatile store (s64) into %ir.1) + ; RV32-NEXT: PseudoRET + $x15 = ADDI $x14, 0 + $x17 = ADDI $x12, 0 + $x16 = ADDI $x11, 0 + SD_RV32 killed renamable $x16_x17, renamable $x10, 800 :: (volatile store (s64) into %ir.0) + $x14 = ADDI $x13, 0 + SD_RV32 killed renamable $x14_x15, killed renamable $x10, 808 :: (volatile store (s64) into %ir.1) + PseudoRET + +... +--- +name: load_large_offset_double_no_opt +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + + ; RV32-LABEL: name: load_large_offset_double_no_opt + ; RV32: liveins: $x10 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: renamable $x14_x15 = LD_RV32 renamable $x10, 800 :: (load (s64) from %ir.arrayidx) + ; RV32-NEXT: renamable $x12_x13 = LD_RV32 killed renamable $x10, 808 :: (load (s64) from %ir.arrayidx1) + ; RV32-NEXT: $x10 = ADDI renamable $x14, 0 + ; RV32-NEXT: $x11 = ADDI killed renamable $x15, 0 + ; RV32-NEXT: PseudoRET implicit $x10, implicit $x11, implicit $x12, implicit $x13 + renamable $x14_x15 = LD_RV32 renamable $x10, 800 :: (load (s64) from %ir.arrayidx) + renamable $x12_x13 = LD_RV32 killed renamable $x10, 808 :: (load (s64) from %ir.arrayidx1) + $x10 = ADDI renamable $x14, 0 + $x11 = ADDI killed renamable $x15, 0 + PseudoRET implicit $x10, implicit $x11, implicit $x12, implicit $x13 + +... 
diff --git a/llvm/test/CodeGen/RISCV/mul-expand.ll b/llvm/test/CodeGen/RISCV/mul-expand.ll index 5bb74bc184d8b..a75a7355fa407 100644 --- a/llvm/test/CodeGen/RISCV/mul-expand.ll +++ b/llvm/test/CodeGen/RISCV/mul-expand.ll @@ -7,17 +7,30 @@ define i32 @muli32_0x555(i32 %a) nounwind { ; RV32I-LABEL: muli32_0x555: ; RV32I: # %bb.0: -; RV32I-NEXT: li a1, 1365 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: slli a2, a0, 4 +; RV32I-NEXT: slli a3, a0, 6 +; RV32I-NEXT: add a2, a2, a3 +; RV32I-NEXT: slli a3, a0, 8 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: slli a0, a0, 10 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: ret ; ; RV64I-LABEL: muli32_0x555: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: li a1, 1365 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: slli a2, a0, 4 +; RV64I-NEXT: slli a3, a0, 6 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a0, 8 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: slli a0, a0, 10 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: ret %a1 = mul i32 %a, 1365 ret i32 %a1 @@ -37,8 +50,17 @@ define i64 @muli64_0x555(i64 %a) nounwind { ; ; RV64I-LABEL: muli64_0x555: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 1365 -; RV64I-NEXT: tail __muldi3 +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: slli a2, a0, 4 +; RV64I-NEXT: slli a3, a0, 6 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a0, 8 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: slli a0, a0, 10 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: ret %a1 = mul i64 %a, 1365 ret i64 %a1 } @@ -46,19 +68,70 @@ define i64 @muli64_0x555(i64 %a) nounwind { define i32 @muli32_0x33333333(i32 %a) nounwind { ; RV32I-LABEL: 
muli32_0x33333333: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi a1, a1, 819 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 4 +; RV32I-NEXT: slli a2, a0, 6 +; RV32I-NEXT: slli a3, a0, 8 +; RV32I-NEXT: slli a4, a0, 10 +; RV32I-NEXT: slli a5, a0, 14 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: slli a4, a0, 22 +; RV32I-NEXT: sub a5, a5, a2 +; RV32I-NEXT: slli a2, a0, 24 +; RV32I-NEXT: sub a4, a4, a2 +; RV32I-NEXT: slli a2, a0, 2 +; RV32I-NEXT: sub a2, a2, a0 +; RV32I-NEXT: sub a2, a2, a1 +; RV32I-NEXT: slli a1, a0, 12 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: slli a3, a0, 18 +; RV32I-NEXT: add a3, a5, a3 +; RV32I-NEXT: slli a5, a0, 26 +; RV32I-NEXT: add a4, a4, a5 +; RV32I-NEXT: sub a2, a2, a1 +; RV32I-NEXT: slli a1, a0, 20 +; RV32I-NEXT: sub a3, a3, a1 +; RV32I-NEXT: slli a1, a0, 28 +; RV32I-NEXT: sub a4, a4, a1 +; RV32I-NEXT: slli a0, a0, 30 +; RV32I-NEXT: add a2, a2, a3 +; RV32I-NEXT: add a0, a4, a0 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: ret ; ; RV64I-LABEL: muli32_0x33333333: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: slli a1, a0, 4 +; RV64I-NEXT: slli a2, a0, 6 +; RV64I-NEXT: slli a3, a0, 8 +; RV64I-NEXT: slli a4, a0, 10 +; RV64I-NEXT: slli a5, a0, 14 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: sub a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 22 +; RV64I-NEXT: sub a5, a5, a2 +; RV64I-NEXT: slli a2, a0, 24 +; RV64I-NEXT: sub a4, a4, a2 +; RV64I-NEXT: slli a2, a0, 2 +; RV64I-NEXT: sub a2, a2, a0 +; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: slli a1, a0, 12 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: slli a3, a0, 18 +; RV64I-NEXT: add a3, a5, a3 +; RV64I-NEXT: slli a5, a0, 26 +; RV64I-NEXT: add a4, 
a4, a5 +; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: slli a1, a0, 20 +; RV64I-NEXT: sub a3, a3, a1 +; RV64I-NEXT: slli a1, a0, 28 +; RV64I-NEXT: sub a4, a4, a1 +; RV64I-NEXT: slli a0, a0, 30 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: add a0, a4, a0 +; RV64I-NEXT: add a0, a2, a0 ; RV64I-NEXT: ret %a1 = mul i32 %a, 858993459 ret i32 %a1 @@ -79,9 +152,37 @@ define i64 @muli64_0x33333333(i64 %a) nounwind { ; ; RV64I-LABEL: muli64_0x33333333: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: tail __muldi3 +; RV64I-NEXT: slli a1, a0, 4 +; RV64I-NEXT: slli a2, a0, 6 +; RV64I-NEXT: slli a3, a0, 8 +; RV64I-NEXT: slli a4, a0, 10 +; RV64I-NEXT: slli a5, a0, 14 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: sub a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 22 +; RV64I-NEXT: sub a5, a5, a2 +; RV64I-NEXT: slli a2, a0, 24 +; RV64I-NEXT: sub a4, a4, a2 +; RV64I-NEXT: slli a2, a0, 2 +; RV64I-NEXT: sub a2, a2, a0 +; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: slli a1, a0, 12 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: slli a3, a0, 18 +; RV64I-NEXT: add a3, a5, a3 +; RV64I-NEXT: slli a5, a0, 26 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: slli a1, a0, 20 +; RV64I-NEXT: sub a3, a3, a1 +; RV64I-NEXT: slli a1, a0, 28 +; RV64I-NEXT: sub a4, a4, a1 +; RV64I-NEXT: slli a0, a0, 30 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: add a0, a4, a0 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: ret %a1 = mul i64 %a, 858993459 ret i64 %a1 } @@ -89,19 +190,72 @@ define i64 @muli64_0x33333333(i64 %a) nounwind { define i32 @muli32_0xaaaaaaaa(i32 %a) nounwind { ; RV32I-LABEL: muli32_0xaaaaaaaa: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a1, 699051 -; RV32I-NEXT: addi a1, a1, -1366 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 3 +; RV32I-NEXT: slli a2, a0, 1 +; RV32I-NEXT: slli a3, a0, 5 +; RV32I-NEXT: slli a4, a0, 7 +; RV32I-NEXT: slli a5, a0, 9 +; RV32I-NEXT: slli a6, a0, 11 +; RV32I-NEXT: add a1, a2, a1 
+; RV32I-NEXT: slli a2, a0, 15 +; RV32I-NEXT: add a3, a3, a4 +; RV32I-NEXT: slli a4, a0, 17 +; RV32I-NEXT: add a5, a5, a6 +; RV32I-NEXT: slli a6, a0, 23 +; RV32I-NEXT: add a2, a2, a4 +; RV32I-NEXT: slli a4, a0, 25 +; RV32I-NEXT: add a4, a6, a4 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: slli a3, a0, 13 +; RV32I-NEXT: add a3, a5, a3 +; RV32I-NEXT: slli a5, a0, 19 +; RV32I-NEXT: add a2, a2, a5 +; RV32I-NEXT: slli a5, a0, 27 +; RV32I-NEXT: add a4, a4, a5 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: slli a3, a0, 21 +; RV32I-NEXT: add a2, a2, a3 +; RV32I-NEXT: slli a3, a0, 29 +; RV32I-NEXT: add a3, a4, a3 +; RV32I-NEXT: slli a0, a0, 31 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: ret ; ; RV64I-LABEL: muli32_0xaaaaaaaa: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a1, 699051 -; RV64I-NEXT: addiw a1, a1, -1366 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: slli a1, a0, 3 +; RV64I-NEXT: slli a2, a0, 1 +; RV64I-NEXT: slli a3, a0, 5 +; RV64I-NEXT: slli a4, a0, 7 +; RV64I-NEXT: slli a5, a0, 9 +; RV64I-NEXT: slli a6, a0, 11 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: slli a2, a0, 15 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 17 +; RV64I-NEXT: add a5, a5, a6 +; RV64I-NEXT: slli a6, a0, 23 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 25 +; RV64I-NEXT: add a4, a6, a4 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: slli a3, a0, 13 +; RV64I-NEXT: add a3, a5, a3 +; RV64I-NEXT: slli a5, a0, 19 +; RV64I-NEXT: add a2, a2, a5 +; RV64I-NEXT: slli a5, a0, 27 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: slli a3, a0, 21 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a0, 29 +; RV64I-NEXT: add a3, a4, a3 +; RV64I-NEXT: slli a0, a0, 31 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: sub a0, a3, a0 +; RV64I-NEXT: add a0, a1, 
a0 ; RV64I-NEXT: ret %a1 = mul i32 %a, -1431655766 ret i32 %a1 @@ -122,10 +276,38 @@ define i64 @muli64_0xaaaaaaaa(i64 %a) nounwind { ; ; RV64I-LABEL: muli64_0xaaaaaaaa: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, 349525 -; RV64I-NEXT: addiw a1, a1, 1365 -; RV64I-NEXT: slli a1, a1, 1 -; RV64I-NEXT: tail __muldi3 +; RV64I-NEXT: slli a1, a0, 3 +; RV64I-NEXT: slli a2, a0, 1 +; RV64I-NEXT: slli a3, a0, 5 +; RV64I-NEXT: slli a4, a0, 7 +; RV64I-NEXT: slli a5, a0, 9 +; RV64I-NEXT: slli a6, a0, 11 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: slli a2, a0, 15 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 17 +; RV64I-NEXT: add a5, a5, a6 +; RV64I-NEXT: slli a6, a0, 23 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 25 +; RV64I-NEXT: add a4, a6, a4 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: slli a3, a0, 13 +; RV64I-NEXT: add a3, a5, a3 +; RV64I-NEXT: slli a5, a0, 19 +; RV64I-NEXT: add a2, a2, a5 +; RV64I-NEXT: slli a5, a0, 27 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: slli a3, a0, 21 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a0, 29 +; RV64I-NEXT: add a3, a4, a3 +; RV64I-NEXT: slli a0, a0, 31 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: ret %a1 = mul i64 %a, 2863311530 ret i64 %a1 } @@ -171,19 +353,36 @@ define i64 @muli64_0x0fffffff(i64 %a) nounwind { define i32 @muli32_0xf0f0f0f0(i32 %a) nounwind { ; RV32I-LABEL: muli32_0xf0f0f0f0: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a1, 986895 -; RV32I-NEXT: addi a1, a1, 240 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 4 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a3, a0, 12 +; RV32I-NEXT: slli a4, a0, 16 +; RV32I-NEXT: sub a2, a2, a1 +; RV32I-NEXT: slli a1, a0, 20 +; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: slli a4, a0, 24 +; RV32I-NEXT: sub a1, a1, a4 +; RV32I-NEXT: slli a0, a0, 28 +; RV32I-NEXT: sub a2, a2, a3 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: ret ; ; 
RV64I-LABEL: muli32_0xf0f0f0f0: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a1, 986895 -; RV64I-NEXT: addiw a1, a1, 240 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: slli a1, a0, 4 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a3, a0, 12 +; RV64I-NEXT: slli a4, a0, 16 +; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: slli a1, a0, 20 +; RV64I-NEXT: sub a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 24 +; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: slli a0, a0, 28 +; RV64I-NEXT: sub a2, a2, a3 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: sub a0, a2, a0 ; RV64I-NEXT: ret %a1 = mul i32 %a, -252645136 ret i32 %a1 @@ -204,10 +403,22 @@ define i64 @muli64_0xf0f0f0f0(i64 %a) nounwind { ; ; RV64I-LABEL: muli64_0xf0f0f0f0: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: slli a1, a1, 4 -; RV64I-NEXT: tail __muldi3 +; RV64I-NEXT: slli a1, a0, 4 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a3, a0, 12 +; RV64I-NEXT: slli a4, a0, 16 +; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: slli a1, a0, 20 +; RV64I-NEXT: sub a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 24 +; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: sub a2, a2, a3 +; RV64I-NEXT: slli a3, a0, 28 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: ret %a1 = mul i64 %a, 4042322160 ret i64 %a1 } @@ -215,19 +426,28 @@ define i64 @muli64_0xf0f0f0f0(i64 %a) nounwind { define i32 @muli32_0xf7f7f7f7(i32 %a) nounwind { ; RV32I-LABEL: muli32_0xf7f7f7f7: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a1, 1015679 -; RV32I-NEXT: addi a1, a1, 2039 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 3 +; RV32I-NEXT: slli a2, a0, 11 +; RV32I-NEXT: slli a3, a0, 19 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: slli a0, a0, 27 +; RV32I-NEXT: add a2, a2, a3 +; RV32I-NEXT: add a1, a1, 
a2 +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret ; ; RV64I-LABEL: muli32_0xf7f7f7f7: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a1, 1015679 -; RV64I-NEXT: addiw a1, a1, 2039 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: slli a1, a0, 3 +; RV64I-NEXT: slli a2, a0, 11 +; RV64I-NEXT: slli a3, a0, 19 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: slli a0, a0, 27 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret %a1 = mul i32 %a, -134744073 ret i32 %a1 @@ -248,11 +468,17 @@ define i64 @muli64_0xf7f7f7f7(i64 %a) nounwind { ; ; RV64I-LABEL: muli64_0xf7f7f7f7: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, 248 -; RV64I-NEXT: addiw a1, a1, -129 -; RV64I-NEXT: slli a1, a1, 12 -; RV64I-NEXT: addi a1, a1, 2039 -; RV64I-NEXT: tail __muldi3 +; RV64I-NEXT: slli a1, a0, 3 +; RV64I-NEXT: slli a2, a0, 11 +; RV64I-NEXT: slli a3, a0, 19 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a0, 27 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret %a1 = mul i64 %a, 4160223223 ret i64 %a1 } @@ -405,19 +631,44 @@ define i64 @muli64_0x7fffffff(i64 %a) nounwind { define i32 @muli32_0xdeadbeef(i32 %a) nounwind { ; RV32I-LABEL: muli32_0xdeadbeef: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a1, 912092 -; RV32I-NEXT: addi a1, a1, -273 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: slli a2, a0, 14 +; RV32I-NEXT: slli a3, a0, 17 +; RV32I-NEXT: slli a4, a0, 20 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: slli a2, a0, 24 +; RV32I-NEXT: add a3, a3, a4 +; RV32I-NEXT: slli a4, a0, 29 +; RV32I-NEXT: add a2, a2, a4 +; RV32I-NEXT: slli a4, a0, 4 +; RV32I-NEXT: add a4, a0, a4 +; RV32I-NEXT: add a1, a4, a1 +; 
RV32I-NEXT: slli a0, a0, 22 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: neg a1, a2 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret ; ; RV64I-LABEL: muli32_0xdeadbeef: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a1, 912092 -; RV64I-NEXT: addiw a1, a1, -273 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: slli a2, a0, 14 +; RV64I-NEXT: slli a3, a0, 17 +; RV64I-NEXT: slli a4, a0, 20 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 24 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 29 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 4 +; RV64I-NEXT: add a4, a0, a4 +; RV64I-NEXT: add a1, a4, a1 +; RV64I-NEXT: slli a0, a0, 22 +; RV64I-NEXT: add a0, a3, a0 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: neg a1, a2 +; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: ret %a1 = mul i32 %a, -559038737 ret i32 %a1 @@ -438,10 +689,25 @@ define i64 @muli64_0xdeadbeef(i64 %a) nounwind { ; ; RV64I-LABEL: muli64_0xdeadbeef: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, 228023 -; RV64I-NEXT: slli a1, a1, 2 -; RV64I-NEXT: addi a1, a1, -273 -; RV64I-NEXT: tail __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: slli a2, a0, 14 +; RV64I-NEXT: slli a3, a0, 17 +; RV64I-NEXT: slli a4, a0, 20 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 24 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 29 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 4 +; RV64I-NEXT: add a4, a0, a4 +; RV64I-NEXT: add a1, a4, a1 +; RV64I-NEXT: slli a4, a0, 22 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret %a1 = mul i64 %a, 3735928559 ret i64 %a1 } @@ -449,19 +715,52 @@ define i64 @muli64_0xdeadbeef(i64 %a) nounwind { define i32 
@muli32_0x12345678(i32 %a) nounwind { ; RV32I-LABEL: muli32_0x12345678: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a1, 74565 -; RV32I-NEXT: addi a1, a1, 1656 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 3 +; RV32I-NEXT: slli a2, a0, 7 +; RV32I-NEXT: slli a3, a0, 9 +; RV32I-NEXT: slli a4, a0, 11 +; RV32I-NEXT: slli a5, a0, 13 +; RV32I-NEXT: sub a2, a2, a1 +; RV32I-NEXT: slli a1, a0, 15 +; RV32I-NEXT: add a3, a3, a4 +; RV32I-NEXT: slli a4, a0, 20 +; RV32I-NEXT: sub a5, a5, a1 +; RV32I-NEXT: slli a1, a0, 22 +; RV32I-NEXT: sub a4, a4, a1 +; RV32I-NEXT: sub a2, a2, a3 +; RV32I-NEXT: slli a1, a0, 18 +; RV32I-NEXT: sub a5, a5, a1 +; RV32I-NEXT: slli a1, a0, 25 +; RV32I-NEXT: sub a4, a4, a1 +; RV32I-NEXT: slli a0, a0, 28 +; RV32I-NEXT: sub a2, a2, a5 +; RV32I-NEXT: sub a4, a4, a0 +; RV32I-NEXT: sub a0, a2, a4 +; RV32I-NEXT: ret ; ; RV64I-LABEL: muli32_0x12345678: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a1, 74565 -; RV64I-NEXT: addiw a1, a1, 1656 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: slli a1, a0, 3 +; RV64I-NEXT: slli a2, a0, 7 +; RV64I-NEXT: slli a3, a0, 9 +; RV64I-NEXT: slli a4, a0, 11 +; RV64I-NEXT: slli a5, a0, 13 +; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: slli a1, a0, 15 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 20 +; RV64I-NEXT: sub a5, a5, a1 +; RV64I-NEXT: slli a1, a0, 22 +; RV64I-NEXT: sub a4, a4, a1 +; RV64I-NEXT: sub a2, a2, a3 +; RV64I-NEXT: slli a1, a0, 18 +; RV64I-NEXT: sub a5, a5, a1 +; RV64I-NEXT: slli a1, a0, 25 +; RV64I-NEXT: sub a4, a4, a1 +; RV64I-NEXT: slli a0, a0, 28 +; RV64I-NEXT: sub a2, a2, a5 +; RV64I-NEXT: sub a4, a4, a0 +; RV64I-NEXT: sub a0, a2, a4 ; RV64I-NEXT: ret %a1 = mul i32 %a, 305419896 ret i32 %a1 @@ -482,9 +781,28 @@ define i64 @muli64_0x12345678(i64 %a) nounwind { ; ; RV64I-LABEL: muli64_0x12345678: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, 74565 -; 
RV64I-NEXT: addiw a1, a1, 1656 -; RV64I-NEXT: tail __muldi3 +; RV64I-NEXT: slli a1, a0, 3 +; RV64I-NEXT: slli a2, a0, 7 +; RV64I-NEXT: slli a3, a0, 9 +; RV64I-NEXT: slli a4, a0, 11 +; RV64I-NEXT: slli a5, a0, 13 +; RV64I-NEXT: sub a2, a2, a1 +; RV64I-NEXT: slli a1, a0, 15 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 20 +; RV64I-NEXT: sub a5, a5, a1 +; RV64I-NEXT: slli a1, a0, 22 +; RV64I-NEXT: sub a4, a4, a1 +; RV64I-NEXT: sub a2, a2, a3 +; RV64I-NEXT: slli a1, a0, 18 +; RV64I-NEXT: sub a5, a5, a1 +; RV64I-NEXT: slli a1, a0, 25 +; RV64I-NEXT: sub a4, a4, a1 +; RV64I-NEXT: slli a0, a0, 28 +; RV64I-NEXT: sub a2, a2, a5 +; RV64I-NEXT: sub a4, a4, a0 +; RV64I-NEXT: sub a0, a2, a4 +; RV64I-NEXT: ret %a1 = mul i64 %a, 305419896 ret i64 %a1 } diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll index a65ea088df50c..27d5eaa032522 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -464,11 +464,45 @@ define i32 @mulhu_constant(i32 %a) nounwind { ret i32 %4 } +define i32 @muli32_p10(i32 %a) nounwind { +; RV32I-LABEL: muli32_p10: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: muli32_p10: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a1, a0, 1 +; RV32IM-NEXT: slli a0, a0, 3 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p10: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p10: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a1, a0, 1 +; RV64IM-NEXT: slli a0, a0, 3 +; RV64IM-NEXT: addw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 10 + ret i32 %1 +} + define i32 @muli32_p14(i32 %a) nounwind { ; RV32I-LABEL: muli32_p14: ; RV32I: # %bb.0: -; RV32I-NEXT: li a1, 14 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: sub a0, a0, a1 +; 
RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p14: ; RV32IM: # %bb.0: @@ -497,8 +531,10 @@ define i32 @muli32_p14(i32 %a) nounwind { define i32 @muli32_p18(i32 %a) nounwind { ; RV32I-LABEL: muli32_p18: ; RV32I: # %bb.0: -; RV32I-NEXT: li a1, 18 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p18: ; RV32IM: # %bb.0: @@ -527,8 +563,10 @@ define i32 @muli32_p18(i32 %a) nounwind { define i32 @muli32_p28(i32 %a) nounwind { ; RV32I-LABEL: muli32_p28: ; RV32I: # %bb.0: -; RV32I-NEXT: li a1, 28 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: slli a0, a0, 5 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p28: ; RV32IM: # %bb.0: @@ -557,8 +595,10 @@ define i32 @muli32_p28(i32 %a) nounwind { define i32 @muli32_p30(i32 %a) nounwind { ; RV32I-LABEL: muli32_p30: ; RV32I: # %bb.0: -; RV32I-NEXT: li a1, 30 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: slli a0, a0, 5 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p30: ; RV32IM: # %bb.0: @@ -587,8 +627,10 @@ define i32 @muli32_p30(i32 %a) nounwind { define i32 @muli32_p34(i32 %a) nounwind { ; RV32I-LABEL: muli32_p34: ; RV32I: # %bb.0: -; RV32I-NEXT: li a1, 34 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: slli a0, a0, 5 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p34: ; RV32IM: # %bb.0: @@ -617,8 +659,10 @@ define i32 @muli32_p34(i32 %a) nounwind { define i32 @muli32_p36(i32 %a) nounwind { ; RV32I-LABEL: muli32_p36: ; RV32I: # %bb.0: -; RV32I-NEXT: li a1, 36 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: slli a0, a0, 5 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p36: ; RV32IM: # %bb.0: @@ -647,8 +691,10 @@ define i32 @muli32_p36(i32 %a) nounwind { define i32 @muli32_p56(i32 %a) nounwind { ; RV32I-LABEL: 
muli32_p56: ; RV32I: # %bb.0: -; RV32I-NEXT: li a1, 56 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 3 +; RV32I-NEXT: slli a0, a0, 6 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p56: ; RV32IM: # %bb.0: @@ -677,8 +723,10 @@ define i32 @muli32_p56(i32 %a) nounwind { define i32 @muli32_p60(i32 %a) nounwind { ; RV32I-LABEL: muli32_p60: ; RV32I: # %bb.0: -; RV32I-NEXT: li a1, 60 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: slli a0, a0, 6 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p60: ; RV32IM: # %bb.0: @@ -707,8 +755,10 @@ define i32 @muli32_p60(i32 %a) nounwind { define i32 @muli32_p62(i32 %a) nounwind { ; RV32I-LABEL: muli32_p62: ; RV32I: # %bb.0: -; RV32I-NEXT: li a1, 62 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: slli a0, a0, 6 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p62: ; RV32IM: # %bb.0: @@ -762,6 +812,38 @@ define i32 @muli32_p65(i32 %a) nounwind { ret i32 %1 } +define i32 @muli32_p66(i32 %a) nounwind { +; RV32I-LABEL: muli32_p66: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: slli a0, a0, 6 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: muli32_p66: +; RV32IM: # %bb.0: +; RV32IM-NEXT: slli a1, a0, 1 +; RV32IM-NEXT: slli a0, a0, 6 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p66: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: slli a0, a0, 6 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p66: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a1, a0, 1 +; RV64IM-NEXT: slli a0, a0, 6 +; RV64IM-NEXT: addw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 66 + ret i32 %1 +} + define i32 @muli32_p63(i32 %a) nounwind { ; RV32I-LABEL: muli32_p63: ; RV32I: # %bb.0: @@ -895,8 +977,10 @@ define i64 @muli64_p72(i64 %a) nounwind { ; ; RV64I-LABEL: muli64_p72: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 72 -; 
RV64I-NEXT: tail __muldi3 +; RV64I-NEXT: slli a1, a0, 3 +; RV64I-NEXT: slli a0, a0, 6 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret ; ; RV64IM-LABEL: muli64_p72: ; RV64IM: # %bb.0: @@ -908,6 +992,48 @@ define i64 @muli64_p72(i64 %a) nounwind { ret i64 %1 } +define i64 @muli64_p68(i64 %a) nounwind { +; RV32I-LABEL: muli64_p68: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: li a2, 68 +; RV32I-NEXT: li a3, 0 +; RV32I-NEXT: call __muldi3 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: muli64_p68: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a2, 68 +; RV32IM-NEXT: slli a3, a1, 2 +; RV32IM-NEXT: slli a1, a1, 6 +; RV32IM-NEXT: add a1, a1, a3 +; RV32IM-NEXT: slli a3, a0, 2 +; RV32IM-NEXT: mulhu a2, a0, a2 +; RV32IM-NEXT: slli a0, a0, 6 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: add a0, a0, a3 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli64_p68: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: slli a0, a0, 6 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli64_p68: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a1, a0, 2 +; RV64IM-NEXT: slli a0, a0, 6 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i64 %a, 68 + ret i64 %1 +} + define i32 @muli32_m63(i32 %a) nounwind { ; RV32I-LABEL: muli32_m63: ; RV32I: # %bb.0: @@ -1058,8 +1184,10 @@ define i64 @muli64_m65(i64 %a) nounwind { define i32 @muli32_p384(i32 %a) nounwind { ; RV32I-LABEL: muli32_p384: ; RV32I: # %bb.0: -; RV32I-NEXT: li a1, 384 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 7 +; RV32I-NEXT: slli a0, a0, 9 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p384: ; RV32IM: # %bb.0: @@ -1088,8 +1216,10 @@ define i32 @muli32_p384(i32 %a) nounwind { define i32 @muli32_p12288(i32 %a) nounwind { ; RV32I-LABEL: muli32_p12288: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a1, 3 -; RV32I-NEXT: tail __mulsi3 +; 
RV32I-NEXT: slli a1, a0, 12 +; RV32I-NEXT: slli a0, a0, 14 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p12288: ; RV32IM: # %bb.0: @@ -1214,9 +1344,11 @@ define i32 @muli32_m3840(i32 %a) nounwind { define i32 @muli32_m4352(i32 %a) nounwind { ; RV32I-LABEL: muli32_m4352: ; RV32I: # %bb.0: -; RV32I-NEXT: li a1, -17 -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 12 +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_m4352: ; RV32IM: # %bb.0: @@ -1227,13 +1359,10 @@ define i32 @muli32_m4352(i32 %a) nounwind { ; ; RV64I-LABEL: muli32_m4352: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: li a1, -17 -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: slli a1, a0, 12 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: neg a0, a0 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: muli32_m4352: @@ -1368,9 +1497,11 @@ define i64 @muli64_m4352(i64 %a) nounwind { ; ; RV64I-LABEL: muli64_m4352: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, -17 -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: tail __muldi3 +; RV64I-NEXT: slli a1, a0, 12 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: ret ; ; RV64IM-LABEL: muli64_m4352: ; RV64IM: # %bb.0: @@ -1459,10 +1590,10 @@ define i128 @muli128_m3840(i128 %a) nounwind { ; RV32I-NEXT: sltu a7, a5, a4 ; RV32I-NEXT: sub a6, a6, t2 ; RV32I-NEXT: mv t1, a7 -; RV32I-NEXT: beq t0, a3, .LBB40_2 +; RV32I-NEXT: beq t0, a3, .LBB43_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: sltu t1, t0, a3 -; RV32I-NEXT: .LBB40_2: +; RV32I-NEXT: .LBB43_2: ; RV32I-NEXT: sub a2, a2, a1 ; RV32I-NEXT: sub a1, t0, a3 ; RV32I-NEXT: sub a5, a5, a4 @@ -1573,10 +1704,10 @@ define i128 @muli128_m63(i128 %a) nounwind { ; 
RV32I-NEXT: sltu a7, a3, a6 ; RV32I-NEXT: or t0, t0, a5 ; RV32I-NEXT: mv a5, a7 -; RV32I-NEXT: beq a4, t0, .LBB41_2 +; RV32I-NEXT: beq a4, t0, .LBB44_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: sltu a5, a4, t0 -; RV32I-NEXT: .LBB41_2: +; RV32I-NEXT: .LBB44_2: ; RV32I-NEXT: srli t1, a4, 26 ; RV32I-NEXT: slli t2, a2, 6 ; RV32I-NEXT: srli t3, a2, 26 @@ -2001,8 +2132,10 @@ define i64 @muland_demand(i64 %x) nounwind { ; RV64I-NEXT: li a1, -29 ; RV64I-NEXT: srli a1, a1, 2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: li a1, 12 -; RV64I-NEXT: tail __muldi3 +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret ; ; RV64IM-LABEL: muland_demand: ; RV64IM: # %bb.0: @@ -2037,9 +2170,10 @@ define i64 @mulzext_demand(i32 signext %x) nounwind { ; ; RV64I-LABEL: mulzext_demand: ; RV64I: # %bb.0: -; RV64I-NEXT: li a1, 3 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: tail __muldi3 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: slli a0, a0, 34 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret ; ; RV64IM-LABEL: mulzext_demand: ; RV64IM: # %bb.0: @@ -2056,8 +2190,20 @@ define i32 @mulfshl_demand(i32 signext %x) nounwind { ; RV32I-LABEL: mulfshl_demand: ; RV32I: # %bb.0: ; RV32I-NEXT: srli a0, a0, 11 -; RV32I-NEXT: lui a1, 92808 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 19 +; RV32I-NEXT: slli a2, a0, 15 +; RV32I-NEXT: slli a3, a0, 21 +; RV32I-NEXT: slli a4, a0, 23 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: slli a2, a0, 25 +; RV32I-NEXT: add a3, a3, a4 +; RV32I-NEXT: slli a4, a0, 27 +; RV32I-NEXT: add a2, a2, a4 +; RV32I-NEXT: slli a0, a0, 29 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: sub a2, a2, a0 +; RV32I-NEXT: sub a0, a1, a2 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: mulfshl_demand: ; RV32IM: # %bb.0: @@ -2068,13 +2214,20 @@ define i32 @mulfshl_demand(i32 signext %x) nounwind { ; ; RV64I-LABEL: mulfshl_demand: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srliw 
a0, a0, 11 -; RV64I-NEXT: lui a1, 92808 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: slli a1, a0, 19 +; RV64I-NEXT: slli a2, a0, 15 +; RV64I-NEXT: slli a3, a0, 21 +; RV64I-NEXT: slli a4, a0, 23 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: slli a2, a0, 25 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 27 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a0, a0, 29 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: sub a2, a2, a0 +; RV64I-NEXT: sub a0, a1, a2 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: mulfshl_demand: @@ -2091,8 +2244,20 @@ define i32 @mulfshl_demand(i32 signext %x) nounwind { define i32 @mulor_demand(i32 signext %x, i32 signext %y) nounwind { ; RV32I-LABEL: mulor_demand: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a1, 92808 -; RV32I-NEXT: tail __mulsi3 +; RV32I-NEXT: slli a1, a0, 19 +; RV32I-NEXT: slli a2, a0, 15 +; RV32I-NEXT: slli a3, a0, 21 +; RV32I-NEXT: slli a4, a0, 23 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: slli a2, a0, 25 +; RV32I-NEXT: add a3, a3, a4 +; RV32I-NEXT: slli a4, a0, 27 +; RV32I-NEXT: add a2, a2, a4 +; RV32I-NEXT: slli a0, a0, 29 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: sub a2, a2, a0 +; RV32I-NEXT: sub a0, a1, a2 +; RV32I-NEXT: ret ; ; RV32IM-LABEL: mulor_demand: ; RV32IM: # %bb.0: @@ -2102,12 +2267,19 @@ define i32 @mulor_demand(i32 signext %x, i32 signext %y) nounwind { ; ; RV64I-LABEL: mulor_demand: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a1, 92808 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: slli a1, a0, 19 +; RV64I-NEXT: slli a2, a0, 15 +; RV64I-NEXT: slli a3, a0, 21 +; RV64I-NEXT: slli a4, a0, 23 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: slli a2, a0, 25 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 27 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: slli a0, a0, 29 +; 
RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: sub a2, a2, a0 +; RV64I-NEXT: sub a0, a1, a2 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: mulor_demand: diff --git a/llvm/test/CodeGen/RISCV/note-gnu-property-zicfiss.ll b/llvm/test/CodeGen/RISCV/note-gnu-property-zicfiss.ll new file mode 100644 index 0000000000000..24d63cbebc7af --- /dev/null +++ b/llvm/test/CodeGen/RISCV/note-gnu-property-zicfiss.ll @@ -0,0 +1,31 @@ +; RUN: llc --mtriple=riscv32 --filetype=obj -o - %s | llvm-readelf -n - | FileCheck --check-prefixes=READELF %s +; RUN: llc --mtriple=riscv64 --filetype=obj -o - %s | llvm-readelf -n - | FileCheck --check-prefixes=READELF %s +; RUN: llc --mtriple=riscv32 -o - %s | FileCheck --check-prefixes=ASM,ASM32 %s +; RUN: llc --mtriple=riscv64 -o - %s | FileCheck --check-prefixes=ASM,ASM64 %s + +; READELF: Properties: RISC-V feature: ZICFISS + +; ASM: .section ".note.GNU-stack","",@progbits +; ASM-NEXT: .section .note.gnu.property,"a",@note +; ASM-NEXT: .word 4 +; ASM-NEXT: .word .Ltmp1-.Ltmp0 +; ASM-NEXT: .word 5 +; ASM-NEXT: .asciz "GNU" +; ASM-NEXT: .Ltmp0: +; ASM32-NEXT: .p2align 2, 0x0 +; ASM64-NEXT: .p2align 3, 0x0 +; ASM-NEXT: .word 3221225472 +; ASM-NEXT: .word 4 +; ASM-NEXT: .word 2 +; ASM32-NEXT: .p2align 2, 0x0 +; ASM64-NEXT: .p2align 3, 0x0 +; ASM-NEXT: .Ltmp1: + +define i32 @f() "hw-shadow-stack" { +entry: + ret i32 0 +} + +!llvm.module.flags = !{!0} + +!0 = !{i32 8, !"cf-protection-return", i32 1} diff --git a/llvm/test/CodeGen/RISCV/riscv-zihintpause.ll b/llvm/test/CodeGen/RISCV/riscv-zihintpause.ll new file mode 100644 index 0000000000000..6c6f5e20a8b48 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/riscv-zihintpause.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+zihintpause -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RVPAUSE + +declare void @llvm.riscv.pause() + +define void @test_pause() { +; RVPAUSE-LABEL: test_pause: +; RVPAUSE: # %bb.0: +; 
RVPAUSE-NEXT: pause +; RVPAUSE-NEXT: ret + call void @llvm.riscv.pause() + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll index d9f7d36127293..10ef3357d4783 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll @@ -357,20 +357,33 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB6_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: negw a1, a0 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 6 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a3, a0, 10 +; RV64I-NEXT: slli a4, a0, 12 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 18 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 4 +; RV64I-NEXT: subw a4, a0, a4 +; RV64I-NEXT: add a1, a4, a1 +; RV64I-NEXT: slli a4, a0, 14 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 23 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a0, a0, 27 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srliw a0, a0, 27 ; RV64I-NEXT: lui a1, %hi(.LCPI6_0) ; RV64I-NEXT: addi a1, a1, %lo(.LCPI6_0) ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB6_2: ; RV64I-NEXT: li a0, 32 @@ -397,20 +410,33 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64I-LABEL: cttz_zero_undef_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: negw a1, a0 ; 
RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 6 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a3, a0, 10 +; RV64I-NEXT: slli a4, a0, 12 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 18 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 4 +; RV64I-NEXT: subw a4, a0, a4 +; RV64I-NEXT: add a1, a4, a1 +; RV64I-NEXT: slli a4, a0, 14 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 23 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a0, a0, 27 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srliw a0, a0, 27 ; RV64I-NEXT: lui a1, %hi(.LCPI7_0) ; RV64I-NEXT: addi a1, a1, %lo(.LCPI7_0) ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64XTHEADBB-LABEL: cttz_zero_undef_i32: @@ -429,26 +455,36 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { ; RV64I-LABEL: findFirstSet_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: neg a0, a0 -; RV64I-NEXT: and a0, s0, a0 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 27 -; RV64I-NEXT: lui a1, %hi(.LCPI8_0) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI8_0) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: snez a1, s0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: and a1, a0, a1 +; 
RV64I-NEXT: slli a2, a1, 6 +; RV64I-NEXT: slli a3, a1, 8 +; RV64I-NEXT: slli a4, a1, 10 +; RV64I-NEXT: slli a5, a1, 12 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a1, 16 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 18 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a5, a1, 4 +; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: add a2, a5, a2 +; RV64I-NEXT: slli a5, a1, 14 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 23 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a1, a1, 27 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: lui a2, %hi(.LCPI8_0) +; RV64I-NEXT: addi a2, a2, %lo(.LCPI8_0) +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64XTHEADBB-LABEL: findFirstSet_i32: @@ -472,27 +508,37 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64I-LABEL: ffs_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: neg a0, a0 -; RV64I-NEXT: and a0, s0, a0 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 27 -; RV64I-NEXT: lui a1, %hi(.LCPI9_0) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI9_0) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: seqz a1, s0 -; RV64I-NEXT: addi a0, a0, 1 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: and a0, a1, a0 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a1, 6 +; RV64I-NEXT: slli a3, a1, 8 +; RV64I-NEXT: slli a4, a1, 10 +; RV64I-NEXT: 
slli a5, a1, 12 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a1, 16 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 18 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a5, a1, 4 +; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: add a2, a5, a2 +; RV64I-NEXT: slli a5, a1, 14 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 23 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: lui a4, %hi(.LCPI9_0) +; RV64I-NEXT: addi a4, a4, %lo(.LCPI9_0) +; RV64I-NEXT: slli a1, a1, 27 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: add a1, a4, a1 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: addi a1, a1, 1 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64XTHEADBB-LABEL: ffs_i32: diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index 17eb0817d548a..3cd1931b6ae4c 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -347,20 +347,33 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB6_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: negw a1, a0 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 6 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a3, a0, 10 +; RV64I-NEXT: slli a4, a0, 12 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 18 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 4 +; RV64I-NEXT: subw a4, a0, a4 +; RV64I-NEXT: add a1, a4, a1 +; RV64I-NEXT: slli a4, a0, 14 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 23 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a0, a0, 27 +; 
RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srliw a0, a0, 27 ; RV64I-NEXT: lui a1, %hi(.LCPI6_0) ; RV64I-NEXT: addi a1, a1, %lo(.LCPI6_0) ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB6_2: ; RV64I-NEXT: li a0, 32 @@ -377,20 +390,33 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64I-LABEL: cttz_zero_undef_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: neg a1, a0 +; RV64I-NEXT: negw a1, a0 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 6 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a3, a0, 10 +; RV64I-NEXT: slli a4, a0, 12 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 18 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 4 +; RV64I-NEXT: subw a4, a0, a4 +; RV64I-NEXT: add a1, a4, a1 +; RV64I-NEXT: slli a4, a0, 14 +; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: slli a4, a0, 23 +; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: slli a0, a0, 27 +; RV64I-NEXT: add a1, a1, a3 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: srliw a0, a0, 27 ; RV64I-NEXT: lui a1, %hi(.LCPI7_0) ; RV64I-NEXT: addi a1, a1, %lo(.LCPI7_0) ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: cttz_zero_undef_i32: @@ -404,26 +430,36 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { ; RV64I-LABEL: findFirstSet_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, 
sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: neg a0, a0 -; RV64I-NEXT: and a0, s0, a0 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 27 -; RV64I-NEXT: lui a1, %hi(.LCPI8_0) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI8_0) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: snez a1, s0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a1, 6 +; RV64I-NEXT: slli a3, a1, 8 +; RV64I-NEXT: slli a4, a1, 10 +; RV64I-NEXT: slli a5, a1, 12 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a1, 16 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 18 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a5, a1, 4 +; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: add a2, a5, a2 +; RV64I-NEXT: slli a5, a1, 14 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 23 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a1, a1, 27 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: lui a2, %hi(.LCPI8_0) +; RV64I-NEXT: addi a2, a2, %lo(.LCPI8_0) +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: findFirstSet_i32: @@ -442,27 +478,37 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64I-LABEL: ffs_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; 
RV64I-NEXT: neg a0, a0 -; RV64I-NEXT: and a0, s0, a0 -; RV64I-NEXT: lui a1, 30667 -; RV64I-NEXT: addiw a1, a1, 1329 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 27 -; RV64I-NEXT: lui a1, %hi(.LCPI9_0) -; RV64I-NEXT: addi a1, a1, %lo(.LCPI9_0) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: seqz a1, s0 -; RV64I-NEXT: addi a0, a0, 1 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: and a0, a1, a0 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a1, 6 +; RV64I-NEXT: slli a3, a1, 8 +; RV64I-NEXT: slli a4, a1, 10 +; RV64I-NEXT: slli a5, a1, 12 +; RV64I-NEXT: add a2, a2, a3 +; RV64I-NEXT: slli a3, a1, 16 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 18 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: slli a5, a1, 4 +; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: add a2, a5, a2 +; RV64I-NEXT: slli a5, a1, 14 +; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: slli a5, a1, 23 +; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: lui a4, %hi(.LCPI9_0) +; RV64I-NEXT: addi a4, a4, %lo(.LCPI9_0) +; RV64I-NEXT: slli a1, a1, 27 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: srliw a1, a1, 27 +; RV64I-NEXT: add a1, a4, a1 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: addi a1, a1, 1 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: ffs_i32: diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll index ac1d63311fd1e..88894f887cc20 100644 --- a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll @@ -1,13 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: 
llc < %s -mtriple=riscv32 -mattr=+v,+zbb | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zbb | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zbb | FileCheck %s --check-prefixes=CHECK,V +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zbb | FileCheck %s --check-prefixes=CHECK,V +; RUN: llc < %s -mtriple=riscv64 -mattr=+zve32x,+zvl128b,+zbb | FileCheck %s --check-prefixes=CHECK,ZVE define i32 @test_v2i1(<2 x i1> %x) { -; CHECK-LABEL: test_v2i1: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vcpop.m a0, v0 -; CHECK-NEXT: ret +; V-LABEL: test_v2i1: +; V: # %bb.0: +; V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; V-NEXT: vcpop.m a0, v0 +; V-NEXT: ret +; +; ZVE-LABEL: test_v2i1: +; ZVE: # %bb.0: +; ZVE-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; ZVE-NEXT: vcpop.m a0, v0 +; ZVE-NEXT: ret %a = zext <2 x i1> %x to <2 x i32> %b = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a) ret i32 %b @@ -173,6 +180,35 @@ define i32 @test_v256i1(<256 x i1> %x) { ret i32 %b } +; FIXME: Optimize this case with Zve32x. We have to use mf4 and set the VL to +; VLEN/64. 
+define i32 @test_nxv1i1( %x) { +; V-LABEL: test_nxv1i1: +; V: # %bb.0: # %entry +; V-NEXT: vsetvli a0, zero, e8, mf8, ta, ma +; V-NEXT: vcpop.m a0, v0 +; V-NEXT: ret +; +; ZVE-LABEL: test_nxv1i1: +; ZVE: # %bb.0: # %entry +; ZVE-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVE-NEXT: vmv.v.i v8, 0 +; ZVE-NEXT: csrr a0, vlenb +; ZVE-NEXT: srli a0, a0, 3 +; ZVE-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVE-NEXT: vmerge.vim v8, v8, 1, v0 +; ZVE-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; ZVE-NEXT: vmv.s.x v9, zero +; ZVE-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVE-NEXT: vredsum.vs v9, v8, v9 +; ZVE-NEXT: vmv.x.s a0, v9 +; ZVE-NEXT: ret +entry: + %a = zext %x to + %b = call i32 @llvm.vector.reduce.add.nxv1i32( %a) + ret i32 %b +} + define i32 @test_nxv2i1( %x) { ; CHECK-LABEL: test_nxv2i1: ; CHECK: # %bb.0: # %entry @@ -520,7 +556,3 @@ entry: %b = call i16 @llvm.vector.reduce.add.nxv64i16( %a) ret i16 %b } - -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; RV32: {{.*}} -; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll b/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll index 7990c1c1eabc2..4d9a6aeaad2ef 100644 --- a/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll +++ b/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll @@ -9,26 +9,35 @@ define i32 @vscale_known_nonzero() { ; CHECK-LABEL: vscale_known_nonzero: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: neg a1, a0 +; CHECK-NEXT: negw a1, a0 ; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: lui a1, 30667 -; CHECK-NEXT: addiw a1, a1, 1329 -; CHECK-NEXT: call __muldi3 +; CHECK-NEXT: slli a1, a0, 6 +; CHECK-NEXT: slli a2, a0, 8 +; CHECK-NEXT: slli a3, a0, 10 +; CHECK-NEXT: slli a4, a0, 12 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: slli a2, a0, 16 +; 
CHECK-NEXT: subw a3, a3, a4 +; CHECK-NEXT: slli a4, a0, 18 +; CHECK-NEXT: subw a2, a2, a4 +; CHECK-NEXT: slli a4, a0, 4 +; CHECK-NEXT: subw a4, a0, a4 +; CHECK-NEXT: add a1, a4, a1 +; CHECK-NEXT: slli a4, a0, 14 +; CHECK-NEXT: subw a3, a3, a4 +; CHECK-NEXT: slli a4, a0, 23 +; CHECK-NEXT: subw a2, a2, a4 +; CHECK-NEXT: slli a0, a0, 27 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: srliw a0, a0, 27 ; CHECK-NEXT: lui a1, %hi(.LCPI0_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI0_0) ; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: lbu a0, 0(a0) -; CHECK-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: .cfi_restore ra -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %x = call i32 @llvm.vscale() %r = call i32 @llvm.cttz.i32(i32 %x, i1 false) diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll index c3c1643e6de01..604271702ebad 100644 --- a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll +++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll @@ -361,7 +361,7 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 { ; If a function has variable-sized stack objects, then any function calls which ; need to pass arguments on the stack must allocate the stack space for them ; dynamically, to ensure they are at the bottom of the frame. 
-define void @no_reserved_call_frame(i64 %n, i32 %dummy) #0 { +define void @no_reserved_call_frame(i64 %n) #0 { ; RV64I-LABEL: no_reserved_call_frame: ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: addi sp, sp, -16 @@ -377,15 +377,20 @@ define void @no_reserved_call_frame(i64 %n, i32 %dummy) #0 { ; RV64I-NEXT: addi a0, a0, 15 ; RV64I-NEXT: andi a0, a0, -16 ; RV64I-NEXT: sub a0, sp, a0 -; RV64I-NEXT: lui a2, 1 +; RV64I-NEXT: lui a1, 1 ; RV64I-NEXT: .LBB4_1: # %entry ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: sub sp, sp, a2 +; RV64I-NEXT: sub sp, sp, a1 ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: blt a0, sp, .LBB4_1 ; RV64I-NEXT: # %bb.2: # %entry ; RV64I-NEXT: mv sp, a0 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: sub sp, sp, a1 +; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: call callee_stack_args +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: add sp, sp, a0 ; RV64I-NEXT: addi sp, s0, -16 ; RV64I-NEXT: .cfi_def_cfa sp, 16 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -407,20 +412,27 @@ define void @no_reserved_call_frame(i64 %n, i32 %dummy) #0 { ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: addi s0, sp, 16 ; RV32I-NEXT: .cfi_def_cfa s0, 0 -; RV32I-NEXT: mv a1, a2 ; RV32I-NEXT: slli a0, a0, 2 ; RV32I-NEXT: addi a0, a0, 15 ; RV32I-NEXT: andi a0, a0, -16 ; RV32I-NEXT: sub a0, sp, a0 -; RV32I-NEXT: lui a2, 1 +; RV32I-NEXT: lui a1, 1 ; RV32I-NEXT: .LBB4_1: # %entry ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: sub sp, sp, a2 +; RV32I-NEXT: sub sp, sp, a1 ; RV32I-NEXT: sw zero, 0(sp) ; RV32I-NEXT: blt a0, sp, .LBB4_1 ; RV32I-NEXT: # %bb.2: # %entry ; RV32I-NEXT: mv sp, a0 +; RV32I-NEXT: lui a1, 1 +; RV32I-NEXT: sub sp, sp, a1 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: sw zero, 0(sp) ; RV32I-NEXT: call callee_stack_args +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, 32 +; RV32I-NEXT: add sp, sp, a0 ; RV32I-NEXT: addi sp, s0, -16 ; RV32I-NEXT: .cfi_def_cfa sp, 16 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded 
Reload @@ -432,48 +444,70 @@ define void @no_reserved_call_frame(i64 %n, i32 %dummy) #0 { ; RV32I-NEXT: ret entry: %v = alloca i32, i64 %n - call void @callee_stack_args(ptr %v, i32 %dummy) + call void @callee_stack_args(ptr %v, [518 x i64] poison) ret void } ; Same as above but without a variable-sized allocation, so the reserved call ; frame can be folded into the fixed-size allocation in the prologue. -define void @reserved_call_frame(i64 %n, i32 %dummy) #0 { +define void @reserved_call_frame(i64 %n) #0 { ; RV64I-LABEL: reserved_call_frame: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi sp, sp, -416 -; RV64I-NEXT: .cfi_def_cfa_offset 416 -; RV64I-NEXT: sd ra, 408(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -2032 +; RV64I-NEXT: .cfi_def_cfa_offset 2032 +; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill ; RV64I-NEXT: .cfi_offset ra, -8 -; RV64I-NEXT: addi a0, sp, 8 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: sub sp, sp, a0 +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: .cfi_def_cfa_offset 4096 +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 4144 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: add a0, sp, a0 ; RV64I-NEXT: call callee_stack_args -; RV64I-NEXT: ld ra, 408(sp) # 8-byte Folded Reload +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, 48 +; RV64I-NEXT: add sp, sp, a0 +; RV64I-NEXT: .cfi_def_cfa_offset 2032 +; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload ; RV64I-NEXT: .cfi_restore ra -; RV64I-NEXT: addi sp, sp, 416 +; RV64I-NEXT: addi sp, sp, 2032 ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; ; RV32I-LABEL: reserved_call_frame: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: addi sp, sp, -416 -; RV32I-NEXT: .cfi_def_cfa_offset 416 -; RV32I-NEXT: sw ra, 412(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -2032 +; RV32I-NEXT: .cfi_def_cfa_offset 2032 +; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill ; RV32I-NEXT: .cfi_offset ra, -4 -; RV32I-NEXT: mv a1, a2 -; RV32I-NEXT: addi a0, sp, 12 +; RV32I-NEXT: lui a0, 1 +; 
RV32I-NEXT: sub sp, sp, a0 +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: .cfi_def_cfa_offset 4096 +; RV32I-NEXT: addi sp, sp, -80 +; RV32I-NEXT: .cfi_def_cfa_offset 4176 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, 36 +; RV32I-NEXT: add a0, sp, a0 ; RV32I-NEXT: call callee_stack_args -; RV32I-NEXT: lw ra, 412(sp) # 4-byte Folded Reload +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, 80 +; RV32I-NEXT: add sp, sp, a0 +; RV32I-NEXT: .cfi_def_cfa_offset 2032 +; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload ; RV32I-NEXT: .cfi_restore ra -; RV32I-NEXT: addi sp, sp, 416 +; RV32I-NEXT: addi sp, sp, 2032 ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret entry: - %v = alloca i32, i64 100 - call void @callee_stack_args(ptr %v, i32 %dummy) + %v = alloca i32, i64 518 + call void @callee_stack_args(ptr %v, [518 x i64] poison) ret void } -declare void @callee_stack_args(ptr, i32) +declare void @callee_stack_args(ptr, [518 x i64]) ; Dynamic allocation of vectors define void @dynamic_vector(i64 %size, ptr %out) #0 { diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir index 6343afc6bac62..988335126e62e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir @@ -498,4 +498,29 @@ body: | %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ PseudoBR %bb.1 ... 
+--- +name: EMUL_is_unknown +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: EMUL_is_unknown + ; CHECK: [[PseudoVMCLR_M_B4_:%[0-9]+]]:vr = PseudoVMCLR_M_B4 1, 0 /* e8 */ + ; CHECK-NEXT: [[PseudoVMOR_MM_B4_:%[0-9]+]]:vmv0 = PseudoVMOR_MM_B4 [[PseudoVMCLR_M_B4_]], [[PseudoVMCLR_M_B4_]], 1, 0 /* e8 */ + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x0 + ; CHECK-NEXT: [[PseudoVMV_S_X:%[0-9]+]]:vr = PseudoVMV_S_X $noreg, [[COPY]], 1, 5 /* e32 */ + ; CHECK-NEXT: [[PseudoVMV_V_I_M8_:%[0-9]+]]:vrm8 = PseudoVMV_V_I_M8 $noreg, 0, 1, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: [[PseudoVREDMAX_VS_M8_E32_MASK:%[0-9]+]]:vrnov0 = PseudoVREDMAX_VS_M8_E32_MASK $noreg, killed [[PseudoVMV_V_I_M8_]], killed [[PseudoVMV_S_X]], [[PseudoVMOR_MM_B4_]], 1, 5 /* e32 */, 1 /* ta, mu */ + ; CHECK-NEXT: [[PseudoVMV_X_S:%[0-9]+]]:gpr = PseudoVMV_X_S killed [[PseudoVREDMAX_VS_M8_E32_MASK]], 5 /* e32 */ + ; CHECK-NEXT: $x10 = COPY [[PseudoVMV_X_S]] + ; CHECK-NEXT: PseudoRET implicit $x10 + %2:vr = PseudoVMCLR_M_B4 -1, 0 /* e8 */ + %3:vmv0 = PseudoVMOR_MM_B4 %2, %2, 1, 0 /* e8 */ + %4:gpr = COPY $x0 + %5:vr = PseudoVMV_S_X $noreg, %4, 1, 5 /* e32 */ + %6:vrm8 = PseudoVMV_V_I_M8 $noreg, 0, 1, 5 /* e32 */, 0 /* tu, mu */ + %7:vrnov0 = PseudoVREDMAX_VS_M8_E32_MASK $noreg, killed %6, killed %5, %3, 1, 5 /* e32 */, 1 /* ta, mu */ + %9:gpr = PseudoVMV_X_S killed %7, 5 /* e32 */ + $x10 = COPY %9 + PseudoRET implicit $x10 +... 
diff --git a/llvm/test/CodeGen/RISCV/rvv/xandesvpackfph-vfpmadb.ll b/llvm/test/CodeGen/RISCV/rvv/xandesvpackfph-vfpmadb.ll new file mode 100644 index 0000000000000..feceacd90e5f0 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/xandesvpackfph-vfpmadb.ll @@ -0,0 +1,299 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zve64x,+xandesvpackfph \ +; RUN: -verify-machineinstrs -target-abi=ilp32f | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64x,+xandesvpackfph \ +; RUN: -verify-machineinstrs -target-abi=lp64f | FileCheck %s + +declare @llvm.riscv.nds.vfpmadb.nxv1f16.f32( + , + , + float, + iXLen, iXLen); + +define @intrinsic_vfpmadb_vf_nxv1f16_nxv1f16_f32( %0, float %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vfpmadb_vf_nxv1f16_nxv1f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: nds.vfpmadb.vf v8, fa0, v8 +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadb.nxv1f16.f32( + undef, + %0, + float %1, iXLen 0, iXLen %2) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadb.mask.nxv1f16.f32( + , + , + float, + , + iXLen, iXLen, iXLen); + +define @intrinsic_vfpmadb_mask_vf_nxv1f16_nxv1f16_f32( %0, %1, float %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vfpmadb_mask_vf_nxv1f16_nxv1f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: nds.vfpmadb.vf v8, fa0, v9, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadb.mask.nxv1f16.f32( + %0, + %1, + float %2, + %3, + iXLen 0, iXLen %4, iXLen 1) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadb.nxv2f16.f32( + , + , + float, + iXLen, iXLen); + +define @intrinsic_vfpmadb_vf_nxv2f16_nxv2f16_f32( %0, float %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vfpmadb_vf_nxv2f16_nxv2f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: nds.vfpmadb.vf v8, fa0, v8 +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadb.nxv2f16.f32( + undef, + %0, + float %1, iXLen 0, iXLen %2) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadb.mask.nxv2f16.f32( + , + , + float, + , + iXLen, iXLen, iXLen); + +define @intrinsic_vfpmadb_mask_vf_nxv2f16_nxv2f16_f32( %0, %1, float %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vfpmadb_mask_vf_nxv2f16_nxv2f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu +; CHECK-NEXT: nds.vfpmadb.vf v8, fa0, v9, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadb.mask.nxv2f16.f32( + %0, + %1, + float %2, + %3, + iXLen 0, iXLen %4, iXLen 1) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadb.nxv4f16.f32( + , + , + float, + iXLen, iXLen); + +define @intrinsic_vfpmadb_vf_nxv4f16_nxv4f16_f32( %0, float %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vfpmadb_vf_nxv4f16_nxv4f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: nds.vfpmadb.vf v8, fa0, v8 +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadb.nxv4f16.f32( + undef, + %0, + float %1, iXLen 0, iXLen %2) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadb.mask.nxv4f16.f32( + , + , + float, + , + iXLen, iXLen, iXLen); + +define @intrinsic_vfpmadb_mask_vf_nxv4f16_nxv4f16_f32( %0, %1, float %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vfpmadb_mask_vf_nxv4f16_nxv4f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: nds.vfpmadb.vf v8, fa0, v9, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call 
@llvm.riscv.nds.vfpmadb.mask.nxv4f16.f32( + %0, + %1, + float %2, + %3, + iXLen 0, iXLen %4, iXLen 1) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadb.nxv8f16.f32( + , + , + float, + iXLen, iXLen); + +define @intrinsic_vfpmadb_vf_nxv8f16_nxv8f16_f32( %0, float %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vfpmadb_vf_nxv8f16_nxv8f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: nds.vfpmadb.vf v8, fa0, v8 +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadb.nxv8f16.f32( + undef, + %0, + float %1, iXLen 0, iXLen %2) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadb.mask.nxv8f16.f32( + , + , + float, + , + iXLen, iXLen, iXLen); + +define @intrinsic_vfpmadb_mask_vf_nxv8f16_nxv8f16_f32( %0, %1, float %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vfpmadb_mask_vf_nxv8f16_nxv8f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu +; CHECK-NEXT: nds.vfpmadb.vf v8, fa0, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadb.mask.nxv8f16.f32( + %0, + %1, + float %2, + %3, + iXLen 0, iXLen %4, iXLen 1) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadb.nxv16f16.f32( + , + , + float, + iXLen, iXLen); + +define @intrinsic_vfpmadb_vf_nxv16f16_nxv16f16_f32( %0, float %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vfpmadb_vf_nxv16f16_nxv16f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: nds.vfpmadb.vf v8, fa0, v8 +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadb.nxv16f16.f32( + undef, + %0, + float %1, iXLen 0, iXLen %2) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadb.mask.nxv16f16.f32( + , + , + float, + , + iXLen, iXLen, iXLen); + +define @intrinsic_vfpmadb_mask_vf_nxv16f16_nxv16f16_f32( %0, %1, float %2, %3, iXLen %4) nounwind { 
+; CHECK-LABEL: intrinsic_vfpmadb_mask_vf_nxv16f16_nxv16f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; CHECK-NEXT: nds.vfpmadb.vf v8, fa0, v12, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadb.mask.nxv16f16.f32( + %0, + %1, + float %2, + %3, + iXLen 0, iXLen %4, iXLen 1) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadb.nxv32f16.f32( + , + , + float, + iXLen, iXLen); + +define @intrinsic_vfpmadb_vf_nxv32f16_nxv32f16_f32( %0, float %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vfpmadb_vf_nxv32f16_nxv32f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: nds.vfpmadb.vf v8, fa0, v8 +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadb.nxv32f16.f32( + undef, + %0, + float %1, iXLen 0, iXLen %2) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadb.mask.nxv32f16.f32( + , + , + float, + , + iXLen, iXLen, iXLen); + +define @intrinsic_vfpmadb_mask_vf_nxv32f16_nxv32f16_f32( %0, %1, float %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vfpmadb_mask_vf_nxv32f16_nxv32f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: nds.vfpmadb.vf v8, fa0, v16, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadb.mask.nxv32f16.f32( + %0, + %1, + float %2, + %3, + iXLen 0, iXLen %4, iXLen 1) + + ret %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/xandesvpackfph-vfpmadt.ll b/llvm/test/CodeGen/RISCV/rvv/xandesvpackfph-vfpmadt.ll new file mode 100644 index 0000000000000..e9d78d2d8b5f5 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/xandesvpackfph-vfpmadt.ll @@ -0,0 +1,299 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zve64x,+xandesvpackfph \ +; RUN: 
-verify-machineinstrs -target-abi=ilp32f | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zve64x,+xandesvpackfph \ +; RUN: -verify-machineinstrs -target-abi=lp64f | FileCheck %s + +declare @llvm.riscv.nds.vfpmadt.nxv1f16.f32( + , + , + float, + iXLen, iXLen); + +define @intrinsic_vfpmadt_vf_nxv1f16_nxv1f16_f32( %0, float %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vfpmadt_vf_nxv1f16_nxv1f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: nds.vfpmadt.vf v8, fa0, v8 +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadt.nxv1f16.f32( + undef, + %0, + float %1, iXLen 0, iXLen %2) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadt.mask.nxv1f16.f32( + , + , + float, + , + iXLen, iXLen, iXLen); + +define @intrinsic_vfpmadt_mask_vf_nxv1f16_nxv1f16_f32( %0, %1, float %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vfpmadt_mask_vf_nxv1f16_nxv1f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: nds.vfpmadt.vf v8, fa0, v9, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadt.mask.nxv1f16.f32( + %0, + %1, + float %2, + %3, + iXLen 0, iXLen %4, iXLen 1) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadt.nxv2f16.f32( + , + , + float, + iXLen, iXLen); + +define @intrinsic_vfpmadt_vf_nxv2f16_nxv2f16_f32( %0, float %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vfpmadt_vf_nxv2f16_nxv2f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: nds.vfpmadt.vf v8, fa0, v8 +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadt.nxv2f16.f32( + undef, + %0, + float %1, iXLen 0, iXLen %2) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadt.mask.nxv2f16.f32( + , + , + float, + , + iXLen, iXLen, iXLen); + +define 
@intrinsic_vfpmadt_mask_vf_nxv2f16_nxv2f16_f32( %0, %1, float %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vfpmadt_mask_vf_nxv2f16_nxv2f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu +; CHECK-NEXT: nds.vfpmadt.vf v8, fa0, v9, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadt.mask.nxv2f16.f32( + %0, + %1, + float %2, + %3, + iXLen 0, iXLen %4, iXLen 1) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadt.nxv4f16.f32( + , + , + float, + iXLen, iXLen); + +define @intrinsic_vfpmadt_vf_nxv4f16_nxv4f16_f32( %0, float %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vfpmadt_vf_nxv4f16_nxv4f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: nds.vfpmadt.vf v8, fa0, v8 +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadt.nxv4f16.f32( + undef, + %0, + float %1, iXLen 0, iXLen %2) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadt.mask.nxv4f16.f32( + , + , + float, + , + iXLen, iXLen, iXLen); + +define @intrinsic_vfpmadt_mask_vf_nxv4f16_nxv4f16_f32( %0, %1, float %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vfpmadt_mask_vf_nxv4f16_nxv4f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: nds.vfpmadt.vf v8, fa0, v9, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadt.mask.nxv4f16.f32( + %0, + %1, + float %2, + %3, + iXLen 0, iXLen %4, iXLen 1) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadt.nxv8f16.f32( + , + , + float, + iXLen, iXLen); + +define @intrinsic_vfpmadt_vf_nxv8f16_nxv8f16_f32( %0, float %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vfpmadt_vf_nxv8f16_nxv8f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: nds.vfpmadt.vf v8, fa0, v8 
+; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadt.nxv8f16.f32( + undef, + %0, + float %1, iXLen 0, iXLen %2) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadt.mask.nxv8f16.f32( + , + , + float, + , + iXLen, iXLen, iXLen); + +define @intrinsic_vfpmadt_mask_vf_nxv8f16_nxv8f16_f32( %0, %1, float %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vfpmadt_mask_vf_nxv8f16_nxv8f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu +; CHECK-NEXT: nds.vfpmadt.vf v8, fa0, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadt.mask.nxv8f16.f32( + %0, + %1, + float %2, + %3, + iXLen 0, iXLen %4, iXLen 1) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadt.nxv16f16.f32( + , + , + float, + iXLen, iXLen); + +define @intrinsic_vfpmadt_vf_nxv16f16_nxv16f16_f32( %0, float %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vfpmadt_vf_nxv16f16_nxv16f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: nds.vfpmadt.vf v8, fa0, v8 +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadt.nxv16f16.f32( + undef, + %0, + float %1, iXLen 0, iXLen %2) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadt.mask.nxv16f16.f32( + , + , + float, + , + iXLen, iXLen, iXLen); + +define @intrinsic_vfpmadt_mask_vf_nxv16f16_nxv16f16_f32( %0, %1, float %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vfpmadt_mask_vf_nxv16f16_nxv16f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; CHECK-NEXT: nds.vfpmadt.vf v8, fa0, v12, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadt.mask.nxv16f16.f32( + %0, + %1, + float %2, + %3, + iXLen 0, iXLen %4, iXLen 1) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadt.nxv32f16.f32( + , + , + float, + iXLen, iXLen); + 
+define @intrinsic_vfpmadt_vf_nxv32f16_nxv32f16_f32( %0, float %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vfpmadt_vf_nxv32f16_nxv32f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: nds.vfpmadt.vf v8, fa0, v8 +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadt.nxv32f16.f32( + undef, + %0, + float %1, iXLen 0, iXLen %2) + + ret %a +} + +declare @llvm.riscv.nds.vfpmadt.mask.nxv32f16.f32( + , + , + float, + , + iXLen, iXLen, iXLen); + +define @intrinsic_vfpmadt_mask_vf_nxv32f16_nxv32f16_f32( %0, %1, float %2, %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vfpmadt_mask_vf_nxv32f16_nxv32f16_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsrmi a1, 0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: nds.vfpmadt.vf v8, fa0, v16, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: ret +entry: + %a = tail call @llvm.riscv.nds.vfpmadt.mask.nxv32f16.f32( + %0, + %1, + float %2, + %3, + iXLen 0, iXLen %4, iXLen 1) + + ret %a +} diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll index c6503813aeed2..17a09bf7dbe6c 100644 --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -9,40 +9,62 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; RV32-LABEL: test_srem_odd: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a1, 128424 -; RV32-NEXT: addi a1, a1, 331 -; RV32-NEXT: call __mulsi3 -; RV32-NEXT: lui a1, 662 -; RV32-NEXT: addi a1, a1, -83 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: lui a1, 1324 -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: srli a0, a0, 3 -; RV32-NEXT: addi a1, a1, -165 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: slli a1, a0, 4 +; RV32-NEXT: slli a2, a0, 6 +; 
RV32-NEXT: slli a3, a0, 8 +; RV32-NEXT: slli a4, a0, 15 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: slli a2, a0, 19 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a0, 21 +; RV32-NEXT: add a2, a2, a4 +; RV32-NEXT: slli a4, a0, 2 +; RV32-NEXT: add a4, a0, a4 +; RV32-NEXT: sub a1, a1, a4 +; RV32-NEXT: slli a4, a0, 17 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a0, a0, 23 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: lui a2, 662 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: lui a3, 1324 +; RV32-NEXT: addi a2, a2, -83 +; RV32-NEXT: sub a0, a0, a2 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: addi a0, a3, -165 +; RV32-NEXT: sltu a0, a1, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_srem_odd: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a1, 128424 -; RV64-NEXT: addiw a1, a1, 331 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: lui a1, 662 -; RV64-NEXT: addi a1, a1, -83 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: lui a1, 1324 -; RV64-NEXT: slli a0, a0, 35 -; RV64-NEXT: srli a0, a0, 35 -; RV64-NEXT: addiw a1, a1, -165 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: slli a1, a0, 4 +; RV64-NEXT: slli a2, a0, 6 +; RV64-NEXT: slli a3, a0, 8 +; RV64-NEXT: slli a4, a0, 15 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: slli a2, a0, 19 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: slli a4, a0, 21 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a0, 2 +; RV64-NEXT: add a4, a0, a4 +; RV64-NEXT: subw a1, a1, a4 +; RV64-NEXT: slli a4, a0, 17 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: slli a0, a0, 23 +; RV64-NEXT: add a0, a2, a0 +; RV64-NEXT: lui a2, 662 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: lui a3, 1324 +; RV64-NEXT: addi a2, a2, -83 +; RV64-NEXT: subw a0, a0, a2 +; RV64-NEXT: subw a1, a1, a0 +; RV64-NEXT: slli a1, a1, 35 +; RV64-NEXT: srli a1, a1, 35 +; RV64-NEXT: addiw a0, a3, -165 +; RV64-NEXT: 
sltu a0, a1, a0 ; RV64-NEXT: ret ; ; RV32M-LABEL: test_srem_odd: @@ -382,65 +404,122 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV64-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 0(sp) # 8-byte Folded Spill ; RV64-NEXT: mv s0, a0 ; RV64-NEXT: lbu a0, 12(a0) -; RV64-NEXT: ld a1, 0(s0) -; RV64-NEXT: lwu a2, 8(s0) +; RV64-NEXT: ld s3, 0(s0) +; RV64-NEXT: lwu a1, 8(s0) ; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: srli a3, a1, 2 -; RV64-NEXT: or a0, a2, a0 -; RV64-NEXT: slli a2, a2, 62 -; RV64-NEXT: slli a1, a1, 31 -; RV64-NEXT: or a2, a2, a3 -; RV64-NEXT: slli s1, a0, 29 -; RV64-NEXT: srai a0, a2, 31 -; RV64-NEXT: srai s1, s1, 31 -; RV64-NEXT: srai s2, a1, 31 +; RV64-NEXT: srli a2, s3, 2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: slli a1, a1, 62 +; RV64-NEXT: or a1, a1, a2 +; RV64-NEXT: slli a2, s3, 31 +; RV64-NEXT: slli a3, a0, 29 +; RV64-NEXT: srai a0, a1, 31 +; RV64-NEXT: srai s2, a3, 31 +; RV64-NEXT: srai s4, a2, 31 ; RV64-NEXT: li a1, 7 ; RV64-NEXT: call __moddi3 -; RV64-NEXT: mv s3, a0 -; RV64-NEXT: li a1, -5 -; RV64-NEXT: mv a0, s1 -; RV64-NEXT: call __moddi3 ; RV64-NEXT: mv s1, a0 -; RV64-NEXT: lui a0, 699051 -; RV64-NEXT: addiw a1, a0, -1365 -; RV64-NEXT: slli a0, a1, 32 -; RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: li a1, -5 ; RV64-NEXT: mv a0, s2 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: lui a1, %hi(.LCPI3_0) -; RV64-NEXT: addi s1, s1, -2 -; RV64-NEXT: addi s3, s3, -1 -; RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1) -; RV64-NEXT: seqz a2, s1 -; RV64-NEXT: seqz a3, s3 -; RV64-NEXT: addi a3, a3, -1 -; RV64-NEXT: addi a2, a2, -1 -; RV64-NEXT: slli a4, a2, 2 -; RV64-NEXT: slli a5, a3, 31 -; RV64-NEXT: srli a5, a5, 62 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: or a4, a5, a4 -; RV64-NEXT: slli a5, a0, 63 -; RV64-NEXT: srli a0, a0, 1 -; RV64-NEXT: or a0, a0, a5 -; RV64-NEXT: slli a2, a2, 29 -; RV64-NEXT: slli a3, a3, 33 -; RV64-NEXT: srli a2, a2, 61 -; 
RV64-NEXT: sltu a0, a1, a0 -; RV64-NEXT: neg a0, a0 -; RV64-NEXT: slli a0, a0, 31 -; RV64-NEXT: srli a0, a0, 31 -; RV64-NEXT: or a0, a0, a3 -; RV64-NEXT: sd a0, 0(s0) +; RV64-NEXT: call __moddi3 +; RV64-NEXT: slli a1, s4, 4 +; RV64-NEXT: slli a2, s4, 6 +; RV64-NEXT: slli a3, s4, 8 +; RV64-NEXT: slli a4, s4, 10 +; RV64-NEXT: slli a5, s4, 14 +; RV64-NEXT: slli a6, s4, 16 +; RV64-NEXT: slli a7, s4, 22 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: slli a2, s4, 24 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: slli a4, s3, 32 +; RV64-NEXT: add a5, a5, a6 +; RV64-NEXT: slli a6, s3, 34 +; RV64-NEXT: add a2, a7, a2 +; RV64-NEXT: slli a7, s3, 48 +; RV64-NEXT: add a4, a4, a6 +; RV64-NEXT: slli a6, s3, 50 +; RV64-NEXT: add a6, a7, a6 +; RV64-NEXT: slli a7, s4, 2 +; RV64-NEXT: add a7, s4, a7 +; RV64-NEXT: add a1, a7, a1 +; RV64-NEXT: slli a7, s4, 12 +; RV64-NEXT: add a3, a3, a7 +; RV64-NEXT: slli a7, s4, 18 +; RV64-NEXT: add a5, a5, a7 +; RV64-NEXT: slli a7, s4, 26 +; RV64-NEXT: add a2, a2, a7 +; RV64-NEXT: slli a7, s3, 36 +; RV64-NEXT: add a4, a4, a7 +; RV64-NEXT: slli a7, s3, 52 +; RV64-NEXT: add a6, a6, a7 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: slli a3, s4, 20 +; RV64-NEXT: add a3, a5, a3 +; RV64-NEXT: slli a5, s4, 28 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, s3, 38 +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: slli a5, s3, 54 +; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: slli s4, s4, 30 +; RV64-NEXT: add a2, a2, s4 +; RV64-NEXT: slli a3, s3, 40 +; RV64-NEXT: add a3, a4, a3 +; RV64-NEXT: slli a4, s3, 56 +; RV64-NEXT: add a4, a5, a4 +; RV64-NEXT: slli a5, s3, 42 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: slli a2, s3, 58 +; RV64-NEXT: addi a0, a0, -2 +; RV64-NEXT: addi s1, s1, -1 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: seqz a6, s1 +; RV64-NEXT: addi a6, a6, -1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, a0, 2 +; RV64-NEXT: add a2, a4, a2 +; RV64-NEXT: slli a4, a6, 31 +; RV64-NEXT: srli a4, a4, 62 +; 
RV64-NEXT: or a4, a4, a5 +; RV64-NEXT: slli a5, s3, 44 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli a5, s3, 60 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: slli a5, s3, 46 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: slli s3, s3, 62 +; RV64-NEXT: add a2, a2, s3 +; RV64-NEXT: lui a5, %hi(.LCPI3_0) +; RV64-NEXT: ld a5, %lo(.LCPI3_0)(a5) +; RV64-NEXT: slli a0, a0, 29 +; RV64-NEXT: slli a6, a6, 33 +; RV64-NEXT: srli a0, a0, 61 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: sub a2, a5, a2 +; RV64-NEXT: sub a2, a2, a1 +; RV64-NEXT: slli a1, a2, 63 +; RV64-NEXT: srli a2, a2, 1 +; RV64-NEXT: or a1, a2, a1 +; RV64-NEXT: sltu a1, a5, a1 +; RV64-NEXT: neg a1, a1 +; RV64-NEXT: slli a1, a1, 31 +; RV64-NEXT: srli a1, a1, 31 +; RV64-NEXT: or a1, a1, a6 +; RV64-NEXT: sd a1, 0(s0) ; RV64-NEXT: sw a4, 8(s0) -; RV64-NEXT: sb a2, 12(s0) +; RV64-NEXT: sb a0, 12(s0) ; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s2, 16(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s3, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 0(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 48 ; RV64-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/stack-offset.ll b/llvm/test/CodeGen/RISCV/stack-offset.ll index 402d3546eae29..3dc4fcfe26a82 100644 --- a/llvm/test/CodeGen/RISCV/stack-offset.ll +++ b/llvm/test/CodeGen/RISCV/stack-offset.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s -check-prefixes=RV32,RV32I +; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+experimental-xqcilia < %s \ +; RUN: | FileCheck %s -check-prefixes=RV32XQCILIA ; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+zba < %s \ ; RUN: | FileCheck %s -check-prefixes=RV32,RV32ZBA ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ @@ -39,6 +41,27 @@ define void @test() { ; RV32I-NEXT: 
.cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; +; RV32XQCILIA-LABEL: test: +; RV32XQCILIA: # %bb.0: +; RV32XQCILIA-NEXT: addi sp, sp, -2032 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 2032 +; RV32XQCILIA-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill +; RV32XQCILIA-NEXT: .cfi_offset ra, -4 +; RV32XQCILIA-NEXT: qc.e.addi sp, sp, -3168 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 5200 +; RV32XQCILIA-NEXT: addi a0, sp, 12 +; RV32XQCILIA-NEXT: qc.e.addi a1, sp, 2060 +; RV32XQCILIA-NEXT: qc.e.addi a2, sp, 4108 +; RV32XQCILIA-NEXT: qc.e.addi a3, sp, 5132 +; RV32XQCILIA-NEXT: call inspect +; RV32XQCILIA-NEXT: qc.e.addi sp, sp, 3168 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 2032 +; RV32XQCILIA-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload +; RV32XQCILIA-NEXT: .cfi_restore ra +; RV32XQCILIA-NEXT: addi sp, sp, 2032 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 0 +; RV32XQCILIA-NEXT: ret +; ; RV32ZBA-LABEL: test: ; RV32ZBA: # %bb.0: ; RV32ZBA-NEXT: addi sp, sp, -2032 @@ -150,6 +173,25 @@ define void @align_8() { ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; +; RV32XQCILIA-LABEL: align_8: +; RV32XQCILIA: # %bb.0: +; RV32XQCILIA-NEXT: addi sp, sp, -256 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 256 +; RV32XQCILIA-NEXT: sw ra, 252(sp) # 4-byte Folded Spill +; RV32XQCILIA-NEXT: .cfi_offset ra, -4 +; RV32XQCILIA-NEXT: qc.e.addi sp, sp, -3856 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 4112 +; RV32XQCILIA-NEXT: addi a0, sp, 7 +; RV32XQCILIA-NEXT: qc.e.addi a1, sp, 4104 +; RV32XQCILIA-NEXT: call inspect +; RV32XQCILIA-NEXT: qc.e.addi sp, sp, 3856 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 256 +; RV32XQCILIA-NEXT: lw ra, 252(sp) # 4-byte Folded Reload +; RV32XQCILIA-NEXT: .cfi_restore ra +; RV32XQCILIA-NEXT: addi sp, sp, 256 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 0 +; RV32XQCILIA-NEXT: ret +; ; RV32ZBA-LABEL: align_8: ; RV32ZBA: # %bb.0: ; RV32ZBA-NEXT: addi sp, sp, -2032 @@ -246,6 +288,25 @@ define void @align_4() { ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; +; RV32XQCILIA-LABEL: 
align_4: +; RV32XQCILIA: # %bb.0: +; RV32XQCILIA-NEXT: addi sp, sp, -256 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 256 +; RV32XQCILIA-NEXT: sw ra, 252(sp) # 4-byte Folded Spill +; RV32XQCILIA-NEXT: .cfi_offset ra, -4 +; RV32XQCILIA-NEXT: qc.e.addi sp, sp, -3856 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 4112 +; RV32XQCILIA-NEXT: addi a0, sp, 7 +; RV32XQCILIA-NEXT: qc.e.addi a1, sp, 4104 +; RV32XQCILIA-NEXT: call inspect +; RV32XQCILIA-NEXT: qc.e.addi sp, sp, 3856 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 256 +; RV32XQCILIA-NEXT: lw ra, 252(sp) # 4-byte Folded Reload +; RV32XQCILIA-NEXT: .cfi_restore ra +; RV32XQCILIA-NEXT: addi sp, sp, 256 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 0 +; RV32XQCILIA-NEXT: ret +; ; RV32ZBA-LABEL: align_4: ; RV32ZBA: # %bb.0: ; RV32ZBA-NEXT: addi sp, sp, -2032 @@ -342,6 +403,25 @@ define void @align_2() { ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; +; RV32XQCILIA-LABEL: align_2: +; RV32XQCILIA: # %bb.0: +; RV32XQCILIA-NEXT: addi sp, sp, -256 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 256 +; RV32XQCILIA-NEXT: sw ra, 252(sp) # 4-byte Folded Spill +; RV32XQCILIA-NEXT: .cfi_offset ra, -4 +; RV32XQCILIA-NEXT: qc.e.addi sp, sp, -3856 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 4112 +; RV32XQCILIA-NEXT: addi a0, sp, 9 +; RV32XQCILIA-NEXT: qc.e.addi a1, sp, 4106 +; RV32XQCILIA-NEXT: call inspect +; RV32XQCILIA-NEXT: qc.e.addi sp, sp, 3856 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 256 +; RV32XQCILIA-NEXT: lw ra, 252(sp) # 4-byte Folded Reload +; RV32XQCILIA-NEXT: .cfi_restore ra +; RV32XQCILIA-NEXT: addi sp, sp, 256 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 0 +; RV32XQCILIA-NEXT: ret +; ; RV64-LABEL: align_2: ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -2032 @@ -395,6 +475,25 @@ define void @align_1() { ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; +; RV32XQCILIA-LABEL: align_1: +; RV32XQCILIA: # %bb.0: +; RV32XQCILIA-NEXT: addi sp, sp, -256 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 256 +; RV32XQCILIA-NEXT: sw ra, 252(sp) # 4-byte Folded 
Spill +; RV32XQCILIA-NEXT: .cfi_offset ra, -4 +; RV32XQCILIA-NEXT: qc.e.addi sp, sp, -3856 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 4112 +; RV32XQCILIA-NEXT: addi a0, sp, 10 +; RV32XQCILIA-NEXT: qc.e.addi a1, sp, 4107 +; RV32XQCILIA-NEXT: call inspect +; RV32XQCILIA-NEXT: qc.e.addi sp, sp, 3856 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 256 +; RV32XQCILIA-NEXT: lw ra, 252(sp) # 4-byte Folded Reload +; RV32XQCILIA-NEXT: .cfi_restore ra +; RV32XQCILIA-NEXT: addi sp, sp, 256 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 0 +; RV32XQCILIA-NEXT: ret +; ; RV64-LABEL: align_1: ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -2032 @@ -422,3 +521,100 @@ define void @align_1() { call void (...) @inspect(ptr %p1, ptr %p2) ret void } + +define void @align_1_lui() { +; RV32-LABEL: align_1_lui: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -2032 +; RV32-NEXT: .cfi_def_cfa_offset 2032 +; RV32-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: lui a0, 1 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa_offset 6128 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: lui a1, 1 +; RV32-NEXT: addi a1, a1, 2027 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: call inspect +; RV32-NEXT: lui a0, 1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa_offset 2032 +; RV32-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: addi sp, sp, 2032 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV32XQCILIA-LABEL: align_1_lui: +; RV32XQCILIA: # %bb.0: +; RV32XQCILIA-NEXT: addi sp, sp, -2032 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 2032 +; RV32XQCILIA-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill +; RV32XQCILIA-NEXT: .cfi_offset ra, -4 +; RV32XQCILIA-NEXT: qc.e.addi sp, sp, -4096 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 6128 +; RV32XQCILIA-NEXT: addi a0, sp, 8 +; RV32XQCILIA-NEXT: qc.e.addi a1, sp, 6123 +; RV32XQCILIA-NEXT: call inspect +; RV32XQCILIA-NEXT: lui a0, 1 +; RV32XQCILIA-NEXT: add sp, sp, a0 +; RV32XQCILIA-NEXT: 
.cfi_def_cfa_offset 2032 +; RV32XQCILIA-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload +; RV32XQCILIA-NEXT: .cfi_restore ra +; RV32XQCILIA-NEXT: addi sp, sp, 2032 +; RV32XQCILIA-NEXT: .cfi_def_cfa_offset 0 +; RV32XQCILIA-NEXT: ret +; +; RV64I-LABEL: align_1_lui: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -2032 +; RV64I-NEXT: .cfi_def_cfa_offset 2032 +; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, 16 +; RV64I-NEXT: sub sp, sp, a0 +; RV64I-NEXT: .cfi_def_cfa_offset 6144 +; RV64I-NEXT: addi a0, sp, 20 +; RV64I-NEXT: lui a1, 1 +; RV64I-NEXT: addiw a1, a1, 2039 +; RV64I-NEXT: add a1, sp, a1 +; RV64I-NEXT: call inspect +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, 16 +; RV64I-NEXT: add sp, sp, a0 +; RV64I-NEXT: .cfi_def_cfa_offset 2032 +; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 2032 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: align_1_lui: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: addi sp, sp, -2032 +; RV64ZBA-NEXT: .cfi_def_cfa_offset 2032 +; RV64ZBA-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill +; RV64ZBA-NEXT: .cfi_offset ra, -8 +; RV64ZBA-NEXT: li a0, -514 +; RV64ZBA-NEXT: sh3add sp, a0, sp +; RV64ZBA-NEXT: .cfi_def_cfa_offset 6144 +; RV64ZBA-NEXT: addi a0, sp, 20 +; RV64ZBA-NEXT: lui a1, 1 +; RV64ZBA-NEXT: addiw a1, a1, 2039 +; RV64ZBA-NEXT: add a1, sp, a1 +; RV64ZBA-NEXT: call inspect +; RV64ZBA-NEXT: li a0, 514 +; RV64ZBA-NEXT: sh3add sp, a0, sp +; RV64ZBA-NEXT: .cfi_def_cfa_offset 2032 +; RV64ZBA-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload +; RV64ZBA-NEXT: .cfi_restore ra +; RV64ZBA-NEXT: addi sp, sp, 2032 +; RV64ZBA-NEXT: .cfi_def_cfa_offset 0 +; RV64ZBA-NEXT: ret + %p2 = alloca i8, align 1 + %p1 = alloca [6115 x i8], align 1 + call void (...) 
@inspect(ptr %p1, ptr %p2) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index c73a18c8869d5..46e250710f9c1 100644 --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -9,30 +9,40 @@ define i1 @test_urem_odd(i13 %X) nounwind { ; RV32-LABEL: test_urem_odd: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a1, 1 -; RV32-NEXT: addi a1, a1, -819 -; RV32-NEXT: call __mulsi3 +; RV32-NEXT: slli a1, a0, 4 +; RV32-NEXT: slli a2, a0, 6 +; RV32-NEXT: slli a3, a0, 8 +; RV32-NEXT: sub a1, a1, a2 +; RV32-NEXT: slli a2, a0, 10 +; RV32-NEXT: sub a3, a3, a2 +; RV32-NEXT: slli a2, a0, 2 +; RV32-NEXT: sub a2, a0, a2 +; RV32-NEXT: slli a0, a0, 12 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: slli a0, a0, 19 ; RV32-NEXT: srli a0, a0, 19 ; RV32-NEXT: sltiu a0, a0, 1639 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_odd: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a1, 1 -; RV64-NEXT: addiw a1, a1, -819 -; RV64-NEXT: call __muldi3 +; RV64-NEXT: slli a1, a0, 4 +; RV64-NEXT: slli a2, a0, 6 +; RV64-NEXT: slli a3, a0, 8 +; RV64-NEXT: subw a1, a1, a2 +; RV64-NEXT: slli a2, a0, 10 +; RV64-NEXT: subw a3, a3, a2 +; RV64-NEXT: slli a2, a0, 2 +; RV64-NEXT: subw a2, a0, a2 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a0, a3, a0 +; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: slli a0, a0, 51 ; RV64-NEXT: srli a0, a0, 51 ; RV64-NEXT: sltiu a0, a0, 1639 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; RV32M-LABEL: test_urem_odd: @@ -82,42 +92,64 @@ define i1 @test_urem_odd(i13 %X) nounwind { define i1 
@test_urem_even(i27 %X) nounwind { ; RV32-LABEL: test_urem_even: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a1, 28087 -; RV32-NEXT: addi a1, a1, -585 -; RV32-NEXT: call __mulsi3 +; RV32-NEXT: slli a1, a0, 6 +; RV32-NEXT: slli a2, a0, 9 +; RV32-NEXT: slli a3, a0, 12 +; RV32-NEXT: slli a4, a0, 15 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: slli a2, a0, 21 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a4, a0, 24 +; RV32-NEXT: add a2, a2, a4 +; RV32-NEXT: slli a4, a0, 3 +; RV32-NEXT: add a4, a0, a4 +; RV32-NEXT: add a1, a4, a1 +; RV32-NEXT: slli a4, a0, 18 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: slli a0, a0, 27 +; RV32-NEXT: sub a0, a0, a2 +; RV32-NEXT: lui a2, 2341 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: sub a0, a0, a1 ; RV32-NEXT: slli a1, a0, 26 ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: srli a0, a0, 6 ; RV32-NEXT: or a0, a0, a1 -; RV32-NEXT: lui a1, 2341 ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: srli a0, a0, 5 -; RV32-NEXT: addi a1, a1, -1755 +; RV32-NEXT: addi a1, a2, -1755 ; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_even: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a1, 28087 -; RV64-NEXT: addiw a1, a1, -585 -; RV64-NEXT: call __muldi3 +; RV64-NEXT: slli a1, a0, 6 +; RV64-NEXT: slli a2, a0, 9 +; RV64-NEXT: slli a3, a0, 12 +; RV64-NEXT: slli a4, a0, 15 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: slli a2, a0, 21 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: slli a4, a0, 24 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: slli a4, a0, 3 +; RV64-NEXT: add a4, a0, a4 +; RV64-NEXT: add a1, a4, a1 +; RV64-NEXT: slli a4, a0, 18 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: slli a0, a0, 27 +; RV64-NEXT: subw a0, a0, a2 +; RV64-NEXT: lui a2, 2341 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: subw a0, a0, a1 ; RV64-NEXT: slli a1, a0, 26 ; 
RV64-NEXT: slli a0, a0, 37 ; RV64-NEXT: srli a0, a0, 38 ; RV64-NEXT: or a0, a0, a1 -; RV64-NEXT: lui a1, 2341 ; RV64-NEXT: slli a0, a0, 37 ; RV64-NEXT: srli a0, a0, 37 -; RV64-NEXT: addiw a1, a1, -1755 +; RV64-NEXT: addiw a1, a2, -1755 ; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; RV32M-LABEL: test_urem_even: @@ -256,28 +288,32 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { define i1 @test_urem_negative_odd(i9 %X) nounwind { ; RV32-LABEL: test_urem_negative_odd: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a1, 307 -; RV32-NEXT: call __mulsi3 +; RV32-NEXT: slli a1, a0, 2 +; RV32-NEXT: slli a2, a0, 4 +; RV32-NEXT: slli a3, a0, 6 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: sub a2, a2, a3 +; RV32-NEXT: sub a1, a1, a2 +; RV32-NEXT: slli a0, a0, 8 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: andi a0, a0, 511 ; RV32-NEXT: sltiu a0, a0, 2 ; RV32-NEXT: xori a0, a0, 1 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_negative_odd: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a1, 307 -; RV64-NEXT: call __muldi3 +; RV64-NEXT: slli a1, a0, 2 +; RV64-NEXT: slli a2, a0, 4 +; RV64-NEXT: slli a3, a0, 6 +; RV64-NEXT: subw a1, a1, a0 +; RV64-NEXT: subw a2, a2, a3 +; RV64-NEXT: subw a1, a1, a2 +; RV64-NEXT: slli a0, a0, 8 +; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: andi a0, a0, 511 ; RV64-NEXT: sltiu a0, a0, 2 ; RV64-NEXT: xori a0, a0, 1 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; RV32M-LABEL: test_urem_negative_odd: @@ -323,117 +359,127 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind { define void @test_urem_vec(ptr %X) nounwind { ; RV32-LABEL: test_urem_vec: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: sw ra, 
28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lbu a0, 4(a0) -; RV32-NEXT: lw a1, 0(s0) -; RV32-NEXT: slli a0, a0, 10 -; RV32-NEXT: srli s1, a1, 22 -; RV32-NEXT: or s1, s1, a0 -; RV32-NEXT: srli s2, a1, 11 -; RV32-NEXT: andi a0, a1, 2047 -; RV32-NEXT: li a1, 683 -; RV32-NEXT: call __mulsi3 -; RV32-NEXT: slli a1, a0, 10 -; RV32-NEXT: slli a0, a0, 21 -; RV32-NEXT: srli a0, a0, 22 -; RV32-NEXT: or a0, a0, a1 -; RV32-NEXT: andi a0, a0, 2047 -; RV32-NEXT: sltiu s3, a0, 342 -; RV32-NEXT: li a1, 819 -; RV32-NEXT: mv a0, s1 -; RV32-NEXT: call __mulsi3 -; RV32-NEXT: addi a0, a0, -1638 -; RV32-NEXT: andi a0, a0, 2047 -; RV32-NEXT: sltiu s1, a0, 2 -; RV32-NEXT: xori s4, s1, 1 -; RV32-NEXT: li a1, 1463 -; RV32-NEXT: mv a0, s2 -; RV32-NEXT: call __mulsi3 -; RV32-NEXT: addi a0, a0, -1463 -; RV32-NEXT: addi s3, s3, -1 -; RV32-NEXT: addi s1, s1, -1 -; RV32-NEXT: andi a0, a0, 2047 -; RV32-NEXT: andi a1, s3, 2047 -; RV32-NEXT: slli s1, s1, 22 -; RV32-NEXT: sltiu a0, a0, 293 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: andi a0, a0, 2047 -; RV32-NEXT: slli a0, a0, 11 -; RV32-NEXT: or a0, a0, s1 -; RV32-NEXT: or a0, a1, a0 -; RV32-NEXT: sw a0, 0(s0) -; RV32-NEXT: sb s4, 4(s0) -; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s4, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: lbu a1, 4(a0) +; RV32-NEXT: lw a2, 0(a0) +; RV32-NEXT: slli a1, a1, 10 +; RV32-NEXT: srli a3, a2, 22 +; RV32-NEXT: srli a4, a2, 11 +; RV32-NEXT: andi a2, a2, 2047 +; RV32-NEXT: or a1, a3, a1 +; RV32-NEXT: 
slli a3, a2, 2 +; RV32-NEXT: slli a5, a2, 4 +; RV32-NEXT: slli a6, a2, 6 +; RV32-NEXT: slli a7, a2, 8 +; RV32-NEXT: slli t0, a2, 10 +; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: addi a1, a1, -2 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: sub a3, t0, a7 +; RV32-NEXT: slli a6, a4, 3 +; RV32-NEXT: slli a7, a4, 6 +; RV32-NEXT: slli t0, a4, 9 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: slli a5, a1, 2 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: slli a6, a1, 4 +; RV32-NEXT: add a7, a7, t0 +; RV32-NEXT: slli t0, a1, 6 +; RV32-NEXT: sub a6, a6, t0 +; RV32-NEXT: slli t0, a1, 8 +; RV32-NEXT: sub a5, a5, a1 +; RV32-NEXT: slli a1, a1, 10 +; RV32-NEXT: sub a1, t0, a1 +; RV32-NEXT: sub a3, a3, a2 +; RV32-NEXT: add a4, a4, a7 +; RV32-NEXT: sub a2, a5, a6 +; RV32-NEXT: slli a5, a3, 10 +; RV32-NEXT: slli a3, a3, 21 +; RV32-NEXT: neg a4, a4 +; RV32-NEXT: sub a2, a2, a1 +; RV32-NEXT: srli a3, a3, 22 +; RV32-NEXT: andi a1, a4, 2047 +; RV32-NEXT: andi a2, a2, 2047 +; RV32-NEXT: or a3, a3, a5 +; RV32-NEXT: sltiu a1, a1, 293 +; RV32-NEXT: sltiu a2, a2, 2 +; RV32-NEXT: andi a3, a3, 2047 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: xori a4, a2, 1 +; RV32-NEXT: sltiu a3, a3, 342 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: andi a1, a1, 2047 +; RV32-NEXT: addi a3, a3, -1 +; RV32-NEXT: slli a1, a1, 11 +; RV32-NEXT: slli a2, a2, 22 +; RV32-NEXT: andi a3, a3, 2047 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: or a1, a3, a1 +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: sb a4, 4(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_vec: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -48 -; RV64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s1, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: mv s0, a0 -; RV64-NEXT: lbu a0, 4(a0) -; RV64-NEXT: lwu a1, 0(s0) -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: or a0, a1, a0 -; RV64-NEXT: srli s1, a0, 
22 -; RV64-NEXT: srli s2, a0, 11 -; RV64-NEXT: andi a0, a0, 2047 -; RV64-NEXT: li a1, 683 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: slli a1, a0, 10 -; RV64-NEXT: slli a0, a0, 53 -; RV64-NEXT: srli a0, a0, 54 -; RV64-NEXT: or a0, a0, a1 -; RV64-NEXT: andi a0, a0, 2047 -; RV64-NEXT: sltiu s3, a0, 342 -; RV64-NEXT: li a1, 1463 -; RV64-NEXT: mv a0, s2 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: addi a0, a0, -1463 -; RV64-NEXT: andi a0, a0, 2047 -; RV64-NEXT: sltiu s2, a0, 293 -; RV64-NEXT: li a1, 819 -; RV64-NEXT: mv a0, s1 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: addi a0, a0, -1638 -; RV64-NEXT: addi s3, s3, -1 -; RV64-NEXT: addi s2, s2, -1 -; RV64-NEXT: andi a0, a0, 2047 -; RV64-NEXT: andi a1, s3, 2047 -; RV64-NEXT: andi a2, s2, 2047 -; RV64-NEXT: sltiu a0, a0, 2 +; RV64-NEXT: lbu a1, 4(a0) +; RV64-NEXT: lwu a2, 0(a0) +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: or a1, a2, a1 +; RV64-NEXT: srli a2, a1, 22 +; RV64-NEXT: srli a3, a1, 11 +; RV64-NEXT: andi a1, a1, 2047 +; RV64-NEXT: slli a4, a1, 2 +; RV64-NEXT: slli a5, a1, 4 +; RV64-NEXT: slli a6, a1, 6 +; RV64-NEXT: slli a7, a1, 8 +; RV64-NEXT: slli t0, a1, 10 +; RV64-NEXT: addi a3, a3, -1 +; RV64-NEXT: addi a2, a2, -2 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a5, a5, a6 +; RV64-NEXT: subw a4, t0, a7 +; RV64-NEXT: slli a6, a3, 3 +; RV64-NEXT: slli a7, a3, 6 +; RV64-NEXT: slli t0, a3, 9 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: slli a5, a2, 2 +; RV64-NEXT: add a3, a3, a6 +; RV64-NEXT: slli a6, a2, 4 +; RV64-NEXT: add a7, a7, t0 +; RV64-NEXT: slli t0, a2, 6 +; RV64-NEXT: subw a6, a6, t0 +; RV64-NEXT: slli t0, a2, 8 +; RV64-NEXT: subw a5, a5, a2 +; RV64-NEXT: slli a2, a2, 10 +; RV64-NEXT: subw a2, t0, a2 +; RV64-NEXT: subw a4, a4, a1 +; RV64-NEXT: add a3, a3, a7 +; RV64-NEXT: subw a1, a5, a6 +; RV64-NEXT: slli a5, a4, 10 +; RV64-NEXT: slli a4, a4, 53 +; RV64-NEXT: negw a3, a3 +; RV64-NEXT: subw a1, a1, a2 +; RV64-NEXT: srli a4, a4, 54 +; RV64-NEXT: andi a2, a3, 2047 +; RV64-NEXT: andi a1, a1, 2047 +; RV64-NEXT: or 
a4, a4, a5 +; RV64-NEXT: sltiu a2, a2, 293 +; RV64-NEXT: sltiu a1, a1, 2 +; RV64-NEXT: andi a3, a4, 2047 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: sltiu a3, a3, 342 +; RV64-NEXT: andi a2, a2, 2047 +; RV64-NEXT: slli a1, a1, 22 +; RV64-NEXT: addi a3, a3, -1 ; RV64-NEXT: slli a2, a2, 11 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: slli a0, a0, 22 -; RV64-NEXT: or a0, a2, a0 -; RV64-NEXT: or a0, a1, a0 -; RV64-NEXT: slli a1, a0, 31 -; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: sw a0, 0(s0) -; RV64-NEXT: sb a1, 4(s0) -; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s2, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s3, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 48 +; RV64-NEXT: andi a3, a3, 2047 +; RV64-NEXT: or a1, a2, a1 +; RV64-NEXT: or a1, a3, a1 +; RV64-NEXT: slli a2, a1, 31 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: sw a1, 0(a0) +; RV64-NEXT: sb a2, 4(a0) ; RV64-NEXT: ret ; ; RV32M-LABEL: test_urem_vec: diff --git a/llvm/test/CodeGen/RISCV/xqccmp-additional-stack.ll b/llvm/test/CodeGen/RISCV/xqccmp-additional-stack.ll index 14e6b9bddd0a0..c73d836c45ca3 100644 --- a/llvm/test/CodeGen/RISCV/xqccmp-additional-stack.ll +++ b/llvm/test/CodeGen/RISCV/xqccmp-additional-stack.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqccmp,+e -target-abi ilp32e -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32 +declare i32 @__mulsi3(i32, i32) + define ptr @func(ptr %s, i32 %_c, ptr %incdec.ptr, i1 %0, i8 %conv14) #0 { ; RV32-LABEL: func: ; RV32: # %bb.0: # %entry @@ -45,8 +47,8 @@ while.body: ; preds = %while.body, %entry br i1 %0, label %while.body, label %while.end while.end: ; preds = %while.body - %or5 = mul i32 %_c, 16843009 - store i32 %or5, ptr null, align 4 + %mul_result = call i32 
@__mulsi3(i32 %_c, i32 16843009) + store i32 %mul_result, ptr null, align 4 %1 = and i32 %n.addr.042, 1 %scevgep = getelementptr i8, ptr %incdec.ptr, i32 %1 store i8 %conv14, ptr %scevgep, align 1 diff --git a/llvm/test/CodeGen/RISCV/xqcibi.ll b/llvm/test/CodeGen/RISCV/xqcibi.ll new file mode 100644 index 0000000000000..242012b5ad462 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xqcibi.ll @@ -0,0 +1,359 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test that we are able to generate the Xqcibi instructions +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcibi -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32IXQCIBI + +define i32 @beqimm(i32 %a) { +; RV32I-LABEL: beqimm: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, -16 +; RV32I-NEXT: beq a0, a1, .LBB0_2 +; RV32I-NEXT: # %bb.1: # %f +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB0_2: # %t +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV32IXQCIBI-LABEL: beqimm: +; RV32IXQCIBI: # %bb.0: +; RV32IXQCIBI-NEXT: qc.beqi a0, -16, .LBB0_2 +; RV32IXQCIBI-NEXT: # %bb.1: # %f +; RV32IXQCIBI-NEXT: li a0, 0 +; RV32IXQCIBI-NEXT: ret +; RV32IXQCIBI-NEXT: .LBB0_2: # %t +; RV32IXQCIBI-NEXT: li a0, 1 +; RV32IXQCIBI-NEXT: ret + %1 = icmp eq i32 %a, -16 + br i1 %1, label %t, label %f, !prof !0 +f: + ret i32 0 +t: + ret i32 1 +} + +define i32 @bneimm(i32 %a) { +; RV32I-LABEL: bneimm: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 15 +; RV32I-NEXT: bne a0, a1, .LBB1_2 +; RV32I-NEXT: # %bb.1: # %f +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB1_2: # %t +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV32IXQCIBI-LABEL: bneimm: +; RV32IXQCIBI: # %bb.0: +; RV32IXQCIBI-NEXT: qc.bnei a0, 15, .LBB1_2 +; RV32IXQCIBI-NEXT: # %bb.1: # %f +; RV32IXQCIBI-NEXT: li a0, 0 +; RV32IXQCIBI-NEXT: ret +; RV32IXQCIBI-NEXT: .LBB1_2: # %t +; 
RV32IXQCIBI-NEXT: li a0, 1 +; RV32IXQCIBI-NEXT: ret + %1 = icmp ne i32 %a, 15 + br i1 %1, label %t, label %f, !prof !0 +f: + ret i32 0 +t: + ret i32 1 +} + +define i32 @bltimm(i32 %a) { +; RV32I-LABEL: bltimm: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 5 +; RV32I-NEXT: blt a0, a1, .LBB2_2 +; RV32I-NEXT: # %bb.1: # %f +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB2_2: # %t +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV32IXQCIBI-LABEL: bltimm: +; RV32IXQCIBI: # %bb.0: +; RV32IXQCIBI-NEXT: qc.blti a0, 5, .LBB2_2 +; RV32IXQCIBI-NEXT: # %bb.1: # %f +; RV32IXQCIBI-NEXT: li a0, 0 +; RV32IXQCIBI-NEXT: ret +; RV32IXQCIBI-NEXT: .LBB2_2: # %t +; RV32IXQCIBI-NEXT: li a0, 1 +; RV32IXQCIBI-NEXT: ret + %1 = icmp slt i32 %a, 5 + br i1 %1, label %t, label %f, !prof !0 +f: + ret i32 0 +t: + ret i32 1 +} + +define i32 @bgeimm(i32 %a) { +; RV32I-LABEL: bgeimm: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, -6 +; RV32I-NEXT: blt a1, a0, .LBB3_2 +; RV32I-NEXT: # %bb.1: # %f +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB3_2: # %t +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV32IXQCIBI-LABEL: bgeimm: +; RV32IXQCIBI: # %bb.0: +; RV32IXQCIBI-NEXT: qc.bgei a0, -5, .LBB3_2 +; RV32IXQCIBI-NEXT: # %bb.1: # %f +; RV32IXQCIBI-NEXT: li a0, 0 +; RV32IXQCIBI-NEXT: ret +; RV32IXQCIBI-NEXT: .LBB3_2: # %t +; RV32IXQCIBI-NEXT: li a0, 1 +; RV32IXQCIBI-NEXT: ret + %1 = icmp sge i32 %a, -5 + br i1 %1, label %t, label %f, !prof !0 +f: + ret i32 0 +t: + ret i32 1 +} + +define i32 @bltuimm(i32 %a) { +; RV32I-LABEL: bltuimm: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 31 +; RV32I-NEXT: bltu a0, a1, .LBB4_2 +; RV32I-NEXT: # %bb.1: # %f +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB4_2: # %t +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV32IXQCIBI-LABEL: bltuimm: +; RV32IXQCIBI: # %bb.0: +; RV32IXQCIBI-NEXT: qc.bltui a0, 31, .LBB4_2 +; RV32IXQCIBI-NEXT: # %bb.1: # %f +; RV32IXQCIBI-NEXT: li a0, 0 +; RV32IXQCIBI-NEXT: ret +; RV32IXQCIBI-NEXT: .LBB4_2: # %t +; 
RV32IXQCIBI-NEXT: li a0, 1 +; RV32IXQCIBI-NEXT: ret + %1 = icmp ult i32 %a, 31 + br i1 %1, label %t, label %f, !prof !0 +f: + ret i32 0 +t: + ret i32 1 +} + +define i32 @bgeuimm(i32 %a) { +; RV32I-LABEL: bgeuimm: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 9 +; RV32I-NEXT: bltu a1, a0, .LBB5_2 +; RV32I-NEXT: # %bb.1: # %f +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB5_2: # %t +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV32IXQCIBI-LABEL: bgeuimm: +; RV32IXQCIBI: # %bb.0: +; RV32IXQCIBI-NEXT: qc.bgeui a0, 10, .LBB5_2 +; RV32IXQCIBI-NEXT: # %bb.1: # %f +; RV32IXQCIBI-NEXT: li a0, 0 +; RV32IXQCIBI-NEXT: ret +; RV32IXQCIBI-NEXT: .LBB5_2: # %t +; RV32IXQCIBI-NEXT: li a0, 1 +; RV32IXQCIBI-NEXT: ret + %1 = icmp uge i32 %a, 10 + br i1 %1, label %t, label %f, !prof !0 +f: + ret i32 0 +t: + ret i32 1 +} + +define i32 @beqimm16(i32 %a) { +; RV32I-LABEL: beqimm16: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 1048568 +; RV32I-NEXT: beq a0, a1, .LBB6_2 +; RV32I-NEXT: # %bb.1: # %f +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB6_2: # %t +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV32IXQCIBI-LABEL: beqimm16: +; RV32IXQCIBI: # %bb.0: +; RV32IXQCIBI-NEXT: qc.e.beqi a0, -32768, .LBB6_2 +; RV32IXQCIBI-NEXT: # %bb.1: # %f +; RV32IXQCIBI-NEXT: li a0, 0 +; RV32IXQCIBI-NEXT: ret +; RV32IXQCIBI-NEXT: .LBB6_2: # %t +; RV32IXQCIBI-NEXT: li a0, 1 +; RV32IXQCIBI-NEXT: ret + %1 = icmp eq i32 %a, -32768 + br i1 %1, label %t, label %f, !prof !0 +f: + ret i32 0 +t: + ret i32 1 +} + +define i32 @bneimm16(i32 %a) { +; RV32I-LABEL: bneimm16: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: bne a0, a1, .LBB7_2 +; RV32I-NEXT: # %bb.1: # %f +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB7_2: # %t +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV32IXQCIBI-LABEL: bneimm16: +; RV32IXQCIBI: # %bb.0: +; RV32IXQCIBI-NEXT: qc.e.bnei a0, 32767, .LBB7_2 +; RV32IXQCIBI-NEXT: # %bb.1: # %f +; RV32IXQCIBI-NEXT: li a0, 0 
+; RV32IXQCIBI-NEXT: ret +; RV32IXQCIBI-NEXT: .LBB7_2: # %t +; RV32IXQCIBI-NEXT: li a0, 1 +; RV32IXQCIBI-NEXT: ret + %1 = icmp ne i32 %a, 32767 + br i1 %1, label %t, label %f, !prof !0 +f: + ret i32 0 +t: + ret i32 1 +} + +define i32 @bltimm16(i32 %a) { +; RV32I-LABEL: bltimm16: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, -35 +; RV32I-NEXT: blt a0, a1, .LBB8_2 +; RV32I-NEXT: # %bb.1: # %f +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB8_2: # %t +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV32IXQCIBI-LABEL: bltimm16: +; RV32IXQCIBI: # %bb.0: +; RV32IXQCIBI-NEXT: qc.e.blti a0, -35, .LBB8_2 +; RV32IXQCIBI-NEXT: # %bb.1: # %f +; RV32IXQCIBI-NEXT: li a0, 0 +; RV32IXQCIBI-NEXT: ret +; RV32IXQCIBI-NEXT: .LBB8_2: # %t +; RV32IXQCIBI-NEXT: li a0, 1 +; RV32IXQCIBI-NEXT: ret + %1 = icmp slt i32 %a, -35 + br i1 %1, label %t, label %f, !prof !0 +f: + ret i32 0 +t: + ret i32 1 +} + +define i32 @bgeimm16(i32 %a) { +; RV32I-LABEL: bgeimm16: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 254 +; RV32I-NEXT: blt a1, a0, .LBB9_2 +; RV32I-NEXT: # %bb.1: # %f +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB9_2: # %t +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV32IXQCIBI-LABEL: bgeimm16: +; RV32IXQCIBI: # %bb.0: +; RV32IXQCIBI-NEXT: qc.e.bgei a0, 255, .LBB9_2 +; RV32IXQCIBI-NEXT: # %bb.1: # %f +; RV32IXQCIBI-NEXT: li a0, 0 +; RV32IXQCIBI-NEXT: ret +; RV32IXQCIBI-NEXT: .LBB9_2: # %t +; RV32IXQCIBI-NEXT: li a0, 1 +; RV32IXQCIBI-NEXT: ret + %1 = icmp sge i32 %a, 255 + br i1 %1, label %t, label %f, !prof !0 +f: + ret i32 0 +t: + ret i32 1 +} + +define i32 @bltuimm16(i32 %a) { +; RV32I-LABEL: bltuimm16: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 16 +; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: bltu a0, a1, .LBB10_2 +; RV32I-NEXT: # %bb.1: # %f +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB10_2: # %t +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV32IXQCIBI-LABEL: bltuimm16: +; RV32IXQCIBI: # %bb.0: +; RV32IXQCIBI-NEXT: qc.e.bltui a0, 65535, 
.LBB10_2 +; RV32IXQCIBI-NEXT: # %bb.1: # %f +; RV32IXQCIBI-NEXT: li a0, 0 +; RV32IXQCIBI-NEXT: ret +; RV32IXQCIBI-NEXT: .LBB10_2: # %t +; RV32IXQCIBI-NEXT: li a0, 1 +; RV32IXQCIBI-NEXT: ret + %1 = icmp ult i32 %a, 65535 + br i1 %1, label %t, label %f, !prof !0 +f: + ret i32 0 +t: + ret i32 1 +} + +define i32 @bgeuimm16(i32 %a) { +; RV32I-LABEL: bgeuimm16: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 99 +; RV32I-NEXT: bltu a1, a0, .LBB11_2 +; RV32I-NEXT: # %bb.1: # %f +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB11_2: # %t +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV32IXQCIBI-LABEL: bgeuimm16: +; RV32IXQCIBI: # %bb.0: +; RV32IXQCIBI-NEXT: qc.e.bgeui a0, 100, .LBB11_2 +; RV32IXQCIBI-NEXT: # %bb.1: # %f +; RV32IXQCIBI-NEXT: li a0, 0 +; RV32IXQCIBI-NEXT: ret +; RV32IXQCIBI-NEXT: .LBB11_2: # %t +; RV32IXQCIBI-NEXT: li a0, 1 +; RV32IXQCIBI-NEXT: ret + %1 = icmp uge i32 %a, 100 + br i1 %1, label %t, label %f, !prof !0 +f: + ret i32 0 +t: + ret i32 1 +} + + +!0 = !{!"branch_weights", i32 1, i32 99} diff --git a/llvm/test/CodeGen/RISCV/xqcibm-extract.ll b/llvm/test/CodeGen/RISCV/xqcibm-extract.ll new file mode 100644 index 0000000000000..3f5b949585fa3 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xqcibm-extract.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcibm -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=RV32XQCIBM + +define i32 @sexti1_i32(i1 %a) nounwind { +; RV32I-LABEL: sexti1_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 31 +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti1_i32: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: qc.ext a0, a0, 1, 0 +; RV32XQCIBM-NEXT: ret + %sext = sext i1 %a to i32 + ret i32 %sext +} + +define i32 @sexti1_i32_2(i32 %a) { +; RV32I-LABEL: 
sexti1_i32_2: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 31 +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti1_i32_2: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: qc.ext a0, a0, 1, 0 +; RV32XQCIBM-NEXT: ret + %shl = shl i32 %a, 31 + %shr = ashr exact i32 %shl, 31 + ret i32 %shr +} + + +define i32 @sexti8_i32(i8 %a) nounwind { +; RV32I-LABEL: sexti8_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a0, a0, 24 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti8_i32: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: qc.ext a0, a0, 8, 0 +; RV32XQCIBM-NEXT: ret + %sext = sext i8 %a to i32 + ret i32 %sext +} + +define i32 @sexti8_i32_2(i32 %a) { +; RV32I-LABEL: sexti8_i32_2: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a0, a0, 24 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti8_i32_2: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: qc.ext a0, a0, 8, 0 +; RV32XQCIBM-NEXT: ret + %shl = shl i32 %a, 24 + %shr = ashr exact i32 %shl, 24 + ret i32 %shr +} + +define i32 @sexti16_i32(i16 %a) nounwind { +; RV32I-LABEL: sexti16_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti16_i32: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: qc.ext a0, a0, 16, 0 +; RV32XQCIBM-NEXT: ret + %sext = sext i16 %a to i32 + ret i32 %sext +} + +define i32 @sexti16_i32_2(i32 %a) { +; RV32I-LABEL: sexti16_i32_2: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti16_i32_2: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: qc.ext a0, a0, 16, 0 +; RV32XQCIBM-NEXT: ret + %shl = shl i32 %a, 16 + %shr = ashr exact i32 %shl, 16 + ret i32 %shr +} + +define i64 @sexti1_i64(i64 %a) { +; RV32I-LABEL: sexti1_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 31 +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti1_i64: +; RV32XQCIBM: # %bb.0: 
+; RV32XQCIBM-NEXT: qc.ext a0, a0, 1, 0 +; RV32XQCIBM-NEXT: mv a1, a0 +; RV32XQCIBM-NEXT: ret + %shl = shl i64 %a, 63 + %shr = ashr exact i64 %shl, 63 + ret i64 %shr +} + +define i64 @sexti1_i64_2(i1 %a) { +; RV32I-LABEL: sexti1_i64_2: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 31 +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti1_i64_2: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: qc.ext a0, a0, 1, 0 +; RV32XQCIBM-NEXT: mv a1, a0 +; RV32XQCIBM-NEXT: ret + %1 = sext i1 %a to i64 + ret i64 %1 +} + +define i64 @sexti8_i64(i64 %a) { +; RV32I-LABEL: sexti8_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a0, a1, 24 +; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti8_i64: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: qc.ext a0, a0, 8, 0 +; RV32XQCIBM-NEXT: srai a1, a0, 31 +; RV32XQCIBM-NEXT: ret + %shl = shl i64 %a, 56 + %shr = ashr exact i64 %shl, 56 + ret i64 %shr +} + +define i64 @sexti8_i64_2(i8 %a) { +; RV32I-LABEL: sexti8_i64_2: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a0, a1, 24 +; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti8_i64_2: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: qc.ext a0, a0, 8, 0 +; RV32XQCIBM-NEXT: srai a1, a0, 31 +; RV32XQCIBM-NEXT: ret + %1 = sext i8 %a to i64 + ret i64 %1 +} + +define i64 @sexti16_i64(i64 %a) { +; RV32I-LABEL: sexti16_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a0, a1, 16 +; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti16_i64: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: qc.ext a0, a0, 16, 0 +; RV32XQCIBM-NEXT: srai a1, a0, 31 +; RV32XQCIBM-NEXT: ret + %shl = shl i64 %a, 48 + %shr = ashr exact i64 %shl, 48 + ret i64 %shr +} + +define i64 @sexti16_i64_2(i16 %a) { +; RV32I-LABEL: sexti16_i64_2: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a0, a1, 16 +; RV32I-NEXT: srai a1, 
a1, 31 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti16_i64_2: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: qc.ext a0, a0, 16, 0 +; RV32XQCIBM-NEXT: srai a1, a0, 31 +; RV32XQCIBM-NEXT: ret + %1 = sext i16 %a to i64 + ret i64 %1 +} + +define i64 @sexti32_i64(i64 %a) { +; RV32I-LABEL: sexti32_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti32_i64: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: srai a1, a0, 31 +; RV32XQCIBM-NEXT: ret + %shl = shl i64 %a, 32 + %shr = ashr exact i64 %shl, 32 + ret i64 %shr +} + +define i64 @sexti32_i64_2(i32 %a) { +; RV32I-LABEL: sexti32_i64_2: +; RV32I: # %bb.0: +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti32_i64_2: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: srai a1, a0, 31 +; RV32XQCIBM-NEXT: ret + %1 = sext i32 %a to i64 + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/zcmp-additional-stack.ll b/llvm/test/CodeGen/RISCV/zcmp-additional-stack.ll index c98b9b80378fd..601780e346a0a 100644 --- a/llvm/test/CodeGen/RISCV/zcmp-additional-stack.ll +++ b/llvm/test/CodeGen/RISCV/zcmp-additional-stack.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=riscv32 -mattr=+zcmp,+e -target-abi ilp32e -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32 + +declare i32 @__mulsi3(i32, i32) + define ptr @func(ptr %s, i32 %_c, ptr %incdec.ptr, i1 %0, i8 %conv14) #0 { ; RV32-LABEL: func: ; RV32: # %bb.0: # %entry @@ -44,8 +47,8 @@ while.body: ; preds = %while.body, %entry br i1 %0, label %while.body, label %while.end while.end: ; preds = %while.body - %or5 = mul i32 %_c, 16843009 - store i32 %or5, ptr null, align 4 + %mul_result = call i32 @__mulsi3(i32 %_c, i32 16843009) + store i32 %mul_result, ptr null, align 4 %1 = and i32 %n.addr.042, 1 %scevgep = getelementptr i8, ptr %incdec.ptr, i32 %1 store i8 %conv14, ptr %scevgep, align 1 diff --git 
a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll index 9a312d9daca8d..f9db686c9e855 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll @@ -9,20 +9,16 @@ define void @foo(ptr nocapture %p, double %d) nounwind { ; RV32ZDINX-LABEL: foo: ; RV32ZDINX: # %bb.0: # %entry -; RV32ZDINX-NEXT: mv a3, a2 -; RV32ZDINX-NEXT: addi a0, a0, 2047 -; RV32ZDINX-NEXT: mv a2, a1 -; RV32ZDINX-NEXT: sw a2, -3(a0) -; RV32ZDINX-NEXT: sw a3, 1(a0) +; RV32ZDINX-NEXT: addi a3, a0, 2044 +; RV32ZDINX-NEXT: sw a1, 2044(a0) +; RV32ZDINX-NEXT: sw a2, 4(a3) ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo: ; RV32ZDINXUALIGNED: # %bb.0: # %entry -; RV32ZDINXUALIGNED-NEXT: mv a3, a2 -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 -; RV32ZDINXUALIGNED-NEXT: mv a2, a1 -; RV32ZDINXUALIGNED-NEXT: sw a2, -3(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 1(a0) +; RV32ZDINXUALIGNED-NEXT: addi a3, a0, 2044 +; RV32ZDINXUALIGNED-NEXT: sw a1, 2044(a0) +; RV32ZDINXUALIGNED-NEXT: sw a2, 4(a3) ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo: @@ -39,21 +35,21 @@ define void @foo2(ptr nocapture %p, double %d) nounwind { ; RV32ZDINX-LABEL: foo2: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: mv a3, a2 -; RV32ZDINX-NEXT: addi a0, a0, 2047 +; RV32ZDINX-NEXT: addi a4, a0, 2044 ; RV32ZDINX-NEXT: mv a2, a1 ; RV32ZDINX-NEXT: fadd.d a2, a2, a2 -; RV32ZDINX-NEXT: sw a2, -3(a0) -; RV32ZDINX-NEXT: sw a3, 1(a0) +; RV32ZDINX-NEXT: sw a3, 4(a4) +; RV32ZDINX-NEXT: sw a2, 2044(a0) ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo2: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; RV32ZDINXUALIGNED-NEXT: mv a3, a2 -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 +; RV32ZDINXUALIGNED-NEXT: addi a4, a0, 2044 ; RV32ZDINXUALIGNED-NEXT: mv a2, a1 ; RV32ZDINXUALIGNED-NEXT: fadd.d a2, a2, a2 -; RV32ZDINXUALIGNED-NEXT: sw a2, -3(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 1(a0) +; RV32ZDINXUALIGNED-NEXT: sw a3, 4(a4) +; 
RV32ZDINXUALIGNED-NEXT: sw a2, 2044(a0) ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo2: @@ -74,21 +70,21 @@ define void @foo3(ptr nocapture %p) nounwind { ; RV32ZDINX-LABEL: foo3: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: lui a1, %hi(d) -; RV32ZDINX-NEXT: lw a2, %lo(d)(a1) -; RV32ZDINX-NEXT: lw a3, %lo(d+4)(a1) -; RV32ZDINX-NEXT: addi a0, a0, 2047 -; RV32ZDINX-NEXT: sw a2, -3(a0) -; RV32ZDINX-NEXT: sw a3, 1(a0) +; RV32ZDINX-NEXT: lw a2, %lo(d+4)(a1) +; RV32ZDINX-NEXT: lw a1, %lo(d)(a1) +; RV32ZDINX-NEXT: addi a3, a0, 2044 +; RV32ZDINX-NEXT: sw a2, 4(a3) +; RV32ZDINX-NEXT: sw a1, 2044(a0) ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo3: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; RV32ZDINXUALIGNED-NEXT: lui a1, %hi(d) -; RV32ZDINXUALIGNED-NEXT: lw a2, %lo(d)(a1) -; RV32ZDINXUALIGNED-NEXT: lw a3, %lo(d+4)(a1) -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 -; RV32ZDINXUALIGNED-NEXT: sw a2, -3(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 1(a0) +; RV32ZDINXUALIGNED-NEXT: lw a2, %lo(d+4)(a1) +; RV32ZDINXUALIGNED-NEXT: lw a1, %lo(d)(a1) +; RV32ZDINXUALIGNED-NEXT: addi a3, a0, 2044 +; RV32ZDINXUALIGNED-NEXT: sw a2, 4(a3) +; RV32ZDINXUALIGNED-NEXT: sw a1, 2044(a0) ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo3: @@ -108,26 +104,26 @@ define void @foo4(ptr %p) nounwind { ; RV32ZDINX-LABEL: foo4: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: addi sp, sp, -16 -; RV32ZDINX-NEXT: addi a1, a0, 2047 -; RV32ZDINX-NEXT: lw a2, -3(a1) -; RV32ZDINX-NEXT: lw a3, 1(a1) +; RV32ZDINX-NEXT: addi a1, a0, 2044 +; RV32ZDINX-NEXT: lw a2, 2044(a0) +; RV32ZDINX-NEXT: lw a1, 4(a1) ; RV32ZDINX-NEXT: sw a0, 8(sp) ; RV32ZDINX-NEXT: lui a0, %hi(d) ; RV32ZDINX-NEXT: sw a2, %lo(d)(a0) -; RV32ZDINX-NEXT: sw a3, %lo(d+4)(a0) +; RV32ZDINX-NEXT: sw a1, %lo(d+4)(a0) ; RV32ZDINX-NEXT: addi sp, sp, 16 ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo4: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; RV32ZDINXUALIGNED-NEXT: addi sp, sp, -16 -; RV32ZDINXUALIGNED-NEXT: addi a1, a0, 2047 -; 
RV32ZDINXUALIGNED-NEXT: lw a2, -3(a1) -; RV32ZDINXUALIGNED-NEXT: lw a3, 1(a1) +; RV32ZDINXUALIGNED-NEXT: addi a1, a0, 2044 +; RV32ZDINXUALIGNED-NEXT: lw a2, 2044(a0) +; RV32ZDINXUALIGNED-NEXT: lw a1, 4(a1) ; RV32ZDINXUALIGNED-NEXT: sw a0, 8(sp) ; RV32ZDINXUALIGNED-NEXT: lui a0, %hi(d) ; RV32ZDINXUALIGNED-NEXT: sw a2, %lo(d)(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, %lo(d+4)(a0) +; RV32ZDINXUALIGNED-NEXT: sw a1, %lo(d+4)(a0) ; RV32ZDINXUALIGNED-NEXT: addi sp, sp, 16 ; RV32ZDINXUALIGNED-NEXT: ret ; @@ -153,20 +149,16 @@ entry: define void @foo5(ptr nocapture %p, double %d) nounwind { ; RV32ZDINX-LABEL: foo5: ; RV32ZDINX: # %bb.0: # %entry -; RV32ZDINX-NEXT: mv a3, a2 -; RV32ZDINX-NEXT: addi a0, a0, -2048 -; RV32ZDINX-NEXT: mv a2, a1 -; RV32ZDINX-NEXT: sw a2, -1(a0) -; RV32ZDINX-NEXT: sw a3, 3(a0) +; RV32ZDINX-NEXT: addi a3, a0, -2048 +; RV32ZDINX-NEXT: sw a2, -2045(a0) +; RV32ZDINX-NEXT: sw a1, -1(a3) ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo5: ; RV32ZDINXUALIGNED: # %bb.0: # %entry -; RV32ZDINXUALIGNED-NEXT: mv a3, a2 -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, -2048 -; RV32ZDINXUALIGNED-NEXT: mv a2, a1 -; RV32ZDINXUALIGNED-NEXT: sw a2, -1(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 3(a0) +; RV32ZDINXUALIGNED-NEXT: addi a3, a0, -2048 +; RV32ZDINXUALIGNED-NEXT: sw a2, -2045(a0) +; RV32ZDINXUALIGNED-NEXT: sw a1, -1(a3) ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo5: @@ -186,12 +178,13 @@ define void @foo6(ptr %p, double %d) nounwind { ; RV32ZDINX-NEXT: mv a3, a2 ; RV32ZDINX-NEXT: lui a2, %hi(.LCPI5_0) ; RV32ZDINX-NEXT: lw a4, %lo(.LCPI5_0)(a2) -; RV32ZDINX-NEXT: lw a5, %lo(.LCPI5_0+4)(a2) +; RV32ZDINX-NEXT: addi a2, a2, %lo(.LCPI5_0) +; RV32ZDINX-NEXT: lw a5, 4(a2) ; RV32ZDINX-NEXT: mv a2, a1 +; RV32ZDINX-NEXT: addi a1, a0, 2044 ; RV32ZDINX-NEXT: fadd.d a2, a2, a4 -; RV32ZDINX-NEXT: addi a0, a0, 2047 -; RV32ZDINX-NEXT: sw a2, -3(a0) -; RV32ZDINX-NEXT: sw a3, 1(a0) +; RV32ZDINX-NEXT: sw a3, 4(a1) +; RV32ZDINX-NEXT: sw a2, 2044(a0) ; RV32ZDINX-NEXT: ret ; ; 
RV32ZDINXUALIGNED-LABEL: foo6: @@ -199,12 +192,13 @@ define void @foo6(ptr %p, double %d) nounwind { ; RV32ZDINXUALIGNED-NEXT: mv a3, a2 ; RV32ZDINXUALIGNED-NEXT: lui a2, %hi(.LCPI5_0) ; RV32ZDINXUALIGNED-NEXT: lw a4, %lo(.LCPI5_0)(a2) -; RV32ZDINXUALIGNED-NEXT: lw a5, %lo(.LCPI5_0+4)(a2) +; RV32ZDINXUALIGNED-NEXT: addi a2, a2, %lo(.LCPI5_0) +; RV32ZDINXUALIGNED-NEXT: lw a5, 4(a2) ; RV32ZDINXUALIGNED-NEXT: mv a2, a1 +; RV32ZDINXUALIGNED-NEXT: addi a1, a0, 2044 ; RV32ZDINXUALIGNED-NEXT: fadd.d a2, a2, a4 -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 -; RV32ZDINXUALIGNED-NEXT: sw a2, -3(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 1(a0) +; RV32ZDINXUALIGNED-NEXT: sw a3, 4(a1) +; RV32ZDINXUALIGNED-NEXT: sw a2, 2044(a0) ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo6: @@ -224,30 +218,24 @@ entry: define void @foo7(ptr nocapture %p) nounwind { ; RV32ZDINX-LABEL: foo7: ; RV32ZDINX: # %bb.0: # %entry -; RV32ZDINX-NEXT: addi sp, sp, -16 ; RV32ZDINX-NEXT: lui a1, %hi(d) -; RV32ZDINX-NEXT: lw a2, %lo(d+4)(a1) -; RV32ZDINX-NEXT: addi a1, a1, %lo(d) -; RV32ZDINX-NEXT: sw a2, 8(sp) -; RV32ZDINX-NEXT: lw a1, 8(a1) -; RV32ZDINX-NEXT: sw a1, 12(sp) -; RV32ZDINX-NEXT: lw a2, 8(sp) -; RV32ZDINX-NEXT: lw a3, 12(sp) -; RV32ZDINX-NEXT: addi a0, a0, 2047 -; RV32ZDINX-NEXT: sw a2, -3(a0) -; RV32ZDINX-NEXT: sw a3, 1(a0) -; RV32ZDINX-NEXT: addi sp, sp, 16 +; RV32ZDINX-NEXT: addi a2, a1, %lo(d) +; RV32ZDINX-NEXT: lw a1, %lo(d+4)(a1) +; RV32ZDINX-NEXT: lw a2, 8(a2) +; RV32ZDINX-NEXT: addi a3, a0, 2044 +; RV32ZDINX-NEXT: sw a1, 2044(a0) +; RV32ZDINX-NEXT: sw a2, 4(a3) ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo7: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; RV32ZDINXUALIGNED-NEXT: lui a1, %hi(d) -; RV32ZDINXUALIGNED-NEXT: addi a1, a1, %lo(d) -; RV32ZDINXUALIGNED-NEXT: lw a2, 4(a1) -; RV32ZDINXUALIGNED-NEXT: lw a3, 8(a1) -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 -; RV32ZDINXUALIGNED-NEXT: sw a2, -3(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 1(a0) +; RV32ZDINXUALIGNED-NEXT: addi a2, a1, 
%lo(d) +; RV32ZDINXUALIGNED-NEXT: lw a1, %lo(d+4)(a1) +; RV32ZDINXUALIGNED-NEXT: lw a2, 8(a2) +; RV32ZDINXUALIGNED-NEXT: addi a3, a0, 2044 +; RV32ZDINXUALIGNED-NEXT: sw a1, 2044(a0) +; RV32ZDINXUALIGNED-NEXT: sw a2, 4(a3) ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo7: @@ -272,32 +260,28 @@ define void @foo8(ptr %p) nounwind { ; RV32ZDINX-LABEL: foo8: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: addi sp, sp, -16 -; RV32ZDINX-NEXT: addi a1, a0, 2047 -; RV32ZDINX-NEXT: lw a2, -3(a1) -; RV32ZDINX-NEXT: lw a3, 1(a1) ; RV32ZDINX-NEXT: sw a0, 8(sp) -; RV32ZDINX-NEXT: sw a2, 0(sp) -; RV32ZDINX-NEXT: sw a3, 4(sp) -; RV32ZDINX-NEXT: lw a0, 4(sp) -; RV32ZDINX-NEXT: lui a1, %hi(d) -; RV32ZDINX-NEXT: addi a2, a1, %lo(d) -; RV32ZDINX-NEXT: sw a0, 8(a2) -; RV32ZDINX-NEXT: lw a0, 0(sp) -; RV32ZDINX-NEXT: sw a0, %lo(d+4)(a1) +; RV32ZDINX-NEXT: addi a1, a0, 2044 +; RV32ZDINX-NEXT: lw a0, 2044(a0) +; RV32ZDINX-NEXT: lw a1, 4(a1) +; RV32ZDINX-NEXT: lui a2, %hi(d) +; RV32ZDINX-NEXT: addi a3, a2, %lo(d) +; RV32ZDINX-NEXT: sw a0, %lo(d+4)(a2) +; RV32ZDINX-NEXT: sw a1, 8(a3) ; RV32ZDINX-NEXT: addi sp, sp, 16 ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo8: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; RV32ZDINXUALIGNED-NEXT: addi sp, sp, -16 -; RV32ZDINXUALIGNED-NEXT: addi a1, a0, 2047 -; RV32ZDINXUALIGNED-NEXT: lw a2, -3(a1) -; RV32ZDINXUALIGNED-NEXT: lw a3, 1(a1) ; RV32ZDINXUALIGNED-NEXT: sw a0, 8(sp) -; RV32ZDINXUALIGNED-NEXT: lui a0, %hi(d) -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, %lo(d) -; RV32ZDINXUALIGNED-NEXT: sw a2, 4(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 8(a0) +; RV32ZDINXUALIGNED-NEXT: addi a1, a0, 2044 +; RV32ZDINXUALIGNED-NEXT: lw a0, 2044(a0) +; RV32ZDINXUALIGNED-NEXT: lw a1, 4(a1) +; RV32ZDINXUALIGNED-NEXT: lui a2, %hi(d) +; RV32ZDINXUALIGNED-NEXT: addi a3, a2, %lo(d) +; RV32ZDINXUALIGNED-NEXT: sw a0, %lo(d+4)(a2) +; RV32ZDINXUALIGNED-NEXT: sw a1, 8(a3) ; RV32ZDINXUALIGNED-NEXT: addi sp, sp, 16 ; RV32ZDINXUALIGNED-NEXT: ret ; @@ -329,30 +313,24 @@ entry: 
define void @foo9(ptr nocapture %p) nounwind { ; RV32ZDINX-LABEL: foo9: ; RV32ZDINX: # %bb.0: # %entry -; RV32ZDINX-NEXT: addi sp, sp, -16 ; RV32ZDINX-NEXT: lui a1, %hi(e) -; RV32ZDINX-NEXT: lw a2, %lo(e)(a1) -; RV32ZDINX-NEXT: sw a2, 8(sp) -; RV32ZDINX-NEXT: addi a1, a1, %lo(e) -; RV32ZDINX-NEXT: lw a1, 4(a1) -; RV32ZDINX-NEXT: sw a1, 12(sp) -; RV32ZDINX-NEXT: lw a2, 8(sp) -; RV32ZDINX-NEXT: lw a3, 12(sp) -; RV32ZDINX-NEXT: addi a0, a0, 2047 -; RV32ZDINX-NEXT: sw a2, -3(a0) -; RV32ZDINX-NEXT: sw a3, 1(a0) -; RV32ZDINX-NEXT: addi sp, sp, 16 +; RV32ZDINX-NEXT: addi a2, a1, %lo(e) +; RV32ZDINX-NEXT: lw a1, %lo(e)(a1) +; RV32ZDINX-NEXT: lw a2, 4(a2) +; RV32ZDINX-NEXT: addi a3, a0, 2044 +; RV32ZDINX-NEXT: sw a1, 2044(a0) +; RV32ZDINX-NEXT: sw a2, 4(a3) ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo9: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; RV32ZDINXUALIGNED-NEXT: lui a1, %hi(e) -; RV32ZDINXUALIGNED-NEXT: addi a1, a1, %lo(e) -; RV32ZDINXUALIGNED-NEXT: lw a2, 0(a1) -; RV32ZDINXUALIGNED-NEXT: lw a3, 4(a1) -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 -; RV32ZDINXUALIGNED-NEXT: sw a2, -3(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 1(a0) +; RV32ZDINXUALIGNED-NEXT: addi a2, a1, %lo(e) +; RV32ZDINXUALIGNED-NEXT: lw a1, %lo(e)(a1) +; RV32ZDINXUALIGNED-NEXT: lw a2, 4(a2) +; RV32ZDINXUALIGNED-NEXT: addi a3, a0, 2044 +; RV32ZDINXUALIGNED-NEXT: sw a1, 2044(a0) +; RV32ZDINXUALIGNED-NEXT: sw a2, 4(a3) ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo9: @@ -376,32 +354,28 @@ define void @foo10(ptr %p) nounwind { ; RV32ZDINX-LABEL: foo10: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: addi sp, sp, -16 -; RV32ZDINX-NEXT: addi a1, a0, 2047 -; RV32ZDINX-NEXT: lw a2, -3(a1) -; RV32ZDINX-NEXT: lw a3, 1(a1) ; RV32ZDINX-NEXT: sw a0, 8(sp) -; RV32ZDINX-NEXT: sw a2, 0(sp) -; RV32ZDINX-NEXT: sw a3, 4(sp) -; RV32ZDINX-NEXT: lw a0, 4(sp) -; RV32ZDINX-NEXT: lui a1, %hi(e) -; RV32ZDINX-NEXT: addi a2, a1, %lo(e) -; RV32ZDINX-NEXT: sw a0, 4(a2) -; RV32ZDINX-NEXT: lw a0, 0(sp) -; 
RV32ZDINX-NEXT: sw a0, %lo(e)(a1) +; RV32ZDINX-NEXT: lw a1, 2044(a0) +; RV32ZDINX-NEXT: addi a0, a0, 2044 +; RV32ZDINX-NEXT: lw a0, 4(a0) +; RV32ZDINX-NEXT: lui a2, %hi(e) +; RV32ZDINX-NEXT: sw a1, %lo(e)(a2) +; RV32ZDINX-NEXT: addi a1, a2, %lo(e) +; RV32ZDINX-NEXT: sw a0, 4(a1) ; RV32ZDINX-NEXT: addi sp, sp, 16 ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo10: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; RV32ZDINXUALIGNED-NEXT: addi sp, sp, -16 -; RV32ZDINXUALIGNED-NEXT: addi a1, a0, 2047 -; RV32ZDINXUALIGNED-NEXT: lw a2, -3(a1) -; RV32ZDINXUALIGNED-NEXT: lw a3, 1(a1) ; RV32ZDINXUALIGNED-NEXT: sw a0, 8(sp) -; RV32ZDINXUALIGNED-NEXT: lui a0, %hi(e) -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, %lo(e) -; RV32ZDINXUALIGNED-NEXT: sw a2, 0(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 4(a0) +; RV32ZDINXUALIGNED-NEXT: lw a1, 2044(a0) +; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2044 +; RV32ZDINXUALIGNED-NEXT: lw a0, 4(a0) +; RV32ZDINXUALIGNED-NEXT: lui a2, %hi(e) +; RV32ZDINXUALIGNED-NEXT: sw a1, %lo(e)(a2) +; RV32ZDINXUALIGNED-NEXT: addi a1, a2, %lo(e) +; RV32ZDINXUALIGNED-NEXT: sw a0, 4(a1) ; RV32ZDINXUALIGNED-NEXT: addi sp, sp, 16 ; RV32ZDINXUALIGNED-NEXT: ret ; @@ -430,22 +404,18 @@ entry: define void @foo11(ptr nocapture %p, double %d) nounwind { ; RV32ZDINX-LABEL: foo11: ; RV32ZDINX: # %bb.0: # %entry -; RV32ZDINX-NEXT: mv a3, a2 -; RV32ZDINX-NEXT: lui a2, 1 -; RV32ZDINX-NEXT: add a0, a0, a2 -; RV32ZDINX-NEXT: mv a2, a1 -; RV32ZDINX-NEXT: sw a2, -4(a0) -; RV32ZDINX-NEXT: sw a3, 0(a0) +; RV32ZDINX-NEXT: addi a0, a0, 2047 +; RV32ZDINX-NEXT: addi a3, a0, 2045 +; RV32ZDINX-NEXT: sw a1, 2045(a0) +; RV32ZDINX-NEXT: sw a2, 4(a3) ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo11: ; RV32ZDINXUALIGNED: # %bb.0: # %entry -; RV32ZDINXUALIGNED-NEXT: mv a3, a2 -; RV32ZDINXUALIGNED-NEXT: lui a2, 1 -; RV32ZDINXUALIGNED-NEXT: add a0, a0, a2 -; RV32ZDINXUALIGNED-NEXT: mv a2, a1 -; RV32ZDINXUALIGNED-NEXT: sw a2, -4(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 0(a0) +; RV32ZDINXUALIGNED-NEXT: addi 
a0, a0, 2047 +; RV32ZDINXUALIGNED-NEXT: addi a3, a0, 2045 +; RV32ZDINXUALIGNED-NEXT: sw a1, 2045(a0) +; RV32ZDINXUALIGNED-NEXT: sw a2, 4(a3) ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo11: @@ -462,24 +432,20 @@ entry: define void @foo12(ptr nocapture %p, double %d) nounwind { ; RV32ZDINX-LABEL: foo12: ; RV32ZDINX: # %bb.0: # %entry -; RV32ZDINX-NEXT: mv a3, a2 -; RV32ZDINX-NEXT: lui a2, 2 -; RV32ZDINX-NEXT: addi a2, a2, 2047 -; RV32ZDINX-NEXT: add a0, a0, a2 -; RV32ZDINX-NEXT: mv a2, a1 -; RV32ZDINX-NEXT: sw a2, 0(a0) -; RV32ZDINX-NEXT: sw a3, 4(a0) +; RV32ZDINX-NEXT: lui a3, 2 +; RV32ZDINX-NEXT: addi a3, a3, 2047 +; RV32ZDINX-NEXT: add a0, a0, a3 +; RV32ZDINX-NEXT: sw a1, 0(a0) +; RV32ZDINX-NEXT: sw a2, 4(a0) ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo12: ; RV32ZDINXUALIGNED: # %bb.0: # %entry -; RV32ZDINXUALIGNED-NEXT: mv a3, a2 -; RV32ZDINXUALIGNED-NEXT: lui a2, 2 -; RV32ZDINXUALIGNED-NEXT: addi a2, a2, 2047 -; RV32ZDINXUALIGNED-NEXT: add a0, a0, a2 -; RV32ZDINXUALIGNED-NEXT: mv a2, a1 -; RV32ZDINXUALIGNED-NEXT: sw a2, 0(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 4(a0) +; RV32ZDINXUALIGNED-NEXT: lui a3, 2 +; RV32ZDINXUALIGNED-NEXT: addi a3, a3, 2047 +; RV32ZDINXUALIGNED-NEXT: add a0, a0, a3 +; RV32ZDINXUALIGNED-NEXT: sw a1, 0(a0) +; RV32ZDINXUALIGNED-NEXT: sw a2, 4(a0) ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo12: @@ -499,23 +465,16 @@ entry: define double @foo13(ptr nocapture %p) nounwind { ; RV32ZDINX-LABEL: foo13: ; RV32ZDINX: # %bb.0: # %entry -; RV32ZDINX-NEXT: addi sp, sp, -16 -; RV32ZDINX-NEXT: lui a0, %hi(f) -; RV32ZDINX-NEXT: lw a1, %lo(f+8)(a0) -; RV32ZDINX-NEXT: sw a1, 12(sp) -; RV32ZDINX-NEXT: lw a0, %lo(f+4)(a0) -; RV32ZDINX-NEXT: sw a0, 8(sp) -; RV32ZDINX-NEXT: lw a0, 8(sp) -; RV32ZDINX-NEXT: lw a1, 12(sp) -; RV32ZDINX-NEXT: addi sp, sp, 16 +; RV32ZDINX-NEXT: lui a1, %hi(f) +; RV32ZDINX-NEXT: lw a0, %lo(f+4)(a1) +; RV32ZDINX-NEXT: lw a1, %lo(f+8)(a1) ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo13: ; 
RV32ZDINXUALIGNED: # %bb.0: # %entry -; RV32ZDINXUALIGNED-NEXT: lui a0, %hi(f) -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, %lo(f) -; RV32ZDINXUALIGNED-NEXT: lw a1, 8(a0) -; RV32ZDINXUALIGNED-NEXT: lw a0, 4(a0) +; RV32ZDINXUALIGNED-NEXT: lui a1, %hi(f) +; RV32ZDINXUALIGNED-NEXT: lw a0, %lo(f+4)(a1) +; RV32ZDINXUALIGNED-NEXT: lw a1, %lo(f+8)(a1) ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo13: @@ -535,16 +494,16 @@ entry: define double @foo14(ptr nocapture %p) nounwind { ; RV32ZDINX-LABEL: foo14: ; RV32ZDINX: # %bb.0: # %entry -; RV32ZDINX-NEXT: lui a0, %hi(f) -; RV32ZDINX-NEXT: lw a1, %lo(f+12)(a0) -; RV32ZDINX-NEXT: lw a0, %lo(f+8)(a0) +; RV32ZDINX-NEXT: lui a1, %hi(f) +; RV32ZDINX-NEXT: lw a0, %lo(f+8)(a1) +; RV32ZDINX-NEXT: lw a1, %lo(f+12)(a1) ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo14: ; RV32ZDINXUALIGNED: # %bb.0: # %entry -; RV32ZDINXUALIGNED-NEXT: lui a0, %hi(f) -; RV32ZDINXUALIGNED-NEXT: lw a1, %lo(f+12)(a0) -; RV32ZDINXUALIGNED-NEXT: lw a0, %lo(f+8)(a0) +; RV32ZDINXUALIGNED-NEXT: lui a1, %hi(f) +; RV32ZDINXUALIGNED-NEXT: lw a0, %lo(f+8)(a1) +; RV32ZDINXUALIGNED-NEXT: lw a1, %lo(f+12)(a1) ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo14: diff --git a/llvm/test/CodeGen/RISCV/zdinx-large-spill.mir b/llvm/test/CodeGen/RISCV/zdinx-large-spill.mir index caebdab2c95ab..f8b2b542a497d 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-large-spill.mir +++ b/llvm/test/CodeGen/RISCV/zdinx-large-spill.mir @@ -10,34 +10,40 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -2048 - ; CHECK-NEXT: addi sp, sp, -16 - ; CHECK-NEXT: .cfi_def_cfa_offset 2064 + ; CHECK-NEXT: addi sp, sp, -32 + ; CHECK-NEXT: .cfi_def_cfa_offset 2080 ; CHECK-NEXT: lui t0, 1 ; CHECK-NEXT: add t0, sp, t0 - ; CHECK-NEXT: sw a0, -2040(t0) # 4-byte Folded Spill - ; CHECK-NEXT: sw a1, -2036(t0) # 4-byte Folded Spill + ; CHECK-NEXT: sw a0, -2024(t0) # 4-byte Folded Spill + ; CHECK-NEXT: sw a1, -2020(t0) # 4-byte Folded Spill ; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: add 
a0, sp, a0 - ; CHECK-NEXT: sw a2, -2048(a0) # 4-byte Folded Spill - ; CHECK-NEXT: sw a3, -2044(a0) # 4-byte Folded Spill - ; CHECK-NEXT: sw a4, 2040(sp) # 4-byte Folded Spill - ; CHECK-NEXT: sw a5, 2044(sp) # 4-byte Folded Spill - ; CHECK-NEXT: sw a6, 2032(sp) # 4-byte Folded Spill - ; CHECK-NEXT: sw a7, 2036(sp) # 4-byte Folded Spill + ; CHECK-NEXT: sw a2, -2032(a0) # 4-byte Folded Spill + ; CHECK-NEXT: sw a3, -2028(a0) # 4-byte Folded Spill ; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: add a0, sp, a0 - ; CHECK-NEXT: lw a1, -2036(a0) # 4-byte Folded Reload - ; CHECK-NEXT: lw a0, -2040(a0) # 4-byte Folded Reload + ; CHECK-NEXT: sw a4, -2040(a0) # 4-byte Folded Spill + ; CHECK-NEXT: sw a5, -2036(a0) # 4-byte Folded Spill + ; CHECK-NEXT: addi a0, sp, 2044 + ; CHECK-NEXT: sw a6, 0(a0) # 4-byte Folded Spill + ; CHECK-NEXT: sw a7, 4(a0) # 4-byte Folded Spill ; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: add a0, sp, a0 - ; CHECK-NEXT: lw a2, -2048(a0) # 4-byte Folded Reload - ; CHECK-NEXT: lw a3, -2044(a0) # 4-byte Folded Reload - ; CHECK-NEXT: lw a4, 2040(sp) # 4-byte Folded Reload - ; CHECK-NEXT: lw a5, 2044(sp) # 4-byte Folded Reload - ; CHECK-NEXT: lw a6, 2032(sp) # 4-byte Folded Reload - ; CHECK-NEXT: lw a7, 2036(sp) # 4-byte Folded Reload + ; CHECK-NEXT: lw a1, -2020(a0) # 4-byte Folded Reload + ; CHECK-NEXT: lw a0, -2024(a0) # 4-byte Folded Reload + ; CHECK-NEXT: lui a0, 1 + ; CHECK-NEXT: add a0, sp, a0 + ; CHECK-NEXT: lw a2, -2032(a0) # 4-byte Folded Reload + ; CHECK-NEXT: lw a3, -2028(a0) # 4-byte Folded Reload + ; CHECK-NEXT: lui a0, 1 + ; CHECK-NEXT: add a0, sp, a0 + ; CHECK-NEXT: lw a4, -2040(a0) # 4-byte Folded Reload + ; CHECK-NEXT: lw a5, -2036(a0) # 4-byte Folded Reload + ; CHECK-NEXT: addi a0, sp, 2044 + ; CHECK-NEXT: lw a6, 0(a0) # 4-byte Folded Reload + ; CHECK-NEXT: lw a7, 4(a0) # 4-byte Folded Reload ; CHECK-NEXT: addi sp, sp, 2032 - ; CHECK-NEXT: addi sp, sp, 32 + ; CHECK-NEXT: addi sp, sp, 48 ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret ret void @@ -53,8 
+59,9 @@ stack: - { id: 0, type: spill-slot, size: 8, alignment: 4 } - { id: 1, type: spill-slot, size: 8, alignment: 4 } - { id: 2, type: spill-slot, size: 8, alignment: 4 } - - { id: 3, type: spill-slot, size: 8, alignment: 4 } - - { id: 4, type: spill-slot, size: 2024, alignment: 4 } + - { id: 3, type: spill-slot, size: 4, alignment: 4 } + - { id: 4, type: spill-slot, size: 8, alignment: 4 } + - { id: 5, type: spill-slot, size: 2028, alignment: 4 } machineFunctionInfo: varArgsFrameIndex: 0 varArgsSaveSize: 0 @@ -65,11 +72,11 @@ body: | PseudoRV32ZdinxSD killed renamable $x10_x11, %stack.0, 0 :: (store (s64) into %stack.0, align 4) PseudoRV32ZdinxSD killed renamable $x12_x13, %stack.1, 0 :: (store (s64) into %stack.1, align 4) PseudoRV32ZdinxSD killed renamable $x14_x15, %stack.2, 0 :: (store (s64) into %stack.2, align 4) - PseudoRV32ZdinxSD killed renamable $x16_x17, %stack.3, 0 :: (store (s64) into %stack.3, align 4) + PseudoRV32ZdinxSD killed renamable $x16_x17, %stack.4, 0 :: (store (s64) into %stack.4, align 4) renamable $x10_x11 = PseudoRV32ZdinxLD %stack.0, 0 :: (load (s64) from %stack.0, align 4) renamable $x12_x13 = PseudoRV32ZdinxLD %stack.1, 0 :: (load (s64) from %stack.1, align 4) renamable $x14_x15 = PseudoRV32ZdinxLD %stack.2, 0 :: (load (s64) from %stack.2, align 4) - renamable $x16_x17 = PseudoRV32ZdinxLD %stack.3, 0 :: (load (s64) from %stack.3, align 4) + renamable $x16_x17 = PseudoRV32ZdinxLD %stack.4, 0 :: (load (s64) from %stack.4, align 4) PseudoRET ... 
diff --git a/llvm/test/CodeGen/RISCV/zdinx-memoperand.ll b/llvm/test/CodeGen/RISCV/zdinx-memoperand.ll index d618253912470..8cb7b79f3ff6a 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-memoperand.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-memoperand.ll @@ -15,9 +15,8 @@ define i32 @foo(double %x, ptr %y, i64 %0, i64 %1, i1 %cmp6.not, ptr %arrayidx13 ; CHECK-NEXT: .LBB0_2: # %if.then7 ; CHECK-NEXT: lw a0, 0(sp) ; CHECK-NEXT: .LBB0_3: # %common.ret -; CHECK-NEXT: fcvt.d.w a2, zero -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: sw a3, 4(a0) +; CHECK-NEXT: sw zero, 0(a0) +; CHECK-NEXT: sw zero, 4(a0) ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/zdinx-spill.ll b/llvm/test/CodeGen/RISCV/zdinx-spill.ll new file mode 100644 index 0000000000000..d7a700622bf8c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/zdinx-spill.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+zdinx -verify-machineinstrs -stop-after=prologepilog | FileCheck %s + +declare void @bar() + +define double @foo(double %x) nounwind { + ; CHECK-LABEL: name: foo + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $x10, $x11, $x8, $x9, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x2 = frame-setup ADDI $x2, -64 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 64 + ; CHECK-NEXT: frame-setup SW killed $x8, $x2, 60 :: (store (s32) into %stack.1) + ; CHECK-NEXT: frame-setup SW killed $x9, $x2, 56 :: (store (s32) into %stack.2) + ; CHECK-NEXT: frame-setup SW killed $x18, $x2, 52 :: (store (s32) into %stack.3) + ; CHECK-NEXT: frame-setup SW killed $x19, $x2, 48 :: (store (s32) into %stack.4) + ; CHECK-NEXT: frame-setup SW killed $x20, $x2, 44 :: (store (s32) into %stack.5) + ; CHECK-NEXT: frame-setup SW killed $x21, $x2, 40 :: (store (s32) into %stack.6) + ; CHECK-NEXT: frame-setup SW killed $x22, $x2, 36 :: (store 
(s32) into %stack.7) + ; CHECK-NEXT: frame-setup SW killed $x23, $x2, 32 :: (store (s32) into %stack.8) + ; CHECK-NEXT: frame-setup SW killed $x24, $x2, 28 :: (store (s32) into %stack.9) + ; CHECK-NEXT: frame-setup SW killed $x25, $x2, 24 :: (store (s32) into %stack.10) + ; CHECK-NEXT: frame-setup SW killed $x26, $x2, 20 :: (store (s32) into %stack.11) + ; CHECK-NEXT: frame-setup SW killed $x27, $x2, 16 :: (store (s32) into %stack.12) + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -4 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x9, -8 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x18, -12 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x19, -16 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x20, -20 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x21, -24 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x22, -28 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x23, -32 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x24, -36 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x25, -40 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x26, -44 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x27, -48 + ; CHECK-NEXT: renamable $x10_x11 = nofpexcept FADD_D_IN32X killed renamable $x10_x11, renamable $x10_x11, 7, implicit $frm + ; CHECK-NEXT: PseudoRV32ZdinxSD killed renamable $x10_x11, $x2, 8 :: (store (s64) into %stack.0, align 4) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $x6, 12 /* clobber */, implicit-def dead early-clobber $x7, 12 /* clobber */, implicit-def dead early-clobber $x8, 12 /* clobber */, implicit-def dead early-clobber $x9, 12 /* clobber */, implicit-def dead early-clobber $x10, 12 /* clobber */, implicit-def dead early-clobber $x11, 12 /* clobber */, implicit-def dead early-clobber $x12, 12 /* clobber */, implicit-def dead early-clobber $x13, 12 /* clobber */, implicit-def dead early-clobber $x14, 12 /* clobber */, implicit-def dead 
early-clobber $x15, 12 /* clobber */, implicit-def dead early-clobber $x16, 12 /* clobber */, implicit-def dead early-clobber $x17, 12 /* clobber */, implicit-def dead early-clobber $x18, 12 /* clobber */, implicit-def dead early-clobber $x19, 12 /* clobber */, implicit-def dead early-clobber $x20, 12 /* clobber */, implicit-def dead early-clobber $x21, 12 /* clobber */, implicit-def dead early-clobber $x22, 12 /* clobber */, implicit-def dead early-clobber $x23, 12 /* clobber */, implicit-def dead early-clobber $x24, 12 /* clobber */, implicit-def dead early-clobber $x25, 12 /* clobber */, implicit-def dead early-clobber $x26, 12 /* clobber */, implicit-def dead early-clobber $x27, 12 /* clobber */, implicit-def dead early-clobber $x28, 12 /* clobber */, implicit-def dead early-clobber $x29, 12 /* clobber */, implicit-def dead early-clobber $x31 + ; CHECK-NEXT: renamable $x10_x11 = PseudoRV32ZdinxLD $x2, 8 :: (load (s64) from %stack.0, align 4) + ; CHECK-NEXT: $x8 = frame-destroy LW $x2, 60 :: (load (s32) from %stack.1) + ; CHECK-NEXT: $x9 = frame-destroy LW $x2, 56 :: (load (s32) from %stack.2) + ; CHECK-NEXT: $x18 = frame-destroy LW $x2, 52 :: (load (s32) from %stack.3) + ; CHECK-NEXT: $x19 = frame-destroy LW $x2, 48 :: (load (s32) from %stack.4) + ; CHECK-NEXT: $x20 = frame-destroy LW $x2, 44 :: (load (s32) from %stack.5) + ; CHECK-NEXT: $x21 = frame-destroy LW $x2, 40 :: (load (s32) from %stack.6) + ; CHECK-NEXT: $x22 = frame-destroy LW $x2, 36 :: (load (s32) from %stack.7) + ; CHECK-NEXT: $x23 = frame-destroy LW $x2, 32 :: (load (s32) from %stack.8) + ; CHECK-NEXT: $x24 = frame-destroy LW $x2, 28 :: (load (s32) from %stack.9) + ; CHECK-NEXT: $x25 = frame-destroy LW $x2, 24 :: (load (s32) from %stack.10) + ; CHECK-NEXT: $x26 = frame-destroy LW $x2, 20 :: (load (s32) from %stack.11) + ; CHECK-NEXT: $x27 = frame-destroy LW $x2, 16 :: (load (s32) from %stack.12) + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x8 + ; CHECK-NEXT: frame-destroy 
CFI_INSTRUCTION restore $x9 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x18 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x19 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x20 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x21 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x22 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x23 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x24 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x25 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x26 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x27 + ; CHECK-NEXT: $x2 = frame-destroy ADDI $x2, 64 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 + ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 + %a = fadd double %x, %x + call void asm sideeffect "", "~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{xr0},~{x31}"() + ret double %a +} diff --git a/llvm/test/CodeGen/RISCV/zilsd.ll b/llvm/test/CodeGen/RISCV/zilsd.ll new file mode 100644 index 0000000000000..eb5d8237bda8c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/zilsd.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+zilsd -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,SLOW %s +; RUN: llc -mtriple=riscv32 -mattr=+zilsd,+unaligned-scalar-mem -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,FAST %s + +define i64 @load(ptr %a) nounwind { +; CHECK-LABEL: load: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a2, 80(a0) +; CHECK-NEXT: ld a0, 0(a0) +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: mv a1, a3 +; CHECK-NEXT: ret + %1 = getelementptr i64, ptr %a, i32 10 + %2 = load i64, ptr %1 + %3 = load volatile i64, ptr %a + ret i64 %2 +} + +define void @store(ptr %a, i64 %b) nounwind { +; 
CHECK-LABEL: store: +; CHECK: # %bb.0: +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: sd a2, 0(a0) +; CHECK-NEXT: sd a2, 88(a0) +; CHECK-NEXT: ret + store i64 %b, ptr %a + %1 = getelementptr i64, ptr %a, i32 11 + store i64 %b, ptr %1 + ret void +} + +define i64 @load_unaligned(ptr %p) { +; SLOW-LABEL: load_unaligned: +; SLOW: # %bb.0: +; SLOW-NEXT: lbu a1, 1(a0) +; SLOW-NEXT: lbu a2, 2(a0) +; SLOW-NEXT: lbu a3, 3(a0) +; SLOW-NEXT: lbu a4, 0(a0) +; SLOW-NEXT: slli a1, a1, 8 +; SLOW-NEXT: slli a2, a2, 16 +; SLOW-NEXT: slli a3, a3, 24 +; SLOW-NEXT: or a1, a1, a4 +; SLOW-NEXT: lbu a4, 4(a0) +; SLOW-NEXT: lbu a5, 5(a0) +; SLOW-NEXT: or a2, a3, a2 +; SLOW-NEXT: lbu a3, 6(a0) +; SLOW-NEXT: lbu a0, 7(a0) +; SLOW-NEXT: slli a5, a5, 8 +; SLOW-NEXT: or a4, a5, a4 +; SLOW-NEXT: slli a3, a3, 16 +; SLOW-NEXT: slli a0, a0, 24 +; SLOW-NEXT: or a3, a0, a3 +; SLOW-NEXT: or a0, a2, a1 +; SLOW-NEXT: or a1, a3, a4 +; SLOW-NEXT: ret +; +; FAST-LABEL: load_unaligned: +; FAST: # %bb.0: +; FAST-NEXT: ld a0, 0(a0) +; FAST-NEXT: ret + %res = load i64, ptr %p, align 1 + ret i64 %res +} + +define void @store_unaligned(ptr %p, i64 %v) { +; SLOW-LABEL: store_unaligned: +; SLOW: # %bb.0: +; SLOW-NEXT: srli a3, a2, 24 +; SLOW-NEXT: srli a4, a2, 16 +; SLOW-NEXT: srli a5, a2, 8 +; SLOW-NEXT: srli a6, a1, 24 +; SLOW-NEXT: srli a7, a1, 16 +; SLOW-NEXT: sb a2, 4(a0) +; SLOW-NEXT: sb a5, 5(a0) +; SLOW-NEXT: sb a4, 6(a0) +; SLOW-NEXT: sb a3, 7(a0) +; SLOW-NEXT: srli a2, a1, 8 +; SLOW-NEXT: sb a1, 0(a0) +; SLOW-NEXT: sb a2, 1(a0) +; SLOW-NEXT: sb a7, 2(a0) +; SLOW-NEXT: sb a6, 3(a0) +; SLOW-NEXT: ret +; +; FAST-LABEL: store_unaligned: +; FAST: # %bb.0: +; FAST-NEXT: mv a3, a2 +; FAST-NEXT: mv a2, a1 +; FAST-NEXT: sd a2, 0(a0) +; FAST-NEXT: ret + store i64 %v, ptr %p, align 1 + ret void +} + +@g = dso_local global i64 0, align 8 + +define i64 @load_g() nounwind { +; CHECK-LABEL: load_g: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a0, %hi(g) +; CHECK-NEXT: ld a0, %lo(g)(a0) +; 
CHECK-NEXT: ret +entry: + %0 = load i64, ptr @g + ret i64 %0 +} + +define void @store_g() nounwind { +; CHECK-LABEL: store_g: +; CHECK: # %bb.0: # %entyr +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: lui a2, %hi(g) +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: sd a0, %lo(g)(a2) +; CHECK-NEXT: ret +entyr: + store i64 0, ptr @g + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/global-var-name-align.ll b/llvm/test/CodeGen/SPIRV/global-var-name-align.ll new file mode 100644 index 0000000000000..d73c98e55b872 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/global-var-name-align.ll @@ -0,0 +1,76 @@ +; Check names and decoration of global variables. + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s -check-prefixes=CHECK,OCL +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s -check-prefixes=CHECK,OCL +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -check-prefixes=CHECK,VK +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: OpName %[[#id18:]] "G1" +; CHECK-DAG: OpName %[[#id22:]] "g1" +; CHECK-DAG: OpName %[[#id23:]] "g2" +; CHECK-DAG: OpName %[[#id27:]] "g4" +; CHECK-DAG: OpName %[[#id30:]] "c1" +; CHECK-DAG: OpName %[[#id31:]] "n_t" +; CHECK-DAG: OpName %[[#id32:]] "w" +; CHECK-DAG: OpName %[[#id34:]] "a.b" +; CHECK-DAG: OpName %[[#id35:]] "e" +; CHECK-DAG: OpName %[[#id36:]] "y.z" +; CHECK-DAG: OpName %[[#id38:]] "x" + +; CHECK-NOT: OpDecorate %[[#id18]] LinkageAttributes +; OCL-DAG: OpDecorate %[[#id18]] Constant +; OCL-DAG: OpDecorate %[[#id22]] Alignment 4 +; VK-NOT: OpDecorate {{.*}} Constant +; VK-NOT: OpDecorate {{.*}} Alignment +; CHECK-DAG: OpDecorate %[[#id22]] 
LinkageAttributes "g1" Export +; OCL-DAG: OpDecorate %[[#id23]] Alignment 4 +; OCL-DAG: OpDecorate %[[#id27]] Alignment 4 +; VK-NOT: OpDecorate {{.*}} Constant +; VK-NOT: OpDecorate {{.*}} Alignment +; CHECK-DAG: OpDecorate %[[#id27]] LinkageAttributes "g4" Export +; OCL-DAG: OpDecorate %[[#id30]] Constant +; OCL-DAG: OpDecorate %[[#id30]] Alignment 4 +; VK-NOT: OpDecorate {{.*}} Constant +; VK-NOT: OpDecorate {{.*}} Alignment +; CHECK-DAG: OpDecorate %[[#id30]] LinkageAttributes "c1" Export +; OCL-DAG: OpDecorate %[[#id31]] Constant +; VK-NOT: OpDecorate {{.*}} Constant +; VK-NOT: OpDecorate {{.*}} Alignment +; CHECK-DAG: OpDecorate %[[#id31]] LinkageAttributes "n_t" Import +; OCL-DAG: OpDecorate %[[#id32]] Constant +; OCL-DAG: OpDecorate %[[#id32]] Alignment 4 +; VK-NOT: OpDecorate {{.*}} Constant +; VK-NOT: OpDecorate {{.*}} Alignment +; CHECK-DAG: OpDecorate %[[#id32]] LinkageAttributes "w" Export +; OCL-DAG: OpDecorate %[[#id34]] Constant +; OCL-DAG: OpDecorate %[[#id34]] Alignment 4 +; VK-NOT: OpDecorate {{.*}} Constant +; VK-NOT: OpDecorate {{.*}} Alignment +; CHECK-DAG: OpDecorate %[[#id35]] LinkageAttributes "e" Import +; OCL-DAG: OpDecorate %[[#id36]] Alignment 4 +; OCL-DAG: OpDecorate %[[#id38]] Constant +; OCL-DAG: OpDecorate %[[#id38]] Alignment 4 +; VK-NOT: OpDecorate {{.*}} Constant +; VK-NOT: OpDecorate {{.*}} Alignment + +%"class.sycl::_V1::nd_item" = type { i8 } + +@G1 = private unnamed_addr addrspace(1) constant %"class.sycl::_V1::nd_item" poison, align 1 +@g1 = addrspace(1) global i32 1, align 4 +@g2 = internal addrspace(1) global i32 2, align 4 +@g4 = common addrspace(1) global i32 0, align 4 +@c1 = addrspace(2) constant [2 x i32] [i32 0, i32 1], align 4 +@n_t = external addrspace(2) constant [256 x i32] +@w = addrspace(1) constant i32 0, align 4 +@a.b = internal addrspace(2) constant [2 x i32] [i32 2, i32 3], align 4 +@e = external addrspace(1) global i32 +@y.z = internal addrspace(1) global i32 0, align 4 +@x = internal addrspace(2) constant 
float 1.000000e+00, align 4 + +define internal spir_func void @foo() { + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/Packed.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/Packed.ll new file mode 100644 index 0000000000000..d5f6545180147 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/Packed.ll @@ -0,0 +1,37 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-library %s -o - -filetype=obj | spirv-val %} + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" + +; CHECK-DAG: OpName [[unpacked:%[0-9]+]] "unpacked" +; CHECK-DAG: OpName [[packed:%[0-9]+]] "packed" + +; CHECK-NOT: OpDecorate {{.*}} CPacked +; CHECK-DAG: OpMemberDecorate [[unpacked]] 0 Offset 0 +; CHECK-DAG: OpMemberDecorate [[unpacked]] 1 Offset 16 + +; CHECK-NOT: OpDecorate {{.*}} CPacked +; CHECK-DAG: OpMemberDecorate [[packed]] 0 Offset 0 +; CHECK-DAG: OpMemberDecorate [[packed]] 1 Offset 4 +; CHECK-NOT: OpDecorate {{.*}} CPacked + + +%unpacked = type {i32, <3 x i32>} +%packed = type <{i32, <3 x i32>}> + + +define external i32 @unpacked_vulkan_buffer_load() { +entry: + %handle = tail call target("spirv.VulkanBuffer", [0 x %unpacked], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false) + %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x %unpacked], 12, 0) %handle, i32 1) + %1 = load i32, ptr addrspace(11) %0, align 4 + ret i32 %1 +} + +define external i32 @packed_vulkan_buffer_load() { +entry: + %handle = tail call target("spirv.VulkanBuffer", [0 x %packed], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 1, i32 1, i32 0, i1 false) + %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x 
%packed], 12, 0) %handle, i32 1) + %1 = load i32, ptr addrspace(11) %0, align 4 + ret i32 %1 +} diff --git a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll index 93208c16ed4a5..d608529b421cc 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll @@ -1,9 +1,6 @@ ; RUN: llc -verify-machineinstrs -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - | FileCheck %s --match-full-lines ; RUN: %if spirv-tools %{ llc -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %} -; FIXME(134119): enable-this once Offset decoration are added. -; XFAIL: spirv-tools - %S2 = type { { [10 x { i32, i32 } ] }, i32 } ; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll index 24a50c7177340..b1446b7529ea4 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll @@ -1,9 +1,6 @@ ; RUN: llc -verify-machineinstrs -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %} -; FIXME(134119): enable-this once Offset decoration are added. 
-; XFAIL: spirv-tools - %struct.S = type { i32 } ; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 diff --git a/llvm/test/CodeGen/X86/avg-mask.ll b/llvm/test/CodeGen/X86/avg-mask.ll index ace422e1a925f..b148cd3d42df6 100644 --- a/llvm/test/CodeGen/X86/avg-mask.ll +++ b/llvm/test/CodeGen/X86/avg-mask.ll @@ -7,7 +7,7 @@ define <16 x i8> @avg_v16i8_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %src, i16 ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -35,7 +35,7 @@ define <16 x i8> @avg_v16i8_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) nounwin ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -64,9 +64,9 @@ define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 ; AVX512F-NEXT: shrl $16, %edi ; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 @@ -96,9 +96,9 @@ define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwin ; AVX512F-NEXT: shrl $16, %edi ; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: 
vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -137,18 +137,18 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 ; AVX512F-NEXT: kmovw %edi, %k4 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 {%k2} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 ; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm0 & (zmm1 ^ zmm2)) ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v64i8_mask: @@ -185,14 +185,14 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 ; AVX512F-NEXT: kmovw %edi, %k4 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k4} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z} +; 
AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k3} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 @@ -220,7 +220,7 @@ define <8 x i16> @avg_v8i16_mask(<8 x i16> %a, <8 x i16> %b, <8 x i16> %src, i8 ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -248,7 +248,7 @@ define <8 x i16> @avg_v8i16_maskz(<8 x i16> %a, <8 x i16> %b, i8 %mask) nounwind ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -275,7 +275,7 @@ define <16 x i16> @avg_v16i16_mask(<16 x i16> %a, <16 x i16> %b, <16 x i16> %src ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: retq @@ -302,7 +302,7 @@ define <16 x i16> @avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nou ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpavgw %ymm1, 
%ymm0, %ymm0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq @@ -334,12 +334,12 @@ define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src ; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm0 & (zmm1 ^ zmm2)) ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v32i16_mask: @@ -370,9 +370,9 @@ define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nou ; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpandq %zmm0, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll index db6f61ed434fd..2566860357130 100644 --- a/llvm/test/CodeGen/X86/avgfloors.ll +++ b/llvm/test/CodeGen/X86/avgfloors.ll @@ -53,7 +53,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind { ; 
AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; AVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) ; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -108,7 +108,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind { ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; AVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) ; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -405,7 +405,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind { ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; AVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem) ; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -478,7 +478,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind { ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; AVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem) ; AVX512-NEXT: vpaddb 
%ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -966,7 +966,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind { ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0 ; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; AVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem) ; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq @@ -1078,7 +1078,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind { ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0 ; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; AVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem) ; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll index e3c5a5023ac9e..ae422381c841c 100644 --- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll +++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll @@ -23,7 +23,7 @@ define <4 x i32> @reassociate_and_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> ; AVX512-LABEL: reassociate_and_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogd $128, %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 & xmm3 & xmm2 ; 
AVX512-NEXT: retq %t0 = add <4 x i32> %x0, %x1 @@ -50,7 +50,7 @@ define <4 x i32> @reassociate_or_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> % ; AVX512-LABEL: reassociate_or_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogd $254, %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 | xmm3 | xmm2 ; AVX512-NEXT: retq %t0 = add <4 x i32> %x0, %x1 @@ -77,7 +77,7 @@ define <4 x i32> @reassociate_xor_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> ; AVX512-LABEL: reassociate_xor_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogd $150, %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ xmm3 ^ xmm2 ; AVX512-NEXT: retq %t0 = add <4 x i32> %x0, %x1 @@ -109,7 +109,7 @@ define <8 x i32> @reassociate_and_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> ; AVX512-LABEL: reassociate_and_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogd $128, %ymm2, %ymm3, %ymm0 +; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 & ymm3 & ymm2 ; AVX512-NEXT: retq %t0 = add <8 x i32> %x0, %x1 @@ -139,7 +139,7 @@ define <8 x i32> @reassociate_or_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> % ; AVX512-LABEL: reassociate_or_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogd $254, %ymm2, %ymm3, %ymm0 +; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 | ymm3 | ymm2 ; AVX512-NEXT: retq %t0 = add <8 x i32> %x0, %x1 @@ -169,7 +169,7 @@ define <8 x i32> @reassociate_xor_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> ; AVX512-LABEL: reassociate_xor_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogd $150, %ymm2, %ymm3, %ymm0 +; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ ymm3 ^ ymm2 ; AVX512-NEXT: retq %t0 = add <8 x i32> %x0, %x1 @@ -211,7 +211,7 @@ define <16 x i32> @reassociate_and_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x ; AVX512-LABEL: 
reassociate_and_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $128, %zmm2, %zmm3, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 & zmm3 & zmm2 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -250,7 +250,7 @@ define <16 x i32> @reassociate_or_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x i ; AVX512-LABEL: reassociate_or_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $254, %zmm2, %zmm3, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | zmm3 | zmm2 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -289,7 +289,7 @@ define <16 x i32> @reassociate_xor_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x ; AVX512-LABEL: reassociate_xor_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $150, %zmm2, %zmm3, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ zmm3 ^ zmm2 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 diff --git a/llvm/test/CodeGen/X86/nofpclass.ll b/llvm/test/CodeGen/X86/nofpclass.ll new file mode 100644 index 0000000000000..55f0af904a38d --- /dev/null +++ b/llvm/test/CodeGen/X86/nofpclass.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2,-sse | FileCheck %s --check-prefix=NOSSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE + +@gf = global { float, float } zeroinitializer, align 8 + +define void @f(<2 x float> noundef nofpclass(nan inf) %e.coerce) { +; NOSSE-LABEL: f: +; NOSSE: # %bb.0: # %entry +; NOSSE-NEXT: flds {{[0-9]+}}(%rsp) +; NOSSE-NEXT: flds {{[0-9]+}}(%rsp) +; NOSSE-NEXT: movq gf@GOTPCREL(%rip), %rax +; NOSSE-NEXT: fstps 4(%rax) +; NOSSE-NEXT: fstps (%rax) +; NOSSE-NEXT: retq +; +; SSE-LABEL: f: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movq gf@GOTPCREL(%rip), %rax +; SSE-NEXT: movlps %xmm0, (%rax) +; SSE-NEXT: retq +entry: 
+ store <2 x float> %e.coerce, ptr @gf, align 8 + ret void +} diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll index b552e3238470f..4bbc1707e10c3 100644 --- a/llvm/test/CodeGen/X86/pr63108.ll +++ b/llvm/test/CodeGen/X86/pr63108.ll @@ -15,7 +15,7 @@ define i32 @PR63108() { ; SSE-NEXT: .LBB0_2: # %vector.body.preheader ; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: movd {{.*#+}} xmm1 = [57339,0,0,0] -; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: xorl %eax, %eax ; SSE-NEXT: .p2align 4 ; SSE-NEXT: .LBB0_3: # %vector.body ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index 4cdc65e5c1b97..d848a8b879215 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -857,6 +857,66 @@ define <4 x double> @shuffle_v4f64_2345_0567_select(<4 x double> %vec1, <4 x dou ret <4 x double> %res } +; PR140234 +define <4 x double> @shuffle_v4f64_1436_split_load(ptr %px, ptr %py) { +; AVX1-LABEL: shuffle_v4f64_1436_split_load: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vmovupd (%rdi), %ymm1 +; AVX1-NEXT: vinsertf128 $1, 16(%rsi), %ymm0, %ymm0 +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_1436_split_load: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovapd (%rsi), %xmm0 +; AVX2-NEXT: vmovupd (%rdi), %ymm1 +; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[3] +; AVX2-NEXT: vbroadcastsd 16(%rsi), %ymm1 +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4f64_1436_split_load: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vmovapd (%rsi), %xmm0 +; AVX512VL-SLOW-NEXT: vmovupd (%rdi), %ymm1 +; AVX512VL-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[3] +; AVX512VL-SLOW-NEXT: vbroadcastsd 16(%rsi), %ymm1 +; 
AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_1436_split_load: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovapd (%rsi), %xmm0 +; AVX512VL-FAST-ALL-NEXT: vmovapd 16(%rsi), %xmm1 +; AVX512VL-FAST-ALL-NEXT: vmovupd (%rdi), %ymm2 +; AVX512VL-FAST-ALL-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[3] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,1,2,4] +; AVX512VL-FAST-ALL-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_1436_split_load: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vmovapd (%rsi), %xmm0 +; AVX512VL-FAST-PERLANE-NEXT: vmovupd (%rdi), %ymm1 +; AVX512VL-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[3] +; AVX512VL-FAST-PERLANE-NEXT: vbroadcastsd 16(%rsi), %ymm1 +; AVX512VL-FAST-PERLANE-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512VL-FAST-PERLANE-NEXT: retq + %pxhi = getelementptr inbounds nuw i8, ptr %px, i64 16 + %pyhi = getelementptr inbounds nuw i8, ptr %py, i64 16 + %x0 = load <2 x double>, ptr %px, align 16 + %y0 = load <2 x double>, ptr %py, align 16 + %x1 = load <2 x double>, ptr %pxhi, align 16 + %y1 = load <2 x double>, ptr %pyhi, align 16 + %shuf0 = shufflevector <2 x double> %x0, <2 x double> %y0, <4 x i32> + %shuf1 = shufflevector <2 x double> %x1, <2 x double> poison, <4 x i32> + %shuf2 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> + %shuf3 = shufflevector <2 x double> %y1, <2 x double> poison, <4 x i32> + %shuf4 = shufflevector <4 x double> %shuf2, <4 x double> %shuf3, <4 x i32> + ret <4 x double> %shuf4 +} + define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_0000: ; AVX1: # %bb.0: diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca-array.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca-array.ll index 
51d34ce5b3882..9064d5ca8df4e 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/alloca-array.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca-array.ll @@ -9,7 +9,7 @@ declare void @use(ptr, ptr) define void @test_alloca() sanitize_hwaddress { ; CHECK-LABEL: define void @test_alloca ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca-compat.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca-compat.ll index 9e9ed50d35daf..aae2946cbb190 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/alloca-compat.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca-compat.ll @@ -11,7 +11,7 @@ declare void @use32(ptr) define void @test_alloca() sanitize_hwaddress { ; CHECK-LABEL: define void @test_alloca ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca-with-calls.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca-with-calls.ll index 0ef09321e41ad..9ef624c0b7f75 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/alloca-with-calls.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca-with-calls.ll @@ -12,7 +12,7 @@ define void @test_alloca() sanitize_hwaddress { ; CHECK-LABEL: define void @test_alloca ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call ptr @llvm.thread.pointer() +; 
CHECK-NEXT: [[TMP0:%.*]] = call ptr @llvm.thread.pointer.p0() ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 48 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = ashr i64 [[TMP2]], 3 diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/exception-lifetime.ll b/llvm/test/Instrumentation/HWAddressSanitizer/exception-lifetime.ll index 9e9fceb5eb472..50ce490f297bc 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/exception-lifetime.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/exception-lifetime.ll @@ -18,7 +18,7 @@ define void @test() sanitize_hwaddress personality ptr @__gxx_personality_v0 { ; CHECK-LABEL: define void @test ; CHECK-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call ptr @llvm.thread.pointer() +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @llvm.thread.pointer.p0() ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 48 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = ashr i64 [[TMP2]], 3 diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/prologue.ll b/llvm/test/Instrumentation/HWAddressSanitizer/prologue.ll index 1698592bafc62..4e7c021bd7f97 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/prologue.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/prologue.ll @@ -88,7 +88,7 @@ define void @test_alloca() sanitize_hwaddress { ; CHECK-LABEL: define void @test_alloca ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call ptr @llvm.thread.pointer() +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @llvm.thread.pointer.p0() ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 48 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = ashr i64 [[TMP2]], 3 @@ -134,7 +134,7 @@ define void @test_alloca() sanitize_hwaddress { ; NOIFUNC-TLS-HISTORY-LABEL: define void @test_alloca ; 
NOIFUNC-TLS-HISTORY-SAME: () #[[ATTR0]] { ; NOIFUNC-TLS-HISTORY-NEXT: entry: -; NOIFUNC-TLS-HISTORY-NEXT: [[TMP0:%.*]] = call ptr @llvm.thread.pointer() +; NOIFUNC-TLS-HISTORY-NEXT: [[TMP0:%.*]] = call ptr @llvm.thread.pointer.p0() ; NOIFUNC-TLS-HISTORY-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 48 ; NOIFUNC-TLS-HISTORY-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 ; NOIFUNC-TLS-HISTORY-NEXT: [[TMP3:%.*]] = ashr i64 [[TMP2]], 3 diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope-setjmp.ll b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope-setjmp.ll index 62fd7a1671569..57d37ca1ef95a 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope-setjmp.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope-setjmp.ll @@ -12,7 +12,7 @@ define dso_local noundef i1 @_Z6targetv() sanitize_hwaddress { ; CHECK-LABEL: define dso_local noundef i1 @_Z6targetv ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call ptr @llvm.thread.pointer() +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @llvm.thread.pointer.p0() ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 48 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = ashr i64 [[TMP2]], 3 diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll index 16e6cda59a616..e30b51890e172 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll @@ -65,7 +65,7 @@ define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress ; X86-NOSCOPE-NEXT: ret i32 0 ; ; AARCH64-SCOPE-LABEL: @standard_lifetime( -; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr 
i8, ptr [[TMP1]], i32 48 ; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -117,7 +117,7 @@ define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress ; AARCH64-SCOPE-NEXT: ret i32 0 ; ; AARCH64-NOSCOPE-LABEL: @standard_lifetime( -; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -167,7 +167,7 @@ define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress ; AARCH64-NOSCOPE-NEXT: ret i32 0 ; ; AARCH64-SHORT-SCOPE-LABEL: @standard_lifetime( -; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -222,7 +222,7 @@ define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress ; AARCH64-SHORT-SCOPE-NEXT: ret i32 0 ; ; AARCH64-SHORT-NOSCOPE-LABEL: @standard_lifetime( -; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -346,7 +346,7 @@ define dso_local i32 @standard_lifetime_optnone() local_unnamed_addr optnone noi ; X86-NOSCOPE-NEXT: ret i32 0 ; ; AARCH64-SCOPE-LABEL: 
@standard_lifetime_optnone( -; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -398,7 +398,7 @@ define dso_local i32 @standard_lifetime_optnone() local_unnamed_addr optnone noi ; AARCH64-SCOPE-NEXT: ret i32 0 ; ; AARCH64-NOSCOPE-LABEL: @standard_lifetime_optnone( -; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -448,7 +448,7 @@ define dso_local i32 @standard_lifetime_optnone() local_unnamed_addr optnone noi ; AARCH64-NOSCOPE-NEXT: ret i32 0 ; ; AARCH64-SHORT-SCOPE-LABEL: @standard_lifetime_optnone( -; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -503,7 +503,7 @@ define dso_local i32 @standard_lifetime_optnone() local_unnamed_addr optnone noi ; AARCH64-SHORT-SCOPE-NEXT: ret i32 0 ; ; AARCH64-SHORT-NOSCOPE-LABEL: @standard_lifetime_optnone( -; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], 
align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -617,7 +617,7 @@ define dso_local i32 @multiple_lifetimes() local_unnamed_addr sanitize_hwaddress ; X86-NOSCOPE-NEXT: ret i32 0 ; ; AARCH64-SCOPE-LABEL: @multiple_lifetimes( -; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -663,7 +663,7 @@ define dso_local i32 @multiple_lifetimes() local_unnamed_addr sanitize_hwaddress ; AARCH64-SCOPE-NEXT: ret i32 0 ; ; AARCH64-NOSCOPE-LABEL: @multiple_lifetimes( -; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -709,7 +709,7 @@ define dso_local i32 @multiple_lifetimes() local_unnamed_addr sanitize_hwaddress ; AARCH64-NOSCOPE-NEXT: ret i32 0 ; ; AARCH64-SHORT-SCOPE-LABEL: @multiple_lifetimes( -; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -758,7 +758,7 @@ define dso_local i32 @multiple_lifetimes() local_unnamed_addr sanitize_hwaddress ; AARCH64-SHORT-SCOPE-NEXT: ret i32 0 ; ; AARCH64-SHORT-NOSCOPE-LABEL: @multiple_lifetimes( -; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SHORT-NOSCOPE-NEXT: 
[[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -877,7 +877,7 @@ define dso_local i32 @unreachable_exit() local_unnamed_addr sanitize_hwaddress { ; X86-NOSCOPE-NEXT: ret i32 0 ; ; AARCH64-SCOPE-LABEL: @unreachable_exit( -; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -934,7 +934,7 @@ define dso_local i32 @unreachable_exit() local_unnamed_addr sanitize_hwaddress { ; AARCH64-SCOPE-NEXT: ret i32 0 ; ; AARCH64-NOSCOPE-LABEL: @unreachable_exit( -; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -990,7 +990,7 @@ define dso_local i32 @unreachable_exit() local_unnamed_addr sanitize_hwaddress { ; AARCH64-NOSCOPE-NEXT: ret i32 0 ; ; AARCH64-SHORT-SCOPE-LABEL: @unreachable_exit( -; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -1050,7 +1050,7 @@ define dso_local i32 @unreachable_exit() local_unnamed_addr sanitize_hwaddress { ; 
AARCH64-SHORT-SCOPE-NEXT: ret i32 0 ; ; AARCH64-SHORT-NOSCOPE-LABEL: @unreachable_exit( -; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -1185,7 +1185,7 @@ define dso_local i32 @diamond_lifetime() local_unnamed_addr sanitize_hwaddress { ; X86-NOSCOPE-NEXT: ret i32 0 ; ; AARCH64-SCOPE-LABEL: @diamond_lifetime( -; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -1246,7 +1246,7 @@ define dso_local i32 @diamond_lifetime() local_unnamed_addr sanitize_hwaddress { ; AARCH64-SCOPE-NEXT: ret i32 0 ; ; AARCH64-NOSCOPE-LABEL: @diamond_lifetime( -; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -1298,7 +1298,7 @@ define dso_local i32 @diamond_lifetime() local_unnamed_addr sanitize_hwaddress { ; AARCH64-NOSCOPE-NEXT: ret i32 0 ; ; AARCH64-SHORT-SCOPE-LABEL: @diamond_lifetime( -; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SHORT-SCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SHORT-SCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP3:%.*]] 
= load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-SCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 @@ -1362,7 +1362,7 @@ define dso_local i32 @diamond_lifetime() local_unnamed_addr sanitize_hwaddress { ; AARCH64-SHORT-SCOPE-NEXT: ret i32 0 ; ; AARCH64-SHORT-NOSCOPE-LABEL: @diamond_lifetime( -; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer() +; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP1:%.*]] = call ptr @llvm.thread.pointer.p0() ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 ; AARCH64-SHORT-NOSCOPE-NEXT: [[TMP4:%.*]] = ashr i64 [[TMP3]], 3 diff --git a/llvm/test/MC/AsmParser/token.s b/llvm/test/MC/AsmParser/token.s new file mode 100644 index 0000000000000..c162e8336a2d7 --- /dev/null +++ b/llvm/test/MC/AsmParser/token.s @@ -0,0 +1,7 @@ +## Tested invalid statement start tokens. X86 supports "{". Use a different target. +# REQUIRES: aarch64-registered-target + +# RUN: not llvm-mc -triple=aarch64 %s 2>&1 | FileCheck %s + +# CHECK: [[#@LINE+1]]:2: error: unexpected token at start of statement + {insn} diff --git a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt index e1ba009f3c4c8..9708821affae0 100644 --- a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt +++ b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt @@ -39,7 +39,7 @@ 0x04 0x11 0x14 0x9b # CHECK: bal 21104 # The encode/decode functions are not inverses of each other. 0x18 0x02 0x01 0x4d # CHECK: blezalc $2, 1336 -0x18 0x02 0xff 0xfa # CHECk: blezalc $2, -20 +0x18 0x02 0xff 0xfa # CHECK: blezalc $2, -20 # The encode/decode functions are not inverses of each other in the immediate case. 
0x18 0x42 0x01 0x4d # CHECK: bgezalc $2, 1336 0x18 0x42 0xff 0xfa # CHECK: bgezalc $2, -20 @@ -162,13 +162,13 @@ 0x49 0xc8 0x0d 0x43 # CHECK: ldc2 $8, -701($1) 0x49 0xf4 0x92 0x75 # CHECK: sdc2 $20, 629($18) 0x58 0x05 0x00 0x40 # CHECK: blezc $5, 260 -0x58 0x05 0xff 0xfa # CHECk: blezc $5, -20 +0x58 0x05 0xff 0xfa # CHECK: blezc $5, -20 0x58 0x43 0x00 0x40 # CHECK: bgec $2, $3, 260 0x58 0x43 0xff 0xfa # CHECK: bgec $2, $3, -20 0x58 0xa5 0x00 0x40 # CHECK: bgezc $5, 260 0x58 0xa5 0xff 0xfa # CHECK: bgezc $5, -20 0x5c 0x05 0x00 0x40 # CHECK: bgtzc $5, 260 -0x5c 0x05 0xff 0xfa # CHECk: bgtzc $5, -20 +0x5c 0x05 0xff 0xfa # CHECK: bgtzc $5, -20 0x5c 0xa5 0x00 0x40 # CHECK: bltzc $5, 260 0x5c 0xa5 0xff 0xfa # CHECK: bltzc $5, -20 0x5c 0xa6 0x00 0x40 # CHECK: bltc $5, $6, 260 diff --git a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt index 0030e51d6c238..28cd1619e80ad 100644 --- a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt +++ b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt @@ -56,7 +56,7 @@ 0x04 0x7e 0xab 0xcd # CHECK: dati $3, $3, 43981 # The encode/decode functions are not inverses of each other in the immediate case. 0x18 0x02 0x01 0x4d # CHECK: blezalc $2, 1336 -0x18 0x02 0xff 0xfa # CHECk: blezalc $2, -20 +0x18 0x02 0xff 0xfa # CHECK: blezalc $2, -20 # The encode/decode functions are not inverses of each other in the immediate case. 
0x18 0x42 0x01 0x4d # CHECK: bgezalc $2, 1336 0x18 0x42 0xff 0xfa # CHECK: bgezalc $2, -20 @@ -181,13 +181,13 @@ 0x49 0xc8 0x0d 0x43 # CHECK: ldc2 $8, -701($1) 0x49 0xf4 0x92 0x75 # CHECK: sdc2 $20, 629($18) 0x58 0x05 0x00 0x40 # CHECK: blezc $5, 260 -0x58 0x05 0xff 0xfa # CHECk: blezc $5, -20 +0x58 0x05 0xff 0xfa # CHECK: blezc $5, -20 0x58 0x43 0x00 0x40 # CHECK: bgec $2, $3, 260 0x58 0x43 0xff 0xfa # CHECK: bgec $2, $3, -20 0x58 0xa5 0x00 0x40 # CHECK: bgezc $5, 260 0x58 0xa5 0xff 0xfa # CHECK: bgezc $5, -20 0x5c 0x05 0x00 0x40 # CHECK: bgtzc $5, 260 -0x5c 0x05 0xff 0xfa # CHECk: bgtzc $5, -20 +0x5c 0x05 0xff 0xfa # CHECK: bgtzc $5, -20 0x5c 0xa5 0x00 0x40 # CHECK: bltzc $5, 260 0x5c 0xa5 0xff 0xfa # CHECK: bltzc $5, -20 0x5c 0xa6 0x00 0x40 # CHECK: bltc $5, $6, 260 diff --git a/llvm/test/MC/M68k/Data/Classes/MxMOVEM_MR.s b/llvm/test/MC/M68k/Data/Classes/MxMOVEM_MR.s index a99bf117e450d..1df1b67c63f97 100644 --- a/llvm/test/MC/M68k/Data/Classes/MxMOVEM_MR.s +++ b/llvm/test/MC/M68k/Data/Classes/MxMOVEM_MR.s @@ -13,3 +13,7 @@ movem.l %d0, (%a1) ; CHECK: movem.l %d0-%d1, (%a1) ; CHECK-SAME: encoding: [0x48,0xd1,0x00,0x03] movem.l %d0-%d1, (%a1) + +; CHECK: movem.l %d0-%d7/%a0-%a6, -(%sp) +; CHECK-SAME: encoding: [0x48,0xe7,0xff,0xfe] +movem.l %d0-%d7/%a0-%a6, -(%sp) diff --git a/llvm/test/MC/M68k/Data/Classes/MxMOVEM_RM.s b/llvm/test/MC/M68k/Data/Classes/MxMOVEM_RM.s index 3cc067b6df995..2b31f30eaa326 100644 --- a/llvm/test/MC/M68k/Data/Classes/MxMOVEM_RM.s +++ b/llvm/test/MC/M68k/Data/Classes/MxMOVEM_RM.s @@ -13,3 +13,7 @@ movem.l (%a1), %d0 ; CHECK: movem.l (%a1), %d0-%d1 ; CHECK-SAME: encoding: [0x4c,0xd1,0x00,0x03] movem.l (%a1), %d0-%d1 + +; CHECK: movem.l -(%sp), %d0-%d7/%a0-%a6 +; CHECK-SAME: encoding: [0x4c,0xe7,0xff,0xfe] +movem.l -(%sp), %d0-%d7/%a0-%a6 diff --git a/llvm/test/MC/Mips/macro-rem.s b/llvm/test/MC/Mips/macro-rem.s index a33c4a098ed69..30239a652ca9f 100644 --- a/llvm/test/MC/Mips/macro-rem.s +++ b/llvm/test/MC/Mips/macro-rem.s @@ -95,7 +95,7 
@@ # CHECK-NOTRAP: bnez $6, $tmp2 # encoding: [A,A,0xc0,0x14] # CHECK-NOTRAP: div $zero, $5, $6 # encoding: [0x1a,0x00,0xa6,0x00] # CHECK-NOTRAP: break 7 # encoding: [0x0d,0x00,0x07,0x00] -# CHECk-NOTRAP: $tmp2 +# CHECK-NOTRAP-NEXT: $tmp2: # CHECK-NOTRAP: addiu $1, $zero, -1 # encoding: [0xff,0xff,0x01,0x24] # CHECK-NOTRAP: bne $6, $1, $tmp3 # encoding: [A,A,0xc1,0x14] # CHECK-NOTRAP: lui $1, 32768 # encoding: [0x00,0x80,0x01,0x3c] diff --git a/llvm/test/MC/RISCV/rv32q-invalid.s b/llvm/test/MC/RISCV/rv32q-invalid.s new file mode 100644 index 0000000000000..9b74f9f1d59c1 --- /dev/null +++ b/llvm/test/MC/RISCV/rv32q-invalid.s @@ -0,0 +1,21 @@ +# RUN: not llvm-mc -triple riscv32 -mattr=+q < %s 2>&1 | FileCheck %s + +# Out of range immediates +## simm12 +flq ft1, -2049(a0) # CHECK: :[[@LINE]]:10: error: operand must be a symbol with %lo/%pcrel_lo/%tprel_lo specifier or an integer in the range [-2048, 2047] +fsq ft2, 2048(a1) # CHECK: :[[@LINE]]:10: error: operand must be a symbol with %lo/%pcrel_lo/%tprel_lo specifier or an integer in the range [-2048, 2047] + +# Memory operand not formatted correctly +flq ft1, a0, -200 # CHECK: :[[@LINE]]:14: error: invalid operand for instruction +fsq ft2, a1, 100 # CHECK: :[[@LINE]]:14: error: invalid operand for instruction + +# Invalid register names +flq ft15, 100(a0) # CHECK: :[[@LINE]]:5: error: invalid operand for instruction +flq ft1, 100(a10) # CHECK: :[[@LINE]]:14: error: expected register +fsgnjn.q fa100, fa2, fa3 # CHECK: :[[@LINE]]:10: error: invalid operand for instruction + +# Integer registers where FP regs are expected +fadd.q a2, a1, a0 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction + +# FP registers where integer regs are expected +fcvt.wu.q ft2, a1 # CHECK: :[[@LINE]]:11: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv64q-invalid.s b/llvm/test/MC/RISCV/rv64q-invalid.s new file mode 100644 index 0000000000000..ac469c268d7ad --- /dev/null +++ 
b/llvm/test/MC/RISCV/rv64q-invalid.s @@ -0,0 +1,9 @@ +# RUN: not llvm-mc -triple riscv64 -mattr=+q < %s 2>&1 | FileCheck %s + +# Integer registers where FP regs are expected +fcvt.l.q ft0, a0 # CHECK: :[[@LINE]]:10: error: invalid operand for instruction +fcvt.lu.q ft1, a1 # CHECK: :[[@LINE]]:11: error: invalid operand for instruction + +# FP registers where integer regs are expected +fcvt.q.l a3, ft3 # CHECK: :[[@LINE]]:10: error: invalid operand for instruction +fcvt.q.lu a4, ft4 # CHECK: :[[@LINE]]:11: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv64q-valid.s b/llvm/test/MC/RISCV/rv64q-valid.s new file mode 100644 index 0000000000000..81bb2852eac0f --- /dev/null +++ b/llvm/test/MC/RISCV/rv64q-valid.s @@ -0,0 +1,43 @@ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+q -M no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+q < %s \ +# RUN: | llvm-objdump --mattr=+q -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s +# +# RUN: not llvm-mc -triple riscv32 -mattr=+q < %s 2>&1 \ +# RUN: | FileCheck -check-prefix=CHECK-RV32 %s + +# CHECK-ASM-AND-OBJ: fcvt.l.q a0, ft0, dyn +# CHECK-ASM: encoding: [0x53,0x75,0x20,0xc6] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +fcvt.l.q a0, ft0, dyn +# CHECK-ASM-AND-OBJ: fcvt.lu.q a1, ft1, dyn +# CHECK-ASM: encoding: [0xd3,0xf5,0x30,0xc6] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +fcvt.lu.q a1, ft1, dyn +# CHECK-ASM-AND-OBJ: fcvt.q.l ft3, a3, dyn +# CHECK-ASM: encoding: [0xd3,0xf1,0x26,0xd6] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +fcvt.q.l ft3, a3, dyn +# CHECK-ASM-AND-OBJ: fcvt.q.lu ft4, a4, dyn +# CHECK-ASM: encoding: [0x53,0x72,0x37,0xd6] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires 
the following: RV64I Base Instruction Set{{$}} +fcvt.q.lu ft4, a4, dyn + +# Rounding modes +# CHECK-ASM-AND-OBJ: fcvt.q.l ft3, a3 +# CHECK-ASM: encoding: [0xd3,0x81,0x26,0xd6] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +fcvt.q.l ft3, a3, rne +# CHECK-ASM-AND-OBJ: fcvt.q.lu ft4, a4, rtz +# CHECK-ASM: encoding: [0x53,0x12,0x37,0xd6] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +fcvt.q.lu ft4, a4, rtz +# CHECK-ASM-AND-OBJ: fcvt.l.q a0, ft0, rdn +# CHECK-ASM: encoding: [0x53,0x25,0x20,0xc6] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +fcvt.l.q a0, ft0, rdn +# CHECK-ASM-AND-OBJ: fcvt.lu.q a1, ft1, rup +# CHECK-ASM: encoding: [0xd3,0xb5,0x30,0xc6] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +fcvt.lu.q a1, ft1, rup diff --git a/llvm/test/MC/RISCV/rv64zfa-only-valid.s b/llvm/test/MC/RISCV/rv64zfa-only-valid.s new file mode 100644 index 0000000000000..95fb253b145c1 --- /dev/null +++ b/llvm/test/MC/RISCV/rv64zfa-only-valid.s @@ -0,0 +1,19 @@ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+q,+zfh -M no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfa,+q,+zfh < %s \ +# RUN: | llvm-objdump --mattr=+zfa,+q,+zfh -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s +# +# RUN: not llvm-mc -triple riscv64 -mattr=+q,+zfh \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s + +# CHECK-ASM-AND-OBJ: fmvh.x.q a1, fs1 +# CHECK-ASM: encoding: [0xd3,0x85,0x14,0xe6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fmvh.x.q a1, fs1 + +# CHECK-ASM-AND-OBJ: fmvp.q.x fs1, a1, a2 +# CHECK-ASM: encoding: [0xd3,0x84,0xc5,0xb6] +# 
CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fmvp.q.x fs1, a1, a2 diff --git a/llvm/test/MC/RISCV/rvq-aliases-valid.s b/llvm/test/MC/RISCV/rvq-aliases-valid.s new file mode 100644 index 0000000000000..85e24f0e970cb --- /dev/null +++ b/llvm/test/MC/RISCV/rvq-aliases-valid.s @@ -0,0 +1,55 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+q -M no-aliases \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+q \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+q -M no-aliases \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+q \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+q < %s \ +# RUN: | llvm-objdump -d --mattr=+q --no-print-imm-hex -M no-aliases - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+q < %s \ +# RUN: | llvm-objdump -d --mattr=+q --no-print-imm-hex - \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+q < %s \ +# RUN: | llvm-objdump -d --mattr=+q --no-print-imm-hex -M no-aliases - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+q < %s \ +# RUN: | llvm-objdump -d --mattr=+q --no-print-imm-hex - \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s + +##===----------------------------------------------------------------------===## +## Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) +##===----------------------------------------------------------------------===## + +# CHECK-INST: flq ft0, 0(a0) +# CHECK-ALIAS: flq ft0, 0(a0) +flq f0, (a0) +# CHECK-INST: fsq ft0, 0(a0) +# CHECK-ALIAS: fsq ft0, 0(a0) +fsq f0, (a0) + +# CHECK-INST: fsgnj.q ft0, ft1, ft1 +# CHECK-ALIAS: fmv.q ft0, ft1 +fmv.q f0, f1 +# CHECK-INST: fsgnjx.q ft1, ft2, ft2 +# CHECK-ALIAS: 
fabs.q ft1, ft2 +fabs.q f1, f2 +# CHECK-INST: fsgnjn.q ft2, ft3, ft3 +# CHECK-ALIAS: fneg.q ft2, ft3 +fneg.q f2, f3 + +# CHECK-INST: flt.q tp, ft6, ft5 +# CHECK-ALIAS: flt.q tp, ft6, ft5 +fgt.q x4, f5, f6 +# CHECK-INST: fle.q t2, fs1, fs0 +# CHECK-ALIAS: fle.q t2, fs1, fs0 +fge.q x7, f8, f9 + +# CHECK-INST: flq ft0, 0(a0) +# CHECK-ALIAS: flq ft0, 0(a0) +flq f0, (x10) +# CHECK-INST: fsq ft0, 0(a0) +# CHECK-ALIAS: fsq ft0, 0(a0) +fsq f0, (x10) diff --git a/llvm/test/MC/RISCV/rvq-pseudos.s b/llvm/test/MC/RISCV/rvq-pseudos.s new file mode 100644 index 0000000000000..9c49a1bac3f15 --- /dev/null +++ b/llvm/test/MC/RISCV/rvq-pseudos.s @@ -0,0 +1,12 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+q | FileCheck %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+q | FileCheck %s + +# CHECK: .Lpcrel_hi0: +# CHECK: auipc a2, %pcrel_hi(a_symbol) +# CHECK: flq fa2, %pcrel_lo(.Lpcrel_hi0)(a2) +flq fa2, a_symbol, a2 + +# CHECK: .Lpcrel_hi1: +# CHECK: auipc a3, %pcrel_hi(a_symbol) +# CHECK: fsq fa2, %pcrel_lo(.Lpcrel_hi1)(a3) +fsq fa2, a_symbol, a3 diff --git a/llvm/test/MC/RISCV/rvq-valid.s b/llvm/test/MC/RISCV/rvq-valid.s new file mode 100644 index 0000000000000..fe224f85cd699 --- /dev/null +++ b/llvm/test/MC/RISCV/rvq-valid.s @@ -0,0 +1,184 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+q -M no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+q < %s \ +# RUN: | llvm-objdump --no-print-imm-hex --mattr=+q -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+q -M no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+q < %s \ +# RUN: | llvm-objdump --no-print-imm-hex --mattr=+q -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s + +# Support for the 'Q' extension implies support for 'D' and 'F' + +# 
CHECK-ASM-AND-OBJ: fadd.d fs10, fs11, ft8, dyn +# CHECK-ASM: encoding: [0x53,0xfd,0xcd,0x03] +fadd.d f26, f27, f28, dyn + +# CHECK-ASM-AND-OBJ: fadd.s fs10, fs11, ft8 +# CHECK-ASM: encoding: [0x53,0xfd,0xcd,0x01] +fadd.s f26, f27, f28 + +# CHECK-ASM-AND-OBJ: flq ft0, 12(a0) +# CHECK-ASM: encoding: [0x07,0x40,0xc5,0x00] +flq f0, 12(a0) +# CHECK-ASM-AND-OBJ: flq ft1, 4(ra) +# CHECK-ASM: encoding: [0x87,0xc0,0x40,0x00] +flq f1, +4(ra) +# CHECK-ASM-AND-OBJ: flq ft2, -2048(a3) +# CHECK-ASM: encoding: [0x07,0xc1,0x06,0x80] +flq f2, -2048(x13) +# CHECK-ASM: flq ft3, %lo(2048)(s1) # encoding: [0x87,0xc1,0bAAAA0100,A] +# CHECK-OBJ: flq ft3, -2048(s1) +flq f3, %lo(2048)(s1) +# CHECK-ASM-AND-OBJ: flq ft4, 2047(s2) +# CHECK-ASM: encoding: [0x07,0x42,0xf9,0x7f] +flq f4, 2047(s2) +# CHECK-ASM-AND-OBJ: flq ft5, 0(s3) +# CHECK-ASM: encoding: [0x87,0xc2,0x09,0x00] +flq f5, 0(s3) + +# CHECK-ASM-AND-OBJ: fsq ft6, 2047(s4) +# CHECK-ASM: encoding: [0xa7,0x4f,0x6a,0x7e] +fsq f6, 2047(s4) +# CHECK-ASM-AND-OBJ: fsq ft7, -2048(s5) +# CHECK-ASM: encoding: [0x27,0xc0,0x7a,0x80] +fsq f7, -2048(s5) +# CHECK-ASM: fsq fs0, %lo(2048)(s6) # encoding: [0x27'A',0x40'A',0x8b'A',A] +# CHECK-OBJ: fsq fs0, -2048(s6) +fsq f8, %lo(2048)(s6) +# CHECK-ASM-AND-OBJ: fsq fs1, 999(s7) +# CHECK-ASM: encoding: [0xa7,0xc3,0x9b,0x3e] +fsq f9, 999(s7) + +# CHECK-ASM-AND-OBJ: fmadd.q fa0, fa1, fa2, fa3, dyn +# CHECK-ASM: encoding: [0x43,0xf5,0xc5,0x6e] +fmadd.q f10, f11, f12, f13, dyn +# CHECK-ASM-AND-OBJ: fmsub.q fa4, fa5, fa6, fa7, dyn +# CHECK-ASM: encoding: [0x47,0xf7,0x07,0x8f] +fmsub.q f14, f15, f16, f17, dyn +# CHECK-ASM-AND-OBJ: fnmsub.q fs2, fs3, fs4, fs5, dyn +# CHECK-ASM: encoding: [0x4b,0xf9,0x49,0xaf] +fnmsub.q f18, f19, f20, f21, dyn +# CHECK-ASM-AND-OBJ: fnmadd.q fs6, fs7, fs8, fs9, dyn +# CHECK-ASM: encoding: [0x4f,0xfb,0x8b,0xcf] +fnmadd.q f22, f23, f24, f25, dyn + +# CHECK-ASM-AND-OBJ: fadd.q fs10, fs11, ft8, dyn +# CHECK-ASM: encoding: [0x53,0xfd,0xcd,0x07] +fadd.q f26, f27, f28, dyn +# 
CHECK-ASM-AND-OBJ: fsub.q ft9, ft10, ft11, dyn +# CHECK-ASM: encoding: [0xd3,0x7e,0xff,0x0f] +fsub.q f29, f30, f31, dyn +# CHECK-ASM-AND-OBJ: fmul.q ft0, ft1, ft2, dyn +# CHECK-ASM: encoding: [0x53,0xf0,0x20,0x16] +fmul.q ft0, ft1, ft2, dyn +# CHECK-ASM-AND-OBJ: fdiv.q ft3, ft4, ft5, dyn +# CHECK-ASM: encoding: [0xd3,0x71,0x52,0x1e] +fdiv.q ft3, ft4, ft5, dyn +# CHECK-ASM-AND-OBJ: fsqrt.q ft6, ft7, dyn +# CHECK-ASM: encoding: [0x53,0xf3,0x03,0x5e] +fsqrt.q ft6, ft7, dyn +# CHECK-ASM-AND-OBJ: fsgnj.q fs1, fa0, fa1 +# CHECK-ASM: encoding: [0xd3,0x04,0xb5,0x26] +fsgnj.q fs1, fa0, fa1 +# CHECK-ASM-AND-OBJ: fsgnjn.q fa1, fa3, fa4 +# CHECK-ASM: encoding: [0xd3,0x95,0xe6,0x26] +fsgnjn.q fa1, fa3, fa4 +# CHECK-ASM-AND-OBJ: fsgnjx.q fa3, fa2, fa1 +# CHECK-ASM: encoding: [0xd3,0x26,0xb6,0x26] +fsgnjx.q fa3, fa2, fa1 +# CHECK-ASM-AND-OBJ: fmin.q fa5, fa6, fa7 +# CHECK-ASM: encoding: [0xd3,0x07,0x18,0x2f] +fmin.q fa5, fa6, fa7 +# CHECK-ASM-AND-OBJ: fmax.q fs2, fs3, fs4 +# CHECK-ASM: encoding: [0x53,0x99,0x49,0x2f] +fmax.q fs2, fs3, fs4 + +# CHECK-ASM-AND-OBJ: fcvt.s.q fs5, fs6, dyn +# CHECK-ASM: encoding: [0xd3,0x7a,0x3b,0x40] +fcvt.s.q fs5, fs6, dyn +# CHECK-ASM-AND-OBJ: fcvt.q.s fs7, fs8 +# CHECK-ASM: encoding: [0xd3,0x0b,0x0c,0x46] +fcvt.q.s fs7, fs8 +# CHECK-ASM-AND-OBJ: fcvt.q.s fs7, fs8, rup +# CHECK-ASM: encoding: [0xd3,0x3b,0x0c,0x46] +fcvt.q.s fs7, fs8, rup +# CHECK-ASM-AND-OBJ: fcvt.d.q fs5, fs6, dyn +# CHECK-ASM: encoding: [0xd3,0x7a,0x3b,0x42] +fcvt.d.q fs5, fs6, dyn +# CHECK-ASM-AND-OBJ: fcvt.q.d fs7, fs8 +# CHECK-ASM: encoding: [0xd3,0x0b,0x1c,0x46] +fcvt.q.d fs7, fs8 +# CHECK-ASM-AND-OBJ: fcvt.q.d fs7, fs8, rup +# CHECK-ASM: encoding: [0xd3,0x3b,0x1c,0x46] +fcvt.q.d fs7, fs8, rup +# CHECK-ASM-AND-OBJ: feq.q a1, fs8, fs9 +# CHECK-ASM: encoding: [0xd3,0x25,0x9c,0xa7] +feq.q a1, fs8, fs9 +# CHECK-ASM-AND-OBJ: flt.q a2, fs10, fs11 +# CHECK-ASM: encoding: [0x53,0x16,0xbd,0xa7] +flt.q a2, fs10, fs11 +# CHECK-ASM-AND-OBJ: fle.q a3, ft8, ft9 +# CHECK-ASM: encoding: 
[0xd3,0x06,0xde,0xa7] +fle.q a3, ft8, ft9 +# CHECK-ASM-AND-OBJ: fclass.q a3, ft10 +# CHECK-ASM: encoding: [0xd3,0x16,0x0f,0xe6] +fclass.q a3, ft10 + +# CHECK-ASM-AND-OBJ: fcvt.w.q a4, ft11, dyn +# CHECK-ASM: encoding: [0x53,0xf7,0x0f,0xc6] +fcvt.w.q a4, ft11, dyn +# CHECK-ASM-AND-OBJ: fcvt.q.w ft0, a5 +# CHECK-ASM: encoding: [0x53,0x80,0x07,0xd6] +fcvt.q.w ft0, a5 +# CHECK-ASM-AND-OBJ: fcvt.q.w ft0, a5, rup +# CHECK-ASM: encoding: [0x53,0xb0,0x07,0xd6] +fcvt.q.w ft0, a5, rup +# CHECK-ASM-AND-OBJ: fcvt.q.wu ft1, a6 +# CHECK-ASM: encoding: [0xd3,0x00,0x18,0xd6] +fcvt.q.wu ft1, a6 +# CHECK-ASM-AND-OBJ: fcvt.q.wu ft1, a6, rup +# CHECK-ASM: encoding: [0xd3,0x30,0x18,0xd6] +fcvt.q.wu ft1, a6, rup + +# Rounding modes + +# CHECK-ASM-AND-OBJ: fmadd.q fa0, fa1, fa2, fa3, rne +# CHECK-ASM: encoding: [0x43,0x85,0xc5,0x6e] +fmadd.q f10, f11, f12, f13, rne +# CHECK-ASM-AND-OBJ: fmsub.q fa4, fa5, fa6, fa7, rtz +# CHECK-ASM: encoding: [0x47,0x97,0x07,0x8f] +fmsub.q f14, f15, f16, f17, rtz +# CHECK-ASM-AND-OBJ: fnmsub.q fs2, fs3, fs4, fs5, rdn +# CHECK-ASM: encoding: [0x4b,0xa9,0x49,0xaf] +fnmsub.q f18, f19, f20, f21, rdn +# CHECK-ASM-AND-OBJ: fnmadd.q fs6, fs7, fs8, fs9, rup +# CHECK-ASM: encoding: [0x4f,0xbb,0x8b,0xcf] +fnmadd.q f22, f23, f24, f25, rup + +# CHECK-ASM-AND-OBJ: fadd.q fs10, fs11, ft8, rmm +# CHECK-ASM: encoding: [0x53,0xcd,0xcd,0x07] +fadd.q f26, f27, f28, rmm +# CHECK-ASM-AND-OBJ: fsub.q ft9, ft10, ft11 +# CHECK-ASM: encoding: [0xd3,0x7e,0xff,0x0f] +fsub.q f29, f30, f31, dyn +# CHECK-ASM-AND-OBJ: fmul.q ft0, ft1, ft2, rne +# CHECK-ASM: encoding: [0x53,0x80,0x20,0x16] +fmul.q ft0, ft1, ft2, rne +# CHECK-ASM-AND-OBJ: fdiv.q ft3, ft4, ft5, rtz +# CHECK-ASM: encoding: [0xd3,0x11,0x52,0x1e] +fdiv.q ft3, ft4, ft5, rtz + +# CHECK-ASM-AND-OBJ: fsqrt.q ft6, ft7, rdn +# CHECK-ASM: encoding: [0x53,0xa3,0x03,0x5e] +fsqrt.q ft6, ft7, rdn +# CHECK-ASM-AND-OBJ: fcvt.s.q fs5, fs6, rup +# CHECK-ASM: encoding: [0xd3,0x3a,0x3b,0x40] +fcvt.s.q fs5, fs6, rup +# CHECK-ASM-AND-OBJ: 
fcvt.w.q a4, ft11, rmm +# CHECK-ASM: encoding: [0x53,0xc7,0x0f,0xc6] +fcvt.w.q a4, ft11, rmm +# CHECK-ASM-AND-OBJ: fcvt.wu.q a5, ft10, dyn +# CHECK-ASM: encoding: [0xd3,0x77,0x1f,0xc6] +fcvt.wu.q a5, ft10, dyn diff --git a/llvm/test/MC/RISCV/xandesvdot-valid.s b/llvm/test/MC/RISCV/xandesvdot-valid.s new file mode 100644 index 0000000000000..06433790219de --- /dev/null +++ b/llvm/test/MC/RISCV/xandesvdot-valid.s @@ -0,0 +1,51 @@ +# XAndesVDot - Andes Vector Dot Product Extension +# RUN: llvm-mc %s -triple=riscv32 -mattr=+xandesvdot -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+xandesvdot < %s \ +# RUN: | llvm-objdump --mattr=+xandesvdot -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-OBJ %s +# RUN: not llvm-mc -triple=riscv32 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: llvm-mc %s -triple=riscv64 -mattr=+xandesvdot -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+xandesvdot < %s \ +# RUN: | llvm-objdump --mattr=+xandesvdot -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-OBJ %s +# RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR + +# CHECK-OBJ: nds.vd4dots.vv v8, v10, v12 +# CHECK-ASM: nds.vd4dots.vv v8, v10, v12 +# CHECK-ASM: encoding: [0x5b,0x44,0xc5,0x12] +# CHECK-ERROR: instruction requires the following: 'XAndesVDot' (Andes Vector Dot Product Extension){{$}} +nds.vd4dots.vv v8, v10, v12 + +# CHECK-OBJ: nds.vd4dots.vv v8, v10, v12, v0.t +# CHECK-ASM: nds.vd4dots.vv v8, v10, v12, v0.t +# CHECK-ASM: encoding: [0x5b,0x44,0xc5,0x10] +# CHECK-ERROR: instruction requires the following: 'XAndesVDot' (Andes Vector Dot Product Extension){{$}} +nds.vd4dots.vv v8, v10, v12, v0.t + +# CHECK-OBJ: nds.vd4dotu.vv v8, v10, v12 +# CHECK-ASM: nds.vd4dotu.vv v8, v10, v12 +# CHECK-ASM: encoding: 
[0x5b,0x44,0xc5,0x1e] +# CHECK-ERROR: instruction requires the following: 'XAndesVDot' (Andes Vector Dot Product Extension){{$}} +nds.vd4dotu.vv v8, v10, v12 + +# CHECK-OBJ: nds.vd4dotu.vv v8, v10, v12, v0.t +# CHECK-ASM: nds.vd4dotu.vv v8, v10, v12, v0.t +# CHECK-ASM: encoding: [0x5b,0x44,0xc5,0x1c] +# CHECK-ERROR: instruction requires the following: 'XAndesVDot' (Andes Vector Dot Product Extension){{$}} +nds.vd4dotu.vv v8, v10, v12, v0.t + +# CHECK-OBJ: nds.vd4dotsu.vv v8, v10, v12 +# CHECK-ASM: nds.vd4dotsu.vv v8, v10, v12 +# CHECK-ASM: encoding: [0x5b,0x44,0xc5,0x16] +# CHECK-ERROR: instruction requires the following: 'XAndesVDot' (Andes Vector Dot Product Extension){{$}} +nds.vd4dotsu.vv v8, v10, v12 + +# CHECK-OBJ: nds.vd4dotsu.vv v8, v10, v12, v0.t +# CHECK-ASM: nds.vd4dotsu.vv v8, v10, v12, v0.t +# CHECK-ASM: encoding: [0x5b,0x44,0xc5,0x14] +# CHECK-ERROR: instruction requires the following: 'XAndesVDot' (Andes Vector Dot Product Extension){{$}} +nds.vd4dotsu.vv v8, v10, v12, v0.t diff --git a/llvm/test/MC/RISCV/xqcilia-valid.s b/llvm/test/MC/RISCV/xqcilia-valid.s index 1e4f855cb2b47..169edc42da697 100644 --- a/llvm/test/MC/RISCV/xqcilia-valid.s +++ b/llvm/test/MC/RISCV/xqcilia-valid.s @@ -92,3 +92,13 @@ qc.e.addi x5, x5, 20 # CHECK-NOALIAS: c.andi s1, -10 # CHECK-ENC: encoding: [0xd9,0x98] qc.e.andi x9, x9, -10 + +# CHECK-ALIAS: mv t0, t1 +# CHECK-NOALIAS: c.mv t0, t1 +# CHECK-ENC: encoding: [0x9a,0x82] +qc.e.addi x5, x6, 0 + +# CHECK-ALIAS: addi sp, sp, 48 +# CHECK-NOALIAS: c.addi16sp sp, 48 +# CHECK-ENC: encoding: [0x45,0x61] +qc.e.addi x2, x2, 48 diff --git a/llvm/test/MC/RISCV/zfa-invalid.s b/llvm/test/MC/RISCV/zfa-invalid.s index c2537c3fc5102..cedc9279db3cb 100644 --- a/llvm/test/MC/RISCV/zfa-invalid.s +++ b/llvm/test/MC/RISCV/zfa-invalid.s @@ -1,5 +1,5 @@ -# RUN: not llvm-mc -triple riscv64 -mattr=+zfa,+d,+zfh < %s 2>&1 | FileCheck -check-prefixes=CHECK-NO-RV32 %s -# RUN: not llvm-mc -triple riscv32 -mattr=+zfa,+d,+zfh < %s 2>&1 | FileCheck 
-check-prefixes=CHECK-NO-RV64 %s +# RUN: not llvm-mc -triple riscv64 -mattr=+zfa,+q,+zfh < %s 2>&1 | FileCheck -check-prefixes=CHECK-NO-RV32 %s +# RUN: not llvm-mc -triple riscv32 -mattr=+zfa,+q,+zfh < %s 2>&1 | FileCheck -check-prefixes=CHECK-NO-RV64 %s # Invalid rounding modes # CHECK-NO-RV64: error: operand must be 'rtz' floating-point rounding mode @@ -35,6 +35,10 @@ fli.d ft1, 3.560000e+02 # CHECK-NO-RV32: error: operand must be a valid floating-point constant fli.h ft1, 1.600000e+00 +# CHECK-NO-RV64: error: operand must be a valid floating-point constant +# CHECK-NO-RV32: error: operand must be a valid floating-point constant +fli.q ft1, 2.250000e+00 + # CHECK-NO-RV64: error: invalid floating point immediate # CHECK-NO-RV32: error: invalid floating point immediate fli.s ft1, -min @@ -72,6 +76,11 @@ fli.d ft1, 1.1754943508222875079687365372222456778186655567720875215087517062784 # CHECK-NO-RV32: error: operand must be a valid floating-point constant fli.h ft1, 1.1754943508222875079687365372222456778186655567720875215087517062784172594547271728515625e-38 +# Don't accept single precision minimum for quad. +# CHECK-NO-RV64: error: operand must be a valid floating-point constant +# CHECK-NO-RV32: error: operand must be a valid floating-point constant +fli.q ft1, 1.1754943508222875079687365372222456778186655567720875215087517062784172594547271728515625e-38 + # Don't accept integers. 
# CHECK-NO-RV32: error: invalid floating point immediate # CHECK-NO-RV64: error: invalid floating point immediate diff --git a/llvm/test/MC/RISCV/zfa-quad-invalid.s b/llvm/test/MC/RISCV/zfa-quad-invalid.s new file mode 100644 index 0000000000000..3ca89c6ebe627 --- /dev/null +++ b/llvm/test/MC/RISCV/zfa-quad-invalid.s @@ -0,0 +1,42 @@ +# RUN: not llvm-mc -triple riscv32 -mattr=+zfa,+zfh \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK-NO-EXTQ %s +# RUN: not llvm-mc -triple riscv64 -mattr=+zfa,+zfh \ +# RUN: -M no-aliases -show-encoding < %s 2>&1 \ +# RUN: | FileCheck -check-prefixes=CHECK-NO-EXTQ %s + +# CHECK-NO-EXTQ: error: instruction requires the following: 'Q' (Quad-Precision Floating-Point){{$}} +fminm.q fa0, fa1, fa2 + +# CHECK-NO-EXTQ: error: instruction requires the following: 'Q' (Quad-Precision Floating-Point){{$}} +fmaxm.q fs3, fs4, fs5 + +# CHECK-NO-EXTQ: error: instruction requires the following: 'Q' (Quad-Precision Floating-Point){{$}} +fround.q fs1, fs2 + +# CHECK-NO-EXTQ: error: instruction requires the following: 'Q' (Quad-Precision Floating-Point){{$}} +fround.q fs1, fs2, dyn + +# CHECK-NO-EXTQ: error: instruction requires the following: 'Q' (Quad-Precision Floating-Point){{$}} +fround.q fs1, fs2, rtz + +# CHECK-NO-EXTQ: error: instruction requires the following: 'Q' (Quad-Precision Floating-Point){{$}} +fround.q fs1, fs2, rne + +# CHECK-NO-EXTQ: error: instruction requires the following: 'Q' (Quad-Precision Floating-Point){{$}} +froundnx.q fs1, fs2 + +# CHECK-NO-EXTQ: error: instruction requires the following: 'Q' (Quad-Precision Floating-Point){{$}} +froundnx.q fs1, fs2, dyn + +# CHECK-NO-EXTQ: error: instruction requires the following: 'Q' (Quad-Precision Floating-Point){{$}} +froundnx.q fs1, fs2, rtz + +# CHECK-NO-EXTQ: error: instruction requires the following: 'Q' (Quad-Precision Floating-Point){{$}} +froundnx.q fs1, fs2, rne + +# CHECK-NO-EXTQ: error: instruction requires the following: 'Q' 
(Quad-Precision Floating-Point){{$}} +fltq.q a1, fs1, fs2 + +# CHECK-NO-EXTQ: error: instruction requires the following: 'Q' (Quad-Precision Floating-Point){{$}} +fleq.q a1, ft1, ft2 diff --git a/llvm/test/MC/RISCV/zfa-valid.s b/llvm/test/MC/RISCV/zfa-valid.s index 6e78a4c0f2584..edf830642c263 100644 --- a/llvm/test/MC/RISCV/zfa-valid.s +++ b/llvm/test/MC/RISCV/zfa-valid.s @@ -1,18 +1,18 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+d,+zfh -M no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfa,+q,+zfh -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+d,+zfh -M no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfa,+q,+zfh -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+d,+zfh < %s \ -# RUN: | llvm-objdump --mattr=+zfa,+d,+zfh -M no-aliases -d -r - \ +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfa,+q,+zfh < %s \ +# RUN: | llvm-objdump --mattr=+zfa,+q,+zfh -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfa,+d,+zfh < %s \ -# RUN: | llvm-objdump --mattr=+zfa,+d,+zfh -M no-aliases -d -r - \ +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfa,+q,+zfh < %s \ +# RUN: | llvm-objdump --mattr=+zfa,+q,+zfh -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s # -# RUN: not llvm-mc -triple riscv32 -mattr=+d,+zfh \ +# RUN: not llvm-mc -triple riscv32 -mattr=+q,+zfh \ # RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s -# RUN: not llvm-mc -triple riscv64 -mattr=+d,+zfh \ +# RUN: not llvm-mc -triple riscv64 -mattr=+q,+zfh \ # RUN: -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s @@ -933,6 +933,311 @@ fli.h ft1, INF # 
CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} fli.h ft1, nan +# CHECK-ASM-AND-OBJ: fli.q ft1, -1.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x10,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, -1.000000e+00 + +# CHECK-ASM-AND-OBJ: fli.q ft1, -1.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x10,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, -0x1p+0 + +# CHECK-ASM-AND-OBJ: fli.q ft1, min +# CHECK-ASM: encoding: [0xd3,0x80,0x10,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, min + +# CHECK-ASM-AND-OBJ: fli.q ft1, 1.52587890625e-05 +# CHECK-ASM: encoding: [0xd3,0x00,0x11,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 1.52587890625e-05 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 1.52587890625e-05 +# CHECK-ASM: encoding: [0xd3,0x00,0x11,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p-16 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 3.0517578125e-05 +# CHECK-ASM: encoding: [0xd3,0x80,0x11,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 3.0517578125e-05 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 3.0517578125e-05 +# CHECK-ASM: encoding: [0xd3,0x80,0x11,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p-15 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.00390625 +# CHECK-ASM: encoding: [0xd3,0x00,0x12,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 3.906250e-03 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.00390625 +# CHECK-ASM: encoding: [0xd3,0x00,0x12,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional 
Floating-Point){{$}} +fli.q ft1, 0x1p-8 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.0078125 +# CHECK-ASM: encoding: [0xd3,0x80,0x12,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 7.812500e-03 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.0078125 +# CHECK-ASM: encoding: [0xd3,0x80,0x12,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p-7 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.0625 +# CHECK-ASM: encoding: [0xd3,0x00,0x13,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 6.250000e-02 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.0625 +# CHECK-ASM: encoding: [0xd3,0x00,0x13,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p-4 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.125 +# CHECK-ASM: encoding: [0xd3,0x80,0x13,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 1.250000e-01 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.125 +# CHECK-ASM: encoding: [0xd3,0x80,0x13,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p-3 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.25 +# CHECK-ASM: encoding: [0xd3,0x00,0x14,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 2.500000e-01 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.25 +# CHECK-ASM: encoding: [0xd3,0x00,0x14,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p-2 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.3125 +# CHECK-ASM: encoding: [0xd3,0x80,0x14,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 3.125000e-01 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.3125 +# CHECK-ASM: encoding: 
[0xd3,0x80,0x14,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1.4p-2 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.375 +# CHECK-ASM: encoding: [0xd3,0x00,0x15,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 3.750000e-01 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.375 +# CHECK-ASM: encoding: [0xd3,0x00,0x15,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1.8p-2 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.4375 +# CHECK-ASM: encoding: [0xd3,0x80,0x15,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 4.375000e-01 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.4375 +# CHECK-ASM: encoding: [0xd3,0x80,0x15,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1.cp-2 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.5 +# CHECK-ASM: encoding: [0xd3,0x00,0x16,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 5.000000e-01 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.5 +# CHECK-ASM: encoding: [0xd3,0x00,0x16,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p-1 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.625 +# CHECK-ASM: encoding: [0xd3,0x80,0x16,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 6.250000e-01 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.625 +# CHECK-ASM: encoding: [0xd3,0x80,0x16,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1.4p-1 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.75 +# CHECK-ASM: encoding: [0xd3,0x00,0x17,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} 
+fli.q ft1, 7.500000e-01 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.75 +# CHECK-ASM: encoding: [0xd3,0x00,0x17,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1.8p-1 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.875 +# CHECK-ASM: encoding: [0xd3,0x80,0x17,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 8.750000e-01 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 0.875 +# CHECK-ASM: encoding: [0xd3,0x80,0x17,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1.cp-1 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 1.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x18,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 1.000000e+00 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 1.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x18,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p+0 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 1.25 +# CHECK-ASM: encoding: [0xd3,0x80,0x18,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 1.250000e+00 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 1.25 +# CHECK-ASM: encoding: [0xd3,0x80,0x18,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1.4p+0 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 1.5 +# CHECK-ASM: encoding: [0xd3,0x00,0x19,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 1.500000e+00 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 1.5 +# CHECK-ASM: encoding: [0xd3,0x00,0x19,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1.8p+0 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 1.75 +# CHECK-ASM: encoding: [0xd3,0x80,0x19,0xf6] +# CHECK-NO-EXT: 
error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 1.750000e+00 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 1.75 +# CHECK-ASM: encoding: [0xd3,0x80,0x19,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1.cp+0 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 2.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x1a,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 2.000000e+00 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 2.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x1a,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p+1 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 2.5 +# CHECK-ASM: encoding: [0xd3,0x80,0x1a,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 2.500000e+00 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 2.5 +# CHECK-ASM: encoding: [0xd3,0x80,0x1a,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1.4p+1 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 3.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x1b,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 3.000000e+00 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 3.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x1b,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1.8p+1 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 4.0 +# CHECK-ASM: encoding: [0xd3,0x80,0x1b,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 4.000000e+00 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 4.0 +# CHECK-ASM: encoding: [0xd3,0x80,0x1b,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p+2 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 
8.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x1c,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 8.000000e+00 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 8.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x1c,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p+3 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 16.0 +# CHECK-ASM: encoding: [0xd3,0x80,0x1c,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 1.600000e+01 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 16.0 +# CHECK-ASM: encoding: [0xd3,0x80,0x1c,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p+4 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 128.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x1d,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 1.280000e+02 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 128.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x1d,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p+7 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 256.0 +# CHECK-ASM: encoding: [0xd3,0x80,0x1d,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 2.560000e+02 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 256.0 +# CHECK-ASM: encoding: [0xd3,0x80,0x1d,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p+8 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 32768.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x1e,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 3.276800e+04 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 32768.0 +# CHECK-ASM: encoding: [0xd3,0x00,0x1e,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' 
(Additional Floating-Point){{$}} +fli.q ft1, 0x1p+15 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 65536.0 +# CHECK-ASM: encoding: [0xd3,0x80,0x1e,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 6.553600e+04 + +# CHECK-ASM-AND-OBJ: fli.q ft1, 65536.0 +# CHECK-ASM: encoding: [0xd3,0x80,0x1e,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, 0x1p+16 + +# CHECK-ASM-AND-OBJ: fli.q ft1, inf +# CHECK-ASM: encoding: [0xd3,0x00,0x1f,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, INF + +# CHECK-ASM-AND-OBJ: fli.q ft1, nan +# CHECK-ASM: encoding: [0xd3,0x80,0x1f,0xf6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fli.q ft1, nan + # CHECK-ASM-AND-OBJ: fminm.s fa0, fa1, fa2 # CHECK-ASM: encoding: [0x53,0xa5,0xc5,0x28] # CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} @@ -963,6 +1268,16 @@ fminm.h fa0, fa1, fa2 # CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} fmaxm.h fs3, fs4, fs5 +# CHECK-ASM-AND-OBJ: fminm.q fa0, fa1, fa2 +# CHECK-ASM: encoding: [0x53,0xa5,0xc5,0x2e] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fminm.q fa0, fa1, fa2 + +# CHECK-ASM-AND-OBJ: fmaxm.q fs3, fs4, fs5 +# CHECK-ASM: encoding: [0xd3,0x39,0x5a,0x2f] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fmaxm.q fs3, fs4, fs5 + # CHECK-ASM-AND-OBJ: fround.s fs1, fs2, dyn # CHECK-ASM: encoding: [0xd3,0x74,0x49,0x40] # CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} @@ -1083,6 +1398,46 @@ froundnx.h ft1, fa1, rtz # CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} froundnx.h 
fs1, fs2, rne +# CHECK-ASM-AND-OBJ: fround.q fs1, fs2, dyn +# CHECK-ASM: encoding: [0xd3,0x74,0x49,0x46] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fround.q fs1, fs2 + +# CHECK-ASM-AND-OBJ: fround.q fs1, fs2, dyn +# CHECK-ASM: encoding: [0xd3,0x74,0x49,0x46] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fround.q fs1, fs2, dyn + +# CHECK-ASM-AND-OBJ: fround.q fs1, fs2, rtz +# CHECK-ASM: encoding: [0xd3,0x14,0x49,0x46] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fround.q fs1, fs2, rtz + +# CHECK-ASM-AND-OBJ: fround.q fs1, fs2, rne +# CHECK-ASM: encoding: [0xd3,0x04,0x49,0x46] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fround.q fs1, fs2, rne + +# CHECK-ASM-AND-OBJ: froundnx.q fs1, fs2, dyn +# CHECK-ASM: encoding: [0xd3,0x74,0x59,0x46] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +froundnx.q fs1, fs2 + +# CHECK-ASM-AND-OBJ: froundnx.q fs1, fs2, dyn +# CHECK-ASM: encoding: [0xd3,0x74,0x59,0x46] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +froundnx.q fs1, fs2, dyn + +# CHECK-ASM-AND-OBJ: froundnx.q fs1, fs2, rtz +# CHECK-ASM: encoding: [0xd3,0x14,0x59,0x46] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +froundnx.q fs1, fs2, rtz + +# CHECK-ASM-AND-OBJ: froundnx.q fs1, fs2, rne +# CHECK-ASM: encoding: [0xd3,0x04,0x59,0x46] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +froundnx.q fs1, fs2, rne + # CHECK-ASM-AND-OBJ: fcvtmod.w.d a1, ft1, rtz # CHECK-ASM: encoding: [0xd3,0x95,0x80,0xc2] # CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} @@ -1147,3 +1502,23 @@ fgtq.h a1, fs1, fs2 # CHECK-ASM: 
encoding: [0xd3,0x45,0x11,0xa4] # CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} fgeq.h a1, ft1, ft2 + +# CHECK-ASM-AND-OBJ: fltq.q a1, fs1, fs2 +# CHECK-ASM: encoding: [0xd3,0xd5,0x24,0xa7] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fltq.q a1, fs1, fs2 + +# CHECK-ASM-AND-OBJ: fleq.q a1, ft1, ft2 +# CHECK-ASM: encoding: [0xd3,0xc5,0x20,0xa6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fleq.q a1, ft1, ft2 + +# CHECK-ASM-AND-OBJ: fltq.q a1, fs2, fs1 +# CHECK-ASM: encoding: [0xd3,0x55,0x99,0xa6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fgtq.q a1, fs1, fs2 + +# CHECK-ASM-AND-OBJ: fleq.q a1, ft2, ft1 +# CHECK-ASM: encoding: [0xd3,0x45,0x11,0xa6] +# CHECK-NO-EXT: error: instruction requires the following: 'Zfa' (Additional Floating-Point){{$}} +fgeq.q a1, ft1, ft2 diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td index 2f877029c8396..8270de5eb2132 100644 --- a/llvm/test/TableGen/directive1.td +++ b/llvm/test/TableGen/directive1.td @@ -84,6 +84,14 @@ def TDL_DirA : Directive<"dira"> { // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Category_enumSize = 6; // CHECK-EMPTY: +// CHECK-NEXT: enum class SourceLanguage : uint32_t { +// CHECK-NEXT: C = 1U, +// CHECK-NEXT: Fortran = 2U, +// CHECK-NEXT: LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/Fortran) +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: static constexpr std::size_t SourceLanguage_enumSize = 2; +// CHECK-EMPTY: // CHECK-NEXT: enum class Directive { // CHECK-NEXT: TDLD_dira, // CHECK-NEXT: }; @@ -129,6 +137,7 @@ def TDL_DirA : Directive<"dira"> { // CHECK-NEXT: constexpr std::size_t getMaxLeafCount() { return 0; } // CHECK-NEXT: LLVM_ABI Association getDirectiveAssociation(Directive D); // CHECK-NEXT: LLVM_ABI Category getDirectiveCategory(Directive D); +// 
CHECK-NEXT: LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D); // CHECK-NEXT: LLVM_ABI AKind getAKind(StringRef); // CHECK-NEXT: LLVM_ABI llvm::StringRef getTdlAKindName(AKind); // CHECK-EMPTY: @@ -390,6 +399,14 @@ def TDL_DirA : Directive<"dira"> { // IMPL-NEXT: llvm_unreachable("Unexpected directive"); // IMPL-NEXT: } // IMPL-EMPTY: +// IMPL-NEXT: llvm::tdl::SourceLanguage llvm::tdl::getDirectiveLanguages(llvm::tdl::Directive D) { +// IMPL-NEXT: switch (D) { +// IMPL-NEXT: case llvm::tdl::TDLD_dira: +// IMPL-NEXT: return llvm::tdl::SourceLanguage::C | llvm::tdl::SourceLanguage::Fortran; +// IMPL-NEXT: } // switch(D) +// IMPL-NEXT: llvm_unreachable("Unexpected directive"); +// IMPL-NEXT: } +// IMPL-EMPTY: // IMPL-NEXT: static_assert(sizeof(llvm::tdl::Directive) == sizeof(int)); // IMPL-NEXT: {{.*}} static const llvm::tdl::Directive LeafConstructTable[][2] = { // IMPL-NEXT: {llvm::tdl::TDLD_dira, static_cast(0),}, diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td index 3f1a44cfdd4f9..58740cb8e1d96 100644 --- a/llvm/test/TableGen/directive2.td +++ b/llvm/test/TableGen/directive2.td @@ -75,6 +75,14 @@ def TDL_DirA : Directive<"dira"> { // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Category_enumSize = 6; // CHECK-EMPTY: +// CHECK-NEXT: enum class SourceLanguage : uint32_t { +// CHECK-NEXT: C = 1U, +// CHECK-NEXT: Fortran = 2U, +// CHECK-NEXT: LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/Fortran) +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: static constexpr std::size_t SourceLanguage_enumSize = 2; +// CHECK-EMPTY: // CHECK-NEXT: enum class Directive { // CHECK-NEXT: TDLD_dira, // CHECK-NEXT: }; @@ -105,6 +113,7 @@ def TDL_DirA : Directive<"dira"> { // CHECK-NEXT: constexpr std::size_t getMaxLeafCount() { return 0; } // CHECK-NEXT: LLVM_ABI Association getDirectiveAssociation(Directive D); // CHECK-NEXT: LLVM_ABI Category getDirectiveCategory(Directive D); +// CHECK-NEXT: LLVM_ABI SourceLanguage 
getDirectiveLanguages(Directive D); // CHECK-NEXT: } // namespace tdl // CHECK-NEXT: } // namespace llvm // CHECK-NEXT: #endif // LLVM_Tdl_INC @@ -321,6 +330,14 @@ def TDL_DirA : Directive<"dira"> { // IMPL-NEXT: llvm_unreachable("Unexpected directive"); // IMPL-NEXT: } // IMPL-EMPTY: +// IMPL-NEXT: llvm::tdl::SourceLanguage llvm::tdl::getDirectiveLanguages(llvm::tdl::Directive D) { +// IMPL-NEXT: switch (D) { +// IMPL-NEXT: case llvm::tdl::TDLD_dira: +// IMPL-NEXT: return llvm::tdl::SourceLanguage::C | llvm::tdl::SourceLanguage::Fortran; +// IMPL-NEXT: } // switch(D) +// IMPL-NEXT: llvm_unreachable("Unexpected directive"); +// IMPL-NEXT: } +// IMPL-EMPTY: // IMPL-NEXT: static_assert(sizeof(llvm::tdl::Directive) == sizeof(int)); // IMPL-NEXT: {{.*}} static const llvm::tdl::Directive LeafConstructTable[][2] = { // IMPL-NEXT: {llvm::tdl::TDLD_dira, static_cast(0),}, diff --git a/llvm/test/ThinLTO/X86/cache-emit-asm.ll b/llvm/test/ThinLTO/X86/cache-emit-asm.ll new file mode 100644 index 0000000000000..b6e5ca25a637d --- /dev/null +++ b/llvm/test/ThinLTO/X86/cache-emit-asm.ll @@ -0,0 +1,15 @@ +;; This test runs thin LTO with cache only to look for memory errors, either +;; as crashes or sanitizer errors. MCAsmStreamer has specific assumptions about +;; the lifetime of the output stream that are easy to overlook (see #138194). 
+ +; RUN: rm -rf %t && mkdir -p %t +; RUN: opt -module-hash -module-summary -thinlto-bc %s -o %t1.bc +; RUN: llvm-lto2 run -cache-dir %t/cache --filetype=asm -o %t.o %t1.bc -r=%t1.bc,globalfunc + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @globalfunc() { +entry: + ret void +} diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addr-reuse.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addr-reuse.ll new file mode 100644 index 0000000000000..019f311406550 --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addr-reuse.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -p 'require,codegenprepare' -cgpp-huge-func=0 < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev4-linux-gnu" + +declare void @g(ptr) + +; %load and %load5 use the same address, %load5 is optimized first, %load is +; optimized later and reuse the same address computation instruction. We must +; make sure not to generate use before def error. 
+ +define void @f(ptr %arg) { +; CHECK-LABEL: define void @f( +; CHECK-SAME: ptr [[ARG:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i8, ptr [[ARG]], i64 -64 +; CHECK-NEXT: call void @g(ptr [[GETELEMENTPTR]]) +; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, ptr [[ARG]], i64 -64 +; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[SUNKADDR1]], align 8 +; CHECK-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[ARG]], i64 -56 +; CHECK-NEXT: [[LOAD4:%.*]] = load i32, ptr [[SUNKADDR]], align 8 +; CHECK-NEXT: [[LOAD5:%.*]] = load ptr, ptr [[SUNKADDR1]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 1, i32 0) +; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0 +; CHECK-NEXT: ret void +; +bb: + %getelementptr = getelementptr i8, ptr %arg, i64 -64 + %getelementptr1 = getelementptr i8, ptr %arg, i64 -56 + call void @g(ptr %getelementptr) + br label %bb3 + +bb3: + %load = load ptr, ptr %getelementptr, align 8 + %load4 = load i32, ptr %getelementptr1, align 8 + %load5 = load ptr, ptr %getelementptr, align 8 + %add = add i32 1, 0 + %icmp = icmp eq i32 %add, 0 + br i1 %icmp, label %bb7, label %bb7 + +bb7: + ret void +} diff --git a/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll b/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll index 8bd0b4100cff9..deefe05ddb317 100644 --- a/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll +++ b/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll @@ -6,8 +6,7 @@ define i1 @test_second_and_condition_implied_by_first(i8 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X:%.*]], 10 ; CHECK-NEXT: [[T_1:%.*]] = icmp ugt i8 [[X]], 5 -; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_1]], true -; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 false 
; CHECK: else: @@ -31,8 +30,7 @@ define i1 @test_first_and_condition_implied_by_second_ops(i8 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X:%.*]], 10 ; CHECK-NEXT: [[T_1:%.*]] = icmp ugt i8 [[X]], 5 -; CHECK-NEXT: [[AND:%.*]] = and i1 true, [[C_1]] -; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 false ; CHECK: else: @@ -56,8 +54,7 @@ define i1 @test_second_and_condition_implied_by_first_select_form(i8 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X:%.*]], 10 ; CHECK-NEXT: [[T_1:%.*]] = icmp ugt i8 [[X]], 5 -; CHECK-NEXT: [[AND:%.*]] = select i1 [[C_1]], i1 true, i1 false -; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 false ; CHECK: else: @@ -105,8 +102,7 @@ define i1 @test_same_cond_for_and(i8 %x) { ; CHECK-LABEL: @test_same_cond_for_and( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X:%.*]], 10 -; CHECK-NEXT: [[AND:%.*]] = and i1 true, [[C_1]] -; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 false ; CHECK: else: @@ -152,8 +148,7 @@ define i1 @test_second_and_condition_not_implied_by_first(i8 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X:%.*]], 10 ; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i8 [[X]], 5 -; CHECK-NEXT: [[AND:%.*]] = and i1 true, [[C_1]] -; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 false ; CHECK: else: @@ -371,8 +366,7 @@ define i1 @test_and_used_in_false_branch(i8 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X:%.*]], 10 ; CHECK-NEXT: [[T_1:%.*]] = 
icmp ugt i8 [[X]], 5 -; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_1]], true -; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 true ; CHECK: else: @@ -397,8 +391,7 @@ define i1 @test_or_used_in_false_branch(i8 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ule i8 [[X:%.*]], 10 ; CHECK-NEXT: [[T_1:%.*]] = icmp ule i8 [[X]], 5 -; CHECK-NEXT: [[AND:%.*]] = or i1 [[C_1]], false -; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 [[T_1]] ; CHECK: else: @@ -423,8 +416,7 @@ define i1 @test_or_used_in_false_branch2(i8 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X:%.*]], 10 ; CHECK-NEXT: [[T_1:%.*]] = icmp ugt i8 [[X]], 5 -; CHECK-NEXT: [[AND:%.*]] = or i1 false, [[T_1]] -; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[T_1]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 [[T_1]] ; CHECK: else: @@ -450,8 +442,7 @@ define i1 @and_select_first_implies_second_may_be_poison(ptr noundef %A, ptr nou ; CHECK-NEXT: [[C_1:%.*]] = icmp ne ptr [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds ptr, ptr [[B]], i64 -1 ; CHECK-NEXT: [[C_2:%.*]] = icmp ugt ptr [[GEP]], [[A]] -; CHECK-NEXT: [[AND:%.*]] = select i1 [[C_2]], i1 true, i1 false -; CHECK-NEXT: ret i1 [[AND]] +; CHECK-NEXT: ret i1 [[C_2]] ; entry: %c.1 = icmp ne ptr %A, %B @@ -504,8 +495,7 @@ define void @and_tree_second_implies_first(i32 noundef %v0, i32 noundef %v1, i32 ; CHECK-NEXT: [[CMP1:%.*]] = icmp sge i32 [[V1]], [[V2:%.*]] ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[CMP0]], [[CMP1]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[V0]], [[V2]] -; CHECK-NEXT: [[AND2:%.*]] = and i1 false, [[AND1]] -; CHECK-NEXT: br i1 [[AND2]], label [[IF_THEN:%.*]], label 
[[RETURN:%.*]] +; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[RETURN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: call void @side_effect() ; CHECK-NEXT: br label [[RETURN]] @@ -535,8 +525,7 @@ define void @and_tree_second_implies_first_perm1(i32 noundef %v0, i32 noundef %v ; CHECK-NEXT: [[CMP1:%.*]] = icmp sge i32 [[V1]], [[V2:%.*]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[V0]], [[V2]] ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[CMP2]], [[CMP1]] -; CHECK-NEXT: [[AND2:%.*]] = and i1 false, [[AND1]] -; CHECK-NEXT: br i1 [[AND2]], label [[IF_THEN:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[RETURN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: call void @side_effect() ; CHECK-NEXT: br label [[RETURN]] @@ -567,8 +556,7 @@ define void @and_tree_second_implies_first_perm2(i32 noundef %v0, i32 noundef %v ; CHECK-NEXT: [[CMP1:%.*]] = icmp sge i32 [[V1]], [[V2:%.*]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[V0]], [[V2]] ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[CMP0]], [[CMP2]] -; CHECK-NEXT: [[AND2:%.*]] = and i1 false, [[AND1]] -; CHECK-NEXT: br i1 [[AND2]], label [[IF_THEN:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[RETURN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: call void @side_effect() ; CHECK-NEXT: br label [[RETURN]] @@ -629,8 +617,7 @@ define void @or_tree_second_implies_first(i32 noundef %v0, i32 noundef %v1, i32 ; CHECK-NEXT: [[CMP1:%.*]] = icmp sge i32 [[V1]], [[V2:%.*]] ; CHECK-NEXT: [[AND1:%.*]] = or i1 [[CMP0]], [[CMP1]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[V0]], [[V2]] -; CHECK-NEXT: [[AND2:%.*]] = or i1 true, [[AND1]] -; CHECK-NEXT: br i1 [[AND2]], label [[IF_THEN:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: br i1 true, label [[IF_THEN:%.*]], label [[RETURN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: call void @side_effect() ; CHECK-NEXT: br label [[RETURN]] @@ -659,8 +646,7 @@ define void @or_tree_second_implies_first_with_unknown_cond(i64 %x, i1 %cond) { ; CHECK-NEXT: [[CMP1:%.*]] = 
icmp ugt i64 [[X:%.*]], 1 ; CHECK-NEXT: [[OR1:%.*]] = select i1 [[CMP1]], i1 [[COND:%.*]], i1 false ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[X]], 2 -; CHECK-NEXT: [[OR2:%.*]] = select i1 [[OR1]], i1 false, i1 false -; CHECK-NEXT: br i1 [[OR2]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: ; CHECK-NEXT: call void @side_effect() ; CHECK-NEXT: br label [[IF_END]] diff --git a/llvm/test/Transforms/ConstraintElimination/eq.ll b/llvm/test/Transforms/ConstraintElimination/eq.ll index 04cd39490cdef..511a08f7796a3 100644 --- a/llvm/test/Transforms/ConstraintElimination/eq.ll +++ b/llvm/test/Transforms/ConstraintElimination/eq.ll @@ -432,8 +432,7 @@ define i1 @test_eq_for_signed_cmp(i32 noundef %v0, i32 noundef %v1, i32 noundef ; CHECK-NEXT: [[CMP1:%.*]] = icmp sge i32 [[V0]], [[V1:%.*]] ; CHECK-NEXT: [[AND0:%.*]] = and i1 [[CMP1]], [[CMP]] ; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[V1]], [[V2]] -; CHECK-NEXT: [[AND1:%.*]] = and i1 false, [[AND0]] -; CHECK-NEXT: ret i1 [[AND1]] +; CHECK-NEXT: ret i1 false ; entry: %cmp = icmp eq i32 %v2, %v0 @@ -457,8 +456,7 @@ define i1 @test_eq_for_signed_cmp_with_decompsition(i32 noundef %v0, i32 noundef ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[AND0]], [[CMP2]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[AND1]], [[CMP3]] ; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[V1]], [[V2]] -; CHECK-NEXT: [[AND3:%.*]] = and i1 false, [[AND2]] -; CHECK-NEXT: ret i1 [[AND3]] +; CHECK-NEXT: ret i1 false ; entry: %v0add = add nsw i32 %v0, %addend0 diff --git a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-signed-predicates.ll b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-signed-predicates.ll index 52094914f6962..c9f4984bcba60 100644 --- a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-signed-predicates.ll +++ b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-signed-predicates.ll @@ -611,14 +611,12 @@ define i4 
@ptr_N_signed_positive_assume(ptr %src, ptr %lower, ptr %upper, i16 %N ; CHECK: step.check: ; CHECK-NEXT: [[STEP_POS:%.*]] = icmp sge i16 [[STEP:%.*]], 0 ; CHECK-NEXT: [[STEP_SLT_N:%.*]] = icmp slt i16 [[STEP]], [[N]] -; CHECK-NEXT: [[AND_STEP:%.*]] = and i1 false, [[STEP_SLT_N]] -; CHECK-NEXT: br i1 [[AND_STEP]], label [[PTR_CHECK:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 false, label [[PTR_CHECK:%.*]], label [[EXIT:%.*]] ; CHECK: ptr.check: ; CHECK-NEXT: [[SRC_STEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i16 [[STEP]] ; CHECK-NEXT: [[CMP_STEP_START:%.*]] = icmp slt ptr [[SRC_STEP]], [[LOWER]] ; CHECK-NEXT: [[CMP_STEP_END:%.*]] = icmp sge ptr [[SRC_STEP]], [[UPPER]] -; CHECK-NEXT: [[OR_CHECK:%.*]] = or i1 true, [[CMP_STEP_END]] -; CHECK-NEXT: br i1 [[OR_CHECK]], label [[TRAP_BB]], label [[EXIT]] +; CHECK-NEXT: br i1 true, label [[TRAP_BB]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret i4 3 ; diff --git a/llvm/test/Transforms/ConstraintElimination/geps-precondition-overflow-check.ll b/llvm/test/Transforms/ConstraintElimination/geps-precondition-overflow-check.ll index 08b25c6065aac..d90b986c8e539 100644 --- a/llvm/test/Transforms/ConstraintElimination/geps-precondition-overflow-check.ll +++ b/llvm/test/Transforms/ConstraintElimination/geps-precondition-overflow-check.ll @@ -36,8 +36,7 @@ define i1 @overflow_check_2_and(ptr %dst) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DST_5:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 5 ; CHECK-NEXT: [[DST_5_UGE:%.*]] = icmp uge ptr [[DST_5]], [[DST]] -; CHECK-NEXT: [[AND:%.*]] = and i1 true, [[DST_5_UGE]] -; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[DST_5_UGE]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: [[DST_4:%.*]] = getelementptr i32, ptr [[DST]], i64 4 ; CHECK-NEXT: [[TRUE_DST_4_UGE:%.*]] = icmp uge ptr [[DST_4]], [[DST]] @@ -65,8 +64,7 @@ define i1 @overflow_check_3_and(ptr %dst) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DST_5:%.*]] = 
getelementptr i32, ptr [[DST:%.*]], i64 5 ; CHECK-NEXT: [[DST_5_UGE:%.*]] = icmp uge ptr [[DST_5]], [[DST]] -; CHECK-NEXT: [[AND:%.*]] = and i1 true, [[DST_5_UGE]] -; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[DST_5_UGE]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: [[DST_4:%.*]] = getelementptr i32, ptr [[DST]], i64 4 ; CHECK-NEXT: [[DST_4_UGE:%.*]] = icmp uge ptr [[DST_4]], [[DST]] @@ -98,8 +96,7 @@ define i1 @overflow_check_4_and(ptr %dst) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DST_5:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 5 ; CHECK-NEXT: [[DST_5_UGE:%.*]] = icmp uge ptr [[DST_5]], [[DST]] -; CHECK-NEXT: [[AND:%.*]] = and i1 true, [[DST_5_UGE]] -; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[DST_5_UGE]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: [[DST_4:%.*]] = getelementptr i32, ptr [[DST]], i64 4 ; CHECK-NEXT: [[TRUE_DST_4_UGE:%.*]] = icmp uge ptr [[DST_4]], [[DST]] @@ -152,8 +149,7 @@ define i1 @overflow_check_3_or(ptr %dst) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DST_5:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 5 ; CHECK-NEXT: [[DST_5_UGE:%.*]] = icmp uge ptr [[DST_5]], [[DST]] -; CHECK-NEXT: [[OR:%.*]] = or i1 false, [[DST_5_UGE]] -; CHECK-NEXT: br i1 [[OR]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[DST_5_UGE]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: [[DST_4:%.*]] = getelementptr i32, ptr [[DST]], i64 4 ; CHECK-NEXT: [[TRUE_DST_4_UGE:%.*]] = icmp uge ptr [[DST_4]], [[DST]] diff --git a/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll b/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll index 279238bea1842..91546d4abf438 100644 --- a/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll +++ b/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll @@ -23,8 
+23,7 @@ define void @checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8 %n) ; CHECK-NEXT: [[PTR_IV:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV]] ; CHECK-NEXT: [[CMP_PTR_IV_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV]] ; CHECK-NEXT: [[CMP_PTR_IV_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV]] -; CHECK-NEXT: [[OR:%.*]] = or i1 false, [[CMP_PTR_IV_UPPER]] -; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_LATCH]] +; CHECK-NEXT: br i1 [[CMP_PTR_IV_UPPER]], label [[TRAP]], label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: store i8 0, ptr [[PTR_IV]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i16 [[IV]], 1 @@ -88,14 +87,13 @@ define void @some_checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8 ; CHECK-NEXT: [[PTR_IV:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV]] ; CHECK-NEXT: [[CMP_PTR_IV_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV]] ; CHECK-NEXT: [[CMP_PTR_IV_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV]] -; CHECK-NEXT: [[OR:%.*]] = or i1 false, [[CMP_PTR_IV_UPPER]] -; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_BODY:%.*]] +; CHECK-NEXT: br i1 [[CMP_PTR_IV_UPPER]], label [[TRAP]], label [[LOOP_BODY:%.*]] ; CHECK: loop.body: ; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i16 [[IV]], 1 ; CHECK-NEXT: [[PTR_IV_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV_1]] ; CHECK-NEXT: [[CMP_PTR_IV_1_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV_1]] ; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_PTR_IV_1_UPPER]] -; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_LATCH]] +; CHECK-NEXT: br i1 [[CMP_PTR_IV_UPPER]], label [[TRAP]], label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: store i8 0, ptr [[PTR_IV]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i16 [[IV]], 1 @@ -165,14 +163,13 @@ define void @no_checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8 % ; CHECK-NEXT: [[PTR_IV:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV]] ; CHECK-NEXT: 
[[CMP_PTR_IV_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV]] ; CHECK-NEXT: [[CMP_PTR_IV_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV]] -; CHECK-NEXT: [[OR:%.*]] = or i1 false, [[CMP_PTR_IV_UPPER]] -; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_BODY:%.*]] +; CHECK-NEXT: br i1 [[CMP_PTR_IV_UPPER]], label [[TRAP]], label [[LOOP_BODY:%.*]] ; CHECK: loop.body: ; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i16 [[IV]], 1 ; CHECK-NEXT: [[PTR_IV_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV_1]] ; CHECK-NEXT: [[CMP_PTR_IV_1_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV_1]] ; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_PTR_IV_1_UPPER]] -; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_LATCH]] +; CHECK-NEXT: br i1 [[CMP_PTR_IV_UPPER]], label [[TRAP]], label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: store i8 0, ptr [[PTR_IV]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i16 [[IV]], 1 diff --git a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll index 1842ca2d82545..df0cb40965430 100644 --- a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll +++ b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll @@ -212,8 +212,7 @@ define void @test2_with_ne(ptr %src, ptr %lower, ptr %upper, i8 %N) { ; CHECK-NEXT: [[SRC_IV:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[IV]] ; CHECK-NEXT: [[CMP_IV_START:%.*]] = icmp ult ptr [[SRC_IV]], [[LOWER]] ; CHECK-NEXT: [[CMP_IV_END:%.*]] = icmp uge ptr [[SRC_IV]], [[UPPER]] -; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_IV_END]] -; CHECK-NEXT: br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]] +; CHECK-NEXT: br i1 [[CMP_IV_END]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]] ; CHECK: loop.body.1: ; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1 ; CHECK-NEXT: [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 
[[ADD_1]] @@ -306,8 +305,7 @@ define void @test3(ptr %src, ptr %lower, ptr %upper, i8 %N) { ; CHECK-NEXT: [[SRC_IV:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[IV]] ; CHECK-NEXT: [[CMP_IV_START:%.*]] = icmp ult ptr [[SRC_IV]], [[LOWER]] ; CHECK-NEXT: [[CMP_IV_END:%.*]] = icmp uge ptr [[SRC_IV]], [[UPPER]] -; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_IV_END]] -; CHECK-NEXT: br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]] +; CHECK-NEXT: br i1 [[CMP_IV_END]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]] ; CHECK: loop.body.1: ; CHECK-NEXT: [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[NEXT]] ; CHECK-NEXT: [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]] diff --git a/llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll b/llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll index f5c108822b8cd..4303cacc59ed1 100644 --- a/llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll +++ b/llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll @@ -6,8 +6,7 @@ define i1 @test_second_or_condition_implied_by_first(i8 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ule i8 [[X:%.*]], 10 ; CHECK-NEXT: [[T_1:%.*]] = icmp ugt i8 [[X]], 5 -; CHECK-NEXT: [[OR:%.*]] = or i1 true, [[T_1]] -; CHECK-NEXT: br i1 [[OR]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 true, label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 false ; CHECK: else: @@ -31,8 +30,7 @@ define i1 @test_first_or_condition_implied_by_second_ops(i8 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ule i8 [[X:%.*]], 10 ; CHECK-NEXT: [[T_1:%.*]] = icmp ugt i8 [[X]], 5 -; CHECK-NEXT: [[OR:%.*]] = or i1 [[T_1]], true -; CHECK-NEXT: br i1 [[OR]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 true, label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 false ; CHECK: else: @@ -105,8 +103,7 @@ define i1 
@test_same_cond_for_or(i8 %x) { ; CHECK-LABEL: @test_same_cond_for_or( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X:%.*]], 10 -; CHECK-NEXT: [[OR:%.*]] = or i1 false, [[C_1]] -; CHECK-NEXT: br i1 [[OR]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 false ; CHECK: else: @@ -152,8 +149,7 @@ define i1 @test_second_or_condition_not_implied_by_first(i8 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X:%.*]], 10 ; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i8 [[X]], 5 -; CHECK-NEXT: [[OR:%.*]] = or i1 [[C_2]], false -; CHECK-NEXT: br i1 [[OR]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[C_2]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 false ; CHECK: else: @@ -244,8 +240,7 @@ define i1 @test_or_used_in_false_branch(i8 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ule i8 [[X:%.*]], 10 ; CHECK-NEXT: [[T_1:%.*]] = icmp ule i8 [[X]], 5 -; CHECK-NEXT: [[OR:%.*]] = or i1 [[C_1]], false -; CHECK-NEXT: br i1 [[OR]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 [[T_1]] ; CHECK: else: @@ -270,8 +265,7 @@ define i1 @test_or_used_in_false_branch2(i8 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X:%.*]], 10 ; CHECK-NEXT: [[T_1:%.*]] = icmp ugt i8 [[X]], 5 -; CHECK-NEXT: [[OR:%.*]] = or i1 false, [[T_1]] -; CHECK-NEXT: br i1 [[OR]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 [[T_1]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: ret i1 [[T_1]] ; CHECK: else: @@ -307,3 +301,17 @@ entry: %or = select i1 %cmp.eq, i1 true, i1 %cmp.eq.1 ret i1 %or } + +define i1 @test_or_disjoint_set_operand(i8 %x) { +; CHECK-LABEL: @test_or_disjoint_set_operand( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i8 [[X:%.*]], 1 +; CHECK-NEXT: 
[[CMP2:%.*]] = icmp ne i8 [[X]], 0 +; CHECK-NEXT: ret i1 true +; +entry: + %cmp1 = icmp slt i8 %x, 1 + %cmp2 = icmp ne i8 %x, 0 + %or = or disjoint i1 %cmp2, %cmp1 + ret i1 %or +} diff --git a/llvm/test/Transforms/ConstraintElimination/or.ll b/llvm/test/Transforms/ConstraintElimination/or.ll index b401d6f181369..0827669f1bcbe 100644 --- a/llvm/test/Transforms/ConstraintElimination/or.ll +++ b/llvm/test/Transforms/ConstraintElimination/or.ll @@ -124,8 +124,7 @@ define i1 @test_or_chain_ule_1(i4 %x, i4 %y, i4 %z, i4 %a, i4 %b) { ; CHECK-NEXT: [[C_3:%.*]] = icmp ule i4 2, [[X]] ; CHECK-NEXT: [[C_4:%.*]] = icmp ule i4 2, [[A:%.*]] ; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[C_1]], [[C_2]] -; CHECK-NEXT: [[OR_2:%.*]] = or i1 [[OR_1]], true -; CHECK-NEXT: [[OR_3:%.*]] = or i1 [[C_4]], [[OR_2]] +; CHECK-NEXT: [[OR_3:%.*]] = or i1 [[C_4]], true ; CHECK-NEXT: br i1 [[OR_3]], label [[BB1:%.*]], label [[EXIT:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[C_5:%.*]] = icmp ule i4 [[X]], [[Z]] diff --git a/llvm/test/Transforms/Coroutines/gh105595.ll b/llvm/test/Transforms/Coroutines/gh105595.ll new file mode 100644 index 0000000000000..0efe21216e998 --- /dev/null +++ b/llvm/test/Transforms/Coroutines/gh105595.ll @@ -0,0 +1,31 @@ +; Test that store-load operation that crosses suspension point will not be eliminated by DSE +; Coro result conversion function that attempts to modify promise shall produce this pattern +; RUN: opt < %s -passes='coro-early,dse' -S | FileCheck %s + +define void @fn() presplitcoroutine { + %__promise = alloca ptr, align 8 + %id = call token @llvm.coro.id(i32 16, ptr %__promise, ptr @fn, ptr null) + %hdl = call ptr @llvm.coro.begin(token %id, ptr null) +; CHECK: %promise.addr = call ptr @llvm.coro.promise(ptr %hdl, i32 8, i1 false) + %save = call token @llvm.coro.save(ptr null) + %sp = call i8 @llvm.coro.suspend(token %save, i1 false) + %flag = icmp ule i8 %sp, 1 + br i1 %flag, label %resume, label %suspend + +resume: +; CHECK: call void @use_value(ptr %promise.addr) + 
call void @use_value(ptr %__promise) + br label %suspend + +suspend: +; load value when resume +; CHECK: %null = load ptr, ptr %promise.addr, align 8 + %null = load ptr, ptr %__promise, align 8 + call void @use_value(ptr %null) +; store value when suspend +; CHECK: store ptr null, ptr %promise.addr, align 8 + store ptr null, ptr %__promise, align 8 + ret void +} + +declare void @use_value(ptr) diff --git a/llvm/test/Transforms/ForcedFunctionAttrs/open-file-error.ll b/llvm/test/Transforms/ForcedFunctionAttrs/open-file-error.ll new file mode 100644 index 0000000000000..61db001d7eb1e --- /dev/null +++ b/llvm/test/Transforms/ForcedFunctionAttrs/open-file-error.ll @@ -0,0 +1,6 @@ +; RUN: not opt -disable-output -passes='forceattrs' -forceattrs-csv-path="%S/CannotOpenFile.csv" %s 2>&1 | FileCheck -DMSG=%errc_ENOENT %s + +; CHECK: error: cannot open CSV file: [[MSG]] +define void @first_function() { + ret void +} diff --git a/llvm/test/Transforms/GVN/phi.ll b/llvm/test/Transforms/GVN/phi.ll index 5d4f227132a6f..5b607f7559c1b 100644 --- a/llvm/test/Transforms/GVN/phi.ll +++ b/llvm/test/Transforms/GVN/phi.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=gvn < %s | FileCheck %s +; RUN: opt -S -passes=gvn < %s | FileCheck --check-prefixes=CHECK,MDEP %s +; RUN: opt -S -passes='gvn' < %s | FileCheck %s define i64 @test1(i1 %c, i64 %a, i64 %b) { @@ -198,3 +199,5 @@ next: %phi = phi i64 [%a, %merge] ret i64 %phi } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; MDEP: {{.*}} diff --git a/llvm/test/Transforms/GVN/pre-compare.ll b/llvm/test/Transforms/GVN/pre-compare.ll index ea8fbce01bd6c..574d40dfb71d5 100644 --- a/llvm/test/Transforms/GVN/pre-compare.ll +++ b/llvm/test/Transforms/GVN/pre-compare.ll @@ -1,4 +1,6 @@ -; RUN: opt -passes=gvn -S < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=gvn -S < %s | FileCheck --check-prefixes=CHECK,MDEP %s +; RUN: opt -passes='gvn' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s ; C source: ; @@ -37,6 +39,28 @@ @.str3 = private unnamed_addr constant [12 x i8] c"step 2: %d\0A\00", align 1 define void @f(i32 %x) noreturn nounwind uwtable ssp { +; CHECK-LABEL: define void @f( +; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 1 +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_COND_PREHEADER:.*]], label %[[IF_THEN:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[X]], 2 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP1]], ptr @.str, ptr @.str1 +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @puts(ptr [[COND]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: br label %[[FOR_COND_PREHEADER]] +; CHECK: [[FOR_COND_PREHEADER]]: +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[X]], 2 +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[CALL2:%.*]] = tail call i32 @puts(ptr @.str2) #[[ATTR1]] +; CHECK-NEXT: br i1 [[CMP3]], label %[[FOR_COND_BACKEDGE:.*]], label %[[IF_END5:.*]] +; CHECK: [[IF_END5]]: +; CHECK-NEXT: [[CALL6:%.*]] = tail call i32 (ptr, ...) 
@printf(ptr @.str3, i32 [[X]]) #[[ATTR1]] +; CHECK-NEXT: br label %[[FOR_COND_BACKEDGE]] +; CHECK: [[FOR_COND_BACKEDGE]]: +; CHECK-NEXT: br label %[[FOR_COND]] +; entry: %cmp = icmp eq i32 %x, 1 br i1 %cmp, label %for.cond.preheader, label %if.then @@ -66,3 +90,6 @@ for.cond.backedge: ; preds = %if.end5, %for.cond declare i32 @puts(ptr nocapture) nounwind declare i32 @printf(ptr nocapture, ...) nounwind +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; MDEP: {{.*}} +; MSSA: {{.*}} diff --git a/llvm/test/Transforms/GVN/readattrs.ll b/llvm/test/Transforms/GVN/readattrs.ll index b16c53adc0d4d..be018834014d5 100644 --- a/llvm/test/Transforms/GVN/readattrs.ll +++ b/llvm/test/Transforms/GVN/readattrs.ll @@ -1,4 +1,6 @@ -; RUN: opt -passes=gvn -S -o - < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=gvn -S -o - < %s | FileCheck --check-prefixes=CHECK,MDEP %s +; RUN: opt -passes='gvn' -S -o - < %s | FileCheck --check-prefixes=CHECK,MSSA %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-unknown-linux-gnu" @@ -6,12 +8,24 @@ target triple = "x86_64-unknown-linux-gnu" declare void @use(ptr readonly nocapture) define i8 @test() { +; MDEP-LABEL: define i8 @test() { +; MDEP-NEXT: [[A:%.*]] = alloca i8, align 1 +; MDEP-NEXT: store i8 1, ptr [[A]], align 1 +; MDEP-NEXT: call void @use(ptr [[A]]) +; MDEP-NEXT: ret i8 1 +; +; MSSA-LABEL: define i8 @test() { +; MSSA-NEXT: [[A:%.*]] = alloca i8, align 1 +; MSSA-NEXT: store i8 1, ptr [[A]], align 1 +; MSSA-NEXT: call void @use(ptr [[A]]) +; MSSA-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 1 +; MSSA-NEXT: ret i8 [[B]] +; %a = alloca i8 store i8 1, ptr %a call void @use(ptr %a) %b = load i8, ptr %a ret i8 %b -; CHECK-LABEL: define i8 @test( -; CHECK: call void @use(ptr %a) 
-; CHECK-NEXT: ret i8 1 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/GVN/setjmp.ll b/llvm/test/Transforms/GVN/setjmp.ll index 07b7028346760..7777038f89cb1 100644 --- a/llvm/test/Transforms/GVN/setjmp.ll +++ b/llvm/test/Transforms/GVN/setjmp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -passes=gvn < %s | FileCheck %s - +; RUN: opt -S -passes=gvn < %s | FileCheck --check-prefixes=CHECK,MDEP %s +; RUN: opt -S -passes='gvn' -verify-analysis-invalidation < %s | FileCheck --check-prefixes=CHECK,MSSA %s declare i32 @setjmp() returns_twice declare void @longjmp() declare ptr @malloc(i64) @@ -38,18 +38,32 @@ if.end: ; We are still allowed to optimize non-volatile accesses to allocas. define i32 @test_alloca() { -; CHECK-LABEL: define i32 @test_alloca() { -; CHECK-NEXT: [[ALLOC:%.*]] = alloca i43, align 8 -; CHECK-NEXT: store i32 10, ptr [[ALLOC]], align 4 -; CHECK-NEXT: [[SJ:%.*]] = call i32 @setjmp() -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[SJ]], 0 -; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] -; CHECK: [[IF_THEN]]: -; CHECK-NEXT: store i32 20, ptr [[ALLOC]], align 4 -; CHECK-NEXT: call void @longjmp() -; CHECK-NEXT: unreachable -; CHECK: [[IF_END]]: -; CHECK-NEXT: ret i32 10 +; MDEP-LABEL: define i32 @test_alloca() { +; MDEP-NEXT: [[ALLOC:%.*]] = alloca i43, align 8 +; MDEP-NEXT: store i32 10, ptr [[ALLOC]], align 4 +; MDEP-NEXT: [[SJ:%.*]] = call i32 @setjmp() +; MDEP-NEXT: [[CMP:%.*]] = icmp eq i32 [[SJ]], 0 +; MDEP-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; MDEP: [[IF_THEN]]: +; MDEP-NEXT: store i32 20, ptr [[ALLOC]], align 4 +; MDEP-NEXT: call void @longjmp() +; MDEP-NEXT: unreachable +; MDEP: [[IF_END]]: +; MDEP-NEXT: ret i32 10 +; +; MSSA-LABEL: define i32 @test_alloca() { +; MSSA-NEXT: [[ALLOC:%.*]] = alloca i43, align 8 
+; MSSA-NEXT: store i32 10, ptr [[ALLOC]], align 4 +; MSSA-NEXT: [[SJ:%.*]] = call i32 @setjmp() +; MSSA-NEXT: [[CMP:%.*]] = icmp eq i32 [[SJ]], 0 +; MSSA-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; MSSA: [[IF_THEN]]: +; MSSA-NEXT: store i32 20, ptr [[ALLOC]], align 4 +; MSSA-NEXT: call void @longjmp() +; MSSA-NEXT: unreachable +; MSSA: [[IF_END]]: +; MSSA-NEXT: [[RES:%.*]] = load i32, ptr [[ALLOC]], align 4 +; MSSA-NEXT: ret i32 [[RES]] ; %alloc = alloca i43 store i32 10, ptr %alloc, align 4 diff --git a/llvm/test/Transforms/GVN/tbaa.ll b/llvm/test/Transforms/GVN/tbaa.ll index b5dd3867bdbc2..366dfeca8b758 100644 --- a/llvm/test/Transforms/GVN/tbaa.ll +++ b/llvm/test/Transforms/GVN/tbaa.ll @@ -1,12 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=gvn -S < %s | FileCheck %s +; RUN: opt -passes=gvn -S < %s | FileCheck --check-prefixes=CHECK,MDEP %s +; RUN: opt -passes='gvn' -S < %s | FileCheck --check-prefixes=CHECK,MSSA %s define i32 @test1(ptr %p, ptr %q) { -; CHECK-LABEL: define i32 @test1( -; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] -; CHECK-NEXT: ret i32 [[C]] +; MDEP-LABEL: define i32 @test1( +; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0:![0-9]+]] +; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] +; MDEP-NEXT: ret i32 [[C]] +; +; MSSA-LABEL: define i32 @test1( +; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0:![0-9]+]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]) +; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] +; MSSA-NEXT: ret i32 [[C]] ; %a = call i32 @foo(ptr %p), !tbaa !0 %b = call i32 @foo(ptr %p) @@ -15,11 +23,18 @@ define i32 @test1(ptr %p, ptr %q) { } define i32 @test2(ptr %p, ptr %q) { -; CHECK-LABEL: 
define i32 @test2( -; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] -; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] -; CHECK-NEXT: ret i32 [[C]] +; MDEP-LABEL: define i32 @test2( +; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] +; MDEP-NEXT: ret i32 [[C]] +; +; MSSA-LABEL: define i32 @test2( +; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] +; MSSA-NEXT: ret i32 [[C]] ; %a = call i32 @foo(ptr %p), !tbaa !0 %b = call i32 @foo(ptr %p), !tbaa !0 @@ -28,11 +43,18 @@ define i32 @test2(ptr %p, ptr %q) { } define i32 @test3(ptr %p, ptr %q) { -; CHECK-LABEL: define i32 @test3( -; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4:![0-9]+]] -; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] -; CHECK-NEXT: ret i32 [[C]] +; MDEP-LABEL: define i32 @test3( +; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4:![0-9]+]] +; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] +; MDEP-NEXT: ret i32 [[C]] +; +; MSSA-LABEL: define i32 @test3( +; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4:![0-9]+]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4]] +; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] +; MSSA-NEXT: ret i32 [[C]] ; %a = call i32 @foo(ptr %p), !tbaa !3 %b = call i32 @foo(ptr %p), !tbaa !3 @@ -41,11 +63,18 @@ define i32 @test3(ptr %p, ptr %q) { } define i32 @test4(ptr %p, ptr %q) { -; CHECK-LABEL: define i32 @test4( -; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6:![0-9]+]] -; 
CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] -; CHECK-NEXT: ret i32 [[C]] +; MDEP-LABEL: define i32 @test4( +; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6:![0-9]+]] +; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] +; MDEP-NEXT: ret i32 [[C]] +; +; MSSA-LABEL: define i32 @test4( +; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6:![0-9]+]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] +; MSSA-NEXT: ret i32 [[C]] ; %a = call i32 @foo(ptr %p), !tbaa !1 %b = call i32 @foo(ptr %p), !tbaa !0 @@ -54,11 +83,18 @@ define i32 @test4(ptr %p, ptr %q) { } define i32 @test5(ptr %p, ptr %q) { -; CHECK-LABEL: define i32 @test5( -; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] -; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] -; CHECK-NEXT: ret i32 [[C]] +; MDEP-LABEL: define i32 @test5( +; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] +; MDEP-NEXT: ret i32 [[C]] +; +; MSSA-LABEL: define i32 @test5( +; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6]] +; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] +; MSSA-NEXT: ret i32 [[C]] ; %a = call i32 @foo(ptr %p), !tbaa !0 %b = call i32 @foo(ptr %p), !tbaa !1 @@ -67,11 +103,18 @@ define i32 @test5(ptr %p, ptr %q) { } define i32 @test6(ptr %p, ptr %q) { -; CHECK-LABEL: define i32 @test6( -; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] -; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] -; CHECK-NEXT: ret i32 [[C]] +; MDEP-LABEL: define i32 @test6( +; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) 
{ +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] +; MDEP-NEXT: ret i32 [[C]] +; +; MSSA-LABEL: define i32 @test6( +; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4]] +; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] +; MSSA-NEXT: ret i32 [[C]] ; %a = call i32 @foo(ptr %p), !tbaa !0 %b = call i32 @foo(ptr %p), !tbaa !3 @@ -80,11 +123,18 @@ define i32 @test6(ptr %p, ptr %q) { } define i32 @test7(ptr %p, ptr %q) { -; CHECK-LABEL: define i32 @test7( -; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA7:![0-9]+]] -; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] -; CHECK-NEXT: ret i32 [[C]] +; MDEP-LABEL: define i32 @test7( +; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA7:![0-9]+]] +; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] +; MDEP-NEXT: ret i32 [[C]] +; +; MSSA-LABEL: define i32 @test7( +; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA7:![0-9]+]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4]] +; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] +; MSSA-NEXT: ret i32 [[C]] ; %a = call i32 @foo(ptr %p), !tbaa !4 %b = call i32 @foo(ptr %p), !tbaa !3 @@ -93,10 +143,18 @@ define i32 @test7(ptr %p, ptr %q) { } define i32 @test8(ptr %p, ptr %q) { -; CHECK-LABEL: define i32 @test8( -; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: store i32 15, ptr [[P]], align 4 -; CHECK-NEXT: ret i32 0 +; MDEP-LABEL: define i32 @test8( +; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MDEP-NEXT: store i32 15, ptr [[P]], align 4 +; MDEP-NEXT: ret i32 0 +; +; MSSA-LABEL: define i32 @test8( +; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MSSA-NEXT: [[A:%.*]] = load i32, ptr [[Q]], align 4, 
!tbaa [[TBAA10:![0-9]+]] +; MSSA-NEXT: store i32 15, ptr [[P]], align 4 +; MSSA-NEXT: [[B:%.*]] = load i32, ptr [[Q]], align 4, !tbaa [[TBAA10]] +; MSSA-NEXT: [[C:%.*]] = sub i32 [[A]], [[B]] +; MSSA-NEXT: ret i32 [[C]] ; ; Since we know the location is invariant, we can forward the ; load across the potentially aliasing store. @@ -109,10 +167,18 @@ define i32 @test8(ptr %p, ptr %q) { } define i32 @test9(ptr %p, ptr %q) { -; CHECK-LABEL: define i32 @test9( -; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: call void @clobber() -; CHECK-NEXT: ret i32 0 +; MDEP-LABEL: define i32 @test9( +; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MDEP-NEXT: call void @clobber() +; MDEP-NEXT: ret i32 0 +; +; MSSA-LABEL: define i32 @test9( +; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MSSA-NEXT: [[A:%.*]] = load i32, ptr [[Q]], align 4, !tbaa [[TBAA10]] +; MSSA-NEXT: call void @clobber() +; MSSA-NEXT: [[B:%.*]] = load i32, ptr [[Q]], align 4, !tbaa [[TBAA10]] +; MSSA-NEXT: [[C:%.*]] = sub i32 [[A]], [[B]] +; MSSA-NEXT: ret i32 [[C]] ; ; Since we know the location is invariant, we can forward the ; load across the potentially aliasing store (within the call). @@ -127,11 +193,18 @@ define i32 @test9(ptr %p, ptr %q) { define i32 @test10(ptr %p, ptr %q) { ; If one access encloses the other, then the merged access is the enclosed one ; and not just the common final access type. 
-; CHECK-LABEL: define i32 @test10( -; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { -; CHECK-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA10:![0-9]+]] -; CHECK-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] -; CHECK-NEXT: ret i32 [[C]] +; MDEP-LABEL: define i32 @test10( +; MDEP-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MDEP-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA10:![0-9]+]] +; MDEP-NEXT: [[C:%.*]] = add i32 [[A]], [[A]] +; MDEP-NEXT: ret i32 [[C]] +; +; MSSA-LABEL: define i32 @test10( +; MSSA-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) { +; MSSA-NEXT: [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA13:![0-9]+]] +; MSSA-NEXT: [[B:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA17:![0-9]+]] +; MSSA-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] +; MSSA-NEXT: ret i32 [[C]] ; %a = call i32 @foo(ptr %p), !tbaa !15 ; TAG_X_i %b = call i32 @foo(ptr %p), !tbaa !19 ; TAG_Y_x_i @@ -165,18 +238,40 @@ declare i32 @foo(ptr) readonly !9 = !{!"yet another root"} !10 = !{!"node", !9, i64 1} ;. -; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} -; CHECK: [[META1]] = !{!"C", [[META2:![0-9]+]]} -; CHECK: [[META2]] = !{!"A", [[META3:![0-9]+]]} -; CHECK: [[META3]] = !{!"tbaa root"} -; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -; CHECK: [[META5]] = !{!"B", [[META2]]} -; CHECK: [[TBAA6]] = !{[[META2]], [[META2]], i64 0} -; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} -; CHECK: [[META8]] = !{!"scalar type", [[META9:![0-9]+]]} -; CHECK: [[META9]] = !{!"another root"} -; CHECK: [[TBAA10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]], i64 0} -; CHECK: [[META11]] = !{!"struct X", [[META12]], i64 0} -; CHECK: [[META12]] = !{!"int", [[META13:![0-9]+]], i64 0} -; CHECK: [[META13]] = !{!"char", [[META3]], i64 0} +; MDEP: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; MDEP: [[META1]] = !{!"C", [[META2:![0-9]+]]} +; MDEP: [[META2]] = !{!"A", [[META3:![0-9]+]]} +; MDEP: [[META3]] = !{!"tbaa root"} +; MDEP: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 
0} +; MDEP: [[META5]] = !{!"B", [[META2]]} +; MDEP: [[TBAA6]] = !{[[META2]], [[META2]], i64 0} +; MDEP: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +; MDEP: [[META8]] = !{!"scalar type", [[META9:![0-9]+]]} +; MDEP: [[META9]] = !{!"another root"} +; MDEP: [[TBAA10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]], i64 0} +; MDEP: [[META11]] = !{!"struct X", [[META12]], i64 0} +; MDEP: [[META12]] = !{!"int", [[META13:![0-9]+]], i64 0} +; MDEP: [[META13]] = !{!"char", [[META3]], i64 0} +;. +; MSSA: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; MSSA: [[META1]] = !{!"C", [[META2:![0-9]+]]} +; MSSA: [[META2]] = !{!"A", [[META3:![0-9]+]]} +; MSSA: [[META3]] = !{!"tbaa root"} +; MSSA: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; MSSA: [[META5]] = !{!"B", [[META2]]} +; MSSA: [[TBAA6]] = !{[[META2]], [[META2]], i64 0} +; MSSA: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +; MSSA: [[META8]] = !{!"scalar type", [[META9:![0-9]+]]} +; MSSA: [[META9]] = !{!"another root"} +; MSSA: [[TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0, i64 1} +; MSSA: [[META11]] = !{!"node", [[META12:![0-9]+]]} +; MSSA: [[META12]] = !{!"yet another root"} +; MSSA: [[TBAA13]] = !{[[META14:![0-9]+]], [[META15:![0-9]+]], i64 0} +; MSSA: [[META14]] = !{!"struct X", [[META15]], i64 0} +; MSSA: [[META15]] = !{!"int", [[META16:![0-9]+]], i64 0} +; MSSA: [[META16]] = !{!"char", [[META3]], i64 0} +; MSSA: [[TBAA17]] = !{[[META18:![0-9]+]], [[META15]], i64 0} +; MSSA: [[META18]] = !{!"struct Y", [[META14]], i64 0} ;. +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll index d7b07b9891c41..646a67d15d392 100644 --- a/llvm/test/Transforms/GVN/vscale.ll +++ b/llvm/test/Transforms/GVN/vscale.ll @@ -1,14 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S < %s -passes=gvn,dce | FileCheck %s +; RUN: opt -S < %s -passes=gvn,dce | FileCheck --check-prefixes=CHECK,MDEP %s +; RUN: opt -S < %s -passes='gvn',dce | FileCheck --check-prefixes=CHECK,MSSA %s ; Analyze Load from clobbering Load. define @load_store_clobber_load(ptr %p) { -; CHECK-LABEL: @load_store_clobber_load( -; CHECK-NEXT: [[LOAD1:%.*]] = load , ptr [[P:%.*]], align 16 -; CHECK-NEXT: store zeroinitializer, ptr undef, align 16 -; CHECK-NEXT: [[ADD:%.*]] = add [[LOAD1]], [[LOAD1]] -; CHECK-NEXT: ret [[ADD]] +; MDEP-LABEL: @load_store_clobber_load( +; MDEP-NEXT: [[LOAD1:%.*]] = load , ptr [[P:%.*]], align 16 +; MDEP-NEXT: store zeroinitializer, ptr undef, align 16 +; MDEP-NEXT: [[ADD:%.*]] = add [[LOAD1]], [[LOAD1]] +; MDEP-NEXT: ret [[ADD]] +; +; MSSA-LABEL: @load_store_clobber_load( +; MSSA-NEXT: [[LOAD1:%.*]] = load , ptr [[P:%.*]], align 16 +; MSSA-NEXT: store zeroinitializer, ptr undef, align 16 +; MSSA-NEXT: [[LOAD2:%.*]] = load , ptr [[P]], align 16 +; MSSA-NEXT: [[ADD:%.*]] = add [[LOAD1]], [[LOAD2]] +; MSSA-NEXT: ret [[ADD]] ; %load1 = load , ptr %p store zeroinitializer, ptr undef @@ -33,11 +41,18 @@ define @load_store_clobber_load_mayalias(ptr %p, ptr %p2) { } define @load_store_clobber_load_noalias(ptr noalias %p, ptr noalias %p2) { -; CHECK-LABEL: @load_store_clobber_load_noalias( -; CHECK-NEXT: [[LOAD1:%.*]] = load , ptr [[P:%.*]], align 16 -; CHECK-NEXT: store zeroinitializer, ptr [[P2:%.*]], align 16 -; CHECK-NEXT: [[ADD:%.*]] = add [[LOAD1]], [[LOAD1]] -; CHECK-NEXT: ret [[ADD]] +; MDEP-LABEL: @load_store_clobber_load_noalias( +; MDEP-NEXT: [[LOAD1:%.*]] = load , ptr [[P:%.*]], align 
16 +; MDEP-NEXT: store zeroinitializer, ptr [[P2:%.*]], align 16 +; MDEP-NEXT: [[ADD:%.*]] = add [[LOAD1]], [[LOAD1]] +; MDEP-NEXT: ret [[ADD]] +; +; MSSA-LABEL: @load_store_clobber_load_noalias( +; MSSA-NEXT: [[LOAD1:%.*]] = load , ptr [[P:%.*]], align 16 +; MSSA-NEXT: store zeroinitializer, ptr [[P2:%.*]], align 16 +; MSSA-NEXT: [[LOAD2:%.*]] = load , ptr [[P]], align 16 +; MSSA-NEXT: [[ADD:%.*]] = add [[LOAD1]], [[LOAD2]] +; MSSA-NEXT: ret [[ADD]] ; %load1 = load , ptr %p store zeroinitializer, ptr %p2 @@ -48,11 +63,18 @@ define @load_store_clobber_load_noalias(ptr noalias %p, ptr n ; BasicAA return MayAlias for %gep1,%gep2, could improve as MustAlias. define i32 @load_clobber_load_gep1(ptr %p) { -; CHECK-LABEL: @load_clobber_load_gep1( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 0, i64 1 -; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD1]] -; CHECK-NEXT: ret i32 [[ADD]] +; MDEP-LABEL: @load_clobber_load_gep1( +; MDEP-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 0, i64 1 +; MDEP-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4 +; MDEP-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD1]] +; MDEP-NEXT: ret i32 [[ADD]] +; +; MSSA-LABEL: @load_clobber_load_gep1( +; MSSA-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 0, i64 1 +; MSSA-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP1]], align 4 +; MSSA-NEXT: [[LOAD2:%.*]] = load i32, ptr [[GEP1]], align 4 +; MSSA-NEXT: [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]] +; MSSA-NEXT: ret i32 [[ADD]] ; %gep1 = getelementptr , ptr %p, i64 0, i64 1 %load1 = load i32, ptr %gep1 @@ -132,9 +154,14 @@ define @load_clobber_load_sideeffect(ptr %p) { ; Analyze Load from clobbering Store. 
define @store_forward_to_load(ptr %p) { -; CHECK-LABEL: @store_forward_to_load( -; CHECK-NEXT: store zeroinitializer, ptr [[P:%.*]], align 16 -; CHECK-NEXT: ret zeroinitializer +; MDEP-LABEL: @store_forward_to_load( +; MDEP-NEXT: store zeroinitializer, ptr [[P:%.*]], align 16 +; MDEP-NEXT: ret zeroinitializer +; +; MSSA-LABEL: @store_forward_to_load( +; MSSA-NEXT: store zeroinitializer, ptr [[P:%.*]], align 16 +; MSSA-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; MSSA-NEXT: ret [[LOAD]] ; store zeroinitializer, ptr %p %load = load , ptr %p @@ -174,9 +201,15 @@ define i32 @store_clobber_load() { declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) define i32 @memset_clobber_load(ptr %p) { -; CHECK-LABEL: @memset_clobber_load( -; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 1, i64 200, i1 false) -; CHECK-NEXT: ret i32 16843009 +; MDEP-LABEL: @memset_clobber_load( +; MDEP-NEXT: tail call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 1, i64 200, i1 false) +; MDEP-NEXT: ret i32 16843009 +; +; MSSA-LABEL: @memset_clobber_load( +; MSSA-NEXT: tail call void @llvm.memset.p0.i64(ptr [[P:%.*]], i8 1, i64 200, i1 false) +; MSSA-NEXT: [[GEP:%.*]] = getelementptr , ptr [[P]], i64 0, i64 5 +; MSSA-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 +; MSSA-NEXT: ret i32 [[LOAD]] ; tail call void @llvm.memset.p0.i64(ptr %p, i8 1, i64 200, i1 false) %gep = getelementptr , ptr %p, i64 0, i64 5 @@ -214,15 +247,28 @@ define i32 @memset_clobber_load_nonconst_index(ptr %p, i64 %idx1, i64 %idx2) { ; Load elimination across BBs define ptr @load_from_alloc_replaced_with_undef() { -; CHECK-LABEL: @load_from_alloc_replaced_with_undef( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = alloca , align 16 -; CHECK-NEXT: br i1 undef, label [[IF_END:%.*]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: store zeroinitializer, ptr [[A]], align 16 -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: -; CHECK-NEXT: ret ptr [[A]] +; MDEP-LABEL: 
@load_from_alloc_replaced_with_undef( +; MDEP-NEXT: entry: +; MDEP-NEXT: [[A:%.*]] = alloca , align 16 +; MDEP-NEXT: br i1 undef, label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; MDEP: if.then: +; MDEP-NEXT: store zeroinitializer, ptr [[A]], align 16 +; MDEP-NEXT: br label [[IF_END]] +; MDEP: if.end: +; MDEP-NEXT: ret ptr [[A]] +; +; MSSA-LABEL: @load_from_alloc_replaced_with_undef( +; MSSA-NEXT: entry: +; MSSA-NEXT: [[A:%.*]] = alloca , align 16 +; MSSA-NEXT: [[GEP:%.*]] = getelementptr , ptr [[A]], i64 0, i64 1 +; MSSA-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4 +; MSSA-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[LOAD]], 0 +; MSSA-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; MSSA: if.then: +; MSSA-NEXT: store zeroinitializer, ptr [[A]], align 16 +; MSSA-NEXT: br label [[IF_END]] +; MSSA: if.end: +; MSSA-NEXT: ret ptr [[A]] ; entry: %a = alloca @@ -240,16 +286,29 @@ if.end: } define i32 @redundant_load_elimination_1(ptr %p) { -; CHECK-LABEL: @redundant_load_elimination_1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP:%.*]] = getelementptr , ptr [[P:%.*]], i64 1, i64 1 -; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LOAD1]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; CHECK: if.then: -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: -; CHECK-NEXT: ret i32 [[LOAD1]] +; MDEP-LABEL: @redundant_load_elimination_1( +; MDEP-NEXT: entry: +; MDEP-NEXT: [[GEP:%.*]] = getelementptr , ptr [[P:%.*]], i64 1, i64 1 +; MDEP-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP]], align 4 +; MDEP-NEXT: [[CMP:%.*]] = icmp eq i32 [[LOAD1]], 0 +; MDEP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; MDEP: if.then: +; MDEP-NEXT: br label [[IF_END]] +; MDEP: if.end: +; MDEP-NEXT: ret i32 [[LOAD1]] +; +; MSSA-LABEL: @redundant_load_elimination_1( +; MSSA-NEXT: entry: +; MSSA-NEXT: [[GEP:%.*]] = getelementptr , ptr [[P:%.*]], i64 1, i64 1 +; MSSA-NEXT: [[LOAD1:%.*]] 
= load i32, ptr [[GEP]], align 4 +; MSSA-NEXT: [[CMP:%.*]] = icmp eq i32 [[LOAD1]], 0 +; MSSA-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; MSSA: if.then: +; MSSA-NEXT: [[LOAD2:%.*]] = load i32, ptr [[GEP]], align 4 +; MSSA-NEXT: br label [[IF_END]] +; MSSA: if.end: +; MSSA-NEXT: [[RESULT:%.*]] = phi i32 [ [[LOAD2]], [[IF_THEN]] ], [ [[LOAD1]], [[ENTRY:%.*]] ] +; MSSA-NEXT: ret i32 [[RESULT]] ; entry: %gep = getelementptr , ptr %p, i64 1, i64 1 @@ -300,17 +359,30 @@ if.else: } define void @redundant_load_elimination_zero_index(i1 %c, ptr %p, ptr %q) { -; CHECK-LABEL: @redundant_load_elimination_zero_index( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 0, i64 1 -; CHECK-NEXT: store i32 0, ptr [[GEP1]], align 4 -; CHECK-NEXT: store i32 1, ptr [[P]], align 4 -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: store i32 0, ptr [[Q:%.*]], align 4 -; CHECK-NEXT: ret void -; CHECK: if.else: -; CHECK-NEXT: ret void +; MDEP-LABEL: @redundant_load_elimination_zero_index( +; MDEP-NEXT: entry: +; MDEP-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 0, i64 1 +; MDEP-NEXT: store i32 0, ptr [[GEP1]], align 4 +; MDEP-NEXT: store i32 1, ptr [[P]], align 4 +; MDEP-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; MDEP: if.then: +; MDEP-NEXT: store i32 0, ptr [[Q:%.*]], align 4 +; MDEP-NEXT: ret void +; MDEP: if.else: +; MDEP-NEXT: ret void +; +; MSSA-LABEL: @redundant_load_elimination_zero_index( +; MSSA-NEXT: entry: +; MSSA-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 0, i64 1 +; MSSA-NEXT: store i32 0, ptr [[GEP1]], align 4 +; MSSA-NEXT: store i32 1, ptr [[P]], align 4 +; MSSA-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; MSSA: if.then: +; MSSA-NEXT: [[T:%.*]] = load i32, ptr [[GEP1]], align 4 +; MSSA-NEXT: store i32 [[T]], ptr [[Q:%.*]], align 4 +; MSSA-NEXT: ret void +; MSSA: if.else: +; 
MSSA-NEXT: ret void ; entry: %gep1 = getelementptr , ptr %p, i64 0, i64 1 @@ -328,19 +400,34 @@ if.else: } define void @redundant_load_elimination_zero_index_1(i1 %c, ptr %p, ptr %q, i64 %i) { -; CHECK-LABEL: @redundant_load_elimination_zero_index_1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[J:%.*]] = add i64 [[I:%.*]], 1 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 0, i64 [[J]] -; CHECK-NEXT: store i32 0, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr , ptr [[P]], i64 0, i64 [[I]] -; CHECK-NEXT: store i32 1, ptr [[GEP2]], align 4 -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: store i32 0, ptr [[Q:%.*]], align 4 -; CHECK-NEXT: ret void -; CHECK: if.else: -; CHECK-NEXT: ret void +; MDEP-LABEL: @redundant_load_elimination_zero_index_1( +; MDEP-NEXT: entry: +; MDEP-NEXT: [[J:%.*]] = add i64 [[I:%.*]], 1 +; MDEP-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 0, i64 [[J]] +; MDEP-NEXT: store i32 0, ptr [[GEP1]], align 4 +; MDEP-NEXT: [[GEP2:%.*]] = getelementptr , ptr [[P]], i64 0, i64 [[I]] +; MDEP-NEXT: store i32 1, ptr [[GEP2]], align 4 +; MDEP-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; MDEP: if.then: +; MDEP-NEXT: store i32 0, ptr [[Q:%.*]], align 4 +; MDEP-NEXT: ret void +; MDEP: if.else: +; MDEP-NEXT: ret void +; +; MSSA-LABEL: @redundant_load_elimination_zero_index_1( +; MSSA-NEXT: entry: +; MSSA-NEXT: [[J:%.*]] = add i64 [[I:%.*]], 1 +; MSSA-NEXT: [[GEP1:%.*]] = getelementptr , ptr [[P:%.*]], i64 0, i64 [[J]] +; MSSA-NEXT: store i32 0, ptr [[GEP1]], align 4 +; MSSA-NEXT: [[GEP2:%.*]] = getelementptr , ptr [[P]], i64 0, i64 [[I]] +; MSSA-NEXT: store i32 1, ptr [[GEP2]], align 4 +; MSSA-NEXT: br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; MSSA: if.then: +; MSSA-NEXT: [[T:%.*]] = load i32, ptr [[GEP1]], align 4 +; MSSA-NEXT: store i32 [[T]], ptr [[Q:%.*]], align 4 +; MSSA-NEXT: ret void +; MSSA: if.else: +; 
MSSA-NEXT: ret void ; entry: %j = add i64 %i, 1 @@ -391,10 +478,15 @@ if.else: ; Different sizes / types define @load_v16i8_store_v4i32_forward_load(ptr %p, %x) { -; CHECK-LABEL: @load_v16i8_store_v4i32_forward_load( -; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = bitcast [[X]] to -; CHECK-NEXT: ret [[LOAD]] +; MDEP-LABEL: @load_v16i8_store_v4i32_forward_load( +; MDEP-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; MDEP-NEXT: [[TMP1:%.*]] = bitcast [[X]] to +; MDEP-NEXT: ret [[TMP1]] +; +; MSSA-LABEL: @load_v16i8_store_v4i32_forward_load( +; MSSA-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; MSSA-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; MSSA-NEXT: ret [[LOAD]] ; store %x, ptr %p %load = load , ptr %p @@ -402,10 +494,15 @@ define @load_v16i8_store_v4i32_forward_load(ptr %p, @load_v4f32_store_v4i32_forward_load(ptr %p, %x) { -; CHECK-LABEL: @load_v4f32_store_v4i32_forward_load( -; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = bitcast [[X]] to -; CHECK-NEXT: ret [[LOAD]] +; MDEP-LABEL: @load_v4f32_store_v4i32_forward_load( +; MDEP-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; MDEP-NEXT: [[TMP1:%.*]] = bitcast [[X]] to +; MDEP-NEXT: ret [[TMP1]] +; +; MSSA-LABEL: @load_v4f32_store_v4i32_forward_load( +; MSSA-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; MSSA-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; MSSA-NEXT: ret [[LOAD]] ; store %x, ptr %p %load = load , ptr %p @@ -413,10 +510,15 @@ define @load_v4f32_store_v4i32_forward_load(ptr %p, @load_v4f32_store_v16i8_forward_load(ptr %p, %x) { -; CHECK-LABEL: @load_v4f32_store_v16i8_forward_load( -; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = bitcast [[X]] to -; CHECK-NEXT: ret [[LOAD]] +; MDEP-LABEL: @load_v4f32_store_v16i8_forward_load( +; MDEP-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; MDEP-NEXT: [[TMP1:%.*]] = bitcast [[X]] to +; MDEP-NEXT: ret [[TMP1]] +; +; MSSA-LABEL: 
@load_v4f32_store_v16i8_forward_load( +; MSSA-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; MSSA-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; MSSA-NEXT: ret [[LOAD]] ; store %x, ptr %p %load = load , ptr %p @@ -424,10 +526,15 @@ define @load_v4f32_store_v16i8_forward_load(ptr %p, @load_v4i32_store_v4f32_forward_load(ptr %p, %x) { -; CHECK-LABEL: @load_v4i32_store_v4f32_forward_load( -; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = bitcast [[X]] to -; CHECK-NEXT: ret [[LOAD]] +; MDEP-LABEL: @load_v4i32_store_v4f32_forward_load( +; MDEP-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; MDEP-NEXT: [[TMP1:%.*]] = bitcast [[X]] to +; MDEP-NEXT: ret [[TMP1]] +; +; MSSA-LABEL: @load_v4i32_store_v4f32_forward_load( +; MSSA-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; MSSA-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; MSSA-NEXT: ret [[LOAD]] ; store %x, ptr %p %load = load , ptr %p @@ -494,11 +601,16 @@ define @load_v2i32_store_v4i32_forward_load_offsetc(ptr %p, < } define @load_v2p0_store_v4i32_forward_load(ptr %p, %x) { -; CHECK-LABEL: @load_v2p0_store_v4i32_forward_load( -; CHECK-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [[X]] to -; CHECK-NEXT: [[LOAD:%.*]] = inttoptr [[TMP1]] to -; CHECK-NEXT: ret [[LOAD]] +; MDEP-LABEL: @load_v2p0_store_v4i32_forward_load( +; MDEP-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; MDEP-NEXT: [[TMP1:%.*]] = bitcast [[X]] to +; MDEP-NEXT: [[TMP2:%.*]] = inttoptr [[TMP1]] to +; MDEP-NEXT: ret [[TMP2]] +; +; MSSA-LABEL: @load_v2p0_store_v4i32_forward_load( +; MSSA-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; MSSA-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; MSSA-NEXT: ret [[LOAD]] ; store %x, ptr %p %load = load , ptr %p @@ -506,10 +618,15 @@ define @load_v2p0_store_v4i32_forward_load(ptr %p, @load_v2i64_store_v2p0_forward_load(ptr %p, %x) { -; CHECK-LABEL: @load_v2i64_store_v2p0_forward_load( -; CHECK-NEXT: store [[X:%.*]], ptr 
[[P:%.*]], align 16 -; CHECK-NEXT: [[LOAD:%.*]] = ptrtoint [[X]] to -; CHECK-NEXT: ret [[LOAD]] +; MDEP-LABEL: @load_v2i64_store_v2p0_forward_load( +; MDEP-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; MDEP-NEXT: [[TMP1:%.*]] = ptrtoint [[X]] to +; MDEP-NEXT: ret [[TMP1]] +; +; MSSA-LABEL: @load_v2i64_store_v2p0_forward_load( +; MSSA-NEXT: store [[X:%.*]], ptr [[P:%.*]], align 16 +; MSSA-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; MSSA-NEXT: ret [[LOAD]] ; store %x, ptr %p %load = load , ptr %p @@ -539,9 +656,14 @@ define <16 x i8> @load_v16i8_store_nxv4i32_forward_load(ptr %p, @load_v16i8_store_v4i32_forward_constant(ptr %p) { -; CHECK-LABEL: @load_v16i8_store_v4i32_forward_constant( -; CHECK-NEXT: store splat (i32 4), ptr [[P:%.*]], align 16 -; CHECK-NEXT: ret bitcast ( splat (i32 4) to ) +; MDEP-LABEL: @load_v16i8_store_v4i32_forward_constant( +; MDEP-NEXT: store splat (i32 4), ptr [[P:%.*]], align 16 +; MDEP-NEXT: ret bitcast ( splat (i32 4) to ) +; +; MSSA-LABEL: @load_v16i8_store_v4i32_forward_constant( +; MSSA-NEXT: store splat (i32 4), ptr [[P:%.*]], align 16 +; MSSA-NEXT: [[LOAD:%.*]] = load , ptr [[P]], align 16 +; MSSA-NEXT: ret [[LOAD]] ; store splat (i32 4), ptr %p %load = load , ptr %p @@ -571,35 +693,65 @@ define {} @load_v16i8_store_v4i32_struct_forward_load(ptr %p, } define { , , , } @bigexample({ , , , } %a) vscale_range(1,16) { -; CHECK-LABEL: @bigexample( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[REF_TMP:%.*]] = alloca { , , , }, align 16 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[REF_TMP]]) -; CHECK-NEXT: [[A_ELT:%.*]] = extractvalue { , , , } [[A:%.*]], 0 -; CHECK-NEXT: store [[A_ELT]], ptr [[REF_TMP]], align 16 -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 -; CHECK-NEXT: [[REF_TMP_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP1]] -; CHECK-NEXT: [[A_ELT2:%.*]] = extractvalue { , , , } [[A]], 1 -; CHECK-NEXT: store 
[[A_ELT2]], ptr [[REF_TMP_REPACK1]], align 16 -; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP0]], 5 -; CHECK-NEXT: [[REF_TMP_REPACK3:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP3]] -; CHECK-NEXT: [[A_ELT4:%.*]] = extractvalue { , , , } [[A]], 2 -; CHECK-NEXT: store [[A_ELT4]], ptr [[REF_TMP_REPACK3]], align 16 -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP0]], 48 -; CHECK-NEXT: [[REF_TMP_REPACK5:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP5]] -; CHECK-NEXT: [[A_ELT6:%.*]] = extractvalue { , , , } [[A]], 3 -; CHECK-NEXT: store [[A_ELT6]], ptr [[REF_TMP_REPACK5]], align 16 -; CHECK-NEXT: [[DOTUNPACK:%.*]] = bitcast [[A_ELT]] to -; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { , , , } poison, [[DOTUNPACK]], 0 -; CHECK-NEXT: [[DOTUNPACK8:%.*]] = bitcast [[A_ELT2]] to -; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP6]], [[DOTUNPACK8]], 1 -; CHECK-NEXT: [[DOTUNPACK10:%.*]] = bitcast [[A_ELT4]] to -; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , , , } [[TMP9]], [[DOTUNPACK10]], 2 -; CHECK-NEXT: [[DOTUNPACK12:%.*]] = bitcast [[A_ELT6]] to -; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { , , , } [[TMP12]], [[DOTUNPACK12]], 3 -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[REF_TMP]]) -; CHECK-NEXT: ret { , , , } [[TMP15]] +; MDEP-LABEL: @bigexample( +; MDEP-NEXT: entry: +; MDEP-NEXT: [[REF_TMP:%.*]] = alloca { , , , }, align 16 +; MDEP-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[REF_TMP]]) +; MDEP-NEXT: [[A_ELT:%.*]] = extractvalue { , , , } [[A:%.*]], 0 +; MDEP-NEXT: store [[A_ELT]], ptr [[REF_TMP]], align 16 +; MDEP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; MDEP-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 +; MDEP-NEXT: [[REF_TMP_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP1]] +; MDEP-NEXT: [[A_ELT2:%.*]] = extractvalue { , , , } [[A]], 1 +; MDEP-NEXT: store [[A_ELT2]], ptr [[REF_TMP_REPACK1]], align 16 +; MDEP-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 5 +; MDEP-NEXT: 
[[REF_TMP_REPACK3:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP2]] +; MDEP-NEXT: [[A_ELT4:%.*]] = extractvalue { , , , } [[A]], 2 +; MDEP-NEXT: store [[A_ELT4]], ptr [[REF_TMP_REPACK3]], align 16 +; MDEP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP0]], 48 +; MDEP-NEXT: [[REF_TMP_REPACK5:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP3]] +; MDEP-NEXT: [[A_ELT6:%.*]] = extractvalue { , , , } [[A]], 3 +; MDEP-NEXT: store [[A_ELT6]], ptr [[REF_TMP_REPACK5]], align 16 +; MDEP-NEXT: [[TMP4:%.*]] = bitcast [[A_ELT]] to +; MDEP-NEXT: [[TMP5:%.*]] = insertvalue { , , , } poison, [[TMP4]], 0 +; MDEP-NEXT: [[TMP6:%.*]] = bitcast [[A_ELT2]] to +; MDEP-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 1 +; MDEP-NEXT: [[TMP8:%.*]] = bitcast [[A_ELT4]] to +; MDEP-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 2 +; MDEP-NEXT: [[TMP10:%.*]] = bitcast [[A_ELT6]] to +; MDEP-NEXT: [[TMP11:%.*]] = insertvalue { , , , } [[TMP9]], [[TMP10]], 3 +; MDEP-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[REF_TMP]]) +; MDEP-NEXT: ret { , , , } [[TMP11]] +; +; MSSA-LABEL: @bigexample( +; MSSA-NEXT: entry: +; MSSA-NEXT: [[REF_TMP:%.*]] = alloca { , , , }, align 16 +; MSSA-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr nonnull [[REF_TMP]]) +; MSSA-NEXT: [[A_ELT:%.*]] = extractvalue { , , , } [[A:%.*]], 0 +; MSSA-NEXT: store [[A_ELT]], ptr [[REF_TMP]], align 16 +; MSSA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; MSSA-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 +; MSSA-NEXT: [[REF_TMP_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP1]] +; MSSA-NEXT: [[A_ELT2:%.*]] = extractvalue { , , , } [[A]], 1 +; MSSA-NEXT: store [[A_ELT2]], ptr [[REF_TMP_REPACK1]], align 16 +; MSSA-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 5 +; MSSA-NEXT: [[REF_TMP_REPACK3:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP2]] +; MSSA-NEXT: [[A_ELT4:%.*]] = extractvalue { , , , } [[A]], 2 +; MSSA-NEXT: store [[A_ELT4]], ptr 
[[REF_TMP_REPACK3]], align 16 +; MSSA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP0]], 48 +; MSSA-NEXT: [[REF_TMP_REPACK5:%.*]] = getelementptr inbounds i8, ptr [[REF_TMP]], i64 [[TMP3]] +; MSSA-NEXT: [[A_ELT6:%.*]] = extractvalue { , , , } [[A]], 3 +; MSSA-NEXT: store [[A_ELT6]], ptr [[REF_TMP_REPACK5]], align 16 +; MSSA-NEXT: [[DOTUNPACK:%.*]] = load , ptr [[REF_TMP]], align 16 +; MSSA-NEXT: [[TMP4:%.*]] = insertvalue { , , , } poison, [[DOTUNPACK]], 0 +; MSSA-NEXT: [[DOTUNPACK8:%.*]] = load , ptr [[REF_TMP_REPACK1]], align 16 +; MSSA-NEXT: [[TMP5:%.*]] = insertvalue { , , , } [[TMP4]], [[DOTUNPACK8]], 1 +; MSSA-NEXT: [[DOTUNPACK10:%.*]] = load , ptr [[REF_TMP_REPACK3]], align 16 +; MSSA-NEXT: [[TMP6:%.*]] = insertvalue { , , , } [[TMP5]], [[DOTUNPACK10]], 2 +; MSSA-NEXT: [[DOTUNPACK12:%.*]] = load , ptr [[REF_TMP_REPACK5]], align 16 +; MSSA-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP6]], [[DOTUNPACK12]], 3 +; MSSA-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull [[REF_TMP]]) +; MSSA-NEXT: ret { , , , } [[TMP7]] ; entry: %ref.tmp = alloca { , , , }, align 16 @@ -643,12 +795,21 @@ entry: } define @scalable_store_to_fixed_load( %.coerce) vscale_range(4,4) { -; CHECK-LABEL: @scalable_store_to_fixed_load( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[RETVAL:%.*]] = alloca { <16 x float> }, align 64 -; CHECK-NEXT: [[TMP0:%.*]] = fadd [[DOTCOERCE:%.*]], [[DOTCOERCE]] -; CHECK-NEXT: store [[TMP0]], ptr [[RETVAL]], align 16 -; CHECK-NEXT: ret [[TMP0]] +; MDEP-LABEL: @scalable_store_to_fixed_load( +; MDEP-NEXT: entry: +; MDEP-NEXT: [[RETVAL:%.*]] = alloca { <16 x float> }, align 64 +; MDEP-NEXT: [[TMP0:%.*]] = fadd [[DOTCOERCE:%.*]], [[DOTCOERCE]] +; MDEP-NEXT: store [[TMP0]], ptr [[RETVAL]], align 16 +; MDEP-NEXT: ret [[TMP0]] +; +; MSSA-LABEL: @scalable_store_to_fixed_load( +; MSSA-NEXT: entry: +; MSSA-NEXT: [[RETVAL:%.*]] = alloca { <16 x float> }, align 64 +; MSSA-NEXT: [[TMP0:%.*]] = fadd [[DOTCOERCE:%.*]], [[DOTCOERCE]] +; MSSA-NEXT: store [[TMP0]], ptr [[RETVAL]], 
align 16 +; MSSA-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64 +; MSSA-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv4f32.v16f32( poison, <16 x float> [[TMP1]], i64 0) +; MSSA-NEXT: ret [[CAST_SCALABLE]] ; entry: %retval = alloca { <16 x float> } @@ -661,11 +822,19 @@ entry: ; Here, only the lower bound for the vscale is known, but this is enough to allow a forward to a load to 16 elements. define @scalable_store_to_fixed_load_only_lower_bound( %a) vscale_range(4) { -; CHECK-LABEL: @scalable_store_to_fixed_load_only_lower_bound( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[RETVAL:%.*]] = alloca { }, align 16 -; CHECK-NEXT: store [[A:%.*]], ptr [[RETVAL]], align 16 -; CHECK-NEXT: ret [[A]] +; MDEP-LABEL: @scalable_store_to_fixed_load_only_lower_bound( +; MDEP-NEXT: entry: +; MDEP-NEXT: [[RETVAL:%.*]] = alloca { }, align 16 +; MDEP-NEXT: store [[A:%.*]], ptr [[RETVAL]], align 16 +; MDEP-NEXT: ret [[A]] +; +; MSSA-LABEL: @scalable_store_to_fixed_load_only_lower_bound( +; MSSA-NEXT: entry: +; MSSA-NEXT: [[RETVAL:%.*]] = alloca { }, align 16 +; MSSA-NEXT: store [[A:%.*]], ptr [[RETVAL]], align 16 +; MSSA-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64 +; MSSA-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv4f32.v16f32( poison, <16 x float> [[TMP0]], i64 0) +; MSSA-NEXT: ret [[CAST_SCALABLE]] ; entry: %retval = alloca { } @@ -752,12 +921,19 @@ entry: ; This function does not have a fixed vscale, but the loaded vector is still known ; to be smaller or equal in size compared to the stored vector. 
define <4 x float> @scalable_store_to_small_fixed_load( %a) { -; CHECK-LABEL: @scalable_store_to_small_fixed_load( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[PTR:%.*]] = alloca , align 16 -; CHECK-NEXT: store [[A:%.*]], ptr [[PTR]], align 16 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32( [[A]], i64 0) -; CHECK-NEXT: ret <4 x float> [[TMP0]] +; MDEP-LABEL: @scalable_store_to_small_fixed_load( +; MDEP-NEXT: entry: +; MDEP-NEXT: [[PTR:%.*]] = alloca , align 16 +; MDEP-NEXT: store [[A:%.*]], ptr [[PTR]], align 16 +; MDEP-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32( [[A]], i64 0) +; MDEP-NEXT: ret <4 x float> [[TMP0]] +; +; MSSA-LABEL: @scalable_store_to_small_fixed_load( +; MSSA-NEXT: entry: +; MSSA-NEXT: [[PTR:%.*]] = alloca , align 16 +; MSSA-NEXT: store [[A:%.*]], ptr [[PTR]], align 16 +; MSSA-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[PTR]], align 16 +; MSSA-NEXT: ret <4 x float> [[TMP0]] ; entry: %ptr = alloca diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll index e04786e50e96c..2d024bd83e5ce 100644 --- a/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll +++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -p loop-unroll -S %s | FileCheck %s +; RUN: opt -p loop-unroll -unroll-full-max-count=0 -S %s | FileCheck %s define i64 @peel_single_block_loop_iv_step_1() { ; CHECK-LABEL: define i64 @peel_single_block_loop_iv_step_1() { @@ -68,16 +68,14 @@ exit: ret i64 %iv } - - define i64 @peel_single_block_loop_iv_step_1_eq_pred() { ; CHECK-LABEL: define i64 @peel_single_block_loop_iv_step_1_eq_pred() { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[CMP18_NOT:%.*]] = 
icmp eq i64 [[IV]], 63 -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP18_NOT]], i32 10, i32 20 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 63 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 10, i32 20 ; CHECK-NEXT: call void @foo(i32 [[COND]]) ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 64 @@ -136,22 +134,28 @@ exit: define i64 @peel_single_block_loop_iv_step_1_nested_loop() { ; CHECK-LABEL: define i64 @peel_single_block_loop_iv_step_1_nested_loop() { -; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] ; CHECK: [[OUTER_HEADER]]: +; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i64 [ 0, %[[OUTER_HEADER]] ], [ [[IV_NEXT_PEEL:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[CMP18_NOT_PEEL:%.*]] = icmp eq i64 [[IV_NEXT_LCSSA]], 63 -; CHECK-NEXT: [[COND_PEEL:%.*]] = select i1 [[CMP18_NOT_PEEL]], i32 10, i32 20 -; CHECK-NEXT: call void @foo(i32 [[COND_PEEL]]) -; CHECK-NEXT: [[IV_NEXT_PEEL]] = add i64 [[IV_NEXT_LCSSA]], 1 -; CHECK-NEXT: [[EC_PEEL:%.*]] = icmp ne i64 [[IV_NEXT_PEEL]], 64 -; CHECK-NEXT: br i1 [[EC_PEEL]], label %[[LOOP]], label %[[OUTER_LATCH:.*]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[OUTER_HEADER]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 63 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 10, i32 20 +; CHECK-NEXT: call void @foo(i32 [[COND]]) +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 64 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[OUTER_LATCH]] ; CHECK: [[OUTER_LATCH]]: -; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV_NEXT_LCSSA]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ] ; CHECK-NEXT: call void @foo(i32 1) -; CHECK-NEXT: ret i64 [[IV_LCSSA]] +; 
CHECK-NEXT: [[OUTER_IV_NEXT]] = add i64 [[OUTER_IV]], 1 +; CHECK-NEXT: [[OUTER_EC:%.*]] = icmp ne i64 [[OUTER_IV_NEXT]], 100 +; CHECK-NEXT: br i1 [[OUTER_EC]], label %[[EXIT:.*]], label %[[OUTER_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV_LCSSA_LCSSA:%.*]] = phi i64 [ [[IV_LCSSA]], %[[OUTER_LATCH]] ] +; CHECK-NEXT: ret i64 [[IV_LCSSA_LCSSA]] ; entry: br label %outer.header @@ -184,21 +188,21 @@ define i64 @peel_multi_block_loop_iv_step_1() { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_PEEL:%.*]], %[[LATCH:.*]] ] -; CHECK-NEXT: [[CMP18_NOT_PEEL:%.*]] = icmp eq i64 [[IV_NEXT_LCSSA]], 63 -; CHECK-NEXT: [[COND_PEEL:%.*]] = select i1 [[CMP18_NOT_PEEL]], i32 10, i32 20 -; CHECK-NEXT: call void @foo(i32 [[COND_PEEL]]) -; CHECK-NEXT: [[C_PEEL:%.*]] = call i1 @cond() -; CHECK-NEXT: br i1 [[C_PEEL]], label %[[THEN:.*]], label %[[LATCH]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 63 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 10, i32 20 +; CHECK-NEXT: call void @foo(i32 [[COND]]) +; CHECK-NEXT: [[C:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LATCH]] ; CHECK: [[THEN]]: -; CHECK-NEXT: call void @foo(i32 [[COND_PEEL]]) +; CHECK-NEXT: call void @foo(i32 [[COND]]) ; CHECK-NEXT: br label %[[LATCH]] ; CHECK: [[LATCH]]: -; CHECK-NEXT: [[IV_NEXT_PEEL]] = add i64 [[IV_NEXT_LCSSA]], 1 -; CHECK-NEXT: [[EC_PEEL:%.*]] = icmp ne i64 [[IV_NEXT_PEEL]], 64 -; CHECK-NEXT: br i1 [[EC_PEEL]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 64 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV_NEXT_LCSSA]], %[[LATCH]] ] +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], 
%[[LATCH]] ] ; CHECK-NEXT: ret i64 [[IV_LCSSA]] ; entry: @@ -264,6 +268,69 @@ exit: ret i64 %iv } +define i64 @peel_single_block_loop_iv_step_1_btc_0() { +; CHECK-LABEL: define i64 @peel_single_block_loop_iv_step_1_btc_0() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 10, i32 20 +; CHECK-NEXT: call void @foo(i32 [[COND]]) +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 1 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[IV_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %cmp = icmp eq i64 %iv, 0 + %cond = select i1 %cmp, i32 10, i32 20 + call void @foo(i32 %cond) + %iv.next = add i64 %iv, 1 + %ec = icmp ne i64 %iv.next, 1 + br i1 %ec, label %loop, label %exit + +exit: + ret i64 %iv +} + +define i64 @peel_single_block_loop_iv_step_1_btc_1() { +; CHECK-LABEL: define i64 @peel_single_block_loop_iv_step_1_btc_1() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 1 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 10, i32 20 +; CHECK-NEXT: call void @foo(i32 [[COND]]) +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 2 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[IV_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %cmp = 
icmp eq i64 %iv, 1 + %cond = select i1 %cmp, i32 10, i32 20 + call void @foo(i32 %cond) + %iv.next = add i64 %iv, 1 + %ec = icmp ne i64 %iv.next, 2 + br i1 %ec, label %loop, label %exit + +exit: + ret i64 %iv +} define i64 @peel_single_block_loop_iv_step_1_may_execute_only_once(i64 %n) { ; CHECK-LABEL: define i64 @peel_single_block_loop_iv_step_1_may_execute_only_once( @@ -427,5 +494,62 @@ exit: ret i32 %sum.0.lcssa } +define i64 @peel_multi_exit_multi_latch_loop_iv_step_1(i64 %N) { +; CHECK-LABEL: define i64 @peel_multi_exit_multi_latch_loop_iv_step_1( +; CHECK-SAME: i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_BE:%.*]], %[[LOOP_BACKEDGE:.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 63 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 10, i32 20 +; CHECK-NEXT: call void @foo(i32 [[COND]]) +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[C_1]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: call void @foo(i32 20) +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC_1:%.*]] = icmp ne i64 [[IV_NEXT_1]], 64 +; CHECK-NEXT: br i1 [[EC_1]], label %[[EXIT:.*]], label %[[LOOP_BACKEDGE]] +; CHECK: [[LOOP_BACKEDGE]]: +; CHECK-NEXT: [[IV_BE]] = phi i64 [ [[IV_NEXT_1]], %[[THEN]] ], [ [[IV_NEXT_2:%.*]], %[[ELSE]] ] +; CHECK-NEXT: br label %[[LOOP]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: call void @foo(i32 10) +; CHECK-NEXT: [[IV_NEXT_2]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC_2:%.*]] = icmp ne i64 [[IV_NEXT_2]], 64 +; CHECK-NEXT: br i1 [[EC_2]], label %[[LOOP_BACKEDGE]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[ELSE]] ], [ [[IV]], %[[THEN]] ] +; CHECK-NEXT: ret i64 [[IV_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next.1, %then ], [ %iv.next.2, %else ] + %cmp = icmp eq i64 %iv, 63 + %cond 
= select i1 %cmp, i32 10, i32 20 + call void @foo(i32 %cond) + %c.1 = icmp eq i64 %iv, %N + br i1 %c.1, label %then, label %else + +then: + call void @foo(i32 20) + %iv.next.1 = add i64 %iv, 1 + %ec.1 = icmp ne i64 %iv.next.1, 64 + br i1 %ec.1, label %exit, label %loop + +else: + call void @foo(i32 10) + %iv.next.2 = add i64 %iv, 1 + %ec.2 = icmp ne i64 %iv.next.2, 64 + br i1 %ec.2, label %loop, label %exit + +exit: + ret i64 %iv +} + declare void @foo(i32) declare i1 @cond() + diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll index 3c8bbaa46f275..43b942458a39e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll @@ -28,7 +28,7 @@ define void @test_blend_feeding_replicated_store_1(i64 %N, ptr noalias %src, ptr ; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i1> zeroinitializer, <16 x i1> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i1> [[TMP6]], splat (i1 true) ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP7]], <16 x ptr> [[BROADCAST_SPLAT]], <16 x ptr> zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP6]], <16 x ptr> [[BROADCAST_SPLAT]], <16 x ptr> zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i1> [[TMP9]], i32 0 ; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: @@ -219,7 +219,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i ; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i1> [[TMP3]], splat (i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP4]], <16 x i1> [[TMP5]], <16 x i1> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i1> [[TMP6]], [[TMP3]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> splat (i8 1) +; 
CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i8> splat (i8 1), <16 x i8> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP7]], i32 0 ; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index d9ec09ffaa934..2c0fb797d1d10 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -354,7 +354,7 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[TMP9:%.*]] = call @foo_vector( zeroinitializer, [[TMP8]]) ; TFCOMMON-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer ; TFCOMMON-NEXT: [[TMP11:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP10]]) -; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], [[TMP9]], [[TMP11]] +; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], [[TMP11]], [[TMP9]] ; TFCOMMON-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] @@ -397,8 +397,8 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP12]], zeroinitializer ; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP19]]) ; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[TMP20]]) -; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP17]], [[TMP21]] -; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP16]], [[TMP18]], [[TMP22]] +; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP19]], [[TMP21]], [[TMP17]] +; TFA_INTERLEAVE-NEXT: 
[[PREDPHI4:%.*]] = select [[TMP20]], [[TMP22]], [[TMP18]] ; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll index 2078a10d04ce7..ce3b2a9f216f2 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -23,11 +23,11 @@ define i32 @mla_i32(ptr noalias nocapture readonly %A, ptr noalias nocapture rea ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) @@ -105,11 +105,11 @@ define i32 @mla_i8(ptr noalias nocapture readonly %A, ptr noalias nocapture read ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, 
ptr [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll index a11cc15a8a85b..d021306b89aab 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -646,12 +646,11 @@ define i64 @mla_i16_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i3 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 -; CHECK-NEXT: 
[[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <8 x i64> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -726,12 +725,11 @@ define i64 @mla_i8_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i32> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = zext nneg <8 x i32> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -855,10 +853,10 @@ define i32 @mla_i16_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i3 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> 
@llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) @@ -910,10 +908,10 @@ define i32 @mla_i8_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: 
[[TMP4:%.*]] = mul nuw nsw <16 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) @@ -1016,10 +1014,10 @@ define signext i16 @mla_i8_i16(ptr nocapture readonly %x, ptr nocapture readonly ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw <16 x i16> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP4]], <16 x i16> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP5]]) @@ -1122,10 +1120,10 @@ define i32 @red_mla_ext_s8_s16_s32(ptr noalias nocapture readonly %A, ptr noalia ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP0]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> 
[[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) @@ -1459,9 +1457,8 @@ define i64 @mla_xx_sext_zext(ptr nocapture noundef readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -1528,11 +1525,11 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x 
i16> [[WIDE_LOAD]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <8 x i64> [[TMP2]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -1667,24 +1664,55 @@ define i64 @test_std_q31(ptr %x, i32 %n) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP11]]) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644 ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[WIDE_LOAD]], splat (i32 8) +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP5:%.*]] 
= sext <4 x i32> [[TMP1]] to <4 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP5]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) +; CHECK-NEXT: [[TMP8]] = add i64 [[TMP7]], [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD3:%.*]], [[ADD:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = phi i64 [ [[ADD1:%.*]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD3:%.*]] = phi i64 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD3]], [[ADD]] ; CHECK-NEXT: ret i64 [[DIV]] ; CHECK: for.body: -; CHECK-NEXT: [[S_014:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY1]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY1]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[T_012:%.*]] = phi i64 [ [[ADD3]], [[FOR_BODY1]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X:%.*]], i32 [[I_013]] +; CHECK-NEXT: [[S_014:%.*]] = phi i64 [ [[ADD1]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: 
[[T_012:%.*]] = phi i64 [ [[ADD5]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i32 [[I_013]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP0]], 8 ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[SHR]] to i64 -; CHECK-NEXT: [[ADD]] = add nsw i64 [[S_014]], [[CONV]] +; CHECK-NEXT: [[ADD1]] = add nsw i64 [[S_014]], [[CONV]] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV]] -; CHECK-NEXT: [[ADD3]] = add nuw nsw i64 [[MUL]], [[T_012]] +; CHECK-NEXT: [[ADD5]] = add nuw nsw i64 [[MUL]], [[T_012]] ; CHECK-NEXT: [[ADD4]] = add nuw nsw i32 [[I_013]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[ADD4]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY1]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; entry: %cmp11 = icmp sgt i32 %n, 0 @@ -1720,10 +1748,10 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[N]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 7 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 15 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -4 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -8 ; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -1731,28 +1759,26 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr 
[[X:%.*]], i32 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2 -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i32> [[TMP7]] to <4 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP13]] -; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i32> [[TMP11]] to <4 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP8]]) +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i64> +; 
CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[STRIDED_VEC]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <8 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]]) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP12]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = mul nsw <8 x i64> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP13]]) ; CHECK-NEXT: [[TMP16]] = add i64 [[TMP15]], [[TMP10]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] @@ -1787,7 +1813,7 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 { ; CHECK-NEXT: [[ADD12]] = add nsw i64 [[ADD]], [[CONV11]] ; CHECK-NEXT: [[ADD13]] = add nuw nsw i32 [[I_025]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD13]], [[N]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP40:![0-9]+]] ; entry: %cmp23 = icmp sgt i32 %n, 0 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll new file mode 100644 index 
0000000000000..4de0e666149f3 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify)' < %s | FileCheck %s +; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify),function(simplifycfg,dce)' < %s | FileCheck %s --check-prefix=LOOP-DEL + +define void @simple(ptr noalias %a, ptr noalias %b, %c, i64 %N) vscale_range(2, 1024) { +; CHECK-LABEL: define void @simple( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP13]] +; 
CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP12]]) +; CHECK-NEXT: [[TMP18:%.*]] = add nsw [[C]], [[VP_OP_LOAD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0 +; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP18]], ptr align 4 [[TMP20]], splat (i1 true), i32 [[TMP12]]) +; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[ADD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; LOOP-DEL-LABEL: define void @simple( +; LOOP-DEL-SAME: ptr 
noalias [[A:%.*]], ptr noalias [[B:%.*]], [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; LOOP-DEL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] +; LOOP-DEL: vector.ph: +; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]] +; LOOP-DEL: vector.body: +; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; LOOP-DEL-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; LOOP-DEL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP4]], i32 4, i1 true) +; LOOP-DEL-NEXT: [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]] +; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; LOOP-DEL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], splat (i1 true), i32 [[TMP5]]) +; LOOP-DEL-NEXT: [[TMP11:%.*]] = add nsw [[C]], [[VP_OP_LOAD1]] +; LOOP-DEL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] +; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 +; LOOP-DEL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP11]], ptr align 4 [[TMP13]], splat (i1 true), i32 [[TMP5]]) +; LOOP-DEL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64 +; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]] +; LOOP-DEL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; LOOP-DEL: for.body: +; LOOP-DEL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] 
= getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; LOOP-DEL-NEXT: [[ADD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; LOOP-DEL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; LOOP-DEL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; LOOP-DEL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; LOOP-DEL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; LOOP-DEL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; LOOP-DEL: for.cond.cleanup: +; LOOP-DEL-NEXT: ret void +; +entry: + %0 = sub i64 -1, %N + %1 = call i64 @llvm.vscale.i64() + %2 = mul i64 %1, 4 + %3 = icmp ult i64 %0, %2 + br i1 %3, label %scalar.ph, label %vector.ph + +vector.ph: ; preds = %entry + %4 = call i64 @llvm.vscale.i64() + %5 = mul i64 %4, 4 + %6 = call i64 @llvm.vscale.i64() + %7 = mul i64 %6, 4 + %8 = sub i64 %7, 1 + %n.rnd.up = add i64 %N, %8 + %n.mod.vf = urem i64 %n.rnd.up, %5 + %n.vec = sub i64 %n.rnd.up, %n.mod.vf + %9 = call i64 @llvm.vscale.i64() + %10 = mul i64 %9, 4 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ] + %11 = sub i64 %N, %evl.based.iv + %12 = call i32 @llvm.experimental.get.vector.length.i64(i64 %11, i32 4, i1 true) + %13 = add i64 %evl.based.iv, 0 + %14 = getelementptr inbounds i32, ptr %b, i64 %13 + %15 = getelementptr inbounds i32, ptr %14, i32 0 + %vp.op.load = call @llvm.vp.load.nxv4i32.p0(ptr align 4 %15, splat (i1 true), i32 %12) + %18 = add nsw %c, %vp.op.load + %19 = getelementptr inbounds i32, ptr %a, i64 %13 + %20 = getelementptr inbounds i32, ptr %19, i32 0 + call void @llvm.vp.store.nxv4i32.p0( %18, ptr align 4 %20, splat (i1 true), i32 %12) + %21 = zext i32 %12 to i64 + %index.evl.next = add i64 %21, %evl.based.iv + %index.next = add nuw i64 %index, %10 + %22 = icmp eq i64 
%index.next, %n.vec + br i1 %22, label %middle.block, label %vector.body, !llvm.loop !0 + +middle.block: ; preds = %vector.body + br i1 true, label %for.cond.cleanup, label %scalar.ph + +scalar.ph: ; preds = %entry, %middle.block + %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %entry ] + br label %for.body + +for.body: ; preds = %for.body, %scalar.ph + %iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv + %23 = load i32, ptr %arrayidx, align 4 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv + store i32 %23, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !3 + +for.cond.cleanup: ; preds = %middle.block, %for.body + ret void +} + +; Fixed IV steps resulting from vscale_range with a single element + +define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 { +; CHECK-LABEL: define void @fixed_iv_step( +; CHECK-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[ARG0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]] +; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: 
[[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK: for.end.loopexit5: +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +; LOOP-DEL-LABEL: define void @fixed_iv_step( +; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[ARG0]], i64 0 +; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]] +; LOOP-DEL: vector.body: +; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true) +; LOOP-DEL-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]] +; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], splat (i1 true), i32 [[TMP1]]) +; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]] +; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]] +; LOOP-DEL: for.end: +; LOOP-DEL-NEXT: ret void +; +entry: + br label %vector.ph + +vector.ph: + %n.rnd.up = add nsw i64 %N, 15 + %n.vec = and i64 %n.rnd.up, -16 + %broadcast.splatinsert = insertelement poison, ptr %arg0, i64 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer + br label %vector.body + 
+vector.body: + %lsr.iv32 = phi i64 [ %lsr.iv.next33, %vector.body ], [ %n.vec, %vector.ph ] + %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ] + %41 = sub i64 %N, %evl.based.iv + %42 = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true) + %gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv + tail call void @llvm.vp.store.nxv2p0.p0( %broadcast.splat, ptr align 8 %gep, splat (i1 true), i32 %42) + %43 = zext i32 %42 to i64 + %index.evl.next = add i64 %evl.based.iv, %43 + %lsr.iv.next33 = add i64 %lsr.iv32, -16 + %44 = icmp eq i64 %lsr.iv.next33, 0 + br i1 %44, label %for.end.loopexit5, label %vector.body, !llvm.loop !3 + +for.end.loopexit5: + br label %for.end + +for.end: + ret void +} + +; Fixed IV step and trip count +define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 { +; CHECK-LABEL: define void @fixed_iv_step_tc( +; CHECK-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[ARG0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 87, [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]] +; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87 +; 
CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]] +; CHECK: for.end.loopexit5: +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +; LOOP-DEL-LABEL: define void @fixed_iv_step_tc( +; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]]) #[[ATTR1]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[ARG0]], i64 0 +; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]] +; LOOP-DEL: vector.body: +; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 87, [[EVL_BASED_IV]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true) +; LOOP-DEL-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]] +; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], splat (i1 true), i32 [[TMP1]]) +; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]] +; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87 +; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]] +; LOOP-DEL: for.end: +; LOOP-DEL-NEXT: ret void +; +entry: + br label %vector.ph + +vector.ph: + %n.rnd.up = add nsw i64 87, 15 + %n.vec = and i64 %n.rnd.up, -16 + %broadcast.splatinsert = insertelement poison, ptr %arg0, i64 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer + br label %vector.body + +vector.body: + %lsr.iv32 = phi i64 [ %lsr.iv.next33, %vector.body ], [ %n.vec, %vector.ph ] + %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ] + %41 = sub i64 87, %evl.based.iv + %42 
= tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true) + %gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv + tail call void @llvm.vp.store.nxv2p0.p0( %broadcast.splat, ptr align 8 %gep, splat (i1 true), i32 %42) + %43 = zext i32 %42 to i64 + %index.evl.next = add i64 %evl.based.iv, %43 + %lsr.iv.next33 = add i64 %lsr.iv32, -16 + %44 = icmp eq i64 %lsr.iv.next33, 0 + br i1 %44, label %for.end.loopexit5, label %vector.body, !llvm.loop !3 + +for.end.loopexit5: + br label %for.end + +for.end: + ret void +} + +declare i64 @llvm.vscale.i64() + +declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg) + +declare @llvm.vp.load.nxv4i32.p0(ptr nocapture, , i32) + +declare void @llvm.vp.store.nxv4i32.p0(, ptr nocapture, , i32) + +attributes #0 = { vscale_range(8,8) } + +!0 = distinct !{!0, !1, !2, !4} +!1 = !{!"llvm.loop.isvectorized", i32 1} +!2 = !{!"llvm.loop.unroll.runtime.disable"} +!3 = distinct !{!3, !2, !1, !4} +!4 = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META3]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]], [[META3]]} +;. +; LOOP-DEL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; LOOP-DEL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; LOOP-DEL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; LOOP-DEL: [[META3]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} +; LOOP-DEL: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]], [[META3]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll index ebbc41f034bd1..88d9ed2ce201e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll @@ -46,8 +46,8 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64 ; CHECK-NEXT: [[TMP20:%.*]] = xor [[TMP14]], splat (i1 true) ; CHECK-NEXT: [[TMP21:%.*]] = select [[TMP13]], [[TMP20]], zeroinitializer ; CHECK-NEXT: [[TMP22:%.*]] = or [[TMP19]], [[TMP21]] -; CHECK-NEXT: [[EXT:%.*]] = extractelement [[TMP19]], i32 0 -; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[EXT]], i64 [[INDEX]], i64 poison +; CHECK-NEXT: [[TMP23:%.*]] = extractelement [[TMP21]], i32 0 +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP23]], i64 poison, i64 [[INDEX]] ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[ARG]], i64 [[PREDPHI]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i16, ptr [[TMP24]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv8i16.p0( zeroinitializer, ptr [[TMP25]], i32 2, [[TMP22]]) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll index 9cf7bc9fe07d6..3dc17e615048e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll @@ -20,9 +20,9 @@ define void @test(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 9) -; CHECK-NEXT: [[TMP4:%.*]] = 
icmp slt <16 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <16 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i1> [[TMP4]], <16 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP3]], <16 x i32> [[TMP2]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = shl <16 x i32> [[PREDPHI]], splat (i32 8) ; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i8> ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[ACTIVE_LANE_MASK]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll index 492eb091175e2..1588d02eff3db 100644 --- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -33,10 +33,10 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META3]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], splat (i32 20) +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 19) ; CHECK-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]], splat (i32 4) ; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> splat (i32 4), <4 x i32> splat (i32 5) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP10]], <4 x i32> splat (i32 3) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> splat (i32 3), <4 x i32> [[TMP10]] ; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[PREDPHI]], <4 x i32> splat (i32 9) ; CHECK-NEXT: store <4 x i32> [[PREDPHI3]], ptr 
[[TMP5]], align 4, !alias.scope [[META0]], !noalias [[META3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -141,16 +141,14 @@ define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META12]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 19) -; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) -; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP7]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]], splat (i32 4) ; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> splat (i32 4), <4 x i32> splat (i32 5) ; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> splat (i32 6), <4 x i32> splat (i32 11) ; CHECK-NEXT: [[TMP14:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> splat (i32 3), <4 x i32> splat (i32 9) -; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP12]], <4 x i32> [[PREDPHI]] -; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> splat (i32 7), <4 x i32> splat (i32 18) -; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP13]], <4 x i32> [[PREDPHI4]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> splat (i32 3), <4 x i32> [[TMP12]] +; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[PREDPHI]], <4 x i32> splat (i32 9) +; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> splat (i32 7), <4 x i32> [[TMP13]] +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[PREDPHI4]], <4 x i32> splat (i32 18) ; CHECK-NEXT: store <4 x i32> [[PREDPHI3]], ptr [[TMP5]], align 4, !alias.scope [[META9]], !noalias [[META12]] ; CHECK-NEXT: store <4 x i32> [[PREDPHI5]], ptr [[TMP6]], align 4, 
!alias.scope [[META12]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll index ef4cde4027a9a..e6c2242dd0c4e 100644 --- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll @@ -1207,11 +1207,11 @@ define float @fcmp_multi(ptr nocapture readonly %a, i32 %n) nounwind readonly { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) -; CHECK-NEXT: [[TMP6:%.*]] = fcmp uge <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP6]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], splat (float 2.000000e+00) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP9]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x float> [[TMP9]], <4 x float> [[TMP8]] ; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[WIDE_LOAD]], <4 x float> [[PREDPHI]] ; CHECK-NEXT: [[TMP10]] = fadd fast <4 x float> [[PREDPHI1]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -1335,8 +1335,8 @@ define float @fcmp_fadd_fsub(ptr nocapture readonly %a, i32 %n) nounwind readonl ; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP9:%.*]] = 
select <4 x i1> [[TMP4]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[TMP6]] -; CHECK-NEXT: [[PREDPHI1]] = select <4 x i1> [[TMP9]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP9]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP6]] +; CHECK-NEXT: [[PREDPHI1]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll index 3256b80b20c82..ba85bb4d84f5c 100644 --- a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll @@ -185,10 +185,10 @@ define i32 @test3(i32 %N) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], splat (i32 10) ; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp sle <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP5]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> splat (i32 2), <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> zeroinitializer, <2 x i32> splat (i32 2) +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> splat (i32 1), 
<2 x i32> [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/phi-cost.ll b/llvm/test/Transforms/LoopVectorize/phi-cost.ll index b857385e38535..aee80c9015463 100644 --- a/llvm/test/Transforms/LoopVectorize/phi-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/phi-cost.ll @@ -102,10 +102,10 @@ define void @phi_three_incoming_values(ptr noalias %a, ptr noalias %b, i64 %n) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], splat (i32 20) +; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], splat (i32 19) ; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD2]], splat (i32 4) ; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> splat (i32 4), <2 x i32> splat (i32 5) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> [[TMP8]], <2 x i32> splat (i32 3) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> splat (i32 3), <2 x i32> [[TMP8]] ; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[PREDPHI]], <2 x i32> splat (i32 9) ; CHECK-NEXT: store <2 x i32> [[PREDPHI3]], ptr [[TMP3]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll index 67cf46e7d0f88..bd9647188911a 100644 --- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll +++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll @@ -13,8 +13,6 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK-NEXT: 
[[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i1> poison, i1 [[C_2:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT3]], <2 x i1> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT4]], splat (i1 true) -; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true) -; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP6]], <2 x i1> [[TMP4]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT5]], splat (i32 1) ; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP6]], <2 x i1> [[BROADCAST_SPLAT2]], <2 x i1> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -25,10 +23,10 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 10) ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[TMP0]], splat (i32 20) ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> splat (i32 9), <2 x i32> [[VEC_IND]] -; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> splat (i32 9), <2 x i32> [[PREDPHI]] -; CHECK-NEXT: [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[PREDPHI7]] = select <2 x i1> [[TMP7]], <2 x i32> [[TMP3]], <2 x i32> [[PREDPHI6]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> splat (i32 9), <2 x i32> splat (i32 9) +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_IND]], <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[TMP3]] +; CHECK-NEXT: [[PREDPHI7]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_PHI]], <2 x i32> [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP8:%.*]] = 
icmp eq i32 [[INDEX_NEXT]], 176 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll index 05f26b8a0a273..17e3bb3cce7eb 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll @@ -476,10 +476,10 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK: pred.load.continue8: ; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP38]] ; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP41]]) ; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP42]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP38]] ; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP44]]) ; CHECK-NEXT: [[TMP46]] = add i32 [[TMP45]], [[TMP43]] @@ -1354,17 +1354,17 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP4:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP4:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = fcmp ule <4 x float> 
[[WIDE_LOAD]], splat (float 2.000000e+00) +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], splat (float 2.000000e+00) -; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true) -; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) -; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]] -; CHECK-NEXT: [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]] -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> splat (i1 true), <4 x i1> [[TMP9]] -; CHECK-NEXT: [[PREDPHI3]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i1> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x float> [[TMP9]], <4 x float> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI3]] = select <4 x i1> [[TMP3]], <4 x float> [[PREDPHI2]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index f6a1ebf8b0fe9..9ca7a84b3ea1c 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -225,9 
+225,9 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND]]) ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP6]] = add i32 [[TMP5]], [[TMP4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -689,17 +689,17 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP4:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP4:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD]], splat (float 2.000000e+00) +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], splat (float 2.000000e+00) -; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true) -; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP3]], splat 
(i1 true) -; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]] -; CHECK-NEXT: [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]] -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> splat (i1 true), <4 x i1> [[TMP9]] -; CHECK-NEXT: [[PREDPHI3]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i1> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x float> [[TMP9]], <4 x float> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI3]] = select <4 x i1> [[TMP3]], <4 x float> [[PREDPHI2]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] @@ -1289,15 +1289,13 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i8> [[WIDE_LOAD]], splat (i8 31) ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i8> [[TMP2]], splat (i8 3) ; CHECK-NEXT: [[TMP4:%.*]] = udiv <4 x i8> [[TMP3]], splat (i8 31) ; CHECK-NEXT: [[TMP5:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) ; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[TMP8]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll index 09d3ca0b0bf20..757be041afbb5 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction.ll @@ -760,17 +760,17 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP4:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP4:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD]], splat (float 2.000000e+00) +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], splat (float 2.000000e+00) -; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true) -; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i1> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP3]], 
splat (i1 true) -; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]] -; CHECK-NEXT: [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]] -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> splat (i1 true), <4 x i1> [[TMP9]] -; CHECK-NEXT: [[PREDPHI3]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i1> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x float> [[TMP9]], <4 x float> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI3]] = select <4 x i1> [[TMP3]], <4 x float> [[PREDPHI2]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll index 8d3ffb48a5b6c..d5a206ff21da0 100644 --- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll @@ -105,11 +105,11 @@ define void @single_incoming_phi_with_blend_mask(i64 %a, i64 %b) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [32 x i16], ptr @src, i16 0, i16 [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP5]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = 
select <2 x i1> [[TMP3]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP8]], <2 x i16> zeroinitializer, <2 x i16> [[WIDE_LOAD]] -; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> splat (i16 1), <2 x i16> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> [[WIDE_LOAD]], <2 x i16> splat (i16 1) +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP8]], <2 x i16> zeroinitializer, <2 x i16> [[PREDPHI]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 ; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], ptr [[TMP10]], align 2 @@ -295,11 +295,11 @@ define void @single_incoming_needs_predication(i64 %a, i64 %b) { ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: ; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP17]], <2 x i16> zeroinitializer, <2 x i16> [[TMP14]] -; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP16]], <2 x i16> splat (i16 1), <2 x i16> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP16]], <2 x i16> [[TMP14]], <2 x i16> splat (i16 1) +; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP17]], <2 x i16> zeroinitializer, <2 x i16> [[PREDPHI]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr 
inbounds i16, ptr [[TMP18]], i32 0 ; CHECK-NEXT: store <2 x i16> [[PREDPHI3]], ptr [[TMP19]], align 2 diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll index 40a1eb477a212..70c0483ee9f4b 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll @@ -46,7 +46,7 @@ while.end: define void @reuse_const_btc(ptr %A) { ; CHECK-LABEL: @reuse_const_btc ; CHECK: {{%.*}} = icmp ule <4 x i32> {{%.*}}, splat (i32 13) -; CHECK: {{%.*}} = select <4 x i1> {{%.*}}, <4 x i32> splat (i32 12), <4 x i32> splat (i32 13) +; CHECK: {{%.*}} = select <4 x i1> {{%.*}}, <4 x i32> splat (i32 13), <4 x i32> splat (i32 12) ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll index a030750ed0b6e..70094ed649ec2 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll @@ -131,12 +131,13 @@ define void @blend_chain_iv(i1 %c) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP0]], <4 x i1> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; 
CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[VEC_IND]], <4 x i64> undef +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i64> undef, <4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI]], <4 x i64> undef ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]] diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 2cf630de208c9..da42d62d39c2e 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -268,3 +268,148 @@ loop: exit: ret i64 %cond } + +define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { +; CHECK-LABEL: 'print_extended_reduction' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]> +; CHECK-NEXT: EXTENDED-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> zext to i64) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw 
vp<[[IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ] + %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %iv + %load0 = load i32, ptr %arrayidx, align 4 + %conv0 = zext i32 %load0 to i64 + %rdx.next = add nsw i64 %rdx, %conv0 + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + %r.0.lcssa = phi i64 [ %rdx.next, %loop ] + ret i64 %r.0.lcssa +} + +define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { +; CHECK-LABEL: 'print_mulacc' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]> +; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]> +; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]> +; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> 
= add nuw vp<[[IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ] + %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i64, ptr %x, i32 %iv + %load0 = load i64, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i64, ptr %y, i32 %iv + %load1 = load i64, ptr %arrayidx1, align 4 + %mul = mul nsw i64 %load0, %load1 + %rdx.next = add nsw i64 %rdx, %mul + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + %r.0.lcssa = phi i64 [ %rdx.next, %loop ] + ret i64 %r.0.lcssa +} + +define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { +; CHECK-LABEL: 'print_mulacc_extended' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]> +; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]> +; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]> +; CHECK-NEXT: 
MULACC-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD1]]> sext to i64)) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ] + %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, ptr %x, i32 %iv + %load0 = load i16, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i16, ptr %y, i32 %iv + %load1 = load i16, ptr %arrayidx1, align 4 + %conv0 = sext i16 %load0 to i32 + %conv1 = sext i16 %load1 to i32 + %mul = mul nsw i32 %conv0, %conv1 + %conv = sext i32 %mul to i64 + %rdx.next = add nsw i64 %rdx, %conv + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + %r.0.lcssa = phi i64 [ %rdx.next, %loop ] + ret i64 %r.0.lcssa +} diff --git a/llvm/test/Transforms/ObjCARC/contract-attached-call-retain-to-claim.ll b/llvm/test/Transforms/ObjCARC/contract-attached-call-retain-to-claim.ll new file mode 100644 index 0000000000000..d0b8ce97d6517 --- /dev/null +++ b/llvm/test/Transforms/ObjCARC/contract-attached-call-retain-to-claim.ll @@ -0,0 +1,35 @@ +; RUN: opt -passes=objc-arc-contract -arc-contract-use-objc-claim-rv=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CLAIM +; RUN: opt -passes=objc-arc-contract -arc-contract-use-objc-claim-rv=0 -S < %s | FileCheck %s --check-prefixes=CHECK,RETAIN + +; CHECK-LABEL: define void @test0() { +; CLAIM: %[[CALL:.*]] = notail call ptr @foo() [ "clang.arc.attachedcall"(ptr @llvm.objc.claimAutoreleasedReturnValue) ] +; RETAIN: %[[CALL:.*]] = notail call ptr @foo() [ "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] +; CHECK-NEXT: ret void + +define void @test0() { + %call1 = call ptr @foo() [ "clang.arc.attachedcall"(ptr 
@llvm.objc.retainAutoreleasedReturnValue) ] + ret void +} + +; CHECK-LABEL: define void @test1() { +; CHECK: %[[CALL:.*]] = notail call ptr @foo() [ "clang.arc.attachedcall"(ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue) ] +; CHECK-NEXT: ret void + +define void @test1() { + %call1 = call ptr @foo() [ "clang.arc.attachedcall"(ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue) ] + ret void +} + +; CHECK-LABEL: define void @test2() { +; CLAIM: %[[CALL:.*]] = notail call ptr @foo() [ "clang.arc.attachedcall"(ptr @llvm.objc.claimAutoreleasedReturnValue), "otherbundle"() ] +; RETAIN: %[[CALL:.*]] = notail call ptr @foo() [ "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue), "otherbundle"() ] +; CHECK-NEXT: ret void + +define void @test2() { + %call1 = call ptr @foo() [ "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue), "otherbundle"() ] + ret void +} + +declare ptr @foo() +declare ptr @llvm.objc.retainAutoreleasedReturnValue(ptr) +declare ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue(ptr) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll index 295a71899c338..d8ddfee3ed28b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll @@ -37,19 +37,19 @@ define void @test() { ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[I69]], i32 15 ; CHECK-NEXT: br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]] ; CHECK: [[BB77]]: -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP17]], <8 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> poison, float 
[[I70]], i32 1 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP14]], float [[I68]], i32 2 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP19]], float [[I66]], i32 3 ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP16]], float [[I67]], i32 6 ; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I69]], i32 7 +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0 +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> [[TMP23]], <8 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP25]], <16 x i32> ; CHECK-NEXT: br label %[[BB78:.*]] ; CHECK: [[BB78]]: -; CHECK-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP23]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP30]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ] ; CHECK-NEXT: [[TMP22:%.*]] = phi <8 x float> [ [[TMP21]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ] ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> @@ -58,8 +58,8 @@ define void @test() { ; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]] ; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], poison ; CHECK-NEXT: [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison -; CHECK-NEXT: [[TMP36]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP31]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP36]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> ; CHECK-NEXT: br i1 poison, label 
%[[BB78]], label %[[BB167]] ; CHECK: [[BB167]]: ; CHECK-NEXT: [[TMP32:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP29]], %[[BB78]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll index e3c134b068e04..773b9c069569d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll @@ -5,14 +5,14 @@ define void @test(ptr %0, float %1) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr [[TMP0:%.*]], float [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> , float [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> , float [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[BB8:.*]] ; CHECK: [[BB8]]: ; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP15:%.*]], %[[BB8]] ], [ [[TMP5]], [[TMP2:%.*]] ] -; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP7]], %[[BB8]] ], [ [[TMP4]], [[TMP2]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP7]], %[[BB8]] ], [ [[TMP8]], [[TMP2]] ] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x float> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fadd <4 x float> [[TMP12]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll index 2a54ae9a1e749..8ad05694b5322 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll @@ -6,9 +6,9 @@ define i32 @test(i64 %l.549) { ; CHECK-SAME: i64 [[L_549:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[CONV3:%.*]] = sext i32 0 to i64 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 3 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> poison, i64 [[L_549]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: br label %[[IF_THEN19:.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/long-pointer-distance.ll b/llvm/test/Transforms/SLPVectorizer/X86/long-pointer-distance.ll index 9cfafd2784488..f663d120b136a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/long-pointer-distance.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/long-pointer-distance.ll @@ -5,7 +5,13 @@ define void @test(ptr %this) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr [[THIS:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: store <4 x i64> , ptr [[THIS]], align 8 +; CHECK-NEXT: store i64 1, ptr [[THIS]], align 8 +; CHECK-NEXT: [[B:%.*]] = getelementptr i8, ptr [[THIS]], i64 8 +; CHECK-NEXT: store i64 2, ptr [[B]], align 8 +; CHECK-NEXT: [[C:%.*]] = getelementptr i8, ptr [[THIS]], i64 4294967312 +; CHECK-NEXT: store i64 3, ptr [[C]], align 8 +; CHECK-NEXT: [[D:%.*]] = getelementptr i8, ptr [[THIS]], i64 4294967320 +; CHECK-NEXT: store i64 4, ptr [[D]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-bv-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-bv-schedulable.ll index 
5b936f65a3221..6fa33671a7b53 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-bv-schedulable.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-bv-schedulable.ll @@ -7,11 +7,11 @@ define void @test() { ; CHECK-NEXT: br i1 false, label %[[BB1:.*]], label %[[BB5:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], %[[BB1]] ], [ zeroinitializer, %[[BB]] ] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> , i32 0, i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i32> [[TMP0]], [[TMP4]] ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> , <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> , i32 0, i32 0 ; CHECK-NEXT: [[TMP3]] = or <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> , i32 0, i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i32> [[TMP0]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 ; CHECK-NEXT: [[OR3:%.*]] = or i32 [[TMP6]], 0 ; CHECK-NEXT: br i1 false, label %[[BB1]], label %[[BB5]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll index 289c6002851d7..5c7dc869395b8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll @@ -37,18 +37,18 @@ define i32 @test(i32 %s.0) { ; CHECK: [[IF_THEN18:.*]]: ; CHECK-NEXT: br label %[[T]] ; CHECK: [[T]]: -; CHECK-NEXT: [[TMP30:%.*]] = phi <8 x i32> [ [[TMP27:%.*]], %[[O]] ], [ poison, %[[IF_THEN18]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <8 x i32> [ [[TMP27:%.*]], %[[O]] ], [ poison, %[[IF_THEN18]] ] ; CHECK-NEXT: [[TMP17]] = extractelement <4 x i32> [[TMP23:%.*]], i32 0 ; CHECK-NEXT: br i1 false, label %[[IF_END24]], label %[[K]] ; CHECK: [[IF_END24]]: -; CHECK-NEXT: [[TMP18:%.*]] = phi <8 x i32> [ [[TMP29]], %[[IF_THEN11]] ], [ [[TMP11]], %[[IF_END6]] ], [ [[TMP30]], %[[T]] ] -; CHECK-NEXT: 
[[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = phi <8 x i32> [ [[TMP29]], %[[IF_THEN11]] ], [ [[TMP11]], %[[IF_END6]] ], [ [[TMP19]], %[[T]] ] ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: br label %[[O]] ; CHECK: [[O]]: -; CHECK-NEXT: [[TMP22]] = phi <2 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP19]], %[[IF_END24]] ] ; CHECK-NEXT: [[TMP23]] = phi <4 x i32> [ [[TMP1]], %[[K]] ], [ [[TMP20]], %[[IF_END24]] ] +; CHECK-NEXT: [[TMP22]] = phi <2 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP30]], %[[IF_END24]] ] ; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP21]], %[[IF_END24]] ] ; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i32> [[TMP25]], <8 x i32> , <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll new file mode 100644 index 0000000000000..1c482e079bb0f --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-99999 < %s | FileCheck %s + +define i64 @test() { +; CHECK-LABEL: define i64 @test() { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 0, i32 1 +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB5:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP0]], [[TMP1]] +; 
CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4]] = or <2 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: br label %[[BB5]] +; CHECK: [[BB5]]: +; CHECK-NEXT: br i1 false, label %[[BB6:.*]], label %[[BB1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP2]], %[[BB5]] ] +; CHECK-NEXT: ret i64 0 +; +bb: + br label %bb1 + +bb1: + %phi = phi i32 [ 0, %bb ], [ %or, %bb5 ] + %phi2 = phi i32 [ 0, %bb ], [ %or4, %bb5 ] + %or = or i32 %phi, 0 + %add = add i32 0, 0 + %or3 = or i32 %add, %phi2 + %or4 = or i32 %or3, 0 + br label %bb5 + +bb5: + br i1 false, label %bb6, label %bb1 + +bb6: + %phi7 = phi i32 [ %or, %bb5 ] + %phi8 = phi i32 [ %or3, %bb5 ] + ret i64 0 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-operand-gathered-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-operand-gathered-loads.ll new file mode 100644 index 0000000000000..57eb1e7173618 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-operand-gathered-loads.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=cascadelake < %s | FileCheck %s + +%class.btManifoldPoint = type <{ %class.btVector3, %class.btVector3, %class.btVector3, %class.btVector3, %class.btVector3, float, float, float, i32, i32, i32, i32, [4 x i8], ptr, float, i8, [3 x i8], float, float, i32, %class.btVector3, %class.btVector3, [4 x i8] }> +%class.btVector3 = type { [4 x float] } + +define void @test(ptr %this, i1 %cmp4.not) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[THIS:%.*]], i1 [[CMP4_NOT:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[NEWPT:%.*]] = alloca [[CLASS_BTMANIFOLDPOINT:%.*]], align 8 +; CHECK-NEXT: [[CALL25:%.*]] = load volatile i32, ptr [[NEWPT]], align 4 +; CHECK-NEXT: br i1 [[CMP4_NOT]], label %[[IF_ELSE37:.*]], label 
%[[IF_END46:.*]] +; CHECK: [[IF_ELSE37]]: +; CHECK-NEXT: br label %[[IF_END46]] +; CHECK: [[IF_END46]]: +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i64> [ , %[[IF_ELSE37]] ], [ , %[[ENTRY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[THIS]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP2]], <4 x i64> [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEWPT]], i64 92 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP4]], align 4 +; CHECK-NEXT: ret void +; +entry: + %newPt = alloca %class.btManifoldPoint, align 8 + %call25 = load volatile i32, ptr %newPt, align 4 + br i1 %cmp4.not, label %if.else37, label %if.end46 + +if.else37: ; preds = %entry + br label %if.end46 + +if.end46: ; preds = %if.else37, %entry + %.sink264 = phi i64 [ 160, %if.else37 ], [ 0, %entry ] + %.sink262 = phi i64 [ 0, %if.else37 ], [ 1, %entry ] + %.sink261 = phi i64 [ 1, %if.else37 ], [ 0, %entry ] + %m_partId038 = getelementptr i8, ptr %this, i64 %.sink264 + %m_index042 = getelementptr i8, ptr %this, i64 %.sink262 + %m_index144 = getelementptr i8, ptr %this, i64 %.sink261 + %.sink = load i32, ptr %m_index144, align 4 + %.sink186 = load i32, ptr %m_index042, align 4 + %.sink188 = load i32, ptr %m_partId038, align 4 + %0 = getelementptr i8, ptr %newPt, i64 92 + store i32 %.sink188, ptr %0, align 4 + %1 = getelementptr i8, ptr %newPt, i64 96 + store i32 %.sink, ptr %1, align 8 + %2 = getelementptr i8, ptr %newPt, i64 100 + store i32 %.sink186, ptr %2, align 4 + %3 = getelementptr i8, ptr %newPt, i64 104 + store i32 %.sink, ptr %3, align 8 + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll index 2612a21b9eedf..a08d0b1ab16cc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll @@ -16,10 +16,10 @@ define i32 @test(i1 %cond) { ; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[OR92]] = or i32 1, 0 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP6]], [[OR92]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> , i32 [[OR92]], i32 0 ; CHECK-NEXT: [[TMP8]] = xor <2 x i32> [[TMP9]], [[TMP7]] -; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP6]], [[OR92]] ; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[BB]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret i32 [[OP_RDX]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll index 5aa4dba2b8a1b..f2b9d6329bfdf 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-num-operands.ll @@ -15,8 +15,8 @@ define i64 @Foo(ptr align 8 dereferenceable(344) %0, i64 %1) { ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> , i64 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0 ; CHECK-NEXT: br label %[[BB16:.*]] ; CHECK: [[BB16]]: ; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i64> [ [[TMP11]], [[TMP2:%.*]] ], [ 
zeroinitializer, %[[TMP25:.*]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll index 36dbeed9bbcd5..ebe3eb6b53358 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec.ll @@ -234,12 +234,12 @@ define void @test7() { define void @test8() { ; CHECK-LABEL: @test8( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP4]], <2 x float> zeroinitializer, i64 2) ; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> poison, <2 x float> zeroinitializer, i64 0) ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP0]], <2 x float> zeroinitializer, i64 2) ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP1]], <2 x float> zeroinitializer, i64 4) ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP2]], <2 x float> zeroinitializer, i64 6) -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP4]], <2 x float> zeroinitializer, i64 2) ; CHECK-NEXT: br i1 false, label [[FOR0:%.*]], label [[FOR_BODY:%.*]] ; CHECK: for0: ; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x float> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/SafeStack/AArch64/abi.ll b/llvm/test/Transforms/SafeStack/AArch64/abi.ll index 6d4ca03096825..70e1ba605c1d6 100644 --- a/llvm/test/Transforms/SafeStack/AArch64/abi.ll +++ b/llvm/test/Transforms/SafeStack/AArch64/abi.ll @@ -4,7 +4,7 @@ define void @foo() nounwind uwtable safestack { entry: -; CHECK: 
%[[TP:.*]] = call ptr @llvm.thread.pointer() +; CHECK: %[[TP:.*]] = call ptr @llvm.thread.pointer.p0() ; CHECK: %[[SPA0:.*]] = getelementptr i8, ptr %[[TP]], i32 72 ; CHECK: %[[USP:.*]] = load ptr, ptr %[[SPA0]] ; CHECK: %[[USST:.*]] = getelementptr i8, ptr %[[USP]], i32 -16 diff --git a/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll b/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll index 282d8c4390b65..43fb2605ff646 100644 --- a/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll +++ b/llvm/test/Transforms/SafeStack/AArch64/abi_ssp.ll @@ -6,9 +6,9 @@ define void @foo() nounwind uwtable safestack sspreq { entry: ; The first @llvm.thread.pointer is for the unsafe stack pointer, skip it. -; TLS: call ptr @llvm.thread.pointer() +; TLS: call ptr @llvm.thread.pointer.p0() -; TLS: %[[TP2:.*]] = call ptr @llvm.thread.pointer() +; TLS: %[[TP2:.*]] = call ptr @llvm.thread.pointer.p0() ; ANDROID: %[[B:.*]] = getelementptr i8, ptr %[[TP2]], i32 40 ; FUCHSIA: %[[B:.*]] = getelementptr i8, ptr %[[TP2]], i32 -16 ; TLS: %[[StackGuard:.*]] = load ptr, ptr %[[B]] diff --git a/llvm/test/Transforms/SafeStack/AArch64/unreachable.ll b/llvm/test/Transforms/SafeStack/AArch64/unreachable.ll index 23fd3bf9d8f21..befdc634b73e5 100644 --- a/llvm/test/Transforms/SafeStack/AArch64/unreachable.ll +++ b/llvm/test/Transforms/SafeStack/AArch64/unreachable.ll @@ -3,7 +3,7 @@ define void @foo() nounwind uwtable safestack { entry: -; CHECK: %[[TP:.*]] = call ptr @llvm.thread.pointer() +; CHECK: %[[TP:.*]] = call ptr @llvm.thread.pointer.p0() ; CHECK: %[[SPA0:.*]] = getelementptr i8, ptr %[[TP]], i32 72 ; CHECK: %[[USP:.*]] = load ptr, ptr %[[SPA0]] ; CHECK: %[[USST:.*]] = getelementptr i8, ptr %[[USP]], i32 -16 diff --git a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-begin.ll b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-begin.ll index b6adf1b40acf7..2654b70b90e85 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-begin.ll +++ 
b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-begin.ll @@ -9,28 +9,28 @@ target datalayout = "e-p:64:64" ;; preserve alignment. Making them i16s allows them to stay at the beginning of ;; the vtable. There are other tests where there's a mix of constants before and ;; after the vtable but for this file we just want everything before the vtable. -; CHECK: [[VT1DATA:@[^ ]*]] = private constant { [8 x i8], [3 x ptr], [0 x i8] } { [8 x i8] c"\00\00\00\00\00\03\00\02", [3 x ptr] [ptr @vf0i1, ptr @vf1i1, ptr @vf1i16], [0 x i8] zeroinitializer }, section "vt1sec", !type [[T8:![0-9]+]] +; CHECK: [[VT1DATA:@[^ ]*]] = private constant { [8 x i8], [3 x ptr], [0 x i8] } { [8 x i8] c"\00\00\00\00\03\00\00\02", [3 x ptr] [ptr @vf0i1, ptr @vf1i1, ptr @vf1i16], [0 x i8] zeroinitializer }, section "vt1sec", !type [[T8:![0-9]+]] @vt1 = constant [3 x ptr] [ ptr @vf0i1, ptr @vf1i1, ptr @vf1i16 ], section "vt1sec", !type !0 -; CHECK: [[VT2DATA:@[^ ]*]] = private constant { [8 x i8], [3 x ptr], [0 x i8] } { [8 x i8] c"\00\00\00\00\00\04\00\01", [3 x ptr] [ptr @vf1i1, ptr @vf0i1, ptr @vf2i16], [0 x i8] zeroinitializer }, !type [[T8]] +; CHECK: [[VT2DATA:@[^ ]*]] = private constant { [8 x i8], [3 x ptr], [0 x i8] } { [8 x i8] c"\00\00\00\00\04\00\00\01", [3 x ptr] [ptr @vf1i1, ptr @vf0i1, ptr @vf2i16], [0 x i8] zeroinitializer }, !type [[T8]] @vt2 = constant [3 x ptr] [ ptr @vf1i1, ptr @vf0i1, ptr @vf2i16 ], !type !0 -; CHECK: [[VT3DATA:@[^ ]*]] = private constant { [4 x i8], [3 x ptr], [0 x i8] } { [4 x i8] c"\00\05\00\02", [3 x ptr] [ptr @vf0i1, ptr @vf1i1, ptr @vf3i16], [0 x i8] zeroinitializer }, align 2, !type [[T5:![0-9]+]] +; CHECK: [[VT3DATA:@[^ ]*]] = private constant { [4 x i8], [3 x ptr], [0 x i8] } { [4 x i8] c"\05\00\00\02", [3 x ptr] [ptr @vf0i1, ptr @vf1i1, ptr @vf3i16], [0 x i8] zeroinitializer }, align 2, !type [[T5:![0-9]+]] @vt3 = constant [3 x ptr] [ ptr @vf0i1, ptr @vf1i1, ptr @vf3i16 ], align 2, !type !0 -; CHECK: [[VT4DATA:@[^ ]*]] = private constant 
{ [16 x i8], [3 x ptr], [0 x i8] } { [16 x i8] c"\00\00\00\00\00\00\00\00\00\00\00\00\00\06\00\01", [3 x ptr] [ptr @vf1i1, ptr @vf0i1, ptr @vf4i16], [0 x i8] zeroinitializer }, align 16, !type [[T16:![0-9]+]] +; CHECK: [[VT4DATA:@[^ ]*]] = private constant { [16 x i8], [3 x ptr], [0 x i8] } { [16 x i8] c"\00\00\00\00\00\00\00\00\00\00\00\00\06\00\00\01", [3 x ptr] [ptr @vf1i1, ptr @vf0i1, ptr @vf4i16], [0 x i8] zeroinitializer }, align 16, !type [[T16:![0-9]+]] @vt4 = constant [3 x ptr] [ ptr @vf1i1, ptr @vf0i1, @@ -136,7 +136,7 @@ define i16 @call3(ptr %obj) { call void @llvm.assume(i1 %p) %fptrptr = getelementptr [3 x ptr], ptr %vtable, i16 0, i16 2 %fptr = load ptr, ptr %fptrptr - ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -3 + ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -4 ; CHECK: [[VTLOAD3:%[^ ]*]] = load i16, ptr [[VTGEP3]] %result = call i16 %fptr(ptr %obj) ; CHECK: ret i16 [[VTLOAD3]] diff --git a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll index 40adabbe38400..d8f5c912e9a50 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll @@ -37,28 +37,28 @@ target triple = "x86_64-unknown-linux-gnu" ; SKIP-ALL-NOT: devirtualized -; CHECK: [[VT1DATA:@[^ ]*]] = private constant { [8 x i8], [3 x ptr], [0 x i8] } { [8 x i8] c"\00\00\00\01\00\00\00\02", [3 x ptr] [ptr @vf0i1, ptr @vf1i1, ptr @vf1i32], [0 x i8] zeroinitializer }, section "vt1sec", !type [[T8:![0-9]+]] +; CHECK: [[VT1DATA:@[^ ]*]] = private constant { [8 x i8], [3 x ptr], [4 x i8] } { [8 x i8] c"\00\00\00\00\00\00\00\02", [3 x ptr] [ptr @vf0i1, ptr @vf1i1, ptr @vf1i32], [4 x i8] c"\01\00\00\00" }, section "vt1sec", !type [[T8:![0-9]+]] @vt1 = constant [3 x ptr] [ ptr @vf0i1, ptr @vf1i1, ptr @vf1i32 ], section "vt1sec", !type !0 -; CHECK: [[VT2DATA:@[^ ]*]] = private constant 
{ [8 x i8], [3 x ptr], [0 x i8] } { [8 x i8] c"\00\00\00\02\00\00\00\01", [3 x ptr] [ptr @vf1i1, ptr @vf0i1, ptr @vf2i32], [0 x i8] zeroinitializer }, !type [[T8]] +; CHECK: [[VT2DATA:@[^ ]*]] = private constant { [8 x i8], [3 x ptr], [4 x i8] } { [8 x i8] c"\00\00\00\00\00\00\00\01", [3 x ptr] [ptr @vf1i1, ptr @vf0i1, ptr @vf2i32], [4 x i8] c"\02\00\00\00" }, !type [[T8]] @vt2 = constant [3 x ptr] [ ptr @vf1i1, ptr @vf0i1, ptr @vf2i32 ], !type !0 -; CHECK: [[VT3DATA:@[^ ]*]] = private constant { [8 x i8], [3 x ptr], [0 x i8] } { [8 x i8] c"\00\00\00\03\00\00\00\02", [3 x ptr] [ptr @vf0i1, ptr @vf1i1, ptr @vf3i32], [0 x i8] zeroinitializer }, !type [[T8]] +; CHECK: [[VT3DATA:@[^ ]*]] = private constant { [8 x i8], [3 x ptr], [4 x i8] } { [8 x i8] c"\00\00\00\00\00\00\00\02", [3 x ptr] [ptr @vf0i1, ptr @vf1i1, ptr @vf3i32], [4 x i8] c"\03\00\00\00" }, !type [[T8]] @vt3 = constant [3 x ptr] [ ptr @vf0i1, ptr @vf1i1, ptr @vf3i32 ], !type !0 -; CHECK: [[VT4DATA:@[^ ]*]] = private constant { [8 x i8], [3 x ptr], [0 x i8] } { [8 x i8] c"\00\00\00\04\00\00\00\01", [3 x ptr] [ptr @vf1i1, ptr @vf0i1, ptr @vf4i32], [0 x i8] zeroinitializer }, !type [[T8]] +; CHECK: [[VT4DATA:@[^ ]*]] = private constant { [8 x i8], [3 x ptr], [4 x i8] } { [8 x i8] c"\00\00\00\00\00\00\00\01", [3 x ptr] [ptr @vf1i1, ptr @vf0i1, ptr @vf4i32], [4 x i8] c"\04\00\00\00" }, !type [[T8]] @vt4 = constant [3 x ptr] [ ptr @vf1i1, ptr @vf0i1, @@ -95,10 +95,10 @@ i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf0i1 to i64), i64 p i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf4i32 to i64), i64 ptrtoint (ptr @vt7_rel to i64)) to i32) ], !type !1 -; CHECK: @vt1 = alias [3 x ptr], getelementptr inbounds ({ [8 x i8], [3 x ptr], [0 x i8] }, ptr [[VT1DATA]], i32 0, i32 1) -; CHECK: @vt2 = alias [3 x ptr], getelementptr inbounds ({ [8 x i8], [3 x ptr], [0 x i8] }, ptr [[VT2DATA]], i32 0, i32 1) -; CHECK: @vt3 = alias [3 x ptr], getelementptr inbounds ({ [8 x i8], [3 x ptr], [0 x i8] 
}, ptr [[VT3DATA]], i32 0, i32 1) -; CHECK: @vt4 = alias [3 x ptr], getelementptr inbounds ({ [8 x i8], [3 x ptr], [0 x i8] }, ptr [[VT4DATA]], i32 0, i32 1) +; CHECK: @vt1 = alias [3 x ptr], getelementptr inbounds ({ [8 x i8], [3 x ptr], [4 x i8] }, ptr [[VT1DATA]], i32 0, i32 1) +; CHECK: @vt2 = alias [3 x ptr], getelementptr inbounds ({ [8 x i8], [3 x ptr], [4 x i8] }, ptr [[VT2DATA]], i32 0, i32 1) +; CHECK: @vt3 = alias [3 x ptr], getelementptr inbounds ({ [8 x i8], [3 x ptr], [4 x i8] }, ptr [[VT3DATA]], i32 0, i32 1) +; CHECK: @vt4 = alias [3 x ptr], getelementptr inbounds ({ [8 x i8], [3 x ptr], [4 x i8] }, ptr [[VT4DATA]], i32 0, i32 1) ; CHECK: @vt6_rel = alias [3 x i32], getelementptr inbounds ({ [4 x i8], [3 x i32], [0 x i8] }, ptr [[VT6RELDATA]], i32 0, i32 1) ; CHECK: @vt7_rel = alias [3 x i32], getelementptr inbounds ({ [4 x i8], [3 x i32], [0 x i8] }, ptr [[VT7RELDATA]], i32 0, i32 1) @@ -165,7 +165,7 @@ define i32 @call3(ptr %obj) { %vtable = load ptr, ptr %obj %pair = call {ptr, i1} @llvm.type.checked.load(ptr %vtable, i32 16, metadata !"typeid") %fptr = extractvalue {ptr, i1} %pair, 0 - ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -5 + ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 24 ; CHECK: [[VTLOAD3:%[^ ]*]] = load i32, ptr [[VTGEP3]] %result = call i32 %fptr(ptr %obj) ; CHECK: ret i32 [[VTLOAD3]] diff --git a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-end.ll b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-end.ll index e0f9b6dbe2ac5..dd91ff6e6f3aa 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-end.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-end.ll @@ -2,7 +2,7 @@ target datalayout = "e-p:64:64" -; CHECK: [[VT1DATA:@[^ ]*]] = private constant { [0 x i8], [4 x ptr], [5 x i8] } { [0 x i8] zeroinitializer, [4 x ptr] [ptr null, ptr @vf0i1, ptr @vf1i1, ptr @vf1i32], [5 x i8] c"\02\03\00\00\00" }, !type [[T8:![0-9]+]] +; CHECK: 
[[VT1DATA:@[^ ]*]] = private constant { [0 x i8], [4 x ptr], [8 x i8] } { [0 x i8] zeroinitializer, [4 x ptr] [ptr null, ptr @vf0i1, ptr @vf1i1, ptr @vf1i32], [8 x i8] c"\02\00\00\00\03\00\00\00" }, !type [[T8:![0-9]+]] @vt1 = constant [4 x ptr] [ ptr null, ptr @vf0i1, @@ -10,14 +10,14 @@ ptr @vf1i1, ptr @vf1i32 ], !type !1 -; CHECK: [[VT2DATA:@[^ ]*]] = private constant { [0 x i8], [3 x ptr], [5 x i8] } { [0 x i8] zeroinitializer, [3 x ptr] [ptr @vf1i1, ptr @vf0i1, ptr @vf2i32], [5 x i8] c"\01\04\00\00\00" }, !type [[T0:![0-9]+]] +; CHECK: [[VT2DATA:@[^ ]*]] = private constant { [0 x i8], [3 x ptr], [8 x i8] } { [0 x i8] zeroinitializer, [3 x ptr] [ptr @vf1i1, ptr @vf0i1, ptr @vf2i32], [8 x i8] c"\01\00\00\00\04\00\00\00" }, !type [[T0:![0-9]+]] @vt2 = constant [3 x ptr] [ ptr @vf1i1, ptr @vf0i1, ptr @vf2i32 ], !type !0 -; CHECK: [[VT3DATA:@[^ ]*]] = private constant { [0 x i8], [4 x ptr], [5 x i8] } { [0 x i8] zeroinitializer, [4 x ptr] [ptr null, ptr @vf0i1, ptr @vf1i1, ptr @vf3i32], [5 x i8] c"\02\05\00\00\00" }, !type [[T8]] +; CHECK: [[VT3DATA:@[^ ]*]] = private constant { [0 x i8], [4 x ptr], [8 x i8] } { [0 x i8] zeroinitializer, [4 x ptr] [ptr null, ptr @vf0i1, ptr @vf1i1, ptr @vf3i32], [8 x i8] c"\02\00\00\00\05\00\00\00" }, !type [[T8]] @vt3 = constant [4 x ptr] [ ptr null, ptr @vf0i1, @@ -25,7 +25,7 @@ ptr @vf1i1, ptr @vf3i32 ], !type !1 -; CHECK: [[VT4DATA:@[^ ]*]] = private constant { [0 x i8], [3 x ptr], [5 x i8] } { [0 x i8] zeroinitializer, [3 x ptr] [ptr @vf1i1, ptr @vf0i1, ptr @vf4i32], [5 x i8] c"\01\06\00\00\00" }, !type [[T0]] +; CHECK: [[VT4DATA:@[^ ]*]] = private constant { [0 x i8], [3 x ptr], [8 x i8] } { [0 x i8] zeroinitializer, [3 x ptr] [ptr @vf1i1, ptr @vf0i1, ptr @vf4i32], [8 x i8] c"\01\00\00\00\06\00\00\00" }, !type [[T0]] @vt4 = constant [3 x ptr] [ ptr @vf1i1, ptr @vf0i1, @@ -57,10 +57,10 @@ i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf1i1 to i64), i64 p i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent 
@vf4i32 to i64), i64 ptrtoint (ptr @vt6_rel to i64)) to i32) ], !type !2 -; CHECK: @vt1 = alias [4 x ptr], getelementptr inbounds ({ [0 x i8], [4 x ptr], [5 x i8] }, ptr [[VT1DATA]], i32 0, i32 1) -; CHECK: @vt2 = alias [3 x ptr], getelementptr inbounds ({ [0 x i8], [3 x ptr], [5 x i8] }, ptr [[VT2DATA]], i32 0, i32 1) -; CHECK: @vt3 = alias [4 x ptr], getelementptr inbounds ({ [0 x i8], [4 x ptr], [5 x i8] }, ptr [[VT3DATA]], i32 0, i32 1) -; CHECK: @vt4 = alias [3 x ptr], getelementptr inbounds ({ [0 x i8], [3 x ptr], [5 x i8] }, ptr [[VT4DATA]], i32 0, i32 1) +; CHECK: @vt1 = alias [4 x ptr], getelementptr inbounds ({ [0 x i8], [4 x ptr], [8 x i8] }, ptr [[VT1DATA]], i32 0, i32 1) +; CHECK: @vt2 = alias [3 x ptr], getelementptr inbounds ({ [0 x i8], [3 x ptr], [8 x i8] }, ptr [[VT2DATA]], i32 0, i32 1) +; CHECK: @vt3 = alias [4 x ptr], getelementptr inbounds ({ [0 x i8], [4 x ptr], [8 x i8] }, ptr [[VT3DATA]], i32 0, i32 1) +; CHECK: @vt4 = alias [3 x ptr], getelementptr inbounds ({ [0 x i8], [3 x ptr], [8 x i8] }, ptr [[VT4DATA]], i32 0, i32 1) define i1 @vf0i1(ptr %this) readnone { ret i1 0 @@ -124,7 +124,7 @@ define i32 @call3(ptr %obj) { call void @llvm.assume(i1 %p) %fptrptr = getelementptr [3 x ptr], ptr %vtable, i32 0, i32 2 %fptr = load ptr, ptr %fptrptr - ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 25 + ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 28 ; CHECK: [[VTLOAD3:%[^ ]*]] = load i32, ptr [[VTGEP3]] %result = call i32 %fptr(ptr %obj) ; CHECK: ret i32 [[VTLOAD3]] diff --git a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-small-alignment-32.ll b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-small-alignment-32.ll index fd703712ceb2c..ab76f2c22e343 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-small-alignment-32.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-small-alignment-32.ll @@ -5,8 +5,8 @@ target datalayout = "e-p:32:32" ;; Constant 
propagation should be agnostic towards sections. ;; Also the new global should be in the original vtable's section. -; CHECK: [[VT1DATA:@[^ ]*]] = {{.*}} { [8 x i8], [3 x ptr], [0 x i8] } -; CHECK-SAME: [8 x i8] c"\00\00\01\00\00\00\03\00", +; CHECK: [[VT1DATA:@[^ ]*]] = {{.*}} { [4 x i8], [3 x ptr], [0 x i8] } +; CHECK-SAME: [4 x i8] c"\00\00\03\00", ; CHECK-SAME: }, section "vt1sec", !type [[T8:![0-9]+]] @vt1 = constant [3 x ptr] [ ptr @vf0i1, @@ -23,8 +23,8 @@ ptr @vf1i32 ;; according to the datalayout, this could result in an unaligned load. ;; 2. The call instruction in @call3 is replaced with a GEP + load. ;; -; CHECK: [[VT2DATA:@[^ ]*]] = {{.*}} { [8 x i8], [3 x ptr], [0 x i8] } -; CHECK-SAME: [8 x i8] c"\00\00\02\00\00\00\02\01", +; CHECK: [[VT2DATA:@[^ ]*]] = {{.*}} { [4 x i8], [3 x ptr], [0 x i8] } +; CHECK-SAME: [4 x i8] c"\00\00\02\01", ; CHECK-SAME: !type [[T8]] @vt2 = constant [3 x ptr] [ ptr @vf1i1, @@ -37,8 +37,8 @@ ptr @vf2i32 ;; All the functions returning i8s and i1s should still be constant-propagated ;; because we can still do an aligned load regardless of where the 1-byte aligned ;; vtable is. -; CHECK: [[VT3DATA:@[^ ]*]] = {{.*}} { [6 x i8], [3 x ptr], [0 x i8] } -; CHECK-SAME: [6 x i8] c"\03\00\00\00\03\00", +; CHECK: [[VT3DATA:@[^ ]*]] = {{.*}} { [2 x i8], [3 x ptr], [0 x i8] } +; CHECK-SAME: [2 x i8] c"\03\00", ; CHECK-SAME: }, align 1, !type [[T5:![0-9]+]] @vt3 = constant [3 x ptr] [ ptr @vf0i1, @@ -48,7 +48,7 @@ ptr @vf3i32 ;; This represents an overaligned vtable. ; CHECK: [[VT4DATA:@[^ ]*]] = {{.*}} { [16 x i8], [3 x ptr], [0 x i8] } -; CHECK-SAME: [16 x i8] c"\00\00\00\00\00\00\00\00\00\00\04\00\00\00\02\01", +; CHECK-SAME: [16 x i8] c"\00\00\00\00\00\00\00\00\00\00\00\00\00\00\02\01", ; CHECK-SAME: }, align 16, !type [[T16:![0-9]+]] @vt4 = constant [3 x ptr] [ ptr @vf1i1, @@ -57,8 +57,8 @@ ptr @vf4i32 ], align 16, !type !0 ;; These contain a mix of different integral type sizes. 
-; CHECK: [[VT6DATA:@[^ ]*]] = {{.*}} { [12 x i8], [3 x ptr], [0 x i8] } -; CHECK-SAME: [12 x i8] c"\00\00\00\0B\05\00\00\00\00\00\00\00", +; CHECK: [[VT6DATA:@[^ ]*]] = {{.*}} { [4 x i8], [3 x ptr], [0 x i8] } +; CHECK-SAME: [4 x i8] c"\00\00\00\0B", ; CHECK-SAME: }, !type [[T1:![0-9]+]] @vt6 = constant [3 x ptr] [ ptr @vf0i1, @@ -66,8 +66,8 @@ ptr @vf10i8, ptr @vf5i64 ], !type !1 -; CHECK: [[VT7DATA:@[^ ]*]] = {{.*}} { [12 x i8], [3 x ptr], [0 x i8] } -; CHECK-SAME: [12 x i8] c"\00\00\00\0A\06\00\00\00\00\00\00\00", +; CHECK: [[VT7DATA:@[^ ]*]] = {{.*}} { [4 x i8], [3 x ptr], [0 x i8] } +; CHECK-SAME: [4 x i8] c"\00\00\00\0A", ; CHECK-SAME: }, !type [[T1]] @vt7 = constant [3 x ptr] [ ptr @vf1i1, @@ -76,8 +76,8 @@ ptr @vf6i64 ], !type !1 ;; Test relative vtables -; CHECK: [[VT6RELDATA:@[^ ]*]] = {{.*}} { [12 x i8], [3 x i32], [0 x i8] } -; CHECK-SAME: [12 x i8] c"\00\00\00\0B\05\00\00\00\00\00\00\00", +; CHECK: [[VT6RELDATA:@[^ ]*]] = {{.*}} { [4 x i8], [3 x i32], [0 x i8] } +; CHECK-SAME: [4 x i8] c"\00\00\00\0B", ; CHECK-SAME: ], [0 x i8] zeroinitializer }, !type [[TREL:![0-9]+]] @vt6_rel = constant [3 x i32] [ i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf0i1 to i64), i64 ptrtoint (ptr @vt6_rel to i64)) to i32), @@ -85,8 +85,8 @@ i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf10i8 to i64), i64 i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf5i64 to i64), i64 ptrtoint (ptr @vt6_rel to i64)) to i32) ], !type !2 -; CHECK: [[VT7RELDATA:@[^ ]*]] = {{.*}} { [12 x i8], [3 x i32], [0 x i8] } -; CHECK-SAME: [12 x i8] c"\00\00\00\0A\06\00\00\00\00\00\00\00", +; CHECK: [[VT7RELDATA:@[^ ]*]] = {{.*}} { [4 x i8], [3 x i32], [0 x i8] } +; CHECK-SAME: [4 x i8] c"\00\00\00\0A", ; CHECK-SAME: ], [0 x i8] zeroinitializer }, !type [[TREL]] @vt7_rel = constant [3 x i32] [ i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf1i1 to i64), i64 ptrtoint (ptr @vt7_rel to i64)) to i32), @@ -94,14 +94,14 @@ i32 trunc (i64 sub (i64 
ptrtoint (ptr dso_local_equivalent @vf9i8 to i64), i64 p i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf6i64 to i64), i64 ptrtoint (ptr @vt7_rel to i64)) to i32) ], !type !2 -; CHECK: @vt1 = alias [3 x ptr], getelementptr inbounds ({ [8 x i8], [3 x ptr], [0 x i8] }, ptr [[VT1DATA]], i32 0, i32 1) -; CHECK: @vt2 = alias [3 x ptr], getelementptr inbounds ({ [8 x i8], [3 x ptr], [0 x i8] }, ptr [[VT2DATA]], i32 0, i32 1) -; CHECK: @vt3 = alias [3 x ptr], getelementptr inbounds ({ [6 x i8], [3 x ptr], [0 x i8] }, ptr [[VT3DATA]], i32 0, i32 1) +; CHECK: @vt1 = alias [3 x ptr], getelementptr inbounds ({ [4 x i8], [3 x ptr], [0 x i8] }, ptr [[VT1DATA]], i32 0, i32 1) +; CHECK: @vt2 = alias [3 x ptr], getelementptr inbounds ({ [4 x i8], [3 x ptr], [0 x i8] }, ptr [[VT2DATA]], i32 0, i32 1) +; CHECK: @vt3 = alias [3 x ptr], getelementptr inbounds ({ [2 x i8], [3 x ptr], [0 x i8] }, ptr [[VT3DATA]], i32 0, i32 1) ; CHECK: @vt4 = alias [3 x ptr], getelementptr inbounds ({ [16 x i8], [3 x ptr], [0 x i8] }, ptr [[VT4DATA]], i32 0, i32 1) -; CHECK: @vt6 = alias [3 x ptr], getelementptr inbounds ({ [12 x i8], [3 x ptr], [0 x i8] }, ptr [[VT6DATA]], i32 0, i32 1) -; CHECK: @vt7 = alias [3 x ptr], getelementptr inbounds ({ [12 x i8], [3 x ptr], [0 x i8] }, ptr [[VT7DATA]], i32 0, i32 1) -; CHECK: @vt6_rel = alias [3 x i32], getelementptr inbounds ({ [12 x i8], [3 x i32], [0 x i8] }, ptr [[VT6RELDATA]], i32 0, i32 1) -; CHECK: @vt7_rel = alias [3 x i32], getelementptr inbounds ({ [12 x i8], [3 x i32], [0 x i8] }, ptr [[VT7RELDATA]], i32 0, i32 1) +; CHECK: @vt6 = alias [3 x ptr], getelementptr inbounds ({ [4 x i8], [3 x ptr], [0 x i8] }, ptr [[VT6DATA]], i32 0, i32 1) +; CHECK: @vt7 = alias [3 x ptr], getelementptr inbounds ({ [4 x i8], [3 x ptr], [0 x i8] }, ptr [[VT7DATA]], i32 0, i32 1) +; CHECK: @vt6_rel = alias [3 x i32], getelementptr inbounds ({ [4 x i8], [3 x i32], [0 x i8] }, ptr [[VT6RELDATA]], i32 0, i32 1) +; CHECK: @vt7_rel = alias [3 x i32], 
getelementptr inbounds ({ [4 x i8], [3 x i32], [0 x i8] }, ptr [[VT7RELDATA]], i32 0, i32 1) define i1 @vf0i1(ptr %this) readnone { ret i1 0 @@ -199,9 +199,10 @@ define i32 @call3(ptr %obj) { %fptr = load ptr, ptr %fptrptr %result = call i32 %fptr(ptr %obj) ret i32 %result - ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -6 - ; CHECK: [[VTLOAD:%[^ ]*]] = load i32, ptr [[VTGEP2]] - ; CHECK: ret i32 [[VTLOAD]] + ; CHECK: [[FPTRPTR:%.*]] = getelementptr [3 x ptr], ptr %vtable, i32 0, i32 2 + ; CHECK: [[FPTR:%.*]] = load ptr, ptr [[FPTRPTR]], align 4 + ; CHECK: [[RES:%.*]] = call i32 [[FPTR]](ptr %obj) + ; CHECK: ret i32 [[RES]] } ; CHECK-LABEL: define i1 @call4( @@ -226,9 +227,10 @@ define i64 @call5(ptr %obj) { %fptr = load ptr, ptr %fptrptr %result = call i64 %fptr(ptr %obj) ret i64 %result - ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -8 - ; CHECK: [[VTLOAD:%[^ ]*]] = load i64, ptr [[VTGEP2]] - ; CHECK: ret i64 [[VTLOAD]] + ; CHECK: [[FPTRPTR:%.*]] = getelementptr [3 x ptr], ptr %vtable, i32 0, i32 2 + ; CHECK: [[FPTR:%.*]] = load ptr, ptr [[FPTRPTR]], align 4 + ; CHECK: [[RES:%.*]] = call i64 [[FPTR]](ptr %obj) + ; CHECK: ret i64 [[RES]] } ; CHECK-LABEL: define i8 @call6( @@ -240,7 +242,7 @@ define i8 @call6(ptr %obj) { %fptr = load ptr, ptr %fptrptr %result = call i8 %fptr(ptr %obj) ret i8 %result - ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -9 + ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -1 ; CHECK: [[VTLOAD:%[^ ]*]] = load i8, ptr [[VTGEP2]] ; CHECK: ret i8 [[VTLOAD]] } @@ -265,9 +267,9 @@ define i64 @call5_rel(ptr %obj) { %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 8) %result = call i64 %fptr(ptr %obj) ret i64 %result - ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -8 - ; CHECK: [[VTLOAD:%[^ ]*]] = load i64, ptr [[VTGEP2]] - ; CHECK: ret i64 [[VTLOAD]] + ; CHECK: [[FPTR:%.*]] = call ptr @llvm.load.relative.i32(ptr %vtable, i32 8) + ; CHECK: 
[[RES:%.*]] = call i64 [[FPTR]](ptr %obj) + ; CHECK: ret i64 [[RES]] } ; CHECK-LABEL: define i8 @call6_rel( @@ -278,7 +280,7 @@ define i8 @call6_rel(ptr %obj) { %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 4) %result = call i8 %fptr(ptr %obj) ret i8 %result - ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -9 + ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -1 ; CHECK: [[VTLOAD:%[^ ]*]] = load i8, ptr [[VTGEP2]] ; CHECK: ret i8 [[VTLOAD]] } @@ -288,11 +290,11 @@ declare void @llvm.assume(i1) declare void @__cxa_pure_virtual() declare ptr @llvm.load.relative.i32(ptr, i32) -; CHECK: [[T8]] = !{i32 8, !"typeid"} -; CHECK: [[T5]] = !{i32 6, !"typeid"} +; CHECK: [[T8]] = !{i32 4, !"typeid"} +; CHECK: [[T5]] = !{i32 2, !"typeid"} ; CHECK: [[T16]] = !{i32 16, !"typeid"} -; CHECK: [[T1]] = !{i32 12, !"typeid2"} -; CHECK: [[TREL]] = !{i32 12, !"typeid3"} +; CHECK: [[T1]] = !{i32 4, !"typeid2"} +; CHECK: [[TREL]] = !{i32 4, !"typeid3"} !0 = !{i32 0, !"typeid"} !1 = !{i32 0, !"typeid2"} diff --git a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-small-alignment-64.ll b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-small-alignment-64.ll index ce4a0180dfc92..c83fbc6ed5a19 100644 --- a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-small-alignment-64.ll +++ b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-small-alignment-64.ll @@ -6,7 +6,7 @@ target datalayout = "e-p:64:64" ;; Constant propagation should be agnostic towards sections. ;; Also the new global should be in the original vtable's section. ; CHECK: [[VT1DATA:@[^ ]*]] = {{.*}} { [8 x i8], [3 x ptr], [0 x i8] } -; CHECK-SAME: [8 x i8] c"\00\00\01\00\00\00\03\00", +; CHECK-SAME: [8 x i8] c"\00\00\00\00\00\00\03\00", ; CHECK-SAME: }, section "vt1sec", !type [[T8:![0-9]+]] @vt1 = constant [3 x ptr] [ ptr @vf0i1, @@ -24,7 +24,7 @@ ptr @vf1i32 ;; 2. The call instruction in @call3 is replaced with a GEP + load. 
;; ; CHECK: [[VT2DATA:@[^ ]*]] = {{.*}} { [8 x i8], [3 x ptr], [0 x i8] } -; CHECK-SAME: [8 x i8] c"\00\00\02\00\00\00\02\01", +; CHECK-SAME: [8 x i8] c"\00\00\00\00\00\00\02\01", ; CHECK-SAME: !type [[T8]] @vt2 = constant [3 x ptr] [ ptr @vf1i1, @@ -37,8 +37,8 @@ ptr @vf2i32 ;; All the functions returning i8s and i1s should still be constant-propagated ;; because we can still do an aligned load regardless of where the 1-byte aligned ;; vtable is. -; CHECK: [[VT3DATA:@[^ ]*]] = {{.*}} { [6 x i8], [3 x ptr], [0 x i8] } -; CHECK-SAME: [6 x i8] c"\03\00\00\00\03\00", +; CHECK: [[VT3DATA:@[^ ]*]] = {{.*}} { [2 x i8], [3 x ptr], [0 x i8] } +; CHECK-SAME: [2 x i8] c"\03\00", ; CHECK-SAME: }, align 1, !type [[T5:![0-9]+]] @vt3 = constant [3 x ptr] [ ptr @vf0i1, @@ -48,7 +48,7 @@ ptr @vf3i32 ;; This represents an overaligned vtable. ; CHECK: [[VT4DATA:@[^ ]*]] = {{.*}} { [16 x i8], [3 x ptr], [0 x i8] } -; CHECK-SAME: [16 x i8] c"\00\00\00\00\00\00\00\00\00\00\04\00\00\00\02\01", +; CHECK-SAME: [16 x i8] c"\00\00\00\00\00\00\00\00\00\00\00\00\00\00\02\01", ; CHECK-SAME: }, align 16, !type [[T16:![0-9]+]] @vt4 = constant [3 x ptr] [ ptr @vf1i1, @@ -79,8 +79,8 @@ ptr @vf6i64 ], !type !1 ;; Test relative vtables -; CHECK: [[VT6RELDATA:@[^ ]*]] = {{.*}} { [12 x i8], [3 x i32], [0 x i8] } -; CHECK-SAME: [12 x i8] c"\00\00\00\0B\05\00\00\00\00\00\00\00", +; CHECK: [[VT6RELDATA:@[^ ]*]] = {{.*}} { [4 x i8], [3 x i32], [0 x i8] } +; CHECK-SAME: [4 x i8] c"\00\00\00\0B", ; CHECK-SAME: ], [0 x i8] zeroinitializer }, !type [[TREL:![0-9]+]] @vt6_rel = constant [3 x i32] [ i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf0i1 to i64), i64 ptrtoint (ptr @vt6_rel to i64)) to i32), @@ -88,8 +88,8 @@ i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf10i8 to i64), i64 i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf5i64 to i64), i64 ptrtoint (ptr @vt6_rel to i64)) to i32) ], !type !2 -; CHECK: [[VT7RELDATA:@[^ ]*]] = {{.*}} { [12 x i8], [3 x i32], [0 x 
i8] } -; CHECK-SAME: [12 x i8] c"\00\00\00\0A\06\00\00\00\00\00\00\00", +; CHECK: [[VT7RELDATA:@[^ ]*]] = {{.*}} { [4 x i8], [3 x i32], [0 x i8] } +; CHECK-SAME: [4 x i8] c"\00\00\00\0A", ; CHECK-SAME: ], [0 x i8] zeroinitializer }, !type [[TREL]] @vt7_rel = constant [3 x i32] [ i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf1i1 to i64), i64 ptrtoint (ptr @vt7_rel to i64)) to i32), @@ -99,12 +99,12 @@ i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf6i64 to i64), i64 ; CHECK: @vt1 = alias [3 x ptr], getelementptr inbounds ({ [8 x i8], [3 x ptr], [0 x i8] }, ptr [[VT1DATA]], i32 0, i32 1) ; CHECK: @vt2 = alias [3 x ptr], getelementptr inbounds ({ [8 x i8], [3 x ptr], [0 x i8] }, ptr [[VT2DATA]], i32 0, i32 1) -; CHECK: @vt3 = alias [3 x ptr], getelementptr inbounds ({ [6 x i8], [3 x ptr], [0 x i8] }, ptr [[VT3DATA]], i32 0, i32 1) +; CHECK: @vt3 = alias [3 x ptr], getelementptr inbounds ({ [2 x i8], [3 x ptr], [0 x i8] }, ptr [[VT3DATA]], i32 0, i32 1) ; CHECK: @vt4 = alias [3 x ptr], getelementptr inbounds ({ [16 x i8], [3 x ptr], [0 x i8] }, ptr [[VT4DATA]], i32 0, i32 1) ; CHECK: @vt6 = alias [3 x ptr], getelementptr inbounds ({ [16 x i8], [3 x ptr], [0 x i8] }, ptr [[VT6DATA]], i32 0, i32 1) ; CHECK: @vt7 = alias [3 x ptr], getelementptr inbounds ({ [16 x i8], [3 x ptr], [0 x i8] }, ptr [[VT7DATA]], i32 0, i32 1) -; CHECK: @vt6_rel = alias [3 x i32], getelementptr inbounds ({ [12 x i8], [3 x i32], [0 x i8] }, ptr [[VT6RELDATA]], i32 0, i32 1) -; CHECK: @vt7_rel = alias [3 x i32], getelementptr inbounds ({ [12 x i8], [3 x i32], [0 x i8] }, ptr [[VT7RELDATA]], i32 0, i32 1) +; CHECK: @vt6_rel = alias [3 x i32], getelementptr inbounds ({ [4 x i8], [3 x i32], [0 x i8] }, ptr [[VT6RELDATA]], i32 0, i32 1) +; CHECK: @vt7_rel = alias [3 x i32], getelementptr inbounds ({ [4 x i8], [3 x i32], [0 x i8] }, ptr [[VT7RELDATA]], i32 0, i32 1) define i1 @vf0i1(ptr %this) readnone { ret i1 0 @@ -200,9 +200,10 @@ define i32 @call3(ptr %obj) { %fptr = 
load ptr, ptr %fptrptr %result = call i32 %fptr(ptr %obj) ret i32 %result - ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -6 - ; CHECK: [[VTLOAD:%[^ ]*]] = load i32, ptr [[VTGEP2]] - ; CHECK: ret i32 [[VTLOAD]] + ; CHECK: [[FPTRPTR:%.*]] = getelementptr [3 x ptr], ptr %vtable, i32 0, i32 2 + ; CHECK: [[FPTR:%.*]] = load ptr, ptr [[FPTRPTR]], align 8 + ; CHECK: [[RES:%.*]] = call i32 [[FPTR]](ptr %obj) + ; CHECK: ret i32 [[RES]] } ; CHECK-LABEL: define i1 @call4( @@ -266,9 +267,9 @@ define i64 @call5_rel(ptr %obj) { %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 8) %result = call i64 %fptr(ptr %obj) ret i64 %result - ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -8 - ; CHECK: [[VTLOAD:%[^ ]*]] = load i64, ptr [[VTGEP2]] - ; CHECK: ret i64 [[VTLOAD]] + ; CHECK: [[FPTR:%.*]] = call ptr @llvm.load.relative.i32(ptr %vtable, i32 8) + ; CHECK: [[RES:%.*]] = call i64 [[FPTR]](ptr %obj) + ; CHECK: ret i64 [[RES]] } ; CHECK-LABEL: define i8 @call6_rel( @@ -279,7 +280,7 @@ define i8 @call6_rel(ptr %obj) { %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 4) %result = call i8 %fptr(ptr %obj) ret i8 %result - ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -9 + ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, ptr %vtable, i32 -1 ; CHECK: [[VTLOAD:%[^ ]*]] = load i8, ptr [[VTGEP2]] ; CHECK: ret i8 [[VTLOAD]] } @@ -290,10 +291,10 @@ declare void @__cxa_pure_virtual() declare ptr @llvm.load.relative.i32(ptr, i32) ; CHECK: [[T8]] = !{i32 8, !"typeid"} -; CHECK: [[T5]] = !{i32 6, !"typeid"} +; CHECK: [[T5]] = !{i32 2, !"typeid"} ; CHECK: [[T16]] = !{i32 16, !"typeid"} ; CHECK: [[T1]] = !{i32 16, !"typeid2"} -; CHECK: [[TREL]] = !{i32 12, !"typeid3"} +; CHECK: [[TREL]] = !{i32 4, !"typeid3"} !0 = !{i32 0, !"typeid"} !1 = !{i32 0, !"typeid2"} diff --git a/llvm/test/tools/llvm-exegesis/AArch64/skip_unsupported_instructions.s b/llvm/test/tools/llvm-exegesis/AArch64/skip_unsupported_instructions.s index 
927ee190e803f..72009756ed1d5 100644 --- a/llvm/test/tools/llvm-exegesis/AArch64/skip_unsupported_instructions.s +++ b/llvm/test/tools/llvm-exegesis/AArch64/skip_unsupported_instructions.s @@ -1,5 +1,9 @@ llvm/test/tools/llvm-exegesis/AArch64/skip_unsupported_instructions.s +# TODO: This is failing on some systems that have hardware support for +# pointer authentication. This needs to be fixed before reenabling. +# REQUIRES: disabled + # REQUIRES: aarch64-registered-target # Check for skipping of illegal instruction errors (AUT and LDGM) @@ -7,4 +11,4 @@ llvm/test/tools/llvm-exegesis/AArch64/skip_unsupported_instructions.s # CHECK-AUTIA-NOT: snippet crashed while running: Illegal instruction # RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --opcode-name=LDGM --benchmark-phase=assemble-measured-code 2>&1 | FileCheck %s --check-prefix=CHECK-LDGM -# CHECK-LDGM: LDGM: Unsupported opcode: load tag multiple \ No newline at end of file +# CHECK-LDGM: LDGM: Unsupported opcode: load tag multiple diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 54b90cd7c7506..ab2f685b4fc1d 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -1243,7 +1243,7 @@ Session::Session(std::unique_ptr EPC, Error &Err) if ((Err = ES.getBootstrapMapValue("darwin-use-ehframes-only", ForceEHFrames))) return; - bool UseEHFrames = ForceEHFrames ? 
*ForceEHFrames : false; + bool UseEHFrames = ForceEHFrames.value_or(false); if (!UseEHFrames) ObjLayer.addPlugin(ExitOnErr(UnwindInfoRegistrationPlugin::Create(ES))); else diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 2f839199712eb..7a778da2d3a49 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -1375,7 +1375,7 @@ static uint64_t dumpARMELFData(uint64_t SectionAddr, uint64_t Index, } static void dumpELFData(uint64_t SectionAddr, uint64_t Index, uint64_t End, - ArrayRef Bytes) { + ArrayRef Bytes, raw_ostream &OS) { // print out data up to 8 bytes at a time in hex and ascii uint8_t AsciiData[9] = {'\0'}; uint8_t Byte; @@ -1383,9 +1383,9 @@ static void dumpELFData(uint64_t SectionAddr, uint64_t Index, uint64_t End, for (; Index < End; ++Index) { if (NumBytes == 0) - outs() << format("%8" PRIx64 ":", SectionAddr + Index); + OS << format("%8" PRIx64 ":", SectionAddr + Index); Byte = Bytes.slice(Index)[0]; - outs() << format(" %02x", Byte); + OS << format(" %02x", Byte); AsciiData[NumBytes] = isPrint(Byte) ? 
Byte : '.'; uint8_t IndentOffset = 0; @@ -1400,9 +1400,9 @@ static void dumpELFData(uint64_t SectionAddr, uint64_t Index, uint64_t End, } if (NumBytes == 8) { AsciiData[8] = '\0'; - outs() << std::string(IndentOffset, ' ') << " "; - outs() << reinterpret_cast(AsciiData); - outs() << '\n'; + OS << std::string(IndentOffset, ' ') << " "; + OS << reinterpret_cast(AsciiData); + OS << '\n'; NumBytes = 0; } } @@ -1666,7 +1666,7 @@ static void disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, DisassemblerTarget &PrimaryTarget, std::optional &SecondaryTarget, - SourcePrinter &SP, bool InlineRelocs) { + SourcePrinter &SP, bool InlineRelocs, raw_ostream &OS) { DisassemblerTarget *DT = &PrimaryTarget; bool PrimaryIsThumb = false; SmallVector, 0> CHPECodeMap; @@ -2089,10 +2089,10 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, if (!PrintedSection) { PrintedSection = true; - outs() << "\nDisassembly of section "; + OS << "\nDisassembly of section "; if (!SegmentName.empty()) - outs() << SegmentName << ","; - outs() << SectionName << ":\n"; + OS << SegmentName << ","; + OS << SectionName << ":\n"; } bool PrintedLabel = false; @@ -2104,22 +2104,22 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, const StringRef SymbolName = SymNamesHere[i]; if (!PrintedLabel) { - outs() << '\n'; + OS << '\n'; PrintedLabel = true; } if (LeadingAddr) - outs() << format(Is64Bits ? "%016" PRIx64 " " : "%08" PRIx64 " ", - SectionAddr + Start + VMAAdjustment); + OS << format(Is64Bits ? "%016" PRIx64 " " : "%08" PRIx64 " ", + SectionAddr + Start + VMAAdjustment); if (Obj.isXCOFF() && SymbolDescription) { - outs() << getXCOFFSymbolDescription(Symbol, SymbolName) << ":\n"; + OS << getXCOFFSymbolDescription(Symbol, SymbolName) << ":\n"; } else - outs() << '<' << SymbolName << ">:\n"; + OS << '<' << SymbolName << ">:\n"; } // Don't print raw contents of a virtual section. A virtual section // doesn't have any contents in the file. 
if (Section.isVirtual()) { - outs() << "...\n"; + OS << "...\n"; continue; } @@ -2156,17 +2156,17 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, do { StringRef Line; std::tie(Line, ErrMsg) = ErrMsg.split('\n'); - outs() << DT->Context->getAsmInfo()->getCommentString() - << " error decoding " << SymNamesHere[SHI] << ": " << Line - << '\n'; + OS << DT->Context->getAsmInfo()->getCommentString() + << " error decoding " << SymNamesHere[SHI] << ": " << Line + << '\n'; } while (!ErrMsg.empty()); if (Size) { - outs() << DT->Context->getAsmInfo()->getCommentString() - << " decoding failed region as bytes\n"; + OS << DT->Context->getAsmInfo()->getCommentString() + << " decoding failed region as bytes\n"; for (uint64_t I = 0; I < Size; ++I) - outs() << "\t.byte\t " << format_hex(Bytes[I], 1, /*Upper=*/true) - << '\n'; + OS << "\t.byte\t " << format_hex(Bytes[I], 1, /*Upper=*/true) + << '\n'; } } @@ -2179,13 +2179,13 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, Start += Size; break; } - + formatted_raw_ostream FOS(OS); Index = Start; if (SectionAddr < StartAddress) Index = std::max(Index, StartAddress - SectionAddr); if (DisassembleAsELFData) { - dumpELFData(SectionAddr, Index, End, Bytes); + dumpELFData(SectionAddr, Index, End, Bytes, FOS); Index = End; continue; } @@ -2203,8 +2203,6 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, Symbols[SI - 1].XCOFFSymInfo.StorageMappingClass && (*Symbols[SI - 1].XCOFFSymInfo.StorageMappingClass == XCOFF::XMC_PR); - formatted_raw_ostream FOS(outs()); - std::unordered_map AllLabels; std::unordered_map> BBAddrMapLabels; if (SymbolizeOperands) { @@ -2553,7 +2551,8 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, reportWarning("failed to disassemble missing symbol " + Sym, FileName); } -static void disassembleObject(ObjectFile *Obj, bool InlineRelocs) { +static void disassembleObject(ObjectFile *Obj, bool InlineRelocs, + raw_ostream &OS) { // If information useful for showing 
the disassembly is missing, try to find a // more complete binary and disassemble that instead. OwningBinary FetchedBinary; @@ -2679,7 +2678,7 @@ static void disassembleObject(ObjectFile *Obj, bool InlineRelocs) { "Unrecognized disassembler option: " + Opt); disassembleObject(*Obj, *DbgObj, PrimaryTarget, SecondaryTarget, SP, - InlineRelocs); + InlineRelocs, OS); } void Dumper::printRelocations() { @@ -3340,7 +3339,7 @@ static void dumpObject(ObjectFile *O, const Archive *A = nullptr, if (SectionContents) printSectionContents(O); if (Disassemble) - disassembleObject(O, Relocations); + disassembleObject(O, Relocations, outs()); if (UnwindInfo) printUnwindInfo(O); diff --git a/llvm/unittests/ADT/BitmaskEnumTest.cpp b/llvm/unittests/ADT/BitmaskEnumTest.cpp index 2c0a80342a54c..b1ef8482416a9 100644 --- a/llvm/unittests/ADT/BitmaskEnumTest.cpp +++ b/llvm/unittests/ADT/BitmaskEnumTest.cpp @@ -176,6 +176,17 @@ TEST(BitmaskEnumTest, BitwiseNot) { EXPECT_EQ(15, ~V0); } +TEST(BitmaskEnumTest, BooleanNot) { + bool b0 = !F0; + EXPECT_TRUE(b0); + + bool b1 = !(F1 & F2); + EXPECT_TRUE(b1); + + bool b2 = !(F2 | F4); + EXPECT_FALSE(b2); +} + enum class FlagsClass { F0 = 0, F1 = 1, diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt index a972dc32c40a2..d1677cdaeceac 100644 --- a/llvm/unittests/CodeGen/CMakeLists.txt +++ b/llvm/unittests/CodeGen/CMakeLists.txt @@ -29,6 +29,7 @@ add_llvm_unittest(CodeGenTests DIETest.cpp DroppedVariableStatsMIRTest.cpp DwarfStringPoolEntryRefTest.cpp + GCMetadata.cpp InstrRefLDVTest.cpp LowLevelTypeTest.cpp LexicalScopesTest.cpp diff --git a/llvm/unittests/CodeGen/GCMetadata.cpp b/llvm/unittests/CodeGen/GCMetadata.cpp new file mode 100644 index 0000000000000..a5d8a63d9b555 --- /dev/null +++ b/llvm/unittests/CodeGen/GCMetadata.cpp @@ -0,0 +1,76 @@ +//===- llvm/unittest/CodeGen/GCMetadata.cpp -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/GCMetadata.h" +#include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/Analysis.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/SourceMgr.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +std::unique_ptr parseIR(LLVMContext &Context, const char *IR) { + SMDiagnostic Err; + return parseAssemblyString(IR, Err, Context); +} + +class GCMetadataTest : public ::testing::Test { +protected: + LLVMContext Context; + std::unique_ptr M; + +public: + GCMetadataTest() + : M(parseIR(Context, R"( +%Env = type ptr + +define void @.main(%Env) gc "shadow-stack" { + %Root = alloca %Env + call void @llvm.gcroot( ptr %Root, %Env null ) + unreachable +} + +define void @g() gc "erlang" { +entry: + ret void +} + +declare void @llvm.gcroot(ptr, %Env) +)")) {} +}; + +TEST_F(GCMetadataTest, Basic) { + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + PassBuilder PB; + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + ModulePassManager MPM; + FunctionPassManager FPM; + GCStrategyMap &StrategyMap = MAM.getResult(*M); + for (auto &[GCName, Strategy] : StrategyMap) + EXPECT_EQ(GCName, Strategy->getName()); + for (auto &[GCName, Strategy] : llvm::reverse(StrategyMap)) + EXPECT_EQ(GCName, Strategy->getName()); +} + +} // namespace diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp 
index de29cbcd29476..089fb00d6080d 100644 --- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp @@ -10,52 +10,6 @@ #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -TEST_F(AArch64GISelMITest, TestKnownBitsCst) { - StringRef MIRString = " %3:_(s8) = G_CONSTANT i8 1\n" - " %4:_(s8) = COPY %3\n"; - setUp(MIRString); - if (!TM) - GTEST_SKIP(); - unsigned CopyReg = Copies[Copies.size() - 1]; - MachineInstr *FinalCopy = MRI->getVRegDef(CopyReg); - unsigned SrcReg = FinalCopy->getOperand(1).getReg(); - unsigned DstReg = FinalCopy->getOperand(0).getReg(); - GISelValueTracking Info(*MF); - KnownBits Res = Info.getKnownBits(SrcReg); - EXPECT_EQ((uint64_t)1, Res.One.getZExtValue()); - EXPECT_EQ((uint64_t)0xfe, Res.Zero.getZExtValue()); - - KnownBits Res2 = Info.getKnownBits(DstReg); - EXPECT_EQ(Res.One.getZExtValue(), Res2.One.getZExtValue()); - EXPECT_EQ(Res.Zero.getZExtValue(), Res2.Zero.getZExtValue()); -} - -TEST_F(AArch64GISelMITest, TestKnownBitsCstWithClass) { - StringRef MIRString = " %10:gpr32 = MOVi32imm 1\n" - " %4:_(s32) = COPY %10\n"; - setUp(MIRString); - if (!TM) - GTEST_SKIP(); - unsigned CopyReg = Copies[Copies.size() - 1]; - MachineInstr *FinalCopy = MRI->getVRegDef(CopyReg); - unsigned SrcReg = FinalCopy->getOperand(1).getReg(); - unsigned DstReg = FinalCopy->getOperand(0).getReg(); - GISelValueTracking Info(*MF); - KnownBits Res = Info.getKnownBits(SrcReg); - // We can't analyze %3 due to the register class constraint. We will get a - // default-constructed KnownBits back. - EXPECT_EQ((uint64_t)1, Res.getBitWidth()); - EXPECT_EQ((uint64_t)0, Res.One.getZExtValue()); - EXPECT_EQ((uint64_t)0, Res.Zero.getZExtValue()); - - KnownBits Res2 = Info.getKnownBits(DstReg); - // We still don't know the values due to the register class constraint but %4 - // did reveal the size of %3. 
- EXPECT_EQ((uint64_t)32, Res2.getBitWidth()); - EXPECT_EQ(Res.One.getZExtValue(), Res2.One.getZExtValue()); - EXPECT_EQ(Res.Zero.getZExtValue(), Res2.Zero.getZExtValue()); -} - // Check that we are able to track bits through PHIs // and get the intersections of everything we know on each operand. TEST_F(AArch64GISelMITest, TestKnownBitsCstPHI) { diff --git a/llvm/unittests/ProfileData/CMakeLists.txt b/llvm/unittests/ProfileData/CMakeLists.txt index 0a7f7da085950..29b9cb751dabe 100644 --- a/llvm/unittests/ProfileData/CMakeLists.txt +++ b/llvm/unittests/ProfileData/CMakeLists.txt @@ -10,6 +10,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_unittest(ProfileDataTests BPFunctionNodeTest.cpp CoverageMappingTest.cpp + DataAccessProfTest.cpp InstrProfDataTest.cpp InstrProfTest.cpp ItaniumManglingCanonicalizerTest.cpp diff --git a/llvm/unittests/ProfileData/DataAccessProfTest.cpp b/llvm/unittests/ProfileData/DataAccessProfTest.cpp new file mode 100644 index 0000000000000..8866c16fe292a --- /dev/null +++ b/llvm/unittests/ProfileData/DataAccessProfTest.cpp @@ -0,0 +1,181 @@ +//===- unittests/Support/DataAccessProfTest.cpp +//----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ProfileData/DataAccessProf.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace llvm { +namespace data_access_prof { +namespace { + +using ::llvm::StringRef; +using llvm::ValueIs; +using ::testing::ElementsAre; +using ::testing::Field; +using ::testing::HasSubstr; +using ::testing::IsEmpty; + +static std::string ErrorToString(Error E) { + std::string ErrMsg; + llvm::raw_string_ostream OS(ErrMsg); + llvm::logAllUnhandledErrors(std::move(E), OS); + return ErrMsg; +} + +// Test the various scenarios when DataAccessProfData should return error on +// invalid input. +TEST(MemProf, DataAccessProfileError) { + // Returns error if the input symbol name is empty. + DataAccessProfData Data; + EXPECT_THAT(ErrorToString(Data.setDataAccessProfile("", 100)), + HasSubstr("Empty symbol name")); + + // Returns error when the same symbol gets added twice. + ASSERT_FALSE(Data.setDataAccessProfile("foo", 100)); + EXPECT_THAT(ErrorToString(Data.setDataAccessProfile("foo", 100)), + HasSubstr("Duplicate symbol or string literal added")); + + // Returns error when the same string content hash gets added twice. + ASSERT_FALSE(Data.setDataAccessProfile((uint64_t)135246, 1000)); + EXPECT_THAT(ErrorToString(Data.setDataAccessProfile((uint64_t)135246, 1000)), + HasSubstr("Duplicate symbol or string literal added")); +} + +// Test the following operations on DataAccessProfData: +// - Profile record look up. +// - Serialization and de-serialization. +TEST(MemProf, DataAccessProfile) { + using internal::DataAccessProfRecordRef; + using internal::SourceLocationRef; + DataAccessProfData Data; + + // In the bool conversion, Error is true if it's in a failure state and false + // if it's in an accept state. 
Use ASSERT_FALSE or EXPECT_FALSE for no error. + ASSERT_FALSE(Data.setDataAccessProfile("foo.llvm.123", 100)); + ASSERT_FALSE(Data.addKnownSymbolWithoutSamples((uint64_t)789)); + ASSERT_FALSE(Data.addKnownSymbolWithoutSamples("sym2")); + ASSERT_FALSE(Data.setDataAccessProfile("bar.__uniq.321", 123, + { + SourceLocation{"file2", 3}, + })); + ASSERT_FALSE(Data.addKnownSymbolWithoutSamples("sym1")); + ASSERT_FALSE(Data.addKnownSymbolWithoutSamples((uint64_t)678)); + ASSERT_FALSE(Data.setDataAccessProfile( + (uint64_t)135246, 1000, + {SourceLocation{"file1", 1}, SourceLocation{"file2", 2}})); + + { + // Test that symbol names and file names are stored in the input order. + EXPECT_THAT( + llvm::to_vector(llvm::make_first_range(Data.getStrToIndexMapRef())), + ElementsAre("foo", "sym2", "bar.__uniq.321", "file2", "sym1", "file1")); + EXPECT_THAT(Data.getKnownColdSymbols(), ElementsAre("sym2", "sym1")); + EXPECT_THAT(Data.getKnownColdHashes(), ElementsAre(789, 678)); + + // Look up profiles. + EXPECT_TRUE(Data.isKnownColdSymbol((uint64_t)789)); + EXPECT_TRUE(Data.isKnownColdSymbol((uint64_t)678)); + EXPECT_TRUE(Data.isKnownColdSymbol("sym2")); + EXPECT_TRUE(Data.isKnownColdSymbol("sym1")); + + EXPECT_EQ(Data.getProfileRecord("non-existence"), std::nullopt); + EXPECT_EQ(Data.getProfileRecord((uint64_t)789987), std::nullopt); + + EXPECT_THAT( + Data.getProfileRecord("foo.llvm.123"), + ValueIs(AllOf( + Field(&DataAccessProfRecord::SymHandle, + testing::VariantWith(testing::Eq("foo"))), + Field(&DataAccessProfRecord::Locations, testing::IsEmpty())))); + EXPECT_THAT( + Data.getProfileRecord("bar.__uniq.321"), + ValueIs(AllOf( + Field(&DataAccessProfRecord::SymHandle, + testing::VariantWith( + testing::Eq("bar.__uniq.321"))), + Field(&DataAccessProfRecord::Locations, + ElementsAre(AllOf(Field(&SourceLocation::FileName, "file2"), + Field(&SourceLocation::Line, 3))))))); + EXPECT_THAT( + Data.getProfileRecord((uint64_t)135246), + ValueIs(AllOf( + 
Field(&DataAccessProfRecord::SymHandle, + testing::VariantWith(testing::Eq(135246))), + Field(&DataAccessProfRecord::Locations, + ElementsAre(AllOf(Field(&SourceLocation::FileName, "file1"), + Field(&SourceLocation::Line, 1)), + AllOf(Field(&SourceLocation::FileName, "file2"), + Field(&SourceLocation::Line, 2))))))); + } + + // Tests serialization and de-serialization. + DataAccessProfData deserializedData; + { + std::string serializedData; + llvm::raw_string_ostream OS(serializedData); + llvm::ProfOStream POS(OS); + + EXPECT_FALSE(Data.serialize(POS)); + + const unsigned char *p = + reinterpret_cast(serializedData.data()); + ASSERT_THAT(llvm::to_vector(llvm::make_first_range( + deserializedData.getStrToIndexMapRef())), + testing::IsEmpty()); + EXPECT_FALSE(deserializedData.deserialize(p)); + + EXPECT_THAT( + llvm::to_vector( + llvm::make_first_range(deserializedData.getStrToIndexMapRef())), + ElementsAre("foo", "sym2", "bar.__uniq.321", "file2", "sym1", "file1")); + EXPECT_THAT(deserializedData.getKnownColdSymbols(), + ElementsAre("sym2", "sym1")); + EXPECT_THAT(deserializedData.getKnownColdHashes(), ElementsAre(789, 678)); + + // Look up profiles after deserialization. 
+ EXPECT_TRUE(deserializedData.isKnownColdSymbol((uint64_t)789)); + EXPECT_TRUE(deserializedData.isKnownColdSymbol((uint64_t)678)); + EXPECT_TRUE(deserializedData.isKnownColdSymbol("sym2")); + EXPECT_TRUE(deserializedData.isKnownColdSymbol("sym1")); + + auto Records = + llvm::to_vector(llvm::make_second_range(deserializedData.getRecords())); + + EXPECT_THAT( + Records, + ElementsAre( + AllOf( + Field(&DataAccessProfRecordRef::SymbolID, 0), + Field(&DataAccessProfRecordRef::AccessCount, 100), + Field(&DataAccessProfRecordRef::IsStringLiteral, false), + Field(&DataAccessProfRecordRef::Locations, testing::IsEmpty())), + AllOf(Field(&DataAccessProfRecordRef::SymbolID, 2), + Field(&DataAccessProfRecordRef::AccessCount, 123), + Field(&DataAccessProfRecordRef::IsStringLiteral, false), + Field(&DataAccessProfRecordRef::Locations, + ElementsAre( + AllOf(Field(&SourceLocationRef::FileName, "file2"), + Field(&SourceLocationRef::Line, 3))))), + AllOf(Field(&DataAccessProfRecordRef::SymbolID, 135246), + Field(&DataAccessProfRecordRef::AccessCount, 1000), + Field(&DataAccessProfRecordRef::IsStringLiteral, true), + Field(&DataAccessProfRecordRef::Locations, + ElementsAre( + AllOf(Field(&SourceLocationRef::FileName, "file1"), + Field(&SourceLocationRef::Line, 1)), + AllOf(Field(&SourceLocationRef::FileName, "file2"), + Field(&SourceLocationRef::Line, 2))))))); + } +} +} // namespace +} // namespace data_access_prof +} // namespace llvm diff --git a/llvm/unittests/Support/FileOutputBufferTest.cpp b/llvm/unittests/Support/FileOutputBufferTest.cpp index f7bb0833e5a0e..423a6e12240c0 100644 --- a/llvm/unittests/Support/FileOutputBufferTest.cpp +++ b/llvm/unittests/Support/FileOutputBufferTest.cpp @@ -123,7 +123,7 @@ TEST(FileOutputBuffer, Test) { File5.append("/file5"); { Expected> BufferOrErr = - FileOutputBuffer::create(File5, 8000, FileOutputBuffer::F_no_mmap); + FileOutputBuffer::create(File5, 8000, FileOutputBuffer::F_mmap); 
ASSERT_NO_ERROR(errorToErrorCode(BufferOrErr.takeError())); std::unique_ptr &Buffer = *BufferOrErr; // Start buffer with special header. diff --git a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp index f8c77fcba19cf..f13252f3a4c28 100644 --- a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp +++ b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp @@ -9,6 +9,7 @@ using namespace llvm; using SA = SMEAttrs; +using CA = SMECallAttrs; std::unique_ptr parseIR(const char *IR) { static LLVMContext C; @@ -70,15 +71,14 @@ TEST(SMEAttributes, Constructors) { ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_new_zt0\"") ->getFunction("foo")) .isNewZT0()); - ASSERT_TRUE( - SA(cast((parseIR("declare void @callee()\n" - "define void @foo() {" - "call void @callee() \"aarch64_zt0_undef\"\n" - "ret void\n}") - ->getFunction("foo") - ->begin() - ->front()))) - .isUndefZT0()); + + auto CallModule = parseIR("declare void @callee()\n" + "define void @foo() {" + "call void @callee() \"aarch64_zt0_undef\"\n" + "ret void\n}"); + CallBase &Call = + cast((CallModule->getFunction("foo")->begin()->front())); + ASSERT_TRUE(SMECallAttrs(Call).callsite().hasUndefZT0()); // Invalid combinations. 
EXPECT_DEBUG_DEATH(SA(SA::SM_Enabled | SA::SM_Compatible), @@ -235,7 +235,7 @@ TEST(SMEAttributes, Basics) { ASSERT_TRUE(ZT0_Undef.hasZT0State()); ASSERT_FALSE(ZT0_Undef.hasSharedZAInterface()); ASSERT_TRUE(ZT0_Undef.hasPrivateZAInterface()); - ASSERT_TRUE(ZT0_Undef.isUndefZT0()); + ASSERT_TRUE(ZT0_Undef.hasUndefZT0()); ASSERT_FALSE(SA(SA::Normal).isInZT0()); ASSERT_FALSE(SA(SA::Normal).isOutZT0()); @@ -248,59 +248,57 @@ TEST(SMEAttributes, Basics) { TEST(SMEAttributes, Transitions) { // Normal -> Normal - ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::Normal))); - ASSERT_FALSE(SA(SA::Normal).requiresPreservingZT0(SA(SA::Normal))); - ASSERT_FALSE(SA(SA::Normal).requiresDisablingZABeforeCall(SA(SA::Normal))); - ASSERT_FALSE(SA(SA::Normal).requiresEnablingZAAfterCall(SA(SA::Normal))); + ASSERT_FALSE(CA(SA::Normal, SA::Normal).requiresSMChange()); + ASSERT_FALSE(CA(SA::Normal, SA::Normal).requiresPreservingZT0()); + ASSERT_FALSE(CA(SA::Normal, SA::Normal).requiresDisablingZABeforeCall()); + ASSERT_FALSE(CA(SA::Normal, SA::Normal).requiresEnablingZAAfterCall()); // Normal -> Normal + LocallyStreaming - ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::Normal | SA::SM_Body))); + ASSERT_FALSE(CA(SA::Normal, SA::Normal | SA::SM_Body).requiresSMChange()); // Normal -> Streaming - ASSERT_TRUE(SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled))); + ASSERT_TRUE(CA(SA::Normal, SA::SM_Enabled).requiresSMChange()); // Normal -> Streaming + LocallyStreaming - ASSERT_TRUE( - SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body))); + ASSERT_TRUE(CA(SA::Normal, SA::SM_Enabled | SA::SM_Body).requiresSMChange()); // Normal -> Streaming-compatible - ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::SM_Compatible))); + ASSERT_FALSE(CA(SA::Normal, SA::SM_Compatible).requiresSMChange()); // Normal -> Streaming-compatible + LocallyStreaming ASSERT_FALSE( - SA(SA::Normal).requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body))); + CA(SA::Normal, SA::SM_Compatible | 
SA::SM_Body).requiresSMChange()); // Streaming -> Normal - ASSERT_TRUE(SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal))); + ASSERT_TRUE(CA(SA::SM_Enabled, SA::Normal).requiresSMChange()); // Streaming -> Normal + LocallyStreaming - ASSERT_TRUE( - SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal | SA::SM_Body))); + ASSERT_TRUE(CA(SA::SM_Enabled, SA::Normal | SA::SM_Body).requiresSMChange()); // Streaming -> Streaming - ASSERT_FALSE(SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Enabled))); + ASSERT_FALSE(CA(SA::SM_Enabled, SA::SM_Enabled).requiresSMChange()); // Streaming -> Streaming + LocallyStreaming ASSERT_FALSE( - SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body))); + CA(SA::SM_Enabled, SA::SM_Enabled | SA::SM_Body).requiresSMChange()); // Streaming -> Streaming-compatible - ASSERT_FALSE(SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Compatible))); + ASSERT_FALSE(CA(SA::SM_Enabled, SA::SM_Compatible).requiresSMChange()); // Streaming -> Streaming-compatible + LocallyStreaming ASSERT_FALSE( - SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body))); + CA(SA::SM_Enabled, SA::SM_Compatible | SA::SM_Body).requiresSMChange()); // Streaming-compatible -> Normal - ASSERT_TRUE(SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal))); + ASSERT_TRUE(CA(SA::SM_Compatible, SA::Normal).requiresSMChange()); ASSERT_TRUE( - SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal | SA::SM_Body))); + CA(SA::SM_Compatible, SA::Normal | SA::SM_Body).requiresSMChange()); // Streaming-compatible -> Streaming - ASSERT_TRUE(SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled))); + ASSERT_TRUE(CA(SA::SM_Compatible, SA::SM_Enabled).requiresSMChange()); // Streaming-compatible -> Streaming + LocallyStreaming ASSERT_TRUE( - SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body))); + CA(SA::SM_Compatible, SA::SM_Enabled | SA::SM_Body).requiresSMChange()); // Streaming-compatible -> Streaming-compatible - 
ASSERT_FALSE(SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Compatible))); + ASSERT_FALSE(CA(SA::SM_Compatible, SA::SM_Compatible).requiresSMChange()); // Streaming-compatible -> Streaming-compatible + LocallyStreaming - ASSERT_FALSE(SA(SA::SM_Compatible) - .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body))); + ASSERT_FALSE(CA(SA::SM_Compatible, SA::SM_Compatible | SA::SM_Body) + .requiresSMChange()); SA Private_ZA = SA(SA::Normal); SA ZA_Shared = SA(SA::encodeZAState(SA::StateValue::In)); @@ -310,37 +308,39 @@ TEST(SMEAttributes, Transitions) { SA Undef_ZT0 = SA(SA::ZT0_Undef); // Shared ZA -> Private ZA Interface - ASSERT_FALSE(ZA_Shared.requiresDisablingZABeforeCall(Private_ZA)); - ASSERT_TRUE(ZA_Shared.requiresEnablingZAAfterCall(Private_ZA)); + ASSERT_FALSE(CA(ZA_Shared, Private_ZA).requiresDisablingZABeforeCall()); + ASSERT_TRUE(CA(ZA_Shared, Private_ZA).requiresEnablingZAAfterCall()); // Shared ZT0 -> Private ZA Interface - ASSERT_TRUE(ZT0_Shared.requiresDisablingZABeforeCall(Private_ZA)); - ASSERT_TRUE(ZT0_Shared.requiresPreservingZT0(Private_ZA)); - ASSERT_TRUE(ZT0_Shared.requiresEnablingZAAfterCall(Private_ZA)); + ASSERT_TRUE(CA(ZT0_Shared, Private_ZA).requiresDisablingZABeforeCall()); + ASSERT_TRUE(CA(ZT0_Shared, Private_ZA).requiresPreservingZT0()); + ASSERT_TRUE(CA(ZT0_Shared, Private_ZA).requiresEnablingZAAfterCall()); // Shared Undef ZT0 -> Private ZA Interface // Note: "Undef ZT0" is a callsite attribute that means ZT0 is undefined at // point the of the call. 
- ASSERT_TRUE(ZT0_Shared.requiresDisablingZABeforeCall(Undef_ZT0)); - ASSERT_FALSE(ZT0_Shared.requiresPreservingZT0(Undef_ZT0)); - ASSERT_TRUE(ZT0_Shared.requiresEnablingZAAfterCall(Undef_ZT0)); + ASSERT_TRUE( + CA(ZT0_Shared, Private_ZA, Undef_ZT0).requiresDisablingZABeforeCall()); + ASSERT_FALSE(CA(ZT0_Shared, Private_ZA, Undef_ZT0).requiresPreservingZT0()); + ASSERT_TRUE( + CA(ZT0_Shared, Private_ZA, Undef_ZT0).requiresEnablingZAAfterCall()); // Shared ZA & ZT0 -> Private ZA Interface - ASSERT_FALSE(ZA_ZT0_Shared.requiresDisablingZABeforeCall(Private_ZA)); - ASSERT_TRUE(ZA_ZT0_Shared.requiresPreservingZT0(Private_ZA)); - ASSERT_TRUE(ZA_ZT0_Shared.requiresEnablingZAAfterCall(Private_ZA)); + ASSERT_FALSE(CA(ZA_ZT0_Shared, Private_ZA).requiresDisablingZABeforeCall()); + ASSERT_TRUE(CA(ZA_ZT0_Shared, Private_ZA).requiresPreservingZT0()); + ASSERT_TRUE(CA(ZA_ZT0_Shared, Private_ZA).requiresEnablingZAAfterCall()); // Shared ZA -> Shared ZA Interface - ASSERT_FALSE(ZA_Shared.requiresDisablingZABeforeCall(ZT0_Shared)); - ASSERT_FALSE(ZA_Shared.requiresEnablingZAAfterCall(ZT0_Shared)); + ASSERT_FALSE(CA(ZA_Shared, ZT0_Shared).requiresDisablingZABeforeCall()); + ASSERT_FALSE(CA(ZA_Shared, ZT0_Shared).requiresEnablingZAAfterCall()); // Shared ZT0 -> Shared ZA Interface - ASSERT_FALSE(ZT0_Shared.requiresDisablingZABeforeCall(ZT0_Shared)); - ASSERT_FALSE(ZT0_Shared.requiresPreservingZT0(ZT0_Shared)); - ASSERT_FALSE(ZT0_Shared.requiresEnablingZAAfterCall(ZT0_Shared)); + ASSERT_FALSE(CA(ZT0_Shared, ZT0_Shared).requiresDisablingZABeforeCall()); + ASSERT_FALSE(CA(ZT0_Shared, ZT0_Shared).requiresPreservingZT0()); + ASSERT_FALSE(CA(ZT0_Shared, ZT0_Shared).requiresEnablingZAAfterCall()); // Shared ZA & ZT0 -> Shared ZA Interface - ASSERT_FALSE(ZA_ZT0_Shared.requiresDisablingZABeforeCall(ZT0_Shared)); - ASSERT_FALSE(ZA_ZT0_Shared.requiresPreservingZT0(ZT0_Shared)); - ASSERT_FALSE(ZA_ZT0_Shared.requiresEnablingZAAfterCall(ZT0_Shared)); + ASSERT_FALSE(CA(ZA_ZT0_Shared, 
ZT0_Shared).requiresDisablingZABeforeCall()); + ASSERT_FALSE(CA(ZA_ZT0_Shared, ZT0_Shared).requiresPreservingZT0()); + ASSERT_FALSE(CA(ZA_ZT0_Shared, ZT0_Shared).requiresEnablingZAAfterCall()); } diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 2a53f8469b8fa..fc6854a483f6f 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -981,6 +981,7 @@ R"(All available -march extensions for RISC-V a 2.1 f 2.2 d 2.2 + q 2.2 c 2.0 b 1.0 v 1.0 @@ -1128,6 +1129,7 @@ R"(All available -march extensions for RISC-V svpbmt 1.0 svvptc 1.0 xandesperf 5.0 + xandesvdot 5.0 xandesvpackfph 5.0 xcvalu 1.0 xcvbi 1.0 diff --git a/llvm/unittests/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/unittests/Transforms/IPO/WholeProgramDevirt.cpp index 8fae05bef2ae1..59cc5bc2c4e6f 100644 --- a/llvm/unittests/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/unittests/Transforms/IPO/WholeProgramDevirt.cpp @@ -55,7 +55,7 @@ TEST(WholeProgramDevirt, findLowestOffset) { VT1.After.BytesUsed = {0xff, 0, 0, 0, 0xff}; VT2.After.BytesUsed = {0xff, 1, 0, 0, 0}; EXPECT_EQ(16ull, findLowestOffset(Targets, /*IsAfter=*/true, 16)); - EXPECT_EQ(40ull, findLowestOffset(Targets, /*IsAfter=*/true, 32)); + EXPECT_EQ(64ull, findLowestOffset(Targets, /*IsAfter=*/true, 32)); } TEST(WholeProgramDevirt, setReturnValues) { diff --git a/llvm/unittests/Transforms/Vectorize/CMakeLists.txt b/llvm/unittests/Transforms/Vectorize/CMakeLists.txt index 0df39c41a9041..53eeff28c185f 100644 --- a/llvm/unittests/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/unittests/Transforms/Vectorize/CMakeLists.txt @@ -12,6 +12,7 @@ add_llvm_unittest(VectorizeTests VPlanTest.cpp VPDomTreeTest.cpp VPlanHCFGTest.cpp + VPlanPatternMatchTest.cpp VPlanSlpTest.cpp VPlanVerifierTest.cpp ) diff --git a/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp 
new file mode 100644 index 0000000000000..e38b4fad80b0e --- /dev/null +++ b/llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp @@ -0,0 +1,55 @@ +//===- llvm/unittests/Transforms/Vectorize/VPlanPatternMatchTest.cpp ------===// +// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../lib/Transforms/Vectorize/VPlanPatternMatch.h" +#include "../lib/Transforms/Vectorize/LoopVectorizationPlanner.h" +#include "../lib/Transforms/Vectorize/VPlan.h" +#include "../lib/Transforms/Vectorize/VPlanHelpers.h" +#include "VPlanTestBase.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "gtest/gtest.h" + +namespace llvm { + +namespace { +using VPPatternMatchTest = VPlanTestBase; + +TEST_F(VPPatternMatchTest, ScalarIVSteps) { + VPlan &Plan = getPlan(); + VPBasicBlock *VPBB = Plan.createVPBasicBlock(""); + VPBuilder Builder(VPBB); + + IntegerType *I64Ty = IntegerType::get(C, 64); + VPValue *StartV = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0)); + auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DebugLoc()); + Builder.insert(CanonicalIVPHI); + + VPValue *Inc = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1)); + VPValue *VF = &Plan.getVF(); + VPValue *Steps = Builder.createScalarIVSteps( + Instruction::Add, nullptr, CanonicalIVPHI, Inc, VF, DebugLoc()); + + VPValue *Inc2 = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 2)); + VPValue *Steps2 = Builder.createScalarIVSteps( + Instruction::Add, nullptr, CanonicalIVPHI, Inc2, VF, DebugLoc()); + + using namespace VPlanPatternMatch; + + ASSERT_TRUE(match(Steps, m_ScalarIVSteps(m_Specific(CanonicalIVPHI), + m_SpecificInt(1), m_Specific(VF)))); + ASSERT_FALSE( + match(Steps2, m_ScalarIVSteps(m_Specific(CanonicalIVPHI), + m_SpecificInt(1), 
m_Specific(VF)))); + ASSERT_TRUE(match(Steps2, m_ScalarIVSteps(m_Specific(CanonicalIVPHI), + m_SpecificInt(2), m_Specific(VF)))); +} + +} // namespace +} // namespace llvm diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index eec7b4480b75d..f0d943fe8f304 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1247,8 +1247,9 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { { // Test for a call to a function without side-effects. Module M("", C); + PointerType *PtrTy = PointerType::get(C, 0); Function *TheFn = - Intrinsic::getOrInsertDeclaration(&M, Intrinsic::thread_pointer); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::thread_pointer, PtrTy); auto *Call = CallInst::Create(TheFn->getFunctionType(), TheFn); VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index 84b7e33146811..0cce111ccd22c 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -143,6 +143,44 @@ TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { delete Phi; } +TEST_F(VPVerifierTest, VPPhiIncomingValueDoesntDominateIncomingBlock) { + VPlan &Plan = getPlan(); + IntegerType *Int32 = IntegerType::get(C, 32); + VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 0)); + + VPBasicBlock *VPBB1 = Plan.getEntry(); + VPBasicBlock *VPBB2 = Plan.createVPBasicBlock(""); + VPBasicBlock *VPBB3 = Plan.createVPBasicBlock(""); + + VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero}); + VPPhi *Phi = new VPPhi({DefI}, {}); + VPBB2->appendRecipe(Phi); + VPBB2->appendRecipe(DefI); + auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {}); + VPBB3->appendRecipe(CanIV); + + VPRegionBlock *R1 = Plan.createVPRegionBlock(VPBB3, 
VPBB3, "R1"); + VPBlockUtils::connectBlocks(VPBB1, VPBB2); + VPBlockUtils::connectBlocks(VPBB2, R1); +#if GTEST_HAS_STREAM_REDIRECTION + ::testing::internal::CaptureStderr(); +#endif + EXPECT_FALSE(verifyVPlanIsValid(Plan)); +#if GTEST_HAS_STREAM_REDIRECTION +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + EXPECT_STREQ("Incoming def at index 0 does not dominate incoming block!\n" + " EMIT vp<%2> = add ir<0>\n" + " does not dominate preheader for\n" + " EMIT vp<%1> = phi [ vp<%2>, preheader ]", + ::testing::internal::GetCapturedStderr().c_str()); +#else + EXPECT_STREQ("Incoming def at index 0 does not dominate incoming block!\n", :: + testing::internal::GetCapturedStderr() + .c_str()); +#endif +#endif +} + TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { VPlan &Plan = getPlan(); VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0)); diff --git a/llvm/utils/TableGen/AsmWriterEmitter.cpp b/llvm/utils/TableGen/AsmWriterEmitter.cpp index 3ecbd88b1d9f3..ebf1894b0d216 100644 --- a/llvm/utils/TableGen/AsmWriterEmitter.cpp +++ b/llvm/utils/TableGen/AsmWriterEmitter.cpp @@ -1031,13 +1031,10 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { // Change (any_of FeatureAll, (any_of ...)) to (any_of FeatureAll, ...). 
if (IsOr && D->getNumArgs() == 2 && isa(D->getArg(1))) { const DagInit *RHS = cast(D->getArg(1)); - SmallVector Args{D->getArg(0)}; - SmallVector ArgNames{D->getArgName(0)}; - for (unsigned i = 0, e = RHS->getNumArgs(); i != e; ++i) { - Args.push_back(RHS->getArg(i)); - ArgNames.push_back(RHS->getArgName(i)); - } - D = DagInit::get(D->getOperator(), nullptr, Args, ArgNames); + SmallVector> Args{ + *D->getArgAndNames().begin()}; + llvm::append_range(Args, RHS->getArgAndNames()); + D = DagInit::get(D->getOperator(), Args); } for (auto *Arg : D->getArgs()) { diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp index 339b8d6acd622..df37d7005215e 100644 --- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp +++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp @@ -77,6 +77,48 @@ static void generateEnumClass(ArrayRef Records, raw_ostream &OS, } } +// Generate enum class with values corresponding to different bit positions. +// Entries are emitted in the order in which they appear in the `Records` +// vector. +static void generateEnumBitmask(ArrayRef Records, + raw_ostream &OS, StringRef Enum, + StringRef Prefix, + const DirectiveLanguage &DirLang, + bool ExportEnums) { + assert(Records.size() <= 64 && "Too many values for a bitmask"); + llvm::StringRef Type = Records.size() <= 32 ? "uint32_t" : "uint64_t"; + llvm::StringRef TypeSuffix = Records.size() <= 32 ? 
"U" : "ULL"; + + OS << "\n"; + OS << "enum class " << Enum << " : " << Type << " {\n"; + std::string LastName; + for (auto [I, R] : llvm::enumerate(Records)) { + BaseRecord Rec(R); + LastName = Prefix.str() + Rec.getFormattedName(); + OS << " " << LastName << " = " << (1ull << I) << TypeSuffix << ",\n"; + } + OS << " LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/" << LastName << ")\n"; + OS << "};\n"; + OS << "\n"; + OS << "static constexpr std::size_t " << Enum + << "_enumSize = " << Records.size() << ";\n"; + + // Make the enum values available in the defined namespace. This allows us to + // write something like Enum_X if we have a `using namespace `. + // At the same time we do not loose the strong type guarantees of the enum + // class, that is we cannot pass an unsigned as Directive without an explicit + // cast. + if (ExportEnums) { + OS << "\n"; + for (const auto &R : Records) { + BaseRecord Rec(R); + OS << "constexpr auto " << Prefix << Rec.getFormattedName() << " = " + << "llvm::" << DirLang.getCppNamespace() << "::" << Enum + << "::" << Prefix << Rec.getFormattedName() << ";\n"; + } + } +} + // Generate enums for values that clauses can take. // Also generate function declarations for getName(StringRef Str). 
static void generateEnumClauseVal(ArrayRef Records, @@ -224,6 +266,9 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { generateEnumClass(DirLang.getCategories(), OS, "Category", /*Prefix=*/"", DirLang, /*ExportEnums=*/false); + generateEnumBitmask(DirLang.getSourceLanguages(), OS, "SourceLanguage", + /*Prefix=*/"", DirLang, /*ExportEnums=*/false); + // Emit Directive enumeration generateEnumClass(DirLang.getDirectives(), OS, "Directive", DirLang.getDirectivePrefix(), DirLang, @@ -267,6 +312,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { << getMaxLeafCount(DirLang) << "; }\n"; OS << "LLVM_ABI Association getDirectiveAssociation(Directive D);\n"; OS << "LLVM_ABI Category getDirectiveCategory(Directive D);\n"; + OS << "LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D);\n"; if (EnumHelperFuncs.length() > 0) { OS << EnumHelperFuncs; OS << "\n"; @@ -764,6 +810,34 @@ static void generateGetDirectiveCategory(const DirectiveLanguage &DirLang, OS << "}\n"; } +static void generateGetDirectiveLanguages(const DirectiveLanguage &DirLang, + raw_ostream &OS) { + std::string LangNamespace = "llvm::" + DirLang.getCppNamespace().str(); + std::string LanguageTypeName = LangNamespace + "::SourceLanguage"; + std::string LanguageNamespace = LanguageTypeName + "::"; + + OS << '\n'; + OS << LanguageTypeName << ' ' << LangNamespace << "::getDirectiveLanguages(" + << getDirectiveType(DirLang) << " D) {\n"; + OS << " switch (D) {\n"; + + for (const Record *R : DirLang.getDirectives()) { + Directive D(R); + OS << " case " << getDirectiveName(DirLang, R) << ":\n"; + OS << " return "; + llvm::interleave( + D.getSourceLanguages(), OS, + [&](const Record *L) { + OS << LanguageNamespace << BaseRecord::getFormattedName(L); + }, + " | "); + OS << ";\n"; + } + OS << " } // switch(D)\n"; + OS << " llvm_unreachable(\"Unexpected directive\");\n"; + OS << "}\n"; +} + namespace { enum class DirectiveClauseFE { Flang, Clang }; @@ 
-1264,6 +1338,9 @@ void emitDirectivesBasicImpl(const DirectiveLanguage &DirLang, // getDirectiveCategory(Directive D) generateGetDirectiveCategory(DirLang, OS); + // getDirectiveLanguages(Directive D) + generateGetDirectiveLanguages(DirLang, OS); + // Leaf table for getLeafConstructs, etc. emitLeafTable(DirLang, OS, "LeafConstructTable"); } diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp index b6bc52888d7e2..45f144627ac30 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp @@ -2906,11 +2906,7 @@ TreePatternNodePtr TreePattern::ParseTreePattern(const Init *TheInit, // TreePatternNode of its own. For example: /// (foo GPR, imm) -> (foo GPR, (imm)) if (R->isSubClassOf("SDNode") || R->isSubClassOf("PatFrags")) - return ParseTreePattern( - DagInit::get( - DI, nullptr, - std::vector>()), - OpName); + return ParseTreePattern(DagInit::get(DI, {}), OpName); // Input argument? TreePatternNodePtr Res = makeIntrusiveRefCnt(DI, 1); @@ -3413,10 +3409,8 @@ void CodeGenDAGPatterns::ParseDefaultOperands() { // Clone the DefaultInfo dag node, changing the operator from 'ops' to // SomeSDnode so that we can parse this. - std::vector> Ops; - for (unsigned op = 0, e = DefaultInfo->getNumArgs(); op != e; ++op) - Ops.emplace_back(DefaultInfo->getArg(op), DefaultInfo->getArgName(op)); - const DagInit *DI = DagInit::get(SomeSDNode, nullptr, Ops); + const DagInit *DI = DagInit::get(SomeSDNode, DefaultInfo->getArgs(), + DefaultInfo->getArgNames()); // Create a TreePattern to parse this. 
TreePattern P(DefaultOps[i], DI, false, *this); diff --git a/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp b/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp index 0a835bd7b0bc0..1d172ab6109c1 100644 --- a/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp +++ b/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp @@ -213,8 +213,9 @@ void VarLenInst::buildRec(const DagInit *DI) { if (NeedSwap) { // Normalization: Hi bit should always be the second argument. const Init *const NewArgs[] = {OperandName, LoBit, HiBit}; - Segments.push_back({NumBits, - DagInit::get(DI->getOperator(), nullptr, NewArgs, {}), + // TODO: This creates an invalid DagInit with 3 Args but 0 ArgNames. + // Extend unit test to exercise this and fix it. + Segments.push_back({NumBits, DagInit::get(DI->getOperator(), NewArgs, {}), CustomEncoder, CustomDecoder}); } else { Segments.push_back({NumBits, DI, CustomEncoder, CustomDecoder}); diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index 9aa6ec1064276..a8b6f79c176a7 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp @@ -35,7 +35,7 @@ using namespace llvm; /// namespace { struct InstructionMemo { - std::string Name; + StringRef Name; const CodeGenRegisterClass *RC; std::string SubRegNo; std::vector PhysRegs; @@ -71,10 +71,7 @@ class ImmPredicateSet { return Entry - 1; } - const TreePredicateFn &getPredicate(unsigned i) { - assert(i < PredsByName.size()); - return PredsByName[i]; - } + const TreePredicateFn &getPredicate(unsigned Idx) { return PredsByName[Idx]; } typedef std::vector::const_iterator iterator; iterator begin() const { return PredsByName.begin(); } @@ -151,37 +148,33 @@ struct OperandsSignature { bool empty() const { return Operands.empty(); } bool hasAnyImmediateCodes() const { - for (unsigned i = 0, e = Operands.size(); i != e; ++i) - if (Operands[i].isImm() && Operands[i].getImmCode() != 0) - return true; - return false; + return 
llvm::any_of(Operands, [](OpKind Kind) { + return Kind.isImm() && Kind.getImmCode() != 0; + }); } /// getWithoutImmCodes - Return a copy of this with any immediate codes forced /// to zero. OperandsSignature getWithoutImmCodes() const { OperandsSignature Result; - for (unsigned i = 0, e = Operands.size(); i != e; ++i) - if (!Operands[i].isImm()) - Result.Operands.push_back(Operands[i]); - else - Result.Operands.push_back(OpKind::getImm(0)); + Result.Operands.resize(Operands.size()); + llvm::transform(Operands, Result.Operands.begin(), [](OpKind Kind) { + return Kind.isImm() ? OpKind::getImm(0) : Kind; + }); return Result; } - void emitImmediatePredicate(raw_ostream &OS, ImmPredicateSet &ImmPredicates) { - bool EmittedAnything = false; - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - if (!Operands[i].isImm()) + void emitImmediatePredicate(raw_ostream &OS, + ImmPredicateSet &ImmPredicates) const { + ListSeparator LS(" &&\n "); + for (auto [Idx, Opnd] : enumerate(Operands)) { + if (!Opnd.isImm()) continue; - unsigned Code = Operands[i].getImmCode(); + unsigned Code = Opnd.getImmCode(); if (Code == 0) continue; - if (EmittedAnything) - OS << " &&\n "; - TreePredicateFn PredFn = ImmPredicates.getPredicate(Code - 1); // Emit the type check. 
@@ -189,10 +182,9 @@ struct OperandsSignature { ValueTypeByHwMode VVT = TP->getTree(0)->getType(0); assert(VVT.isSimple() && "Cannot use variable value types with fast isel"); - OS << "VT == " << getEnumName(VVT.getSimple().SimpleTy) << " && "; + OS << LS << "VT == " << getEnumName(VVT.getSimple().SimpleTy) << " && "; - OS << PredFn.getFnName() << "(imm" << i << ')'; - EmittedAnything = true; + OS << PredFn.getFnName() << "(imm" << Idx << ')'; } } @@ -304,77 +296,74 @@ struct OperandsSignature { void PrintParameters(raw_ostream &OS) const { ListSeparator LS; - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + for (auto [Idx, Opnd] : enumerate(Operands)) { OS << LS; - if (Operands[i].isReg()) { - OS << "Register Op" << i; - } else if (Operands[i].isImm()) { - OS << "uint64_t imm" << i; - } else if (Operands[i].isFP()) { - OS << "const ConstantFP *f" << i; - } else { + if (Opnd.isReg()) + OS << "Register Op" << Idx; + else if (Opnd.isImm()) + OS << "uint64_t imm" << Idx; + else if (Opnd.isFP()) + OS << "const ConstantFP *f" << Idx; + else llvm_unreachable("Unknown operand kind!"); - } } } - void PrintArguments(raw_ostream &OS, - const std::vector &PR) const { - assert(PR.size() == Operands.size()); + void PrintArguments(raw_ostream &OS, ArrayRef PhyRegs) const { ListSeparator LS; - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - if (PR[i] != "") + for (auto [Idx, Opnd, PhyReg] : enumerate(Operands, PhyRegs)) { + if (!PhyReg.empty()) { // Implicit physical register operand. 
continue; + } OS << LS; - if (Operands[i].isReg()) { - OS << "Op" << i; - } else if (Operands[i].isImm()) { - OS << "imm" << i; - } else if (Operands[i].isFP()) { - OS << "f" << i; - } else { + if (Opnd.isReg()) + OS << "Op" << Idx; + else if (Opnd.isImm()) + OS << "imm" << Idx; + else if (Opnd.isFP()) + OS << "f" << Idx; + else llvm_unreachable("Unknown operand kind!"); - } } } void PrintArguments(raw_ostream &OS) const { ListSeparator LS; - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + for (auto [Idx, Opnd] : enumerate(Operands)) { OS << LS; - if (Operands[i].isReg()) { - OS << "Op" << i; - } else if (Operands[i].isImm()) { - OS << "imm" << i; - } else if (Operands[i].isFP()) { - OS << "f" << i; - } else { + if (Opnd.isReg()) + OS << "Op" << Idx; + else if (Opnd.isImm()) + OS << "imm" << Idx; + else if (Opnd.isFP()) + OS << "f" << Idx; + else llvm_unreachable("Unknown operand kind!"); - } } } - void PrintManglingSuffix(raw_ostream &OS, const std::vector &PR, + void PrintManglingSuffix(raw_ostream &OS, ArrayRef PhyRegs, ImmPredicateSet &ImmPredicates, bool StripImmCodes = false) const { - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - if (PR[i] != "") + for (auto [PhyReg, Opnd] : zip_equal(PhyRegs, Operands)) { + if (!PhyReg.empty()) { // Implicit physical register operand. e.g. Instruction::Mul expect to // select to a binary op. On x86, mul may take a single operand with // the other operand being implicit. We must emit something that looks // like a binary instruction except for the very inner fastEmitInst_* // call. 
continue; - Operands[i].printManglingSuffix(OS, ImmPredicates, StripImmCodes); + } + Opnd.printManglingSuffix(OS, ImmPredicates, StripImmCodes); } } void PrintManglingSuffix(raw_ostream &OS, ImmPredicateSet &ImmPredicates, bool StripImmCodes = false) const { - for (unsigned i = 0, e = Operands.size(); i != e; ++i) - Operands[i].printManglingSuffix(OS, ImmPredicates, StripImmCodes); + for (OpKind Opnd : Operands) + Opnd.printManglingSuffix(OS, ImmPredicates, StripImmCodes); } }; } // End anonymous namespace @@ -386,14 +375,14 @@ class FastISelMap { typedef std::multimap PredMap; typedef std::map RetPredMap; typedef std::map TypeRetPredMap; - typedef std::map OpcodeTypeRetPredMap; + typedef std::map OpcodeTypeRetPredMap; typedef std::map OperandsOpcodeTypeRetPredMap; OperandsOpcodeTypeRetPredMap SimplePatterns; // This is used to check that there are no duplicate predicates - std::set> SimplePatternsCheck; @@ -412,20 +401,16 @@ class FastISelMap { private: void emitInstructionCode(raw_ostream &OS, const OperandsSignature &Operands, - const PredMap &PM, const std::string &RetVTName); + const PredMap &PM, StringRef RetVTName); }; } // End anonymous namespace -static std::string getOpcodeName(const Record *Op, - const CodeGenDAGPatterns &CGP) { - return CGP.getSDNodeInfo(Op).getEnumName().str(); -} - -static std::string getLegalCName(std::string OpName) { - std::string::size_type pos = OpName.find("::"); - if (pos != std::string::npos) - OpName.replace(pos, 2, "_"); - return OpName; +static std::string getLegalCName(StringRef OpName) { + std::string CName = OpName.str(); + std::string::size_type Pos = CName.find("::"); + if (Pos != std::string::npos) + CName.replace(Pos, 2, "_"); + return CName; } FastISelMap::FastISelMap(StringRef instns) : InstNS(instns) {} @@ -452,10 +437,7 @@ void FastISelMap::collectPatterns(const CodeGenDAGPatterns &CGP) { const CodeGenTarget &Target = CGP.getTargetInfo(); // Scan through all the patterns and record the simple ones. 
- for (CodeGenDAGPatterns::ptm_iterator I = CGP.ptm_begin(), E = CGP.ptm_end(); - I != E; ++I) { - const PatternToMatch &Pattern = *I; - + for (const PatternToMatch &Pattern : CGP.ptms()) { // For now, just look at Instructions, so that we don't have to worry // about emitting multiple instructions for a pattern. TreePatternNode &Dst = Pattern.getDstPattern(); @@ -464,15 +446,15 @@ void FastISelMap::collectPatterns(const CodeGenDAGPatterns &CGP) { const Record *Op = Dst.getOperator(); if (!Op->isSubClassOf("Instruction")) continue; - CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op); - if (II.Operands.empty()) + CodeGenInstruction &Inst = CGP.getTargetInfo().getInstruction(Op); + if (Inst.Operands.empty()) continue; // Allow instructions to be marked as unavailable for FastISel for // certain cases, i.e. an ISA has two 'and' instruction which differ // by what registers they can use but are otherwise identical for // codegen purposes. - if (II.FastISelShouldIgnore) + if (Inst.FastISelShouldIgnore) continue; // For now, ignore multi-instruction patterns. 
@@ -493,7 +475,7 @@ void FastISelMap::collectPatterns(const CodeGenDAGPatterns &CGP) { const CodeGenRegisterClass *DstRC = nullptr; std::string SubRegNo; if (Op->getName() != "EXTRACT_SUBREG") { - const Record *Op0Rec = II.Operands[0].Rec; + const Record *Op0Rec = Inst.Operands[0].Rec; if (Op0Rec->isSubClassOf("RegisterOperand")) Op0Rec = Op0Rec->getValueAsDef("RegClass"); if (!Op0Rec->isSubClassOf("RegisterClass")) @@ -524,7 +506,7 @@ void FastISelMap::collectPatterns(const CodeGenDAGPatterns &CGP) { continue; const Record *InstPatOp = InstPatNode.getOperator(); - std::string OpcodeName = getOpcodeName(InstPatOp, CGP); + StringRef OpcodeName = CGP.getSDNodeInfo(InstPatOp).getEnumName(); MVT::SimpleValueType RetVT = MVT::isVoid; if (InstPatNode.getNumTypes()) RetVT = InstPatNode.getSimpleType(0); @@ -591,7 +573,7 @@ void FastISelMap::collectPatterns(const CodeGenDAGPatterns &CGP) { DstRC, std::move(SubRegNo), std::move(PhysRegInputs), PredicateCheck); - int complexity = Pattern.getPatternComplexity(CGP); + int Complexity = Pattern.getPatternComplexity(CGP); auto inserted_simple_pattern = SimplePatternsCheck.insert( {Operands, OpcodeName, VT, RetVT, PredicateCheck}); @@ -602,7 +584,7 @@ void FastISelMap::collectPatterns(const CodeGenDAGPatterns &CGP) { // Note: Instructions with the same complexity will appear in the order // that they are encountered. - SimplePatterns[Operands][OpcodeName][VT][RetVT].emplace(complexity, + SimplePatterns[Operands][OpcodeName][VT][RetVT].emplace(Complexity, std::move(Memo)); // If any of the operands were immediates with predicates on them, strip @@ -631,16 +613,13 @@ void FastISelMap::printImmediatePredicates(raw_ostream &OS) { void FastISelMap::emitInstructionCode(raw_ostream &OS, const OperandsSignature &Operands, - const PredMap &PM, - const std::string &RetVTName) { + const PredMap &PM, StringRef RetVTName) { // Emit code for each possible instruction. There may be // multiple if there are subtarget concerns. 
A reverse iterator // is used to produce the ones with highest complexity first. bool OneHadNoPredicate = false; - for (PredMap::const_reverse_iterator PI = PM.rbegin(), PE = PM.rend(); - PI != PE; ++PI) { - const InstructionMemo &Memo = PI->second; + for (const auto &[_, Memo] : reverse(PM)) { std::string PredicateCheck = Memo.PredicateCheck; if (PredicateCheck.empty()) { @@ -659,11 +638,11 @@ void FastISelMap::emitInstructionCode(raw_ostream &OS, OS << " "; } - for (unsigned i = 0; i < Memo.PhysRegs.size(); ++i) { - if (Memo.PhysRegs[i] != "") + for (auto [Idx, PhyReg] : enumerate(Memo.PhysRegs)) { + if (!PhyReg.empty()) OS << " BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, " - << "TII.get(TargetOpcode::COPY), " << Memo.PhysRegs[i] - << ").addReg(Op" << i << ");\n"; + << "TII.get(TargetOpcode::COPY), " << PhyReg << ").addReg(Op" << Idx + << ");\n"; } OS << " return fastEmitInst_"; @@ -681,9 +660,8 @@ void FastISelMap::emitInstructionCode(raw_ostream &OS, << ");\n"; } - if (!PredicateCheck.empty()) { + if (!PredicateCheck.empty()) OS << " }\n"; - } } // Return Register() if all of the possibilities had predicates but none // were satisfied. @@ -699,48 +677,38 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) { const OperandsSignature &Operands = SimplePattern.first; const OpcodeTypeRetPredMap &OTM = SimplePattern.second; - for (const auto &I : OTM) { - const std::string &Opcode = I.first; - const TypeRetPredMap &TM = I.second; - + for (const auto &[Opcode, TM] : OTM) { OS << "// FastEmit functions for " << Opcode << ".\n"; OS << "\n"; // Emit one function for each opcode,type pair. 
- for (const auto &TI : TM) { - MVT::SimpleValueType VT = TI.first; - const RetPredMap &RM = TI.second; + for (const auto &[VT, RM] : TM) { if (RM.size() != 1) { - for (const auto &RI : RM) { - MVT::SimpleValueType RetVT = RI.first; - const PredMap &PM = RI.second; - + for (const auto &[RetVT, PM] : RM) { OS << "Register fastEmit_" << getLegalCName(Opcode) << "_" - << getLegalCName(getEnumName(VT).str()) << "_" - << getLegalCName(getEnumName(RetVT).str()) << "_"; + << getLegalCName(getEnumName(VT)) << "_" + << getLegalCName(getEnumName(RetVT)) << "_"; Operands.PrintManglingSuffix(OS, ImmediatePredicates); OS << "("; Operands.PrintParameters(OS); OS << ") {\n"; - emitInstructionCode(OS, Operands, PM, getEnumName(RetVT).str()); + emitInstructionCode(OS, Operands, PM, getEnumName(RetVT)); } // Emit one function for the type that demultiplexes on return type. OS << "Register fastEmit_" << getLegalCName(Opcode) << "_" - << getLegalCName(getEnumName(VT).str()) << "_"; + << getLegalCName(getEnumName(VT)) << "_"; Operands.PrintManglingSuffix(OS, ImmediatePredicates); OS << "(MVT RetVT"; if (!Operands.empty()) OS << ", "; Operands.PrintParameters(OS); OS << ") {\nswitch (RetVT.SimpleTy) {\n"; - for (const auto &RI : RM) { - MVT::SimpleValueType RetVT = RI.first; + for (const auto &[RetVT, _] : RM) { OS << " case " << getEnumName(RetVT) << ": return fastEmit_" - << getLegalCName(Opcode) << "_" - << getLegalCName(getEnumName(VT).str()) << "_" - << getLegalCName(getEnumName(RetVT).str()) << "_"; + << getLegalCName(Opcode) << "_" << getLegalCName(getEnumName(VT)) + << "_" << getLegalCName(getEnumName(RetVT)) << "_"; Operands.PrintManglingSuffix(OS, ImmediatePredicates); OS << "("; Operands.PrintArguments(OS); @@ -751,7 +719,7 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) { } else { // Non-variadic return type. 
OS << "Register fastEmit_" << getLegalCName(Opcode) << "_" - << getLegalCName(getEnumName(VT).str()) << "_"; + << getLegalCName(getEnumName(VT)) << "_"; Operands.PrintManglingSuffix(OS, ImmediatePredicates); OS << "(MVT RetVT"; if (!Operands.empty()) @@ -777,9 +745,8 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) { Operands.PrintParameters(OS); OS << ") {\n"; OS << " switch (VT.SimpleTy) {\n"; - for (const auto &TI : TM) { - MVT::SimpleValueType VT = TI.first; - std::string TypeName = getEnumName(VT).str(); + for (const auto &[VT, _] : TM) { + StringRef TypeName = getEnumName(VT); OS << " case " << TypeName << ": return fastEmit_" << getLegalCName(Opcode) << "_" << getLegalCName(TypeName) << "_"; Operands.PrintManglingSuffix(OS, ImmediatePredicates); @@ -825,15 +792,15 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) { // Check each in order it was seen. It would be nice to have a good // relative ordering between them, but we're not going for optimality // here. 
- for (unsigned i = 0, e = MI->second.size(); i != e; ++i) { + for (const OperandsSignature &Sig : MI->second) { OS << " if ("; - MI->second[i].emitImmediatePredicate(OS, ImmediatePredicates); + Sig.emitImmediatePredicate(OS, ImmediatePredicates); OS << ")\n if (Register Reg = fastEmit_"; - MI->second[i].PrintManglingSuffix(OS, ImmediatePredicates); + Sig.PrintManglingSuffix(OS, ImmediatePredicates); OS << "(VT, RetVT, Opcode"; - if (!MI->second[i].empty()) + if (!Sig.empty()) OS << ", "; - MI->second[i].PrintArguments(OS); + Sig.PrintArguments(OS); OS << "))\n return Reg;\n\n"; } @@ -842,9 +809,7 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) { } OS << " switch (Opcode) {\n"; - for (const auto &I : OTM) { - const std::string &Opcode = I.first; - + for (const auto &[Opcode, _] : OTM) { OS << " case " << Opcode << ": return fastEmit_" << getLegalCName(Opcode) << "_"; Operands.PrintManglingSuffix(OS, ImmediatePredicates); diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index 402fc93703228..f93e5fbcc4c27 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -17,6 +17,7 @@ #include "X86DisassemblerShared.h" #include "X86DisassemblerTables.h" #include "X86ModRMFilters.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TableGen/Record.h" #include @@ -437,11 +438,11 @@ void RecognizableInstr::adjustOperandEncoding(OperandEncoding &encoding) { "Invalid CDisp scaling"); } -void RecognizableInstr::handleOperand( - bool optional, unsigned &operandIndex, unsigned &physicalOperandIndex, - unsigned numPhysicalOperands, const unsigned *operandMapping, - OperandEncoding (*encodingFromString)(const std::string &, - uint8_t OpSize)) { +void RecognizableInstr::handleOperand(bool optional, unsigned &operandIndex, + unsigned &physicalOperandIndex, + unsigned numPhysicalOperands, + const unsigned *operandMapping, + 
EncodingFn encodingFromString) { if (optional) { if (physicalOperandIndex >= numPhysicalOperands) return; @@ -458,12 +459,12 @@ void RecognizableInstr::handleOperand( StringRef typeName = (*Operands)[operandIndex].Rec->getName(); - OperandEncoding encoding = encodingFromString(typeName.str(), OpSize); + OperandEncoding encoding = encodingFromString(typeName, OpSize); // Adjust the encoding type for an operand based on the instruction. adjustOperandEncoding(encoding); Spec->operands[operandIndex].encoding = encoding; Spec->operands[operandIndex].type = - typeFromString(typeName.str(), HasREX_W, OpSize); + typeFromString(typeName, HasREX_W, OpSize); ++operandIndex; ++physicalOperandIndex; @@ -1020,433 +1021,472 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const { #undef MAP } -#define TYPE(str, type) \ - if (s == str) \ - return type; -OperandType RecognizableInstr::typeFromString(const std::string &s, - bool hasREX_W, uint8_t OpSize) { +OperandType RecognizableInstr::typeFromString(StringRef Str, bool hasREX_W, + uint8_t OpSize) { + StringSwitch Switch(Str); if (hasREX_W) { // For instructions with a REX_W prefix, a declared 32-bit register encoding // is special. - TYPE("GR32", TYPE_R32) + Switch.Case("GR32", TYPE_R32); } if (OpSize == X86Local::OpSize16) { // For OpSize16 instructions, a declared 16-bit register or // immediate encoding is special. - TYPE("GR16", TYPE_Rv) + Switch.Case("GR16", TYPE_Rv); } else if (OpSize == X86Local::OpSize32) { // For OpSize32 instructions, a declared 32-bit register or // immediate encoding is special. 
- TYPE("GR32", TYPE_Rv) + Switch.Case("GR32", TYPE_Rv); } - TYPE("i16mem", TYPE_M) - TYPE("i16imm", TYPE_IMM) - TYPE("i16i8imm", TYPE_IMM) - TYPE("GR16", TYPE_R16) - TYPE("GR16orGR32orGR64", TYPE_R16) - TYPE("i32mem", TYPE_M) - TYPE("i32imm", TYPE_IMM) - TYPE("i32i8imm", TYPE_IMM) - TYPE("GR32", TYPE_R32) - TYPE("GR32orGR64", TYPE_R32) - TYPE("i64mem", TYPE_M) - TYPE("i64i32imm", TYPE_IMM) - TYPE("i64i8imm", TYPE_IMM) - TYPE("GR64", TYPE_R64) - TYPE("i8mem", TYPE_M) - TYPE("i8imm", TYPE_IMM) - TYPE("u4imm", TYPE_UIMM8) - TYPE("u8imm", TYPE_UIMM8) - TYPE("i16u8imm", TYPE_UIMM8) - TYPE("i32u8imm", TYPE_UIMM8) - TYPE("i64u8imm", TYPE_UIMM8) - TYPE("GR8", TYPE_R8) - TYPE("VR128", TYPE_XMM) - TYPE("VR128X", TYPE_XMM) - TYPE("f128mem", TYPE_M) - TYPE("f256mem", TYPE_M) - TYPE("f512mem", TYPE_M) - TYPE("FR128", TYPE_XMM) - TYPE("FR64", TYPE_XMM) - TYPE("FR64X", TYPE_XMM) - TYPE("f64mem", TYPE_M) - TYPE("sdmem", TYPE_M) - TYPE("FR16X", TYPE_XMM) - TYPE("FR32", TYPE_XMM) - TYPE("FR32X", TYPE_XMM) - TYPE("f32mem", TYPE_M) - TYPE("f16mem", TYPE_M) - TYPE("ssmem", TYPE_M) - TYPE("shmem", TYPE_M) - TYPE("RST", TYPE_ST) - TYPE("RSTi", TYPE_ST) - TYPE("i128mem", TYPE_M) - TYPE("i256mem", TYPE_M) - TYPE("i512mem", TYPE_M) - TYPE("i512mem_GR16", TYPE_M) - TYPE("i512mem_GR32", TYPE_M) - TYPE("i512mem_GR64", TYPE_M) - TYPE("i64i32imm_brtarget", TYPE_REL) - TYPE("i8imm_brtarget", TYPE_REL) - TYPE("i16imm_brtarget", TYPE_REL) - TYPE("i32imm_brtarget", TYPE_REL) - TYPE("ccode", TYPE_IMM) - TYPE("cflags", TYPE_IMM) - TYPE("AVX512RC", TYPE_IMM) - TYPE("brtarget32", TYPE_REL) - TYPE("brtarget16", TYPE_REL) - TYPE("brtarget8", TYPE_REL) - TYPE("f80mem", TYPE_M) - TYPE("lea64_8mem", TYPE_M) - TYPE("lea64_16mem", TYPE_M) - TYPE("lea64_32mem", TYPE_M) - TYPE("lea64mem", TYPE_M) - TYPE("VR64", TYPE_MM64) - TYPE("i64imm", TYPE_IMM) - TYPE("anymem", TYPE_M) - TYPE("opaquemem", TYPE_M) - TYPE("sibmem", TYPE_MSIB) - TYPE("SEGMENT_REG", TYPE_SEGMENTREG) - TYPE("DEBUG_REG", TYPE_DEBUGREG) - 
TYPE("CONTROL_REG", TYPE_CONTROLREG) - TYPE("srcidx8", TYPE_SRCIDX) - TYPE("srcidx16", TYPE_SRCIDX) - TYPE("srcidx32", TYPE_SRCIDX) - TYPE("srcidx64", TYPE_SRCIDX) - TYPE("dstidx8", TYPE_DSTIDX) - TYPE("dstidx16", TYPE_DSTIDX) - TYPE("dstidx32", TYPE_DSTIDX) - TYPE("dstidx64", TYPE_DSTIDX) - TYPE("offset16_8", TYPE_MOFFS) - TYPE("offset16_16", TYPE_MOFFS) - TYPE("offset16_32", TYPE_MOFFS) - TYPE("offset32_8", TYPE_MOFFS) - TYPE("offset32_16", TYPE_MOFFS) - TYPE("offset32_32", TYPE_MOFFS) - TYPE("offset32_64", TYPE_MOFFS) - TYPE("offset64_8", TYPE_MOFFS) - TYPE("offset64_16", TYPE_MOFFS) - TYPE("offset64_32", TYPE_MOFFS) - TYPE("offset64_64", TYPE_MOFFS) - TYPE("VR256", TYPE_YMM) - TYPE("VR256X", TYPE_YMM) - TYPE("VR512", TYPE_ZMM) - TYPE("VK1", TYPE_VK) - TYPE("VK1WM", TYPE_VK) - TYPE("VK2", TYPE_VK) - TYPE("VK2WM", TYPE_VK) - TYPE("VK4", TYPE_VK) - TYPE("VK4WM", TYPE_VK) - TYPE("VK8", TYPE_VK) - TYPE("VK8WM", TYPE_VK) - TYPE("VK16", TYPE_VK) - TYPE("VK16WM", TYPE_VK) - TYPE("VK32", TYPE_VK) - TYPE("VK32WM", TYPE_VK) - TYPE("VK64", TYPE_VK) - TYPE("VK64WM", TYPE_VK) - TYPE("VK1Pair", TYPE_VK_PAIR) - TYPE("VK2Pair", TYPE_VK_PAIR) - TYPE("VK4Pair", TYPE_VK_PAIR) - TYPE("VK8Pair", TYPE_VK_PAIR) - TYPE("VK16Pair", TYPE_VK_PAIR) - TYPE("vx32mem", TYPE_MVSIBX) - TYPE("vx64mem", TYPE_MVSIBX) - TYPE("vy32mem", TYPE_MVSIBY) - TYPE("vy64mem", TYPE_MVSIBY) - TYPE("vx32xmem", TYPE_MVSIBX) - TYPE("vx64xmem", TYPE_MVSIBX) - TYPE("vy32xmem", TYPE_MVSIBY) - TYPE("vy64xmem", TYPE_MVSIBY) - TYPE("vz32mem", TYPE_MVSIBZ) - TYPE("vz64mem", TYPE_MVSIBZ) - TYPE("BNDR", TYPE_BNDR) - TYPE("TILE", TYPE_TMM) - TYPE("TILEPair", TYPE_TMM_PAIR) - errs() << "Unhandled type string " << s << "\n"; + OperandType Type = Switch.Case("i16mem", TYPE_M) + .Case("i16imm", TYPE_IMM) + .Case("i16i8imm", TYPE_IMM) + .Case("GR16", TYPE_R16) + .Case("GR16orGR32orGR64", TYPE_R16) + .Case("i32mem", TYPE_M) + .Case("i32imm", TYPE_IMM) + .Case("i32i8imm", TYPE_IMM) + .Case("GR32", TYPE_R32) + .Case("GR32orGR64", 
TYPE_R32) + .Case("i64mem", TYPE_M) + .Case("i64i32imm", TYPE_IMM) + .Case("i64i8imm", TYPE_IMM) + .Case("GR64", TYPE_R64) + .Case("i8mem", TYPE_M) + .Case("i8imm", TYPE_IMM) + .Case("u4imm", TYPE_UIMM8) + .Case("u8imm", TYPE_UIMM8) + .Case("i16u8imm", TYPE_UIMM8) + .Case("i32u8imm", TYPE_UIMM8) + .Case("i64u8imm", TYPE_UIMM8) + .Case("GR8", TYPE_R8) + .Case("VR128", TYPE_XMM) + .Case("VR128X", TYPE_XMM) + .Case("f128mem", TYPE_M) + .Case("f256mem", TYPE_M) + .Case("f512mem", TYPE_M) + .Case("FR128", TYPE_XMM) + .Case("FR64", TYPE_XMM) + .Case("FR64X", TYPE_XMM) + .Case("f64mem", TYPE_M) + .Case("sdmem", TYPE_M) + .Case("FR16X", TYPE_XMM) + .Case("FR32", TYPE_XMM) + .Case("FR32X", TYPE_XMM) + .Case("f32mem", TYPE_M) + .Case("f16mem", TYPE_M) + .Case("ssmem", TYPE_M) + .Case("shmem", TYPE_M) + .Case("RST", TYPE_ST) + .Case("RSTi", TYPE_ST) + .Case("i128mem", TYPE_M) + .Case("i256mem", TYPE_M) + .Case("i512mem", TYPE_M) + .Case("i512mem_GR16", TYPE_M) + .Case("i512mem_GR32", TYPE_M) + .Case("i512mem_GR64", TYPE_M) + .Case("i64i32imm_brtarget", TYPE_REL) + .Case("i8imm_brtarget", TYPE_REL) + .Case("i16imm_brtarget", TYPE_REL) + .Case("i32imm_brtarget", TYPE_REL) + .Case("ccode", TYPE_IMM) + .Case("cflags", TYPE_IMM) + .Case("AVX512RC", TYPE_IMM) + .Case("brtarget32", TYPE_REL) + .Case("brtarget16", TYPE_REL) + .Case("brtarget8", TYPE_REL) + .Case("f80mem", TYPE_M) + .Case("lea64_8mem", TYPE_M) + .Case("lea64_16mem", TYPE_M) + .Case("lea64_32mem", TYPE_M) + .Case("lea64mem", TYPE_M) + .Case("VR64", TYPE_MM64) + .Case("i64imm", TYPE_IMM) + .Case("anymem", TYPE_M) + .Case("opaquemem", TYPE_M) + .Case("sibmem", TYPE_MSIB) + .Case("SEGMENT_REG", TYPE_SEGMENTREG) + .Case("DEBUG_REG", TYPE_DEBUGREG) + .Case("CONTROL_REG", TYPE_CONTROLREG) + .Case("srcidx8", TYPE_SRCIDX) + .Case("srcidx16", TYPE_SRCIDX) + .Case("srcidx32", TYPE_SRCIDX) + .Case("srcidx64", TYPE_SRCIDX) + .Case("dstidx8", TYPE_DSTIDX) + .Case("dstidx16", TYPE_DSTIDX) + .Case("dstidx32", TYPE_DSTIDX) + 
.Case("dstidx64", TYPE_DSTIDX) + .Case("offset16_8", TYPE_MOFFS) + .Case("offset16_16", TYPE_MOFFS) + .Case("offset16_32", TYPE_MOFFS) + .Case("offset32_8", TYPE_MOFFS) + .Case("offset32_16", TYPE_MOFFS) + .Case("offset32_32", TYPE_MOFFS) + .Case("offset32_64", TYPE_MOFFS) + .Case("offset64_8", TYPE_MOFFS) + .Case("offset64_16", TYPE_MOFFS) + .Case("offset64_32", TYPE_MOFFS) + .Case("offset64_64", TYPE_MOFFS) + .Case("VR256", TYPE_YMM) + .Case("VR256X", TYPE_YMM) + .Case("VR512", TYPE_ZMM) + .Case("VK1", TYPE_VK) + .Case("VK1WM", TYPE_VK) + .Case("VK2", TYPE_VK) + .Case("VK2WM", TYPE_VK) + .Case("VK4", TYPE_VK) + .Case("VK4WM", TYPE_VK) + .Case("VK8", TYPE_VK) + .Case("VK8WM", TYPE_VK) + .Case("VK16", TYPE_VK) + .Case("VK16WM", TYPE_VK) + .Case("VK32", TYPE_VK) + .Case("VK32WM", TYPE_VK) + .Case("VK64", TYPE_VK) + .Case("VK64WM", TYPE_VK) + .Case("VK1Pair", TYPE_VK_PAIR) + .Case("VK2Pair", TYPE_VK_PAIR) + .Case("VK4Pair", TYPE_VK_PAIR) + .Case("VK8Pair", TYPE_VK_PAIR) + .Case("VK16Pair", TYPE_VK_PAIR) + .Case("vx32mem", TYPE_MVSIBX) + .Case("vx64mem", TYPE_MVSIBX) + .Case("vy32mem", TYPE_MVSIBY) + .Case("vy64mem", TYPE_MVSIBY) + .Case("vx32xmem", TYPE_MVSIBX) + .Case("vx64xmem", TYPE_MVSIBX) + .Case("vy32xmem", TYPE_MVSIBY) + .Case("vy64xmem", TYPE_MVSIBY) + .Case("vz32mem", TYPE_MVSIBZ) + .Case("vz64mem", TYPE_MVSIBZ) + .Case("BNDR", TYPE_BNDR) + .Case("TILE", TYPE_TMM) + .Case("TILEPair", TYPE_TMM_PAIR) + .Default(TYPE_NONE); + + if (Type != TYPE_NONE) + return Type; + errs() << "Unhandled type string " << Str << "\n"; llvm_unreachable("Unhandled type string"); } -#undef TYPE -#define ENCODING(str, encoding) \ - if (s == str) \ - return encoding; -OperandEncoding -RecognizableInstr::immediateEncodingFromString(const std::string &s, - uint8_t OpSize) { +OperandEncoding RecognizableInstr::immediateEncodingFromString(StringRef Str, + uint8_t OpSize) { + StringSwitch Switch(Str); if (OpSize != X86Local::OpSize16) { // For instructions without an OpSize prefix, a 
declared 16-bit register or // immediate encoding is special. - ENCODING("i16imm", ENCODING_IW) + Switch.Case("i16imm", ENCODING_IW); } - ENCODING("i32i8imm", ENCODING_IB) - ENCODING("AVX512RC", ENCODING_IRC) - ENCODING("i16imm", ENCODING_Iv) - ENCODING("i16i8imm", ENCODING_IB) - ENCODING("i32imm", ENCODING_Iv) - ENCODING("i64i32imm", ENCODING_ID) - ENCODING("i64i8imm", ENCODING_IB) - ENCODING("i8imm", ENCODING_IB) - ENCODING("ccode", ENCODING_CC) - ENCODING("cflags", ENCODING_CF) - ENCODING("u4imm", ENCODING_IB) - ENCODING("u8imm", ENCODING_IB) - ENCODING("i16u8imm", ENCODING_IB) - ENCODING("i32u8imm", ENCODING_IB) - ENCODING("i64u8imm", ENCODING_IB) - // This is not a typo. Instructions like BLENDVPD put - // register IDs in 8-bit immediates nowadays. - ENCODING("FR32", ENCODING_IB) - ENCODING("FR64", ENCODING_IB) - ENCODING("FR128", ENCODING_IB) - ENCODING("VR128", ENCODING_IB) - ENCODING("VR256", ENCODING_IB) - ENCODING("FR16X", ENCODING_IB) - ENCODING("FR32X", ENCODING_IB) - ENCODING("FR64X", ENCODING_IB) - ENCODING("VR128X", ENCODING_IB) - ENCODING("VR256X", ENCODING_IB) - ENCODING("VR512", ENCODING_IB) - ENCODING("TILE", ENCODING_IB) - errs() << "Unhandled immediate encoding " << s << "\n"; + OperandEncoding Encoding = + Switch.Case("i32i8imm", ENCODING_IB) + .Case("AVX512RC", ENCODING_IRC) + .Case("i16imm", ENCODING_Iv) + .Case("i16i8imm", ENCODING_IB) + .Case("i32imm", ENCODING_Iv) + .Case("i64i32imm", ENCODING_ID) + .Case("i64i8imm", ENCODING_IB) + .Case("i8imm", ENCODING_IB) + .Case("ccode", ENCODING_CC) + .Case("cflags", ENCODING_CF) + .Case("u4imm", ENCODING_IB) + .Case("u8imm", ENCODING_IB) + .Case("i16u8imm", ENCODING_IB) + .Case("i32u8imm", ENCODING_IB) + .Case("i64u8imm", ENCODING_IB) + // This is not a typo. Instructions like BLENDVPD put + // register IDs in 8-bit immediates nowadays. 
+ .Case("FR32", ENCODING_IB) + .Case("FR64", ENCODING_IB) + .Case("FR128", ENCODING_IB) + .Case("VR128", ENCODING_IB) + .Case("VR256", ENCODING_IB) + .Case("FR16X", ENCODING_IB) + .Case("FR32X", ENCODING_IB) + .Case("FR64X", ENCODING_IB) + .Case("VR128X", ENCODING_IB) + .Case("VR256X", ENCODING_IB) + .Case("VR512", ENCODING_IB) + .Case("TILE", ENCODING_IB) + .Default(ENCODING_NONE); + + if (Encoding != ENCODING_NONE) + return Encoding; + errs() << "Unhandled immediate encoding " << Str << "\n"; llvm_unreachable("Unhandled immediate encoding"); } OperandEncoding -RecognizableInstr::rmRegisterEncodingFromString(const std::string &s, - uint8_t OpSize) { - ENCODING("RST", ENCODING_FP) - ENCODING("RSTi", ENCODING_FP) - ENCODING("GR16", ENCODING_RM) - ENCODING("GR16orGR32orGR64", ENCODING_RM) - ENCODING("GR32", ENCODING_RM) - ENCODING("GR32orGR64", ENCODING_RM) - ENCODING("GR64", ENCODING_RM) - ENCODING("GR8", ENCODING_RM) - ENCODING("VR128", ENCODING_RM) - ENCODING("VR128X", ENCODING_RM) - ENCODING("FR128", ENCODING_RM) - ENCODING("FR64", ENCODING_RM) - ENCODING("FR32", ENCODING_RM) - ENCODING("FR64X", ENCODING_RM) - ENCODING("FR32X", ENCODING_RM) - ENCODING("FR16X", ENCODING_RM) - ENCODING("VR64", ENCODING_RM) - ENCODING("VR256", ENCODING_RM) - ENCODING("VR256X", ENCODING_RM) - ENCODING("VR512", ENCODING_RM) - ENCODING("VK1", ENCODING_RM) - ENCODING("VK2", ENCODING_RM) - ENCODING("VK4", ENCODING_RM) - ENCODING("VK8", ENCODING_RM) - ENCODING("VK16", ENCODING_RM) - ENCODING("VK32", ENCODING_RM) - ENCODING("VK64", ENCODING_RM) - ENCODING("BNDR", ENCODING_RM) - ENCODING("TILE", ENCODING_RM) - ENCODING("TILEPair", ENCODING_RM) - errs() << "Unhandled R/M register encoding " << s << "\n"; +RecognizableInstr::rmRegisterEncodingFromString(StringRef Str, uint8_t OpSize) { + auto Encoding = StringSwitch(Str) + .Case("RST", ENCODING_FP) + .Case("RSTi", ENCODING_FP) + .Case("GR16", ENCODING_RM) + .Case("GR16orGR32orGR64", ENCODING_RM) + .Case("GR32", ENCODING_RM) + 
.Case("GR32orGR64", ENCODING_RM) + .Case("GR64", ENCODING_RM) + .Case("GR8", ENCODING_RM) + .Case("VR128", ENCODING_RM) + .Case("VR128X", ENCODING_RM) + .Case("FR128", ENCODING_RM) + .Case("FR64", ENCODING_RM) + .Case("FR32", ENCODING_RM) + .Case("FR64X", ENCODING_RM) + .Case("FR32X", ENCODING_RM) + .Case("FR16X", ENCODING_RM) + .Case("VR64", ENCODING_RM) + .Case("VR256", ENCODING_RM) + .Case("VR256X", ENCODING_RM) + .Case("VR512", ENCODING_RM) + .Case("VK1", ENCODING_RM) + .Case("VK2", ENCODING_RM) + .Case("VK4", ENCODING_RM) + .Case("VK8", ENCODING_RM) + .Case("VK16", ENCODING_RM) + .Case("VK32", ENCODING_RM) + .Case("VK64", ENCODING_RM) + .Case("BNDR", ENCODING_RM) + .Case("TILE", ENCODING_RM) + .Case("TILEPair", ENCODING_RM) + .Default(ENCODING_NONE); + if (Encoding != ENCODING_NONE) + return Encoding; + errs() << "Unhandled R/M register encoding " << Str << "\n"; llvm_unreachable("Unhandled R/M register encoding"); } OperandEncoding -RecognizableInstr::roRegisterEncodingFromString(const std::string &s, - uint8_t OpSize) { - ENCODING("GR16", ENCODING_REG) - ENCODING("GR16orGR32orGR64", ENCODING_REG) - ENCODING("GR32", ENCODING_REG) - ENCODING("GR32orGR64", ENCODING_REG) - ENCODING("GR64", ENCODING_REG) - ENCODING("GR8", ENCODING_REG) - ENCODING("VR128", ENCODING_REG) - ENCODING("FR128", ENCODING_REG) - ENCODING("FR64", ENCODING_REG) - ENCODING("FR32", ENCODING_REG) - ENCODING("VR64", ENCODING_REG) - ENCODING("SEGMENT_REG", ENCODING_REG) - ENCODING("DEBUG_REG", ENCODING_REG) - ENCODING("CONTROL_REG", ENCODING_REG) - ENCODING("VR256", ENCODING_REG) - ENCODING("VR256X", ENCODING_REG) - ENCODING("VR128X", ENCODING_REG) - ENCODING("FR64X", ENCODING_REG) - ENCODING("FR32X", ENCODING_REG) - ENCODING("FR16X", ENCODING_REG) - ENCODING("VR512", ENCODING_REG) - ENCODING("VK1", ENCODING_REG) - ENCODING("VK2", ENCODING_REG) - ENCODING("VK4", ENCODING_REG) - ENCODING("VK8", ENCODING_REG) - ENCODING("VK16", ENCODING_REG) - ENCODING("VK32", ENCODING_REG) - ENCODING("VK64", 
ENCODING_REG) - ENCODING("VK1Pair", ENCODING_REG) - ENCODING("VK2Pair", ENCODING_REG) - ENCODING("VK4Pair", ENCODING_REG) - ENCODING("VK8Pair", ENCODING_REG) - ENCODING("VK16Pair", ENCODING_REG) - ENCODING("VK1WM", ENCODING_REG) - ENCODING("VK2WM", ENCODING_REG) - ENCODING("VK4WM", ENCODING_REG) - ENCODING("VK8WM", ENCODING_REG) - ENCODING("VK16WM", ENCODING_REG) - ENCODING("VK32WM", ENCODING_REG) - ENCODING("VK64WM", ENCODING_REG) - ENCODING("BNDR", ENCODING_REG) - ENCODING("TILE", ENCODING_REG) - ENCODING("TILEPair", ENCODING_REG) - errs() << "Unhandled reg/opcode register encoding " << s << "\n"; +RecognizableInstr::roRegisterEncodingFromString(StringRef Str, uint8_t OpSize) { + auto Encoding = StringSwitch(Str) + .Case("GR16", ENCODING_REG) + .Case("GR16orGR32orGR64", ENCODING_REG) + .Case("GR32", ENCODING_REG) + .Case("GR32orGR64", ENCODING_REG) + .Case("GR64", ENCODING_REG) + .Case("GR8", ENCODING_REG) + .Case("VR128", ENCODING_REG) + .Case("FR128", ENCODING_REG) + .Case("FR64", ENCODING_REG) + .Case("FR32", ENCODING_REG) + .Case("VR64", ENCODING_REG) + .Case("SEGMENT_REG", ENCODING_REG) + .Case("DEBUG_REG", ENCODING_REG) + .Case("CONTROL_REG", ENCODING_REG) + .Case("VR256", ENCODING_REG) + .Case("VR256X", ENCODING_REG) + .Case("VR128X", ENCODING_REG) + .Case("FR64X", ENCODING_REG) + .Case("FR32X", ENCODING_REG) + .Case("FR16X", ENCODING_REG) + .Case("VR512", ENCODING_REG) + .Case("VK1", ENCODING_REG) + .Case("VK2", ENCODING_REG) + .Case("VK4", ENCODING_REG) + .Case("VK8", ENCODING_REG) + .Case("VK16", ENCODING_REG) + .Case("VK32", ENCODING_REG) + .Case("VK64", ENCODING_REG) + .Case("VK1Pair", ENCODING_REG) + .Case("VK2Pair", ENCODING_REG) + .Case("VK4Pair", ENCODING_REG) + .Case("VK8Pair", ENCODING_REG) + .Case("VK16Pair", ENCODING_REG) + .Case("VK1WM", ENCODING_REG) + .Case("VK2WM", ENCODING_REG) + .Case("VK4WM", ENCODING_REG) + .Case("VK8WM", ENCODING_REG) + .Case("VK16WM", ENCODING_REG) + .Case("VK32WM", ENCODING_REG) + .Case("VK64WM", ENCODING_REG) + 
.Case("BNDR", ENCODING_REG) + .Case("TILE", ENCODING_REG) + .Case("TILEPair", ENCODING_REG) + .Default(ENCODING_NONE); + + if (Encoding != ENCODING_NONE) + return Encoding; + + errs() << "Unhandled reg/opcode register encoding " << Str << "\n"; llvm_unreachable("Unhandled reg/opcode register encoding"); } OperandEncoding -RecognizableInstr::vvvvRegisterEncodingFromString(const std::string &s, +RecognizableInstr::vvvvRegisterEncodingFromString(StringRef Str, uint8_t OpSize) { - ENCODING("GR8", ENCODING_VVVV) - ENCODING("GR16", ENCODING_VVVV) - ENCODING("GR32", ENCODING_VVVV) - ENCODING("GR64", ENCODING_VVVV) - ENCODING("FR32", ENCODING_VVVV) - ENCODING("FR128", ENCODING_VVVV) - ENCODING("FR64", ENCODING_VVVV) - ENCODING("VR128", ENCODING_VVVV) - ENCODING("VR256", ENCODING_VVVV) - ENCODING("FR16X", ENCODING_VVVV) - ENCODING("FR32X", ENCODING_VVVV) - ENCODING("FR64X", ENCODING_VVVV) - ENCODING("VR128X", ENCODING_VVVV) - ENCODING("VR256X", ENCODING_VVVV) - ENCODING("VR512", ENCODING_VVVV) - ENCODING("VK1", ENCODING_VVVV) - ENCODING("VK2", ENCODING_VVVV) - ENCODING("VK4", ENCODING_VVVV) - ENCODING("VK8", ENCODING_VVVV) - ENCODING("VK16", ENCODING_VVVV) - ENCODING("VK32", ENCODING_VVVV) - ENCODING("VK64", ENCODING_VVVV) - ENCODING("TILE", ENCODING_VVVV) - ENCODING("TILEPair", ENCODING_VVVV) - errs() << "Unhandled VEX.vvvv register encoding " << s << "\n"; + auto Encoding = StringSwitch(Str) + .Case("GR8", ENCODING_VVVV) + .Case("GR16", ENCODING_VVVV) + .Case("GR32", ENCODING_VVVV) + .Case("GR64", ENCODING_VVVV) + .Case("FR32", ENCODING_VVVV) + .Case("FR128", ENCODING_VVVV) + .Case("FR64", ENCODING_VVVV) + .Case("VR128", ENCODING_VVVV) + .Case("VR256", ENCODING_VVVV) + .Case("FR16X", ENCODING_VVVV) + .Case("FR32X", ENCODING_VVVV) + .Case("FR64X", ENCODING_VVVV) + .Case("VR128X", ENCODING_VVVV) + .Case("VR256X", ENCODING_VVVV) + .Case("VR512", ENCODING_VVVV) + .Case("VK1", ENCODING_VVVV) + .Case("VK2", ENCODING_VVVV) + .Case("VK4", ENCODING_VVVV) + .Case("VK8", 
ENCODING_VVVV) + .Case("VK16", ENCODING_VVVV) + .Case("VK32", ENCODING_VVVV) + .Case("VK64", ENCODING_VVVV) + .Case("TILE", ENCODING_VVVV) + .Case("TILEPair", ENCODING_VVVV) + .Default(ENCODING_NONE); + + if (Encoding != ENCODING_NONE) + return Encoding; + + errs() << "Unhandled VEX.vvvv register encoding " << Str << "\n"; llvm_unreachable("Unhandled VEX.vvvv register encoding"); } OperandEncoding -RecognizableInstr::writemaskRegisterEncodingFromString(const std::string &s, +RecognizableInstr::writemaskRegisterEncodingFromString(StringRef Str, uint8_t OpSize) { - ENCODING("VK1WM", ENCODING_WRITEMASK) - ENCODING("VK2WM", ENCODING_WRITEMASK) - ENCODING("VK4WM", ENCODING_WRITEMASK) - ENCODING("VK8WM", ENCODING_WRITEMASK) - ENCODING("VK16WM", ENCODING_WRITEMASK) - ENCODING("VK32WM", ENCODING_WRITEMASK) - ENCODING("VK64WM", ENCODING_WRITEMASK) - errs() << "Unhandled mask register encoding " << s << "\n"; + auto Encoding = StringSwitch(Str) + .Case("VK1WM", ENCODING_WRITEMASK) + .Case("VK2WM", ENCODING_WRITEMASK) + .Case("VK4WM", ENCODING_WRITEMASK) + .Case("VK8WM", ENCODING_WRITEMASK) + .Case("VK16WM", ENCODING_WRITEMASK) + .Case("VK32WM", ENCODING_WRITEMASK) + .Case("VK64WM", ENCODING_WRITEMASK) + .Default(ENCODING_NONE); + + if (Encoding != ENCODING_NONE) + return Encoding; + + errs() << "Unhandled mask register encoding " << Str << "\n"; llvm_unreachable("Unhandled mask register encoding"); } -OperandEncoding -RecognizableInstr::memoryEncodingFromString(const std::string &s, - uint8_t OpSize) { - ENCODING("i16mem", ENCODING_RM) - ENCODING("i32mem", ENCODING_RM) - ENCODING("i64mem", ENCODING_RM) - ENCODING("i8mem", ENCODING_RM) - ENCODING("shmem", ENCODING_RM) - ENCODING("ssmem", ENCODING_RM) - ENCODING("sdmem", ENCODING_RM) - ENCODING("f128mem", ENCODING_RM) - ENCODING("f256mem", ENCODING_RM) - ENCODING("f512mem", ENCODING_RM) - ENCODING("f64mem", ENCODING_RM) - ENCODING("f32mem", ENCODING_RM) - ENCODING("f16mem", ENCODING_RM) - ENCODING("i128mem", ENCODING_RM) - 
ENCODING("i256mem", ENCODING_RM) - ENCODING("i512mem", ENCODING_RM) - ENCODING("i512mem_GR16", ENCODING_RM) - ENCODING("i512mem_GR32", ENCODING_RM) - ENCODING("i512mem_GR64", ENCODING_RM) - ENCODING("f80mem", ENCODING_RM) - ENCODING("lea64_8mem", ENCODING_RM) - ENCODING("lea64_16mem", ENCODING_RM) - ENCODING("lea64_32mem", ENCODING_RM) - ENCODING("lea64mem", ENCODING_RM) - ENCODING("anymem", ENCODING_RM) - ENCODING("opaquemem", ENCODING_RM) - ENCODING("sibmem", ENCODING_SIB) - ENCODING("vx32mem", ENCODING_VSIB) - ENCODING("vx64mem", ENCODING_VSIB) - ENCODING("vy32mem", ENCODING_VSIB) - ENCODING("vy64mem", ENCODING_VSIB) - ENCODING("vx32xmem", ENCODING_VSIB) - ENCODING("vx64xmem", ENCODING_VSIB) - ENCODING("vy32xmem", ENCODING_VSIB) - ENCODING("vy64xmem", ENCODING_VSIB) - ENCODING("vz32mem", ENCODING_VSIB) - ENCODING("vz64mem", ENCODING_VSIB) - errs() << "Unhandled memory encoding " << s << "\n"; +OperandEncoding RecognizableInstr::memoryEncodingFromString(StringRef Str, + uint8_t OpSize) { + auto Encoding = StringSwitch(Str) + .Case("i16mem", ENCODING_RM) + .Case("i32mem", ENCODING_RM) + .Case("i64mem", ENCODING_RM) + .Case("i8mem", ENCODING_RM) + .Case("shmem", ENCODING_RM) + .Case("ssmem", ENCODING_RM) + .Case("sdmem", ENCODING_RM) + .Case("f128mem", ENCODING_RM) + .Case("f256mem", ENCODING_RM) + .Case("f512mem", ENCODING_RM) + .Case("f64mem", ENCODING_RM) + .Case("f32mem", ENCODING_RM) + .Case("f16mem", ENCODING_RM) + .Case("i128mem", ENCODING_RM) + .Case("i256mem", ENCODING_RM) + .Case("i512mem", ENCODING_RM) + .Case("i512mem_GR16", ENCODING_RM) + .Case("i512mem_GR32", ENCODING_RM) + .Case("i512mem_GR64", ENCODING_RM) + .Case("f80mem", ENCODING_RM) + .Case("lea64_8mem", ENCODING_RM) + .Case("lea64_16mem", ENCODING_RM) + .Case("lea64_32mem", ENCODING_RM) + .Case("lea64mem", ENCODING_RM) + .Case("anymem", ENCODING_RM) + .Case("opaquemem", ENCODING_RM) + .Case("sibmem", ENCODING_SIB) + .Case("vx32mem", ENCODING_VSIB) + .Case("vx64mem", ENCODING_VSIB) + 
.Case("vy32mem", ENCODING_VSIB) + .Case("vy64mem", ENCODING_VSIB) + .Case("vx32xmem", ENCODING_VSIB) + .Case("vx64xmem", ENCODING_VSIB) + .Case("vy32xmem", ENCODING_VSIB) + .Case("vy64xmem", ENCODING_VSIB) + .Case("vz32mem", ENCODING_VSIB) + .Case("vz64mem", ENCODING_VSIB) + .Default(ENCODING_NONE); + + if (Encoding != ENCODING_NONE) + return Encoding; + + errs() << "Unhandled memory encoding " << Str << "\n"; llvm_unreachable("Unhandled memory encoding"); } OperandEncoding -RecognizableInstr::relocationEncodingFromString(const std::string &s, - uint8_t OpSize) { +RecognizableInstr::relocationEncodingFromString(StringRef Str, uint8_t OpSize) { + StringSwitch Switch(Str); + if (OpSize != X86Local::OpSize16) { // For instructions without an OpSize prefix, a declared 16-bit register or // immediate encoding is special. - ENCODING("i16imm", ENCODING_IW) + Switch.Case("i16imm", ENCODING_IW); } - ENCODING("i16imm", ENCODING_Iv) - ENCODING("i16i8imm", ENCODING_IB) - ENCODING("i32imm", ENCODING_Iv) - ENCODING("i32i8imm", ENCODING_IB) - ENCODING("i64i32imm", ENCODING_ID) - ENCODING("i64i8imm", ENCODING_IB) - ENCODING("i8imm", ENCODING_IB) - ENCODING("u8imm", ENCODING_IB) - ENCODING("i16u8imm", ENCODING_IB) - ENCODING("i32u8imm", ENCODING_IB) - ENCODING("i64u8imm", ENCODING_IB) - ENCODING("i64i32imm_brtarget", ENCODING_ID) - ENCODING("i16imm_brtarget", ENCODING_IW) - ENCODING("i32imm_brtarget", ENCODING_ID) - ENCODING("i8imm_brtarget", ENCODING_IB) - ENCODING("brtarget32", ENCODING_ID) - ENCODING("brtarget16", ENCODING_IW) - ENCODING("brtarget8", ENCODING_IB) - ENCODING("i64imm", ENCODING_IO) - ENCODING("offset16_8", ENCODING_Ia) - ENCODING("offset16_16", ENCODING_Ia) - ENCODING("offset16_32", ENCODING_Ia) - ENCODING("offset32_8", ENCODING_Ia) - ENCODING("offset32_16", ENCODING_Ia) - ENCODING("offset32_32", ENCODING_Ia) - ENCODING("offset32_64", ENCODING_Ia) - ENCODING("offset64_8", ENCODING_Ia) - ENCODING("offset64_16", ENCODING_Ia) - ENCODING("offset64_32", ENCODING_Ia) - 
ENCODING("offset64_64", ENCODING_Ia) - ENCODING("srcidx8", ENCODING_SI) - ENCODING("srcidx16", ENCODING_SI) - ENCODING("srcidx32", ENCODING_SI) - ENCODING("srcidx64", ENCODING_SI) - ENCODING("dstidx8", ENCODING_DI) - ENCODING("dstidx16", ENCODING_DI) - ENCODING("dstidx32", ENCODING_DI) - ENCODING("dstidx64", ENCODING_DI) - errs() << "Unhandled relocation encoding " << s << "\n"; + + OperandEncoding Encoding = Switch.Case("i16imm", ENCODING_Iv) + .Case("i16i8imm", ENCODING_IB) + .Case("i32imm", ENCODING_Iv) + .Case("i32i8imm", ENCODING_IB) + .Case("i64i32imm", ENCODING_ID) + .Case("i64i8imm", ENCODING_IB) + .Case("i8imm", ENCODING_IB) + .Case("u8imm", ENCODING_IB) + .Case("i16u8imm", ENCODING_IB) + .Case("i32u8imm", ENCODING_IB) + .Case("i64u8imm", ENCODING_IB) + .Case("i64i32imm_brtarget", ENCODING_ID) + .Case("i16imm_brtarget", ENCODING_IW) + .Case("i32imm_brtarget", ENCODING_ID) + .Case("i8imm_brtarget", ENCODING_IB) + .Case("brtarget32", ENCODING_ID) + .Case("brtarget16", ENCODING_IW) + .Case("brtarget8", ENCODING_IB) + .Case("i64imm", ENCODING_IO) + .Case("offset16_8", ENCODING_Ia) + .Case("offset16_16", ENCODING_Ia) + .Case("offset16_32", ENCODING_Ia) + .Case("offset32_8", ENCODING_Ia) + .Case("offset32_16", ENCODING_Ia) + .Case("offset32_32", ENCODING_Ia) + .Case("offset32_64", ENCODING_Ia) + .Case("offset64_8", ENCODING_Ia) + .Case("offset64_16", ENCODING_Ia) + .Case("offset64_32", ENCODING_Ia) + .Case("offset64_64", ENCODING_Ia) + .Case("srcidx8", ENCODING_SI) + .Case("srcidx16", ENCODING_SI) + .Case("srcidx32", ENCODING_SI) + .Case("srcidx64", ENCODING_SI) + .Case("dstidx8", ENCODING_DI) + .Case("dstidx16", ENCODING_DI) + .Case("dstidx32", ENCODING_DI) + .Case("dstidx64", ENCODING_DI) + .Default(ENCODING_NONE); + + if (Encoding != ENCODING_NONE) + return Encoding; + + errs() << "Unhandled relocation encoding " << Str << "\n"; llvm_unreachable("Unhandled relocation encoding"); } OperandEncoding -RecognizableInstr::opcodeModifierEncodingFromString(const 
std::string &s, +RecognizableInstr::opcodeModifierEncodingFromString(StringRef Str, uint8_t OpSize) { - ENCODING("GR32", ENCODING_Rv) - ENCODING("GR64", ENCODING_RO) - ENCODING("GR16", ENCODING_Rv) - ENCODING("GR8", ENCODING_RB) - ENCODING("ccode", ENCODING_CC) - errs() << "Unhandled opcode modifier encoding " << s << "\n"; + auto Encoding = StringSwitch(Str) + .Case("GR32", ENCODING_Rv) + .Case("GR64", ENCODING_RO) + .Case("GR16", ENCODING_Rv) + .Case("GR8", ENCODING_RB) + .Case("ccode", ENCODING_CC) + .Default(ENCODING_NONE); + if (Encoding != ENCODING_NONE) + return Encoding; + + errs() << "Unhandled opcode modifier encoding " << Str << "\n"; llvm_unreachable("Unhandled opcode modifier encoding"); } -#undef ENCODING diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h index eb2cee7bbbf87..2f60b99154796 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.h +++ b/llvm/utils/TableGen/X86RecognizableInstr.h @@ -281,7 +281,7 @@ class RecognizableInstr : public RecognizableInstrBase { /// If register size does not match OpSize, then /// register sizes keep their size. /// @return - The operand's type. - static OperandType typeFromString(const std::string &s, bool hasREX_W, + static OperandType typeFromString(StringRef Str, bool hasREX_W, uint8_t OpSize); /// immediateEncodingFromString - Translates an immediate encoding from the @@ -292,28 +292,28 @@ class RecognizableInstr : public RecognizableInstrBase { /// @param OpSize - Indicates whether this is an OpSize16 instruction. /// If it is not, then 16-bit immediate operands stay 16-bit. /// @return - The operand's encoding. - static OperandEncoding immediateEncodingFromString(const std::string &s, + static OperandEncoding immediateEncodingFromString(StringRef Str, uint8_t OpSize); /// rmRegisterEncodingFromString - Like immediateEncodingFromString, but /// handles operands that are in the REG field of the ModR/M byte. 
- static OperandEncoding rmRegisterEncodingFromString(const std::string &s, + static OperandEncoding rmRegisterEncodingFromString(StringRef Str, uint8_t OpSize); /// rmRegisterEncodingFromString - Like immediateEncodingFromString, but /// handles operands that are in the REG field of the ModR/M byte. - static OperandEncoding roRegisterEncodingFromString(const std::string &s, + static OperandEncoding roRegisterEncodingFromString(StringRef Str, uint8_t OpSize); - static OperandEncoding memoryEncodingFromString(const std::string &s, + static OperandEncoding memoryEncodingFromString(StringRef Str, uint8_t OpSize); - static OperandEncoding relocationEncodingFromString(const std::string &s, + static OperandEncoding relocationEncodingFromString(StringRef Str, uint8_t OpSize); - static OperandEncoding opcodeModifierEncodingFromString(const std::string &s, + static OperandEncoding opcodeModifierEncodingFromString(StringRef Str, uint8_t OpSize); - static OperandEncoding vvvvRegisterEncodingFromString(const std::string &s, + static OperandEncoding vvvvRegisterEncodingFromString(StringRef Str, uint8_t OpSize); - static OperandEncoding - writemaskRegisterEncodingFromString(const std::string &s, uint8_t OpSize); + static OperandEncoding writemaskRegisterEncodingFromString(StringRef Str, + uint8_t OpSize); /// Adjust the encoding type for an operand based on the instruction. void adjustOperandEncoding(OperandEncoding &encoding); @@ -336,12 +336,13 @@ class RecognizableInstr : public RecognizableInstrBase { /// @param operandMapping - The operand mapping, which has an entry for /// each operand that indicates whether it is a /// duplicate, and of what. 
+ using EncodingFn = + llvm::function_ref; void handleOperand(bool optional, unsigned &operandIndex, unsigned &physicalOperandIndex, unsigned numPhysicalOperands, const unsigned *operandMapping, - OperandEncoding (*encodingFromString)(const std::string &, - uint8_t OpSize)); + EncodingFn encodingFromString); /// emitInstructionSpecifier - Loads the instruction specifier for the current /// instruction into a DisassemblerTables. diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn index 95196fc3ebf72..3794e15189e50 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn @@ -22,6 +22,7 @@ static_library("clang-doc") { "ClangDoc.cpp", "Generators.cpp", "HTMLGenerator.cpp", + "HTMLMustacheGenerator.cpp", "MDGenerator.cpp", "Mapper.cpp", "Representation.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn index 17ee6c3dee677..fbb1df4891ead 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn @@ -16,6 +16,7 @@ unittest("ClangDocTests") { "//llvm/lib/Bitcode/Reader", "//llvm/lib/Bitcode/Writer", "//llvm/lib/Support", + "//llvm/lib/Testing/Support", ] include_dirs = [ "//clang-tools-extra/clang-doc" ] sources = [ @@ -23,6 +24,7 @@ unittest("ClangDocTests") { "ClangDocTest.cpp", "GeneratorTest.cpp", "HTMLGeneratorTest.cpp", + "HTMLMustacheGeneratorTest.cpp", "MDGeneratorTest.cpp", "MergeTest.cpp", "SerializeTest.cpp", diff --git a/llvm/utils/gn/secondary/clang/unittests/CIR/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/CIR/BUILD.gn new file mode 100644 index 0000000000000..c480200e18c8c --- /dev/null +++ b/llvm/utils/gn/secondary/clang/unittests/CIR/BUILD.gn @@ -0,0 +1,5 @@ +# Dummy target 
because real CIRUnitTests depends on //mlir, which isn't +# part of the GN build. +group("CIRUnitTests") { + sources = [ "PointerLikeTest.cpp" ] +} diff --git a/llvm/utils/gn/secondary/llvm/lib/DebugInfo/GSYM/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/DebugInfo/GSYM/BUILD.gn index 157fa6e885afc..40cddfa67d3bf 100644 --- a/llvm/utils/gn/secondary/llvm/lib/DebugInfo/GSYM/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/DebugInfo/GSYM/BUILD.gn @@ -12,7 +12,7 @@ static_library("GSYM") { "FileWriter.cpp", "FunctionInfo.cpp", "GsymCreator.cpp", - "GsymDIContext.cpp", + "GsymContext.cpp", "GsymReader.cpp", "Header.cpp", "InlineInfo.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn index 274f5b54345c7..4230c55da6420 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn @@ -3,6 +3,7 @@ static_library("Passes") { deps = [ "//llvm/lib/Analysis", "//llvm/lib/CodeGen", + "//llvm/lib/CodeGen/GlobalISel", "//llvm/lib/IR", "//llvm/lib/IRPrinter", "//llvm/lib/Support", diff --git a/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn index 244688dd00ba5..d8fd6be468246 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn @@ -9,6 +9,7 @@ static_library("ProfileData") { "//llvm/lib/TargetParser", ] sources = [ + "DataAccessProf.cpp", "GCOV.cpp", "IndexedMemProfData.cpp", "InstrProf.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn index 56dd2dcd170ea..e0fb8198e7892 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn @@ -8,6 +8,7 @@ static_library("Vectorize") { "//llvm/lib/Transforms/Utils", ] sources = [ + "EVLIndVarSimplify.cpp", 
"LoadStoreVectorizer.cpp", "LoopIdiomVectorize.cpp", "LoopVectorizationLegality.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn index 456c4f97c7f25..9a76fe6a84781 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn @@ -28,6 +28,7 @@ unittest("CodeGenTests") { "DIETest.cpp", "DroppedVariableStatsMIRTest.cpp", "DwarfStringPoolEntryRefTest.cpp", + "GCMetadata.cpp", "InstrRefLDVTest.cpp", "LexicalScopesTest.cpp", "LowLevelTypeTest.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn index f45542519173e..9ff66aa86eb6d 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn @@ -10,6 +10,7 @@ unittest("ProfileDataTests") { sources = [ "BPFunctionNodeTest.cpp", "CoverageMappingTest.cpp", + "DataAccessProfTest.cpp", "InstrProfDataTest.cpp", "InstrProfTest.cpp", "ItaniumManglingCanonicalizerTest.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/BUILD.gn index d561917a974f4..4c84add8612e8 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Transforms/Vectorize/BUILD.gn @@ -11,6 +11,7 @@ unittest("VectorizeTests") { sources = [ "VPDomTreeTest.cpp", "VPlanHCFGTest.cpp", + "VPlanPatternMatchTest.cpp", "VPlanSlpTest.cpp", "VPlanTest.cpp", "VPlanVerifierTest.cpp", diff --git a/mlir/docs/Bufferization.md b/mlir/docs/Bufferization.md index 02cfee5f2b8dc..e04934a120a00 100644 --- a/mlir/docs/Bufferization.md +++ b/mlir/docs/Bufferization.md @@ -202,13 +202,13 @@ e.g.: %2 = "my_dialect.yet_another_op"(%0) : (tensor) -> (tensor) ``` -## Tensor / MemRef Boundary +## Tensor / Buffer 
Boundary The bufferization dialect provides a few helper ops to connect tensor IR (that should be bufferized) with existing buffers (that may be allocated/provided by a different runtime/library/etc.). -`bufferization.to_memref %t` returns the future buffer of a tensor SSA value. +`bufferization.to_buffer %t` returns the future buffer of a tensor SSA value. `bufferization.to_tensor %m` returns a tensor SSA value for a given MemRef buffer. `bufferization.materialize_in_destination` indicates that a tensor value should materialize in a certain buffer. @@ -268,7 +268,7 @@ By default, One-Shot Bufferize fails when it encounters an op with tensor semantics (i.e., tensor result or tensor operand) that is not bufferizable (i.e., does not implement `BufferizableOpInterface`). This can be avoided with `allow-unknown-ops`. In that case, One-Shot Bufferize inserts -`to_memref`/`to_tensor` ops around the bufferization boundary. +`to_buffer`/`to_tensor` ops around the bufferization boundary. One-Shot Bufferize can be configured to bufferize only ops from a set of dialects with `dialect-filter`. @@ -291,7 +291,7 @@ memref. The layout map of the memref type can be controlled with One-Shot Bufferize bufferizes ops from top to bottom. This works well when all ops are bufferizable. However, when encountering a non-bufferizable tensor with -`allow-unknown-ops`, One-Shot Bufferize must insert `to_memref` ops at the +`allow-unknown-ops`, One-Shot Bufferize must insert `to_buffer` ops at the bufferization boundary and decide on a memref type. By default, One-Shot Bufferize choose the most dynamic memref type wrt. layout maps. E.g.: @@ -300,12 +300,12 @@ Bufferize choose the most dynamic memref type wrt. layout maps. 
E.g.: %1 = tensor.extract %0[%idx1, %idx2] : tensor ``` -When bufferizing the above IR, One-Shot Bufferize inserts a `to_memref` ops with +When bufferizing the above IR, One-Shot Bufferize inserts a `to_buffer` ops with dynamic offset and strides: ```mlir %0 = "my_dialect.unbufferizable_op(%t) : (tensor) -> (tensor) -%0_m = bufferization.to_memref %0 : memref> +%0_m = bufferization.to_buffer %0 : memref> %1 = memref.load %0_m[%idx1, %idx2] : memref> ``` @@ -335,7 +335,7 @@ generation of layout maps when no precise layout can be inferred: * `identity-layout-map` uses static identity layout maps. This option can be useful for legacy code that cannot handle memref types with layout maps. Note that this setting can lead to additional buffer copies when folding a - `to_tensor`/`to_memref` pair with memref types that are not cast-compatible. + `to_tensor`/`to_buffer` pair with memref types that are not cast-compatible. Note: The `unknown-type-conversion` option does not affect layout maps of function signatures. 
There is a separate `function-signature-type-conversion` diff --git a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td index cdcf4d8752e87..7385bb73b449a 100644 --- a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td +++ b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td @@ -61,6 +61,13 @@ class Scalable1DVectorOfLength elementTypes> : ShapedCont "a 1-D scalable vector with length " # length, "::mlir::VectorType">; +def SVEVector : AnyTypeOf<[ + Scalable1DVectorOfLength<2, [I64, F64]>, + Scalable1DVectorOfLength<4, [I32, F32]>, + Scalable1DVectorOfLength<8, [I16, F16, BF16]>, + Scalable1DVectorOfLength<16, [I8]>], + "an SVE vector with element size <= 64-bit">; + //===----------------------------------------------------------------------===// // ArmSVE op definitions //===----------------------------------------------------------------------===// @@ -72,14 +79,22 @@ class ArmSVE_IntrOp traits = [], list overloadedOperands = [], list overloadedResults = [], - int numResults = 1> : + int numResults = 1, + list immArgPositions = [], + list immArgAttrNames = []> : LLVM_IntrOpBase overloadedResults=*/overloadedResults, /*list overloadedOperands=*/overloadedOperands, /*list traits=*/traits, - /*int numResults=*/numResults>; + /*int numResults=*/numResults, + /*bit requiresAccessGroup=*/0, + /*bit requiresAliasAnalysis=*/0, + /*bit requiresFastmath=*/0, + /*bit requiresOpBundles=*/0, + /*list immArgPositions=*/immArgPositions, + /*list immArgAttrNames=*/immArgAttrNames>; class ArmSVE_IntrBinaryOverloadedOp traits = []>: @@ -132,11 +147,9 @@ class ScalableMaskedIOp, - AllTypesMatch<["acc", "dst"]>, - ]> { +def SdotOp : ArmSVE_Op<"sdot", [Pure, + AllTypesMatch<["src1", "src2"]>, + AllTypesMatch<["acc", "dst"]>]> { let summary = "Vector-vector dot product and accumulate op"; let description = [{ SDOT: Signed integer addition of dot product. 
@@ -163,11 +176,9 @@ def SdotOp : ArmSVE_Op<"sdot", "$acc `,` $src1 `,` $src2 attr-dict `:` type($src1) `to` type($dst)"; } -def SmmlaOp : ArmSVE_Op<"smmla", - [Pure, - AllTypesMatch<["src1", "src2"]>, - AllTypesMatch<["acc", "dst"]>, - ]> { +def SmmlaOp : ArmSVE_Op<"smmla", [Pure, + AllTypesMatch<["src1", "src2"]>, + AllTypesMatch<["acc", "dst"]>]> { let summary = "Matrix-matrix multiply and accumulate op"; let description = [{ SMMLA: Signed integer matrix multiply-accumulate. @@ -195,11 +206,9 @@ def SmmlaOp : ArmSVE_Op<"smmla", "$acc `,` $src1 `,` $src2 attr-dict `:` type($src1) `to` type($dst)"; } -def UdotOp : ArmSVE_Op<"udot", - [Pure, - AllTypesMatch<["src1", "src2"]>, - AllTypesMatch<["acc", "dst"]>, - ]> { +def UdotOp : ArmSVE_Op<"udot", [Pure, + AllTypesMatch<["src1", "src2"]>, + AllTypesMatch<["acc", "dst"]>]> { let summary = "Vector-vector dot product and accumulate op"; let description = [{ UDOT: Unsigned integer addition of dot product. @@ -226,11 +235,9 @@ def UdotOp : ArmSVE_Op<"udot", "$acc `,` $src1 `,` $src2 attr-dict `:` type($src1) `to` type($dst)"; } -def UmmlaOp : ArmSVE_Op<"ummla", - [Pure, - AllTypesMatch<["src1", "src2"]>, - AllTypesMatch<["acc", "dst"]>, - ]> { +def UmmlaOp : ArmSVE_Op<"ummla", [Pure, + AllTypesMatch<["src1", "src2"]>, + AllTypesMatch<["acc", "dst"]>]> { let summary = "Matrix-matrix multiply and accumulate op"; let description = [{ UMMLA: Unsigned integer matrix multiply-accumulate. @@ -258,14 +265,42 @@ def UmmlaOp : ArmSVE_Op<"ummla", "$acc `,` $src1 `,` $src2 attr-dict `:` type($src1) `to` type($dst)"; } +def UsmmlaOp : ArmSVE_Op<"usmmla", [Pure, + AllTypesMatch<["src1", "src2"]>, + AllTypesMatch<["acc", "dst"]>]> { + let summary = "Matrix-matrix multiply and accumulate op"; + let description = [{ + USMMLA: Unsigned by signed integer matrix multiply-accumulate. 
+ + The unsigned by signed integer matrix multiply-accumulate operation + multiplies the 2×8 matrix of unsigned 8-bit integer values held in + the first source vector by the 8×2 matrix of signed 8-bit integer + values in the second source vector. The resulting 2×2 widened 32-bit + integer matrix product is then added to the 32-bit integer matrix + accumulator. + + Source: + https://developer.arm.com/documentation/100987/0000 + }]; + // Supports (vector<16xi8>, vector<16xi8>) -> (vector<4xi32>) + let arguments = (ins + ScalableVectorOfLengthAndType<[4], [I32]>:$acc, + ScalableVectorOfLengthAndType<[16], [I8]>:$src1, + ScalableVectorOfLengthAndType<[16], [I8]>:$src2 + ); + let results = (outs ScalableVectorOfLengthAndType<[4], [I32]>:$dst); + let assemblyFormat = + "$acc `,` $src1 `,` $src2 attr-dict `:` type($src1) `to` type($dst)"; +} + class SvboolTypeConstraint : TypesMatchWith< "expected corresponding svbool type widened to [16]xi1", lhsArg, rhsArg, "VectorType(VectorType::Builder(::llvm::cast($_self)).setDim(::llvm::cast($_self).getRank() - 1, 16))">; def ConvertFromSvboolOp : ArmSVE_Op<"convert_from_svbool", - [Pure, SvboolTypeConstraint<"result", "source">]> -{ + [Pure, + SvboolTypeConstraint<"result", "source">]> { let summary = "Convert a svbool type to a SVE predicate type"; let description = [{ Converts svbool types (`vector<[16]xi1>` or vectors of that type, e.g. @@ -298,8 +333,8 @@ def ConvertFromSvboolOp : ArmSVE_Op<"convert_from_svbool", } def ConvertToSvboolOp : ArmSVE_Op<"convert_to_svbool", - [Pure, SvboolTypeConstraint<"source", "result">]> -{ + [Pure, + SvboolTypeConstraint<"source", "result">]> { let summary = "Convert a SVE predicate type to a svbool type"; let description = [{ Converts SVE predicate types (or vectors of predicate types, e.g.
@@ -341,10 +376,9 @@ def ZipInputVectorType : AnyTypeOf<[ Scalable1DVectorOfLength<16, [I8]>], "an SVE vector with element size <= 64-bit">; -def ZipX2Op : ArmSVE_Op<"zip.x2", [ - Pure, - AllTypesMatch<["sourceV1", "sourceV2", "resultV1", "resultV2"]>] -> { +def ZipX2Op : ArmSVE_Op<"zip.x2", [Pure, + AllTypesMatch<["sourceV1", "sourceV2", + "resultV1", "resultV2"]>]> { let summary = "Multi-vector two-way zip op"; let description = [{ @@ -385,12 +419,11 @@ def ZipX2Op : ArmSVE_Op<"zip.x2", [ }]; } -def ZipX4Op : ArmSVE_Op<"zip.x4", [ - Pure, - AllTypesMatch<[ - "sourceV1", "sourceV2", "sourceV3", "sourceV4", - "resultV1", "resultV2", "resultV3", "resultV4"]>] -> { +def ZipX4Op + : ArmSVE_Op<"zip.x4", + [Pure, + AllTypesMatch<["sourceV1", "sourceV2", "sourceV3", "sourceV4", + "resultV1", "resultV2", "resultV3", "resultV4"]>]> { let summary = "Multi-vector four-way zip op"; let description = [{ @@ -448,10 +481,7 @@ def ZipX4Op : ArmSVE_Op<"zip.x4", [ }]; } -def PselOp : ArmSVE_Op<"psel", [ - Pure, - AllTypesMatch<["p1", "result"]>, -]> { +def PselOp : ArmSVE_Op<"psel", [Pure, AllTypesMatch<["p1", "result"]>]> { let summary = "Predicate select"; let description = [{ @@ -509,6 +539,45 @@ def ScalableMaskedUDivIOp : ScalableMaskedIOp<"masked.divi_unsigned", def ScalableMaskedDivFOp : ScalableMaskedFOp<"masked.divf", "division">; +def DupQLaneOp : ArmSVE_Op<"dupq_lane", [Pure, AllTypesMatch<["src", "dst"]>]> { + let summary = "Broadcast indexed 128-bit segment to vector"; + + let description = [{ + This operation fills each 128-bit segment of a vector with the elements + from the indexed 128-bit segment of the source vector. If the VL is + 128 bits the operation is a NOP. If the index exceeds the number of + 128-bit segments in a vector the result is an all-zeroes vector. 
+ + Example: + ```mlir + // VL == 256 + // %X = [A B C D x x x x] + %Y = arm_sve.dupq_lane %X[0] : vector<[4]xi32> + // Y = [A B C D A B C D] + + // %U = [x x x x x x x x A B C D E F G H] + %V = arm_sve.dupq_lane %U[1] : vector<[8]xf16> + // %V = [A B C D E F G H A B C D E F G H] + ``` + + Note: The semantics of the operation match those of the `svdupq_lane` intrinsics. + [Source](https://developer.arm.com/architectures/instruction-sets/intrinsics/#q=svdupq_lane) + }]; + + let arguments = (ins SVEVector:$src, + I64Attr:$lane); + let results = (outs SVEVector:$dst); + + let builders = [ + OpBuilder<(ins "Value":$src, "int64_t":$lane), [{ + build($_builder, $_state, src.getType(), src, lane); + }]>]; + + let assemblyFormat = [{ + $src `[` $lane `]` attr-dict `:` type($dst) + }]; +} + def UmmlaIntrOp : ArmSVE_IntrBinaryOverloadedOp<"ummla">, Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>; @@ -517,6 +586,10 @@ def SmmlaIntrOp : ArmSVE_IntrBinaryOverloadedOp<"smmla">, Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>; +def UsmmlaIntrOp : + ArmSVE_IntrBinaryOverloadedOp<"usmmla">, + Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>; + def SdotIntrOp : ArmSVE_IntrBinaryOverloadedOp<"sdot">, Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>; @@ -610,4 +683,14 @@ def WhileLTIntrOp : /*overloadedResults=*/[0]>, Arguments<(ins I64:$base, I64:$n)>; +def DupQLaneIntrOp : ArmSVE_IntrOp<"dupq_lane", + /*traits=*/[], + /*overloadedOperands=*/[0], + /*overloadedResults=*/[], + /*numResults=*/1, + /*immArgPositions*/[1], + /*immArgAttrNames*/["lane"]>, + Arguments<(ins Arg, "v">:$v, + Arg:$lane)>; + #endif // ARMSVE_OPS diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h index
ada9539e87121..cb6ef8bc17220 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h @@ -302,7 +302,7 @@ struct BufferizationOptions { Value to) const; /// Specifies whether not bufferizable ops are allowed in the input. If so, - /// bufferization.to_memref and bufferization.to_tensor ops are inserted at + /// bufferization.to_buffer and bufferization.to_tensor ops are inserted at /// the boundaries. bool allowUnknownOps = false; @@ -587,7 +587,7 @@ allocateTensorForShapedValue(OpBuilder &b, Location loc, Value shapedValue, bool copy = true); /// Lookup the buffer for the given value. If the value was not bufferized -/// yet, wrap it in a ToMemrefOp. Otherwise, it is the result of a ToTensorOp, +/// yet, wrap it in a ToBufferOp. Otherwise, it is the result of a ToTensorOp, /// from which the memref operand is returned. FailureOr getBuffer(RewriterBase &rewriter, Value value, const BufferizationOptions &options); diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h b/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h index 6f19dca2e8222..1ef5370802953 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h @@ -56,10 +56,10 @@ FailureOr castOrReallocMemRefValue(OpBuilder &b, Value value, MemRefType type, const BufferizationOptions &options); -/// Try to fold to_memref(to_tensor(x)). If x's type and the result type of the -/// to_memref op are different, a memref.cast is needed. -LogicalResult foldToMemrefToTensorPair(RewriterBase &rewriter, - ToMemrefOp toMemref, +/// Try to fold to_buffer(to_tensor(x)). If x's type and the result type of the +/// to_buffer op are different, a memref.cast is needed. 
+LogicalResult foldToBufferToTensorPair(RewriterBase &rewriter, + ToBufferOp toBuffer, const BufferizationOptions &options); /// Add the canonicalization patterns for bufferization.dealloc to the given diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td index fad78a63444b9..7a1a701bea6dc 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td @@ -394,7 +394,7 @@ def Bufferization_ToTensorOp : Bufferization_Op<"to_tensor", [ An operation that creates a tensor from a `memref`. The result value is a tensor whose shape and element type match the memref operand. - The opposite of this op is `to_memref`. Together, these two ops are + The opposite of this op is `to_buffer`. Together, these two ops are useful for source/target materializations when doing type conversions involving tensors and memrefs. @@ -459,7 +459,7 @@ def Bufferization_ToTensorOp : Bufferization_Op<"to_tensor", [ LogicalResult bufferize(RewriterBase &rewriter, const BufferizationOptions &options) const { - // to_tensor/to_memref pairs fold away after bufferization. + // to_tensor/to_buffer pairs fold away after bufferization. 
return success(); } @@ -490,10 +490,10 @@ def Bufferization_ToTensorOp : Bufferization_Op<"to_tensor", [ //===----------------------------------------------------------------------===// -// ToMemrefOp +// ToBufferOp //===----------------------------------------------------------------------===// -def Bufferization_ToMemrefOp : Bufferization_Op<"to_memref", [ +def Bufferization_ToBufferOp : Bufferization_Op<"to_buffer", [ BufferizableOpInterface, SameOperandsAndResultShape, SameOperandsAndResultElementType, @@ -507,7 +507,7 @@ def Bufferization_ToMemrefOp : Bufferization_Op<"to_memref", [ ```mlir // Result type is memref<4x?xf32, #layout, 0> - %m = bufferization.to_memref %t : tensor<4x?xf32> to memref<4x?xf32, #layout, 0> + %m = bufferization.to_buffer %t : tensor<4x?xf32> to memref<4x?xf32, #layout, 0> ``` This operation is a specialized variant of the built-in @@ -527,7 +527,7 @@ def Bufferization_ToMemrefOp : Bufferization_Op<"to_memref", [ // BufferizableOpInterface implementation //===------------------------------------------------------------------===// - // Note: ToMemrefOp / ToTensorOp are temporary ops that are inserted at the + // Note: ToBufferOp / ToTensorOp are temporary ops that are inserted at the // bufferization boundary. When One-Shot bufferization is complete, there // should be no such ops left over. If `allowUnknownOps` (or after running a // partial bufferization pass), such ops may be part of the resulting IR, diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Bufferize.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Bufferize.h index 2f495d304b4a5..d5cb8d8eb673c 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Bufferize.h +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Bufferize.h @@ -50,7 +50,7 @@ LogicalResult bufferizeOp(Operation *op, const BufferizationOptions &options, /// Bufferize the signature of `block` and its callers (i.e., ops that have the /// given block as a successor). 
All block argument types are changed to memref /// types. All corresponding operands of all callers are wrapped in -/// bufferization.to_memref ops. All uses of bufferized tensor block arguments +/// bufferization.to_buffer ops. All uses of bufferized tensor block arguments /// are wrapped in bufferization.to_tensor ops. /// /// It is expected that all callers implement the `BranchOpInterface`. diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h index e8e6226460ac7..51f3c0843569d 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h @@ -69,6 +69,9 @@ struct FuncAnalysisState : public OneShotAnalysisState::Extension { /// analyzed. DenseMap analyzedFuncOps; + /// A collection of cached SymbolTables used for faster function lookup. + mutable SymbolTableCollection symbolTables; + /// This function is called right before analyzing the given FuncOp. It /// initializes the data structures for the FuncOp in this state object. void startFunctionAnalysis(FuncOp funcOp); diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td index ee33476f441ee..a0d113c150c5e 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -47,7 +47,7 @@ def OwnershipBasedBufferDeallocationPass Otherwise, the pass that bufferizes the remaining tensors is responsible to add the corresponding deallocation operations. 
Note that this pass does not consider any values of tensor type and assumes that MemRef values defined by - `bufferization.to_memref` do not return ownership and do not have to be + `bufferization.to_buffer` do not return ownership and do not have to be deallocated. `bufferization.to_tensor` operations are handled similarly to `bufferization.clone` operations with the exception that the result value is not handled because it's a tensor (not a MemRef). @@ -321,7 +321,7 @@ def OneShotBufferizePass : Pass<"one-shot-bufferize", "ModuleOp"> { One-Shot Bufferize will by default reject IR that contains non-bufferizable op, i.e., ops that do not implemement BufferizableOpInterface. Such IR can - be allowed with `allow-unknown-ops=1`. In that case, to_memref and to_tensor + be allowed with `allow-unknown-ops=1`. In that case, to_buffer and to_tensor ops will be generated at the bufferization boundary. This is useful for compatibility with existing partial bufferization passes: These can bufferize the remaining IR after running One-Shot Bufferize. @@ -341,7 +341,7 @@ def OneShotBufferizePass : Pass<"one-shot-bufferize", "ModuleOp"> { One-Shot Bufferize will by default assume memref types with fully dynamic layout maps when a precise layout cannot be inferred. E.g., this is the case - when wrapping a non-bufferizable op in to_memref/to_tensor ops. This + when wrapping a non-bufferizable op in to_buffer/to_tensor ops. This behavior can be overridden with `unknown-type-conversion`. Valid values are `fully-dynamic-layout-map` and `identity-layout-map`. 
diff --git a/mlir/include/mlir/Dialect/CMakeLists.txt b/mlir/include/mlir/Dialect/CMakeLists.txt index 9d1a840d6644b..56dc97282fa4a 100644 --- a/mlir/include/mlir/Dialect/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/CMakeLists.txt @@ -28,7 +28,6 @@ add_subdirectory(OpenACCMPCommon) add_subdirectory(OpenMP) add_subdirectory(PDL) add_subdirectory(PDLInterp) -add_subdirectory(Polynomial) add_subdirectory(Ptr) add_subdirectory(Quant) add_subdirectory(SCF) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 654aff71f25be..a8e7dcb54ac20 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -236,6 +236,76 @@ foreach index = !range(0, 32) in { def NVVM_EnvReg # index # Op : NVVM_SpecialRegisterOp<"read.ptx.sreg.envreg" # index>; } +//===----------------------------------------------------------------------===// +// Inline PTX op definition +//===----------------------------------------------------------------------===// + +def NVVM_InlinePtxOp : NVVM_Op<"inline_ptx", + [DeclareOpInterfaceMethods, + AttrSizedOperandSegments]> +{ + let summary = "Inline PTX Op"; + let description = [{This op allows using PTX directly within the NVVM + dialect, while greatly simplifying llvm.inline_asm generation. It + automatically handles register size selection and sets the correct + read/write access for each operand. The operation leverages the + `BasicPtxBuilderInterface` to abstract away low-level details of + PTX assembly formatting. + + The `predicate` attribute is used to specify a predicate for the + PTX instruction. 
+ + Example 1: Read-only Parameters + ```mlir + nvvm.inline_ptx "mbarrier.init.b64 [$0], $1;" (%barrier_gen, %count) : !llvm.ptr, i32 + + // Lowers to: + llvm.inline_asm has_side_effects asm_dialect = att + "mbarrier.init.b64 [$0], $1;", "l,r" %arg0, %arg2 : (!llvm.ptr, i32) -> () + ``` + + Example 2: Read-only and Write-only Parameters + ```mlir + %0 = nvvm.inline_ptx "ex2.approx.ftz.f32 $0, $1;" (%input) : f32 -> f32 + + // Lowers to: + %0 = llvm.inline_asm has_side_effects asm_dialect = att + "ex2.approx.ftz.f32 $0, $1;", "=f,f" %arg0 : (f32) -> f32 + ``` + + Example 3: Predicate Usage + ```mlir + nvvm.inline_ptx "mbarrier.init.b64 [$0], $1;" (%barrier_gen, %count), + predicate = %pred : !llvm.ptr, i32, i1 + + // Lowers to: + llvm.inline_asm has_side_effects asm_dialect = att + "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b" %arg0, %arg2, %arg3 + : (!llvm.ptr, i32, i1) -> () + ``` + }]; + + let arguments = (ins Variadic:$readOnlyArgs, + StrAttr:$ptxCode, + PtxPredicate:$predicate); + + let results = (outs Variadic:$writeOnlyArgs); + + let assemblyFormat = [{ + $ptxCode `(` $readOnlyArgs `)` + (`,` `predicate` `=` $predicate^)? attr-dict + `:` type(operands) + (`->` type($writeOnlyArgs)^)? 
+ }]; + + let extraClassDefinition = [{ + std::string $cppClass::getPtx() { + StringRef ptxInstStr = getPtxCode(); + return std::string(ptxInstStr.data()); + } + }]; +} + //===----------------------------------------------------------------------===// // NVVM approximate op definitions //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 5d5add6318e06..3c22aeb9a1ff7 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -561,8 +561,8 @@ class OpenACC_DataEntryOp($asyncOperands, - type($asyncOperands), $asyncOperandsDeviceType) `)` + | `async` `` custom($asyncOperands, + type($asyncOperands), $asyncOperandsDeviceType, $asyncOnly) ) `->` type($accVar) attr-dict }]; @@ -922,8 +922,8 @@ class OpenACC_DataExitOpWithVarPtr let assemblyFormat = [{ custom($accVar, type($accVar)) (`bounds` `(` $bounds^ `)` )? - (`async` `(` custom($asyncOperands, - type($asyncOperands), $asyncOperandsDeviceType)^ `)`)? + (`async` `` custom($asyncOperands, + type($asyncOperands), $asyncOperandsDeviceType, $asyncOnly)^)? `to` custom($var) `:` custom(type($var), $varType) attr-dict }]; @@ -983,8 +983,8 @@ class OpenACC_DataExitOpNoVarPtr : let assemblyFormat = [{ custom($accVar, type($accVar)) (`bounds` `(` $bounds^ `)` )? - (`async` `(` custom($asyncOperands, - type($asyncOperands), $asyncOperandsDeviceType)^ `)`)? + (`async` `` custom($asyncOperands, + type($asyncOperands), $asyncOperandsDeviceType, $asyncOnly)^)? attr-dict }]; @@ -1439,8 +1439,8 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel", ( `combined` `(` `loop` `)` $combined^)? 
oilist( `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` - | `async` `(` custom($asyncOperands, - type($asyncOperands), $asyncOperandsDeviceType) `)` + | `async` `` custom($asyncOperands, + type($asyncOperands), $asyncOperandsDeviceType, $asyncOnly) | `firstprivate` `(` custom($firstprivateOperands, type($firstprivateOperands), $firstprivatizations) `)` @@ -1581,8 +1581,8 @@ def OpenACC_SerialOp : OpenACC_Op<"serial", ( `combined` `(` `loop` `)` $combined^)? oilist( `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` - | `async` `(` custom($asyncOperands, - type($asyncOperands), $asyncOperandsDeviceType) `)` + | `async` `` custom($asyncOperands, + type($asyncOperands), $asyncOperandsDeviceType, $asyncOnly) | `firstprivate` `(` custom($firstprivateOperands, type($firstprivateOperands), $firstprivatizations) `)` @@ -1750,8 +1750,8 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels", ( `combined` `(` `loop` `)` $combined^)? oilist( `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` - | `async` `(` custom($asyncOperands, - type($asyncOperands), $asyncOperandsDeviceType) `)` + | `async` `` custom($asyncOperands, + type($asyncOperands), $asyncOperandsDeviceType, $asyncOnly) | `num_gangs` `(` custom($numGangs, type($numGangs), $numGangsDeviceType, $numGangsSegments) `)` | `num_workers` `(` custom($numWorkers, @@ -1870,8 +1870,8 @@ def OpenACC_DataOp : OpenACC_Op<"data", let assemblyFormat = [{ oilist( `if` `(` $ifCond `)` - | `async` `(` custom($asyncOperands, - type($asyncOperands), $asyncOperandsDeviceType) `)` + | `async` `` custom($asyncOperands, + type($asyncOperands), $asyncOperandsDeviceType, $asyncOnly) | `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` | `wait` `` custom($waitOperands, type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments, $hasWaitDevnum, @@ -1934,9 +1934,11 @@ def OpenACC_EnterDataOp : OpenACC_Op<"enter_data", let assemblyFormat = [{ oilist( `if` 
`(` $ifCond `)` - | `async` `(` $asyncOperand `:` type($asyncOperand) `)` + | `async` `` custom($asyncOperand, + type($asyncOperand), $async) | `wait_devnum` `(` $waitDevnum `:` type($waitDevnum) `)` - | `wait` `(` $waitOperands `:` type($waitOperands) `)` + | `wait` `` custom($waitOperands, + type($waitOperands), $wait) | `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` ) attr-dict-with-keyword @@ -1986,9 +1988,11 @@ def OpenACC_ExitDataOp : OpenACC_Op<"exit_data", let assemblyFormat = [{ oilist( `if` `(` $ifCond `)` - | `async` `(` $asyncOperand `:` type($asyncOperand) `)` + | `async` `` custom($asyncOperand, + type($asyncOperand), $async) | `wait_devnum` `(` $waitDevnum `:` type($waitDevnum) `)` - | `wait` `(` $waitOperands `:` type($waitOperands) `)` + | `wait` `` custom($waitOperands, + type($waitOperands), $wait) | `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` ) attr-dict-with-keyword @@ -2853,7 +2857,7 @@ def OpenACC_UpdateOp : OpenACC_Op<"update", let arguments = (ins Optional:$ifCond, Variadic:$asyncOperands, OptionalAttr:$asyncOperandsDeviceType, - OptionalAttr:$async, + OptionalAttr:$asyncOnly, Variadic:$waitOperands, OptionalAttr:$waitOperandsSegments, OptionalAttr:$waitOperandsDeviceType, @@ -2901,9 +2905,8 @@ def OpenACC_UpdateOp : OpenACC_Op<"update", let assemblyFormat = [{ oilist( `if` `(` $ifCond `)` - | `async` `` custom( - $asyncOperands, type($asyncOperands), - $asyncOperandsDeviceType, $async) + | `async` `` custom($asyncOperands, + type($asyncOperands), $asyncOperandsDeviceType, $asyncOnly) | `wait` `` custom($waitOperands, type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments, $hasWaitDevnum, $waitOnly) @@ -2948,9 +2951,11 @@ def OpenACC_WaitOp : OpenACC_Op<"wait", [AttrSizedOperandSegments]> { let assemblyFormat = [{ ( `(` $waitOperands^ `:` type($waitOperands) `)` )? 
- oilist(`async` `(` $asyncOperand `:` type($asyncOperand) `)` - |`wait_devnum` `(` $waitDevnum `:` type($waitDevnum) `)` - |`if` `(` $ifCond `)` + oilist( + `async` `` custom($asyncOperand, + type($asyncOperand), $async) + | `wait_devnum` `(` $waitDevnum `:` type($waitDevnum) `)` + | `if` `(` $ifCond `)` ) attr-dict-with-keyword }]; let hasVerifier = 1; diff --git a/mlir/include/mlir/Dialect/Polynomial/CMakeLists.txt b/mlir/include/mlir/Dialect/Polynomial/CMakeLists.txt deleted file mode 100644 index f33061b2d87cf..0000000000000 --- a/mlir/include/mlir/Dialect/Polynomial/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(IR) diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt deleted file mode 100644 index ecdea158ddefb..0000000000000 --- a/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_mlir_dialect(Polynomial polynomial) -add_mlir_doc(Polynomial PolynomialDialect Dialects/ -gen-dialect-doc -dialect=polynomial) - -set(LLVM_TARGET_DEFINITIONS PolynomialAttributes.td) -mlir_tablegen(PolynomialAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=polynomial) -mlir_tablegen(PolynomialAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=polynomial) -add_public_tablegen_target(MLIRPolynomialAttributesIncGen) diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h deleted file mode 100644 index 8d7f1436fdc60..0000000000000 --- a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h +++ /dev/null @@ -1,282 +0,0 @@ -//===- Polynomial.h - A data class for polynomials --------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIAL_H_ -#define MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIAL_H_ - -#include "mlir/Support/LLVM.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Support/raw_ostream.h" - -namespace mlir { - -class MLIRContext; - -namespace polynomial { - -/// This restricts statically defined polynomials to have at most 64-bit -/// coefficients. This may be relaxed in the future, but it seems unlikely one -/// would want to specify 128-bit polynomials statically in the source code. -constexpr unsigned apintBitWidth = 64; - -template -class MonomialBase { -public: - MonomialBase(const CoefficientType &coeff, const APInt &expo) - : coefficient(coeff), exponent(expo) {} - virtual ~MonomialBase() = default; - - const CoefficientType &getCoefficient() const { return coefficient; } - CoefficientType &getMutableCoefficient() { return coefficient; } - const APInt &getExponent() const { return exponent; } - void setCoefficient(const CoefficientType &coeff) { coefficient = coeff; } - void setExponent(const APInt &exp) { exponent = exp; } - - bool operator==(const MonomialBase &other) const { - return other.coefficient == coefficient && other.exponent == exponent; - } - bool operator!=(const MonomialBase &other) const { - return other.coefficient != coefficient || other.exponent != exponent; - } - - /// Monomials are ordered by exponent. 
- bool operator<(const MonomialBase &other) const { - return (exponent.ult(other.exponent)); - } - - Derived add(const Derived &other) { - assert(exponent == other.exponent); - CoefficientType newCoeff = coefficient + other.coefficient; - Derived result; - result.setCoefficient(newCoeff); - result.setExponent(exponent); - return result; - } - - virtual bool isMonic() const = 0; - virtual void - coefficientToString(llvm::SmallString<16> &coeffString) const = 0; - - template - friend ::llvm::hash_code hash_value(const MonomialBase &arg); - -protected: - CoefficientType coefficient; - APInt exponent; -}; - -/// A class representing a monomial of a single-variable polynomial with integer -/// coefficients. -class IntMonomial : public MonomialBase { -public: - IntMonomial(int64_t coeff, uint64_t expo) - : MonomialBase(APInt(apintBitWidth, coeff), APInt(apintBitWidth, expo)) {} - - IntMonomial() - : MonomialBase(APInt(apintBitWidth, 0), APInt(apintBitWidth, 0)) {} - - ~IntMonomial() override = default; - - bool isMonic() const override { return coefficient == 1; } - - void coefficientToString(llvm::SmallString<16> &coeffString) const override { - coefficient.toStringSigned(coeffString); - } -}; - -/// A class representing a monomial of a single-variable polynomial with integer -/// coefficients. 
-class FloatMonomial : public MonomialBase { -public: - FloatMonomial(double coeff, uint64_t expo) - : MonomialBase(APFloat(coeff), APInt(apintBitWidth, expo)) {} - - FloatMonomial() : MonomialBase(APFloat((double)0), APInt(apintBitWidth, 0)) {} - - ~FloatMonomial() override = default; - - bool isMonic() const override { return coefficient == APFloat(1.0); } - - void coefficientToString(llvm::SmallString<16> &coeffString) const override { - coefficient.toString(coeffString); - } -}; - -template -class PolynomialBase { -public: - PolynomialBase() = delete; - - explicit PolynomialBase(ArrayRef terms) : terms(terms) {}; - - explicit operator bool() const { return !terms.empty(); } - bool operator==(const PolynomialBase &other) const { - return other.terms == terms; - } - bool operator!=(const PolynomialBase &other) const { - return !(other.terms == terms); - } - - void print(raw_ostream &os, ::llvm::StringRef separator, - ::llvm::StringRef exponentiation) const { - bool first = true; - for (const Monomial &term : getTerms()) { - if (first) { - first = false; - } else { - os << separator; - } - std::string coeffToPrint; - if (term.isMonic() && term.getExponent().uge(1)) { - coeffToPrint = ""; - } else { - llvm::SmallString<16> coeffString; - term.coefficientToString(coeffString); - coeffToPrint = coeffString.str(); - } - - if (term.getExponent() == 0) { - os << coeffToPrint; - } else if (term.getExponent() == 1) { - os << coeffToPrint << "x"; - } else { - llvm::SmallString<16> expString; - term.getExponent().toStringSigned(expString); - os << coeffToPrint << "x" << exponentiation << expString; - } - } - } - - Derived add(const Derived &other) { - SmallVector newTerms; - auto it1 = terms.begin(); - auto it2 = other.terms.begin(); - while (it1 != terms.end() || it2 != other.terms.end()) { - if (it1 == terms.end()) { - newTerms.emplace_back(*it2); - it2++; - continue; - } - - if (it2 == other.terms.end()) { - newTerms.emplace_back(*it1); - it1++; - continue; - } - - while 
(it1->getExponent().ult(it2->getExponent())) { - newTerms.emplace_back(*it1); - it1++; - if (it1 == terms.end()) - break; - } - - while (it2->getExponent().ult(it1->getExponent())) { - newTerms.emplace_back(*it2); - it2++; - if (it2 == terms.end()) - break; - } - - newTerms.emplace_back(it1->add(*it2)); - it1++; - it2++; - } - return Derived(newTerms); - } - - // Prints polynomial to 'os'. - void print(raw_ostream &os) const { print(os, " + ", "**"); } - - void dump() const; - - // Prints polynomial so that it can be used as a valid identifier - std::string toIdentifier() const { - std::string result; - llvm::raw_string_ostream os(result); - print(os, "_", ""); - return os.str(); - } - - unsigned getDegree() const { - return terms.back().getExponent().getZExtValue(); - } - - ArrayRef getTerms() const { return terms; } - - template - friend ::llvm::hash_code hash_value(const PolynomialBase &arg); - -private: - // The monomial terms for this polynomial. - SmallVector terms; -}; - -/// A single-variable polynomial with integer coefficients. -/// -/// Eg: x^1024 + x + 1 -class IntPolynomial : public PolynomialBase { -public: - explicit IntPolynomial(ArrayRef terms) : PolynomialBase(terms) {} - - // Returns a Polynomial from a list of monomials. - // Fails if two monomials have the same exponent. - static FailureOr - fromMonomials(ArrayRef monomials); - - /// Returns a polynomial with coefficients given by `coeffs`. The value - /// coeffs[i] is converted to a monomial with exponent i. - static IntPolynomial fromCoefficients(ArrayRef coeffs); -}; - -/// A single-variable polynomial with double coefficients. -/// -/// Eg: 1.0 x^1024 + 3.5 x + 1e-05 -class FloatPolynomial : public PolynomialBase { -public: - explicit FloatPolynomial(ArrayRef terms) - : PolynomialBase(terms) {} - - // Returns a Polynomial from a list of monomials. - // Fails if two monomials have the same exponent. 
- static FailureOr - fromMonomials(ArrayRef monomials); - - /// Returns a polynomial with coefficients given by `coeffs`. The value - /// coeffs[i] is converted to a monomial with exponent i. - static FloatPolynomial fromCoefficients(ArrayRef coeffs); -}; - -// Make Polynomials hashable. -template -inline ::llvm::hash_code hash_value(const PolynomialBase &arg) { - return ::llvm::hash_combine_range(arg.terms); -} - -template -inline ::llvm::hash_code hash_value(const MonomialBase &arg) { - return llvm::hash_combine(::llvm::hash_value(arg.coefficient), - ::llvm::hash_value(arg.exponent)); -} - -template -inline raw_ostream &operator<<(raw_ostream &os, - const PolynomialBase &polynomial) { - polynomial.print(os); - return os; -} - -} // namespace polynomial -} // namespace mlir - -#endif // MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIAL_H_ diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td deleted file mode 100644 index 755396c8b9023..0000000000000 --- a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td +++ /dev/null @@ -1,350 +0,0 @@ -//===- Polynomial.td - Polynomial dialect ------------------*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef POLYNOMIAL_OPS -#define POLYNOMIAL_OPS - -include "mlir/IR/BuiltinAttributes.td" -include "mlir/IR/OpBase.td" -include "mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Interfaces/SideEffectInterfaces.td" -include "mlir/Dialect/Polynomial/IR/PolynomialDialect.td" -include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.td" -include "mlir/Dialect/Polynomial/IR/PolynomialTypes.td" - -class Polynomial_Op traits = []> : - Op { - let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)"; -} - -class Polynomial_UnaryOp traits = []> : - Polynomial_Op { - let arguments = (ins Polynomial_PolynomialType:$operand); - let results = (outs Polynomial_PolynomialType:$result); -} - -class Polynomial_BinaryOp traits = []> : - Polynomial_Op { - let arguments = (ins PolynomialLike:$lhs, PolynomialLike:$rhs); - let results = (outs PolynomialLike:$result); - let assemblyFormat = "operands attr-dict `:` type($result)"; -} - -def Polynomial_AddOp : Polynomial_BinaryOp<"add", [Commutative]> { - let summary = "Addition operation between polynomials."; - let description = [{ - Performs polynomial addition on the operands. The operands may be single - polynomials or containers of identically-typed polynomials, i.e., polynomials - from the same underlying ring with the same coefficient types. - - Addition is defined to occur in the ring defined by the ring attribute of - the two operands, meaning the addition is taken modulo the coefficientModulus - and the polynomialModulus of the ring. 
- - Example: - - ```mlir - // add two polynomials modulo x^1024 - 1 - #poly = #polynomial.int_polynomial - #ring = #polynomial.ring - %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring> - %1 = polynomial.constant int : !polynomial.polynomial<#ring> - %2 = polynomial.add %0, %1 : !polynomial.polynomial<#ring> - ``` - }]; -} - -def Polynomial_SubOp : Polynomial_BinaryOp<"sub"> { - let summary = "Subtraction operation between polynomials."; - let description = [{ - Performs polynomial subtraction on the operands. The operands may be single - polynomials or containers of identically-typed polynomials, i.e., polynomials - from the same underlying ring with the same coefficient types. - - Subtraction is defined to occur in the ring defined by the ring attribute of - the two operands, meaning the subtraction is taken modulo the coefficientModulus - and the polynomialModulus of the ring. - - Example: - - ```mlir - // subtract two polynomials modulo x^1024 - 1 - #poly = #polynomial.int_polynomial - #ring = #polynomial.ring - %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring> - %1 = polynomial.constant int : !polynomial.polynomial<#ring> - %2 = polynomial.sub %0, %1 : !polynomial.polynomial<#ring> - ``` - }]; - let hasCanonicalizer = 1; -} - -def Polynomial_MulOp : Polynomial_BinaryOp<"mul", [Commutative]> { - let summary = "Multiplication operation between polynomials."; - let description = [{ - Performs polynomial multiplication on the operands. The operands may be single - polynomials or containers of identically-typed polynomials, i.e., polynomials - from the same underlying ring with the same coefficient types. - - Multiplication is defined to occur in the ring defined by the ring attribute of - the two operands, meaning the multiplication is taken modulo the coefficientModulus - and the polynomialModulus of the ring. 
- - Example: - - ```mlir - // multiply two polynomials modulo x^1024 - 1 - #poly = #polynomial.int_polynomial - #ring = #polynomial.ring - %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring> - %1 = polynomial.constant int : !polynomial.polynomial<#ring> - %2 = polynomial.mul %0, %1 : !polynomial.polynomial<#ring> - ``` - }]; -} - -def Polynomial_MulScalarOp : Polynomial_Op<"mul_scalar", [ - ElementwiseMappable, AllTypesMatch<["polynomial", "output"]>]> { - let summary = "Multiplication by a scalar of the field."; - let description = [{ - Multiplies the polynomial operand's coefficients by a given scalar value. - The operation is defined to occur in the ring defined by the ring attribute - of the two operands, meaning the multiplication is taken modulo the - coefficientModulus of the ring. - - The `scalar` input must have the same type as the polynomial ring's - coefficientType. - - Example: - - ```mlir - // multiply two polynomials modulo x^1024 - 1 - #poly = #polynomial.int_polynomial - #ring = #polynomial.ring - %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring> - %1 = arith.constant 3 : i32 - %2 = polynomial.mul_scalar %0, %1 : !polynomial.polynomial<#ring>, i32 - ``` - }]; - - let arguments = (ins - PolynomialLike:$polynomial, - AnyInteger:$scalar - ); - let results = (outs - PolynomialLike:$output - ); - let assemblyFormat = "operands attr-dict `:` type($polynomial) `,` type($scalar)"; - let hasVerifier = 1; -} - -def Polynomial_LeadingTermOp: Polynomial_Op<"leading_term"> { - let summary = "Compute the leading term of the polynomial."; - let description = [{ - The degree of a polynomial is the largest $k$ for which the coefficient - `a_k` of `x^k` is nonzero. The leading term is the term `a_k * x^k`, which - this op represents as a pair of results. The first is the degree `k` as an - index, and the second is the coefficient, whose type matches the - coefficient type of the polynomial's ring attribute. 
- - Example: - - ```mlir - #poly = #polynomial.int_polynomial - #ring = #polynomial.ring - %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring> - %1, %2 = polynomial.leading_term %0 : !polynomial.polynomial<#ring> -> (index, i32) - ``` - }]; - let arguments = (ins Polynomial_PolynomialType:$input); - let results = (outs Index:$degree, AnyInteger:$coefficient); - let assemblyFormat = "operands attr-dict `:` type($input) `->` `(` type($degree) `,` type($coefficient) `)`"; -} - -def Polynomial_MonomialOp: Polynomial_Op<"monomial"> { - let summary = "Create a polynomial that consists of a single monomial."; - let description = [{ - Construct a polynomial that consists of a single monomial term, from its - degree and coefficient as dynamic inputs. - - The coefficient type of the output polynomial's ring attribute must match - the `coefficient` input type. - - Example: - - ```mlir - #poly = #polynomial.int_polynomial - #ring = #polynomial.ring - %deg = arith.constant 1023 : index - %five = arith.constant 5 : i32 - %0 = polynomial.monomial %five, %deg : (i32, index) -> !polynomial.polynomial<#ring> - ``` - }]; - let arguments = (ins AnyInteger:$coefficient, Index:$degree); - let results = (outs Polynomial_PolynomialType:$output); -} - -def Polynomial_MonicMonomialMulOp: Polynomial_Op<"monic_monomial_mul", [AllTypesMatch<["input", "output"]>]> { - let summary = "Multiply a polynomial by a monic monomial."; - let description = [{ - Multiply a polynomial by a monic monomial, meaning a polynomial of the form - `1 * x^k` for an index operand `k`. - - In some special rings of polynomials, such as a ring of polynomials - modulo `x^n - 1`, `monomial_mul` can be interpreted as a cyclic shift of - the coefficients of the polynomial. For some rings, this results in - optimized lowerings that involve rotations and rescaling of the - coefficients of the input. 
- }]; - let arguments = (ins PolynomialLike:$input, Index:$monomialDegree); - let results = (outs PolynomialLike:$output); -} - -def Polynomial_FromTensorOp : Polynomial_Op<"from_tensor", [Pure]> { - let summary = "Creates a polynomial from integer coefficients stored in a tensor."; - let description = [{ - `polynomial.from_tensor` creates a polynomial value from a tensor of coefficients. - The input tensor must list the coefficients in degree-increasing order. - - The input one-dimensional tensor may have size at most the degree of the - ring's polynomialModulus generator polynomial, with smaller dimension implying that - all higher-degree terms have coefficient zero. - - Example: - - ```mlir - #poly = #polynomial.int_polynomial - #ring = #polynomial.ring - %two = arith.constant 2 : i32 - %five = arith.constant 5 : i32 - %coeffs = tensor.from_elements %two, %two, %five : tensor<3xi32> - %poly = polynomial.from_tensor %coeffs : tensor<3xi32> -> !polynomial.polynomial<#ring> - ``` - }]; - let arguments = (ins RankedTensorOf<[AnyInteger]>:$input); - let results = (outs Polynomial_PolynomialType:$output); - - let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; - - let builders = [ - // Builder that infers coefficient modulus from tensor bit width, - // and uses whatever input ring is provided by the caller. - OpBuilder<(ins "::mlir::Value":$input, "::mlir::polynomial::RingAttr":$ring)> - ]; - let hasVerifier = 1; -} - -def Polynomial_ToTensorOp : Polynomial_Op<"to_tensor", [Pure]> { - let summary = "Creates a tensor containing the coefficients of a polynomial."; - let description = [{ - `polynomial.to_tensor` creates a dense tensor value containing the - coefficients of the input polynomial. The output tensor contains the - coefficients in degree-increasing order. 
- - Operations that act on the coefficients of a polynomial, such as extracting - a specific coefficient or extracting a range of coefficients, should be - implemented by composing `to_tensor` with the relevant `tensor` dialect - ops. - - The output tensor has shape equal to the degree of the polynomial ring - attribute's polynomialModulus, including zeroes. - - Example: - - ```mlir - #poly = #polynomial.int_polynomial - #ring = #polynomial.ring - %two = arith.constant 2 : i32 - %five = arith.constant 5 : i32 - %coeffs = tensor.from_elements %two, %two, %five : tensor<3xi32> - %poly = polynomial.from_tensor %coeffs : tensor<3xi32> -> !polynomial.polynomial<#ring> - %tensor = polynomial.to_tensor %poly : !polynomial.polynomial<#ring> -> tensor<1024xi32> - ``` - }]; - let arguments = (ins Polynomial_PolynomialType:$input); - let results = (outs RankedTensorOf<[AnyInteger]>:$output); - let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; - let hasVerifier = 1; -} - -def Polynomial_AnyTypedPolynomialAttr : AnyAttrOf<[ - Polynomial_TypedFloatPolynomialAttr, - Polynomial_TypedIntPolynomialAttr -]>; - -def Polynomial_ConstantOp : Op { - let summary = "Define a constant polynomial via an attribute."; - let description = [{ - Example: - - ```mlir - !int_poly_ty = !polynomial.polynomial> - %0 = polynomial.constant int<1 + x**2> : !int_poly_ty - - !float_poly_ty = !polynomial.polynomial> - %1 = polynomial.constant float<0.5 + 1.3e06 x**2> : !float_poly_ty - ``` - }]; - let arguments = (ins Polynomial_AnyTypedPolynomialAttr:$value); - let results = (outs Polynomial_PolynomialType:$output); - let hasCustomAssemblyFormat = 1; -} - -def Polynomial_NTTOp : Polynomial_Op<"ntt", [Pure]> { - let summary = "Computes point-value tensor representation of a polynomial."; - let description = [{ - `polynomial.ntt` computes the forward integer Number Theoretic Transform - (NTT) on the input polynomial. 
It returns a tensor containing a point-value - representation of the input polynomial. The output tensor has shape equal - to the degree of the ring's `polynomialModulus`. The polynomial's RingAttr - is embedded as the encoding attribute of the output tensor. - - Given an input polynomial `F(x)` over a ring whose `polynomialModulus` has - degree `n`, and a primitive `n`-th root of unity `omega_n`, the output is - the list of $n$ evaluations - - `f[k] = F(omega[n]^k) ; k = {0, ..., n-1}` - - The choice of primitive root may be optionally specified. - }]; - let arguments = (ins - Polynomial_PolynomialType:$input, - OptionalAttr:$root - ); - let results = (outs RankedTensorOf<[AnyInteger]>:$output); - let assemblyFormat = "$input attr-dict `:` qualified(type($input)) `->` type($output)"; - let hasCanonicalizer = 1; - let hasVerifier = 1; -} - -def Polynomial_INTTOp : Polynomial_Op<"intt", [Pure]> { - let summary = "Computes the reverse integer Number Theoretic Transform (NTT)."; - let description = [{ - `polynomial.intt` computes the reverse integer Number Theoretic Transform - (INTT) on the input tensor. This is the inverse operation of the - `polynomial.ntt` operation. - - The input tensor is interpreted as a point-value representation of the - output polynomial at powers of a primitive `n`-th root of unity (see - `polynomial.ntt`). The ring of the polynomial is taken from the required - encoding attribute of the tensor. - - The choice of primitive root may be optionally specified. 
- }]; - let arguments = ( - ins RankedTensorOf<[AnyInteger]>:$input, - OptionalAttr:$root - ); - let results = (outs Polynomial_PolynomialType:$output); - let assemblyFormat = "$input attr-dict `:` qualified(type($input)) `->` type($output)"; - let hasCanonicalizer = 1; - let hasVerifier = 1; -} - -#endif // POLYNOMIAL_OPS diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.h b/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.h deleted file mode 100644 index b37d17bb89fb2..0000000000000 --- a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.h +++ /dev/null @@ -1,17 +0,0 @@ -//===- PolynomialAttributes.h - polynomial dialect attributes ---*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef MLIR_INCLUDE_MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIALATTRIBUTES_H_ -#define MLIR_INCLUDE_MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIALATTRIBUTES_H_ - -#include "Polynomial.h" -#include "PolynomialDialect.h" - -#define GET_ATTRDEF_CLASSES -#include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.h.inc" - -#endif // MLIR_INCLUDE_MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIALATTRIBUTES_H_ diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td b/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td deleted file mode 100644 index 7d59add3d37c2..0000000000000 --- a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td +++ /dev/null @@ -1,222 +0,0 @@ -//===- PolynomialOps.td - Polynomial dialect ---------------*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef POLYNOMIAL_ATTRIBUTES -#define POLYNOMIAL_ATTRIBUTES - -include "mlir/IR/BuiltinAttributes.td" -include "mlir/Dialect/Polynomial/IR/PolynomialDialect.td" - -class Polynomial_Attr traits = []> - : AttrDef { - let mnemonic = attrMnemonic; -} - -def Polynomial_IntPolynomialAttr : Polynomial_Attr<"IntPolynomial", "int_polynomial"> { - let summary = "an attribute containing a single-variable polynomial with integer coefficients"; - let description = [{ - A polynomial attribute represents a single-variable polynomial with integer - coefficients, which is used to define the modulus of a `RingAttr`, as well - as to define constants and perform constant folding for `polynomial` ops. - - The polynomial must be expressed as a list of monomial terms, with addition - or subtraction between them. The choice of variable name is arbitrary, but - must be consistent across all the monomials used to define a single - attribute. The order of monomial terms is arbitrary, each monomial degree - must occur at most once. - - Example: - - ```mlir - #poly = #polynomial.int_polynomial - ``` - }]; - let parameters = (ins "::mlir::polynomial::IntPolynomial":$polynomial); - let hasCustomAssemblyFormat = 1; -} - -def Polynomial_FloatPolynomialAttr : Polynomial_Attr<"FloatPolynomial", "float_polynomial"> { - let summary = "an attribute containing a single-variable polynomial with double precision floating point coefficients"; - let description = [{ - A polynomial attribute represents a single-variable polynomial with double - precision floating point coefficients. - - The polynomial must be expressed as a list of monomial terms, with addition - or subtraction between them. The choice of variable name is arbitrary, but - must be consistent across all the monomials used to define a single - attribute. 
The order of monomial terms is arbitrary, each monomial degree - must occur at most once. - - Example: - - ```mlir - #poly = #polynomial.float_polynomial<0.5 x**7 + 1.5> - ``` - }]; - let parameters = (ins "FloatPolynomial":$polynomial); - let hasCustomAssemblyFormat = 1; -} - -def Polynomial_TypedIntPolynomialAttr : Polynomial_Attr< - "TypedIntPolynomial", "typed_int_polynomial", [TypedAttrInterface]> { - let summary = "a typed int_polynomial"; - let description = [{ - Example: - - ```mlir - !poly_ty = !polynomial.polynomial> - #poly = int<1 x**7 + 4> : !poly_ty - #poly_verbose = #polynomial.typed_int_polynomial<1 x**7 + 4> : !poly_ty - ``` - }]; - let parameters = (ins "::mlir::Type":$type, "::mlir::polynomial::IntPolynomialAttr":$value); - let assemblyFormat = "$value `:` $type"; - let builders = [ - AttrBuilderWithInferredContext<(ins "Type":$type, - "const IntPolynomial &":$value), [{ - return $_get( - type.getContext(), - type, - IntPolynomialAttr::get(type.getContext(), value)); - }]>, - AttrBuilderWithInferredContext<(ins "Type":$type, - "const Attribute &":$value), [{ - return $_get(type.getContext(), type, ::llvm::cast(value)); - }]> - ]; - let extraClassDeclaration = [{ - using ValueType = ::mlir::Attribute; - }]; -} - -def Polynomial_TypedFloatPolynomialAttr : Polynomial_Attr< - "TypedFloatPolynomial", "typed_float_polynomial", [TypedAttrInterface]> { - let summary = "a typed float_polynomial"; - let description = [{ - Example: - - ```mlir - !poly_ty = !polynomial.polynomial> - #poly = float<1.4 x**7 + 4.5> : !poly_ty - #poly_verbose = #polynomial.typed_float_polynomial<1.4 x**7 + 4.5> : !poly_ty - ``` - }]; - let parameters = (ins "::mlir::Type":$type, "::mlir::polynomial::FloatPolynomialAttr":$value); - let assemblyFormat = "$value `:` $type"; - let builders = [ - AttrBuilderWithInferredContext<(ins "Type":$type, - "const FloatPolynomial &":$value), [{ - return $_get( - type.getContext(), - type, - FloatPolynomialAttr::get(type.getContext(), value)); 
- }]>, - AttrBuilderWithInferredContext<(ins "Type":$type, - "const Attribute &":$value), [{ - return $_get(type.getContext(), type, ::llvm::cast(value)); - }]> - ]; - let extraClassDeclaration = [{ - using ValueType = ::mlir::Attribute; - }]; -} - -def Polynomial_RingAttr : Polynomial_Attr<"Ring", "ring"> { - let summary = "an attribute specifying a polynomial ring"; - let description = [{ - A ring describes the domain in which polynomial arithmetic occurs. The ring - attribute in `polynomial` represents the more specific case of polynomials - with a single indeterminate; whose coefficients can be represented by - another MLIR type (`coefficientType`); and, if the coefficient type is - integral, whose coefficients are taken modulo some statically known modulus - (`coefficientModulus`). - - Additionally, a polynomial ring can specify a _polynomialModulus_, which converts - polynomial arithmetic to the analogue of modular integer arithmetic, where - each polynomial is represented as its remainder when dividing by the - modulus. For single-variable polynomials, an "polynomialModulus" is always specificed - via a single polynomial, which we call `polynomialModulus`. - - An expressive example is polynomials with i32 coefficients, whose - coefficients are taken modulo `2**32 - 5`, with a polynomial modulus of - `x**1024 - 1`. - - ```mlir - #poly_mod = #polynomial.int_polynomial<-1 + x**1024> - #ring = #polynomial.ring - - %0 = ... : polynomial.polynomial<#ring> - ``` - - In this case, the value of a polynomial is always "converted" to a - canonical form by applying repeated reductions by setting `x**1024 = 1` - and simplifying. - - The coefficient and polynomial modulus parameters are optional, and the - coefficient modulus is only allowed if the coefficient type is integral. - - The coefficient modulus, if specified, should be positive and not larger - than `2 ** width(coefficientType)`. 
- - If the coefficient modulus is not specified, the handling of coefficients - overflows is determined by subsequent lowering passes, which may choose to - wrap around or widen the overflow at their discretion. - - Note that coefficient modulus is contained in `i64` by default, which is signed. - To specify a 64 bit number without intepreting it as a negative number, its container - type should be manually specified like `coefficientModulus=18446744073709551615:i128`. - }]; - - let parameters = (ins - "Type": $coefficientType, - OptionalParameter<"::mlir::IntegerAttr">: $coefficientModulus, - OptionalParameter<"::mlir::polynomial::IntPolynomialAttr">: $polynomialModulus - ); - let genVerifyDecl = 1; - let assemblyFormat = "`<` struct(params) `>`"; - let builders = [ - AttrBuilderWithInferredContext< - (ins "::mlir::Type":$coefficientTy, - CArg<"::mlir::IntegerAttr", "nullptr"> :$coefficientModulusAttr, - CArg<"::mlir::polynomial::IntPolynomialAttr", "nullptr"> :$polynomialModulusAttr), [{ - return $_get( - coefficientTy.getContext(), - coefficientTy, - coefficientModulusAttr, - polynomialModulusAttr); - }]>, - ]; -} - -def Polynomial_PrimitiveRootAttr: Polynomial_Attr<"PrimitiveRoot", "primitive_root"> { - let summary = "an attribute containing an integer and its degree as a root of unity"; - let description = [{ - A primitive root attribute stores an integer root `value` and an integer - `degree`, corresponding to a primitive root of unity of the given degree in - an unspecified ring. - - This is used as an attribute on `polynomial.ntt` and `polynomial.intt` ops - to specify the root of unity used in lowering the transform. 
- - Example: - - ```mlir - #poly = #polynomial.primitive_root - ``` - }]; - let parameters = (ins - "::mlir::IntegerAttr":$value, - "::mlir::IntegerAttr":$degree - ); - let assemblyFormat = "`<` struct(params) `>`"; -} - - -#endif // POLYNOMIAL_ATTRIBUTES diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialDialect.h b/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialDialect.h deleted file mode 100644 index 7b7acebe7a93b..0000000000000 --- a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialDialect.h +++ /dev/null @@ -1,19 +0,0 @@ -//===- PolynomialDialect.h - The Polynomial dialect -------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef MLIR_INCLUDE_MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIALDIALECT_H_ -#define MLIR_INCLUDE_MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIALDIALECT_H_ - -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/DialectImplementation.h" - -// Generated headers (block clang-format from messing up order) -#include "mlir/Dialect/Polynomial/IR/PolynomialDialect.h.inc" - -#endif // MLIR_INCLUDE_MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIALDIALECT_H_ diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialDialect.td b/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialDialect.td deleted file mode 100644 index b0573b3715f78..0000000000000 --- a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialDialect.td +++ /dev/null @@ -1,55 +0,0 @@ -//===- PolynomialDialect.td - Polynomial dialect base ------*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef POLYNOMIAL_DIALECT -#define POLYNOMIAL_DIALECT - -include "mlir/IR/OpBase.td" - -def Polynomial_Dialect : Dialect { - let name = "polynomial"; - let cppNamespace = "::mlir::polynomial"; - let description = [{ - The Polynomial dialect defines single-variable polynomial types and - operations. - - The simplest use of `polynomial` is to represent mathematical operations in - a polynomial ring `R[x]`, where `R` is another MLIR type like `i32`. - - More generally, this dialect supports representing polynomial operations in a - quotient ring `R[X]/(f(x))` for some statically fixed polynomial `f(x)`. - Two polyomials `p(x), q(x)` are considered equal in this ring if they have the - same remainder when dividing by `f(x)`. When a modulus is given, ring operations - are performed with reductions modulo `f(x)` and relative to the coefficient ring - `R`. - - Examples: - - ```mlir - // A constant polynomial in a ring with i32 coefficients and no polynomial modulus - #ring = #polynomial.ring - %a = polynomial.constant <1 + x**2 - 3x**3> : polynomial.polynomial<#ring> - - // A constant polynomial in a ring with i32 coefficients, modulo (x^1024 + 1) - #modulus = #polynomial.int_polynomial<1 + x**1024> - #ring = #polynomial.ring - %a = polynomial.constant <1 + x**2 - 3x**3> : polynomial.polynomial<#ring> - - // A constant polynomial in a ring with i32 coefficients, with a polynomial - // modulus of (x^1024 + 1) and a coefficient modulus of 17. 
- #modulus = #polynomial.int_polynomial<1 + x**1024> - #ring = #polynomial.ring - %a = polynomial.constant <1 + x**2 - 3x**3> : polynomial.polynomial<#ring> - ``` - }]; - - let useDefaultTypePrinterParser = 1; - let useDefaultAttributePrinterParser = 1; -} - -#endif // POLYNOMIAL_OPS diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialOps.h b/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialOps.h deleted file mode 100644 index bacaad81ce8e5..0000000000000 --- a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialOps.h +++ /dev/null @@ -1,21 +0,0 @@ -//===- PolynomialOps.h - Ops for the Polynomial dialect ---------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef MLIR_INCLUDE_MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIALOPS_H_ -#define MLIR_INCLUDE_MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIALOPS_H_ - -#include "PolynomialDialect.h" -#include "PolynomialTypes.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/Dialect.h" -#include "mlir/Interfaces/InferTypeOpInterface.h" - -#define GET_OP_CLASSES -#include "mlir/Dialect/Polynomial/IR/Polynomial.h.inc" - -#endif // MLIR_INCLUDE_MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIALOPS_H_ diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialTypes.h b/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialTypes.h deleted file mode 100644 index 2fc6877452547..0000000000000 --- a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialTypes.h +++ /dev/null @@ -1,17 +0,0 @@ -//===- PolynomialTypes.h - Types for the Polynomial dialect -----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef MLIR_INCLUDE_MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIALTYPES_H_ -#define MLIR_INCLUDE_MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIALTYPES_H_ - -#include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.h" -#include "mlir/Dialect/Polynomial/IR/PolynomialDialect.h" - -#define GET_TYPEDEF_CLASSES -#include "mlir/Dialect/Polynomial/IR/PolynomialTypes.h.inc" - -#endif // MLIR_INCLUDE_MLIR_DIALECT_POLYNOMIAL_IR_POLYNOMIALTYPES_H_ diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialTypes.td b/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialTypes.td deleted file mode 100644 index cf33503764abb..0000000000000 --- a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialTypes.td +++ /dev/null @@ -1,33 +0,0 @@ -//===- PolynomialTypes.td - Polynomial types ---------------*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef POLYNOMIAL_TYPES -#define POLYNOMIAL_TYPES - -include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.td" -include "mlir/Dialect/Polynomial/IR/PolynomialDialect.td" - -class Polynomial_Type - : TypeDef { - let mnemonic = typeMnemonic; -} - -def Polynomial_PolynomialType : Polynomial_Type<"Polynomial", "polynomial"> { - let summary = "An element of a polynomial ring."; - let description = [{ - A type for polynomials in a polynomial quotient ring. 
- }]; - let parameters = (ins Polynomial_RingAttr:$ring); - let assemblyFormat = "`<` struct(params) `>`"; -} - -def PolynomialLike : TypeOrValueSemanticsContainer< - Polynomial_PolynomialType, "polynomial-like">; - - -#endif // POLYNOMIAL_TYPES diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td index 2c281c9f6aa85..a61d90a0c39b1 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td @@ -266,9 +266,9 @@ def SparseTensor_ToPositionsOp : SparseTensor_Op<"positions", let summary = "Extracts the `level`-th positions array of the `tensor`"; let description = [{ Returns the positions array of the tensor's storage at the given - level. This is similar to the `bufferization.to_memref` operation + level. This is similar to the `bufferization.to_buffer` operation in the sense that it provides a bridge between a tensor world view - and a bufferized world view. Unlike the `bufferization.to_memref` + and a bufferized world view. Unlike the `bufferization.to_buffer` operation, however, this sparse operation actually lowers into code that extracts the positions array from the sparse storage itself (either by calling a support library or through direct code). @@ -295,9 +295,9 @@ def SparseTensor_ToCoordinatesOp : SparseTensor_Op<"coordinates", let summary = "Extracts the `level`-th coordinates array of the `tensor`"; let description = [{ Returns the coordinates array of the tensor's storage at the given - level. This is similar to the `bufferization.to_memref` operation + level. This is similar to the `bufferization.to_buffer` operation in the sense that it provides a bridge between a tensor world view - and a bufferized world view. Unlike the `bufferization.to_memref` + and a bufferized world view. 
Unlike the `bufferization.to_buffer` operation, however, this sparse operation actually lowers into code that extracts the coordinates array from the sparse storage itself (either by calling a support library or through direct code). @@ -326,9 +326,9 @@ def SparseTensor_ToCoordinatesBufferOp : SparseTensor_Op<"coordinates_buffer", Returns the linear coordinates array for a sparse tensor with a trailing COO region with at least two levels. It is an error if the tensor doesn't contain such a COO region. This is similar - to the `bufferization.to_memref` operation in the sense that it + to the `bufferization.to_buffer` operation in the sense that it provides a bridge between a tensor world view and a bufferized - world view. Unlike the `bufferization.to_memref` operation, + world view. Unlike the `bufferization.to_buffer` operation, however, this operation actually lowers into code that extracts the linear coordinates array from the sparse storage scheme that stores the coordinates for the COO region as an array of structures. @@ -359,9 +359,9 @@ def SparseTensor_ToValuesOp : SparseTensor_Op<"values", let description = [{ Returns the values array of the sparse storage format for the given sparse tensor, independent of the actual dimension. This is similar to - the `bufferization.to_memref` operation in the sense that it provides a bridge + the `bufferization.to_buffer` operation in the sense that it provides a bridge between a tensor world view and a bufferized world view. Unlike the - `bufferization.to_memref` operation, however, this sparse operation actually + `bufferization.to_buffer` operation, however, this sparse operation actually lowers into code that extracts the values array from the sparse storage scheme (either by calling a support library or through direct code). 
diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h index f1100d5cf8b68..34a94e6ea7051 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h @@ -407,13 +407,22 @@ void populateVectorTransposeNarrowTypeRewritePatterns( RewritePatternSet &patterns, PatternBenefit benefit = 1); /// Initialize `typeConverter` and `conversionTarget` for vector linearization. -/// This registers (1) which operations are legal and hence should not be -/// linearized, (2) what converted types are (rank-1 vectors) and how to +/// +/// Definition: here 'linearization' means converting a single operation with +/// 1+ vector operand/result of rank>1, into a new single operation whose +/// vector operands and results are all of rank<=1. +/// +/// This function registers (1) which operations are legal, and hence should not +/// be linearized, (2) what the converted types are (rank-1 vectors) and how to /// materialze the conversion (with shape_cast) /// /// Note: the set of legal operations can be extended by a user if for example -/// certain rank>1 vectors are considered valid, but adding additional +/// certain rank>1 vectors are considered valid, by adding additional /// dynamically legal ops to `conversionTarget`. 
+/// +/// Further note: the choice to use a dialect conversion design for +/// linearization is to make it easy to reuse generic structural type +/// conversions for linearizing scf/cf/func operations void populateForVectorLinearize(TypeConverter &typeConverter, ConversionTarget &conversionTarget); diff --git a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td index 4f8301f9380b8..25d9c404f0181 100644 --- a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td +++ b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td @@ -83,7 +83,10 @@ def MaskCompressOp : AVX512_Op<"mask.compress", [Pure, } }]; let extraClassDeclaration = [{ - SmallVector getIntrinsicOperands(::mlir::RewriterBase&, const LLVMTypeConverter&); + SmallVector getIntrinsicOperands( + ::mlir::ArrayRef operands, + const ::mlir::LLVMTypeConverter &typeConverter, + ::mlir::RewriterBase &rewriter); }]; } @@ -404,7 +407,10 @@ def DotOp : AVX_LowOp<"dot", [Pure, } }]; let extraClassDeclaration = [{ - SmallVector getIntrinsicOperands(::mlir::RewriterBase&, const LLVMTypeConverter&); + SmallVector getIntrinsicOperands( + ::mlir::ArrayRef operands, + const ::mlir::LLVMTypeConverter &typeConverter, + ::mlir::RewriterBase &rewriter); }]; } @@ -452,7 +458,10 @@ def BcstToPackedF32Op : AVX_Op<"bcst_to_f32.packed", [MemoryEffects<[MemRead]>, }]; let extraClassDeclaration = [{ - SmallVector getIntrinsicOperands(::mlir::RewriterBase&, const LLVMTypeConverter&); + SmallVector getIntrinsicOperands( + ::mlir::ArrayRef operands, + const ::mlir::LLVMTypeConverter &typeConverter, + ::mlir::RewriterBase &rewriter); }]; } @@ -500,7 +509,10 @@ def CvtPackedEvenIndexedToF32Op : AVX_Op<"cvt.packed.even.indexed_to_f32", [Memo }]; let extraClassDeclaration = [{ - SmallVector getIntrinsicOperands(::mlir::RewriterBase&, const LLVMTypeConverter&); + SmallVector getIntrinsicOperands( + ::mlir::ArrayRef operands, + const ::mlir::LLVMTypeConverter &typeConverter, + ::mlir::RewriterBase 
&rewriter); }]; } @@ -543,7 +555,10 @@ def CvtPackedOddIndexedToF32Op : AVX_Op<"cvt.packed.odd.indexed_to_f32", [Memory }]; let extraClassDeclaration = [{ - SmallVector getIntrinsicOperands(::mlir::RewriterBase&, const LLVMTypeConverter&); + SmallVector getIntrinsicOperands( + ::mlir::ArrayRef operands, + const ::mlir::LLVMTypeConverter &typeConverter, + ::mlir::RewriterBase &rewriter); }]; } #endif // X86VECTOR_OPS diff --git a/mlir/include/mlir/Dialect/X86Vector/X86VectorInterfaces.td b/mlir/include/mlir/Dialect/X86Vector/X86VectorInterfaces.td index 5176f4a447b6e..cde9d1dce65ee 100644 --- a/mlir/include/mlir/Dialect/X86Vector/X86VectorInterfaces.td +++ b/mlir/include/mlir/Dialect/X86Vector/X86VectorInterfaces.td @@ -58,9 +58,11 @@ def OneToOneIntrinsicOpInterface : OpInterface<"OneToOneIntrinsicOp"> { }], /*retType=*/"SmallVector", /*methodName=*/"getIntrinsicOperands", - /*args=*/(ins "::mlir::RewriterBase &":$rewriter, "const LLVMTypeConverter &":$typeConverter), + /*args=*/(ins "::mlir::ArrayRef":$operands, + "const ::mlir::LLVMTypeConverter &":$typeConverter, + "::mlir::RewriterBase &":$rewriter), /*methodBody=*/"", - /*defaultImplementation=*/"return SmallVector($_op->getOperands());" + /*defaultImplementation=*/"return SmallVector(operands);" >, ]; } diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h index 6efad01dec4cc..8b56d81c8eecc 100644 --- a/mlir/include/mlir/IR/OpImplementation.h +++ b/mlir/include/mlir/IR/OpImplementation.h @@ -656,6 +656,12 @@ class AsmParser { /// Parse a '+' token if present. virtual ParseResult parseOptionalPlus() = 0; + /// Parse a '/' token. + virtual ParseResult parseSlash() = 0; + + /// Parse a '/' token if present. + virtual ParseResult parseOptionalSlash() = 0; + /// Parse a '-' token. 
virtual ParseResult parseMinus() = 0; diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h index e83be7b40eded..ea285ac7f16e3 100644 --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -65,7 +65,6 @@ #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Dialect/PDL/IR/PDL.h" #include "mlir/Dialect/PDLInterp/IR/PDLInterp.h" -#include "mlir/Dialect/Polynomial/IR/PolynomialDialect.h" #include "mlir/Dialect/Ptr/IR/PtrDialect.h" #include "mlir/Dialect/Quant/IR/Quant.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -138,7 +137,6 @@ inline void registerAllDialects(DialectRegistry ®istry) { omp::OpenMPDialect, pdl::PDLDialect, pdl_interp::PDLInterpDialect, - polynomial::PolynomialDialect, ptr::PtrDialect, quant::QuantDialect, ROCDL::ROCDLDialect, diff --git a/mlir/include/mlir/Target/LLVMIR/Import.h b/mlir/include/mlir/Target/LLVMIR/Import.h index c6181243a06b0..458361842ec81 100644 --- a/mlir/include/mlir/Target/LLVMIR/Import.h +++ b/mlir/include/mlir/Target/LLVMIR/Import.h @@ -46,10 +46,14 @@ class ModuleOp; /// registered an explicit intrinsic operation. Warning: passes that rely on /// matching explicit intrinsic operations may not work properly if this flag is /// enabled. +/// The `importStructsAsLiterals` flag (default off) ensures that all structs +/// are imported as literal structs, even when they are named in the LLVM +/// module. OwningOpRef translateLLVMIRToModule( std::unique_ptr llvmModule, MLIRContext *context, bool emitExpensiveWarnings = true, bool dropDICompositeTypeElements = false, - bool loadAllDialects = true, bool preferUnregisteredIntrinsics = false); + bool loadAllDialects = true, bool preferUnregisteredIntrinsics = false, + bool importStructsAsLiterals = false); /// Translate the given LLVM data layout into an MLIR equivalent using the DLTI /// dialect. 
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h index 568dc00b3bb97..9902c6bb15caf 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h @@ -48,7 +48,7 @@ class ModuleImport { public: ModuleImport(ModuleOp mlirModule, std::unique_ptr llvmModule, bool emitExpensiveWarnings, bool importEmptyDICompositeTypes, - bool preferUnregisteredIntrinsics); + bool preferUnregisteredIntrinsics, bool importStructsAsLiterals); /// Calls the LLVMImportInterface initialization that queries the registered /// dialect interfaces for the supported LLVM IR intrinsics and metadata kinds @@ -330,6 +330,9 @@ class ModuleImport { /// Converts a single debug intrinsic. LogicalResult processDebugIntrinsic(llvm::DbgVariableIntrinsic *dbgIntr, DominanceInfo &domInfo); + /// Converts LLVM IR asm inline call operand's attributes into an array of + /// MLIR attributes to be utilized in `llvm.inline_asm`. + ArrayAttr convertAsmInlineOperandAttrs(const llvm::CallBase &llvmCall); /// Converts an LLVM intrinsic to an MLIR LLVM dialect operation if an MLIR /// counterpart exists. Otherwise, returns failure. LogicalResult convertIntrinsic(llvm::CallInst *inst); diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h index 30c190e50a4f7..97ae14aa0d6af 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h @@ -142,21 +142,20 @@ class ModuleTranslation { auto result = unresolvedBlockAddressMapping.try_emplace(op, cst); (void)result; assert(result.second && - "attempting to map a blockaddress that is already mapped"); + "attempting to map a blockaddress operation that is already mapped"); } - /// Maps a blockaddress operation to its corresponding placeholder LLVM - /// value. 
- void mapBlockTag(BlockAddressAttr attr, BlockTagOp blockTag) { - // Attempts to map already mapped block labels which is fine if the given - // labels are verified to be unique. - blockTagMapping[attr] = blockTag; + /// Maps a BlockAddressAttr to its corresponding LLVM basic block. + void mapBlockAddress(BlockAddressAttr attr, llvm::BasicBlock *block) { + auto result = blockAddressToLLVMMapping.try_emplace(attr, block); + (void)result; + assert(result.second && + "attempting to map a blockaddress attribute that is already mapped"); } - /// Finds an MLIR block that corresponds to the given MLIR call - /// operation. - BlockTagOp lookupBlockTag(BlockAddressAttr attr) const { - return blockTagMapping.lookup(attr); + /// Finds the LLVM basic block that corresponds to the given BlockAddressAttr. + llvm::BasicBlock *lookupBlockAddress(BlockAddressAttr attr) const { + return blockAddressToLLVMMapping.lookup(attr); } /// Removes the mapping for blocks contained in the region and values defined @@ -463,10 +462,9 @@ class ModuleTranslation { /// mapping is used to replace the placeholders with the LLVM block addresses. DenseMap unresolvedBlockAddressMapping; - /// Mapping from a BlockAddressAttr attribute to a matching BlockTagOp. This - /// is used to cache BlockTagOp locations instead of walking a LLVMFuncOp in - /// search for those. - DenseMap blockTagMapping; + /// Mapping from a BlockAddressAttr attribute to its matching LLVM basic + /// block. + DenseMap blockAddressToLLVMMapping; /// Stack of user-specified state elements, useful when translating operations /// with regions. 
diff --git a/mlir/include/mlir/Target/LLVMIR/TypeFromLLVM.h b/mlir/include/mlir/Target/LLVMIR/TypeFromLLVM.h index 9bb56ee358b8c..0a519534128d6 100644 --- a/mlir/include/mlir/Target/LLVMIR/TypeFromLLVM.h +++ b/mlir/include/mlir/Target/LLVMIR/TypeFromLLVM.h @@ -17,8 +17,6 @@ #include namespace llvm { -class DataLayout; -class LLVMContext; class Type; } // namespace llvm @@ -38,7 +36,8 @@ class TypeFromLLVMIRTranslatorImpl; /// reused across translations. class TypeFromLLVMIRTranslator { public: - TypeFromLLVMIRTranslator(MLIRContext &context); + TypeFromLLVMIRTranslator(MLIRContext &context, + bool importStructsAsLiterals = false); ~TypeFromLLVMIRTranslator(); /// Translates the given LLVM IR type to the MLIR LLVM dialect. diff --git a/mlir/lib/AsmParser/AsmParserImpl.h b/mlir/lib/AsmParser/AsmParserImpl.h index 1f8fbfdd93568..eec2702cba343 100644 --- a/mlir/lib/AsmParser/AsmParserImpl.h +++ b/mlir/lib/AsmParser/AsmParserImpl.h @@ -206,6 +206,16 @@ class AsmParserImpl : public BaseT { return success(parser.consumeIf(Token::question)); } + /// Parses a '/' token. + ParseResult parseSlash() override { + return parser.parseToken(Token::slash, "expected '/'"); + } + + /// Parses a '/' if present. + ParseResult parseOptionalSlash() override { + return success(parser.consumeIf(Token::slash)); + } + /// Parses a '*' token. 
ParseResult parseStar() override { return parser.parseToken(Token::star, "expected '*'"); diff --git a/mlir/lib/AsmParser/Lexer.cpp b/mlir/lib/AsmParser/Lexer.cpp index b4189181a8495..751bd63e537f8 100644 --- a/mlir/lib/AsmParser/Lexer.cpp +++ b/mlir/lib/AsmParser/Lexer.cpp @@ -157,7 +157,7 @@ Token Lexer::lexToken() { skipComment(); continue; } - return emitError(tokStart, "unexpected character"); + return formToken(Token::slash, tokStart); case '@': return lexAtIdentifier(tokStart); diff --git a/mlir/lib/AsmParser/TokenKinds.def b/mlir/lib/AsmParser/TokenKinds.def index 49da8c3dea5fa..fe7c53753e156 100644 --- a/mlir/lib/AsmParser/TokenKinds.def +++ b/mlir/lib/AsmParser/TokenKinds.def @@ -70,6 +70,7 @@ TOK_PUNCTUATION(question, "?") TOK_PUNCTUATION(r_brace, "}") TOK_PUNCTUATION(r_paren, ")") TOK_PUNCTUATION(r_square, "]") +TOK_PUNCTUATION(slash, "/") TOK_PUNCTUATION(star, "*") TOK_PUNCTUATION(vertical_bar, "|") diff --git a/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp b/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp index cafbf835de22f..823d4d644f586 100644 --- a/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp +++ b/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp @@ -576,7 +576,7 @@ struct ConvertUpdateHaloOp : public OpConversionPattern { auto tensorType = MemRefType::get( dstShape, cast(array.getType()).getElementType()); array = - rewriter.create(loc, tensorType, array); + rewriter.create(loc, tensorType, array); } auto rank = cast(array.getType()).getRank(); auto opSplitAxes = adaptor.getSplitAxes().getAxes(); diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp index 62c1857e4b1da..c11f1bca5d49d 100644 --- a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp @@ -126,11 +126,10 @@ void AffineDataCopyGeneration::runOnBlock(Block *block, // moment; we do a check later and report an error with location 
info. // Get to the first load, store, or for op (that is not a copy nest itself). - auto curBegin = - std::find_if(block->begin(), block->end(), [&](Operation &op) { - return isa(op) && - copyNests.count(&op) == 0; - }); + auto curBegin = llvm::find_if(*block, [&](Operation &op) { + return isa(op) && + copyNests.count(&op) == 0; + }); // Create [begin, end) ranges. auto it = curBegin; diff --git a/mlir/lib/Dialect/Affine/Transforms/RaiseMemrefDialect.cpp b/mlir/lib/Dialect/Affine/Transforms/RaiseMemrefDialect.cpp index 491d2e03c36bc..3fc2664aefdfb 100644 --- a/mlir/lib/Dialect/Affine/Transforms/RaiseMemrefDialect.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/RaiseMemrefDialect.cpp @@ -48,7 +48,7 @@ static std::optional findInListOrAdd(Value value, llvm::SmallVectorImpl &dims, function_ref isValidElement) { - Value *loopIV = std::find(dims.begin(), dims.end(), value); + Value *loopIV = llvm::find(dims, value); if (loopIV != dims.end()) { // We found an IV that already has an index, return that index. return {std::distance(dims.begin(), loopIV)}; diff --git a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp index 84556fbefbc9f..72a05ffe97ac0 100644 --- a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp +++ b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp @@ -497,8 +497,8 @@ coalesceTileLiveRanges(DenseMap &initialLiveRanges) { // Sort the new live ranges by starting point (ready for tile allocation). 
auto coalescedLiveRanges = uniqueLiveRanges.takeVector(); - std::sort(coalescedLiveRanges.begin(), coalescedLiveRanges.end(), - [](LiveRange *a, LiveRange *b) { return *a < *b; }); + llvm::sort(coalescedLiveRanges, + [](LiveRange *a, LiveRange *b) { return *a < *b; }); return std::move(coalescedLiveRanges); } @@ -824,8 +824,8 @@ LogicalResult mlir::arm_sme::allocateSMETiles(FunctionOpInterface function, [&](LiveRange const &liveRange) { return !liveRange.empty(); }); auto initialRanges = llvm::to_vector(llvm::map_range( nonEmpty, [](LiveRange const &liveRange) { return &liveRange; })); - std::sort(initialRanges.begin(), initialRanges.end(), - [](LiveRange const *a, LiveRange const *b) { return *a < *b; }); + llvm::sort(initialRanges, + [](LiveRange const *a, LiveRange const *b) { return *a < *b; }); llvm::errs() << "\n========== Initial Live Ranges:\n"; dumpLiveRanges(operationToIndexMap, initialRanges, function); } diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp index 2bdb640699d03..35f2a02cc4ec6 100644 --- a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp +++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeForLLVMExport.cpp @@ -24,6 +24,9 @@ using SdotOpLowering = OneToOneConvertToLLVMPattern; using SmmlaOpLowering = OneToOneConvertToLLVMPattern; using UdotOpLowering = OneToOneConvertToLLVMPattern; using UmmlaOpLowering = OneToOneConvertToLLVMPattern; +using UsmmlaOpLowering = OneToOneConvertToLLVMPattern; +using DupQLaneLowering = + OneToOneConvertToLLVMPattern; using ScalableMaskedAddIOpLowering = OneToOneConvertToLLVMPattern; @@ -188,24 +191,26 @@ void mlir::populateArmSVELegalizeForLLVMExportPatterns( // Populate conversion patterns // clang-format off - patterns.add(converter); + SdotOpLowering>(converter); // Add vector.create_mask conversion with a high benefit as it produces much // nicer code than the generic lowering. 
patterns.add(converter, /*benefit=*/4096); @@ -215,41 +220,46 @@ void mlir::populateArmSVELegalizeForLLVMExportPatterns( void mlir::configureArmSVELegalizeForExportTarget( LLVMConversionTarget &target) { // clang-format off - target.addLegalOp(); - target.addIllegalOp(); + target.addIllegalOp(); + ZipX4Op, + SdotOp>(); // clang-format on } diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferDeallocationOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferDeallocationOpInterface.cpp index 51dfd84d9ac60..eed7a56fff8af 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferDeallocationOpInterface.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferDeallocationOpInterface.cpp @@ -177,8 +177,7 @@ void DeallocationState::getMemrefsToRetain( // liveOut has non-deterministic order because it was constructed by iterating // over a hash-set. SmallVector retainedByLiveness(liveOut.begin(), liveOut.end()); - std::sort(retainedByLiveness.begin(), retainedByLiveness.end(), - ValueComparator()); + llvm::sort(retainedByLiveness, ValueComparator()); toRetain.append(retainedByLiveness); } diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp index 99ffa62c41a4d..1fc34051680f1 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp @@ -624,8 +624,8 @@ bool AnalysisState::canOmitTensorCopy(OpOperand &opOperand) const { } bool AnalysisState::isInPlace(OpOperand &opOperand) const { - // ToMemrefOps are always in-place. - if (isa(opOperand.getOwner())) + // ToBufferOps are always in-place. + if (isa(opOperand.getOwner())) return true; // In the absence of analysis information, OpOperands that bufferize to a @@ -650,13 +650,13 @@ bool AnalysisState::hasUndefinedContents(OpOperand *opOperand) const { return false; } -// bufferization.to_memref is not allowed to change the rank. 
-static void ensureToMemrefOpIsValid(Value tensor, Type memrefType) { +// bufferization.to_buffer is not allowed to change the rank. +static void ensureToBufferOpIsValid(Value tensor, Type memrefType) { #ifndef NDEBUG auto rankedTensorType = llvm::dyn_cast(tensor.getType()); assert((!rankedTensorType || llvm::cast(memrefType).getRank() == rankedTensorType.getRank()) && - "to_memref would be invalid: mismatching ranks"); + "to_buffer would be invalid: mismatching ranks"); #endif } @@ -671,15 +671,15 @@ FailureOr bufferization::getBuffer(RewriterBase &rewriter, Value value, if (auto toTensorOp = value.getDefiningOp()) return toTensorOp.getMemref(); - // Insert to_memref op. + // Insert to_buffer op. OpBuilder::InsertionGuard g(rewriter); setInsertionPointAfter(rewriter, value); FailureOr memrefType = getBufferType(value, options); if (failed(memrefType)) return failure(); - ensureToMemrefOpIsValid(value, *memrefType); + ensureToBufferOpIsValid(value, *memrefType); return rewriter - .create(value.getLoc(), *memrefType, value) + .create(value.getLoc(), *memrefType, value) .getResult(); } diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp index 4fce9be390bd6..ecd2ef15546a4 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp @@ -81,21 +81,21 @@ FailureOr mlir::bufferization::castOrReallocMemRefValue( return copy; } -/// Try to fold to_memref(to_tensor(x)). If x's type and the result type of the -/// to_memref op are different, a memref.cast is needed. -LogicalResult mlir::bufferization::foldToMemrefToTensorPair( - RewriterBase &rewriter, ToMemrefOp toMemref, +/// Try to fold to_buffer(to_tensor(x)). If x's type and the result type of the +/// to_buffer op are different, a memref.cast is needed. 
+LogicalResult mlir::bufferization::foldToBufferToTensorPair( + RewriterBase &rewriter, ToBufferOp toBuffer, const BufferizationOptions &options) { - auto memrefToTensor = toMemref.getTensor().getDefiningOp(); - if (!memrefToTensor) + auto bufferToTensor = toBuffer.getTensor().getDefiningOp(); + if (!bufferToTensor) return failure(); - Type srcType = memrefToTensor.getMemref().getType(); - Type destType = toMemref.getType(); + Type srcType = bufferToTensor.getMemref().getType(); + Type destType = toBuffer.getType(); // Directly rewrite if the type did not change. if (srcType == destType) { - rewriter.replaceOp(toMemref, memrefToTensor.getMemref()); + rewriter.replaceOp(toBuffer, bufferToTensor.getMemref()); return success(); } @@ -106,11 +106,11 @@ LogicalResult mlir::bufferization::foldToMemrefToTensorPair( // Ranked memref -> Ranked memref cast. if (rankedSrcType && rankedDestType) { FailureOr replacement = castOrReallocMemRefValue( - rewriter, memrefToTensor.getMemref(), rankedDestType, options); + rewriter, bufferToTensor.getMemref(), rankedDestType, options); if (failed(replacement)) return failure(); - rewriter.replaceOp(toMemref, *replacement); + rewriter.replaceOp(toBuffer, *replacement); return success(); } @@ -123,8 +123,8 @@ LogicalResult mlir::bufferization::foldToMemrefToTensorPair( // Ranked memref -> unranked memref cast: No copy needed. 
assert(memref::CastOp::areCastCompatible(srcType, destType) && "expected that types are cast compatible"); - rewriter.replaceOpWithNewOp(toMemref, destType, - memrefToTensor.getMemref()); + rewriter.replaceOpWithNewOp(toBuffer, destType, + bufferToTensor.getMemref()); return success(); } @@ -738,12 +738,12 @@ bool ToTensorOp::isWritable(Value value, const AnalysisState &state) { } OpFoldResult ToTensorOp::fold(FoldAdaptor) { - if (auto toMemref = getMemref().getDefiningOp()) + if (auto toBuffer = getMemref().getDefiningOp()) // Approximate alias analysis by conservatively folding only when no there // is no interleaved operation. - if (toMemref->getBlock() == this->getOperation()->getBlock() && - toMemref->getNextNode() == this->getOperation()) - return toMemref.getTensor(); + if (toBuffer->getBlock() == this->getOperation()->getBlock() && + toBuffer->getNextNode() == this->getOperation()) + return toBuffer.getTensor(); return {}; } @@ -770,10 +770,10 @@ void ToTensorOp::getCanonicalizationPatterns(RewritePatternSet &results, } //===----------------------------------------------------------------------===// -// ToMemrefOp +// ToBufferOp //===----------------------------------------------------------------------===// -OpFoldResult ToMemrefOp::fold(FoldAdaptor) { +OpFoldResult ToBufferOp::fold(FoldAdaptor) { if (auto memrefToTensor = getTensor().getDefiningOp()) if (memrefToTensor.getMemref().getType() == getType()) return memrefToTensor.getMemref(); @@ -782,14 +782,14 @@ OpFoldResult ToMemrefOp::fold(FoldAdaptor) { namespace { -/// Replace tensor.cast + to_memref by to_memref + memref.cast. -struct ToMemrefOfCast : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +/// Replace tensor.cast + to_buffer by to_buffer + memref.cast. 
+struct ToBufferOfCast : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(ToMemrefOp toMemref, + LogicalResult matchAndRewrite(ToBufferOp toBuffer, PatternRewriter &rewriter) const final { auto tensorCastOperand = - toMemref.getOperand().getDefiningOp(); + toBuffer.getOperand().getDefiningOp(); if (!tensorCastOperand) return failure(); auto srcTensorType = llvm::dyn_cast( @@ -798,51 +798,51 @@ struct ToMemrefOfCast : public OpRewritePattern { return failure(); auto memrefType = MemRefType::get(srcTensorType.getShape(), srcTensorType.getElementType()); - Value memref = rewriter.create(toMemref.getLoc(), memrefType, + Value memref = rewriter.create(toBuffer.getLoc(), memrefType, tensorCastOperand.getOperand()); - rewriter.replaceOpWithNewOp(toMemref, toMemref.getType(), + rewriter.replaceOpWithNewOp(toBuffer, toBuffer.getType(), memref); return success(); } }; -/// Canonicalize bufferization.to_tensor + bufferization.to_memref. Insert a +/// Canonicalize bufferization.to_tensor + bufferization.to_buffer. Insert a /// cast if necessary. -struct ToMemrefToTensorFolding : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct ToBufferToTensorFolding : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(ToMemrefOp toMemref, + LogicalResult matchAndRewrite(ToBufferOp toBuffer, PatternRewriter &rewriter) const final { BufferizationOptions options; options.bufferAlignment = 0; - return foldToMemrefToTensorPair(rewriter, toMemref, options); + return foldToBufferToTensorPair(rewriter, toBuffer, options); } }; -/// Fold a load on a to_memref operation into an tensor.extract on the +/// Fold a load on a to_buffer operation into an tensor.extract on the /// corresponding tensor. 
-struct LoadOfToMemref : public OpRewritePattern { +struct LoadOfToBuffer : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(memref::LoadOp load, PatternRewriter &rewriter) const override { - auto toMemref = load.getMemref().getDefiningOp(); - if (!toMemref) + auto toBuffer = load.getMemref().getDefiningOp(); + if (!toBuffer) return failure(); - rewriter.replaceOpWithNewOp(load, toMemref.getTensor(), + rewriter.replaceOpWithNewOp(load, toBuffer.getTensor(), load.getIndices()); return success(); } }; -/// Fold dim of a to_memref into the dim of the tensor. +/// Fold dim of a to_buffer into the dim of the tensor. struct DimOfCastOp : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(memref::DimOp dimOp, PatternRewriter &rewriter) const override { - auto castOp = dimOp.getSource().getDefiningOp(); + auto castOp = dimOp.getSource().getDefiningOp(); if (!castOp) return failure(); Value newSource = castOp.getOperand(); @@ -854,16 +854,16 @@ struct DimOfCastOp : public OpRewritePattern { } // namespace -void ToMemrefOp::getCanonicalizationPatterns(RewritePatternSet &results, +void ToBufferOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.add(context); + results.add(context); } -LogicalResult ToMemrefOp::bufferize(RewriterBase &rewriter, +LogicalResult ToBufferOp::bufferize(RewriterBase &rewriter, const BufferizationOptions &options) { - // Fold to_memref(to_tensor(x)) to x. Insert a cast if necessary. - (void)foldToMemrefToTensorPair(rewriter, *this, options); + // Fold to_buffer(to_tensor(x)) to x. Insert a cast if necessary. + (void)foldToBufferToTensorPair(rewriter, *this, options); // Note: The return value of `bufferize` indicates whether there was an error // or not. (And not whether the pattern matched or not.) 
return success(); diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index 0b60c44ece5fd..824b505517119 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -201,11 +201,11 @@ namespace { class BufferizationRewriter : public IRRewriter, public RewriterBase::Listener { public: BufferizationRewriter(MLIRContext *ctx, DenseSet &erasedOps, - DenseSet &toMemrefOps, + DenseSet &toBufferOps, SmallVector &worklist, const BufferizationOptions &options, BufferizationStatistics *statistics) - : IRRewriter(ctx), erasedOps(erasedOps), toMemrefOps(toMemrefOps), + : IRRewriter(ctx), erasedOps(erasedOps), toBufferOps(toBufferOps), worklist(worklist), analysisState(options), statistics(statistics) { setListener(this); } @@ -214,7 +214,7 @@ class BufferizationRewriter : public IRRewriter, public RewriterBase::Listener { void notifyOperationErased(Operation *op) override { erasedOps.insert(op); // Erase if present. - toMemrefOps.erase(op); + toBufferOps.erase(op); } void notifyOperationInserted(Operation *op, InsertPoint previous) override { @@ -231,9 +231,9 @@ class BufferizationRewriter : public IRRewriter, public RewriterBase::Listener { sideEffectingOp.hasEffect()); } - // Keep track of to_memref ops. - if (isa(op)) { - toMemrefOps.insert(op); + // Keep track of to_buffer ops. + if (isa(op)) { + toBufferOps.insert(op); return; } @@ -258,8 +258,8 @@ class BufferizationRewriter : public IRRewriter, public RewriterBase::Listener { /// A set of all erased ops. DenseSet &erasedOps; - /// A set of all to_memref ops. - DenseSet &toMemrefOps; + /// A set of all to_buffer ops. + DenseSet &toBufferOps; /// The worklist of ops to be bufferized. SmallVector &worklist; @@ -282,9 +282,9 @@ LogicalResult bufferization::bufferizeOp(Operation *op, return failure(); } - // Keep track of to_memref ops. 
- DenseSet toMemrefOps; - op->walk([&](ToMemrefOp toMemrefOp) { toMemrefOps.insert(toMemrefOp); }); + // Keep track of to_buffer ops. + DenseSet toBufferOps; + op->walk([&](ToBufferOp toBufferOp) { toBufferOps.insert(toBufferOp); }); // Gather all bufferizable ops in top-to-bottom order. // @@ -303,7 +303,7 @@ LogicalResult bufferization::bufferizeOp(Operation *op, DenseSet erasedOps; // Bufferize all ops. - BufferizationRewriter rewriter(op->getContext(), erasedOps, toMemrefOps, + BufferizationRewriter rewriter(op->getContext(), erasedOps, toBufferOps, worklist, options, statistics); for (unsigned i = 0; i < worklist.size(); ++i) { Operation *nextOp = worklist[i]; @@ -346,11 +346,11 @@ LogicalResult bufferization::bufferizeOp(Operation *op, if (erasedOps.contains(op)) return success(); - // Fold all to_memref(to_tensor(x)) pairs. - for (Operation *op : toMemrefOps) { + // Fold all to_buffer(to_tensor(x)) pairs. + for (Operation *op : toBufferOps) { rewriter.setInsertionPoint(op); - (void)bufferization::foldToMemrefToTensorPair( - rewriter, cast(op), options); + (void)bufferization::foldToBufferToTensorPair( + rewriter, cast(op), options); } // Remove all dead to_tensor ops. @@ -381,8 +381,8 @@ LogicalResult bufferization::bufferizeOp(Operation *op, // Ops without any uses and no side effects will fold away. if (op->getUses().empty() && isMemoryEffectFree(op)) continue; - // ToTensorOps/ToMemrefOps are allowed in the output. - if (isa(op)) + // ToTensorOps/ToBufferOps are allowed in the output. 
+ if (isa(op)) continue; return op->emitError("op was not bufferized"); } @@ -463,7 +463,7 @@ bufferization::bufferizeBlockSignature(Block *block, RewriterBase &rewriter, if (failed(operandBufferType)) return failure(); rewriter.setInsertionPointAfterValue(operand); - Value bufferizedOperand = rewriter.create( + Value bufferizedOperand = rewriter.create( operand.getLoc(), *operandBufferType, operand); // A cast is needed if the operand and the block argument have different // bufferized types. diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp index 0b0dcc9162a9a..755477713668e 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp @@ -76,13 +76,29 @@ getBufferizedFunctionArgType(FuncOp funcOp, int64_t index, } /// Return the FuncOp called by `callOp`. -static FuncOp getCalledFunction(CallOpInterface callOp) { +static FuncOp getCalledFunction(CallOpInterface callOp, + SymbolTableCollection &symbolTables) { SymbolRefAttr sym = llvm::dyn_cast_if_present(callOp.getCallableForCallee()); if (!sym) return nullptr; return dyn_cast_or_null( - SymbolTable::lookupNearestSymbolFrom(callOp, sym)); + symbolTables.lookupNearestSymbolFrom(callOp, sym)); +} + +/// Return the FuncOp called by `callOp`. +static FuncOp getCalledFunction(CallOpInterface callOp, + const AnalysisState &state) { + auto &oneShotAnalysisState = static_cast(state); + + if (auto *funcAnalysisState = + oneShotAnalysisState.getExtension()) { + // Use the cached symbol tables. + return getCalledFunction(callOp, funcAnalysisState->symbolTables); + } + + SymbolTableCollection symbolTables; + return getCalledFunction(callOp, symbolTables); } /// Get FuncAnalysisState. 
@@ -135,7 +151,7 @@ struct CallOpInterface bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const AnalysisState &state) const { func::CallOp callOp = cast(op); - FuncOp funcOp = getCalledFunction(callOp); + FuncOp funcOp = getCalledFunction(callOp, state); assert(funcOp && "expected CallOp to a FuncOp"); if (getFuncOpAnalysisState(state, funcOp) != FuncOpAnalysisState::Analyzed) @@ -150,7 +166,7 @@ struct CallOpInterface bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const AnalysisState &state) const { func::CallOp callOp = cast(op); - FuncOp funcOp = getCalledFunction(callOp); + FuncOp funcOp = getCalledFunction(callOp, state); assert(funcOp && "expected CallOp to a FuncOp"); if (getFuncOpAnalysisState(state, funcOp) != FuncOpAnalysisState::Analyzed) @@ -165,7 +181,7 @@ struct CallOpInterface AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand, const AnalysisState &state) const { func::CallOp callOp = cast(op); - FuncOp funcOp = getCalledFunction(callOp); + FuncOp funcOp = getCalledFunction(callOp, state); assert(funcOp && "expected CallOp to a FuncOp"); if (getFuncOpAnalysisState(state, funcOp) != FuncOpAnalysisState::Analyzed) // FuncOp not analyzed yet. Any OpResult may be aliasing. @@ -199,7 +215,11 @@ struct CallOpInterface getBufferType(Operation *op, Value value, const BufferizationOptions &options, SmallVector &invocationStack) const { auto callOp = cast(op); - FuncOp funcOp = getCalledFunction(callOp); + + // TODO Avoid recomputing the symbol tables every time. + SymbolTableCollection symbolTable; + + FuncOp funcOp = getCalledFunction(callOp, symbolTable); assert(funcOp && "expected CallOp to a FuncOp"); // If the callee was already bufferized, we can directly take the type from @@ -243,7 +263,11 @@ struct CallOpInterface // 2. Rewrite tensor operands as memrefs based on type of the already // bufferized callee. 
SmallVector newOperands; - FuncOp funcOp = getCalledFunction(callOp); + + // TODO Avoid recomputing the symbol tables every time. + SymbolTableCollection symbolTable; + + FuncOp funcOp = getCalledFunction(callOp, symbolTable); assert(funcOp && "expected CallOp to a FuncOp"); FunctionType funcType = funcOp.getFunctionType(); @@ -275,7 +299,7 @@ struct CallOpInterface memRefType = *maybeMemRefType; } - // Since we don't yet have a clear layout story, to_memref may + // Since we don't yet have a clear layout story, to_buffer may // conservatively turn tensors into more dynamic memref than necessary. // If the memref type of the callee fails, introduce an extra memref.cast // that will either canonicalize away or fail compilation until we can do @@ -456,9 +480,9 @@ struct FuncOpInterface // Note: If `inferFunctionResultLayout = true`, casts are later folded // away. - Value toMemrefOp = rewriter.create( + Value toBufferOp = rewriter.create( returnOp.getLoc(), bufferizedType, returnVal); - returnValues.push_back(toMemrefOp); + returnValues.push_back(toBufferOp); } returnOp.getOperandsMutable().assign(returnValues); diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp index 1eaf999d11c08..6e93b36d2d5a2 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp @@ -31,7 +31,7 @@ // Ops that do not implement `BufferizableOpInterface` can be analyzed but are // treated conservatively. E.g., the analysis has to assume that their tensor // OpOperands bufferize to memory writes. While such ops can be analyzed, they -// are not bufferized and remain in the IR. to_tensor and to_memref ops are +// are not bufferized and remain in the IR. to_tensor and to_buffer ops are // inserted at the bufferization boundary. 
// // This analysis caters to high-performance codegen where buffer reuse is deemed diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp index edd6bcf84f460..a025da8635135 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp @@ -280,13 +280,15 @@ static void removeBufferizationAttributes(BlockArgument bbArg) { } /// Return the func::FuncOp called by `callOp`. -static func::FuncOp getCalledFunction(func::CallOp callOp) { +static func::FuncOp +getCalledFunction(func::CallOp callOp, + mlir::SymbolTableCollection &symbolTable) { SymbolRefAttr sym = llvm::dyn_cast_if_present(callOp.getCallableForCallee()); if (!sym) return nullptr; return dyn_cast_or_null( - SymbolTable::lookupNearestSymbolFrom(callOp, sym)); + symbolTable.lookupNearestSymbolFrom(callOp, sym)); } /// Return "true" if the given function signature has tensor semantics. @@ -314,11 +316,15 @@ static LogicalResult getFuncOpsOrderedByCalls( DenseMap> calledBy; // For each FuncOp, the number of func::CallOp it contains. DenseMap numberCallOpsContainedInFuncOp; + + // TODO Avoid recomputing the symbol tables every time. + mlir::SymbolTableCollection symbolTable; + for (func::FuncOp funcOp : moduleOp.getOps()) { // Collect function calls and populate the caller map. numberCallOpsContainedInFuncOp[funcOp] = 0; WalkResult res = funcOp.walk([&](func::CallOp callOp) -> WalkResult { - func::FuncOp calledFunction = getCalledFunction(callOp); + func::FuncOp calledFunction = getCalledFunction(callOp, symbolTable); assert(calledFunction && "could not retrieved called func::FuncOp"); // If the called function does not have any tensors in its signature, then // it is not necessary to bufferize the callee before the caller. 
diff --git a/mlir/lib/Dialect/CMakeLists.txt b/mlir/lib/Dialect/CMakeLists.txt index a473f2ff317c9..3cc52ebc0a8d9 100644 --- a/mlir/lib/Dialect/CMakeLists.txt +++ b/mlir/lib/Dialect/CMakeLists.txt @@ -28,7 +28,6 @@ add_subdirectory(OpenACCMPCommon) add_subdirectory(OpenMP) add_subdirectory(PDL) add_subdirectory(PDLInterp) -add_subdirectory(Polynomial) add_subdirectory(Ptr) add_subdirectory(Quant) add_subdirectory(SCF) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp index 319bb90d9b601..0acd5c7fd80e3 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp @@ -11,6 +11,7 @@ #include "mlir/IR/DialectImplementation.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/TypeSwitch.h" using namespace mlir; @@ -58,7 +59,9 @@ void LLVMStructType::print(AsmPrinter &printer) const { if (isIdentified()) { cyclicPrint = printer.tryStartCyclicPrint(*this); - printer << '"' << getName() << '"'; + printer << '"'; + llvm::printEscapedString(getName(), printer.getStream()); + printer << '"'; // If we are printing a reference to one of the enclosing structs, just // print the name and stop to avoid infinitely long output. if (failed(cyclicPrint)) { diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index fbe7593420102..a9370dc003830 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -718,6 +718,54 @@ static Operation *replaceForAllWithNewSignature( return newforallOp; } +/// Given two operands coming from a loop iter arg, 'src' and 'dst', return true +/// if the operand 'src' is equal to 'dst' or equal to a iter arg present in a +/// outer loop. 
To determine the second condition, this function iterates +/// using a worklist over the enclosing loops, trying to find 'src' in any of +/// the parent loop's iter args. +static bool sameOrEquivalentIterArg(Value src, Value dst) { + // Stack like vector containing possible iterArgs candidates. The first one + // is dst, and we will transverse the IR from there. + SmallVector destWorklist; + destWorklist.push_back(dst); + + while (!destWorklist.empty()) { + Value currentDst = destWorklist.pop_back_val(); + + // We have found the same operand in some iter arg in the loop structure, + // so src and dst are equivalent. + if (src == currentDst) + return true; + + // The operands are not equivalent, look for enclosing loops over + // currentDst. + auto bbArg = dyn_cast(currentDst); + if (!bbArg) + continue; + + Block *parentBlock = bbArg.getOwner(); + assert(parentBlock && "unlinked block argument"); + + Operation *parentOp = parentBlock->getParentOp(); + assert(parentOp && "expected block argument with parent operation"); + + // Check if parent is loop-like. If it's not, do not add it to the worklist. + auto parentLoop = dyn_cast(parentOp); + if (!parentLoop) + continue; + + for (auto innerIterArg : parentLoop.getRegionIterArgs()) { + // No need to check for null as innerIterArg is tied to parentLoop. + OpOperand *operand = parentLoop.getTiedLoopInit(innerIterArg); + Value loopBlockArgument = + parentLoop->getOperand(operand->getOperandNumber()); + destWorklist.push_back(loopBlockArgument); + } + } + + return false; +} + /// Find the first "extract" user of `producerOp` and tile it right before its /// use. The tiled op is fused under the `containingOp`. /// Return this fused op on success or nullptr if anything fails. 
@@ -755,6 +803,40 @@ tileAndFuseFirstExtractUse(RewriterBase &rewriter, Diagnostic &diag, OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(sliceOpToTile); + // Clone the producer inside the consumer and try to update the producer init + // operands using the loop bbArgs if applicable. More precisely, if the bbArg + // of the container loop points to a value that it is used by the consumer op, + // then, instead of using such value on the consumer, use the value coming + // from the bbArg instead. This allows to reuse the output tensor (instead of + // creating a new one) of the container when both producer and container write + // to the same output. + if (LoopLikeOpInterface containerLoop = + dyn_cast(sliceOpToTile->getParentOp())) { + Operation *clone = rewriter.clone(*producerOp); + rewriter.modifyOpInPlace(clone, [&]() { + // Iterate over the outputs of the producer and over the loop bbArgs and + // check if any bbArg points to the same value as the producer output. In + // such case, make the producer output point to the bbArg directly. + for (OpOperand &initOperandPtr : + cast(clone).getDpsInitsMutable()) { + Value producerOperand = + clone->getOperand(initOperandPtr.getOperandNumber()); + for (BlockArgument containerIterArg : + containerLoop.getRegionIterArgs()) { + OpOperand *bbArg = containerLoop.getTiedLoopInit(containerIterArg); + Value consumerOperand = + containerLoop->getOperand(bbArg->getOperandNumber()); + // The producer has the same init as the loop bbArg, use it. + if (sameOrEquivalentIterArg(producerOperand, consumerOperand)) { + initOperandPtr.set(containerIterArg); + } + } + } + }); + + tileableProducer = dyn_cast(clone); + } + // Tile the producer. int64_t resultNumber = cast(sliceOpToTile.getSource()).getResultNumber(); @@ -797,6 +879,10 @@ tileAndFuseFirstExtractUse(RewriterBase &rewriter, Diagnostic &diag, rewriter, diag, producerOp, containingOp, *tileAndFuseResult, resultNumber, offsets, sizes); + // Cleanup clone. 
+ if (dyn_cast(containingOp)) + rewriter.eraseOp(tileableProducer); + return std::make_tuple(tileAndFuseResult->tiledOps, newContainingOp); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp index 6c1087730ebba..b1340be04e011 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp @@ -75,19 +75,19 @@ static void createMemcpy(OpBuilder &b, Location loc, Value tensorSource, // TODO: Support custom memory space on source. // We do not know the layout map of the source yet, so use a fully dynamic // layout for best compatibility. - Value toMemref = b.create( + Value toBuffer = b.create( loc, bufferization::getMemRefTypeWithFullyDynamicLayout(tensorType), tensorSource, /*readOnly=*/true); - b.create(loc, toMemref, memrefDest); + b.create(loc, toBuffer, memrefDest); } break; case linalg::BufferizeToAllocationOptions::MemcpyOp::LinalgCopy: { // TODO: Support custom memory space on source. // We do not know the layout map of the source yet, so use a fully dynamic // layout for best compatibility. 
- Value toMemref = b.create( + Value toBuffer = b.create( loc, bufferization::getMemRefTypeWithFullyDynamicLayout(tensorType), tensorSource, /*readOnly=*/true); - b.create(loc, toMemref, memrefDest); + b.create(loc, toBuffer, memrefDest); } break; }; } diff --git a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp index 26904f1f40d12..dd8ef9608a821 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp @@ -312,10 +312,17 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, SmallVector inputOperands; SmallVector inputOperandsFromUnpackedSource; SmallVector indexingMaps; + auto hasEquivalentTiles = [](PackOp packOp, UnPackOp unPackOp) { + return packOp.getOuterDimsPerm() == unPackOp.getOuterDimsPerm() && + packOp.getInnerDimsPos() == unPackOp.getInnerDimsPos() && + llvm::equal(packOp.getMixedTiles(), unPackOp.getMixedTiles()); + }; for (OpOperand *inputOperand : genericOp.getDpsInputOperands()) { auto [packedOperand, packedIndexingMap] = getOrCreatePackedViewOfOperand( rewriter, loc, packInfo, genericOp, inputOperand); - if (auto unpackOp = inputOperand->get().getDefiningOp()) { + auto unpackOp = inputOperand->get().getDefiningOp(); + auto packOp = packedOperand.getDefiningOp(); + if (packOp && unpackOp && hasEquivalentTiles(packOp, unpackOp)) { inputOperandsFromUnpackedSource.push_back(unpackOp.getSource()); } else { inputOperandsFromUnpackedSource.push_back(packedOperand); @@ -324,14 +331,16 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, indexingMaps.push_back(packedIndexingMap); } - // If the pack and unpack op can be folded: - // 1) use unpack op source op for operand to fold unpack -> pack sequence. - // 2) init tensor of the generic op can be replaced by the destination of the - // pack op. 
+ // If the unpack->pack sequences can be folded, replace use the sources of + // the unpack ops in any unpack->pack chains on the generic op operands. if (isFoldableUnpackPack) { inputOperands = inputOperandsFromUnpackedSource; - if (auto destPack = dest.getDefiningOp()) - dest = destPack.getDest(); + if (auto destPack = dest.getDefiningOp()) { + auto destUnPack = destPack.getSource().getDefiningOp(); + if (destUnPack && hasEquivalentTiles(destPack, destUnPack)) { + dest = destUnPack.getSource(); + } + } } int64_t numInnerLoops = packInfo.getNumTiledLoops(); diff --git a/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp index ae8cb94661c76..daeae2c9d947d 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp @@ -103,7 +103,7 @@ computeTransposeBroadcast(AffineMap &map) { // If dims are not monotonically increasing then transpose is present. 
SmallVector sortedResMap(minorResult); - std::sort(sortedResMap.begin(), sortedResMap.end()); + llvm::sort(sortedResMap); bool hasTranspose = !std::equal(minorResult.begin(), minorResult.end(), sortedResMap.begin(), sortedResMap.end()); diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp index 0cc840403a020..e8d460020cf69 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp @@ -511,7 +511,7 @@ struct LinalgOpPartialReductionInterface for (auto [resultNum, dimExpr] : llvm::enumerate(partialMap.getResults())) { unsigned dim = cast(dimExpr).getPosition(); - if (llvm::find(reductionDims, dim) != reductionDims.end()) { + if (llvm::is_contained(reductionDims, dim)) { partialReductionDims.push_back(resultNum); } } @@ -553,7 +553,7 @@ struct LinalgOpPartialReductionInterface unsigned dim = cast(dimExpr).getPosition(); resultSizes.push_back(sizes[dim]); - if (llvm::find(reductionDims, dim) != reductionDims.end()) { + if (llvm::is_contained(reductionDims, dim)) { // Reduction dims are reduced, and are always outputed in the same // place. So use offset 0 for them. resultOffsets.push_back(b.getIndexAttr(0)); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 63f88d02ff3a0..c5b62227777a7 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1506,72 +1506,69 @@ static SmallVector getTiledPackShape(linalg::PackOp packOp, return applyPermutation(destShape, linalg::getPackInverseDestPerm(packOp)); } -/// Creates a TransferWriteOp to write `input` into a newly initialized -/// output tensor. 
+/// Creates an optionally masked TransferWriteOp /// -/// Given: -/// - an input vector to write, -/// - the mixed destination sizes for the output tensor, -/// - and the vector sizes used for vectorization (i.e., the leading N dims, -/// for some value of N), -/// -/// this function generates the following sequence of ops: -/// -/// %dest = tensor.empty(%destSizes) -/// %res = vector.transfer_write %input into %dest +/// Generates the following operation: +/// %res = vector.transfer_write %vectorToStore into %dest /// /// If the leading N dimensions of the destination tensor do not match -/// `inputVecSizesForLeadingDims` (where N = -/// rank(`inputVecSizesForLeadingDims`)), masking is applied to ensure -/// correctness: +/// `inputVecSizesForLeadingDims` (N = rank(inputVecSizesForLeadingDims)), +/// masking is applied to ensure correctness: /// -/// %dest = tensor.empty(%destSizes) -/// %write = vector.transfer_write %input into %dest -/// %mask = vector.create_mask(%destSizes) -/// %res = vector.mask %mask { %write } +/// %mask = vector.create_mask(%destShape) +/// %res = vector.mask %mask { +/// vector.transfer_write %vectorToStore into %dest +/// } /// /// If `useInBoundsInsteadOfMasking` is set to `true`, the `in_bounds` attribute /// is used instead of masking: /// -/// %dest = tensor.empty(%destSizes) +/// %write = vector.transfer_write %vectorToStore into %dest /// in_bounds_flags = (...) /// %res = vector.transfer_write %input into %dest /// {in_bounds = in_bounds_flags} /// -/// NOTE: all write offsets are set to 0. +/// NOTE: All write offsets are set to 0. +/// TODO: Allow specyfying write offsets. /// NOTE: When N < rank(input), the missing vector sizes are effectively /// extracted from the trailing sizes of `destSizes`. This means those sizes -/// must be static. Supporting dynamic sizes will require the user to specify -/// the remaining vector sizes. This is left as a TODO. +/// must be static. 
+/// TODO: Support cases where an arbitrary dim is dynamic - this will require +/// specifying all the vector sizes. static Operation * -createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value input, - SmallVector destSizes, +createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vectorToStore, + Value dest, ArrayRef inputVecSizesForLeadingDims, bool useInBoundsInsteadOfMasking = false) { - auto inputType = cast(input.getType()); - assert(inputType.getRank() == static_cast(destSizes.size()) && + ShapedType destType = cast(dest.getType()); + assert(cast(vectorToStore.getType()).getRank() == + static_cast(destType.getRank()) && "Rank mismatch!"); + (void)destType; - Value dest = builder.create(loc, destSizes, - inputType.getElementType()); int64_t rank = cast(dest.getType()).getRank(); - auto zero = builder.create(loc, 0); auto destShape = cast(dest.getType()).getShape(); + + // Compute the in_bounds attribute SmallVector inBoundsVal(rank, true); if (useInBoundsInsteadOfMasking) { // In this case, assume that all the required vector sizes have been // provided. - assert(inputVecSizesForLeadingDims.size() == destSizes.size() && + assert(inputVecSizesForLeadingDims.size() == + static_cast(destType.getRank()) && "Insufficient number of input vector sizes!"); // Update the inBounds attribute. 
for (unsigned i = 0; i < rank; i++) inBoundsVal[i] = (destShape[i] == inputVecSizesForLeadingDims[i]) && !ShapedType::isDynamic(destShape[i]); } + + // Generate the xfer_write Op + auto zero = builder.create(loc, 0); Operation *write = builder.create( loc, - /*vector=*/input, + /*vector=*/vectorToStore, /*source=*/dest, /*indices=*/SmallVector(rank, zero), /*inBounds=*/inBoundsVal); @@ -1579,11 +1576,17 @@ createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value input, destShape.drop_front(inputVecSizesForLeadingDims.size()), [](int64_t size) { return size == ShapedType::kDynamic; }) && "Only dims aligned with inputVecSizesForLeadingDims may be dynamic"); + + // If masking is disabled, exit. if (useInBoundsInsteadOfMasking) return write; + + // Check if masking is needed. bool needMaskForWrite = !llvm::equal(inputVecSizesForLeadingDims, destShape.take_front(inputVecSizesForLeadingDims.size())); + + // If masking is needed, generate the mask and mask the operation. if (needMaskForWrite) { SmallVector writeMaskShape; writeMaskShape.append(inputVecSizesForLeadingDims.begin(), @@ -1592,10 +1595,11 @@ createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value input, inputVecSizesForLeadingDims.size(), destShape.end()); auto writeMaskType = VectorType::get(writeMaskShape, builder.getI1Type()); - Value maskForWrite = - builder.create(loc, writeMaskType, destSizes); + Value maskForWrite = builder.create( + loc, writeMaskType, tensor::getMixedSizes(builder, loc, dest)); write = mlir::vector::maskOperation(builder, write, maskForWrite); } + return write; } @@ -1693,9 +1697,11 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp, loc, shapeCastOp.getResult(), destPermutation); // Create TransferWriteOp. 
+ Value dest = rewriter.create( + loc, reifiedReturnShapes[0], + transposeOp.getResult().getType().getElementType()); Operation *write = - createWriteOrMaskedWrite(rewriter, loc, transposeOp.getResult(), - /*destSizes=*/reifiedReturnShapes[0], + createWriteOrMaskedWrite(rewriter, loc, transposeOp.getResult(), dest, /*inputVecSizesForLeadingDims=*/inputVectorSizes, /*useInBoundsInsteadOfMasking=*/false); newResults.push_back(write->getResult(0)); @@ -1830,10 +1836,13 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp, unpackOp.getDestType().hasStaticShape() ? vectorSizes : shapeCastOp.getResultVectorType().getShape()); - Operation *write = createWriteOrMaskedWrite( - rewriter, loc, shapeCastOp.getResult(), /*destSizes=*/reifiedRetShapes[0], - /*inputVecSizesForLeadingDims=*/writeVectorSizes, - useInBoundsInsteadOfMasking); + Value dest = rewriter.create( + loc, reifiedRetShapes[0], + shapeCastOp.getResult().getType().getElementType()); + Operation *write = + createWriteOrMaskedWrite(rewriter, loc, shapeCastOp.getResult(), dest, + /*inputVecSizesForLeadingDims=*/writeVectorSizes, + useInBoundsInsteadOfMasking); newResults.push_back(write->getResult(0)); return success(); } @@ -1861,10 +1870,14 @@ vectorizeAsTensorPadOp(RewriterBase &rewriter, tensor::PadOp padOp, auto maskedRead = vector::createReadOrMaskedRead( rewriter, loc, padOp.getSource(), inputVectorSizes, padValue, /*useInBoundsInsteadOfMasking=*/false); - Operation *write = createWriteOrMaskedWrite( - rewriter, loc, maskedRead, reifiedReturnShapes[0], - /*inputVecSizesForLeadingDims=*/inputVectorSizes, - /*useInBoundsInsteadOfMasking=*/false); + + // Create Xfer write Op + Value dest = rewriter.create( + loc, reifiedReturnShapes[0], padOp.getResultType().getElementType()); + Operation *write = + createWriteOrMaskedWrite(rewriter, loc, maskedRead, dest, + /*inputVecSizesForLeadingDims=*/inputVectorSizes, + /*useInBoundsInsteadOfMasking=*/false); 
newResults.push_back(write->getResult(0)); return success(); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp index f1059ddf5da2c..c6ebd3a53d981 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp @@ -960,7 +960,7 @@ winogradConv2DHelper(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp, F_2_3, F_4_3, F_2_5}; TransformMapKeyTy key = {m, r}; - auto it = std::find(validConfigs.begin(), validConfigs.end(), key); + auto it = llvm::find(validConfigs, key); // If we cannot find the constant transformation matrix, it means we do // not support this configuration yet. if (it == validConfigs.end()) diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp index 1a1334f0ea474..2bdb58892937f 100644 --- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp +++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp @@ -1505,7 +1505,7 @@ LogicalResult ShiftOp::verifySymbolUses(SymbolTableCollection &symbolTable) { auto meshAxes = getMeshAxes(); auto shiftAxis = getShiftAxis().getZExtValue(); - if (llvm::find(meshAxes, shiftAxis) == meshAxes.end()) { + if (!llvm::is_contained(meshAxes, shiftAxis)) { return emitError() << "Invalid shift axis " << shiftAxis << ". It must be one of the grouping mesh axes."; } diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 7eb72d433c972..b401d2ec7894a 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -272,11 +272,12 @@ static LogicalResult checkWaitAndAsyncConflict(Op op) { ++dtypeInt) { auto dtype = static_cast(dtypeInt); - // The async attribute represent the async clause without value. Therefore - // the attribute and operand cannot appear at the same time. + // The asyncOnly attribute represent the async clause without value. 
+ // Therefore the attribute and operand cannot appear at the same time. if (hasDeviceType(op.getAsyncOperandsDeviceType(), dtype) && op.hasAsyncOnly(dtype)) - return op.emitError("async attribute cannot appear with asyncOperand"); + return op.emitError( + "asyncOnly attribute cannot appear with asyncOperand"); // The wait attribute represent the wait clause without values. Therefore // the attribute and operands cannot appear at the same time. @@ -1683,6 +1684,90 @@ static void printDeviceTypeOperandsWithKeywordOnly( p << ")"; } +static ParseResult parseOperandWithKeywordOnly( + mlir::OpAsmParser &parser, + std::optional &operand, + mlir::Type &operandType, mlir::UnitAttr &attr) { + // Keyword only + if (failed(parser.parseOptionalLParen())) { + attr = mlir::UnitAttr::get(parser.getContext()); + return success(); + } + + OpAsmParser::UnresolvedOperand op; + if (failed(parser.parseOperand(op))) + return failure(); + operand = op; + if (failed(parser.parseColon())) + return failure(); + if (failed(parser.parseType(operandType))) + return failure(); + if (failed(parser.parseRParen())) + return failure(); + + return success(); +} + +static void printOperandWithKeywordOnly(mlir::OpAsmPrinter &p, + mlir::Operation *op, + std::optional operand, + mlir::Type operandType, + mlir::UnitAttr attr) { + if (attr) + return; + + p << "("; + p.printOperand(*operand); + p << " : "; + p.printType(operandType); + p << ")"; +} + +static ParseResult parseOperandsWithKeywordOnly( + mlir::OpAsmParser &parser, + llvm::SmallVectorImpl &operands, + llvm::SmallVectorImpl &types, mlir::UnitAttr &attr) { + // Keyword only + if (failed(parser.parseOptionalLParen())) { + attr = mlir::UnitAttr::get(parser.getContext()); + return success(); + } + + if (failed(parser.parseCommaSeparatedList([&]() { + if (parser.parseOperand(operands.emplace_back())) + return failure(); + return success(); + }))) + return failure(); + if (failed(parser.parseColon())) + return failure(); + if 
(failed(parser.parseCommaSeparatedList([&]() { + if (parser.parseType(types.emplace_back())) + return failure(); + return success(); + }))) + return failure(); + if (failed(parser.parseRParen())) + return failure(); + + return success(); +} + +static void printOperandsWithKeywordOnly(mlir::OpAsmPrinter &p, + mlir::Operation *op, + mlir::OperandRange operands, + mlir::TypeRange types, + mlir::UnitAttr attr) { + if (attr) + return; + + p << "("; + llvm::interleaveComma(operands, p, [&](auto it) { p << it; }); + p << " : "; + llvm::interleaveComma(types, p, [&](auto it) { p << it; }); + p << ")"; +} + static ParseResult parseCombinedConstructsLoop(mlir::OpAsmParser &parser, mlir::acc::CombinedConstructsTypeAttr &attr) { @@ -3505,7 +3590,7 @@ bool UpdateOp::hasAsyncOnly() { } bool UpdateOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) { - return hasDeviceType(getAsync(), deviceType); + return hasDeviceType(getAsyncOnly(), deviceType); } mlir::Value UpdateOp::getAsyncValue() { diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 2bf7aaa46db11..deff86d5c5ecb 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1521,6 +1521,9 @@ static ParseResult parseMapClause(OpAsmParser &parser, IntegerAttr &mapType) { if (mapTypeMod == "delete") mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE; + if (mapTypeMod == "return_param") + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM; + return success(); }; @@ -1583,6 +1586,12 @@ static void printMapClause(OpAsmPrinter &p, Operation *op, emitAllocRelease = false; mapTypeStrs.push_back("delete"); } + if (mapTypeToBitFlag( + mapTypeBits, + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM)) { + emitAllocRelease = false; + mapTypeStrs.push_back("return_param"); + } if (emitAllocRelease) mapTypeStrs.push_back("exit_release_or_enter_alloc"); @@ -1777,6 +1786,17 @@ static 
LogicalResult verifyPrivateVarsMapping(TargetOp targetOp) { // MapInfoOp //===----------------------------------------------------------------------===// +static LogicalResult verifyMapInfoDefinedArgs(Operation *op, + StringRef clauseName, + OperandRange vars) { + for (Value var : vars) + if (!llvm::isa_and_present(var.getDefiningOp())) + return op->emitOpError() + << "'" << clauseName + << "' arguments must be defined by 'omp.map.info' ops"; + return success(); +} + LogicalResult MapInfoOp::verify() { if (getMapperId() && !SymbolTable::lookupNearestSymbolFrom( @@ -1784,6 +1804,9 @@ LogicalResult MapInfoOp::verify() { return emitError("invalid mapper id"); } + if (failed(verifyMapInfoDefinedArgs(*this, "members", getMembers()))) + return failure(); + return success(); } @@ -1805,6 +1828,15 @@ LogicalResult TargetDataOp::verify() { "At least one of map, use_device_ptr_vars, or " "use_device_addr_vars operand must be present"); } + + if (failed(verifyMapInfoDefinedArgs(*this, "use_device_ptr", + getUseDevicePtrVars()))) + return failure(); + + if (failed(verifyMapInfoDefinedArgs(*this, "use_device_addr", + getUseDeviceAddrVars()))) + return failure(); + return verifyMapClause(*this, getMapVars()); } @@ -1889,16 +1921,15 @@ void TargetOp::build(OpBuilder &builder, OperationState &state, } LogicalResult TargetOp::verify() { - LogicalResult verifyDependVars = - verifyDependVarList(*this, getDependKinds(), getDependVars()); - - if (failed(verifyDependVars)) - return verifyDependVars; + if (failed(verifyDependVarList(*this, getDependKinds(), getDependVars()))) + return failure(); - LogicalResult verifyMapVars = verifyMapClause(*this, getMapVars()); + if (failed(verifyMapInfoDefinedArgs(*this, "has_device_addr", + getHasDeviceAddrVars()))) + return failure(); - if (failed(verifyMapVars)) - return verifyMapVars; + if (failed(verifyMapClause(*this, getMapVars()))) + return failure(); return verifyPrivateVarsMapping(*this); } diff --git 
a/mlir/lib/Dialect/Polynomial/CMakeLists.txt b/mlir/lib/Dialect/Polynomial/CMakeLists.txt deleted file mode 100644 index f33061b2d87cf..0000000000000 --- a/mlir/lib/Dialect/Polynomial/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(IR) diff --git a/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt b/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt deleted file mode 100644 index 975315ff49158..0000000000000 --- a/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS PolynomialCanonicalization.td) -mlir_tablegen(PolynomialCanonicalization.inc -gen-rewriters) -add_public_tablegen_target(MLIRPolynomialCanonicalizationIncGen) - -add_mlir_dialect_library(MLIRPolynomialDialect - Polynomial.cpp - PolynomialAttributes.cpp - PolynomialDialect.cpp - PolynomialOps.cpp - - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Polynomial - - DEPENDS - MLIRPolynomialIncGen - MLIRPolynomialAttributesIncGen - MLIRPolynomialCanonicalizationIncGen - MLIRBuiltinAttributesIncGen - - LINK_LIBS PUBLIC - MLIRArithDialect - MLIRSupport - MLIRDialect - MLIRIR - MLIRInferTypeOpInterface - ) diff --git a/mlir/lib/Dialect/Polynomial/IR/Polynomial.cpp b/mlir/lib/Dialect/Polynomial/IR/Polynomial.cpp deleted file mode 100644 index 650a369a2abab..0000000000000 --- a/mlir/lib/Dialect/Polynomial/IR/Polynomial.cpp +++ /dev/null @@ -1,68 +0,0 @@ -//===- Polynomial.cpp - MLIR storage type for static Polynomial -*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Polynomial/IR/Polynomial.h" - -#include "llvm/ADT/SmallVector.h" - -namespace mlir { -namespace polynomial { - -template -FailureOr fromMonomialsImpl(ArrayRef monomials) { - // A polynomial's terms are canonically stored in order of increasing degree. - auto monomialsCopy = llvm::SmallVector(monomials); - std::sort(monomialsCopy.begin(), monomialsCopy.end()); - - // Ensure non-unique exponents are not present. Since we sorted the list by - // exponent, a linear scan of adjancent monomials suffices. - if (std::adjacent_find(monomialsCopy.begin(), monomialsCopy.end(), - [](const MonomialT &lhs, const MonomialT &rhs) { - return lhs.getExponent() == rhs.getExponent(); - }) != monomialsCopy.end()) { - return failure(); - } - - return PolyT(monomialsCopy); -} - -FailureOr -IntPolynomial::fromMonomials(ArrayRef monomials) { - return fromMonomialsImpl(monomials); -} - -FailureOr -FloatPolynomial::fromMonomials(ArrayRef monomials) { - return fromMonomialsImpl(monomials); -} - -template -PolyT fromCoefficientsImpl(ArrayRef coeffs) { - llvm::SmallVector monomials; - auto size = coeffs.size(); - monomials.reserve(size); - for (size_t i = 0; i < size; i++) { - monomials.emplace_back(coeffs[i], i); - } - auto result = PolyT::fromMonomials(monomials); - // Construction guarantees unique exponents, so the failure mode of - // fromMonomials can be bypassed. 
- assert(succeeded(result)); - return result.value(); -} - -IntPolynomial IntPolynomial::fromCoefficients(ArrayRef coeffs) { - return fromCoefficientsImpl(coeffs); -} - -FloatPolynomial FloatPolynomial::fromCoefficients(ArrayRef coeffs) { - return fromCoefficientsImpl(coeffs); -} - -} // namespace polynomial -} // namespace mlir diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp deleted file mode 100644 index cd7789a2e9531..0000000000000 --- a/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp +++ /dev/null @@ -1,236 +0,0 @@ -//===- PolynomialAttributes.cpp - Polynomial dialect attrs ------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.h" - -#include "mlir/Dialect/Polynomial/IR/Polynomial.h" -#include "mlir/Support/LLVM.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSet.h" - -namespace mlir { -namespace polynomial { - -void IntPolynomialAttr::print(AsmPrinter &p) const { - p << '<' << getPolynomial() << '>'; -} - -void FloatPolynomialAttr::print(AsmPrinter &p) const { - p << '<' << getPolynomial() << '>'; -} - -/// A callable that parses the coefficient using the appropriate method for the -/// given monomial type, and stores the parsed coefficient value on the -/// monomial. -template -using ParseCoefficientFn = std::function; - -/// Try to parse a monomial. If successful, populate the fields of the outparam -/// `monomial` with the results, and the `variable` outparam with the parsed -/// variable name. Sets shouldParseMore to true if the monomial is followed by -/// a '+'. 
-/// -template -ParseResult -parseMonomial(AsmParser &parser, Monomial &monomial, llvm::StringRef &variable, - bool &isConstantTerm, bool &shouldParseMore, - ParseCoefficientFn parseAndStoreCoefficient) { - OptionalParseResult parsedCoeffResult = parseAndStoreCoefficient(monomial); - - isConstantTerm = false; - shouldParseMore = false; - - // A + indicates it's a constant term with more to go, as in `1 + x`. - if (succeeded(parser.parseOptionalPlus())) { - // If no coefficient was parsed, and there's a +, then it's effectively - // parsing an empty string. - if (!parsedCoeffResult.has_value()) { - return failure(); - } - monomial.setExponent(APInt(apintBitWidth, 0)); - isConstantTerm = true; - shouldParseMore = true; - return success(); - } - - // A monomial can be a trailing constant term, as in `x + 1`. - if (failed(parser.parseOptionalKeyword(&variable))) { - // If neither a coefficient nor a variable was found, then it's effectively - // parsing an empty string. - if (!parsedCoeffResult.has_value()) { - return failure(); - } - - monomial.setExponent(APInt(apintBitWidth, 0)); - isConstantTerm = true; - return success(); - } - - // Parse exponentiation symbol as `**`. We can't use caret because it's - // reserved for basic block identifiers If no star is present, it's treated - // as a polynomial with exponent 1. - if (succeeded(parser.parseOptionalStar())) { - // If there's one * there must be two. - if (failed(parser.parseStar())) { - return failure(); - } - - // If there's a **, then the integer exponent is required. 
- APInt parsedExponent(apintBitWidth, 0); - if (failed(parser.parseInteger(parsedExponent))) { - parser.emitError(parser.getCurrentLocation(), - "found invalid integer exponent"); - return failure(); - } - - monomial.setExponent(parsedExponent); - } else { - monomial.setExponent(APInt(apintBitWidth, 1)); - } - - if (succeeded(parser.parseOptionalPlus())) { - shouldParseMore = true; - } - return success(); -} - -template -LogicalResult -parsePolynomialAttr(AsmParser &parser, llvm::SmallVector &monomials, - llvm::StringSet<> &variables, - ParseCoefficientFn parseAndStoreCoefficient) { - while (true) { - Monomial parsedMonomial; - llvm::StringRef parsedVariableRef; - bool isConstantTerm; - bool shouldParseMore; - if (failed(parseMonomial( - parser, parsedMonomial, parsedVariableRef, isConstantTerm, - shouldParseMore, parseAndStoreCoefficient))) { - parser.emitError(parser.getCurrentLocation(), "expected a monomial"); - return failure(); - } - - if (!isConstantTerm) { - std::string parsedVariable = parsedVariableRef.str(); - variables.insert(parsedVariable); - } - monomials.push_back(parsedMonomial); - - if (shouldParseMore) - continue; - - if (succeeded(parser.parseOptionalGreater())) { - break; - } - parser.emitError( - parser.getCurrentLocation(), - "expected + and more monomials, or > to end polynomial attribute"); - return failure(); - } - - if (variables.size() > 1) { - std::string vars = llvm::join(variables.keys(), ", "); - parser.emitError( - parser.getCurrentLocation(), - "polynomials must have one indeterminate, but there were multiple: " + - vars); - return failure(); - } - - return success(); -} - -Attribute IntPolynomialAttr::parse(AsmParser &parser, Type type) { - if (failed(parser.parseLess())) - return {}; - - llvm::SmallVector monomials; - llvm::StringSet<> variables; - - if (failed(parsePolynomialAttr( - parser, monomials, variables, - [&](IntMonomial &monomial) -> OptionalParseResult { - APInt parsedCoeff(apintBitWidth, 1); - OptionalParseResult 
result = - parser.parseOptionalInteger(parsedCoeff); - monomial.setCoefficient(parsedCoeff); - return result; - }))) { - return {}; - } - - auto result = IntPolynomial::fromMonomials(monomials); - if (failed(result)) { - parser.emitError(parser.getCurrentLocation()) - << "parsed polynomial must have unique exponents among monomials"; - return {}; - } - return IntPolynomialAttr::get(parser.getContext(), result.value()); -} -Attribute FloatPolynomialAttr::parse(AsmParser &parser, Type type) { - if (failed(parser.parseLess())) - return {}; - - llvm::SmallVector monomials; - llvm::StringSet<> variables; - - ParseCoefficientFn parseAndStoreCoefficient = - [&](FloatMonomial &monomial) -> OptionalParseResult { - double coeffValue = 1.0; - ParseResult result = parser.parseFloat(coeffValue); - monomial.setCoefficient(APFloat(coeffValue)); - return OptionalParseResult(result); - }; - - if (failed(parsePolynomialAttr(parser, monomials, variables, - parseAndStoreCoefficient))) { - return {}; - } - - auto result = FloatPolynomial::fromMonomials(monomials); - if (failed(result)) { - parser.emitError(parser.getCurrentLocation()) - << "parsed polynomial must have unique exponents among monomials"; - return {}; - } - return FloatPolynomialAttr::get(parser.getContext(), result.value()); -} - -LogicalResult -RingAttr::verify(function_ref emitError, - Type coefficientType, IntegerAttr coefficientModulus, - IntPolynomialAttr polynomialModulus) { - if (coefficientModulus) { - auto coeffIntType = llvm::dyn_cast(coefficientType); - if (!coeffIntType) { - return emitError() << "coefficientModulus specified but coefficientType " - "is not integral"; - } - APInt coeffModValue = coefficientModulus.getValue(); - if (coeffModValue == 0) { - return emitError() << "coefficientModulus should not be 0"; - } - if (coeffModValue.slt(0)) { - return emitError() << "coefficientModulus should be positive"; - } - auto coeffModWidth = (coeffModValue - 1).getActiveBits(); - auto coeffWidth = 
coeffIntType.getWidth(); - if (coeffModWidth > coeffWidth) { - return emitError() << "coefficientModulus needs bit width of " - << coeffModWidth - << " but coefficientType can only contain " - << coeffWidth << " bits"; - } - } - return success(); -} - -} // namespace polynomial -} // namespace mlir diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td b/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td deleted file mode 100644 index 28c45e6846380..0000000000000 --- a/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td +++ /dev/null @@ -1,44 +0,0 @@ -//===- PolynomialCanonicalization.td - Polynomial patterns -*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef POLYNOMIAL_CANONICALIZATION -#define POLYNOMIAL_CANONICALIZATION - -include "mlir/Dialect/Arith/IR/ArithOps.td" -include "mlir/Dialect/Polynomial/IR/Polynomial.td" -include "mlir/IR/OpBase.td" -include "mlir/IR/PatternBase.td" - -def Equal : Constraint>; - -// Get a -1 integer attribute of the same type as the polynomial SSA value's -// ring coefficient type. 
-def getMinusOne - : NativeCodeCall< - "$_builder.getIntegerAttr(" - "cast($0.getType()).getRing().getCoefficientType(), -1)">; - -def SubAsAdd : Pat< - (Polynomial_SubOp $f, $g), - (Polynomial_AddOp $f, - (Polynomial_MulScalarOp $g, - (Arith_ConstantOp (getMinusOne $g))))>; - -def INTTAfterNTT : Pat< - (Polynomial_INTTOp (Polynomial_NTTOp $poly, $r1), $r2), - (replaceWithValue $poly), - [(Equal $r1, $r2)] ->; - -def NTTAfterINTT : Pat< - (Polynomial_NTTOp (Polynomial_INTTOp $tensor, $r1), $r2), - (replaceWithValue $tensor), - [(Equal $r1, $r2)] ->; - -#endif // POLYNOMIAL_CANONICALIZATION diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp deleted file mode 100644 index 7f8ba0670d65e..0000000000000 --- a/mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp +++ /dev/null @@ -1,49 +0,0 @@ -//===- PolynomialDialect.cpp - Polynomial dialect ---------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Polynomial/IR/Polynomial.h" - -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.h" -#include "mlir/Dialect/Polynomial/IR/PolynomialOps.h" -#include "mlir/Dialect/Polynomial/IR/PolynomialTypes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/Interfaces/InferTypeOpInterface.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/TypeSwitch.h" - -using namespace mlir; -using namespace mlir::polynomial; - -#include "mlir/Dialect/Polynomial/IR/PolynomialDialect.cpp.inc" - -#define GET_ATTRDEF_CLASSES -#include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.cpp.inc" -#define GET_TYPEDEF_CLASSES -#include "mlir/Dialect/Polynomial/IR/PolynomialTypes.cpp.inc" -#define GET_OP_CLASSES -#include "mlir/Dialect/Polynomial/IR/Polynomial.cpp.inc" - -void PolynomialDialect::initialize() { - addAttributes< -#define GET_ATTRDEF_LIST -#include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.cpp.inc" - >(); - addTypes< -#define GET_TYPEDEF_LIST -#include "mlir/Dialect/Polynomial/IR/PolynomialTypes.cpp.inc" - >(); - addOperations< -#define GET_OP_LIST -#include "mlir/Dialect/Polynomial/IR/Polynomial.cpp.inc" - >(); -} diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp deleted file mode 100644 index 460ef17167e80..0000000000000 --- a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp +++ /dev/null @@ -1,298 +0,0 @@ -//===- PolynomialOps.cpp - Polynomial dialect ops ---------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Polynomial/IR/PolynomialOps.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Polynomial/IR/Polynomial.h" -#include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.h" -#include "mlir/Dialect/Polynomial/IR/PolynomialTypes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/PatternMatch.h" -#include "llvm/ADT/APInt.h" - -using namespace mlir; -using namespace mlir::polynomial; - -void FromTensorOp::build(OpBuilder &builder, OperationState &result, - Value input, RingAttr ring) { - TensorType tensorType = dyn_cast(input.getType()); - auto bitWidth = tensorType.getElementTypeBitWidth(); - APInt cmod(1 + bitWidth, 1); - cmod = cmod << bitWidth; - Type resultType = PolynomialType::get(builder.getContext(), ring); - build(builder, result, resultType, input); -} - -LogicalResult FromTensorOp::verify() { - ArrayRef tensorShape = getInput().getType().getShape(); - RingAttr ring = getOutput().getType().getRing(); - IntPolynomialAttr polyMod = ring.getPolynomialModulus(); - if (polyMod) { - unsigned polyDegree = polyMod.getPolynomial().getDegree(); - bool compatible = tensorShape.size() == 1 && tensorShape[0] <= polyDegree; - if (!compatible) { - InFlightDiagnostic diag = emitOpError() - << "input type " << getInput().getType() - << " does not match output type " - << getOutput().getType(); - diag.attachNote() - << "the input type must be a tensor of shape [d] where d " - "is at most the degree of the polynomialModulus of " - "the output type's ring attribute"; - return diag; - } - } - - unsigned inputBitWidth = getInput().getType().getElementTypeBitWidth(); - if (inputBitWidth > ring.getCoefficientType().getIntOrFloatBitWidth()) { - InFlightDiagnostic diag = emitOpError() - << "input tensor element type " - << 
getInput().getType().getElementType() - << " is too large to fit in the coefficients of " - << getOutput().getType(); - diag.attachNote() << "the input tensor's elements must be rescaled" - " to fit before using from_tensor"; - return diag; - } - - return success(); -} - -LogicalResult ToTensorOp::verify() { - ArrayRef tensorShape = getOutput().getType().getShape(); - IntPolynomialAttr polyMod = - getInput().getType().getRing().getPolynomialModulus(); - if (polyMod) { - unsigned polyDegree = polyMod.getPolynomial().getDegree(); - bool compatible = tensorShape.size() == 1 && tensorShape[0] == polyDegree; - - if (compatible) - return success(); - - InFlightDiagnostic diag = emitOpError() - << "input type " << getInput().getType() - << " does not match output type " - << getOutput().getType(); - diag.attachNote() - << "the output type must be a tensor of shape [d] where d " - "is at most the degree of the polynomialModulus of " - "the input type's ring attribute"; - return diag; - } - - return success(); -} - -LogicalResult MulScalarOp::verify() { - Type argType = getPolynomial().getType(); - PolynomialType polyType; - - if (auto shapedPolyType = dyn_cast(argType)) { - polyType = cast(shapedPolyType.getElementType()); - } else { - polyType = cast(argType); - } - - Type coefficientType = polyType.getRing().getCoefficientType(); - - if (coefficientType != getScalar().getType()) - return emitOpError() << "polynomial coefficient type " << coefficientType - << " does not match scalar type " - << getScalar().getType(); - - return success(); -} - -/// Test if a value is a primitive nth root of unity modulo cmod. -bool isPrimitiveNthRootOfUnity(const APInt &root, const APInt &n, - const APInt &cmod) { - // The first or subsequent multiplications, may overflow the input bit width, - // so scale them up to ensure they do not overflow. 
- unsigned requiredBitWidth = - std::max(root.getActiveBits() * 2, cmod.getActiveBits() * 2); - APInt r = APInt(root).zextOrTrunc(requiredBitWidth); - APInt cmodExt = APInt(cmod).zextOrTrunc(requiredBitWidth); - assert(r.ule(cmodExt) && "root must be less than cmod"); - uint64_t upperBound = n.getZExtValue(); - - APInt a = r; - for (size_t k = 1; k < upperBound; k++) { - if (a.isOne()) - return false; - a = (a * r).urem(cmodExt); - } - return a.isOne(); -} - -/// Verify that the types involved in an NTT or INTT operation are -/// compatible. -static LogicalResult verifyNTTOp(Operation *op, RingAttr ring, - RankedTensorType tensorType, - std::optional root) { - Attribute encoding = tensorType.getEncoding(); - if (!encoding) { - return op->emitOpError() - << "expects a ring encoding to be provided to the tensor"; - } - auto encodedRing = dyn_cast(encoding); - if (!encodedRing) { - return op->emitOpError() - << "the provided tensor encoding is not a ring attribute"; - } - - if (encodedRing != ring) { - return op->emitOpError() - << "encoded ring type " << encodedRing - << " is not equivalent to the polynomial ring " << ring; - } - - unsigned polyDegree = ring.getPolynomialModulus().getPolynomial().getDegree(); - ArrayRef tensorShape = tensorType.getShape(); - bool compatible = tensorShape.size() == 1 && tensorShape[0] == polyDegree; - if (!compatible) { - InFlightDiagnostic diag = op->emitOpError() - << "tensor type " << tensorType - << " does not match output type " << ring; - diag.attachNote() << "the tensor must have shape [d] where d " - "is exactly the degree of the polynomialModulus of " - "the polynomial type's ring attribute"; - return diag; - } - - if (root.has_value()) { - APInt rootValue = root.value().getValue().getValue(); - APInt rootDegree = root.value().getDegree().getValue(); - APInt cmod = ring.getCoefficientModulus().getValue(); - if (!isPrimitiveNthRootOfUnity(rootValue, rootDegree, cmod)) { - return op->emitOpError() - << "provided root " << 
rootValue.getZExtValue() - << " is not a primitive root " - << "of unity mod " << cmod.getZExtValue() - << ", with the specified degree " << rootDegree.getZExtValue(); - } - } - - return success(); -} - -LogicalResult NTTOp::verify() { - return verifyNTTOp(this->getOperation(), getInput().getType().getRing(), - getOutput().getType(), getRoot()); -} - -LogicalResult INTTOp::verify() { - return verifyNTTOp(this->getOperation(), getOutput().getType().getRing(), - getInput().getType(), getRoot()); -} - -ParseResult ConstantOp::parse(OpAsmParser &parser, OperationState &result) { - // Using the built-in parser.parseAttribute requires the full - // #polynomial.typed_int_polynomial syntax, which is excessive. - // Instead we parse a keyword int to signal it's an integer polynomial - Type type; - if (succeeded(parser.parseOptionalKeyword("float"))) { - Attribute floatPolyAttr = FloatPolynomialAttr::parse(parser, nullptr); - if (floatPolyAttr) { - if (parser.parseColon() || parser.parseType(type)) - return failure(); - result.addAttribute("value", - TypedFloatPolynomialAttr::get(type, floatPolyAttr)); - result.addTypes(type); - return success(); - } - } - - if (succeeded(parser.parseOptionalKeyword("int"))) { - Attribute intPolyAttr = IntPolynomialAttr::parse(parser, nullptr); - if (intPolyAttr) { - if (parser.parseColon() || parser.parseType(type)) - return failure(); - - result.addAttribute("value", - TypedIntPolynomialAttr::get(type, intPolyAttr)); - result.addTypes(type); - return success(); - } - } - - // In the worst case, still accept the verbose versions. 
- TypedIntPolynomialAttr typedIntPolyAttr; - OptionalParseResult res = - parser.parseOptionalAttribute( - typedIntPolyAttr, "value", result.attributes); - if (res.has_value() && succeeded(res.value())) { - result.addTypes(typedIntPolyAttr.getType()); - return success(); - } - - TypedFloatPolynomialAttr typedFloatPolyAttr; - res = parser.parseAttribute( - typedFloatPolyAttr, "value", result.attributes); - if (res.has_value() && succeeded(res.value())) { - result.addTypes(typedFloatPolyAttr.getType()); - return success(); - } - - return failure(); -} - -void ConstantOp::print(OpAsmPrinter &p) { - p << " "; - if (auto intPoly = dyn_cast(getValue())) { - p << "int"; - intPoly.getValue().print(p); - } else if (auto floatPoly = dyn_cast(getValue())) { - p << "float"; - floatPoly.getValue().print(p); - } else { - assert(false && "unexpected attribute type"); - } - p << " : "; - p.printType(getOutput().getType()); -} - -LogicalResult ConstantOp::inferReturnTypes( - MLIRContext *context, std::optional location, - ConstantOp::Adaptor adaptor, - llvm::SmallVectorImpl &inferredReturnTypes) { - Attribute operand = adaptor.getValue(); - if (auto intPoly = dyn_cast(operand)) { - inferredReturnTypes.push_back(intPoly.getType()); - } else if (auto floatPoly = dyn_cast(operand)) { - inferredReturnTypes.push_back(floatPoly.getType()); - } else { - assert(false && "unexpected attribute type"); - return failure(); - } - return success(); -} - -//===----------------------------------------------------------------------===// -// TableGen'd canonicalization patterns -//===----------------------------------------------------------------------===// - -namespace { -#include "PolynomialCanonicalization.inc" -} // namespace - -void SubOp::getCanonicalizationPatterns(RewritePatternSet &results, - MLIRContext *context) { - results.add(context); -} - -void NTTOp::getCanonicalizationPatterns(RewritePatternSet &results, - MLIRContext *context) { - results.add(context); -} - -void 
INTTOp::getCanonicalizationPatterns(RewritePatternSet &results, - MLIRContext *context) { - results.add(context); -} diff --git a/mlir/lib/Dialect/Quant/IR/QuantOps.cpp b/mlir/lib/Dialect/Quant/IR/QuantOps.cpp index 94e1c8b8ba296..e23a0d6aba825 100644 --- a/mlir/lib/Dialect/Quant/IR/QuantOps.cpp +++ b/mlir/lib/Dialect/Quant/IR/QuantOps.cpp @@ -122,7 +122,7 @@ LogicalResult verifySubChannelQuantization( // // Therefore, we explicitly disallow the case where d = 0 to maintain // consistency and avoid these issues. - if (llvm::find(tensorType.getShape(), 0) != tensorType.getShape().end()) { + if (llvm::is_contained(tensorType.getShape(), 0)) { return op->emitError() << "tensor dimension size of zero is not allowed " "with sub-channel quantization"; } diff --git a/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp b/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp index 40d2e254fb7dd..09326242eec2a 100644 --- a/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp @@ -99,7 +99,7 @@ class ConvertForOpTypes // PR47938 tracks this issue, but it seems hard to fix. Instead, we need // to clone the op. // - // 2. We need to resue the original region instead of cloning it, otherwise + // 2. We need to reuse the original region instead of cloning it, otherwise // the dialect conversion framework thinks that we just inserted all the // cloned child ops. 
But what we want is to "take" the child regions and let // the dialect conversion framework continue recursively into ops inside diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index fcbef0c14739f..2196199816292 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -791,7 +791,7 @@ LogicalResult SparseTensorEncodingAttr::verify( return emitError() << "unexpected coordinate bitwidth: " << crdWidth; // Verify every COO segment. - auto *it = std::find_if(lvlTypes.begin(), lvlTypes.end(), isSingletonLT); + auto *it = llvm::find_if(lvlTypes, isSingletonLT); while (it != lvlTypes.end()) { if (it == lvlTypes.begin() || !(it - 1)->isa()) @@ -829,7 +829,7 @@ LogicalResult SparseTensorEncodingAttr::verify( } // TODO: audit formats that actually are supported by backend. - if (auto it = std::find_if(lvlTypes.begin(), lvlTypes.end(), isNOutOfMLT); + if (auto it = llvm::find_if(lvlTypes, isNOutOfMLT); it != std::end(lvlTypes)) { if (it != lvlTypes.end() - 1) return emitError() << "expected n_out_of_m to be the last level type"; diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp index 01651b1f0ac9c..e5f2418367a58 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -212,7 +212,7 @@ static Value genTensorToMemref(PatternRewriter &rewriter, Location loc, auto tensorType = llvm::cast(tensor.getType()); auto memrefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - return rewriter.create(loc, memrefType, tensor); + return rewriter.create(loc, memrefType, tensor); } /// Prepares the outlined arguments, passing scalars and buffers in. 
Here we diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp index badcc583bbca2..2d604ed7a8ffc 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -1128,7 +1128,7 @@ static bool startLoopSeq(CodegenEnv &env, OpBuilder &builder, ExprId exp, // TODO: remove this! The same tensor level might be added for multiple // times due to the special handling for all-dense "sparse" output tensor // (see L1038). - if (llvm::find(tidLvls, tl) != tidLvls.end()) + if (llvm::is_contained(tidLvls, tl)) return; tidLvls.emplace_back(tl); }); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.cpp index 86c13d03c7ec6..b94091cfa5f58 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.cpp @@ -31,11 +31,10 @@ static bool isMaterializing(Value val) { /// Sorts the dependent loops such that it is ordered in the same sequence in /// which loops will be generated. 
static void sortDependentLoops(std::vector &target) { - std::sort(target.begin(), target.end(), - [](const LoopCoeffPair &l, const LoopCoeffPair &r) { - assert(std::addressof(l) == std::addressof(r) || l != r); - return l.first < r.first; - }); + llvm::sort(target, [](const LoopCoeffPair &l, const LoopCoeffPair &r) { + assert(std::addressof(l) == std::addressof(r) || l != r); + return l.first < r.first; + }); } //===----------------------------------------------------------------------===// // Code generation environment constructor and general methods diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp index 0ebdc3a54e61b..57291064eba22 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp @@ -459,26 +459,25 @@ void sparse_tensor::foreachInSparseConstant( } // Sorts the sparse element attribute based on coordinates. - std::sort(elems.begin(), elems.end(), - [order](const ElementAttr &lhs, const ElementAttr &rhs) { - if (std::addressof(lhs) == std::addressof(rhs)) - return false; - - auto lhsCoords = llvm::map_to_vector( - lhs.first, [](IntegerAttr i) { return i.getInt(); }); - auto rhsCoords = llvm::map_to_vector( - rhs.first, [](IntegerAttr i) { return i.getInt(); }); - - SmallVector lhsLvlCrds = order.compose(lhsCoords); - SmallVector rhsLvlCrds = order.compose(rhsCoords); - // Sort the element based on the lvl coordinates. 
- for (Level l = 0; l < order.getNumResults(); l++) { - if (lhsLvlCrds[l] == rhsLvlCrds[l]) - continue; - return lhsLvlCrds[l] < rhsLvlCrds[l]; - } - llvm_unreachable("no equal coordinate in sparse element attr"); - }); + llvm::sort(elems, [order](const ElementAttr &lhs, const ElementAttr &rhs) { + if (std::addressof(lhs) == std::addressof(rhs)) + return false; + + auto lhsCoords = llvm::map_to_vector( + lhs.first, [](IntegerAttr i) { return i.getInt(); }); + auto rhsCoords = llvm::map_to_vector( + rhs.first, [](IntegerAttr i) { return i.getInt(); }); + + SmallVector lhsLvlCrds = order.compose(lhsCoords); + SmallVector rhsLvlCrds = order.compose(rhsCoords); + // Sort the element based on the lvl coordinates. + for (Level l = 0; l < order.getNumResults(); l++) { + if (lhsLvlCrds[l] == rhsLvlCrds[l]) + continue; + return lhsLvlCrds[l] < rhsLvlCrds[l]; + } + llvm_unreachable("no equal coordinate in sparse element attr"); + }); SmallVector cvs; cvs.reserve(dimRank); @@ -550,7 +549,7 @@ TypedValue sparse_tensor::genToMemref(OpBuilder &builder, Location loc, Value tensor) { auto tTp = llvm::cast(tensor.getType()); auto mTp = MemRefType::get(tTp.getShape(), tTp.getElementType()); - return builder.create(loc, mTp, tensor) + return builder.create(loc, mTp, tensor) .getResult(); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp index a2c6314d2a61e..3a77ce347b1c0 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp @@ -263,7 +263,7 @@ void LoopEmitter::initializeLoopEmit( denseTp = bufferization::getMemRefTypeWithFullyDynamicLayout(rtp); Value denseVal = - builder.create(loc, denseTp, tensor); + builder.create(loc, denseTp, tensor); // Dense outputs need special handling. 
if (isOutput && updater) denseVal = updater(builder, loc, denseVal, tensor); diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 29da32cd1791c..815806f06b472 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -1356,7 +1356,7 @@ RankedTensorType GatherOp::inferResultType(RankedTensorType sourceType, SmallVector resultShape(indicesType.getShape().drop_back()); resultShape.reserve(resultShape.size() + sourceType.getRank()); for (int64_t idx : llvm::seq(0, sourceType.getRank())) { - if (std::binary_search(gatherDims.begin(), gatherDims.end(), idx)) { + if (llvm::binary_search(gatherDims, idx)) { if (!rankReduced) resultShape.push_back(1); continue; diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp index 31014172a9555..c0e697292d2a0 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp @@ -215,7 +215,7 @@ struct CollapseShapeOpInterface MemRefType::get(collapseShapeOp.getSrcType().getShape(), collapseShapeOp.getSrcType().getElementType(), AffineMap(), bufferType.getMemorySpace()); - buffer = rewriter.create( + buffer = rewriter.create( op->getLoc(), memrefType, *tensorAlloc); } @@ -491,7 +491,7 @@ struct FromElementsOpInterface bufferization::getBufferType(*tensorAlloc, options); if (failed(memrefType)) return failure(); - Value buffer = rewriter.create( + Value buffer = rewriter.create( op->getLoc(), *memrefType, *tensorAlloc); // Case: tensor<0xelem_type>. 
@@ -894,7 +894,7 @@ struct ReshapeOpInterface srcType.getShape(), srcType.getElementType(), AffineMap(), cast(srcBuffer->getType()).getMemorySpace()); srcBuffer = rewriter - .create( + .create( op->getLoc(), memrefType, *tensorAlloc) .getResult(); } diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp index 3938c3731c47f..66ea00b23b9d4 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaProfileCompliance.cpp @@ -444,9 +444,8 @@ LogicalResult TosaProfileCompliance::checkProfileOrExtension( // Ensure the profile inference match the profile knowledge of the // specification. for (const auto &cands : specRequiredModeSet) { - for (size_t i = 0; i < opRequiredMode.size(); i++) { - if (std::find(cands.begin(), cands.end(), opRequiredMode[i]) == - cands.end()) { + for (const auto &mode : opRequiredMode) { + if (!llvm::is_contained(cands, mode)) { op->emitOpError() << "illegal: requires [" << llvm::join(stringifyProfile(opRequiredMode), ", ") diff --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp index fcb736aa031f3..fac836ebd7a36 100644 --- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp +++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp @@ -237,8 +237,8 @@ getValuesSortedByKeyImpl(ArrayRef keys, ArrayRef values, return SmallVector{values}; assert(keys.size() == values.size() && "unexpected mismatching sizes"); auto indices = llvm::to_vector(llvm::seq(0, values.size())); - std::sort(indices.begin(), indices.end(), - [&](int64_t i, int64_t j) { return compare(keys[i], keys[j]); }); + llvm::sort(indices, + [&](int64_t i, int64_t j) { return compare(keys[i], keys[j]); }); SmallVector res; res.reserve(values.size()); for (int64_t i = 0, e = indices.size(); i < e; ++i) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 
f6c3c6a61afb6..79bf87ccd34af 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -5573,13 +5573,11 @@ LogicalResult ShapeCastOp::verify() { return success(); } -namespace { - /// Return true if `transpose` does not permute a pair of non-unit dims. /// By `order preserving` we mean that the flattened versions of the input and /// output vectors are (numerically) identical. In other words `transpose` is /// effectively a shape cast. -bool isOrderPreserving(TransposeOp transpose) { +static bool isOrderPreserving(TransposeOp transpose) { ArrayRef permutation = transpose.getPermutation(); VectorType sourceType = transpose.getSourceVectorType(); ArrayRef inShape = sourceType.getShape(); @@ -5599,8 +5597,6 @@ bool isOrderPreserving(TransposeOp transpose) { return true; } -} // namespace - OpFoldResult ShapeCastOp::fold(FoldAdaptor adaptor) { VectorType resultType = getType(); @@ -5997,18 +5993,22 @@ OpFoldResult vector::TransposeOp::fold(FoldAdaptor adaptor) { if (llvm::dyn_cast_if_present(adaptor.getVector())) return ub::PoisonAttr::get(getContext()); - // Eliminate identity transpose ops. This happens when the dimensions of the - // input vector remain in their original order after the transpose operation. - ArrayRef perm = getPermutation(); - - // Check if the permutation of the dimensions contains sequential values: - // {0, 1, 2, ...}. - for (int64_t i = 0, e = perm.size(); i < e; i++) { - if (perm[i] != i) - return {}; - } + // Eliminate identity transposes, and more generally any transposes that + // preserves the shape without permuting elements. 
+ // + // Examples of what to fold: + // %0 = vector.transpose %arg, [0, 1] : vector<1x1xi8> to vector<1x1xi8> + // %0 = vector.transpose %arg, [0, 1] : vector<2x2xi8> to vector<2x2xi8> + // %0 = vector.transpose %arg, [1, 0] : vector<1x1xi8> to vector<1x1xi8> + // + // Example of what NOT to fold: + // %0 = vector.transpose %arg, [1, 0] : vector<2x2xi8> to vector<2x2xi8> + // + if (getSourceVectorType() == getResultVectorType() && + isOrderPreserving(*this)) + return getVector(); - return getVector(); + return {}; } LogicalResult vector::TransposeOp::verify() { diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp index b9cef003fa365..060ce7d1d6643 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp @@ -109,17 +109,110 @@ struct LinearizeVectorizable final } }; -/// This pattern converts the ExtractStridedSliceOp into a ShuffleOp that works -/// on a linearized vector. -/// Following, +template +static bool stridesAllOne(TOp op) { + static_assert( + std::is_same_v || + std::is_same_v, + "expected vector.extract_strided_slice or vector.insert_strided_slice"); + ArrayAttr strides = op.getStrides(); + return llvm::all_of( + strides, [](auto stride) { return isConstantIntValue(stride, 1); }); +} + +/// Convert an array of attributes into a vector of integers, if possible. +static FailureOr> intsFromArrayAttr(ArrayAttr attrs) { + if (!attrs) + return failure(); + SmallVector ints; + ints.reserve(attrs.size()); + for (auto attr : attrs) { + if (auto intAttr = dyn_cast(attr)) { + ints.push_back(intAttr.getInt()); + } else { + return failure(); + } + } + return ints; +} + +/// Consider inserting a vector of shape `small` into a vector of shape `large`, +/// at position `offsets`: this function enumeratates all the indices in `large` +/// that are written to. The enumeration is with row-major ordering. 
+/// +/// Example: insert a 1x2 vector into a 4x5 vector at position (1,3). The 2 +/// positions written to are (1,3) and (1,4), which have linearized indices 8 +/// and 9. So [8,9] is returned. +/// +/// The length of the returned vector is equal to the number of elements in +/// the shape `small` (i.e. the product of dimensions of `small`). +SmallVector static getStridedSliceInsertionIndices( + ArrayRef small, ArrayRef large, + ArrayRef offsets) { + + // Example of alignment between, `large`, `small` and `offsets`: + // large = 4, 5, 6, 7, 8 + // small = 1, 6, 7, 8 + // offsets = 2, 3, 0 + // + // `offsets` has implicit trailing 0s, `small` has implicit leading 1s. + assert((large.size() >= small.size()) && + "rank of 'large' cannot be lower than rank of 'small'"); + assert((large.size() >= offsets.size()) && + "rank of 'large' cannot be lower than the number of offsets"); + unsigned delta = large.size() - small.size(); + unsigned nOffsets = offsets.size(); + auto getSmall = [&](int64_t i) -> int64_t { + return i >= delta ? small[i - delta] : 1; + }; + auto getOffset = [&](int64_t i) -> int64_t { + return i < nOffsets ? offsets[i] : 0; + }; + + // Using 2 vectors of indices, at each iteration populate the updated set of + // indices based on the old set of indices, and the size of the small vector + // in the current iteration. 
+ SmallVector indices{0}; + int64_t stride = 1; + for (int i = large.size() - 1; i >= 0; --i) { + int64_t currentSize = indices.size(); + int64_t smallSize = getSmall(i); + int64_t nextSize = currentSize * smallSize; + SmallVector nextIndices(nextSize); + int64_t *base = nextIndices.begin(); + int64_t offset = getOffset(i) * stride; + for (int j = 0; j < smallSize; ++j) { + for (int k = 0; k < currentSize; ++k) { + base[k] = indices[k] + offset; + } + offset += stride; + base += currentSize; + } + stride *= large[i]; + indices = std::move(nextIndices); + } + return indices; +} + +/// This pattern converts a vector.extract_strided_slice operation into a +/// vector.shuffle operation that has a rank-1 (linearized) operand and result. +/// +/// For example, the following: +/// +/// ``` /// vector.extract_strided_slice %source /// { offsets = [..], strides = [..], sizes = [..] } +/// ``` +/// /// is converted to : +/// ``` /// %source_1d = vector.shape_cast %source -/// %out_1d = vector.shuffle %source_1d, %source_1d [ shuffle_indices_1d ] -/// %out_nd = vector.shape_cast %out_1d -/// `shuffle_indices_1d` is computed using the offsets and sizes of the -/// extraction. +/// %out_1d = vector.shuffle %source_1d, %source_1d [ shuffle_indices_1d ] +/// %out_nd = vector.shape_cast %out_1d +/// ``` +/// +/// `shuffle_indices_1d` is computed using the offsets and sizes of the original +/// vector.extract_strided_slice operation. 
struct LinearizeVectorExtractStridedSlice final : public mlir::OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -129,88 +222,116 @@ struct LinearizeVectorExtractStridedSlice final : OpConversionPattern(typeConverter, context, benefit) {} LogicalResult - matchAndRewrite(vector::ExtractStridedSliceOp extractOp, OpAdaptor adaptor, + matchAndRewrite(vector::ExtractStridedSliceOp extractStridedSliceOp, + OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - VectorType dstType = - getTypeConverter()->convertType(extractOp.getType()); - assert(dstType && "vector type destination expected."); - if (extractOp.getVector().getType().isScalable() || dstType.isScalable()) - return rewriter.notifyMatchFailure(extractOp, - "scalable vectors are not supported."); - ArrayAttr offsets = extractOp.getOffsets(); - ArrayAttr sizes = extractOp.getSizes(); - ArrayAttr strides = extractOp.getStrides(); - if (!isConstantIntValue(strides[0], 1)) + VectorType flatOutputType = getTypeConverter()->convertType( + extractStridedSliceOp.getType()); + assert(flatOutputType && "vector type expected"); + + // Expect a legalization failure if the strides are not all 1 (if ever the + // verifier for extract_strided_slice allows non-1 strides). + if (!stridesAllOne(extractStridedSliceOp)) { return rewriter.notifyMatchFailure( - extractOp, "Strided slice with stride != 1 is not supported."); - Value srcVector = adaptor.getVector(); - // If kD offsets are specified for nD source vector (n > k), the granularity - // of the extraction is greater than 1. In this case last (n-k) dimensions - // form the extraction granularity. - // Example : - // vector.extract_strided_slice %src { - // offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : - // vector<4x8x8xf32> to vector<2x2x8xf32> - // Here, extraction granularity is 8. 
- int64_t extractGranularitySize = 1; - int64_t nD = extractOp.getSourceVectorType().getRank(); - int64_t kD = (int64_t)offsets.size(); - int64_t k = kD; - while (k < nD) { - extractGranularitySize *= extractOp.getSourceVectorType().getShape()[k]; - ++k; + extractStridedSliceOp, + "extract_strided_slice with strides != 1 not supported"); } - // Get total number of extracted slices. - int64_t nExtractedSlices = 1; - for (Attribute size : sizes) { - nExtractedSlices *= cast(size).getInt(); + + FailureOr> offsets = + intsFromArrayAttr(extractStridedSliceOp.getOffsets()); + if (failed(offsets)) { + return rewriter.notifyMatchFailure(extractStridedSliceOp, + "failed to get integer offsets"); } - // Compute the strides of the source vector considering first k dimensions. - llvm::SmallVector sourceStrides(kD, extractGranularitySize); - for (int i = kD - 2; i >= 0; --i) { - sourceStrides[i] = sourceStrides[i + 1] * - extractOp.getSourceVectorType().getShape()[i + 1]; + + ArrayRef inputShape = + extractStridedSliceOp.getSourceVectorType().getShape(); + + ArrayRef outputShape = extractStridedSliceOp.getType().getShape(); + + SmallVector indices = getStridedSliceInsertionIndices( + outputShape, inputShape, offsets.value()); + + Value srcVector = adaptor.getVector(); + rewriter.replaceOpWithNewOp( + extractStridedSliceOp, flatOutputType, srcVector, srcVector, indices); + return success(); + } +}; + +/// This pattern converts a vector.insert_strided_slice operation into a +/// vector.shuffle operation that has rank-1 (linearized) operands and result. 
+/// +/// For example, the following: +/// ``` +/// %0 = vector.insert_strided_slice %to_store, %into +/// {offsets = [1, 0, 0, 0], strides = [1, 1]} +/// : vector<2x2xi8> into vector<2x1x3x2xi8> +/// ``` +/// +/// is converted to +/// ``` +/// %to_store_1d +/// = vector.shape_cast %to_store : vector<2x2xi8> to vector<4xi8> +/// %into_1d = vector.shape_cast %into : vector<2x1x3x2xi8> to vector<12xi8> +/// %out_1d = vector.shuffle %into_1d, %to_store_1d [ shuffle_indices_1d ] +/// %out_nd = vector.shape_cast %out_1d : vector<12xi8> to vector<2x1x3x2xi8> +/// ``` +/// +/// where shuffle_indices_1d in this case is +/// [0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 10, 11]. +/// ^^^^^^^^^^^^^^ +/// to_store_1d +/// +struct LinearizeVectorInsertStridedSlice final + : public mlir::OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LinearizeVectorInsertStridedSlice(const TypeConverter &typeConverter, + MLIRContext *context, + PatternBenefit benefit = 1) + : OpConversionPattern(typeConverter, context, benefit) {} + + LogicalResult + matchAndRewrite(vector::InsertStridedSliceOp insertStridedSliceOp, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + // Expect a legalization failure if the strides are not all 1 (if ever the + // verifier for insert_strided_slice allows non-1 strides). + if (!stridesAllOne(insertStridedSliceOp)) { + return rewriter.notifyMatchFailure( + insertStridedSliceOp, + "insert_strided_slice with strides != 1 not supported"); } - // Final shuffle indices has nExtractedSlices * extractGranularitySize - // elements. - llvm::SmallVector indices(nExtractedSlices * - extractGranularitySize); - // Compute the strides of the extracted kD vector. - llvm::SmallVector extractedStrides(kD, 1); - // Compute extractedStrides. 
- for (int i = kD - 2; i >= 0; --i) { - extractedStrides[i] = - extractedStrides[i + 1] * cast(sizes[i + 1]).getInt(); + + VectorType inputType = insertStridedSliceOp.getValueToStore().getType(); + ArrayRef inputShape = inputType.getShape(); + + VectorType outputType = insertStridedSliceOp.getType(); + ArrayRef outputShape = outputType.getShape(); + int64_t nOutputElements = outputType.getNumElements(); + + FailureOr> offsets = + intsFromArrayAttr(insertStridedSliceOp.getOffsets()); + if (failed(offsets)) { + return rewriter.notifyMatchFailure(insertStridedSliceOp, + "failed to get integer offsets"); } - // Iterate over all extracted slices from 0 to nExtractedSlices - 1 - // and compute the multi-dimensional index and the corresponding linearized - // index within the source vector. - for (int64_t i = 0; i < nExtractedSlices; ++i) { - int64_t index = i; - // Compute the corresponding multi-dimensional index. - llvm::SmallVector multiDimIndex(kD, 0); - for (int64_t j = 0; j < kD; ++j) { - multiDimIndex[j] = (index / extractedStrides[j]); - index -= multiDimIndex[j] * extractedStrides[j]; - } - // Compute the corresponding linearized index in the source vector - // i.e. shift the multiDimIndex by the offsets. - int64_t linearizedIndex = 0; - for (int64_t j = 0; j < kD; ++j) { - linearizedIndex += - (cast(offsets[j]).getInt() + multiDimIndex[j]) * - sourceStrides[j]; - } - // Fill the indices array form linearizedIndex to linearizedIndex + - // extractGranularitySize. - for (int64_t j = 0; j < extractGranularitySize; ++j) { - indices[i * extractGranularitySize + j] = linearizedIndex + j; - } + SmallVector sliceIndices = getStridedSliceInsertionIndices( + inputShape, outputShape, offsets.value()); + + SmallVector indices(nOutputElements); + std::iota(indices.begin(), indices.end(), 0); + for (auto [index, sliceIndex] : llvm::enumerate(sliceIndices)) { + indices[sliceIndex] = index + nOutputElements; } - // Perform a shuffle to extract the kD vector. 
- rewriter.replaceOpWithNewOp( - extractOp, dstType, srcVector, srcVector, indices); + + Value flatToStore = adaptor.getValueToStore(); + Value flatDest = adaptor.getDest(); + rewriter.replaceOpWithNewOp(insertStridedSliceOp, + flatDest.getType(), flatDest, + flatToStore, indices); return success(); } }; @@ -296,7 +417,7 @@ struct LinearizeVectorExtract final // Skip if result is not a vector type if (!isa(extractOp.getType())) return rewriter.notifyMatchFailure(extractOp, - "scalar extract is not supported."); + "scalar extract not supported"); Type dstTy = getTypeConverter()->convertType(extractOp.getType()); assert(dstTy && "expected 1-D vector type"); @@ -445,47 +566,109 @@ struct LinearizeVectorSplat final } }; -} // namespace +/// This pattern converts the CreateMaskOp to work on a linearized vector. +/// It currently supports only 2D masks with a unit outer dimension. +/// Following, +/// vector.create_mask %arg0, %arg1 : vector<1x4xi1> +/// is converted to: +/// %zero = arith.constant 0 : index +/// %cmpi = arith.cmpi sgt, %arg0, %zero : index +/// %index = arith.index_cast %cmpi : i1 to index +/// %mul = arith.andi %index, %arg1 : index +/// %mask = vector.create_mask %mul : vector<4xi1> +/// %shape_cast = vector.shape_cast %mask : vector<4xi1> to vector<1x4xi1> +struct LinearizeVectorCreateMask final + : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; -/// Return true if the operation `op` does not support scalable vectors and -/// has at least 1 scalable vector result. These ops should all eventually -/// support scalable vectors, and this function should be removed. 
-static bool isNotLinearizableBecauseScalable(Operation *op) { + LinearizeVectorCreateMask(const TypeConverter &typeConverter, + MLIRContext *context, PatternBenefit benefit = 1) + : OpConversionPattern(typeConverter, context, benefit) {} - bool unsupported = - isa( - op); - if (!unsupported) - return false; + LogicalResult + matchAndRewrite(vector::CreateMaskOp createMaskOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = createMaskOp.getLoc(); + VectorType srcTy = createMaskOp.getType(); + auto srcShape = srcTy.getShape(); + if (srcShape.size() != 2) + return rewriter.notifyMatchFailure(createMaskOp, + "only 2D mask is supported."); + + if (srcShape[0] != 1) + return rewriter.notifyMatchFailure( + createMaskOp, "only unit outer dimension is supported."); - // Check if any of the results is a scalable vector type. - auto types = op->getResultTypes(); - bool containsScalableResult = - std::any_of(types.begin(), types.end(), [](Type type) { - auto vecType = dyn_cast(type); - return vecType && vecType.isScalable(); - }); + auto dstTy = getTypeConverter()->convertType(srcTy); + if (!dstTy) + return rewriter.notifyMatchFailure(createMaskOp, "cannot convert type."); + + // Compare the first operand with 0. If it is greater than 0, the + // corresponding mask element is set to true, otherwise false. + // The result of the comparison is then multiplied with + // the second operand of create_mask to get the 1D mask. 
+ auto firstOperand = adaptor.getOperands().front(); + auto zero = rewriter.create(loc, 0); + auto isNonZero = rewriter.createOrFold( + loc, mlir::arith::CmpIPredicate::sgt, firstOperand, zero); + auto isNonZeroIndex = rewriter.createOrFold( + loc, rewriter.getIndexType(), isNonZero); + auto secondOperand = adaptor.getOperands().back(); + auto maskSize = rewriter.createOrFold( + loc, rewriter.getIndexType(), isNonZeroIndex, secondOperand); + + auto newMask = + rewriter.create(loc, dstTy, maskSize); + rewriter.replaceOp(createMaskOp, newMask); + return success(); + } +}; - return containsScalableResult; -} +} // namespace -static bool isNotLinearizable(Operation *op) { +/// This method defines the set of operations that are linearizable, and hence +/// that are considered illegal for the conversion target. +static bool isLinearizable(Operation *op) { // Only ops that are in the vector dialect, are ConstantLike, or // are Vectorizable might be linearized currently. StringLiteral vectorDialect = vector::VectorDialect::getDialectNamespace(); StringRef opDialect = op->getDialect()->getNamespace(); - bool unsupported = (opDialect != vectorDialect) && - !op->hasTrait() && - !op->hasTrait(); - if (unsupported) - return true; - - // Some ops currently don't support scalable vectors. - if (isNotLinearizableBecauseScalable(op)) - return true; + bool supported = (opDialect == vectorDialect) || + op->hasTrait() || + op->hasTrait(); + if (!supported) + return false; - return false; + return TypeSwitch(op) + // As type legalization is done with vector.shape_cast, shape_cast + // itself cannot be linearized (will create new shape_casts to linearize + // ad infinitum). + .Case([&](auto) { return false; }) + // The operations + // - vector.extract_strided_slice + // - vector.extract + // - vector.insert_strided_slice + // - vector.insert + // are linearized to a rank-1 vector.shuffle by the current patterns. 
+ // vector.shuffle only supports fixed size vectors, so it is impossible to + // use this approach to linearize these ops if they operate on scalable + // vectors. + .Case( + [&](vector::ExtractStridedSliceOp extractOp) { + return !extractOp.getType().isScalable(); + }) + .Case( + [&](vector::InsertStridedSliceOp insertOp) { + return !insertOp.getType().isScalable(); + }) + .Case([&](vector::InsertOp insertOp) { + return !insertOp.getType().isScalable(); + }) + .Case([&](vector::ExtractOp extractOp) { + return !extractOp.getSourceVectorType().isScalable(); + }) + .Default([&](auto) { return true; }); } void mlir::vector::populateForVectorLinearize(TypeConverter &typeConverter, @@ -519,7 +702,7 @@ void mlir::vector::populateForVectorLinearize(TypeConverter &typeConverter, target.markUnknownOpDynamicallyLegal( [=](Operation *op) -> std::optional { - if (isNotLinearizable(op)) + if (!isLinearizable(op)) return true; // This will return true if, for all operand and result types `t`, // convertType(t) = t. This is true if there are no rank>=2 vectors. 
@@ -530,15 +713,17 @@ void mlir::vector::populateForVectorLinearize(TypeConverter &typeConverter, void mlir::vector::populateVectorLinearizeBasePatterns( const TypeConverter &typeConverter, const ConversionTarget &target, RewritePatternSet &patterns) { - patterns.add( - typeConverter, patterns.getContext()); + patterns + .add( + typeConverter, patterns.getContext()); } void mlir::vector::populateVectorLinearizeShuffleLikeOpsPatterns( const TypeConverter &typeConverter, const ConversionTarget &target, RewritePatternSet &patterns) { patterns.add( - typeConverter, patterns.getContext()); + LinearizeVectorInsert, LinearizeVectorExtractStridedSlice, + LinearizeVectorInsertStridedSlice>(typeConverter, + patterns.getContext()); } diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp index 399e902af4062..d5dd6f2027be8 100644 --- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp +++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp @@ -291,8 +291,7 @@ vector::createUnrollIterator(VectorType vType, int64_t targetRank) { // cannot be unrolled). 
auto shapeToUnroll = vType.getShape().drop_back(targetRank); auto scalableDimsToUnroll = vType.getScalableDims().drop_back(targetRank); - auto it = - std::find(scalableDimsToUnroll.begin(), scalableDimsToUnroll.end(), true); + auto it = llvm::find(scalableDimsToUnroll, true); auto firstScalableDim = it - scalableDimsToUnroll.begin(); if (firstScalableDim == 0) return {}; diff --git a/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp b/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp index 8d383b1f8103b..cc7ab7f3f3895 100644 --- a/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp +++ b/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp @@ -31,24 +31,11 @@ void x86vector::X86VectorDialect::initialize() { >(); } -static SmallVector -getMemrefBuffPtr(Location loc, ::mlir::TypedValue<::mlir::MemRefType> memrefVal, - RewriterBase &rewriter, - const LLVMTypeConverter &typeConverter) { - SmallVector operands; - auto opType = memrefVal.getType(); - - Type llvmStructType = typeConverter.convertType(opType); - Value llvmStruct = - rewriter - .create(loc, llvmStructType, memrefVal) - .getResult(0); - MemRefDescriptor memRefDescriptor(llvmStruct); - - Value ptr = memRefDescriptor.bufferPtr(rewriter, loc, typeConverter, opType); - operands.push_back(ptr); - - return operands; +static Value getMemrefBuffPtr(Location loc, MemRefType type, Value buffer, + const LLVMTypeConverter &typeConverter, + RewriterBase &rewriter) { + MemRefDescriptor memRefDescriptor(buffer); + return memRefDescriptor.bufferPtr(rewriter, loc, typeConverter, type); } LogicalResult x86vector::MaskCompressOp::verify() { @@ -66,48 +53,61 @@ LogicalResult x86vector::MaskCompressOp::verify() { } SmallVector x86vector::MaskCompressOp::getIntrinsicOperands( - RewriterBase &rewriter, const LLVMTypeConverter &typeConverter) { + ArrayRef operands, const LLVMTypeConverter &typeConverter, + RewriterBase &rewriter) { auto loc = getLoc(); + Adaptor adaptor(operands, *this); - auto opType = getA().getType(); + auto 
opType = adaptor.getA().getType(); Value src; - if (getSrc()) { - src = getSrc(); - } else if (getConstantSrc()) { - src = rewriter.create(loc, opType, getConstantSrcAttr()); + if (adaptor.getSrc()) { + src = adaptor.getSrc(); + } else if (adaptor.getConstantSrc()) { + src = rewriter.create(loc, opType, + adaptor.getConstantSrcAttr()); } else { auto zeroAttr = rewriter.getZeroAttr(opType); src = rewriter.create(loc, opType, zeroAttr); } - return SmallVector{getA(), src, getK()}; + return SmallVector{adaptor.getA(), src, adaptor.getK()}; } SmallVector -x86vector::DotOp::getIntrinsicOperands(RewriterBase &rewriter, - const LLVMTypeConverter &typeConverter) { - SmallVector operands(getOperands()); +x86vector::DotOp::getIntrinsicOperands(ArrayRef operands, + const LLVMTypeConverter &typeConverter, + RewriterBase &rewriter) { + SmallVector intrinsicOperands(operands); // Dot product of all elements, broadcasted to all elements. Value scale = rewriter.create(getLoc(), rewriter.getI8Type(), 0xff); - operands.push_back(scale); + intrinsicOperands.push_back(scale); - return operands; + return intrinsicOperands; } SmallVector x86vector::BcstToPackedF32Op::getIntrinsicOperands( - RewriterBase &rewriter, const LLVMTypeConverter &typeConverter) { - return getMemrefBuffPtr(getLoc(), getA(), rewriter, typeConverter); + ArrayRef operands, const LLVMTypeConverter &typeConverter, + RewriterBase &rewriter) { + Adaptor adaptor(operands, *this); + return {getMemrefBuffPtr(getLoc(), getA().getType(), adaptor.getA(), + typeConverter, rewriter)}; } SmallVector x86vector::CvtPackedEvenIndexedToF32Op::getIntrinsicOperands( - RewriterBase &rewriter, const LLVMTypeConverter &typeConverter) { - return getMemrefBuffPtr(getLoc(), getA(), rewriter, typeConverter); + ArrayRef operands, const LLVMTypeConverter &typeConverter, + RewriterBase &rewriter) { + Adaptor adaptor(operands, *this); + return {getMemrefBuffPtr(getLoc(), getA().getType(), adaptor.getA(), + typeConverter, rewriter)}; } 
SmallVector x86vector::CvtPackedOddIndexedToF32Op::getIntrinsicOperands( - RewriterBase &rewriter, const LLVMTypeConverter &typeConverter) { - return getMemrefBuffPtr(getLoc(), getA(), rewriter, typeConverter); + ArrayRef operands, const LLVMTypeConverter &typeConverter, + RewriterBase &rewriter) { + Adaptor adaptor(operands, *this); + return {getMemrefBuffPtr(getLoc(), getA().getType(), adaptor.getA(), + typeConverter, rewriter)}; } #define GET_OP_CLASSES diff --git a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp index 9ee44a63ba2e4..483c1f5c3e4c6 100644 --- a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp +++ b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp @@ -84,20 +84,23 @@ LogicalResult intrinsicRewrite(Operation *op, StringAttr intrinsic, /// Generic one-to-one conversion of simply mappable operations into calls /// to their respective LLVM intrinsics. struct OneToOneIntrinsicOpConversion - : public OpInterfaceRewritePattern { - using OpInterfaceRewritePattern< - x86vector::OneToOneIntrinsicOp>::OpInterfaceRewritePattern; + : public OpInterfaceConversionPattern { + using OpInterfaceConversionPattern< + x86vector::OneToOneIntrinsicOp>::OpInterfaceConversionPattern; OneToOneIntrinsicOpConversion(const LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1) - : OpInterfaceRewritePattern(&typeConverter.getContext(), benefit), + : OpInterfaceConversionPattern(typeConverter, &typeConverter.getContext(), + benefit), typeConverter(typeConverter) {} - LogicalResult matchAndRewrite(x86vector::OneToOneIntrinsicOp op, - PatternRewriter &rewriter) const override { - return intrinsicRewrite(op, rewriter.getStringAttr(op.getIntrinsicName()), - op.getIntrinsicOperands(rewriter, typeConverter), - typeConverter, rewriter); + LogicalResult + matchAndRewrite(x86vector::OneToOneIntrinsicOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const 
override { + return intrinsicRewrite( + op, rewriter.getStringAttr(op.getIntrinsicName()), + op.getIntrinsicOperands(operands, typeConverter, rewriter), + typeConverter, rewriter); } private: diff --git a/mlir/lib/ExecutionEngine/JitRunner.cpp b/mlir/lib/ExecutionEngine/JitRunner.cpp index cf462ddf6f17c..2107df37d1997 100644 --- a/mlir/lib/ExecutionEngine/JitRunner.cpp +++ b/mlir/lib/ExecutionEngine/JitRunner.cpp @@ -222,9 +222,14 @@ static Error compileAndExecuteVoidFunction( CompileAndExecuteConfig config, std::unique_ptr tm) { auto mainFunction = dyn_cast_or_null( SymbolTable::lookupSymbolIn(module, entryPoint)); - if (!mainFunction || mainFunction.empty()) + if (!mainFunction || mainFunction.isExternal()) return makeStringError("entry point not found"); + if (cast(mainFunction.getFunctionType()) + .getNumParams() != 0) + return makeStringError( + "JIT can't invoke a main function expecting arguments"); + auto resultType = dyn_cast( mainFunction.getFunctionType().getReturnType()); if (!resultType) @@ -274,7 +279,8 @@ Error compileAndExecuteSingleReturnFunction( if (cast(mainFunction.getFunctionType()) .getNumParams() != 0) - return makeStringError("function inputs not supported"); + return makeStringError( + "JIT can't invoke a main function expecting arguments"); if (Error error = checkCompatibleReturnType(mainFunction)) return error; diff --git a/mlir/lib/Pass/Pass.cpp b/mlir/lib/Pass/Pass.cpp index 8d1fbcdb19a9d..e0e9b5f54042a 100644 --- a/mlir/lib/Pass/Pass.cpp +++ b/mlir/lib/Pass/Pass.cpp @@ -689,7 +689,7 @@ LogicalResult OpToOpPassAdaptor::tryMergeInto(MLIRContext *ctx, } return false; // lhs(op-agnostic) > rhs(op-specific) }; - std::sort(rhs.mgrs.begin(), rhs.mgrs.end(), compareFn); + llvm::sort(rhs.mgrs, compareFn); return success(); } diff --git a/mlir/lib/TableGen/Pattern.cpp b/mlir/lib/TableGen/Pattern.cpp index ab605391faf6a..13541de66578d 100644 --- a/mlir/lib/TableGen/Pattern.cpp +++ b/mlir/lib/TableGen/Pattern.cpp @@ -304,8 +304,8 @@ std::string 
SymbolInfoMap::SymbolInfo::getValueAndRangeUse( assert(index < 0); auto *operand = cast(op->getArg(getArgIndex())); if (operand->isOptional()) { - auto repl = - formatv(fmt, formatv("({0}.empty() ? Value() : *{0}.begin())", name)); + auto repl = formatv( + fmt, formatv("({0}.empty() ? ::mlir::Value() : *{0}.begin())", name)); LLVM_DEBUG(dbgs() << repl << " (OptionalOperand)\n"); return std::string(repl); } diff --git a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp index b21db4aa18284..187e2a9b75a9b 100644 --- a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp +++ b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp @@ -44,6 +44,12 @@ void registerFromLLVMIRTranslation() { "of using dialect supported intrinsics"), llvm::cl::init(false)); + static llvm::cl::opt importStructsAsLiterals( + "import-structs-as-literals", + llvm::cl::desc("Controls if structs should be imported as literal " + "structs, i.e., nameless structs."), + llvm::cl::init(false)); + TranslateToMLIRRegistration registration( "import-llvm", "Translate LLVMIR to MLIR", [](llvm::SourceMgr &sourceMgr, @@ -70,7 +76,7 @@ void registerFromLLVMIRTranslation() { return translateLLVMIRToModule( std::move(llvmModule), context, emitExpensiveWarnings, dropDICompositeTypeElements, /*loadAllDialects=*/true, - preferUnregisteredIntrinsics); + preferUnregisteredIntrinsics, importStructsAsLiterals); }, [](DialectRegistry ®istry) { // Register the DLTI dialect used to express the data layout diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index 4ea313019f34d..d9605b19aaaa5 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -515,6 +515,8 @@ convertOperationImpl(Operation &opInst, llvm::IRBuilderBase &builder, if (!attr) continue; DictionaryAttr dAttr = cast(attr); + if (dAttr.empty()) 
+ continue; TypeAttr tAttr = cast(dAttr.get(InlineAsmOp::getElementTypeAttrName())); llvm::AttrBuilder b(moduleTranslation.getLLVMContext()); @@ -690,19 +692,13 @@ convertOperationImpl(Operation &opInst, llvm::IRBuilderBase &builder, // Emit blockaddress. We first need to find the LLVM block referenced by this // operation and then create a LLVM block address for it. if (auto blockAddressOp = dyn_cast(opInst)) { - // getBlockTagOp() walks a function to search for block labels. Check - // whether it's in cache first. BlockAddressAttr blockAddressAttr = blockAddressOp.getBlockAddr(); - BlockTagOp blockTagOp = moduleTranslation.lookupBlockTag(blockAddressAttr); - if (!blockTagOp) { - blockTagOp = blockAddressOp.getBlockTagOp(); - moduleTranslation.mapBlockTag(blockAddressAttr, blockTagOp); - } + llvm::BasicBlock *llvmBlock = + moduleTranslation.lookupBlockAddress(blockAddressAttr); llvm::Value *llvmValue = nullptr; StringRef fnName = blockAddressAttr.getFunction().getValue(); - if (llvm::BasicBlock *llvmBlock = - moduleTranslation.lookupBlock(blockTagOp->getBlock())) { + if (llvmBlock) { llvm::Function *llvmFn = moduleTranslation.lookupFunction(fnName); llvmValue = llvm::BlockAddress::get(llvmFn, llvmBlock); } else { @@ -736,7 +732,8 @@ convertOperationImpl(Operation &opInst, llvm::IRBuilderBase &builder, FlatSymbolRefAttr::get(&moduleTranslation.getContext(), funcOp.getName()), blockTagOp.getTag()); - moduleTranslation.mapBlockTag(blockAddressAttr, blockTagOp); + moduleTranslation.mapBlockAddress(blockAddressAttr, + builder.GetInsertBlock()); return success(); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 9f7b5605556e6..010c46358f7df 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -3720,6 +3720,9 @@ static llvm::omp::OpenMPOffloadMappingFlags 
mapParentWithMembers( LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder &ompBuilder, DataLayout &dl, MapInfosTy &combinedInfo, MapInfoData &mapData, uint64_t mapDataIndex, bool isTargetParams) { + assert(!ompBuilder.Config.isTargetDevice() && + "function only supported for host device codegen"); + // Map the first segment of our structure combinedInfo.Types.emplace_back( isTargetParams @@ -3828,6 +3831,8 @@ static void processMapMembersWithParent( llvm::OpenMPIRBuilder &ompBuilder, DataLayout &dl, MapInfosTy &combinedInfo, MapInfoData &mapData, uint64_t mapDataIndex, llvm::omp::OpenMPOffloadMappingFlags memberOfFlag) { + assert(!ompBuilder.Config.isTargetDevice() && + "function only supported for host device codegen"); auto parentClause = llvm::cast(mapData.MapClause[mapDataIndex]); @@ -3941,6 +3946,9 @@ static void processMapWithMembersOf(LLVM::ModuleTranslation &moduleTranslation, DataLayout &dl, MapInfosTy &combinedInfo, MapInfoData &mapData, uint64_t mapDataIndex, bool isTargetParams) { + assert(!ompBuilder.Config.isTargetDevice() && + "function only supported for host device codegen"); + auto parentClause = llvm::cast(mapData.MapClause[mapDataIndex]); @@ -3982,6 +3990,8 @@ static void createAlteredByCaptureMap(MapInfoData &mapData, LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder) { + assert(!moduleTranslation.getOpenMPBuilder()->Config.isTargetDevice() && + "function only supported for host device codegen"); for (size_t i = 0; i < mapData.MapClause.size(); ++i) { // if it's declare target, skip it, it's handled separately. 
if (!mapData.IsDeclareTarget[i]) { @@ -4046,6 +4056,9 @@ static void genMapInfos(llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, DataLayout &dl, MapInfosTy &combinedInfo, MapInfoData &mapData, bool isTargetParams = false) { + assert(!moduleTranslation.getOpenMPBuilder()->Config.isTargetDevice() && + "function only supported for host device codegen"); + // We wish to modify some of the methods in which arguments are // passed based on their capture type by the target region, this can // involve generating new loads and stores, which changes the @@ -4057,8 +4070,7 @@ static void genMapInfos(llvm::IRBuilderBase &builder, // kernel arg structure. It primarily becomes relevant in cases like // bycopy, or byref range'd arrays. In the default case, we simply // pass thee pointer byref as both basePointer and pointer. - if (!moduleTranslation.getOpenMPBuilder()->Config.isTargetDevice()) - createAlteredByCaptureMap(mapData, moduleTranslation, builder); + createAlteredByCaptureMap(mapData, moduleTranslation, builder); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); @@ -4092,6 +4104,8 @@ emitUserDefinedMapper(Operation *declMapperOp, llvm::IRBuilderBase &builder, static llvm::Expected getOrCreateUserDefinedMapperFunc(Operation *op, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { + assert(!moduleTranslation.getOpenMPBuilder()->Config.isTargetDevice() && + "function only supported for host device codegen"); auto declMapperOp = cast(op); std::string mapperFuncName = moduleTranslation.getOpenMPBuilder()->createPlatformSpecificName( @@ -4108,6 +4122,8 @@ static llvm::Expected emitUserDefinedMapper(Operation *op, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, llvm::StringRef mapperFuncName) { + assert(!moduleTranslation.getOpenMPBuilder()->Config.isTargetDevice() && + "function only supported for host device codegen"); auto declMapperOp = cast(op); auto declMapperInfoOp = 
declMapperOp.getDeclareMapperInfo(); DataLayout dl = DataLayout(declMapperOp->getParentOfType()); @@ -4597,6 +4613,8 @@ static void handleDeclareTargetMapVar(MapInfoData &mapData, LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder, llvm::Function *func) { + assert(moduleTranslation.getOpenMPBuilder()->Config.isTargetDevice() && + "function only supported for target device codegen"); for (size_t i = 0; i < mapData.MapClause.size(); ++i) { // In the case of declare target mapped variables, the basePointer is // the reference pointer generated by the convertDeclareTargetAttr @@ -4689,6 +4707,8 @@ createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg, LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase::InsertPoint allocaIP, llvm::IRBuilderBase::InsertPoint codeGenIP) { + assert(ompBuilder.Config.isTargetDevice() && + "function only supported for target device codegen"); builder.restoreIP(allocaIP); omp::VariableCaptureKind capture = omp::VariableCaptureKind::ByRef; diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 7d7d0bb02a3d7..8945ae933dd65 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -164,11 +164,12 @@ ModuleImport::ModuleImport(ModuleOp mlirModule, std::unique_ptr llvmModule, bool emitExpensiveWarnings, bool importEmptyDICompositeTypes, - bool preferUnregisteredIntrinsics) + bool preferUnregisteredIntrinsics, + bool importStructsAsLiterals) : builder(mlirModule->getContext()), context(mlirModule->getContext()), mlirModule(mlirModule), llvmModule(std::move(llvmModule)), iface(mlirModule->getContext()), - typeTranslator(*mlirModule->getContext()), + typeTranslator(*mlirModule->getContext(), importStructsAsLiterals), debugImporter(std::make_unique( mlirModule, importEmptyDICompositeTypes)), loopAnnotationImporter( @@ -2073,6 +2074,40 @@ LogicalResult ModuleImport::convertIntrinsic(llvm::CallInst *inst) { return 
emitError(loc) << "unhandled intrinsic: " << diag(*inst); } +ArrayAttr +ModuleImport::convertAsmInlineOperandAttrs(const llvm::CallBase &llvmCall) { + const auto *ia = cast(llvmCall.getCalledOperand()); + unsigned argIdx = 0; + SmallVector opAttrs; + bool hasIndirect = false; + + for (const llvm::InlineAsm::ConstraintInfo &ci : ia->ParseConstraints()) { + // Only deal with constraints that correspond to call arguments. + if (ci.Type == llvm::InlineAsm::isLabel || !ci.hasArg()) + continue; + + // Only increment `argIdx` in terms of constraints containing arguments, + // which are guaranteed to happen in the same order of the call arguments. + if (ci.isIndirect) { + if (llvm::Type *paramEltType = llvmCall.getParamElementType(argIdx)) { + SmallVector attrs; + attrs.push_back(builder.getNamedAttr( + mlir::LLVM::InlineAsmOp::getElementTypeAttrName(), + mlir::TypeAttr::get(convertType(paramEltType)))); + opAttrs.push_back(builder.getDictionaryAttr(attrs)); + hasIndirect = true; + } + } else { + opAttrs.push_back(builder.getDictionaryAttr({})); + } + argIdx++; + } + + // Avoid emitting an array where all entries are empty dictionaries. + return hasIndirect ? ArrayAttr::get(mlirModule->getContext(), opAttrs) + : nullptr; +} + LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) { // Convert all instructions that do not provide an MLIR builder. 
Location loc = translateLoc(inst->getDebugLoc()); @@ -2159,14 +2194,17 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) { Type resultTy = convertType(callInst->getType()); if (!resultTy) return failure(); + ArrayAttr operandAttrs = convertAsmInlineOperandAttrs(*callInst); return builder .create( loc, resultTy, *operands, builder.getStringAttr(asmI->getAsmString()), builder.getStringAttr(asmI->getConstraintString()), - /*has_side_effects=*/true, - /*is_align_stack=*/false, /*asm_dialect=*/nullptr, - /*operand_attrs=*/nullptr) + asmI->hasSideEffects(), asmI->isAlignStack(), + AsmDialectAttr::get( + mlirModule.getContext(), + convertAsmDialectFromLLVM(asmI->getDialect())), + operandAttrs) .getOperation(); } bool isIncompatibleCall; @@ -3081,7 +3119,8 @@ ModuleImport::translateDereferenceableAttr(const llvm::MDNode *node, OwningOpRef mlir::translateLLVMIRToModule( std::unique_ptr llvmModule, MLIRContext *context, bool emitExpensiveWarnings, bool dropDICompositeTypeElements, - bool loadAllDialects, bool preferUnregisteredIntrinsics) { + bool loadAllDialects, bool preferUnregisteredIntrinsics, + bool importStructsAsLiterals) { // Preload all registered dialects to allow the import to iterate the // registered LLVMImportDialectInterface implementations and query the // supported LLVM IR constructs before starting the translation. 
Assumes the @@ -3099,7 +3138,8 @@ OwningOpRef mlir::translateLLVMIRToModule( ModuleImport moduleImport(module.get(), std::move(llvmModule), emitExpensiveWarnings, dropDICompositeTypeElements, - preferUnregisteredIntrinsics); + preferUnregisteredIntrinsics, + importStructsAsLiterals); if (failed(moduleImport.initializeImportInterface())) return {}; if (failed(moduleImport.convertDataLayout())) diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 1168b9f339904..95b8ee0331c55 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -1843,17 +1843,13 @@ LogicalResult ModuleTranslation::convertComdats() { LogicalResult ModuleTranslation::convertUnresolvedBlockAddress() { for (auto &[blockAddressOp, llvmCst] : unresolvedBlockAddressMapping) { BlockAddressAttr blockAddressAttr = blockAddressOp.getBlockAddr(); - BlockTagOp blockTagOp = lookupBlockTag(blockAddressAttr); - assert(blockTagOp && "expected all block tags to be already seen"); - - llvm::BasicBlock *llvmBlock = lookupBlock(blockTagOp->getBlock()); + llvm::BasicBlock *llvmBlock = lookupBlockAddress(blockAddressAttr); assert(llvmBlock && "expected LLVM blocks to be already translated"); // Update mapping with new block address constant. 
auto *llvmBlockAddr = llvm::BlockAddress::get( lookupFunction(blockAddressAttr.getFunction().getValue()), llvmBlock); llvmCst->replaceAllUsesWith(llvmBlockAddr); - mapValue(blockAddressOp.getResult(), llvmBlockAddr); assert(llvmCst->use_empty() && "expected all uses to be replaced"); cast(llvmCst)->eraseFromParent(); } diff --git a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp index c46aa3e80d51a..5d9345d707a44 100644 --- a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp +++ b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp @@ -12,7 +12,6 @@ #include "mlir/IR/MLIRContext.h" #include "llvm/ADT/TypeSwitch.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" @@ -25,7 +24,9 @@ namespace detail { class TypeFromLLVMIRTranslatorImpl { public: /// Constructs a class creating types in the given MLIR context. - TypeFromLLVMIRTranslatorImpl(MLIRContext &context) : context(context) {} + TypeFromLLVMIRTranslatorImpl(MLIRContext &context, + bool importStructsAsLiterals) + : context(context), importStructsAsLiterals(importStructsAsLiterals) {} /// Translates the given type. Type translateType(llvm::Type *type) { @@ -103,7 +104,7 @@ class TypeFromLLVMIRTranslatorImpl { /// Translates the given structure type. Type translate(llvm::StructType *type) { SmallVector subtypes; - if (type->isLiteral()) { + if (type->isLiteral() || importStructsAsLiterals) { translateTypes(type->subtypes(), subtypes); return LLVM::LLVMStructType::getLiteral(&context, subtypes, type->isPacked()); @@ -132,7 +133,7 @@ class TypeFromLLVMIRTranslatorImpl { Type translate(llvm::ScalableVectorType *type) { return VectorType::get(type->getMinNumElements(), translateType(type->getElementType()), - /*scalable=*/true); + /*scalableDims=*/true); } /// Translates the given target extension type. @@ -158,14 +159,20 @@ class TypeFromLLVMIRTranslatorImpl { /// The context in which MLIR types are created. 
MLIRContext &context; + + /// Controls if structs should be imported as literal structs, i.e., nameless + /// structs. + bool importStructsAsLiterals; }; } // namespace detail } // namespace LLVM } // namespace mlir -LLVM::TypeFromLLVMIRTranslator::TypeFromLLVMIRTranslator(MLIRContext &context) - : impl(new detail::TypeFromLLVMIRTranslatorImpl(context)) {} +LLVM::TypeFromLLVMIRTranslator::TypeFromLLVMIRTranslator( + MLIRContext &context, bool importStructsAsLiterals) + : impl(std::make_unique( + context, importStructsAsLiterals)) {} LLVM::TypeFromLLVMIRTranslator::~TypeFromLLVMIRTranslator() = default; diff --git a/mlir/lib/Transforms/CompositePass.cpp b/mlir/lib/Transforms/CompositePass.cpp index b388a28da6424..16e276e3f41b7 100644 --- a/mlir/lib/Transforms/CompositePass.cpp +++ b/mlir/lib/Transforms/CompositePass.cpp @@ -35,7 +35,9 @@ struct CompositeFixedPointPass final populateFunc(dynamicPM); llvm::raw_string_ostream os(pipelineStr); - dynamicPM.printAsTextualPipeline(os); + llvm::interleave( + dynamicPM, [&](mlir::Pass &pass) { pass.printAsTextualPipeline(os); }, + [&]() { os << ","; }); } LogicalResult initializeOptions( diff --git a/mlir/test/Conversion/MeshToMPI/convert-mesh-to-mpi.mlir b/mlir/test/Conversion/MeshToMPI/convert-mesh-to-mpi.mlir index 23756bb66928d..d314ad3ac30fd 100644 --- a/mlir/test/Conversion/MeshToMPI/convert-mesh-to-mpi.mlir +++ b/mlir/test/Conversion/MeshToMPI/convert-mesh-to-mpi.mlir @@ -193,7 +193,7 @@ module attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 24> } { // CHECK-NEXT: [[vc44_i32:%.*]] = arith.constant 44 : i32 // CHECK-NEXT: [[vc4_i32:%.*]] = arith.constant 4 : i32 // CHECK-NEXT: [[vc91_i32:%.*]] = arith.constant 91 : i32 - // CHECK-NEXT: [[v0:%.*]] = bufferization.to_memref [[varg0]] : tensor<120x120x120xi8> to memref<120x120x120xi8> + // CHECK-NEXT: [[v0:%.*]] = bufferization.to_buffer [[varg0]] : tensor<120x120x120xi8> to memref<120x120x120xi8> // CHECK-NEXT: [[v1:%.*]] = mpi.comm_world : !mpi.comm // 
CHECK-NEXT: [[valloc:%.*]] = memref.alloc() : memref<117x113x5xi8> // CHECK-NEXT: [[vsubview:%.*]] = memref.subview [[v0]][1, 3, 109] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>> diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index c7a6eca158276..8d720ce62a91b 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -680,3 +680,28 @@ llvm.func @llvm_nvvm_barrier_arrive(%barID : i32, %numberOfThreads : i32) { nvvm.barrier.arrive id = %barID number_of_threads = %numberOfThreads llvm.return } + + +// ----- + +llvm.func @init_mbarrier( + %barrier_gen : !llvm.ptr, + %barrier : !llvm.ptr<3>, + %count : i32, + %pred : i1) { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.init.b64 [$0], $1;", "l,r" + nvvm.inline_ptx "mbarrier.init.b64 [$0], $1;" (%barrier_gen, %count) : !llvm.ptr, i32 + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b" + nvvm.inline_ptx "mbarrier.init.b64 [$0], $1;" (%barrier_gen, %count), predicate = %pred : !llvm.ptr, i32, i1 + llvm.return +} +// ----- + +llvm.func @ex2(%input : f32, %pred : i1) { + // CHECK: %{{.*}} = llvm.inline_asm has_side_effects asm_dialect = att "ex2.approx.ftz.f32 $0, $1;", "=f,f" %{{.*}} : (f32) -> f32 + %0 = nvvm.inline_ptx "ex2.approx.ftz.f32 $0, $1;" (%input) : f32 -> f32 + + // CHECK: %{{.*}} = llvm.inline_asm has_side_effects asm_dialect = att "@$1 ex2.approx.ftz.f32 $0, $1;", "=f,f,b" %{{.*}}, %{{.*}} : (f32, i1) -> f32 + %1 = nvvm.inline_ptx "ex2.approx.ftz.f32 $0, $1;" (%input), predicate = %pred : f32, i1 -> f32 + llvm.return +} diff --git a/mlir/test/Conversion/OpenACCToSCF/convert-openacc-to-scf.mlir b/mlir/test/Conversion/OpenACCToSCF/convert-openacc-to-scf.mlir index d8e89f64f8bc0..c08fd860e738b 100644 --- 
a/mlir/test/Conversion/OpenACCToSCF/convert-openacc-to-scf.mlir +++ b/mlir/test/Conversion/OpenACCToSCF/convert-openacc-to-scf.mlir @@ -68,20 +68,20 @@ func.func @update_false(%arg0: memref) { func.func @enter_data_true(%d1 : memref) { %true = arith.constant true %0 = acc.create varPtr(%d1 : memref) -> memref - acc.enter_data if(%true) dataOperands(%0 : memref) attributes {async} + acc.enter_data async if(%true) dataOperands(%0 : memref) return } // CHECK-LABEL: func.func @enter_data_true // CHECK-NOT: if -// CHECK: acc.enter_data dataOperands +// CHECK: acc.enter_data async dataOperands // ----- func.func @enter_data_false(%d1 : memref) { %false = arith.constant false %0 = acc.create varPtr(%d1 : memref) -> memref - acc.enter_data if(%false) dataOperands(%0 : memref) attributes {async} + acc.enter_data async if(%false) dataOperands(%0 : memref) return } @@ -93,21 +93,21 @@ func.func @enter_data_false(%d1 : memref) { func.func @exit_data_true(%d1 : memref) { %true = arith.constant true %0 = acc.getdeviceptr varPtr(%d1 : memref) -> memref - acc.exit_data if(%true) dataOperands(%0 : memref) attributes {async} + acc.exit_data async if(%true) dataOperands(%0 : memref) acc.delete accPtr(%0 : memref) return } // CHECK-LABEL: func.func @exit_data_true // CHECK-NOT:if -// CHECK:acc.exit_data dataOperands +// CHECK:acc.exit_data async dataOperands // ----- func.func @exit_data_false(%d1 : memref) { %false = arith.constant false %0 = acc.getdeviceptr varPtr(%d1 : memref) -> memref - acc.exit_data if(%false) dataOperands(%0 : memref) attributes {async} + acc.exit_data async if(%false) dataOperands(%0 : memref) acc.delete accPtr(%0 : memref) return } diff --git a/mlir/test/Dialect/Affine/loop-fusion-4.mlir b/mlir/test/Dialect/Affine/loop-fusion-4.mlir index 4b9eca45492fb..ca8099b9bb51f 100644 --- a/mlir/test/Dialect/Affine/loop-fusion-4.mlir +++ b/mlir/test/Dialect/Affine/loop-fusion-4.mlir @@ -247,7 +247,7 @@ module { ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: 
index): tensor.yield %cst_f32 : f32 } : tensor<1x32x32x8xf32> to tensor<1x40x8229x8xf32> - %1 = bufferization.to_memref %padded : tensor<1x40x8229x8xf32> to memref<1x40x8229x8xf32> + %1 = bufferization.to_buffer %padded : tensor<1x40x8229x8xf32> to memref<1x40x8229x8xf32> %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<1x32x32x8xf32> affine.for %arg1 = 0 to 1 { affine.for %arg2 = 0 to 32 { diff --git a/mlir/test/Dialect/Arith/bufferize.mlir b/mlir/test/Dialect/Arith/bufferize.mlir index 0b7838e1471d3..d9d0cde642bef 100644 --- a/mlir/test/Dialect/Arith/bufferize.mlir +++ b/mlir/test/Dialect/Arith/bufferize.mlir @@ -7,7 +7,7 @@ func.func @index_cast(%tensor: tensor, %scalar: i32) -> (tensor, ind %index_scalar = arith.index_cast %scalar : i32 to index return %index_tensor, %index_scalar : tensor, index } -// CHECK: %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor +// CHECK: %[[MEMREF:.*]] = bufferization.to_buffer %[[TENSOR]] : tensor // CHECK-NEXT: %[[INDEX_MEMREF:.*]] = arith.index_cast %[[MEMREF]] // CHECK-SAME: memref to memref // CHECK-NEXT: %[[INDEX_TENSOR:.*]] = bufferization.to_tensor %[[INDEX_MEMREF]] @@ -83,8 +83,8 @@ func.func @non_tensor() { // CHECK-SAME: %[[PRED:.*]]: i1, // CHECK-SAME: %[[TRUE_VAL:.*]]: tensor, // CHECK-SAME: %[[FALSE_VAL:.*]]: tensor) -> tensor { -// CHECK-DAG: %[[TRUE_VAL_MEMREF:.*]] = bufferization.to_memref %[[TRUE_VAL]] : tensor -// CHECK-DAG: %[[FALSE_VAL_MEMREF:.*]] = bufferization.to_memref %[[FALSE_VAL]] : tensor +// CHECK-DAG: %[[TRUE_VAL_MEMREF:.*]] = bufferization.to_buffer %[[TRUE_VAL]] : tensor +// CHECK-DAG: %[[FALSE_VAL_MEMREF:.*]] = bufferization.to_buffer %[[FALSE_VAL]] : tensor // CHECK: %[[RET_MEMREF:.*]] = arith.select %[[PRED]], %[[TRUE_VAL_MEMREF]], %[[FALSE_VAL_MEMREF]] : memref // CHECK: %[[RET:.*]] = bufferization.to_tensor %[[RET_MEMREF]] : memref // CHECK: return %[[RET]] : tensor diff --git a/mlir/test/Dialect/ArmSVE/legalize-for-llvm.mlir 
b/mlir/test/Dialect/ArmSVE/legalize-for-llvm.mlir index bdb69a95a52de..8c658db009adf 100644 --- a/mlir/test/Dialect/ArmSVE/legalize-for-llvm.mlir +++ b/mlir/test/Dialect/ArmSVE/legalize-for-llvm.mlir @@ -48,6 +48,18 @@ func.func @arm_sve_ummla(%a: vector<[16]xi8>, // ----- +func.func @arm_sve_usmmla(%a: vector<[16]xi8>, + %b: vector<[16]xi8>, + %c: vector<[4]xi32>) + -> vector<[4]xi32> { + // CHECK: arm_sve.intr.usmmla + %0 = arm_sve.usmmla %c, %a, %b : + vector<[16]xi8> to vector<[4]xi32> + return %0 : vector<[4]xi32> +} + +// ----- + func.func @arm_sve_arithi_masked(%a: vector<[4]xi32>, %b: vector<[4]xi32>, %c: vector<[4]xi32>, @@ -271,3 +283,44 @@ func.func @arm_sve_psel_mixed_predicate_types(%p0: vector<[8]xi1>, %p1: vector<[ %0 = arm_sve.psel %p0, %p1[%index] : vector<[8]xi1>, vector<[16]xi1> return %0 : vector<[8]xi1> } + +// ----- + +// CHECK-LABEL: @arm_sve_dupq_lane( +// CHECK-SAME: %[[A0:[a-z0-9]+]]: vector<[16]xi8> +// CHECK-SAME: %[[A1:[a-z0-9]+]]: vector<[8]xi16> +// CHECK-SAME: %[[A2:[a-z0-9]+]]: vector<[8]xf16> +// CHECK-SAME: %[[A3:[a-z0-9]+]]: vector<[8]xbf16> +// CHECK-SAME: %[[A4:[a-z0-9]+]]: vector<[4]xi32> +// CHECK-SAME: %[[A5:[a-z0-9]+]]: vector<[4]xf32> +// CHECK-SAME: %[[A6:[a-z0-9]+]]: vector<[2]xi64> +// CHECK-SAME: %[[A7:[a-z0-9]+]]: vector<[2]xf64> +// CHECK-SAME: -> !llvm.struct<(vector<[16]xi8>, vector<[8]xi16>, vector<[8]xf16>, vector<[8]xbf16>, vector<[4]xi32>, vector<[4]xf32>, vector<[2]xi64>, vector<[2]xf64>)> { +func.func @arm_sve_dupq_lane( + %v16i8: vector<[16]xi8>, %v8i16: vector<[8]xi16>, + %v8f16: vector<[8]xf16>, %v8bf16: vector<[8]xbf16>, + %v4i32: vector<[4]xi32>, %v4f32: vector<[4]xf32>, + %v2i64: vector<[2]xi64>, %v2f64: vector<[2]xf64>) + -> (vector<[16]xi8>, vector<[8]xi16>, vector<[8]xf16>, vector<[8]xbf16>, + vector<[4]xi32>, vector<[4]xf32>, vector<[2]xi64>, vector<[2]xf64>) { +// CHECK: "arm_sve.intr.dupq_lane"(%[[A0]]) <{lane = 0 : i64}> : (vector<[16]xi8>) -> vector<[16]xi8> + %0 = arm_sve.dupq_lane %v16i8[0] : 
vector<[16]xi8> +// CHECK: "arm_sve.intr.dupq_lane"(%[[A1]]) <{lane = 1 : i64}> : (vector<[8]xi16>) -> vector<[8]xi16> + %1 = arm_sve.dupq_lane %v8i16[1] : vector<[8]xi16> +// CHECK: "arm_sve.intr.dupq_lane"(%[[A2]]) <{lane = 2 : i64}> : (vector<[8]xf16>) -> vector<[8]xf16> + %2 = arm_sve.dupq_lane %v8f16[2] : vector<[8]xf16> +// CHECK: "arm_sve.intr.dupq_lane"(%[[A3]]) <{lane = 3 : i64}> : (vector<[8]xbf16>) -> vector<[8]xbf16> + %3 = arm_sve.dupq_lane %v8bf16[3] : vector<[8]xbf16> +// CHECK: "arm_sve.intr.dupq_lane"(%[[A4]]) <{lane = 4 : i64}> : (vector<[4]xi32>) -> vector<[4]xi32> + %4 = arm_sve.dupq_lane %v4i32[4] : vector<[4]xi32> +// CHECK: "arm_sve.intr.dupq_lane"(%[[A5]]) <{lane = 5 : i64}> : (vector<[4]xf32>) -> vector<[4]xf32> + %5 = arm_sve.dupq_lane %v4f32[5] : vector<[4]xf32> +// CHECK: "arm_sve.intr.dupq_lane"(%[[A6]]) <{lane = 6 : i64}> : (vector<[2]xi64>) -> vector<[2]xi64> + %6 = arm_sve.dupq_lane %v2i64[6] : vector<[2]xi64> +// CHECK: "arm_sve.intr.dupq_lane"(%[[A7]]) <{lane = 7 : i64}> : (vector<[2]xf64>) -> vector<[2]xf64> + %7 = arm_sve.dupq_lane %v2f64[7] : vector<[2]xf64> + + return %0, %1, %2, %3, %4, %5, %6, %7 + : vector<[16]xi8>, vector<[8]xi16>, vector<[8]xf16>, vector<[8]xbf16>, + vector<[4]xi32>, vector<[4]xf32>, vector<[2]xi64>, vector<[2]xf64> +} diff --git a/mlir/test/Dialect/ArmSVE/roundtrip.mlir b/mlir/test/Dialect/ArmSVE/roundtrip.mlir index 0f0c5a8575772..64e0cff39eb06 100644 --- a/mlir/test/Dialect/ArmSVE/roundtrip.mlir +++ b/mlir/test/Dialect/ArmSVE/roundtrip.mlir @@ -44,6 +44,17 @@ func.func @arm_sve_ummla(%a: vector<[16]xi8>, // ----- +func.func @arm_sve_usmmla(%a: vector<[16]xi8>, + %b: vector<[16]xi8>, + %c: vector<[4]xi32>) -> vector<[4]xi32> { + // CHECK: arm_sve.usmmla {{.*}}: vector<[16]xi8> to vector<[4]xi32> + %0 = arm_sve.usmmla %c, %a, %b : + vector<[16]xi8> to vector<[4]xi32> + return %0 : vector<[4]xi32> +} + +// ----- + func.func @arm_sve_masked_arithi(%a: vector<[4]xi32>, %b: vector<[4]xi32>, %c: vector<[4]xi32>, 
diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-other.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-other.mlir index 5d0657eb38baa..2204c6fae50d0 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-other.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-other.mlir @@ -5,11 +5,11 @@ // no memref operands. // CHECK-LABEL: func private @no_interface_no_operands( -// CHECK-NEXT: %[[m:.*]] = bufferization.to_memref +// CHECK-NEXT: %[[m:.*]] = bufferization.to_buffer // CHECK-NEXT: %[[clone:.*]] = bufferization.clone %[[m]] // CHECK-NEXT: return %[[clone]] func.func private @no_interface_no_operands(%t : tensor) -> memref { - %0 = bufferization.to_memref %t : tensor to memref + %0 = bufferization.to_buffer %t : tensor to memref return %0 : memref } diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir index 8f0170b17381a..4c7683ec211e4 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir @@ -8,7 +8,7 @@ // CHECK-LABEL: func @buffer_not_deallocated( // CHECK-SAME: %[[t:.*]]: tensor func.func @buffer_not_deallocated(%t : tensor, %c : i1) -> tensor { - // CHECK: %[[m:.*]] = bufferization.to_memref %[[t]] + // CHECK: %[[m:.*]] = bufferization.to_buffer %[[t]] // CHECK: %[[r:.*]] = scf.if %{{.*}} { %r = scf.if %c -> tensor { // CHECK: %[[some_op:.*]] = "test.some_op" @@ -37,7 +37,7 @@ func.func @write_to_alloc_tensor_or_readonly_tensor(%arg0: tensor, %cond: i1, %val: i32) -> tensor { - // CHECK: %[[arg0_m:.*]] = bufferization.to_memref %[[arg0]] + // CHECK: %[[arg0_m:.*]] = bufferization.to_buffer %[[arg0]] // CHECK: 
%[[r:.*]] = scf.if {{.*}} { // CHECK: scf.yield %[[arg0_m]] // CHECK: } else { diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis.mlir index 7d429e4840114..454c17aef4d8a 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis.mlir @@ -87,32 +87,32 @@ func.func @read_of_alloc_tensor_is_not_a_conflict(%f: f32, %idx: index) -> f32 { // ----- -// CHECK-LABEL: func @to_memref_not_read_only( -func.func @to_memref_not_read_only(%idx : index, %f: f32) -> f32 { +// CHECK-LABEL: func @to_buffer_not_read_only( +func.func @to_buffer_not_read_only(%idx : index, %f: f32) -> f32 { %t = tensor.generate { ^bb0(%i : index): tensor.yield %f : f32 } : tensor<5xf32> - // Some op may write into the result of to_memref later. - // CHECK: bufferization.to_memref + // Some op may write into the result of to_buffer later. + // CHECK: bufferization.to_buffer // CHECK-SAME: {__inplace_operands_attr__ = ["false"]} - %m = bufferization.to_memref %t : tensor<5xf32> to memref<5xf32> + %m = bufferization.to_buffer %t : tensor<5xf32> to memref<5xf32> %2 = tensor.extract %t[%idx] : tensor<5xf32> return %2 : f32 } // ----- -// CHECK-LABEL: func @to_memref_read_only( -func.func @to_memref_read_only(%idx : index, %f: f32) -> f32 { +// CHECK-LABEL: func @to_buffer_read_only( +func.func @to_buffer_read_only(%idx : index, %f: f32) -> f32 { %t = tensor.generate { ^bb0(%i : index): tensor.yield %f : f32 } : tensor<5xf32> - // Some op may write into the result of to_memref later. - // CHECK: bufferization.to_memref + // Some op may write into the result of to_buffer later. 
+ // CHECK: bufferization.to_buffer // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - %m = bufferization.to_memref %t {read_only} : tensor<5xf32> to memref<5xf32> + %m = bufferization.to_buffer %t {read_only} : tensor<5xf32> to memref<5xf32> %2 = tensor.extract %t[%idx] : tensor<5xf32> return %2 : f32 } diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-encodings.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-encodings.mlir index c26f1681e4d96..e97777c3e3d13 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-encodings.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-encodings.mlir @@ -47,7 +47,7 @@ func.func @alloc_tesor_copy_from_default_space(%arg0: tensor<128xf32>) -> tensor // CHECK-LABEL: @alloc_tesor_copy_from_default_space // CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32>) -> tensor<128xf32> { -// CHECK: %[[v0:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32> to memref<128xf32, strided<[?], offset: ?>> +// CHECK: %[[v0:.+]] = bufferization.to_buffer %[[arg0]] : tensor<128xf32> to memref<128xf32, strided<[?], offset: ?>> // CHECK: %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 1> // CHECK: memref.copy %[[v0]], %[[alloc]] : memref<128xf32, strided<[?], offset: ?>> to memref<128xf32, 1> // CHECK: %[[v1:.+]] = bufferization.to_tensor %[[alloc]] : memref<128xf32, 1> to tensor<128xf32> @@ -63,7 +63,7 @@ func.func @alloc_tesor_copy_from_non_default_space(%arg0: tensor<128xf32, 1>) -> // CHECK-LABEL: @alloc_tesor_copy_from_non_default_space // CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32, 1 : i64>) -> tensor<128xf32, 2 : i64> { -// CHECK: %[[v0:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> +// CHECK: %[[v0:.+]] = bufferization.to_buffer %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> // CHECK: %[[alloc:.+]] = memref.alloc() {alignment = 
64 : i64} : memref<128xf32, 2> // CHECK: memref.copy %[[v0]], %[[alloc]] : memref<128xf32, strided<[?], offset: ?>, 1> to memref<128xf32, 2> // CHECK: %[[v1:.+]] = bufferization.to_tensor %[[alloc]] : memref<128xf32, 2> to tensor<128xf32, 2 : i64> @@ -82,9 +82,9 @@ func.func @alloc_tesor_copy_from_non_default_space_no_cast(%arg0: tensor<128xf32 // CHECK-LABEL: @alloc_tesor_copy_from_non_default_space_no_cast // CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32, 1 : i64>, %[[arg1:.+]]: tensor<4xf32, 1 : i64>) -> tensor<128xf32, 1 : i64> { -// CHECK: %[[v0:.+]] = bufferization.to_memref %[[arg1]] : tensor<4xf32, 1 : i64> to memref<4xf32, strided<[?], offset: ?>, 1> -// CHECK: %[[v1:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> -// CHECK: %[[v2:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> +// CHECK: %[[v0:.+]] = bufferization.to_buffer %[[arg1]] : tensor<4xf32, 1 : i64> to memref<4xf32, strided<[?], offset: ?>, 1> +// CHECK: %[[v1:.+]] = bufferization.to_buffer %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> +// CHECK: %[[v2:.+]] = bufferization.to_buffer %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> // CHECK: %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 2> // CHECK: memref.copy %[[v2]], %[[alloc]] : memref<128xf32, strided<[?], offset: ?>, 1> to memref<128xf32, 2> // CHECK: %[[v3:.+]] = bufferization.to_tensor %[[alloc]] : memref<128xf32, 2> to tensor<128xf32, 1 : i64> @@ -104,7 +104,7 @@ func.func @materialize_in_destination(%arg0: tensor<128xf32, 1>) -> tensor<128xf // CHECK-LABEL: @materialize_in_destination // CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32, 1 : i64>) -> tensor<128xf32, 2 : i64> { -// CHECK: %[[v0:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> +// CHECK: 
%[[v0:.+]] = bufferization.to_buffer %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> // CHECK: %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 2> // CHECK: memref.copy %[[v0]], %[[alloc]] : memref<128xf32, strided<[?], offset: ?>, 1> to memref<128xf32, 2> // CHECK: %[[v1:.+]] = bufferization.to_tensor %[[alloc]] : memref<128xf32, 2> to tensor<128xf32, 2 : i64> diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir index 194c3278c78a1..908c760d9a0cd 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir @@ -25,9 +25,9 @@ func.func @use_of_unknown_op_1(%t1: tensor) %idx = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] : tensor to memref> + // CHECK: %[[dummy_memref:.*]] = bufferization.to_buffer %[[dummy]] : tensor to memref> // CHECK: vector.transfer_read %[[dummy_memref]][%{{.*}}], %{{.*}} : memref> - // CHECK-NO-LAYOUT-MAP: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] : tensor to memref + // CHECK-NO-LAYOUT-MAP: %[[dummy_memref:.*]] = bufferization.to_buffer %[[dummy]] : tensor to memref // CHECK-NO-LAYOUT-MAP: vector.transfer_read %[[dummy_memref]][%{{.*}}], %{{.*}} : memref %1 = vector.transfer_read %0[%idx], %cst : tensor, vector<5xf32> return %1 : vector<5xf32> @@ -55,13 +55,13 @@ func.func @use_of_unknown_op_3(%t1: tensor) -> (vector<5xf32>, vector<5xf32>) { %idx = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] + // CHECK: %[[m1:.*]] = bufferization.to_buffer %[[t1]] // CHECK: %[[v1:.*]] = vector.transfer_read %[[m1]] %1 = vector.transfer_read %t1[%idx], %cst : tensor, vector<5xf32> // CHECK: %[[dummy:.*]] = 
"test.dummy_op"(%[[t1]]) %0 = "test.dummy_op"(%t1) : (tensor) -> tensor - // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] : tensor to memref> + // CHECK: %[[dummy_memref:.*]] = bufferization.to_buffer %[[dummy]] : tensor to memref> // CHECK: %[[v2:.*]] = vector.transfer_read %[[dummy_memref]] %2 = vector.transfer_read %0[%idx], %cst : tensor, vector<5xf32> @@ -81,7 +81,7 @@ func.func @use_of_unknown_op_4(%t1: tensor) // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[t1]]) %0 = "test.dummy_op"(%t1) : (tensor) -> tensor - // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] + // CHECK: %[[dummy_memref:.*]] = bufferization.to_buffer %[[dummy]] // CHECK: %[[v1:.*]] = vector.transfer_read %[[dummy_memref]] %1 = vector.transfer_read %0[%idx], %cst : tensor, vector<5xf32> @@ -98,7 +98,7 @@ func.func @use_of_unknown_op_4(%t1: tensor) // CHECK-SAME: %[[t1:.*]]: tensor func.func @use_of_bufferizable_op_in_unbufferizable_op( %t1: tensor, %o: index, %s: index) -> (tensor, tensor) { - // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] + // CHECK: %[[m1:.*]] = bufferization.to_buffer %[[t1]] // CHECK: %[[subview:.*]] = memref.subview %[[m1]] // The op must alloc because "test.dummy" may bufferize to a memory write. 
// CHECK: %[[alloc:.*]] = memref.alloc @@ -119,7 +119,7 @@ func.func @unused_unknown_op(%t1 : tensor) -> vector<5xf32> { %idx = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] + // CHECK: %[[m1:.*]] = bufferization.to_buffer %[[t1]] // CHECK: vector.transfer_read %[[m1]] %1 = vector.transfer_read %t1[%idx], %cst : tensor, vector<5xf32> @@ -166,7 +166,7 @@ func.func @unknown_op_may_read(%v: vector<5xf32>) func.func @unknown_op_not_writable( %t1 : tensor, %v : vector<5xf32>, %idx : index) -> tensor { // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[t1]]) - // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] + // CHECK: %[[dummy_memref:.*]] = bufferization.to_buffer %[[dummy]] %0 = "test.dummy_op"(%t1) : (tensor) -> (tensor) // The result of an unknown op is not writable. Always generate a copy. @@ -186,7 +186,7 @@ func.func @unknown_op_not_writable( // CHECK-TENSOR-LABEL: func @simple_tensor_test( // CHECK-TENSOR-SAME: %[[t1:.*]]: tensor func.func @simple_tensor_test(%t1 : tensor, %f : f32) -> tensor { - // CHECK-TENSOR: %[[t1_memref:.*]] = bufferization.to_memref %[[t1]] + // CHECK-TENSOR: %[[t1_memref:.*]] = bufferization.to_buffer %[[t1]] %c0 = arith.constant 0 : index // CHECK-TENSOR: %[[alloc:.*]] = memref.alloc // CHECK-TENSOR: memref.copy %[[t1_memref]], %[[alloc]] @@ -203,7 +203,7 @@ func.func @simple_tensor_test(%t1 : tensor, %f : f32) -> tensor { // CHECK-SCF-SAME: %[[t1:.*]]: tensor {bufferization.writable = true}, %[[c:.*]]: i1, %[[pos:.*]]: index func.func @simple_scf_if(%t1: tensor {bufferization.writable = true}, %c: i1, %pos: index, %f: f32) -> (tensor, index) { - // CHECK-SCF: %[[t1_memref:.*]] = bufferization.to_memref %[[t1]] + // CHECK-SCF: %[[t1_memref:.*]] = bufferization.to_buffer %[[t1]] // CHECK-SCF: %[[r:.*]] = scf.if %[[c]] -> (memref) { %r1, %r2 = scf.if %c -> (tensor, index) { // CHECK-SCF: scf.yield %[[t1_memref]] @@ -211,7 +211,7 @@ func.func 
@simple_scf_if(%t1: tensor {bufferization.writable = true}, %c: // CHECK-SCF: } else { } else { // CHECK-SCF: %[[insert:.*]] = tensor.insert %{{.*}} into %[[t1]][{{.*}}] - // CHECK-SCF: %[[insert_memref:.*]] = bufferization.to_memref %[[insert]] + // CHECK-SCF: %[[insert_memref:.*]] = bufferization.to_buffer %[[insert]] %1 = tensor.insert %f into %t1[%pos] : tensor // CHECK-SCF: scf.yield %[[insert_memref]] scf.yield %1, %pos : tensor, index diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir index e65c5b92949f6..cd19e3a5e82aa 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir @@ -39,7 +39,7 @@ func.func @use_tensor_func_arg(%A : tensor) -> (vector<4xf32>) { %c0 = arith.constant 0 : index %f0 = arith.constant 0.0 : f32 - // CHECK: %[[A_memref:.*]] = bufferization.to_memref %[[A]] + // CHECK: %[[A_memref:.*]] = bufferization.to_buffer %[[A]] // CHECK: %[[res:.*]] = vector.transfer_read %[[A_memref]] %0 = vector.transfer_read %A[%c0], %f0 : tensor, vector<4xf32> @@ -54,7 +54,7 @@ func.func @use_tensor_func_arg(%A : tensor) -> (vector<4xf32>) { func.func @return_tensor(%A : tensor, %v : vector<4xf32>) -> (tensor) { %c0 = arith.constant 0 : index - // CHECK: %[[A_memref:.*]] = bufferization.to_memref %[[A]] + // CHECK: %[[A_memref:.*]] = bufferization.to_buffer %[[A]] // CHECK: %[[dim:.*]] = memref.dim %[[A_memref]] // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) // CHECK: memref.copy %[[A_memref]], %[[alloc]] @@ -102,7 +102,7 @@ func.func @read_after_write_conflict(%cst : f32, %idx : index, %idx2 : index) -> (f32, f32) { // CHECK-DAG: %[[alloc:.*]] = memref.alloc // CHECK-DAG: %[[dummy:.*]] = "test.dummy_op" - // CHECK-DAG: %[[dummy_m:.*]] = bufferization.to_memref %[[dummy]] + // CHECK-DAG: %[[dummy_m:.*]] = bufferization.to_buffer %[[dummy]] %t = "test.dummy_op"() : () 
-> (tensor<10xf32>) // CHECK: memref.copy %[[dummy_m]], %[[alloc]] @@ -134,7 +134,7 @@ func.func @copy_deallocated() -> tensor<10xf32> { // CHECK-LABEL: func @select_different_tensors( // CHECK-SAME: %[[t:.*]]: tensor func.func @select_different_tensors(%t: tensor, %sz: index, %pos: index, %c: i1) -> f32 { - // CHECK-DAG: %[[m:.*]] = bufferization.to_memref %[[t]] : tensor to memref + // CHECK-DAG: %[[m:.*]] = bufferization.to_buffer %[[t]] : tensor to memref // CHECK-DAG: %[[alloc:.*]] = memref.alloc(%{{.*}}) {{.*}} : memref %0 = bufferization.alloc_tensor(%sz) : tensor @@ -154,7 +154,7 @@ func.func @select_different_tensors(%t: tensor, %sz: index, %pos: index, // moment because this would create a tensor op during bufferization. That is // currently forbidden. func.func @alloc_tensor_with_copy(%t: tensor<5xf32>) -> tensor<5xf32> { - // CHECK: %[[m:.*]] = bufferization.to_memref %[[t]] + // CHECK: %[[m:.*]] = bufferization.to_buffer %[[t]] // CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<5xf32> // CHECK: memref.copy %[[m]], %[[alloc]] %0 = bufferization.alloc_tensor() copy(%t) : tensor<5xf32> @@ -200,7 +200,7 @@ func.func @read_of_alias(%t: tensor<100xf32>, %pos1: index, %pos2: index, // CHECK-LABEL: func @from_unranked_to_unranked( // CHECK-SAME: %[[arg0:.*]]: tensor<*xi32> func.func @from_unranked_to_unranked(%arg0: tensor<*xi32>) -> tensor<*xi32> { - // CHECK: %[[m:.*]] = bufferization.to_memref %[[arg0]] : tensor<*xi32> to memref<*xi32> + // CHECK: %[[m:.*]] = bufferization.to_buffer %[[arg0]] : tensor<*xi32> to memref<*xi32> // CHECK: %[[t:.*]] = bufferization.to_tensor %[[m]] // CHECK: return %[[t]] : tensor<*xi32> %0 = tensor.cast %arg0 : tensor<*xi32> to tensor<*xi32> @@ -212,7 +212,7 @@ func.func @from_unranked_to_unranked(%arg0: tensor<*xi32>) -> tensor<*xi32> { // CHECK-LABEL: func @tensor_copy( // CHECK-SAME: %[[arg0:.*]]: tensor<5xf32>) func.func @tensor_copy(%arg0: tensor<5xf32>) -> tensor<5xf32> { - // CHECK: %[[m:.*]] = 
bufferization.to_memref %[[arg0]] + // CHECK: %[[m:.*]] = bufferization.to_buffer %[[arg0]] // CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<5xf32> // CHECK: memref.copy %[[m]], %[[alloc]] // CHECK: %[[r:.*]] = bufferization.to_tensor %[[alloc]] @@ -227,7 +227,7 @@ func.func @tensor_copy(%arg0: tensor<5xf32>) -> tensor<5xf32> { // CHECK-LABEL: func @materialize_in_destination_buffer( // CHECK-SAME: %[[t:.*]]: tensor<5xf32>, %[[m:.*]]: memref<5xf32>) -// CHECK: %[[b:.*]] = bufferization.to_memref %[[t]] : tensor<5xf32> to memref<5xf32, strided<[?], offset: ?>> +// CHECK: %[[b:.*]] = bufferization.to_buffer %[[t]] : tensor<5xf32> to memref<5xf32, strided<[?], offset: ?>> // CHECK: memref.copy %[[b]], %[[m]] func.func @materialize_in_destination_buffer(%t: tensor<5xf32>, %m: memref<5xf32>) { bufferization.materialize_in_destination %t in restrict writable %m diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-force-copy-before-write.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-force-copy-before-write.mlir index 230a0ed429489..3fb89aa1a0021 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-force-copy-before-write.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-force-copy-before-write.mlir @@ -1,7 +1,7 @@ -// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 no-analysis-func-filter=contains_to_memref_op" -drop-equivalent-buffer-results --split-input-file | FileCheck %s +// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 no-analysis-func-filter=contains_to_buffer_op" -drop-equivalent-buffer-results --split-input-file | FileCheck %s // ToMemref ops do not pass analysis step. CopyBeforeWrite will be true only for the -// FuncOp "contains_to_memref_op" since it is specified in no-analysis-func-filter. +// FuncOp "contains_to_buffer_op" since it is specified in no-analysis-func-filter. 
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 copy-before-write=1" -drop-equivalent-buffer-results --split-input-file | FileCheck %s --check-prefix=CHECK_COPY @@ -21,14 +21,14 @@ module { return %inserted : tensor } - // CHECK-LABEL: func.func @contains_to_memref_op( + // CHECK-LABEL: func.func @contains_to_buffer_op( // CHECK: memref.copy - // CHECK_COPY-LABEL: func.func @contains_to_memref_op( + // CHECK_COPY-LABEL: func.func @contains_to_buffer_op( // CHECK_COPY: memref.copy - func.func @contains_to_memref_op(%arg0: tensor {bufferization.writable = true}, %arg1: index) -> vector<5xf32> { - %0 = bufferization.to_memref %arg0 : tensor to memref + func.func @contains_to_buffer_op(%arg0: tensor {bufferization.writable = true}, %arg1: index) -> vector<5xf32> { + %0 = bufferization.to_buffer %arg0 : tensor to memref %cst = arith.constant 0.000000e+00 : f32 %1 = vector.transfer_read %0[%arg1], %cst : memref, vector<5xf32> return %1 : vector<5xf32> diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir index e7797d4bc50a9..2efb5893c8511 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir @@ -70,7 +70,7 @@ func.func @call_to_unknown_tensor_returning_func(%t : tensor) { // CHECK-NO-LAYOUT-MAP: %[[alloc_no_layout:.*]] = memref.alloc(%{{.*}}) {{.*}} : memref<2x?xf32> // CHECK-NO-LAYOUT-MAP: memref.copy %[[subview]], %[[alloc_no_layout]] // TODO: %alloc should be deallocated here, but we currently do not dealloc -// buffers that are inserted due to to_tensor/to_memref canonicalization (when +// buffers that are inserted due to to_tensor/to_buffer canonicalization (when // the buffer types have different layout maps). 
// CHECK-NO-LAYOUT-MAP: return %[[alloc_no_layout]] @@ -669,17 +669,17 @@ func.func @call_llvm_func() { // ----- -// CHECK-LABEL: func @to_memref_op_unsupported( +// CHECK-LABEL: func @to_buffer_op_unsupported( // CHECK-SAME: %[[arg0:.*]]: memref {bufferization.writable = true}, %idx1: index, %idx2: index, %idx3: index, %v1: vector<5xf32>) -> (vector<5xf32>) { // Insert a copy because we cannot analyze what happens with the result of a - // to_memref op. + // to_buffer op. // CHECK: %[[alloc:.*]] = memref.alloc // CHECK: memref.copy %[[arg0]], %[[alloc]] - %0 = bufferization.to_memref %t1 : tensor to memref + %0 = bufferization.to_buffer %t1 : tensor to memref // CHECK: "test.foo"(%[[alloc]]) "test.foo"(%0) : (memref) -> () diff --git a/mlir/test/Dialect/Bufferization/Transforms/tensorlike-bufferlike.mlir b/mlir/test/Dialect/Bufferization/Transforms/tensorlike-bufferlike.mlir index f8691e110aad1..d8b1a00522ab6 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/tensorlike-bufferlike.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/tensorlike-bufferlike.mlir @@ -4,7 +4,7 @@ // CHECK-SAME: {found = {operand_0 = "is_tensor_like", result_0 = "is_buffer_like"}} func.func @builtin_unranked(%t: tensor<*xf32>) -> (memref<*xf32>) { - %0 = bufferization.to_memref %t : tensor<*xf32> to memref<*xf32> + %0 = bufferization.to_buffer %t : tensor<*xf32> to memref<*xf32> return %0 : memref<*xf32> } @@ -14,7 +14,7 @@ func.func @builtin_unranked(%t: tensor<*xf32>) -> (memref<*xf32>) // CHECK-SAME: {found = {operand_0 = "is_tensor_like", result_0 = "is_buffer_like"}} func.func @builtin_ranked(%t: tensor<42xf32>) -> (memref<42xf32>) { - %0 = bufferization.to_memref %t : tensor<42xf32> to memref<42xf32> + %0 = bufferization.to_buffer %t : tensor<42xf32> to memref<42xf32> return %0 : memref<42xf32> } diff --git a/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir b/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir index a2741abbda3b0..5e9ccc9c19074 100644 
--- a/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/transform-ops.mlir @@ -15,7 +15,7 @@ module attributes {transform.with_named_sequence} { func.func @test_function(%A : tensor, %v : vector<4xf32>) -> (tensor) { %c0 = arith.constant 0 : index - // CHECK: %[[A_memref:.*]] = bufferization.to_memref %[[A]] + // CHECK: %[[A_memref:.*]] = bufferization.to_buffer %[[A]] // CHECK: %[[dim:.*]] = memref.dim %[[A_memref]] // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) // CHECK: memref.copy %[[A_memref]], %[[alloc]] @@ -45,7 +45,7 @@ module attributes {transform.with_named_sequence} { func.func @test_function(%A : tensor, %v : vector<4xf32>) -> (tensor) { %c0 = arith.constant 0 : index - // CHECK: %[[A_memref:.*]] = bufferization.to_memref %[[A]] + // CHECK: %[[A_memref:.*]] = bufferization.to_buffer %[[A]] // CHECK: %[[dim:.*]] = memref.dim %[[A_memref]] // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) // CHECK: linalg.copy ins(%[[A_memref]] : memref<{{.*}}>) outs(%[[alloc]] @@ -116,7 +116,7 @@ module attributes {transform.with_named_sequence} { func.func @test_function(%A : tensor, %v : vector<4xf32>) -> (tensor) { %c0 = arith.constant 0 : index - // CHECK: %[[A_memref:.*]] = bufferization.to_memref %[[A]] + // CHECK: %[[A_memref:.*]] = bufferization.to_buffer %[[A]] // CHECK: %[[dim:.*]] = memref.dim %[[A_memref]] // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) // CHECK: memref.copy %[[A_memref]], %[[alloc]] diff --git a/mlir/test/Dialect/Bufferization/canonicalize.mlir b/mlir/test/Dialect/Bufferization/canonicalize.mlir index b662e713e189c..f44e29071796d 100644 --- a/mlir/test/Dialect/Bufferization/canonicalize.mlir +++ b/mlir/test/Dialect/Bufferization/canonicalize.mlir @@ -3,10 +3,10 @@ // RUN: --split-input-file -allow-unregistered-dialect | \ // RUN: FileCheck %s -// Basic folding of to_tensor(to_memref(t)) -> t +// Basic folding of to_tensor(to_buffer(t)) -> t // CHECK-LABEL: func 
@tensor_load_of_buffer_cast( func.func @tensor_load_of_buffer_cast(%arg0: tensor) -> tensor { - %0 = bufferization.to_memref %arg0 : tensor to memref + %0 = bufferization.to_buffer %arg0 : tensor to memref %1 = bufferization.to_tensor %0 : memref to tensor return %1 : tensor } @@ -15,11 +15,11 @@ func.func @tensor_load_of_buffer_cast(%arg0: tensor) -> tensor { // ----- -// Basic folding of to_memref(to_tensor(m)) -> m +// Basic folding of to_buffer(to_tensor(m)) -> m // CHECK-LABEL: func @buffer_cast_of_tensor_load( func.func @buffer_cast_of_tensor_load(%arg0: memref) -> memref { %0 = bufferization.to_tensor %arg0 : memref to tensor - %1 = bufferization.to_memref %0 : tensor to memref + %1 = bufferization.to_buffer %0 : tensor to memref return %1 : memref } // CHECK-SAME: %[[MEMREF:.*]]: memref) -> memref { @@ -34,7 +34,7 @@ func.func @buffer_cast_of_tensor_load(%arg0: memref) -> memref { // CHECK-SAME: %[[MEMREF_ADDRSPACE2:.*]]: memref) // CHECK-SAME: -> memref { // CHECK-NOT: bufferization.to_tensor -// CHECK-NOT: bufferization.to_memref +// CHECK-NOT: bufferization.to_buffer // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = memref.dim %[[MEMREF_ADDRSPACE2]], %[[C0]] : memref // CHECK: %[[MEMREF_ADDRSPACE7:.*]] = memref.alloc(%[[DIM]]) : memref @@ -44,7 +44,7 @@ func.func @buffer_cast_of_tensor_load(%arg0: memref) -> memref { func.func @canonicalize_buffer_cast_of_tensor_load_different_address_space(%arg0: memref) -> memref { %0 = bufferization.to_tensor %arg0 : memref to tensor - %1 = bufferization.to_memref %0 : tensor to memref + %1 = bufferization.to_buffer %0 : tensor to memref return %1 : memref } @@ -56,7 +56,7 @@ func.func @canonicalize_buffer_cast_of_tensor_load_different_address_space(%arg0 // CHECK-SAME: %[[M:.*]]: memref>) // CHECK-SAME: -> memref> { // CHECK-NOT: bufferization.to_tensor -// CHECK-NOT: bufferization.to_memref +// CHECK-NOT: bufferization.to_buffer // CHECK: %[[R:.*]] = memref.cast %[[M]] // CHECK-SAME: memref> to 
memref> // CHECK: return %[[R]] @@ -65,7 +65,7 @@ func.func @canonicalize_buffer_cast_of_tensor_load( -> memref> { %0 = bufferization.to_tensor %arg0 : memref> to tensor - %1 = bufferization.to_memref %0 : tensor to memref> + %1 = bufferization.to_buffer %0 : tensor to memref> return %1 : memref> } @@ -78,13 +78,13 @@ func.func @canonicalize_buffer_cast_of_tensor_load_to_copy( %arg0: memref>) -> memref> { %0 = bufferization.to_tensor %arg0 : memref> to tensor - %1 = bufferization.to_memref %0 : tensor to memref> + %1 = bufferization.to_buffer %0 : tensor to memref> return %1 : memref> } // CHECK-SAME: %[[M:.*]]: memref>) // CHECK-SAME: -> memref> { // CHECK-NOT: bufferization.to_tensor -// CHECK-NOT: bufferization.to_memref +// CHECK-NOT: bufferization.to_buffer // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = memref.dim %[[M]], %[[C0]] : memref> // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM]]) : memref> @@ -250,26 +250,26 @@ func.func @clone_and_preceding_dealloc(%arg0: memref) -> memref<32xf32> { // ----- -// CHECK-LABEL: func @tensor_cast_to_memref +// CHECK-LABEL: func @tensor_cast_to_buffer // CHECK-SAME: %[[ARG0:.+]]: tensor<4x6x16x32xi8> -func.func @tensor_cast_to_memref(%arg0 : tensor<4x6x16x32xi8>) -> +func.func @tensor_cast_to_buffer(%arg0 : tensor<4x6x16x32xi8>) -> memref { %0 = tensor.cast %arg0 : tensor<4x6x16x32xi8> to tensor - %1 = bufferization.to_memref %0 : tensor to memref + %1 = bufferization.to_buffer %0 : tensor to memref return %1 : memref } -// CHECK: %[[M:.+]] = bufferization.to_memref %[[ARG0]] : tensor<4x6x16x32xi8> +// CHECK: %[[M:.+]] = bufferization.to_buffer %[[ARG0]] : tensor<4x6x16x32xi8> // CHECK: %[[M1:.+]] = memref.cast %[[M]] // CHECK-SAME: memref<4x6x16x32xi8> to memref // CHECK: return %[[M1]] : memref // ----- -// Folding of memref.load(to_memref(%v, %idxs)) -> tensor.extract(%v, %idx) +// Folding of memref.load(to_buffer(%v, %idxs)) -> tensor.extract(%v, %idx) // CHECK-LABEL: func 
@load_from_buffer_cast( func.func @load_from_buffer_cast(%arg0: index, %arg1: index, %arg2: tensor) -> f32 { - %0 = bufferization.to_memref %arg2 : tensor to memref + %0 = bufferization.to_buffer %arg2 : tensor to memref %1 = memref.load %0[%arg0, %arg1] : memref return %1 : f32 } diff --git a/mlir/test/Dialect/Bufferization/ops.mlir b/mlir/test/Dialect/Bufferization/ops.mlir index 7b6a6f492d069..fc6df4a09f706 100644 --- a/mlir/test/Dialect/Bufferization/ops.mlir +++ b/mlir/test/Dialect/Bufferization/ops.mlir @@ -11,12 +11,12 @@ func.func @test_clone(%buf : memref<*xf32>) -> memref<*xf32> { return %clone : memref<*xf32> } -// CHECK-LABEL: test_to_memref -func.func @test_to_memref(%arg0: tensor, %arg1: tensor<*xi64>) +// CHECK-LABEL: test_to_buffer +func.func @test_to_buffer(%arg0: tensor, %arg1: tensor<*xi64>) -> (memref (d0 + 7)>>, memref<*xi64, 1>) { - %0 = bufferization.to_memref %arg0 + %0 = bufferization.to_buffer %arg0 : tensor to memref (d0 + 7)>> - %1 = bufferization.to_memref %arg1 + %1 = bufferization.to_buffer %arg1 : tensor<*xi64> to memref<*xi64, 1> return %0, %1 : memref (d0 + 7)>>, memref<*xi64, 1> } diff --git a/mlir/test/Dialect/ControlFlow/one-shot-bufferize.mlir b/mlir/test/Dialect/ControlFlow/one-shot-bufferize.mlir index f5c9f81a18997..e37b63d01378b 100644 --- a/mlir/test/Dialect/ControlFlow/one-shot-bufferize.mlir +++ b/mlir/test/Dialect/ControlFlow/one-shot-bufferize.mlir @@ -3,7 +3,7 @@ // CHECK-NO-FUNC-LABEL: func @br( // CHECK-NO-FUNC-SAME: %[[t:.*]]: tensor<5xf32>) -// CHECK-NO-FUNC: %[[m:.*]] = bufferization.to_memref %[[t]] : tensor<5xf32> to memref<5xf32, strided<[?], offset: ?>> +// CHECK-NO-FUNC: %[[m:.*]] = bufferization.to_buffer %[[t]] : tensor<5xf32> to memref<5xf32, strided<[?], offset: ?>> // CHECK-NO-FUNC: %[[r:.*]] = scf.execute_region -> memref<5xf32, strided<[?], offset: ?>> { // CHECK-NO-FUNC: cf.br ^[[block:.*]](%[[m]] // CHECK-NO-FUNC: ^[[block]](%[[arg1:.*]]: memref<5xf32, strided<[?], offset: ?>>): @@ -23,7 +23,7 @@ 
func.func @br(%t: tensor<5xf32>) { // CHECK-NO-FUNC-LABEL: func @cond_br( // CHECK-NO-FUNC-SAME: %[[t1:.*]]: tensor<5xf32>, -// CHECK-NO-FUNC: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<5xf32> to memref<5xf32, strided<[?], offset: ?>> +// CHECK-NO-FUNC: %[[m1:.*]] = bufferization.to_buffer %[[t1]] : tensor<5xf32> to memref<5xf32, strided<[?], offset: ?>> // CHECK-NO-FUNC: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<5xf32> // CHECK-NO-FUNC: %[[r:.*]] = scf.execute_region -> memref<5xf32, strided<[?], offset: ?>> { // CHECK-NO-FUNC: cf.cond_br %{{.*}}, ^[[block1:.*]](%[[m1]] : {{.*}}), ^[[block2:.*]](%[[alloc]] : {{.*}}) diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index 2e6acc13d1627..a0273fb1e1bf4 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -1045,3 +1045,11 @@ llvm.func @llvm.aarch64.neon.st3.v8i8.p0(vector<8xi8>, vector<8xi8>, vector<8xi8 llvm.mlir.global internal thread_local unnamed_addr @myglobal(-1 : i32) {addr_space = 0 : i32, alignment = 4 : i64, dso_local} : i32 // CHECK: llvm.mlir.global internal thread_local unnamed_addr @myglobal(-1 : i32) {addr_space = 0 : i32, alignment = 4 : i64, dso_local} : i32 + +// CHECK-LABEL: llvm.func @escapedtypename +llvm.func @escapedtypename() { + %0 = llvm.mlir.constant(1 : i32) : i32 + // CHECK: llvm.alloca %0 x !llvm.struct<"bucket::Iterator", (ptr, i64, i64)> + %1 = llvm.alloca %0 x !llvm.struct<"bucket::Iterator", (ptr, i64, i64)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + llvm.return +} diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir index 530badebd5c70..1c6cb88fa028b 100644 --- a/mlir/test/Dialect/Linalg/bufferize.mlir +++ b/mlir/test/Dialect/Linalg/bufferize.mlir @@ -3,7 +3,7 @@ #map0 = affine_map<(d0) -> (d0)> // In-depth checking of a basic case, this is testing -// - bufferization.to_memref / bufferization.to_tensor materializations are +// - 
bufferization.to_buffer / bufferization.to_tensor materializations are // properly inserted // - payload is correctly carried over // - affine maps are correctly carried over @@ -12,7 +12,7 @@ // CHECK: #map = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @basic( // CHECK-SAME: %[[TENSOR:.*]]: tensor<4xf32>) -> tensor<4xf32> { -// CHECK-DAG: %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor<4xf32> to memref<4xf32> +// CHECK-DAG: %[[MEMREF:.*]] = bufferization.to_buffer %[[TENSOR]] : tensor<4xf32> to memref<4xf32> // CHECK-DAG: %[[RESULT_MEMREF:.*]] = memref.alloc() {{.*}} : memref<4xf32> // CHECK: linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} // CHECK-SAME: ins(%[[MEMREF]] : memref<4xf32>) @@ -46,7 +46,7 @@ func.func @basic(%arg0: tensor<4xf32>) -> tensor<4xf32> { // CHECK: #map = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @empty_tensor( // CHECK-SAME: %[[IN:.*]]: tensor, %[[SIZE:.*]]: index) -// CHECK-DAG: %[[MEMREF:.*]] = bufferization.to_memref %[[IN]] : tensor to memref +// CHECK-DAG: %[[MEMREF:.*]] = bufferization.to_buffer %[[IN]] : tensor to memref // CHECK-DAG: %[[OUT_BUF:.*]] = memref.alloc(%[[SIZE]]) {{.*}} : memref // CHECK: linalg.generic // CHECK-SAME: ins(%[[MEMREF]] : memref) @@ -105,7 +105,7 @@ func.func @multiple_results(%arg0: tensor<4xf32>) -> (tensor<4xf32>, tensor<4xf3 // CHECK-DAG: %[[DIM1:.*]] = tensor.dim %[[ARG]], %[[C1]] : tensor // CHECK-DAG: %[[RESULT0:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) {{.*}} : memref // CHECK-DAG: %[[RESULT1:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) {{.*}} : memref -// CHECK-DAG: %[[MEMREF_ARG:.*]] = bufferization.to_memref %[[ARG]] : tensor to memref +// CHECK-DAG: %[[MEMREF_ARG:.*]] = bufferization.to_buffer %[[ARG]] : tensor to memref // CHECK: linalg.generic // CHECK-SAME: ins(%[[MEMREF_ARG]] : memref) // CHECK-SAME: outs(%[[RESULT0]], %[[RESULT1]] : memref, memref) @@ -141,8 +141,8 @@ func.func @dynamic_results(%arg0: tensor) // CHECK-SAME: %[[ARG0_TENSOR:.*]]: 
tensor<2x3x4xvector<3x4xi4>>, // CHECK-SAME: %[[ARG1_TENSOR:.*]]: tensor<3x2xf32>) -> tensor<3x2xf32> { // CHECK-DAG: %[[INIT_BUFFER:.*]] = memref.alloc() {{.*}} : memref<3x2xf32> -// CHECK-DAG: %[[ARG0_MEMREF:.*]] = bufferization.to_memref %[[ARG0_TENSOR]] : tensor<2x3x4xvector<3x4xi4>> -// CHECK-DAG: %[[ARG1_MEMREF:.*]] = bufferization.to_memref %[[ARG1_TENSOR]] : tensor<3x2xf32> +// CHECK-DAG: %[[ARG0_MEMREF:.*]] = bufferization.to_buffer %[[ARG0_TENSOR]] : tensor<2x3x4xvector<3x4xi4>> +// CHECK-DAG: %[[ARG1_MEMREF:.*]] = bufferization.to_buffer %[[ARG1_TENSOR]] : tensor<3x2xf32> // CHECK: memref.copy %[[ARG1_MEMREF]], %[[INIT_BUFFER]] : memref<3x2xf32> to memref<3x2xf32> // CHECK: linalg.generic // CHECK-SAME: ins(%[[ARG0_MEMREF]] : memref<2x3x4xvector<3x4xi4>>) @@ -194,7 +194,7 @@ func.func @bufferize_dot(%in: tensor<4xf32>, %out: tensor) -> tensor { // CHECK-LABEL: func @bufferize_softmax( // CHECK-SAME: %[[arg0:.*]]: tensor<2x16x32xf32>, %[[arg1:.*]]: tensor<2x16x32xf32> -// CHECK: %[[m0:.*]] = bufferization.to_memref %[[arg0]] +// CHECK: %[[m0:.*]] = bufferization.to_buffer %[[arg0]] // CHECK: %[[alloc:.*]] = memref.alloc() // CHECK-NOT: memref.copy // CHECK: linalg.softmax dimension(2) ins(%[[m0]] : {{.*}}) outs(%[[alloc:.*]] : {{.*}}) diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir index 63f068d3f8681..31c9e9ed3c501 100644 --- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir +++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir @@ -455,10 +455,9 @@ func.func @unpack_on_output(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x56x56 // CHECK: %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_EMPTY_UNPACK]] -// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]]] -// 
CHECK-SAME: outs(%[[EMPTY]] +// CHECK-SAME: outs(%[[ARG0]] // CHECK: %[[UNPACK:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[UNPACKED_ARG0]] @@ -482,11 +481,14 @@ func.func @unpack_on_input(%arg0: tensor<12x2x56x56x32xf32>, %init: tensor<12x56 // CHECK-LABEL: func.func @unpack_on_input // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] -// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> +// CHECK: %[[ARG1_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> +// CHECK: %[[ARG1_PACK:.+]] = linalg.pack %[[ARG1]] +// CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] +// CHECK-SAME: into %[[ARG1_PACK_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]]] // CHECK-SAME: ins(%[[ARG0]] -// CHECK-SAME: outs(%[[EMPTY]] +// CHECK-SAME: outs(%[[ARG1_PACK]] // CHECK: %[[UNPACK:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG1]] @@ -510,11 +512,14 @@ func.func @unpack_element_type_change(%arg0: tensor<12x2x56x56x32xf32>, %init: t // CHECK-LABEL: func.func @unpack_element_type_change // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] -// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf16> +// CHECK: %[[ARG1_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf16> +// CHECK: %[[ARG1_PACK:.+]] = linalg.pack %[[ARG1]] +// CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] +// CHECK-SAME: into %[[ARG1_PACK_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]]] // CHECK-SAME: ins(%[[ARG0]] -// CHECK-SAME: outs(%[[EMPTY]] +// CHECK-SAME: outs(%[[ARG1_PACK]] // CHECK: %[[UNPACK:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] 
inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG1]] @@ -1397,10 +1402,13 @@ func.func @push_unpack_in_padded_domain_foldable(%arg0: tensor<8x8x4x8xf32>, %de // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] -// CHECK: %[[EMPTY:.+]] = tensor.empty +// CHECK: %[[ARG2_PACK_EMPTY:.+]] = tensor.empty +// CHECK: %[[ARG2_PACK:.+]] = linalg.pack %[[ARG2]] +// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [4, 8] +// CHECK-SAME: into %[[ARG2_PACK_EMPTY]] // CHECK: %[[GENERIC:.+]] = linalg.generic // CHECK-SAME: ins(%[[ARG0]] : tensor<8x8x4x8xf32>) -// CHECK-SAME: outs(%[[EMPTY]] : tensor) +// CHECK-SAME: outs(%[[ARG2_PACK]] : tensor) // CHECK: %[[UNPACK:.+]] = linalg.unpack %[[GENERIC]] // CHECK-SAME: into %[[ARG2]] // CHECK: return %[[UNPACK]] : tensor @@ -1419,10 +1427,13 @@ func.func @push_unpack_in_padded_domain_out_used(%arg0: tensor<8x8x4x8xf32>, %ar // CHECK-LABEL: func.func @push_unpack_in_padded_domain_out_used // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] -// CHECK: %[[EMPTY:.+]] = tensor.empty +// CHECK: %[[ARG1_PACK_EMPTY:.+]] = tensor.empty +// CHECK: %[[ARG1_PACK:.+]] = linalg.pack %[[ARG1]] +// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [4, 8] +// CHECK-SAME: into %[[ARG1_PACK_EMPTY]] // CHECK: %[[GENERIC:.+]] = linalg.generic // CHECK-SAME: ins(%[[ARG0]] : tensor<8x8x4x8xf32>) -// CHECK-SAME: outs(%[[EMPTY]] : tensor) +// CHECK-SAME: outs(%[[ARG1_PACK]] : tensor) // CHECK: %[[UNPACK2:.+]] = linalg.unpack %[[GENERIC]] // CHECK-SAME: into %[[ARG1]] // CHECK: return %[[UNPACK2]] : tensor diff --git a/mlir/test/Dialect/Linalg/hoisting.mlir b/mlir/test/Dialect/Linalg/hoisting.mlir index 4e1035e038ca5..318edca73cce1 100644 --- a/mlir/test/Dialect/Linalg/hoisting.mlir +++ b/mlir/test/Dialect/Linalg/hoisting.mlir @@ -519,7 +519,7 @@ module attributes {transform.with_named_sequence} { // memory (i.e. 
`%collapsed_1` and `%collapsed_2` alias): // %acc = vector.transfer_read %collapsed_2[%c0] -// CHECK-LABEL: func.func @no_hoisting_write_to_memref +// CHECK-LABEL: func.func @no_hoisting_write_to_buffer // CHECK: scf.for {{.*}} { // CHECK: vector.transfer_read {{.*}} : memref<2xi32>, vector<1xi32> // CHECK-NEXT: vector.transfer_read {{.*}} : memref<2xi32>, vector<1xi32> @@ -527,7 +527,7 @@ module attributes {transform.with_named_sequence} { // CHECK-NEXT: vector.transfer_write {{.*}} : vector<1xi32>, memref<2xi32> // CHECK-NEXT: } -func.func @no_hoisting_write_to_memref(%rhs: i32, %arg1: vector<1xi32>) { +func.func @no_hoisting_write_to_buffer(%rhs: i32, %arg1: vector<1xi32>) { %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index diff --git a/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir b/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir index 35cbd7725ec50..4d7ddc8a513c4 100644 --- a/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir @@ -101,7 +101,7 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func @tensor_pad_constant( // CHECK-SAME: %[[t:.*]]: tensor -// CHECK: %[[src:.*]] = bufferization.to_memref %[[t]] +// CHECK: %[[src:.*]] = bufferization.to_buffer %[[t]] // CHECK: %[[alloc:.*]] = memref.alloc // CHECK: %[[subview:.*]] = memref.subview %[[alloc]] // CHECK: memref.copy %[[src]], %[[subview]] @@ -130,7 +130,7 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func @tensor_insert( // CHECK-SAME: %[[t:.*]]: tensor -// CHECK: %[[m:.*]] = bufferization.to_memref %[[t]] +// CHECK: %[[m:.*]] = bufferization.to_buffer %[[t]] // CHECK: %[[alloc:.*]] = memref.alloc(%{{.*}}) : memref // CHECK: memref.copy %[[m]], %[[alloc]] // CHECK: memref.store %{{.*}}, %[[alloc]] diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir 
b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir index 4115f2857a20c..572a2ae70e0a4 100644 --- a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir @@ -206,6 +206,106 @@ module { #map1 = affine_map<(d0)[s0] -> (d0 * s0)> #map2 = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)> +module { + // CHECK-LABEL: func.func @fuse_tileable_op_through_bbarg_inout + // CHECK-SAME: %[[CHUNK_SIZE:[0-9a-z]+]]: index + // CHECK-SAME: %[[INOUT:[0-9a-z]+]]: tensor + func.func @fuse_tileable_op_through_bbarg_inout(%arg0: index, %arg1: tensor) -> tensor { + %cst = arith.constant 4.200000e+01 : f32 + %c0 = arith.constant 0 : index + %0 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor) -> tensor + %d0 = tensor.dim %arg1, %c0 : tensor + %1 = affine.apply #map0()[%d0, %arg0] + + // CHECK: scf.forall {{.*}} shared_outs(%[[BBARGOUT:.*]] = %[[INOUT]]) -> (tensor) { + %2 = scf.forall (%arg3) in (%1) shared_outs(%o = %arg1) -> (tensor) { + %3 = affine.apply #map1(%arg3)[%arg0] + %4 = affine.min #map2(%arg3)[%d0, %arg0] + %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor to tensor + + // CHECK: %[[T0:.*]] = tensor.extract_slice %[[BBARGOUT]][%{{.*}}] [%{{.*}}] [{{.*}}] + // CHECK: %[[T1:.*]] = tensor.extract_slice %[[BBARGOUT]][%{{.*}}] [%{{.*}}] [{{.*}}] + // CHECK: %[[T2:.*]] = linalg.fill {{.*}} outs(%[[T1]] + %6 = tensor.extract_slice %0[%3] [%4] [1] : tensor to tensor + + // CHECK: %[[T3:.*]] = linalg.elemwise_unary ins(%[[T2]] : tensor) outs(%[[T0]] : tensor) + %7 = linalg.elemwise_unary ins(%6 : tensor) outs(%5 : tensor) -> tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor into tensor + } + } + // CHECK: } + func.return %2 : tensor + } + + module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.fill"]} in 
%arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!transform.any_op) -> !transform.any_op + + // linalg.fill is tileable. The op is tiled and fused. + transform.structured.fuse_into_containing_op %0 into %1 + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } + } +} + +// ----- + +module { + // CHECK-LABEL: func.func @fuse_tileable_op_through_bbarg_inout_nested + // CHECK-SAME: %[[ARG0:[0-9a-z]+]]: tensor + // CHECK-SAME: %[[ARG1:[0-9a-z]+]]: tensor + func.func @fuse_tileable_op_through_bbarg_inout_nested(%arg0: tensor, %arg1: tensor) -> tensor { + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %0 = linalg.elemwise_unary {fun = #linalg.unary_fn} ins(%arg0 : tensor) outs(%arg1 : tensor) -> tensor + %dim = tensor.dim %arg1, %c0 : tensor + %dim_0 = tensor.dim %arg1, %c1 : tensor + %dim_1 = tensor.dim %arg1, %c2 : tensor + // CHECK: scf.for {{.*}} iter_args(%[[BBARG0:.*]] = %[[ARG1]]) -> (tensor) { + // CHECK: scf.for {{.*}} iter_args(%[[BBARG1:.*]] = %[[BBARG0]]) -> (tensor) { + // CHECK: scf.for {{.*}} iter_args(%[[BBARG2:.*]] = %[[BBARG1]]) -> (tensor) { + %1 = scf.for %arg2 = %c0 to %dim step %c1 iter_args(%arg3 = %arg1) -> (tensor) { + %2 = scf.for %arg4 = %c0 to %dim_0 step %c1 iter_args(%arg5 = %arg3) -> (tensor) { + %3 = scf.for %arg6 = %c0 to %dim_1 step %c1 iter_args(%arg7 = %arg5) -> (tensor) { + // CHECK: %[[EX1:.*]] = tensor.extract_slice %[[BBARG2]]{{.*}}: tensor to tensor<1x1x1xf32> + // CHECK: linalg.elemwise_unary {fun = #linalg.unary_fn} ins({{.*}} : tensor<1x1x1xf32>) outs(%[[EX1]] : tensor<1x1x1xf32>) -> tensor<1x1x1xf32> + // CHECK: %[[EX2:.*]] = tensor.extract_slice %[[BBARG2]]{{.*}} : tensor to tensor<1x1x1xf32> + // CHECK: linalg.elemwise_unary {fun = #linalg.unary_fn} ins({{.*}} : tensor<1x1x1xf32>) outs(%[[EX2]] : tensor<1x1x1xf32>) -> tensor<1x1x1xf32> + %extracted_slice = 
tensor.extract_slice %0[%arg2, %arg4, %arg6] [1, 1, 1] [1, 1, 1] : tensor to tensor<1x1x1xf32> + %extracted_slice_2 = tensor.extract_slice %arg7[%arg2, %arg4, %arg6] [1, 1, 1] [1, 1, 1] : tensor to tensor<1x1x1xf32> + %4 = linalg.elemwise_unary {fun = #linalg.unary_fn} ins(%extracted_slice : tensor<1x1x1xf32>) outs(%extracted_slice_2 : tensor<1x1x1xf32>) -> tensor<1x1x1xf32> + %inserted_slice = tensor.insert_slice %4 into %arg7[%arg2, %arg4, %arg6] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xf32> into tensor + scf.yield %inserted_slice : tensor + } + scf.yield %3 : tensor + } + scf.yield %2 : tensor + } + return %1 : tensor + } + + module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.elemwise_unary"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %2:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %3:3 = transform.split_handle %1 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.structured.fuse_into_containing_op %2#0 into %3#2 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } + } +} + +// ----- + +#map0 = affine_map<()[s0, s1] -> (s0 ceildiv s1)> +#map1 = affine_map<(d0)[s0] -> (d0 * s0)> +#map2 = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)> + module { // CHECK-LABEL: func.func @fuse_tileable_multi_output_op // CHECK-SAME: %[[CHUNK_SIZE:[0-9a-z]+]]: index diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir index 299be1296aa66..6b760a15afd56 100644 --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -641,7 +641,9 @@ func.func @test_masked_vectorize_dynamic_pad( // CHECK-SAME: } 
: vector<2x4xi1> -> vector<2x4xf32> // CHECK-DAG: %[[empty:.*]] = tensor.empty(%[[res_d0]], %[[res_d1]]) : tensor // CHECK-DAG: %[[c0_3:.*]] = arith.constant 0 : index - // CHECK: %[[mask_2:.*]] = vector.create_mask %[[res_d0]], %[[res_d1]] : vector<2x4xi1> + // CHECK-DAG: %[[d2:.*]] = tensor.dim %[[empty]], {{.*}} : tensor + // CHECK-DAG: %[[d3:.*]] = tensor.dim %[[empty]], {{.*}} : tensor + // CHECK: %[[mask_2:.*]] = vector.create_mask %[[d2]], %[[d3]] : vector<2x4xi1> // CHECK: %[[masked_write:.*]] = vector.mask %[[mask_2]] { // CHECK-SAME: vector.transfer_write %[[masked_read]], %[[empty]][%[[c0_3]], %[[c0_3]]] // CHECK-SAME: {in_bounds = [true, true]} : vector<2x4xf32>, tensor @@ -800,7 +802,9 @@ func.func @test_vectorize_dynamic_pack(%arg0: tensor, %arg1: tensor -// CHECK: %[[mask_0:.*]] = vector.create_mask %[[d0]], %[[d1]], %[[c16]], %[[c2]] : vector<4x1x16x2xi1> +// CHECK-DAG: %[[d2:.*]] = tensor.dim %[[empty]], {{.*}} : tensor +// CHECK-DAG: %[[d3:.*]] = tensor.dim %[[empty]], {{.*}} : tensor +// CHECK: %[[mask_0:.*]] = vector.create_mask %[[d2]], %[[d3]], %[[c16]], %[[c2]] : vector<4x1x16x2xi1> // CHECK: %[[masked_write:.*]] = vector.mask %[[mask_0]] { // CHECK-SAME: vector.transfer_write %[[transpose]], %[[empty]][%[[c0_2]], %[[c0_2]], %[[c0_2]], %[[c0_2]]] // CHECK-SAME: {in_bounds = [true, true, true, true]} : vector<4x1x16x2xf32>, tensor diff --git a/mlir/test/Dialect/MemRef/normalize-memrefs.mlir b/mlir/test/Dialect/MemRef/normalize-memrefs.mlir index 440f4776424cc..d2924fb1ecf77 100644 --- a/mlir/test/Dialect/MemRef/normalize-memrefs.mlir +++ b/mlir/test/Dialect/MemRef/normalize-memrefs.mlir @@ -374,7 +374,7 @@ func.func @neg_map() -> memref<2x3xf32, #neg> { // CHECK-LABEL: func @memref_with_strided_offset func.func @memref_with_strided_offset(%arg0: tensor<128x512xf32>, %arg1: index, %arg2: index) -> tensor<16x512xf32> { %c0 = arith.constant 0 : index - %0 = bufferization.to_memref %arg0 : tensor<128x512xf32> to memref<128x512xf32, strided<[?, ?], 
offset: ?>> + %0 = bufferization.to_buffer %arg0 : tensor<128x512xf32> to memref<128x512xf32, strided<[?, ?], offset: ?>> %subview = memref.subview %0[%arg2, 0] [%arg1, 512] [1, 1] : memref<128x512xf32, strided<[?, ?], offset: ?>> to memref> // CHECK: %{{.*}} = memref.cast %{{.*}} : memref> to memref<16x512xf32, strided<[?, ?], offset: ?>> %cast = memref.cast %subview : memref> to memref<16x512xf32, strided<[?, ?], offset: ?>> diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir index c8d7a87112917..aadf189273212 100644 --- a/mlir/test/Dialect/OpenACC/invalid.mlir +++ b/mlir/test/Dialect/OpenACC/invalid.mlir @@ -129,8 +129,8 @@ acc.update %cst = arith.constant 1 : index %value = memref.alloc() : memref %0 = acc.update_device varPtr(%value : memref) -> memref -// expected-error@+1 {{async attribute cannot appear with asyncOperand}} -acc.update async(%cst: index) dataOperands(%0 : memref) attributes {async = [#acc.device_type]} +// expected-error@+1 {{asyncOnly attribute cannot appear with asyncOperand}} +acc.update async(%cst: index) dataOperands(%0 : memref) attributes {asyncOnly = [#acc.device_type]} // ----- @@ -138,7 +138,7 @@ acc.update async(%cst: index) dataOperands(%0 : memref) attributes {async = %value = memref.alloc() : memref %0 = acc.update_device varPtr(%value : memref) -> memref // expected-error@+1 {{wait attribute cannot appear with waitOperands}} -acc.update wait({%cst: index}) dataOperands(%0: memref) attributes {waitOnly = [#acc.device_type]} +acc.update wait({%cst: index}) dataOperands(%0: memref) attributes {waitOnly = [#acc.device_type]} // ----- diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 4c842a26f8dc4..550f295f074a2 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -435,10 +435,10 @@ func.func @testparallelop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x } attributes {defaultAttr = #acc} acc.parallel { } 
attributes {defaultAttr = #acc} - acc.parallel { - } attributes {asyncAttr} - acc.parallel { - } attributes {waitAttr} + acc.parallel async { + } + acc.parallel wait { + } acc.parallel { } attributes {selfAttr} return @@ -488,10 +488,10 @@ func.func @testparallelop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x // CHECK-NEXT: } attributes {defaultAttr = #acc} // CHECK: acc.parallel { // CHECK-NEXT: } attributes {defaultAttr = #acc} -// CHECK: acc.parallel { -// CHECK-NEXT: } attributes {asyncAttr} -// CHECK: acc.parallel { -// CHECK-NEXT: } attributes {waitAttr} +// CHECK: acc.parallel async { +// CHECK-NEXT: } +// CHECK: acc.parallel wait { +// CHECK-NEXT: } // CHECK: acc.parallel { // CHECK-NEXT: } attributes {selfAttr} @@ -567,10 +567,10 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10 } attributes {defaultAttr = #acc} acc.serial { } attributes {defaultAttr = #acc} - acc.serial { - } attributes {asyncAttr} - acc.serial { - } attributes {waitAttr} + acc.serial async { + } + acc.serial wait { + } acc.serial { } attributes {selfAttr} acc.serial { @@ -604,10 +604,10 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10 // CHECK-NEXT: } attributes {defaultAttr = #acc} // CHECK: acc.serial { // CHECK-NEXT: } attributes {defaultAttr = #acc} -// CHECK: acc.serial { -// CHECK-NEXT: } attributes {asyncAttr} -// CHECK: acc.serial { -// CHECK-NEXT: } attributes {waitAttr} +// CHECK: acc.serial async { +// CHECK-NEXT: } +// CHECK: acc.serial wait { +// CHECK-NEXT: } // CHECK: acc.serial { // CHECK-NEXT: } attributes {selfAttr} // CHECK: acc.serial { @@ -639,10 +639,10 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10 } attributes {defaultAttr = #acc} acc.kernels { } attributes {defaultAttr = #acc} - acc.kernels { - } attributes {asyncAttr} - acc.kernels { - } attributes {waitAttr} + acc.kernels async { + } + acc.kernels wait { + } acc.kernels { } attributes {selfAttr} 
acc.kernels { @@ -673,10 +673,10 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10 // CHECK-NEXT: } attributes {defaultAttr = #acc} // CHECK: acc.kernels { // CHECK-NEXT: } attributes {defaultAttr = #acc} -// CHECK: acc.kernels { -// CHECK-NEXT: } attributes {asyncAttr} -// CHECK: acc.kernels { -// CHECK-NEXT: } attributes {waitAttr} +// CHECK: acc.kernels async { +// CHECK-NEXT: } +// CHECK: acc.kernels wait { +// CHECK-NEXT: } // CHECK: acc.kernels { // CHECK-NEXT: } attributes {selfAttr} // CHECK: acc.kernels { @@ -787,23 +787,23 @@ func.func @testdataop(%a: memref, %b: memref, %c: memref) -> () { acc.data { } attributes { defaultAttr = #acc } - acc.data { - } attributes { defaultAttr = #acc, async } + acc.data async { + } attributes { defaultAttr = #acc } %a1 = arith.constant 1 : i64 acc.data async(%a1 : i64) { - } attributes { defaultAttr = #acc, async } + } attributes { defaultAttr = #acc } - acc.data { - } attributes { defaultAttr = #acc, wait } + acc.data wait { + } attributes { defaultAttr = #acc } %w1 = arith.constant 1 : i64 acc.data wait({%w1 : i64}) { - } attributes { defaultAttr = #acc, wait } + } attributes { defaultAttr = #acc } %wd1 = arith.constant 1 : i64 acc.data wait({devnum: %wd1 : i64, %w1 : i64}) { - } attributes { defaultAttr = #acc, wait } + } attributes { defaultAttr = #acc } return } @@ -904,20 +904,20 @@ func.func @testdataop(%a: memref, %b: memref, %c: memref) -> () { // CHECK: acc.data { // CHECK-NEXT: } attributes {defaultAttr = #acc} -// CHECK: acc.data { -// CHECK-NEXT: } attributes {async, defaultAttr = #acc} +// CHECK: acc.data async { +// CHECK-NEXT: } attributes {defaultAttr = #acc} // CHECK: acc.data async(%{{.*}} : i64) { -// CHECK-NEXT: } attributes {async, defaultAttr = #acc} +// CHECK-NEXT: } attributes {defaultAttr = #acc} -// CHECK: acc.data { -// CHECK-NEXT: } attributes {defaultAttr = #acc, wait} +// CHECK: acc.data wait { +// CHECK-NEXT: } attributes {defaultAttr = #acc} // CHECK: 
acc.data wait({%{{.*}} : i64}) { -// CHECK-NEXT: } attributes {defaultAttr = #acc, wait} +// CHECK-NEXT: } attributes {defaultAttr = #acc} // CHECK: acc.data wait({devnum: %{{.*}} : i64, %{{.*}} : i64}) { -// CHECK-NEXT: } attributes {defaultAttr = #acc, wait} +// CHECK-NEXT: } attributes {defaultAttr = #acc} // ----- @@ -977,7 +977,7 @@ acc.wait async(%i32Value: i32) acc.wait async(%idxValue: index) acc.wait(%i32Value: i32) async(%idxValue: index) acc.wait(%i64Value: i64) wait_devnum(%i32Value: i32) -acc.wait attributes {async} +acc.wait async acc.wait(%i64Value: i64) async(%idxValue: index) wait_devnum(%i32Value: i32) acc.wait(%i64Value: i64) wait_devnum(%i32Value: i32) async(%idxValue: index) acc.wait if(%ifCond) @@ -996,7 +996,7 @@ acc.wait if(%ifCond) // CHECK: acc.wait async([[IDXVALUE]] : index) // CHECK: acc.wait([[I32VALUE]] : i32) async([[IDXVALUE]] : index) // CHECK: acc.wait([[I64VALUE]] : i64) wait_devnum([[I32VALUE]] : i32) -// CHECK: acc.wait attributes {async} +// CHECK: acc.wait async // CHECK: acc.wait([[I64VALUE]] : i64) async([[IDXVALUE]] : index) wait_devnum([[I32VALUE]] : i32) // CHECK: acc.wait([[I64VALUE]] : i64) async([[IDXVALUE]] : index) wait_devnum([[I32VALUE]] : i32) // CHECK: acc.wait if([[IFCOND]]) @@ -1078,7 +1078,7 @@ func.func @testexitdataop(%a: !llvm.ptr) -> () { acc.delete accPtr(%1 : !llvm.ptr) %2 = acc.getdeviceptr varPtr(%a : !llvm.ptr) varType(f64) -> !llvm.ptr - acc.exit_data dataOperands(%2 : !llvm.ptr) attributes {async,finalize} + acc.exit_data async dataOperands(%2 : !llvm.ptr) attributes {finalize} acc.delete accPtr(%2 : !llvm.ptr) %3 = acc.getdeviceptr varPtr(%a : !llvm.ptr) varType(f64) -> !llvm.ptr @@ -1086,11 +1086,11 @@ func.func @testexitdataop(%a: !llvm.ptr) -> () { acc.detach accPtr(%3 : !llvm.ptr) %4 = acc.getdeviceptr varPtr(%a : !llvm.ptr) varType(f64) -> !llvm.ptr - acc.exit_data dataOperands(%4 : !llvm.ptr) attributes {async} + acc.exit_data async dataOperands(%4 : !llvm.ptr) acc.copyout accPtr(%4 : 
!llvm.ptr) to varPtr(%a : !llvm.ptr) varType(f64) %5 = acc.getdeviceptr varPtr(%a : !llvm.ptr) varType(f64) -> !llvm.ptr - acc.exit_data dataOperands(%5 : !llvm.ptr) attributes {wait} + acc.exit_data wait dataOperands(%5 : !llvm.ptr) acc.delete accPtr(%5 : !llvm.ptr) %6 = acc.getdeviceptr varPtr(%a : !llvm.ptr) varType(f64) -> !llvm.ptr @@ -1127,7 +1127,7 @@ func.func @testexitdataop(%a: !llvm.ptr) -> () { // CHECK: acc.delete accPtr(%[[DEVPTR]] : !llvm.ptr) // CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[ARGA]] : !llvm.ptr) varType(f64) -> !llvm.ptr -// CHECK: acc.exit_data dataOperands(%[[DEVPTR]] : !llvm.ptr) attributes {async, finalize} +// CHECK: acc.exit_data async dataOperands(%[[DEVPTR]] : !llvm.ptr) attributes {finalize} // CHECK: acc.delete accPtr(%[[DEVPTR]] : !llvm.ptr) // CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[ARGA]] : !llvm.ptr) varType(f64) -> !llvm.ptr @@ -1135,11 +1135,11 @@ func.func @testexitdataop(%a: !llvm.ptr) -> () { // CHECK: acc.detach accPtr(%[[DEVPTR]] : !llvm.ptr) // CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[ARGA]] : !llvm.ptr) varType(f64) -> !llvm.ptr -// CHECK: acc.exit_data dataOperands(%[[DEVPTR]] : !llvm.ptr) attributes {async} +// CHECK: acc.exit_data async dataOperands(%[[DEVPTR]] : !llvm.ptr) // CHECK: acc.copyout accPtr(%[[DEVPTR]] : !llvm.ptr) to varPtr(%[[ARGA]] : !llvm.ptr) varType(f64) // CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[ARGA]] : !llvm.ptr) varType(f64) -> !llvm.ptr -// CHECK: acc.exit_data dataOperands(%[[DEVPTR]] : !llvm.ptr) attributes {wait} +// CHECK: acc.exit_data wait dataOperands(%[[DEVPTR]] : !llvm.ptr) // CHECK: acc.delete accPtr(%[[DEVPTR]] : !llvm.ptr) // CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[ARGA]] : !llvm.ptr) varType(f64) -> !llvm.ptr @@ -1176,9 +1176,9 @@ func.func @testenterdataop(%a: !llvm.ptr, %b: !llvm.ptr, %c: !llvm.ptr) -> () { %4 = acc.attach varPtr(%a : !llvm.ptr) varType(f64) -> !llvm.ptr acc.enter_data dataOperands(%4 : !llvm.ptr) %5 = 
acc.copyin varPtr(%a : !llvm.ptr) varType(f64) -> !llvm.ptr - acc.enter_data dataOperands(%5 : !llvm.ptr) attributes {async} + acc.enter_data async dataOperands(%5 : !llvm.ptr) %6 = acc.create varPtr(%a : !llvm.ptr) varType(f64) -> !llvm.ptr - acc.enter_data dataOperands(%6 : !llvm.ptr) attributes {wait} + acc.enter_data wait dataOperands(%6 : !llvm.ptr) %7 = acc.copyin varPtr(%a : !llvm.ptr) varType(f64) -> !llvm.ptr acc.enter_data async(%i64Value : i64) dataOperands(%7 : !llvm.ptr) %8 = acc.copyin varPtr(%a : !llvm.ptr) varType(f64) -> !llvm.ptr @@ -1205,9 +1205,9 @@ func.func @testenterdataop(%a: !llvm.ptr, %b: !llvm.ptr, %c: !llvm.ptr) -> () { // CHECK: %[[ATTACH:.*]] = acc.attach varPtr(%[[ARGA]] : !llvm.ptr) varType(f64) -> !llvm.ptr // CHECK: acc.enter_data dataOperands(%[[ATTACH]] : !llvm.ptr) // CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%[[ARGA]] : !llvm.ptr) varType(f64) -> !llvm.ptr -// CHECK: acc.enter_data dataOperands(%[[COPYIN]] : !llvm.ptr) attributes {async} +// CHECK: acc.enter_data async dataOperands(%[[COPYIN]] : !llvm.ptr) // CHECK: %[[CREATE:.*]] = acc.create varPtr(%[[ARGA]] : !llvm.ptr) varType(f64) -> !llvm.ptr -// CHECK: acc.enter_data dataOperands(%[[CREATE]] : !llvm.ptr) attributes {wait} +// CHECK: acc.enter_data wait dataOperands(%[[CREATE]] : !llvm.ptr) // CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%[[ARGA]] : !llvm.ptr) varType(f64) -> !llvm.ptr // CHECK: acc.enter_data async([[I64VALUE]] : i64) dataOperands(%[[COPYIN]] : !llvm.ptr) // CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%[[ARGA]] : !llvm.ptr) varType(f64) -> !llvm.ptr diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index b7e16b7ec35e2..a9e4af035dbd7 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -802,10 +802,14 @@ func.func @omp_target_data (%if_cond : i1, %device : si32, %device_ptr: memref, tensor) map_clauses(always, from) capture(ByRef) -> memref {name = ""} omp.target_data if(%if_cond) 
device(%device : si32) map_entries(%mapv1 : memref){} - // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_2:.*]] : memref, tensor) map_clauses(close, present, to) capture(ByRef) -> memref {name = ""} - // CHECK: omp.target_data map_entries(%[[MAP_A]] : memref) use_device_addr(%[[VAL_3:.*]] -> %{{.*}} : memref) use_device_ptr(%[[VAL_4:.*]] -> %{{.*}} : memref) + // CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%{{.*}} : memref, tensor) map_clauses(close, present, to) capture(ByRef) -> memref {name = ""} + // CHECK: %[[DEV_ADDR:.*]] = omp.map.info var_ptr(%{{.*}} : memref, tensor) map_clauses(return_param) capture(ByRef) -> memref {name = ""} + // CHECK: %[[DEV_PTR:.*]] = omp.map.info var_ptr(%{{.*}} : memref, tensor) map_clauses(return_param) capture(ByRef) -> memref {name = ""} + // CHECK: omp.target_data map_entries(%[[MAP_A]] : memref) use_device_addr(%[[DEV_ADDR]] -> %{{.*}} : memref) use_device_ptr(%[[DEV_PTR]] -> %{{.*}} : memref) %mapv2 = omp.map.info var_ptr(%map1 : memref, tensor) map_clauses(close, present, to) capture(ByRef) -> memref {name = ""} - omp.target_data map_entries(%mapv2 : memref) use_device_addr(%device_addr -> %arg0 : memref) use_device_ptr(%device_ptr -> %arg1 : memref) { + %device_addrv1 = omp.map.info var_ptr(%device_addr : memref, tensor) map_clauses(return_param) capture(ByRef) -> memref {name = ""} + %device_ptrv1 = omp.map.info var_ptr(%device_ptr : memref, tensor) map_clauses(return_param) capture(ByRef) -> memref {name = ""} + omp.target_data map_entries(%mapv2 : memref) use_device_addr(%device_addrv1 -> %arg0 : memref) use_device_ptr(%device_ptrv1 -> %arg1 : memref) { omp.terminator } diff --git a/mlir/test/Dialect/Polynomial/attributes.mlir b/mlir/test/Dialect/Polynomial/attributes.mlir deleted file mode 100644 index cb3216900cb43..0000000000000 --- a/mlir/test/Dialect/Polynomial/attributes.mlir +++ /dev/null @@ -1,73 +0,0 @@ -// RUN: mlir-opt %s --split-input-file --verify-diagnostics - -#my_poly = #polynomial.int_polynomial 
-// expected-error@below {{polynomials must have one indeterminate, but there were multiple: x, y}} -#ring1 = #polynomial.ring - -// ----- - -// expected-error@below {{expected integer value}} -// expected-error@below {{expected a monomial}} -// expected-error@below {{found invalid integer exponent}} -#my_poly = #polynomial.int_polynomial<5 + x**f> -#ring1 = #polynomial.ring - -// ----- - -#my_poly = #polynomial.int_polynomial<5 + x**2 + 3x**2> -// expected-error@below {{parsed polynomial must have unique exponents among monomials}} -#ring1 = #polynomial.ring - -// ----- - -// expected-error@below {{expected + and more monomials, or > to end polynomial attribute}} -#my_poly = #polynomial.int_polynomial<5 + x**2 7> -#ring1 = #polynomial.ring - -// ----- - -// expected-error@below {{expected a monomial}} -#my_poly = #polynomial.int_polynomial<5 + x**2 +> -#ring1 = #polynomial.ring - - -// ----- - -#my_poly = #polynomial.int_polynomial<5 + x**2> -// expected-error@below {{failed to parse Polynomial_RingAttr parameter 'coefficientModulus' which is to be a `::mlir::IntegerAttr`}} -// expected-error@below {{expected attribute value}} -#ring1 = #polynomial.ring - -// ----- - -// expected-error@below {{coefficientModulus specified but coefficientType is not integral}} -#ring1 = #polynomial.ring - -// ----- - -// expected-error@below {{coefficientModulus should not be 0}} -#ring1 = #polynomial.ring - -// ----- - -// expected-error@below {{coefficientModulus should be positive}} -#ring1 = #polynomial.ring - -// ----- - -// expected-error@below {{coefficientModulus needs bit width of 33 but coefficientType can only contain 32 bits}} -#ring1 = #polynomial.ring - -// ----- - -#ring1 = #polynomial.ring - -// ----- - -// expected-error@below {{coefficientModulus should be positive}} -#ring1 = #polynomial.ring - -// ----- - -// unfortunately, coefficientModulus of 64bit should be contained in larger type -#ring1 = #polynomial.ring diff --git 
a/mlir/test/Dialect/Polynomial/canonicalization.mlir b/mlir/test/Dialect/Polynomial/canonicalization.mlir deleted file mode 100644 index c0ee514daab64..0000000000000 --- a/mlir/test/Dialect/Polynomial/canonicalization.mlir +++ /dev/null @@ -1,47 +0,0 @@ -// RUN: mlir-opt -canonicalize %s | FileCheck %s -#ntt_poly = #polynomial.int_polynomial<-1 + x**8> -#ntt_ring = #polynomial.ring -#root = #polynomial.primitive_root -!ntt_poly_ty = !polynomial.polynomial -!tensor_ty = tensor<8xi32, #ntt_ring> - -// CHECK-LABEL: @test_canonicalize_intt_after_ntt -// CHECK: (%[[P:.*]]: [[T:.*]]) -> [[T]] -func.func @test_canonicalize_intt_after_ntt(%p0 : !ntt_poly_ty) -> !ntt_poly_ty { - // CHECK-NOT: polynomial.ntt - // CHECK-NOT: polynomial.intt - // CHECK: %[[RESULT:.+]] = polynomial.add %[[P]], %[[P]] : [[T]] - %t0 = polynomial.ntt %p0 {root=#root} : !ntt_poly_ty -> !tensor_ty - %p1 = polynomial.intt %t0 {root=#root} : !tensor_ty -> !ntt_poly_ty - %p2 = polynomial.add %p1, %p1 : !ntt_poly_ty - // CHECK: return %[[RESULT]] : [[T]] - return %p2 : !ntt_poly_ty -} - -// CHECK-LABEL: @test_canonicalize_ntt_after_intt -// CHECK: (%[[X:.*]]: [[T:.*]]) -> [[T]] -func.func @test_canonicalize_ntt_after_intt(%t0 : !tensor_ty) -> !tensor_ty { - // CHECK-NOT: polynomial.intt - // CHECK-NOT: polynomial.ntt - // CHECK: %[[RESULT:.+]] = arith.addi %[[X]], %[[X]] : [[T]] - %p0 = polynomial.intt %t0 {root=#root} : !tensor_ty -> !ntt_poly_ty - %t1 = polynomial.ntt %p0 {root=#root} : !ntt_poly_ty -> !tensor_ty - %t2 = arith.addi %t1, %t1 : !tensor_ty - // CHECK: return %[[RESULT]] : [[T]] - return %t2 : !tensor_ty -} - -#cycl_2048 = #polynomial.int_polynomial<1 + x**1024> -#ring = #polynomial.ring -!sub_ty = !polynomial.polynomial - -// CHECK-LABEL: test_canonicalize_sub -// CHECK-SAME: (%[[p0:.*]]: [[T:.*]], %[[p1:.*]]: [[T]]) -> [[T]] { -func.func @test_canonicalize_sub(%poly0 : !sub_ty, %poly1 : !sub_ty) -> !sub_ty { - %0 = polynomial.sub %poly0, %poly1 : !sub_ty - // CHECK: %[[minus_one:.+]] = 
arith.constant -1 : i32 - // CHECK: %[[p1neg:.+]] = polynomial.mul_scalar %[[p1]], %[[minus_one]] - // CHECK: [[ADD:%.+]] = polynomial.add %[[p0]], %[[p1neg]] - return %0 : !sub_ty -} - diff --git a/mlir/test/Dialect/Polynomial/ops.mlir b/mlir/test/Dialect/Polynomial/ops.mlir deleted file mode 100644 index faeb68a8b2c09..0000000000000 --- a/mlir/test/Dialect/Polynomial/ops.mlir +++ /dev/null @@ -1,112 +0,0 @@ -// RUN: mlir-opt %s | FileCheck %s - -// This simply tests for syntax. - -#my_poly = #polynomial.int_polynomial<1 + x**1024> -#my_poly_2 = #polynomial.int_polynomial<2> -#my_poly_3 = #polynomial.int_polynomial<3x> -#my_poly_4 = #polynomial.int_polynomial -#ring1 = #polynomial.ring -#ring2 = #polynomial.ring -#one_plus_x_squared = #polynomial.int_polynomial<1 + x**2> - -#ideal = #polynomial.int_polynomial<-1 + x**1024> -#ring = #polynomial.ring -!poly_ty = !polynomial.polynomial - -#ntt_poly = #polynomial.int_polynomial<-1 + x**8> -#ntt_ring = #polynomial.ring -!ntt_poly_ty = !polynomial.polynomial - -#ntt_poly_2 = #polynomial.int_polynomial<1 + x**65536> -#ntt_ring_2 = #polynomial.ring -#ntt_ring_2_root = #polynomial.primitive_root -!ntt_poly_ty_2 = !polynomial.polynomial - -module { - func.func @test_multiply() -> !polynomial.polynomial { - %c0 = arith.constant 0 : index - %two = arith.constant 2 : i16 - %five = arith.constant 5 : i16 - %coeffs1 = tensor.from_elements %two, %two, %five : tensor<3xi16> - %coeffs2 = tensor.from_elements %five, %five, %two : tensor<3xi16> - - %poly1 = polynomial.from_tensor %coeffs1 : tensor<3xi16> -> !polynomial.polynomial - %poly2 = polynomial.from_tensor %coeffs2 : tensor<3xi16> -> !polynomial.polynomial - - %3 = polynomial.mul %poly1, %poly2 : !polynomial.polynomial - - return %3 : !polynomial.polynomial - } - - func.func @test_elementwise(%p0 : !polynomial.polynomial, %p1: !polynomial.polynomial) { - %tp0 = tensor.from_elements %p0, %p1 : tensor<2x!polynomial.polynomial> - %tp1 = tensor.from_elements %p1, %p0 : 
tensor<2x!polynomial.polynomial> - - %c = arith.constant 2 : i32 - %mul_const_sclr = polynomial.mul_scalar %tp0, %c : tensor<2x!polynomial.polynomial>, i32 - - %add = polynomial.add %tp0, %tp1 : tensor<2x!polynomial.polynomial> - %sub = polynomial.sub %tp0, %tp1 : tensor<2x!polynomial.polynomial> - %mul = polynomial.mul %tp0, %tp1 : tensor<2x!polynomial.polynomial> - - return - } - - func.func @test_to_from_tensor(%p0 : !polynomial.polynomial) { - %c0 = arith.constant 0 : index - %two = arith.constant 2 : i16 - %coeffs1 = tensor.from_elements %two, %two : tensor<2xi16> - // CHECK: from_tensor - %poly = polynomial.from_tensor %coeffs1 : tensor<2xi16> -> !polynomial.polynomial - // CHECK: to_tensor - %tensor = polynomial.to_tensor %poly : !polynomial.polynomial -> tensor<1024xi16> - - return - } - - func.func @test_degree(%p0 : !polynomial.polynomial) { - %0, %1 = polynomial.leading_term %p0 : !polynomial.polynomial -> (index, i32) - return - } - - func.func @test_monomial() { - %deg = arith.constant 1023 : index - %five = arith.constant 5 : i16 - %0 = polynomial.monomial %five, %deg : (i16, index) -> !polynomial.polynomial - return - } - - func.func @test_monic_monomial_mul() { - %five = arith.constant 5 : index - %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial - %1 = polynomial.monic_monomial_mul %0, %five : (!polynomial.polynomial, index) -> !polynomial.polynomial - return - } - - func.func @test_constant() { - %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial - %1 = polynomial.constant int<1 + x**2> : !polynomial.polynomial - %2 = polynomial.constant float<1.5 + 0.5 x**2> : !polynomial.polynomial - - // Test verbose fallbacks - %verb0 = polynomial.constant #polynomial.typed_int_polynomial<1 + x**2> : !polynomial.polynomial - %verb2 = polynomial.constant #polynomial.typed_float_polynomial<1.5 + 0.5 x**2> : !polynomial.polynomial - return - } - - func.func @test_ntt(%0 : !ntt_poly_ty) { - %1 = polynomial.ntt %0 
{root=#polynomial.primitive_root} : !ntt_poly_ty -> tensor<8xi32, #ntt_ring> - return - } - - func.func @test_ntt_with_overflowing_root(%0 : !ntt_poly_ty_2) { - %1 = polynomial.ntt %0 {root=#ntt_ring_2_root} : !ntt_poly_ty_2 -> tensor<65536xi32, #ntt_ring_2> - return - } - - func.func @test_intt(%0 : tensor<8xi32, #ntt_ring>) { - %1 = polynomial.intt %0 {root=#polynomial.primitive_root} : tensor<8xi32, #ntt_ring> -> !ntt_poly_ty - return - } -} diff --git a/mlir/test/Dialect/Polynomial/ops_errors.mlir b/mlir/test/Dialect/Polynomial/ops_errors.mlir deleted file mode 100644 index 4937e17027afa..0000000000000 --- a/mlir/test/Dialect/Polynomial/ops_errors.mlir +++ /dev/null @@ -1,126 +0,0 @@ -// RUN: mlir-opt --split-input-file --verify-diagnostics %s - -#my_poly = #polynomial.int_polynomial<1 + x**1024> -#ring = #polynomial.ring -!ty = !polynomial.polynomial - -func.func @test_from_tensor_too_large_coeffs() { - %two = arith.constant 2 : i32 - %coeffs1 = tensor.from_elements %two, %two : tensor<2xi32> - // expected-error@below {{is too large to fit in the coefficients}} - // expected-note@below {{rescaled to fit}} - %poly = polynomial.from_tensor %coeffs1 : tensor<2xi32> -> !ty - return -} - -// ----- - -#my_poly = #polynomial.int_polynomial<1 + x**4> -#ring = #polynomial.ring -!ty = !polynomial.polynomial -func.func @test_from_tensor_wrong_tensor_type() { - %two = arith.constant 2 : i32 - %coeffs1 = tensor.from_elements %two, %two, %two, %two, %two : tensor<5xi32> - // expected-error@below {{input type 'tensor<5xi32>' does not match output type '!polynomial.polynomial>>'}} - // expected-note@below {{at most the degree of the polynomialModulus of the output type's ring attribute}} - %poly = polynomial.from_tensor %coeffs1 : tensor<5xi32> -> !ty - return -} - -// ----- - -#my_poly = #polynomial.int_polynomial<1 + x**4> -#ring = #polynomial.ring -!ty = !polynomial.polynomial -func.func @test_to_tensor_wrong_output_tensor_type(%arg0 : !ty) { - // expected-error@below 
{{input type '!polynomial.polynomial>>' does not match output type 'tensor<5xi32>'}} - // expected-note@below {{at most the degree of the polynomialModulus of the input type's ring attribute}} - %tensor = polynomial.to_tensor %arg0 : !ty -> tensor<5xi32> - return -} - -// ----- - -#my_poly = #polynomial.int_polynomial<1 + x**1024> -#ring = #polynomial.ring -!ty = !polynomial.polynomial - -func.func @test_mul_scalar_wrong_type(%arg0: !ty) -> !ty { - %scalar = arith.constant 2 : i32 // should be i16 - // expected-error@below {{polynomial coefficient type 'i16' does not match scalar type 'i32'}} - %poly = polynomial.mul_scalar %arg0, %scalar : !ty, i32 - return %poly : !ty -} - -// ----- - -#my_poly = #polynomial.int_polynomial<-1 + x**1024> -#ring = #polynomial.ring -!poly_ty = !polynomial.polynomial - -// CHECK-NOT: @test_invalid_ntt -// CHECK-NOT: polynomial.ntt -func.func @test_invalid_ntt(%0 : !poly_ty) { - // expected-error@below {{expects a ring encoding to be provided to the tensor}} - %1 = polynomial.ntt %0 {root=#polynomial.primitive_root} : !poly_ty -> tensor<1024xi32> - return -} - -// ----- - -#my_poly = #polynomial.int_polynomial<-1 + x**1024> -#ring = #polynomial.ring -!poly_ty = !polynomial.polynomial - -// CHECK-NOT: @test_invalid_ntt -// CHECK-NOT: polynomial.ntt -func.func @test_invalid_ntt(%0 : !poly_ty) { - // expected-error@below {{tensor encoding is not a ring attribute}} - %1 = polynomial.ntt %0 {root=#polynomial.primitive_root} : !poly_ty -> tensor<1024xi32, #my_poly> - return -} - -// ----- - -#my_poly = #polynomial.int_polynomial<-1 + x**1024> -#ring = #polynomial.ring -#ring1 = #polynomial.ring -!poly_ty = !polynomial.polynomial - -// CHECK-NOT: @test_invalid_intt -// CHECK-NOT: polynomial.intt -func.func @test_invalid_intt(%0 : tensor<1024xi32, #ring1>) { - // expected-error@below {{not equivalent to the polynomial ring}} - %1 = polynomial.intt %0 {root=#polynomial.primitive_root} : tensor<1024xi32, #ring1> -> !poly_ty - return -} - -// 
----- - -#my_poly = #polynomial.int_polynomial<-1 + x**1024> -#ring = #polynomial.ring -!poly_ty = !polynomial.polynomial - -// CHECK-NOT: @test_invalid_intt -// CHECK-NOT: polynomial.intt -func.func @test_invalid_intt(%0 : tensor<1025xi32, #ring>) { - // expected-error@below {{does not match output type}} - // expected-note@below {{exactly the degree of the polynomialModulus of the polynomial type's ring attribute}} - %1 = polynomial.intt %0 {root=#polynomial.primitive_root} : tensor<1025xi32, #ring> -> !poly_ty - return -} - -// ----- - -#my_poly = #polynomial.int_polynomial<-1 + x**8> -// A valid root is 31 -#ring = #polynomial.ring -!poly_ty = !polynomial.polynomial - -// CHECK-NOT: @test_invalid_intt -// CHECK-NOT: polynomial.intt -func.func @test_invalid_intt(%0 : tensor<8xi32, #ring>) { - // expected-error@below {{provided root 32 is not a primitive root of unity mod 256, with the specified degree 8}} - %1 = polynomial.intt %0 {root=#polynomial.primitive_root} : tensor<8xi32, #ring> -> !poly_ty - return -} diff --git a/mlir/test/Dialect/Polynomial/types.mlir b/mlir/test/Dialect/Polynomial/types.mlir deleted file mode 100644 index dcc5663ceb84c..0000000000000 --- a/mlir/test/Dialect/Polynomial/types.mlir +++ /dev/null @@ -1,65 +0,0 @@ -// RUN: mlir-opt %s | FileCheck %s - -// CHECK-LABEL: func @test_types -// CHECK-SAME: !polynomial.polynomial< -// CHECK-SAME: ring = < -// CHECK-SAME: coefficientType = i32, -// CHECK-SAME: coefficientModulus = 2837465 : i32, -// CHECK-SAME: polynomialModulus = <1 + x**1024>>> -#my_poly = #polynomial.int_polynomial<1 + x**1024> -#ring1 = #polynomial.ring -!ty = !polynomial.polynomial -func.func @test_types(%0: !ty) -> !ty { - return %0 : !ty -} - - -// CHECK-LABEL: func @test_non_x_variable_64_bit -// CHECK-SAME: !polynomial.polynomial< -// CHECK-SAME: ring = < -// CHECK-SAME: coefficientType = i64, -// CHECK-SAME: coefficientModulus = 2837465 : i64, -// CHECK-SAME: polynomialModulus = <2 + 4x + x**3>>> -#my_poly_2 = 
#polynomial.int_polynomial -#ring2 = #polynomial.ring -!ty2 = !polynomial.polynomial -func.func @test_non_x_variable_64_bit(%0: !ty2) -> !ty2 { - return %0 : !ty2 -} - - -// CHECK-LABEL: func @test_linear_poly -// CHECK-SAME: !polynomial.polynomial< -// CHECK-SAME: ring = < -// CHECK-SAME: coefficientType = i32, -// CHECK-SAME: coefficientModulus = 12 : i32, -// CHECK-SAME: polynomialModulus = <4x>> -#my_poly_3 = #polynomial.int_polynomial<4x> -#ring3 = #polynomial.ring -!ty3 = !polynomial.polynomial -func.func @test_linear_poly(%0: !ty3) -> !ty3 { - return %0 : !ty3 -} - -// CHECK-LABEL: func @test_negative_leading_1 -// CHECK-SAME: !polynomial.polynomial< -// CHECK-SAME: ring = < -// CHECK-SAME: coefficientType = i32, -// CHECK-SAME: coefficientModulus = 2837465 : i32, -// CHECK-SAME: polynomialModulus = <-1 + x**1024>>> -#my_poly_4 = #polynomial.int_polynomial<-1 + x**1024> -#ring4 = #polynomial.ring -!ty4 = !polynomial.polynomial -func.func @test_negative_leading_1(%0: !ty4) -> !ty4 { - return %0 : !ty4 -} - -// CHECK-LABEL: func @test_float_coefficients -// CHECK-SAME: !polynomial.polynomial> -#my_poly_5 = #polynomial.float_polynomial<0.5 + 1.6e03 x**1024> -#ring5 = #polynomial.ring -!ty5 = !polynomial.polynomial -func.func @test_float_coefficients(%0: !ty5) -> !ty5 { - return %0 : !ty5 -} - diff --git a/mlir/test/Dialect/SCF/bufferize.mlir b/mlir/test/Dialect/SCF/bufferize.mlir index 6c08d9f68e8a9..20a640776b561 100644 --- a/mlir/test/Dialect/SCF/bufferize.mlir +++ b/mlir/test/Dialect/SCF/bufferize.mlir @@ -4,8 +4,8 @@ // CHECK-SAME: %[[PRED:.*]]: i1, // CHECK-SAME: %[[TRUE_TENSOR:.*]]: tensor, // CHECK-SAME: %[[FALSE_TENSOR:.*]]: tensor) -> tensor { -// CHECK-DAG: %[[TRUE_MEMREF:.*]] = bufferization.to_memref %[[TRUE_TENSOR]] : tensor to memref -// CHECK-DAG: %[[FALSE_MEMREF:.*]] = bufferization.to_memref %[[FALSE_TENSOR]] : tensor to memref +// CHECK-DAG: %[[TRUE_MEMREF:.*]] = bufferization.to_buffer %[[TRUE_TENSOR]] : tensor to memref +// CHECK-DAG: 
%[[FALSE_MEMREF:.*]] = bufferization.to_buffer %[[FALSE_TENSOR]] : tensor to memref // CHECK: %[[RESULT_MEMREF:.*]] = scf.if %[[PRED]] -> (memref) { // CHECK: scf.yield %[[TRUE_MEMREF]] : memref // CHECK: } else { @@ -29,7 +29,7 @@ func.func @if(%pred: i1, %true_val: tensor, %false_val: tensor) -> // CHECK-SAME: %[[TENSOR:.*]]: tensor, // CHECK-SAME: %[[LB:.*]]: index, %[[UB:.*]]: index, // CHECK-SAME: %[[STEP:.*]]: index) -> tensor { -// CHECK: %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor to memref +// CHECK: %[[MEMREF:.*]] = bufferization.to_buffer %[[TENSOR]] : tensor to memref // Note: scf.for iter_args always bufferize to a memory write. This could be // optimized by analyzing the loop body. // CHECK: %[[MEMREF_COPY:.*]] = memref.alloc() @@ -70,7 +70,7 @@ func.func @if_correct_recursive_legalization_behavior(%pred: i1, %tensor: tensor // CHECK-LABEL: func @for_correct_recursive_legalization_behavior( // CHECK-SAME: %[[TENSOR:.*]]: tensor, // CHECK-SAME: %[[INDEX:.*]]: index) -> tensor { -// CHECK: %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor to memref +// CHECK: %[[MEMREF:.*]] = bufferization.to_buffer %[[TENSOR]] : tensor to memref // Note: scf.for iter_args always bufferize to a memory write. This could be // optimized by analyzing the loop body. 
// CHECK: %[[MEMREF_COPY:.*]] = memref.alloc() @@ -78,7 +78,7 @@ func.func @if_correct_recursive_legalization_behavior(%pred: i1, %tensor: tensor // CHECK: %[[RESULT:.*]] = scf.for %{{.*}} = %[[INDEX]] to %[[INDEX]] step %[[INDEX]] iter_args(%[[MEMREF_ITER:.*]] = %[[MEMREF_COPY]]) -> (memref) { // CHECK: %[[TENSOR_ITER:.*]] = bufferization.to_tensor %[[MEMREF_ITER]] : memref // CHECK: %[[TENSOR_MUNGED:.*]] = "test.munge_tensor"(%[[TENSOR_ITER]]) : (tensor) -> tensor -// CHECK: %[[MEMREF_MUNGED:.*]] = bufferization.to_memref %[[TENSOR_MUNGED]] : tensor to memref +// CHECK: %[[MEMREF_MUNGED:.*]] = bufferization.to_buffer %[[TENSOR_MUNGED]] : tensor to memref // CHECK: scf.yield %[[MEMREF_MUNGED]] : memref // CHECK: } // CHECK: %[[TENSOR:.*]] = bufferization.to_tensor %[[RESULT]] : memref @@ -96,7 +96,7 @@ func.func @for_correct_recursive_legalization_behavior(%arg0: tensor, %inde // CHECK-LABEL: func @bufferize_while( // CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: i64, %[[ARG2:.*]]: tensor -// CHECK: %[[M:.*]] = bufferization.to_memref %[[ARG2]] : tensor to memref +// CHECK: %[[M:.*]] = bufferization.to_buffer %[[ARG2]] : tensor to memref // Note: scf.while iter_args always bufferize to a memory write. This could be // optimized by analyzing the loop body. 
// CHECK: %[[MEMREF_COPY:.*]] = memref.alloc() diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-encodings.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-encodings.mlir index 709943e596585..6b6207395f14e 100644 --- a/mlir/test/Dialect/SCF/one-shot-bufferize-encodings.mlir +++ b/mlir/test/Dialect/SCF/one-shot-bufferize-encodings.mlir @@ -13,14 +13,14 @@ func.func @scf_for_iter_arg(%arg0: tensor<128xf32, 1>, %arg1: index, %arg2: inde // CHECK-LABEL: func.func @scf_for_iter_arg // CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32, 1 : i64>, %[[arg1:.+]]: index, %[[arg2:.+]]: index, %[[arg3:.+]]: index) -// CHECK: %[[v0:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> +// CHECK: %[[v0:.+]] = bufferization.to_buffer %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> // CHECK: %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 1> // CHECK: memref.copy %[[v0]], %[[alloc]] : memref<128xf32, strided<[?], offset: ?>, 1> to memref<128xf32, 1> // CHECK: %[[cast:.+]] = memref.cast %[[alloc]] : memref<128xf32, 1> to memref<128xf32, strided<[?], offset: ?>, 1> // CHECK: %[[v1:.+]] = scf.for %{{.+}} = %[[arg1]] to %[[arg2]] step %[[arg3]] iter_args(%[[arg6:.+]] = %[[cast]]) -> (memref<128xf32, strided<[?], offset: ?>, 1>) // CHECK-NEXT: %[[v3:.+]] = bufferization.to_tensor %[[arg6]] : memref<128xf32, strided<[?], offset: ?>, 1> to tensor<128xf32, 1 : i64> // CHECK-NEXT: %[[v4:.+]] = "some.use"(%[[v3]]) : (tensor<128xf32, 1 : i64>) -> tensor<128xf32, 1 : i64> -// CHECK-NEXT: %[[v5:.+]] = bufferization.to_memref %[[v4]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> +// CHECK-NEXT: %[[v5:.+]] = bufferization.to_buffer %[[v4]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> // CHECK-NEXT: scf.yield %[[v5]] : memref<128xf32, strided<[?], offset: ?>, 1> // CHECK: %[[v2:.+]] = bufferization.to_tensor %[[v1]] : 
memref<128xf32, strided<[?], offset: ?>, 1> to tensor<128xf32, 1 : i64> // CHECK: return %[[v2]] : tensor<128xf32, 1 : i64> @@ -49,7 +49,7 @@ func.func @scf_forall( // CHECK: scf.forall // CHECK: %[[v2:.+]] = bufferization.to_tensor %{{.+}} : memref to tensor // CHECK: %[[v3:.+]] = "some.use"(%[[v2]]) : (tensor) -> tensor -// CHECK: bufferization.to_memref %[[v3]] : tensor to memref, 1> +// CHECK: bufferization.to_buffer %[[v3]] : tensor to memref, 1> // CHECK: %[[v1:.+]] = bufferization.to_tensor %{{.+}} : memref to tensor // CHECK: return %[[v1]] : tensor @@ -65,7 +65,7 @@ func.func @scf_execute_region(%arg0: tensor<128xf32, 1>) -> tensor<128xf32, 1> { // CHECK-LABEL: func.func @scf_execute_region // CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32, 1 : i64>) -// CHECK: %[[v0:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> +// CHECK: %[[v0:.+]] = bufferization.to_buffer %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1> // CHECK: %[[v1:.+]] = scf.execute_region -> memref<128xf32, strided<[?], offset: ?>, 1> // CHECK: scf.yield %[[v0]] : memref<128xf32, strided<[?], offset: ?>, 1> // CHECK: %[[v2:.+]] = bufferization.to_tensor %[[v1]] : memref<128xf32, strided<[?], offset: ?>, 1> to tensor<128xf32, 1 : i64> diff --git a/mlir/test/Dialect/Shape/bufferize.mlir b/mlir/test/Dialect/Shape/bufferize.mlir index 02e147d917d0f..f6788f845a833 100644 --- a/mlir/test/Dialect/Shape/bufferize.mlir +++ b/mlir/test/Dialect/Shape/bufferize.mlir @@ -6,7 +6,7 @@ // CHECK: %[[WTRUE:.*]] = shape.const_witness true // CHECK: %[[MEMREF:.*]] = shape.assuming %[[WTRUE]] -> (memref<2xf16>) { // CHECK: %[[TENSOR_VAL:.*]] = "test.source"() : () -> tensor<2xf16> -// CHECK: %[[YIELDED_MEMREF:.*]] = bufferization.to_memref %[[TENSOR_VAL]] : tensor<2xf16> to memref<2xf16> +// CHECK: %[[YIELDED_MEMREF:.*]] = bufferization.to_buffer %[[TENSOR_VAL]] : tensor<2xf16> to memref<2xf16> // CHECK: 
shape.assuming_yield %[[YIELDED_MEMREF]] : memref<2xf16> // CHECK: } // CHECK: %[[TENSOR:.*]] = bufferization.to_tensor %[[MEMREF:.*]] : memref<2xf16> diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir index 6d98667e77563..e20345f27b11a 100644 --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir @@ -14,19 +14,19 @@ // CHECK-SAME: %[[VAL_2:.*2]]: tensor) -> tensor { // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_5:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK: %[[VAL_5:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK: %[[VAL_6:.*]] = gpu.wait async // CHECK: %[[VAL_7:.*]] = memref.dim %[[VAL_5]], %[[VAL_3]] : memref // CHECK: %[[VAL_8:.*]] = memref.dim %[[VAL_5]], %[[VAL_4]] : memref // CHECK: %[[VAL_9:.*]], %[[VAL_10:.*]] = gpu.alloc async {{\[}}%[[VAL_6]]] (%[[VAL_7]], %[[VAL_8]]) : memref // CHECK: %[[VAL_11:.*]] = gpu.memcpy async {{\[}}%[[VAL_10]]] %[[VAL_9]], %[[VAL_5]] : memref, memref -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref +// CHECK: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref // CHECK: %[[VAL_13:.*]] = gpu.wait async // CHECK: %[[VAL_14:.*]] = memref.dim %[[VAL_12]], %[[VAL_3]] : memref // CHECK: %[[VAL_15:.*]] = memref.dim %[[VAL_12]], %[[VAL_4]] : memref // CHECK: %[[VAL_16:.*]], %[[VAL_17:.*]] = gpu.alloc async {{\[}}%[[VAL_13]]] (%[[VAL_14]], %[[VAL_15]]) : memref // CHECK: %[[VAL_18:.*]] = gpu.memcpy async {{\[}}%[[VAL_17]]] %[[VAL_16]], %[[VAL_12]] : memref, memref -// CHECK: %[[VAL_19:.*]] = bufferization.to_memref %[[VAL_2]] : tensor to memref +// CHECK: %[[VAL_19:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor to memref // CHECK: %[[VAL_20:.*]] = gpu.wait async // CHECK: %[[VAL_21:.*]] = memref.dim 
%[[VAL_19]], %[[VAL_3]] : memref // CHECK: %[[VAL_22:.*]] = memref.dim %[[VAL_19]], %[[VAL_4]] : memref diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir index 63c308a3d5e6f..01906f4c45171 100644 --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir @@ -30,13 +30,13 @@ // CHECK: %[[VAL_23:.*]] = memref.dim %[[VAL_11]], %[[VAL_3]] : memref // CHECK: %[[VAL_24:.*]], %[[VAL_25:.*]] = gpu.alloc async {{\[}}%[[VAL_22]]] (%[[VAL_23]]) : memref // CHECK: %[[VAL_26:.*]] = gpu.memcpy async {{\[}}%[[VAL_25]]] %[[VAL_24]], %[[VAL_11]] : memref, memref -// CHECK: %[[VAL_27:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref +// CHECK: %[[VAL_27:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref // CHECK: %[[VAL_28:.*]] = gpu.wait async // CHECK: %[[VAL_29:.*]] = memref.dim %[[VAL_27]], %[[VAL_3]] : memref // CHECK: %[[VAL_30:.*]] = memref.dim %[[VAL_27]], %[[VAL_4]] : memref // CHECK: %[[VAL_31:.*]], %[[VAL_32:.*]] = gpu.alloc async {{\[}}%[[VAL_28]]] (%[[VAL_29]], %[[VAL_30]]) : memref // CHECK: %[[VAL_33:.*]] = gpu.memcpy async {{\[}}%[[VAL_32]]] %[[VAL_31]], %[[VAL_27]] : memref, memref -// CHECK: %[[VAL_34:.*]] = bufferization.to_memref %[[VAL_2]] : tensor to memref +// CHECK: %[[VAL_34:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor to memref // CHECK: %[[VAL_35:.*]] = gpu.wait async // CHECK: %[[VAL_36:.*]] = memref.dim %[[VAL_34]], %[[VAL_3]] : memref // CHECK: %[[VAL_37:.*]] = memref.dim %[[VAL_34]], %[[VAL_4]] : memref diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir index 088e468cee795..dea71fa03c777 100644 --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir @@ -30,12 +30,12 @@ module { // CHECK: %[[VAL_22:.*]] = memref.dim %[[VAL_10]], %[[VAL_3]] : memref // 
CHECK: %[[VAL_23:.*]], %[[VAL_24:.*]] = gpu.alloc async {{\[}}%[[VAL_21]]] (%[[VAL_22]]) : memref // CHECK: %[[VAL_25:.*]] = gpu.memcpy async {{\[}}%[[VAL_24]]] %[[VAL_23]], %[[VAL_10]] : memref, memref -// CHECK: %[[VAL_26:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref +// CHECK: %[[VAL_26:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref // CHECK: %[[VAL_27:.*]] = gpu.wait async // CHECK: %[[VAL_28:.*]] = memref.dim %[[VAL_26]], %[[VAL_3]] : memref // CHECK: %[[VAL_29:.*]], %[[VAL_30:.*]] = gpu.alloc async {{\[}}%[[VAL_27]]] (%[[VAL_28]]) : memref // CHECK: %[[VAL_31:.*]] = gpu.memcpy async {{\[}}%[[VAL_30]]] %[[VAL_29]], %[[VAL_26]] : memref, memref -// CHECK: %[[VAL_32:.*]] = bufferization.to_memref %[[VAL_2]] : tensor to memref +// CHECK: %[[VAL_32:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor to memref // CHECK: %[[VAL_33:.*]] = gpu.wait async // CHECK: %[[VAL_34:.*]] = memref.dim %[[VAL_32]], %[[VAL_3]] : memref // CHECK: %[[VAL_35:.*]], %[[VAL_36:.*]] = gpu.alloc async {{\[}}%[[VAL_33]]] (%[[VAL_34]]) : memref diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir index 1058bc03fe9cb..6675df2be0c53 100644 --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir @@ -28,11 +28,11 @@ // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 8 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK: %[[VAL_5:.*]] = sparse_tensor.number_of_entries %[[VAL_0]] : tensor<8x8xf64, #sparse{{[0-9]*}}> -// CHECK: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64> +// CHECK: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64> // CHECK: %[[VAL_7:.*]] = gpu.wait async // CHECK: %[[VAL_8:.*]], %[[VAL_9:.*]] = gpu.alloc async {{\[}}%[[VAL_7]]] () : memref<8x8xf64> // CHECK: %[[VAL_10:.*]] = gpu.memcpy async 
{{\[}}%[[VAL_9]]] %[[VAL_8]], %[[VAL_6]] : memref<8x8xf64>, memref<8x8xf64> -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<8x8xf64> to memref<8x8xf64> +// CHECK: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<8x8xf64> to memref<8x8xf64> // CHECK: %[[VAL_12:.*]] = gpu.wait async // CHECK: %[[VAL_13:.*]], %[[VAL_14:.*]] = gpu.alloc async {{\[}}%[[VAL_12]]] () : memref<8x8xf64> // CHECK: %[[VAL_15:.*]] = gpu.memcpy async {{\[}}%[[VAL_14]]] %[[VAL_13]], %[[VAL_11]] : memref<8x8xf64>, memref<8x8xf64> diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir index 32741086b9e6e..7b7657a0e9ba5 100644 --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir @@ -30,13 +30,13 @@ // CHECK: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor // CHECK: %[[VAL_9:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor // CHECK: %[[VAL_10:.*]] = tensor.dim %[[VAL_2]], %[[VAL_4]] : tensor -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref +// CHECK: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref // CHECK: %[[VAL_12:.*]] = gpu.wait async // CHECK: %[[VAL_13:.*]] = memref.dim %[[VAL_11]], %[[VAL_3]] : memref // CHECK: %[[VAL_14:.*]] = memref.dim %[[VAL_11]], %[[VAL_4]] : memref // CHECK: %[[VAL_15:.*]], %[[VAL_16:.*]] = gpu.alloc async {{\[}}%[[VAL_12]]] (%[[VAL_13]], %[[VAL_14]]) : memref // CHECK: %[[VAL_17:.*]] = gpu.memcpy async {{\[}}%[[VAL_16]]] %[[VAL_15]], %[[VAL_11]] : memref, memref -// CHECK: %[[VAL_18:.*]] = bufferization.to_memref %[[VAL_2]] : tensor to memref +// CHECK: %[[VAL_18:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor to memref // CHECK: %[[VAL_19:.*]] = gpu.wait async // CHECK: %[[VAL_20:.*]] = memref.dim %[[VAL_18]], %[[VAL_3]] : memref // CHECK: %[[VAL_21:.*]] = memref.dim %[[VAL_18]], %[[VAL_4]] : memref diff --git 
a/mlir/test/Dialect/SparseTensor/constant_index_map.mlir b/mlir/test/Dialect/SparseTensor/constant_index_map.mlir index 857967bcf521a..cf1eb3e9e44f5 100644 --- a/mlir/test/Dialect/SparseTensor/constant_index_map.mlir +++ b/mlir/test/Dialect/SparseTensor/constant_index_map.mlir @@ -14,8 +14,8 @@ // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_5:.*]] = tensor.empty() : tensor<77xi1, #{{.*}}> -// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<1x77xi1> -// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<1x77xi1> +// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor<1x77xi1> +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<1x77xi1> // CHECK: %[[VAL_8:.*]] = scf.for %[[VAL_9:.*]] = %[[VAL_3]] to %[[VAL_2]] step %[[VAL_4]] iter_args(%[[VAL_10:.*]] = %[[VAL_5]]) -> (tensor<77xi1, #{{.*}}>) { // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]], %[[VAL_9]]] : memref<1x77xi1> // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_3]], %[[VAL_9]]] : memref<1x77xi1> diff --git a/mlir/test/Dialect/SparseTensor/dense.mlir b/mlir/test/Dialect/SparseTensor/dense.mlir index 5ed1558a53163..c7022706f1e05 100644 --- a/mlir/test/Dialect/SparseTensor/dense.mlir +++ b/mlir/test/Dialect/SparseTensor/dense.mlir @@ -40,7 +40,7 @@ // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index // CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK: scf.for %[[VAL_9:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK: %[[VAL_11:.*]] = arith.muli %[[VAL_9]], %[[VAL_4]] : index // CHECK: 
scf.for %[[VAL_10:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_6]] { @@ -79,7 +79,7 @@ func.func @dense1(%arga: tensor<32x16xf32, #DenseMatrix>, // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK: scf.for %[[VAL_9:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK: %[[VAL_11:.*]] = arith.muli %[[VAL_9]], %[[VAL_4]] : index @@ -122,7 +122,7 @@ func.func @dense2(%arga: tensor<32x16xf32>, // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK: scf.for %[[VAL_9:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK: %[[VAL_11:.*]] = arith.muli %[[VAL_9]], %[[VAL_4]] : index diff --git a/mlir/test/Dialect/SparseTensor/fuse_sparse_pad_with_consumer.mlir b/mlir/test/Dialect/SparseTensor/fuse_sparse_pad_with_consumer.mlir index 275f7f2ff25f7..d828afe13c622 100644 --- a/mlir/test/Dialect/SparseTensor/fuse_sparse_pad_with_consumer.mlir +++ b/mlir/test/Dialect/SparseTensor/fuse_sparse_pad_with_consumer.mlir @@ -30,7 +30,7 @@ // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<4x4xf32, #sparse> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.coordinates 
%[[VAL_0]] {level = 1 : index} : tensor<4x4xf32, #sparse> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<4x4xf32, #sparse> to memref -// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_10]] : +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_10]] : // CHECK-DAG: linalg.fill ins(%[[VAL_8]] : f32) outs(%[[VAL_14]] : memref<8x8xf32>) // CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_5]] { // CHECK: %[[VAL_16:.*]] = arith.subi %[[VAL_15]], %[[VAL_7]] : index diff --git a/mlir/test/Dialect/SparseTensor/sorted_coo.mlir b/mlir/test/Dialect/SparseTensor/sorted_coo.mlir index 58f182dbdc44d..81d300e851ec1 100644 --- a/mlir/test/Dialect/SparseTensor/sorted_coo.mlir +++ b/mlir/test/Dialect/SparseTensor/sorted_coo.mlir @@ -101,7 +101,7 @@ func.func @sparse_scale(%argx: tensor) -> tensor to memref> // C_HECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}> to memref> // C_HECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse{{[0-9]*}}> to memref -// C_HECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64> +// C_HECK: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xf64> to memref<32xf64> // C_HECK: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // C_HECK: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref // C_HECK: %[[VAL_13:.*]] = scf.while (%[[VAL_14:.*]] = %[[VAL_11]]) : (index) -> index { @@ -170,7 +170,7 @@ func.func @matvec(%arga: tensor<32x64xf64, #SortedCOO>, // C_HECK-DAG: %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}> to memref> // C_HECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}> to memref> // C_HECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_1]] : 
tensor<32x64xf64, #sparse{{[0-9]*}}> to memref -// C_HECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x64xf64> to memref<32x64xf64> +// C_HECK: %[[VAL_15:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x64xf64> to memref<32x64xf64> // C_HECK: linalg.fill ins(%[[VAL_4]] : f64) outs(%[[VAL_15]] : memref<32x64xf64>) // C_HECK: %[[VAL_16:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref // C_HECK: %[[VAL_17:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_1d.mlir b/mlir/test/Dialect/SparseTensor/sparse_1d.mlir index 003dcc6708d63..a2f3f7704ddde 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_1d.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_1d.mlir @@ -21,7 +21,7 @@ // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_2]] +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_2]] // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_8]] : memref<32xf32>) // CHECK: scf.for %[[VAL_9:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_9]]] : memref @@ -51,7 +51,7 @@ func.func @add_d(%arga: tensor<32xf32, #DV>, %argb: f32, %argx: tensor<32xf32>) // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK: %[[VAL_INITTENSOR:.*]] = tensor.empty() : tensor<32xf32> // CHECK: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_INITTENSOR]] : tensor<32xf32> to memref<32xf32> +// CHECK: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_INITTENSOR]] : tensor<32xf32> to memref<32xf32> // CHECK: linalg.fill ins(%[[VAL_3]] : f32) outs(%[[VAL_7]] : memref<32xf32>) // CHECK: scf.for %[[VAL_8:.*]] = 
%[[VAL_4]] to %[[VAL_2]] step %[[VAL_5]] { // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_8]]] : memref @@ -81,7 +81,7 @@ func.func @add_d_init(%arga: tensor<32xf32, #DV>, %argb: f32) -> tensor<32xf32> // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_2]] +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_2]] // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_8]] : memref<32xf32>) // CHECK: scf.for %[[VAL_9:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_9]]] : memref @@ -115,7 +115,7 @@ func.func @mul_d(%arga: tensor<32xf32, #DV>, %argb: f32, %argx: tensor<32xf32>) // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref // CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32xf32>) // CHECK: %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { // CHECK: %[[VAL_17:.*]] = arith.cmpi ult, %[[VAL_15]], %[[VAL_13]] : index @@ -165,7 +165,7 @@ func.func @add_s(%arga: tensor<32xf32, #SV>, %argb: f32, %argx: tensor<32xf32>) // CHECK-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = 
sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] // CHECK-DAG: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK-DAG: %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_8]] : memref<32xf32>) @@ -205,7 +205,7 @@ func.func @repeated_add_s(%arga: tensor<32xf32, #SV>, %argx: tensor<32xf32>) -> // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_2]] // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] : memref<32xf32>) // CHECK-DAG: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -247,8 +247,8 @@ func.func @mul_s(%arga: tensor<32xf32, #SV>, %argb: f32, %argx: tensor<32xf32>) // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf32> to memref<32xf32> -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf32> to memref<32xf32> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_2]] // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] 
: memref<32xf32>) // CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref @@ -278,8 +278,8 @@ func.func @add_dd(%arga: tensor<32xf32, #DV>, %argb: tensor<32xf32>, %argx: tens // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf32> to memref<32xf32> -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf32> to memref<32xf32> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_2]] // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] : memref<32xf32>) // CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref @@ -309,11 +309,11 @@ func.func @mul_dd(%arga: tensor<32xf32, #DV>, %argb: tensor<32xf32>, %argx: tens // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant true // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<32xf32> to memref<32xf32> +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor<32xf32> to memref<32xf32> // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] +// CHECK-DAG: %[[VAL_12:.*]] = 
bufferization.to_buffer %[[VAL_2]] // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) // CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref // CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref @@ -366,11 +366,11 @@ func.func @add_ds(%arga: tensor<32xf32>, %argb: tensor<32xf32, #SV>, %argx: tens // CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[VAL_5:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<32xf32> to memref<32xf32> +// CHECK-DAG: %[[VAL_5:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor<32xf32> to memref<32xf32> // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_2]] // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>) // CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref // CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref @@ -406,8 +406,8 @@ func.func @mul_ds(%arga: tensor<32xf32>, %argb: tensor<32xf32, #SV>, %argx: tens // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: 
%[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf32> to memref<32xf32> -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf32> to memref<32xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) // CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref // CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref @@ -463,8 +463,8 @@ func.func @add_sd(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32>, %argx: tens // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf32> to memref<32xf32> -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf32> to memref<32xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_2]] // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>) // CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -500,7 +500,7 @@ func.func @mul_sd(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32>, %argx: tens // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: 
%[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) // CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -582,7 +582,7 @@ func.func @add_ss(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32, #SV>, %argx: // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>) // CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -643,7 +643,7 @@ func.func @mul_ss(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32, #SV>, %argx: // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_3]] // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : 
memref<16xf32>) // CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -735,7 +735,7 @@ func.func @two_way_inv(%arga: tensor<16xf32, #SV>, %argb: tensor<16xf32, #SV>, % // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_3]] // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>) // CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -830,7 +830,7 @@ func.func @two_way_inv_alt(%arga: tensor<16xf32, #SV>, // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor to memref // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_0]] : tensor to memref -// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref +// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref // CHECK-DAG: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK-DAG: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref // CHECK-DAG: %[[VAL_10:.*]] = memref.load %[[VAL_6]][] : memref @@ -875,7 +875,7 @@ func.func @sum_reduction(%arga: tensor, %argx: tensor) -> tenso // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = 
sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor to memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor to memref // CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_11]][] : memref // CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -977,11 +977,11 @@ func.func @sum_reduction_ss(%arga: tensor<16xf32, #SV>, // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_2]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_2]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] : tensor to memref +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_3]] : tensor to memref // CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_13]][] : memref // CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]][] : memref // CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref @@ -1089,16 
+1089,16 @@ func.func @sum_reduction_inv(%arga: tensor<16xf32, #SV>, // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_6:.*]] = arith.constant true // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor to memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.positions %[[VAL_3]] {level = 0 : index} : tensor to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.coordinates %[[VAL_3]] {level = 0 : index} : tensor to memref // CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.values %[[VAL_3]] : tensor to memref // CHECK-DAG: %[[VAL_16:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor -// CHECK-DAG: %[[VAL_18:.*]] = bufferization.to_memref %[[VAL_4]] +// CHECK-DAG: %[[VAL_18:.*]] = bufferization.to_buffer %[[VAL_4]] // CHECK-DAG: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_18]] : memref) // CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref // CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref @@ -1272,7 +1272,7 @@ func.func @four_tensors_op(%arga: tensor, // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_2]] {level = 0 : index} : tensor to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_2]] {level = 0 : index} : tensor to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_2]] : tensor to memref -// CHECK-DAG: %[[VAL_15:.*]] = 
bufferization.to_memref %[[VAL_3]] : tensor to memref +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_buffer %[[VAL_3]] : tensor to memref // CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_15]][] : memref // CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_2d.mlir b/mlir/test/Dialect/SparseTensor/sparse_2d.mlir index 9c34e54db6c85..faf6404a96564 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_2d.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_2d.mlir @@ -25,8 +25,8 @@ // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32x16xf32>) // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_11]], %[[VAL_4]] : index @@ -62,8 +62,8 @@ func.func @add_dd(%arga: tensor<32x16xf32, #Tdd>, %argb: tensor<32x16xf32>, %arg // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref 
%[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1> // CHECK: linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_10]] : memref<32x16xi1>) // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] { // CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_11]], %[[VAL_4]] : index @@ -98,8 +98,8 @@ func.func @cmp_dd(%arga: tensor<32x16xf32, #Tdd>, %argb: tensor<32x16xf32>, %arg // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32x16xf32>) // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_11]], %[[VAL_4]] : index @@ -137,8 +137,8 @@ func.func @mul_dd(%arga: tensor<32x16xf32, #Tdd>, %argb: tensor<32x16xf32>, %arg // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_11:.*]] = 
bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<32x16xf32>) // CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_7]] { // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_14]]] : memref @@ -202,8 +202,8 @@ func.func @add_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>, %arg // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> -// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1> // CHECK-DAG: linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_14]] : memref<32x16xi1>) // CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] { // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_15]]] : memref @@ -265,8 +265,8 @@ func.func @cmp_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>, %arg // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, 
#sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32x16xf32>) // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref @@ -306,8 +306,8 @@ func.func @mul_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>, %arg // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<32x16xf32>) // CHECK: %[[VAL_14:.*]] = memref.load 
%[[VAL_8]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_7]]] : memref @@ -376,8 +376,8 @@ func.func @add_sd(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32>, %arg // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> -// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1> // CHECK-DAG: linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_14]] : memref<32x16xi1>) // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_7]]] : memref @@ -444,8 +444,8 @@ func.func @cmp_sd(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32>, %arg // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_9:.*]] = 
bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32x16xf32>) // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -488,8 +488,8 @@ func.func @mul_sd(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32>, %arg // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> -// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_15]] : memref<32x16xf32>) // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_7]]] : memref @@ -584,8 +584,8 @@ func.func @add_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32>, %arg // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values 
%[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> -// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1> +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1> // CHECK-DAG: linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_16]] : memref<32x16xi1>) // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_7]]] : memref @@ -679,8 +679,8 @@ func.func @cmp_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32>, %arg // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32x16xf32>) // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -726,7 +726,7 @@ func.func @mul_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32>, %arg // CHECK-DAG: 
%[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_16]] : memref<32x16xf32>) // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -891,7 +891,7 @@ func.func @add_ss_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32, #T // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_16:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1> +// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1> // CHECK-DAG: linalg.fill ins(%[[VAL_3]] : i1) outs(%[[VAL_17]] : memref<32x16xi1>) // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref @@ -1166,7 +1166,7 @@ func.func @sub_ss_batched(%0: tensor<2x3xf64, #BatchedVector>, %1: tensor<2x3xf6 // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // 
CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_16]] : memref<32x16xf32>) // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -1260,7 +1260,7 @@ func.func @mul_ss_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32, #T // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_15]] : memref<32x16xf32>) // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_7]]] : memref @@ -1362,7 +1362,7 @@ func.func @add_sd_ds(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32, #T // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> 
to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<32x16xf32>) // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -1415,8 +1415,8 @@ func.func @mul_sd_ds(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32, #T // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<16x32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<16x32xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16x32xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf32> to memref<32xf32> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<16xf32> to memref<16xf32> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf32> to memref<32xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<16xf32> to memref<16xf32> // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref // CHECK-DAG: %[[VAL_14:.*]] = arith.addi %[[VAL_12]], %[[VAL_5]] : index @@ -1464,7 +1464,7 @@ func.func @matvec(%argA: tensor<16x32xf32, #Tds>, %argb: tensor<32xf32>, %argx: // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<10x20xf32, #sparse{{[0-9]*}}> to memref // 
CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref // CHECK: %[[VAL_10:.*]] = scf.for %[[VAL_11:.*]] = %[[VAL_4]] to %[[VAL_2]] step %[[VAL_3]] iter_args(%[[VAL_12:.*]] = %[[VAL_9]]) -> (f32) { // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_11]]] : memref @@ -1511,7 +1511,7 @@ func.func @sum_reduction(%arga: tensor<10x20xf32, #Tds>, %argx: tensor) -> // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.lvl %[[VAL_0]], %[[VAL_3]] : tensor -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref // CHECK-DAG: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_11]] : memref) // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] { // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_12]]] : memref @@ -1563,9 +1563,9 @@ func.func @scale(%arga: tensor, %argx: tensor) -> tensor // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor to memref // CHECK-DAG: %[[VAL_11:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor to memref -// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_3]] : tensor to memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to 
memref +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor to memref +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_3]] : tensor to memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_17:.*]] = %[[VAL_15]] to %[[VAL_16]] step %[[VAL_4]] { @@ -1638,10 +1638,10 @@ func.func @sampled_dense_dense(%args: tensor, // CHECK-DAG: %[[VAL_17:.*]] = sparse_tensor.positions %[[VAL_2]] {level = 1 : index} : tensor to memref // CHECK-DAG: %[[VAL_18:.*]] = sparse_tensor.coordinates %[[VAL_2]] {level = 1 : index} : tensor to memref // CHECK-DAG: %[[VAL_19:.*]] = sparse_tensor.values %[[VAL_2]] : tensor to memref -// CHECK-DAG: %[[VAL_20:.*]] = bufferization.to_memref %[[VAL_3]] : tensor to memref -// CHECK-DAG: %[[VAL_21:.*]] = bufferization.to_memref %[[VAL_4]] : tensor to memref +// CHECK-DAG: %[[VAL_20:.*]] = bufferization.to_buffer %[[VAL_3]] : tensor to memref +// CHECK-DAG: %[[VAL_21:.*]] = bufferization.to_buffer %[[VAL_4]] : tensor to memref // CHECK-DAG: %[[VAL_22:.*]] = sparse_tensor.lvl %[[VAL_2]], %[[VAL_6]] : tensor to memref +// CHECK-DAG: %[[VAL_24:.*]] = bufferization.to_buffer %[[VAL_5]] : tensor to memref // CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_21]][] : memref // CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_27:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_3d.mlir b/mlir/test/Dialect/SparseTensor/sparse_3d.mlir index 9158ac427763b..f6ecfa0beba26 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_3d.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_3d.mlir @@ -33,8 +33,8 @@ // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to 
memref -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_11]] : memref<32x16x8xf32>) // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] { // CHECK: %[[VAL_14:.*]] = arith.muli %[[VAL_12]], %[[VAL_4]] : index @@ -75,8 +75,8 @@ func.func @add_ddd(%arga: tensor<32x16x8xf32, #Tddd>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_11]] : memref<32x16x8xf32>) // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] { // CHECK: %[[VAL_14:.*]] = arith.muli %[[VAL_12]], %[[VAL_4]] : index @@ -120,8 +120,8 @@ func.func @mul_ddd(%arga: tensor<32x16x8xf32, #Tddd>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] 
{level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_15]] : memref<32x16x8xf32>) // CHECK: scf.for %[[VAL_16:.*]] = %[[VAL_7]] to %[[VAL_4]] step %[[VAL_9]] { // CHECK: %[[VAL_18:.*]] = arith.muli %[[VAL_16]], %[[VAL_5]] : index @@ -187,8 +187,8 @@ func.func @add_dds(%arga: tensor<32x16x8xf32, #Tdds>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_13]] : memref<32x16x8xf32>) // CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] { // CHECK: %[[VAL_16:.*]] = 
arith.muli %[[VAL_14]], %[[VAL_5]] : index @@ -234,8 +234,8 @@ func.func @mul_dds(%arga: tensor<32x16x8xf32, #Tdds>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_14]] : memref<32x16x8xf32>) // CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_7]] to %[[VAL_3]] step %[[VAL_8]] { // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_15]]] : memref @@ -305,8 +305,8 @@ func.func @add_dsd(%arga: tensor<32x16x8xf32, #Tdsd>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer 
%[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_12]] : memref<32x16x8xf32>) // CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref @@ -354,8 +354,8 @@ func.func @mul_dsd(%arga: tensor<32x16x8xf32, #Tdsd>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_17]] : memref<32x16x8xf32>) // CHECK: scf.for %[[VAL_18:.*]] = %[[VAL_8]] to %[[VAL_4]] step %[[VAL_9]] { // CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_18]]] : memref @@ -450,8 +450,8 @@ func.func @add_dss(%arga: tensor<32x16x8xf32, #Tdss>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = 
sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_14]] : memref<32x16x8xf32>) // CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_6]] { // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref @@ -499,8 +499,8 @@ func.func @mul_dss(%arga: tensor<32x16x8xf32, #Tdss>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_14]] : memref<32x16x8xf32>) // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_8]]] : memref @@ -575,8 +575,8 @@ func.func @add_sdd(%arga: 
tensor<32x16x8xf32, #Tsdd>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_12]] : memref<32x16x8xf32>) // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref @@ -625,8 +625,8 @@ func.func @mul_sdd(%arga: tensor<32x16x8xf32, #Tsdd>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_17:.*]] = 
bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_17]] : memref<32x16x8xf32>) // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_8]]] : memref // CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_9]]] : memref @@ -726,8 +726,8 @@ func.func @add_sds(%arga: tensor<32x16x8xf32, #Tsds>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_14]] : memref<32x16x8xf32>) // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref @@ -778,8 +778,8 @@ func.func @mul_sds(%arga: tensor<32x16x8xf32, #Tsds>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// 
CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_16]] : memref<32x16x8xf32>) // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_8]]] : memref @@ -883,8 +883,8 @@ func.func @add_ssd(%arga: tensor<32x16x8xf32, #Tssd>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_13]] : memref<32x16x8xf32>) // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -937,8 +937,8 @@ func.func @mul_ssd(%arga: tensor<32x16x8xf32, #Tssd>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_14:.*]] = 
sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_16:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_19:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_19:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_19]] : memref<32x16x8xf32>) // CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_8]]] : memref // CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_9]]] : memref @@ -1067,8 +1067,8 @@ func.func @add_sss(%arga: tensor<32x16x8xf32, #Tsss>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> -// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32> // 
CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_15]] : memref<32x16x8xf32>) // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -1127,11 +1127,11 @@ func.func @mul_sss(%arga: tensor<32x16x8xf32, #Tsss>, %argb: tensor<32x16x8xf32> // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 2 : index} : tensor to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.lvl %[[VAL_1]], %[[VAL_6]] : tensor -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor to memref -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : tensor to memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor to memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_3]] : tensor to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.lvl %[[VAL_1]], %[[VAL_5]] : tensor // CHECK-DAG: %[[VAL_14:.*]] = tensor.dim %[[VAL_2]], %[[VAL_6]] : tensor -// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK: scf.for %[[VAL_17:.*]] = %[[VAL_5]] to %[[VAL_13]] step %[[VAL_6]] { // CHECK: %[[VAL_19:.*]] = arith.muli %[[VAL_17]], %[[VAL_10]] : index // CHECK: scf.for %[[VAL_18:.*]] = %[[VAL_5]] to %[[VAL_10]] step %[[VAL_6]] { @@ -1191,7 +1191,7 @@ func.func @kernel_3d(%arga: tensor, // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<10x20x30xf32, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<10x20x30xf32, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20x30xf32, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : 
tensor to memref +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_10]][] : memref // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_2]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref @@ -1246,10 +1246,10 @@ func.func @sum_reduction(%arga: tensor<10x20x30xf32, #Tsss>, %argx: tensor) // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor // CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-DAG: %[[VAL_9:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor to memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor to memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_12]][] : memref // CHECK: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_5]] to %[[VAL_9]] step %[[VAL_3]] iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (f32) { // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_15]]] : memref @@ -1305,9 +1305,9 @@ func.func @sum_reduction_inv(%arga: tensor, // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_8:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<20xf32> to memref<20xf32> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<30xf32> to memref<30xf32> -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] : tensor<10x20x30xf32> to memref<10x20x30xf32> +// 
CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<20xf32> to memref<20xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<30xf32> to memref<30xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_3]] : tensor<10x20x30xf32> to memref<10x20x30xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_13]] : memref<10x20x30xf32>) // CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_7]] to %[[VAL_4]] step %[[VAL_8]] { // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_14]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir index e2dbadc4db5bf..973f8f575ed7d 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir @@ -25,8 +25,8 @@ // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<4xf32> to memref<4xf32> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf32> to memref<32xf32> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<4xf32> to memref<4xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xf32> to memref<32xf32> // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<4xf32> // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -112,8 +112,8 @@ func.func @mul_inv_enc_dense1d(%arga: tensor<32xf32, #EncDenseVec>, // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : 
tensor<32xi32, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi32, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi32, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<34xi32> to memref<34xi32> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi32> to memref<32xi32> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<34xi32> to memref<34xi32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xi32> to memref<32xi32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : i32) outs(%[[VAL_11]] : memref<32xi32>) // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref @@ -163,8 +163,8 @@ func.func @and_affine_dense1d(%arga: tensor<32xi32, #SpVec>, // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf64, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<34x19xf64> to memref<34x19xf64> -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf64> to memref<32x16xf64> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<34x19xf64> to memref<34x19xf64> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf64> to memref<32x16xf64> // CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_3]] { // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_14]]] : memref // CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_14]], %[[VAL_3]] : index @@ -223,7 
+223,7 @@ func.func @mul_affine_dense2d(%arga: tensor<32x16xf64, #CSR>, // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<32x19xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32x19xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x19xf64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf64> to memref<32x16xf64> +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf64> to memref<32x16xf64> // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_5]]] : memref // CHECK: scf.for %[[VAL_17:.*]] = %[[VAL_15]] to %[[VAL_16]] step %[[VAL_5]] { @@ -287,7 +287,7 @@ func.func @mul_affine_dense_dim_2d(%arga: tensor<34x16xf64, #CSR>, // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<32x19xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32x19xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x19xf64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf64> to memref<32x16xf64> +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32x16xf64> to memref<32x16xf64> // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_6]]] : memref // CHECK: scf.for %[[VAL_17:.*]] = %[[VAL_15]] to %[[VAL_16]] step %[[VAL_6]] { diff --git a/mlir/test/Dialect/SparseTensor/sparse_batch.mlir b/mlir/test/Dialect/SparseTensor/sparse_batch.mlir index cfddef743cf28..88e93be62a9e6 100644 --- 
a/mlir/test/Dialect/SparseTensor/sparse_batch.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_batch.mlir @@ -14,7 +14,7 @@ // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<8x4x2xf32, #sparse{{[0-9]*}}> to memref<8x?xindex> // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<8x4x2xf32, #sparse{{[0-9]*}}> to memref<8x?xindex> // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8x4x2xf32, #sparse{{[0-9]*}}> to memref<8x?xf32> -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_6]] : tensor<8x4x2xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_6]] : tensor<8x4x2xf32> // CHECK-DAG: linalg.fill ins(%[[VAL_3]] : f32) outs(%[[VAL_10]] : memref<8x4x2xf32>) // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_1]] { // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_1]] { diff --git a/mlir/test/Dialect/SparseTensor/sparse_fp_ops.mlir b/mlir/test/Dialect/SparseTensor/sparse_fp_ops.mlir index d1d8276f8daef..69c0d8c84abbe 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_fp_ops.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_fp_ops.mlir @@ -38,7 +38,7 @@ // CHECK-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64> +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf64> to memref<32xf64> // CHECK: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref // CHECK: scf.for %[[VAL_10:.*]] = 
%[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] { @@ -70,7 +70,7 @@ func.func @abs(%arga: tensor<32xf64, #SV>, // CHECK-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64> +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf64> to memref<32xf64> // CHECK: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref // CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] { @@ -102,7 +102,7 @@ func.func @ceil(%arga: tensor<32xf64, #SV>, // CHECK-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64> +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf64> to memref<32xf64> // CHECK: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref // CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] { @@ -134,7 +134,7 @@ func.func @floor(%arga: tensor<32xf64, #SV>, // CHECK-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_5:.*]] = 
sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64> +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf64> to memref<32xf64> // CHECK: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref // CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] { @@ -169,8 +169,8 @@ func.func @neg(%arga: tensor<32xf64, #SV>, // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf64> to memref<32xf64> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xf64> to memref<32xf64> // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { @@ -229,8 +229,8 @@ func.func @add(%arga: tensor<32xf64, #SV>, // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates 
%[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf64> to memref<32xf64> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xf64> to memref<32xf64> // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { @@ -289,8 +289,8 @@ func.func @sub(%arga: tensor<32xf64, #SV>, // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64> -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf64> to memref<32xf64> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xf64> to memref<32xf64> // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_4]] { @@ -325,7 +325,7 @@ func.func @mul(%arga: tensor<32xf64, #SV>, // CHECK-DAG: 
%[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xf64> to memref<32xf64> // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] { diff --git a/mlir/test/Dialect/SparseTensor/sparse_fusion.mlir b/mlir/test/Dialect/SparseTensor/sparse_fusion.mlir index d9f48afef4810..352a0fa242300 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_fusion.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_fusion.mlir @@ -25,7 +25,7 @@ // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<100xf64, #sparse> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<100xf64, #sparse> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<100xf64, #sparse> to memref -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_8]] : +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_8]] : // CHECK-DAG: linalg.fill ins(%[[VAL_4]] : f64) outs(%[[VAL_12]] : memref<100xf64>) // CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_3]]] : memref // CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_2]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_int_ops.mlir b/mlir/test/Dialect/SparseTensor/sparse_int_ops.mlir index 3a33a200f8279..be96dbf10242e 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_int_ops.mlir +++ 
b/mlir/test/Dialect/SparseTensor/sparse_int_ops.mlir @@ -33,8 +33,8 @@ // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xi64> to memref<32xi64> // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { @@ -94,8 +94,8 @@ func.func @add(%arga: tensor<32xi64, #SV>, // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64> -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xi64> to memref<32xi64> // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref // 
CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = %[[VAL_13]], %[[VAL_17:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { @@ -154,8 +154,8 @@ func.func @sub(%arga: tensor<32xi64, #SV>, // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64> -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xi64> to memref<32xi64> // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_4]] { @@ -190,7 +190,7 @@ func.func @mul(%arga: tensor<32xi64, #SV>, // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xi64> to memref<32xi64> // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] 
: memref // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] { @@ -224,7 +224,7 @@ func.func @divsbyc(%arga: tensor<32xi64, #SV>, // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xi64> to memref<32xi64> // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] { @@ -258,8 +258,8 @@ func.func @divubyc(%arga: tensor<32xi64, #SV>, // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64> -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xi64> to memref<32xi64> // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_10]] to %[[VAL_11]] step 
%[[VAL_4]] { @@ -296,8 +296,8 @@ func.func @and(%arga: tensor<32xi64, #SV>, // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xi64> to memref<32xi64> // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { @@ -356,8 +356,8 @@ func.func @or(%arga: tensor<32xi64, #SV>, // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xi64> to memref<32xi64> // CHECK: 
%[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) { @@ -414,7 +414,7 @@ func.func @xor(%arga: tensor<32xi64, #SV>, // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xi64> to memref<32xi64> // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] { @@ -448,7 +448,7 @@ func.func @ashrbyc(%arga: tensor<32xi64, #SV>, // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xi64> to memref<32xi64> // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] 
step %[[VAL_4]] { @@ -482,7 +482,7 @@ func.func @lsrbyc(%arga: tensor<32xi64, #SV>, // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<32xi64> to memref<32xi64> // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] { diff --git a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir index d215ebb1c0c6f..5f2aa5e3a2736 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir @@ -18,8 +18,8 @@ // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<10x20xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<10x20xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<20x30xf32> to memref<20x30xf32> -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<10x30xf32> to memref<10x30xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<20x30xf32> to memref<20x30xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<10x30xf32> to memref<10x30xf32> // 
CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref // CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_13]] to %[[VAL_14]] step %[[VAL_5]] { @@ -58,13 +58,13 @@ func.func @matmul1(%a: tensor<10x20xf32, #DCSR>, // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 10 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<10x20xf32> to memref<10x20xf32> +// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor<10x20xf32> to memref<10x20xf32> // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<10x30xf32> to memref<10x30xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<10x30xf32> to memref<10x30xf32> // CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref @@ -203,13 +203,13 @@ func.func @matmul2(%A: tensor<4x8xf64, #DCSR>, // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 6 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<8x8xi32> to memref<8x8xi32> +// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor<8x8xi32> to memref<8x8xi32> // CHECK-DAG: %[[VAL_7:.*]] 
= sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<3x3xi32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<3x3xi32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<3x3xi32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<3x3xi32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<3x3xi32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<6x6xi32> to memref<6x6xi32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<6x6xi32> to memref<6x6xi32> // CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_13]], %[[VAL_14]]] : memref<6x6xi32> @@ -255,13 +255,13 @@ func.func @conv2d(%input: tensor<8x8xi32>, // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 2 : i64 -// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<5x3xi8> to memref<5x3xi8> +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor<5x3xi8> to memref<5x3xi8> // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<3x6xi8, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<3x6xi8, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<3x6xi8, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : 
tensor<3x6xi8, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<3x6xi8, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<5x6xi64> to memref<5x6xi64> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<5x6xi64> to memref<5x6xi64> // CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref @@ -309,7 +309,7 @@ func.func @quantized_matmul(%input1: tensor<5x3xi8>, // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<1024xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<1024xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<1024xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor to memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor to memref // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_11]][] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir b/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir index 836e26b51f7c1..f6f7f396adab5 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir @@ -85,7 +85,7 @@ func.func @sqsum(%arg0: tensor) -> tensor { // CHECK: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32 // CHECK: %[[VAL_5:.*]] = arith.constant dense<0> : tensor<10xi32> -// CHECK: 
%[[VAL_6:.*]] = bufferization.to_memref %[[VAL_5]] : tensor<10xi32> to memref<10xi32> +// CHECK: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_5]] : tensor<10xi32> to memref<10xi32> // CHECK: linalg.fill ins(%[[VAL_4]] : i32) outs(%[[VAL_6]] : memref<10xi32>) // CHECK: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #sparse{{.*}}> to memref // CHECK: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #sparse{{.*}}> to memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower.mlir index cab57389f032e..2866e115065d2 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_lower.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_lower.mlir @@ -29,8 +29,8 @@ // CHECK-HIR-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}> // CHECK-HIR-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}> // CHECK-HIR-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse{{[0-9]*}}> -// CHECK-HIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<64xf64> to memref<64xf64> -// CHECK-HIR-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64> +// CHECK-HIR-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<64xf64> to memref<64xf64> +// CHECK-HIR-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xf64> to memref<32xf64> // CHECK-HIR: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK-HIR-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref // CHECK-HIR-DAG: %[[VAL_14:.*]] = arith.addi %[[VAL_12]], %[[VAL_5]] : index @@ -60,8 +60,8 @@ // CHECK-MIR-DAG: %[[VAL_6:.*]] = call @sparsePositions0(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref // CHECK-MIR-DAG: %[[VAL_7:.*]] = call 
@sparseCoordinates0(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref // CHECK-MIR-DAG: %[[VAL_8:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref -// CHECK-MIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<64xf64> to memref<64xf64> -// CHECK-MIR-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64> +// CHECK-MIR-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<64xf64> to memref<64xf64> +// CHECK-MIR-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xf64> to memref<32xf64> // CHECK-MIR: scf.for %[[VAL_14:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK-MIR-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_14]]] : memref // CHECK-MIR-DAG: %[[VAL_16:.*]] = arith.addi %[[VAL_14]], %[[VAL_5]] : index diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir index b998eeb0d3944..17c3c29cf5211 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir @@ -32,8 +32,8 @@ // CHECK-HIR-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[DEMAP]] {level = 1 : index} // CHECK-HIR-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[DEMAP]] {level = 1 : index} // CHECK-HIR-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[DEMAP]] -// CHECK-HIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<64xf64> to memref<64xf64> -// CHECK-HIR-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64> +// CHECK-HIR-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<64xf64> to memref<64xf64> +// CHECK-HIR-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xf64> to memref<32xf64> // CHECK-HIR: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK-HIR-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64> // CHECK-HIR-DAG: 
%[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref @@ -62,8 +62,8 @@ // CHECK-MIR-DAG: %[[VAL_7:.*]] = call @sparsePositions0(%[[VAL_0]], %[[VAL_6]]) : (!llvm.ptr, index) -> memref // CHECK-MIR-DAG: %[[VAL_8:.*]] = call @sparseCoordinates0(%[[VAL_0]], %[[VAL_6]]) : (!llvm.ptr, index) -> memref // CHECK-MIR-DAG: %[[VAL_9:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref -// CHECK-MIR-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<64xf64> to memref<64xf64> -// CHECK-MIR-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64> +// CHECK-MIR-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<64xf64> to memref<64xf64> +// CHECK-MIR-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xf64> to memref<32xf64> // CHECK-MIR: scf.for %[[VAL_15:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK-MIR: %[[VAL_16:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_15]]] : memref<64xf64> // CHECK-MIR: %[[VAL_17:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir index e1e1953779fa8..f2a29a550ed01 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir @@ -29,8 +29,8 @@ // CHECK-HIR-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}> // CHECK-HIR-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}> // CHECK-HIR-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse{{[0-9]*}}> -// CHECK-HIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<64xf64> to memref<64xf64> -// CHECK-HIR-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64> +// CHECK-HIR-DAG: 
%[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<64xf64> to memref<64xf64> +// CHECK-HIR-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xf64> to memref<32xf64> // CHECK-HIR: scf.for %[[VAL_11:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK-HIR-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_11]]] : memref // CHECK-HIR-DAG: %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_5]] : index @@ -60,8 +60,8 @@ // CHECK-MIR-DAG: %[[VAL_6:.*]] = call @sparsePositions0(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref // CHECK-MIR-DAG: %[[VAL_7:.*]] = call @sparseCoordinates0(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref // CHECK-MIR-DAG: %[[VAL_8:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref -// CHECK-MIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<64xf64> to memref<64xf64> -// CHECK-MIR-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64> +// CHECK-MIR-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<64xf64> to memref<64xf64> +// CHECK-MIR-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<32xf64> to memref<32xf64> // CHECK-MIR: scf.for %[[VAL_11:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK-MIR-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_11]]] : memref // CHECK-MIR-DAG: %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_5]] : index diff --git a/mlir/test/Dialect/SparseTensor/sparse_nd.mlir b/mlir/test/Dialect/SparseTensor/sparse_nd.mlir index b80a48363773f..8f06df3c9b98d 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_nd.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_nd.mlir @@ -35,13 +35,13 @@ // CHECK-DAG: %[[VAL_10:.*]] = arith.constant 80 : index // CHECK-DAG: %[[VAL_11:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_12:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<10x20x30x40x50x60x70x80xf32> to 
memref<10x20x30x40x50x60x70x80xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor<10x20x30x40x50x60x70x80xf32> to memref<10x20x30x40x50x60x70x80xf32> // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 3 : index} : tensor<80x70x60x50x40x30x20x10xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 3 : index} : tensor<80x70x60x50x40x30x20x10xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_16:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 4 : index} : tensor<80x70x60x50x40x30x20x10xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_17:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 4 : index} : tensor<80x70x60x50x40x30x20x10xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_18:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<80x70x60x50x40x30x20x10xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_20:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<10x20x30x40x50x60x70x80xf32> to memref<10x20x30x40x50x60x70x80xf32> +// CHECK-DAG: %[[VAL_20:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<10x20x30x40x50x60x70x80xf32> to memref<10x20x30x40x50x60x70x80xf32> // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_20]] : memref<10x20x30x40x50x60x70x80xf32> // CHECK: scf.for %[[VAL_21:.*]] = %[[VAL_11]] to %[[VAL_10]] step %[[VAL_12]] { // CHECK: %[[VAL_23:.*]] = arith.muli %[[VAL_21]], %[[VAL_9]] : index diff --git a/mlir/test/Dialect/SparseTensor/sparse_outbuf.mlir b/mlir/test/Dialect/SparseTensor/sparse_outbuf.mlir index ab7a30e2f96a5..ebb5ab6075da2 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_outbuf.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_outbuf.mlir @@ -19,7 +19,7 @@ // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #{{.*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #{{.*}}> to memref 
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xi32, #{{.*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<10xf32> to memref<10xf32> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<10xf32> to memref<10xf32> // CHECK-DAG: linalg.fill ins(%[[VAL_3]] : f32) outs(%[[VAL_8]] : memref<10xf32>) // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_2]]] : memref // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -53,7 +53,7 @@ func.func @allout_inplace(%arga: tensor<10xi32, #SV>, // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #{{.*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #{{.*}}> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xi32, #{{.*}}> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_4]] : tensor<10xf32> to memref<10xf32> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_4]] : tensor<10xf32> to memref<10xf32> // CHECK-DAG: linalg.fill ins(%[[VAL_2]] : f32) outs(%[[VAL_8]] : memref<10xf32>) // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_1]]] : memref // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref @@ -86,7 +86,7 @@ func.func @allout_materialize(%arga: tensor<10xi32, #SV>) -> tensor<10xf32> { // CHECK-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<10xf32, #{{.*}}> to memref // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<10xf32, #{{.*}}> to memref // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xf32, #{{.*}}> to memref -// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<10xf32> to memref<10xf32> +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_1]] : 
tensor<10xf32> to memref<10xf32> // CHECK-DAG: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK-DAG: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref // CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] { diff --git a/mlir/test/Dialect/SparseTensor/sparse_pack.mlir b/mlir/test/Dialect/SparseTensor/sparse_pack.mlir index 91e3842bdd367..4546d3367b16d 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_pack.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_pack.mlir @@ -12,12 +12,12 @@ // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 100 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<2xindex> to memref<2xindex> +// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<2xindex> to memref<2xindex> // CHECK-DAG: %[[VAL_7:.*]] = memref.cast %[[VAL_6]] : memref<2xindex> to memref -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<6x2xi32> to memref<6x2xi32> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<6x2xi32> to memref<6x2xi32> // CHECK-DAG: %[[VAL_9:.*]] = memref.collapse_shape %[[VAL_8]] {{\[\[}}0, 1]] : memref<6x2xi32> into memref<12xi32> // CHECK-DAG: %[[VAL_10:.*]] = memref.cast %[[VAL_9]] : memref<12xi32> to memref -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<6xf64> to memref<6xf64> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor<6xf64> to memref<6xf64> // CHECK-DAG: %[[VAL_12:.*]] = memref.cast %[[VAL_11]] : memref<6xf64> to memref // CHECK: %[[VAL_13:.*]] = sparse_tensor.storage_specifier.init // CHECK: %[[VAL_14:.*]] = sparse_tensor.storage_specifier.set %[[VAL_13]] lvl_sz at 0 with %[[VAL_4]] @@ -45,18 +45,18 @@ func.func @sparse_pack(%values: tensor<6xf64>, %pos:tensor<2xindex>, %coordinate // CHECK-SAME: %[[VAL_5:.*]]: 
tensor<2xindex>, // CHECK-SAME: %[[VAL_6:.*]]: tensor<6x2xi32>) -> (tensor<6xf64>, tensor<2xindex>, tensor<6x2xi32>) { // CHECK: %[[VAL_7:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]] pos_mem_sz at 0 -// CHECK: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_5]] : tensor<2xindex> to memref<2xindex> +// CHECK: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_5]] : tensor<2xindex> to memref<2xindex> // CHECK: %[[VAL_9:.*]] = memref.subview %[[VAL_8]][0] {{\[}}%[[VAL_7]]] [1] : memref<2xindex> to memref // CHECK: %[[VAL_10:.*]] = memref.subview %[[VAL_0]][0] {{\[}}%[[VAL_7]]] [1] : memref to memref // CHECK: memref.copy %[[VAL_10]], %[[VAL_9]] : memref to memref // CHECK: %[[VAL_11:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]] crd_mem_sz at 0 -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_6]] : tensor<6x2xi32> to memref<6x2xi32> +// CHECK: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_6]] : tensor<6x2xi32> to memref<6x2xi32> // CHECK: %[[VAL_13:.*]] = memref.collapse_shape %[[VAL_12]] {{\[\[}}0, 1]] : memref<6x2xi32> into memref<12xi32> // CHECK: %[[VAL_14:.*]] = memref.subview %[[VAL_13]][0] {{\[}}%[[VAL_11]]] [1] : memref<12xi32> to memref // CHECK: %[[VAL_15:.*]] = memref.subview %[[VAL_1]][0] {{\[}}%[[VAL_11]]] [1] : memref to memref // CHECK: memref.copy %[[VAL_15]], %[[VAL_14]] : memref to memref // CHECK: %[[VAL_16:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]] val_mem_sz -// CHECK: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_4]] : tensor<6xf64> to memref<6xf64> +// CHECK: %[[VAL_17:.*]] = bufferization.to_buffer %[[VAL_4]] : tensor<6xf64> to memref<6xf64> // CHECK: %[[VAL_18:.*]] = memref.subview %[[VAL_17]][0] {{\[}}%[[VAL_16]]] [1] : memref<6xf64> to memref // CHECK: %[[VAL_19:.*]] = memref.subview %[[VAL_2]][0] {{\[}}%[[VAL_16]]] [1] : memref to memref // CHECK: memref.copy %[[VAL_19]], %[[VAL_18]] : memref to memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir 
b/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir index c2cabd4351112..1cfa8571a9f0f 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir @@ -24,8 +24,8 @@ // CHECK-DAG: %[[TMP_0:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index} // CHECK-DAG: %[[TMP_1:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 1 : index} // CHECK-DAG: %[[TMP_2:.*]] = sparse_tensor.values %[[TMP_arg0]] -// CHECK-DAG: %[[TMP_3:.*]] = bufferization.to_memref %[[TMP_arg1]] : tensor<32xf32> to memref<32xf32> -// CHECK-DAG: %[[TMP_4:.*]] = bufferization.to_memref %[[TMP_arg2]] : tensor<16xf32> to memref<16xf32> +// CHECK-DAG: %[[TMP_3:.*]] = bufferization.to_buffer %[[TMP_arg1]] : tensor<32xf32> to memref<32xf32> +// CHECK-DAG: %[[TMP_4:.*]] = bufferization.to_buffer %[[TMP_arg2]] : tensor<16xf32> to memref<16xf32> // CHECK: scf.parallel (%[[TMP_arg3:.*]]) = (%[[TMP_c0]]) to (%[[TMP_c16]]) step (%[[TMP_c1]]) { // CHECK: %[[TMP_6:.*]] = memref.load %[[TMP_4]][%[[TMP_arg3]]] : memref<16xf32> // CHECK: %[[TMP_7:.*]] = memref.load %[[TMP_0]][%[[TMP_arg3]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_perm.mlir b/mlir/test/Dialect/SparseTensor/sparse_perm.mlir index 5f8002b5b6d31..289939fbfc16b 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_perm.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_perm.mlir @@ -24,7 +24,7 @@ // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index // CHECK: %[[DEMAP:.*]] = sparse_tensor.reinterpret_map %[[VAL_0]] // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[DEMAP]] : tensor<30x10x20xf32, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<20x30x10xf32> to memref<20x30x10xf32> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<20x30x10xf32> to memref<20x30x10xf32> // CHECK: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_9]] : memref<20x30x10xf32>) // CHECK: scf.for 
%[[VAL_10:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK: %[[VAL_12:.*]] = arith.muli %[[VAL_10]], %[[VAL_4]] : index @@ -64,7 +64,7 @@ func.func @sparse_static_dims(%arga: tensor<10x20x30xf32, #X>, // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.lvl %[[DEMAP]], %[[VAL_2]] : tensor // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.lvl %[[DEMAP]], %[[VAL_3]] : tensor // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.lvl %[[DEMAP]], %[[VAL_4]] : tensor -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref // CHECK-DAG: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_10]] : memref) // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_4]] { // CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_11]], %[[VAL_8]] : index diff --git a/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir b/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir index 93b5da41fc7f9..4abaf03dff50f 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir @@ -26,7 +26,7 @@ // CHECK-HIR-DAG: %[[VAL_6:.*]] = sparse_tensor.lvl %[[DEMAP]], %[[VAL_2]] : tensor // CHECK-HIR-DAG: %[[VAL_7:.*]] = sparse_tensor.lvl %[[DEMAP]], %[[VAL_4]] : tensor // CHECK-HIR-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[DEMAP]] : tensor -// CHECK-HIR-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref +// CHECK-HIR-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref // CHECK-HIR: %[[VAL_11:.*]] = tensor.extract %[[VAL_1]][] : tensor // CHECK-HIR: %[[VAL_12:.*]] = scf.for %[[VAL_13:.*]] = %[[VAL_3]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_14:.*]] = %[[VAL_11]]) -> (f32) { // CHECK-HIR: %[[VAL_18:.*]] = arith.muli %[[VAL_13]], %[[VAL_6]] : index @@ -58,7 +58,7 @@ // CHECK-MIR-DAG: %[[DimSize1:.*]] = call @sparseLvlSize(%[[ARGA]], %[[I1]]) // CHECK-MIR-DAG: %[[DimSize2:.*]] = call 
@sparseLvlSize(%[[ARGA]], %[[I2]]) // CHECK-MIR-DAG: %[[VAL_8:.*]] = call @sparseValuesF32(%[[ARGA]]) : (!llvm.ptr) -> memref -// CHECK-MIR-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[ARGX]] : tensor to memref +// CHECK-MIR-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[ARGX]] : tensor to memref // CHECK-MIR: %[[VAL_11:.*]] = tensor.extract %[[ARGX]][] : tensor // CHECK-MIR: %[[VAL_12:.*]] = scf.for %[[D2:.*]] = %[[I0]] to %[[DimSize0]] step %[[I1]] iter_args(%[[VAL_14:.*]] = %[[VAL_11]]) -> (f32) { // CHECK-MIR: %[[VAL_18:.*]] = arith.muli %[[D2]], %[[DimSize1]] : index diff --git a/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir b/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir index e5df646851d43..8d1f62f69f0f6 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir @@ -33,8 +33,8 @@ // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref -// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_4]] : tensor<32x16xf32> to memref<32x16xf32> +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_buffer %[[VAL_4]] : tensor<32x16xf32> to memref<32x16xf32> // CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_14]][] : memref // CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref // CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_sddmm.mlir b/mlir/test/Dialect/SparseTensor/sparse_sddmm.mlir index 
e769534641ec8..d653e144fb3bd 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_sddmm.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_sddmm.mlir @@ -64,14 +64,14 @@ func.func @fold_yield_direct_zero() -> tensor<32xf64> { // CHECK-DAG: %[[VAL_6:.*]] = arith.constant dense<0.000000e+00> : tensor<8x8xf64> // CHECK-DAG: %[[VAL_7:.*]] = bufferization.alloc_tensor() copy(%[[VAL_6]]) : tensor<8x8xf64> // CHECK-DAG: %[[VAL_8:.*]] = bufferization.alloc_tensor() copy(%[[VAL_6]]) : tensor<8x8xf64> -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64> -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<8x8xf64> to memref<8x8xf64> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<8x8xf64> to memref<8x8xf64> // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_8]] : tensor<8x8xf64> to memref<8x8xf64> +// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_buffer %[[VAL_8]] : tensor<8x8xf64> to memref<8x8xf64> // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_5]]] : memref // CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_17]] to %[[VAL_18]] step %[[VAL_5]] { @@ 
-132,8 +132,8 @@ func.func @sampled_dd_unfused(%args: tensor<8x8xf64, #SM>, // CHECK-DAG: %[[VAL_8:.*]] = arith.constant dense<0.000000e+00> : tensor<8x8xf64> // CHECK-DAG: %[[VAL_9:.*]] = bufferization.alloc_tensor() copy(%[[VAL_8]]) : tensor<8x8xf64> // CHECK-DAG: %[[VAL_10:.*]] = tensor.empty() : tensor<8x8xf64, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64> -// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<8x8xf64> to memref<8x8xf64> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<8x8xf64> to memref<8x8xf64> // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_sddmm_org.mlir b/mlir/test/Dialect/SparseTensor/sparse_sddmm_org.mlir index 3cc0aa26c8bc2..39962b46d5d51 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_sddmm_org.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_sddmm_org.mlir @@ -30,8 +30,8 @@ // CHECK-DAG: %[[VAL_6:.*]] = arith.constant false // CHECK-DAG: %[[VAL_7:.*]] = arith.constant true // CHECK-DAG: %[[VAL_8:.*]] = tensor.empty() : tensor<8x8xf64, #sparse{{[0-9]*}}> -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64> -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<8x8xf64> to memref<8x8xf64> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64> +// CHECK-DAG: %[[VAL_10:.*]] = 
bufferization.to_buffer %[[VAL_2]] : tensor<8x8xf64> to memref<8x8xf64> // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir index c99d5d25f7b4a..f4b565c7f9c8a 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir @@ -31,7 +31,7 @@ // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_2]] {level = 1 : index} : tensor<64x32xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_2]] {level = 1 : index} : tensor<64x32xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<64x32xf64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_14]][] : memref // CHECK: %[[VAL_16:.*]] = scf.for %[[VAL_17:.*]] = %[[VAL_6]] to %[[VAL_5]] step %[[VAL_7]] iter_args(%[[VAL_18:.*]] = %[[VAL_15]]) -> (f64) { // CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_17]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir index d88372276989d..e9587edef4678 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir @@ -28,7 +28,7 @@ // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] 
{level = 0 : index} : tensor<8xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8xi64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_7]] : tensor<8xi64> to memref<8xi64> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_7]] : tensor<8xi64> to memref<8xi64> // CHECK-DAG: linalg.fill ins(%[[VAL_4]] : i64) outs(%[[VAL_11]] : memref<8xi64>) // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref @@ -70,7 +70,7 @@ func.func @sparse_index_1d_conj(%arga: tensor<8xi64, #SparseVector>) -> tensor<8 // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8xi64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8xi64, #sparse{{[0-9]*}}> to memref -// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_7]] : tensor<8xi64> to memref<8xi64> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_buffer %[[VAL_7]] : tensor<8xi64> to memref<8xi64> // CHECK-DAG: linalg.fill ins(%[[VAL_3]] : i64) outs(%[[VAL_11]] : memref<8xi64>) // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/spy_sddmm.mlir b/mlir/test/Dialect/SparseTensor/spy_sddmm.mlir index 6c3acf43f241e..0c73d2fe8a079 100644 --- a/mlir/test/Dialect/SparseTensor/spy_sddmm.mlir +++ b/mlir/test/Dialect/SparseTensor/spy_sddmm.mlir @@ -24,8 +24,8 @@ // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 8 : index // CHECK-DAG: %[[VAL_4:.*]] = 
arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<8x8xf64> to memref<8x8xf64> -// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64> +// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor<8x8xf64> to memref<8x8xf64> +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64> // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_2]] {level = 1 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_2]] {level = 1 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref // CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref diff --git a/mlir/test/Dialect/SparseTensor/spy_sddmm_bsr.mlir b/mlir/test/Dialect/SparseTensor/spy_sddmm_bsr.mlir index df1e564c06231..a673b0dacf4af 100755 --- a/mlir/test/Dialect/SparseTensor/spy_sddmm_bsr.mlir +++ b/mlir/test/Dialect/SparseTensor/spy_sddmm_bsr.mlir @@ -37,8 +37,8 @@ // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.reinterpret_map %[[VAL_0]] : tensor to tensor // CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor to memref -// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor to memref +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_1]] : tensor to memref +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor to memref // CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.lvl %[[VAL_7]], %[[VAL_4]] : tensor // CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_7]] {level = 1 : index} : tensor to memref // CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_7]] {level = 1 : index} : 
tensor to memref diff --git a/mlir/test/Dialect/SparseTensor/unused-tensor.mlir b/mlir/test/Dialect/SparseTensor/unused-tensor.mlir index 7e8b9f83fac79..526c3f4f8830c 100644 --- a/mlir/test/Dialect/SparseTensor/unused-tensor.mlir +++ b/mlir/test/Dialect/SparseTensor/unused-tensor.mlir @@ -28,8 +28,8 @@ // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 4 : index // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<2x4xf64> -// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<2x4xf64> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor<2x4xf64> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_buffer %[[VAL_2]] : tensor<2x4xf64> // CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] { // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] { // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_6]] to %[[VAL_5]] step %[[VAL_7]] { diff --git a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir index 15228c6a5f79a..01b717090e87a 100644 --- a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir +++ b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir @@ -16,7 +16,7 @@ // CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK-ON-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-ON-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-ON: %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref // CHECK-ON: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ 
-42,7 +42,7 @@ // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-OFF-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-OFF: %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref // CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK-OFF: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref @@ -93,7 +93,7 @@ func.func @sparse_reduction_ori(%argx: tensor, // CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK-ON-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-ON-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-ON: %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref // CHECK-ON: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -119,7 +119,7 @@ func.func @sparse_reduction_ori(%argx: tensor, // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-OFF-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-OFF: %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref // CHECK-OFF: %[[VAL_8:.*]] = memref.load 
%[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK-OFF: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref @@ -168,7 +168,7 @@ func.func @sparse_reduction_ori_accumulator_on_rhs(%argx: tensor, // CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK-ON-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-ON-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-ON: %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref // CHECK-ON: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref // CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -194,7 +194,7 @@ func.func @sparse_reduction_ori_accumulator_on_rhs(%argx: tensor, // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-OFF-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-OFF: %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref // CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK-OFF: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref @@ -243,7 +243,7 @@ func.func @sparse_reduction_subi(%argx: tensor, // CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK-ON-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-ON-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// 
CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-ON: %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref // CHECK-ON: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -269,7 +269,7 @@ func.func @sparse_reduction_subi(%argx: tensor, // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-OFF-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-OFF: %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref // CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK-OFF: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref @@ -319,7 +319,7 @@ func.func @sparse_reduction_xor(%argx: tensor, // CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK-ON-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-ON-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-ON: %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref // CHECK-ON: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -345,7 +345,7 @@ func.func @sparse_reduction_xor(%argx: tensor, // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-OFF-DAG: 
%[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-OFF: %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref // CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK-OFF: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref @@ -395,7 +395,7 @@ func.func @sparse_reduction_addi(%argx: tensor, // CHECK-ON-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK-ON-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-ON-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-ON: %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref // CHECK-ON: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -421,7 +421,7 @@ func.func @sparse_reduction_addi(%argx: tensor, // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-OFF-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-OFF: %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref // CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK-OFF: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref @@ -471,7 +471,7 @@ func.func @sparse_reduction_subf(%argx: tensor, // CHECK-ON-DAG: %[[VAL_5:.*]] 
= arith.constant 1 : index // CHECK-ON-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-ON-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-ON-DAG: %[[VAL_8:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-ON: %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref // CHECK-ON: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK-ON: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -497,7 +497,7 @@ func.func @sparse_reduction_subf(%argx: tensor, // CHECK-OFF-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-OFF-DAG: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor to memref // CHECK-OFF-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor to memref -// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor to memref +// CHECK-OFF-DAG: %[[VAL_6:.*]] = bufferization.to_buffer %[[VAL_0]] : tensor to memref // CHECK-OFF: %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref // CHECK-OFF: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK-OFF: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir index c1beed95f2006..567c4abea488e 100644 --- a/mlir/test/Dialect/Tensor/bufferize.mlir +++ b/mlir/test/Dialect/Tensor/bufferize.mlir @@ -3,7 +3,7 @@ // CHECK-LABEL: func @dim( // CHECK-SAME: %[[TENSOR:.*]]: tensor<*xf32>, // CHECK-SAME: %[[INDEX:.*]]: index) -> index { -// CHECK: %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor<*xf32> to memref<*xf32> +// CHECK: %[[MEMREF:.*]] = bufferization.to_buffer %[[TENSOR]] : tensor<*xf32> to memref<*xf32> // CHECK: %[[EXTENT:.*]] = memref.dim %[[MEMREF]], %[[INDEX]] : memref<*xf32> // CHECK: return 
%[[EXTENT]] : index func.func @dim(%arg0: tensor<*xf32>, %arg1: index) -> index { @@ -15,7 +15,7 @@ func.func @dim(%arg0: tensor<*xf32>, %arg1: index) -> index { // CHECK-LABEL: func @rank( // CHECK-SAME: %[[TENSOR:.*]]: tensor<*xf32>) -> index { -// CHECK: %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] +// CHECK: %[[MEMREF:.*]] = bufferization.to_buffer %[[TENSOR]] // CHECK: %[[EXTENT:.*]] = memref.rank %[[MEMREF]] : memref<*xf32> func.func @rank(%arg0: tensor<*xf32>) -> index { %0 = tensor.rank %arg0 : tensor<*xf32> @@ -26,7 +26,7 @@ func.func @rank(%arg0: tensor<*xf32>) -> index { // CHECK-LABEL: func @tensor.cast( // CHECK-SAME: %[[TENSOR:.*]]: tensor) -> tensor<2xindex> { -// CHECK: %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] +// CHECK: %[[MEMREF:.*]] = bufferization.to_buffer %[[TENSOR]] // CHECK: %[[CASTED:.*]] = memref.cast %[[MEMREF]] : memref to memref<2xindex> // CHECK: %[[RET:.*]] = bufferization.to_tensor %[[CASTED]] // CHECK: return %[[RET]] : tensor<2xindex> @@ -39,7 +39,7 @@ func.func @tensor.cast(%arg0: tensor) -> tensor<2xindex> { // CHECK-LABEL: func @tensor.cast_from_unranked( // CHECK-SAME: %[[TENSOR:.*]]: tensor<*xf32>) -> tensor<2xf32> { -// CHECK: %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor<*xf32> to memref<*xf32> +// CHECK: %[[MEMREF:.*]] = bufferization.to_buffer %[[TENSOR]] : tensor<*xf32> to memref<*xf32> // CHECK: %[[CASTED_MEMREF:.*]] = memref.cast %[[MEMREF]] : memref<*xf32> to memref<2xf32, strided<[?], offset: ?>> // CHECK: %[[RET:.*]] = bufferization.to_tensor %[[CASTED_MEMREF]] : memref<2xf32, strided<[?], offset: ?>> // CHECK: return %[[RET]] : tensor<2xf32> @@ -52,7 +52,7 @@ func.func @tensor.cast_from_unranked(%arg0: tensor<*xf32>) -> tensor<2xf32> { // CHECK-LABEL: func @tensor.cast_to_unranked( // CHECK-SAME: %[[TENSOR:.*]]: tensor<2xf32>) -> tensor<*xf32> { -// CHECK: %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor<2xf32> to memref<2xf32> +// CHECK: %[[MEMREF:.*]] = 
bufferization.to_buffer %[[TENSOR]] : tensor<2xf32> to memref<2xf32> // CHECK: %[[CASTED_MEMREF:.*]] = memref.cast %[[MEMREF]] : memref<2xf32> to memref<*xf32> // CHECK: %[[RET:.*]] = bufferization.to_tensor %[[CASTED_MEMREF]] : memref<*xf32> // CHECK: return %[[RET]] : tensor<*xf32> @@ -77,7 +77,7 @@ func.func @tensor.empty() -> tensor<5xf32> { // CHECK-LABEL: func @tensor.extract( // CHECK-SAME: %[[TENSOR:.*]]: tensor, // CHECK-SAME: %[[IDX:.*]]: index) -> f32 { -// CHECK: %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor to memref +// CHECK: %[[MEMREF:.*]] = bufferization.to_buffer %[[TENSOR]] : tensor to memref // CHECK: %[[RET:.*]] = memref.load %[[MEMREF]][%[[IDX]]] : memref // CHECK: return %[[RET]] : f32 // CHECK: } @@ -199,7 +199,7 @@ func.func @tensor.from_elements_3d(%f0 : f32) -> tensor<3x2x2xf32> { // CHECK-LABEL: func @tensor.generate( // CHECK-SAME: %[[ARG:.*]]: tensor<*xf32>, // CHECK-SAME: %[[DYNAMIC_EXTENT:.*]]: index) -> tensor { -// CHECK-DAG: %[[ARG_M:.*]] = bufferization.to_memref %[[ARG]] : tensor<*xf32> to memref<*xf32> +// CHECK-DAG: %[[ARG_M:.*]] = bufferization.to_buffer %[[ARG]] : tensor<*xf32> to memref<*xf32> // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc(%[[DYNAMIC_EXTENT]]) {{.*}} : memref // CHECK: %[[ALLOC_T:.*]] = bufferization.to_tensor %[[ALLOC]] // CHECK: %[[MAPPED:.*]] = linalg.map @@ -266,7 +266,7 @@ func.func @tensor.generate_unknown_ops_in_body(%arg0: index) -> tensor // CHECK-SAME: %[[t1:.*]]: tensor, %[[idx1:.*]]: index, %[[idx2:.*]]: index func.func @tensor.extract_slice( %t1: tensor, %idx1: index, %idx2: index) -> tensor { - // CHECK: %[[m:.*]] = bufferization.to_memref %[[t1]] : tensor to memref + // CHECK: %[[m:.*]] = bufferization.to_buffer %[[t1]] : tensor to memref // CHECK: %[[r:.*]] = memref.subview %[[m]][5, %[[idx2]]] [%[[idx1]], 10] [1, 1] : memref to memref> %0 = tensor.extract_slice %t1[5, %idx2][%idx1, 10][1, 1] : tensor to tensor @@ -282,7 +282,7 @@ func.func @tensor.extract_slice( // CHECK-SAME: 
%[[idx2:.*]]: index func.func @tensor.extract_slice_rank_reducing( %t1: tensor, %idx1: index, %idx2: index) -> tensor { - // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor to memref + // CHECK: %[[m1:.*]] = bufferization.to_buffer %[[t1]] : tensor to memref // CHECK: %[[r:.*]] = memref.subview %[[m1]][5, %[[idx1]], 10] [%[[idx2]], 1, 15] [1, 1, 1] : memref to memref> %0 = tensor.extract_slice %t1[5, %idx1, 10][%idx2, 1, 15][1, 1, 1] : tensor to tensor @@ -300,8 +300,8 @@ func.func @tensor.insert_slice(%t1: tensor, %t2: tensor, %idx1: index, %idx2: index) -> tensor { // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index - // CHECK-DAG: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor to memref - // CHECK-DAG: %[[m2:.*]] = bufferization.to_memref %[[t2]] : tensor to memref + // CHECK-DAG: %[[m1:.*]] = bufferization.to_buffer %[[t1]] : tensor to memref + // CHECK-DAG: %[[m2:.*]] = bufferization.to_buffer %[[t2]] : tensor to memref // CHECK-DAG: %[[dim0:.*]] = memref.dim %[[m1]], %[[c0]] // CHECK-DAG: %[[dim1:.*]] = memref.dim %[[m1]], %[[c1]] // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim0]], %[[dim1]]) @@ -353,7 +353,7 @@ func.func @tensor.insert_slice_rank_reducing_2( // CHECK-SAME: %[[f:.*]]: f32 func.func @tensor.insert(%t1: tensor<5xf32>, %idx1: index, %f: f32) -> tensor<5xf32> { // CHECK-DAG: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<5xf32> - // CHECK-DAG: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<5xf32> to memref<5xf32> + // CHECK-DAG: %[[m1:.*]] = bufferization.to_buffer %[[t1]] : tensor<5xf32> to memref<5xf32> // CHECK: memref.copy %[[m1]], %[[alloc]] // CHECK: memref.store %[[f]], %[[alloc]][%[[idx1]]] %0 = tensor.insert %f into %t1[%idx1] : tensor<5xf32> @@ -368,7 +368,7 @@ func.func @tensor.insert(%t1: tensor<5xf32>, %idx1: index, %f: f32) -> tensor<5x // CHECK-LABEL: func @tensor.expand_shape( // CHECK-SAME: %[[t1:.*]]: tensor, %[[sz0:.*]]: index func.func 
@tensor.expand_shape(%t1: tensor, %sz0: index) -> tensor<2x?x10xf32> { - // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] + // CHECK: %[[m1:.*]] = bufferization.to_buffer %[[t1]] // CHECK: %[[expanded:.*]] = memref.expand_shape %[[m1]] {{\[\[}}0, 1], [2]] output_shape [2, %[[sz0]], 10] : memref into memref<2x?x10xf32> %0 = tensor.expand_shape %t1 [[0, 1], [2]] output_shape [2, %sz0, 10] : tensor into tensor<2x?x10xf32> @@ -384,7 +384,7 @@ func.func @tensor.expand_shape(%t1: tensor, %sz0: index) -> tensor<2x? // CHECK-SAME: %[[t1:.*]]: tensor, %{{.*}}: index, %{{.*}}: index, %[[sz0:.*]]: index func.func @tensor.expand_shape_of_slice( %t1: tensor, %o1: index, %s1: index, %sz0: index) -> tensor { - // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : + // CHECK: %[[m1:.*]] = bufferization.to_buffer %[[t1]] : // CHECK: %[[subview:.*]] = memref.subview %[[m1]][%{{.*}}, 5] [%{{.*}}, 10] [1, 1] : memref to memref> %0 = tensor.extract_slice %t1[%o1, 5][%s1, 10][1, 1] : tensor to tensor @@ -401,7 +401,7 @@ func.func @tensor.expand_shape_of_slice( // CHECK-SAME: %[[t1:.*]]: tensor func.func @tensor.expand_shape_of_scalar_slice( %t1: tensor, %o1: index, %s1: index) -> tensor<1xf32> { - // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor to memref + // CHECK: %[[m1:.*]] = bufferization.to_buffer %[[t1]] : tensor to memref // CHECK: %[[subview:.*]] = memref.subview %[[m1]][%{{.*}}] [1] [1] : memref to memref> %0 = tensor.extract_slice %t1[%o1][1][1] : tensor to tensor // CHECK: %[[expanded:.*]] = memref.expand_shape %[[subview]] [] output_shape [1] : memref into memref<1xf32, strided<[1], offset: ?>> @@ -415,7 +415,7 @@ func.func @tensor.expand_shape_of_scalar_slice( // CHECK-LABEL: func @tensor.expand_shape_multiple_dynamic_indices( // CHECK-SAME: %[[t1:.*]]: tensor, %[[sz0:.*]]: index, %[[sz1:.*]]: index, %[[sz2:.*]]: index func.func @tensor.expand_shape_multiple_dynamic_indices(%t1: tensor, %sz0: index, %sz1: index, %sz2: index) -> tensor { - // CHECK: 
%[[m1:.*]] = bufferization.to_memref %[[t1]] + // CHECK: %[[m1:.*]] = bufferization.to_buffer %[[t1]] // CHECK: %[[expanded:.*]] = memref.expand_shape %[[m1]] {{\[\[}}0, 1, 2], [3]] output_shape [%[[sz0]], %[[sz1]], %[[sz2]], 256] : memref into memref %0 = tensor.expand_shape %t1 [[0, 1, 2], [3]] output_shape [%sz0, %sz1, %sz2, 256] : tensor into tensor @@ -429,7 +429,7 @@ func.func @tensor.expand_shape_multiple_dynamic_indices(%t1: tensor, // CHECK-LABEL: func @tensor.collapse_shape( // CHECK-SAME: %[[t1:.*]]: tensor<2x?x?xf32> func.func @tensor.collapse_shape(%t1: tensor<2x?x?xf32>) -> tensor { - // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<2x?x?xf32> to memref<2x?x?xf32> + // CHECK: %[[m1:.*]] = bufferization.to_buffer %[[t1]] : tensor<2x?x?xf32> to memref<2x?x?xf32> // CHECK: %[[collapsed:.*]] = memref.collapse_shape %[[m1]] [ // CHECK-SAME: [0, 1], [2]] : memref<2x?x?xf32> into memref %0 = tensor.collapse_shape %t1 [[0, 1], [2]] @@ -445,7 +445,7 @@ func.func @tensor.collapse_shape(%t1: tensor<2x?x?xf32>) -> tensor { // CHECK-LABEL: func @tensor.collapse_shape_to_scalar( // CHECK-SAME: %[[t1:.*]]: tensor<1x1x1xf32> func.func @tensor.collapse_shape_to_scalar(%t1: tensor<1x1x1xf32>) -> tensor { - // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<1x1x1xf32> to memref<1x1x1xf32> + // CHECK: %[[m1:.*]] = bufferization.to_buffer %[[t1]] : tensor<1x1x1xf32> to memref<1x1x1xf32> // CHECK: %[[collapsed:.*]] = memref.collapse_shape %[[m1]] [] : memref<1x1x1xf32> into memref %0 = tensor.collapse_shape %t1 [] : tensor<1x1x1xf32> into tensor @@ -534,7 +534,7 @@ func.func @tensor.collapse_shape_of_slice5(%arg0: tensor<2x2x2xi64>) -> tensor<4 // CHECK-LABEL: func @tensor.reshape( // CHECK-SAME: %[[t1:.*]]: tensor func.func @tensor.reshape(%t1: tensor) -> tensor<2x2x5xf32> { - // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor to memref + // CHECK: %[[m1:.*]] = bufferization.to_buffer %[[t1]] : tensor to memref // CHECK: 
%[[two:.*]] = arith.constant 2 : i64 %two = arith.constant 2 : i64 @@ -566,7 +566,7 @@ func.func @tensor.reshape(%t1: tensor) -> tensor<2x2x5xf32> { // CHECK-SAME: %[[t1:.*]]: tensor, %[[l2:.*]]: index, %[[h1:.*]]: index, %[[h2:.*]]: index func.func @tensor.pad(%t1: tensor, %l2: index, %h1: index, %h2: index) -> tensor { - // CHECK-DAG: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor to memref + // CHECK-DAG: %[[m1:.*]] = bufferization.to_buffer %[[t1]] : tensor to memref // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[dim0:.*]] = memref.dim %[[m1]], %[[c0]] @@ -582,7 +582,7 @@ func.func @tensor.pad(%t1: tensor, %l2: index, %h1: index, // CHECK: %[[mul:.*]] = arith.muli %[[index0]], %[[index1]] // CHECK: linalg.yield %[[mul]] // CHECK: } - // CHECK: %[[mapped_m:.*]] = bufferization.to_memref %[[mapped]] + // CHECK: %[[mapped_m:.*]] = bufferization.to_buffer %[[mapped]] // CHECK: %[[subview:.*]] = memref.subview %[[mapped_m]][5, %[[l2]]] [%[[dim0]], 10] [1, 1] // CHECK: memref.copy %[[m1]], %[[subview]] %0 = tensor.pad %t1 low[5, %l2] high[%h1, %h2] { diff --git a/mlir/test/Dialect/Vector/bufferize.mlir b/mlir/test/Dialect/Vector/bufferize.mlir index c2abebe706ac0..887fb941cc651 100644 --- a/mlir/test/Dialect/Vector/bufferize.mlir +++ b/mlir/test/Dialect/Vector/bufferize.mlir @@ -2,7 +2,7 @@ // CHECK-LABEL: func @transfer_read( // CHECK-SAME: %[[t:.*]]: tensor, %[[o1:.*]]: index, %[[o2:.*]]: index, %[[pad:.*]]: f32) -// CHECK: %[[m:.*]] = bufferization.to_memref %[[t]] : tensor to memref +// CHECK: %[[m:.*]] = bufferization.to_buffer %[[t]] : tensor to memref // CHECK: %[[r:.*]] = vector.transfer_read %[[m]][%[[o1]], %[[o2]]], %[[pad]] {in_bounds = [true, false]} : memref, vector<5x6xf32> // CHECK: return %[[r]] func.func @transfer_read(%t: tensor, %o1: index, @@ -16,7 +16,7 @@ func.func @transfer_read(%t: tensor, %o1: index, // CHECK-LABEL: func @transfer_write( // CHECK-SAME: %[[t:.*]]: 
tensor, %[[o1:.*]]: index, %[[o2:.*]]: index, %[[vec:.*]]: vector<5x6xf32>, %[[mask:.*]]: vector<5x6xi1>) -// CHECK: %[[m:.*]] = bufferization.to_memref %[[t]] : tensor to memref +// CHECK: %[[m:.*]] = bufferization.to_buffer %[[t]] : tensor to memref // CHECK: %[[alloc:.*]] = memref.alloc(%{{.*}}, %{{.*}}) {{.*}} : memref // CHECK: memref.copy %[[m]], %[[alloc]] // CHECK: vector.transfer_write %[[vec]], %[[alloc]][%[[o1]], %[[o2]]], %[[mask]] {in_bounds = [true, false]} : vector<5x6xf32>, memref @@ -35,7 +35,7 @@ func.func @transfer_write(%t: tensor, %o1: index, // CHECK-LABEL: func @gather( // CHECK-SAME: %[[base:.*]]: tensor, %[[v:.*]]: vector<16xi32>, // CHECK-SAME: %[[mask:.*]]: vector<16xi1>, %[[pass_thru:.*]]: vector<16xf32>) -// CHECK: %[[m:.*]] = bufferization.to_memref %[[base]] : tensor to memref +// CHECK: %[[m:.*]] = bufferization.to_buffer %[[base]] : tensor to memref // CHECK: %[[c0:.*]] = arith.constant 0 : index // CHECK: %[[out:.*]] = vector.gather %[[m]][%[[c0]], %[[c0]]] [%[[v]]], %[[mask]], %[[pass_thru]] : memref, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32> func.func @gather(%base: tensor, %v: vector<16xi32>, %mask: vector<16xi1>, %pass_thru: vector<16xf32>) -> vector<16xf32> { diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index 99f0850000a16..974f4506a2ef0 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -450,28 +450,6 @@ func.func @extract_strided_fold_insert(%a: vector<2x8xf32>, %b: vector<1x4xf32>, // ----- -// CHECK-LABEL: transpose_1D_identity -// CHECK-SAME: ([[ARG:%.*]]: vector<4xf32>) -func.func @transpose_1D_identity(%arg : vector<4xf32>) -> vector<4xf32> { - // CHECK-NOT: transpose - %0 = vector.transpose %arg, [0] : vector<4xf32> to vector<4xf32> - // CHECK-NEXT: return [[ARG]] - return %0 : vector<4xf32> -} - -// ----- - -// CHECK-LABEL: transpose_2D_identity -// CHECK-SAME: ([[ARG:%.*]]: 
vector<4x3xf32>) -func.func @transpose_2D_identity(%arg : vector<4x3xf32>) -> vector<4x3xf32> { - // CHECK-NOT: transpose - %0 = vector.transpose %arg, [0, 1] : vector<4x3xf32> to vector<4x3xf32> - // CHECK-NEXT: return [[ARG]] - return %0 : vector<4x3xf32> -} - -// ----- - // CHECK-LABEL: transpose_3D_identity // CHECK-SAME: ([[ARG:%.*]]: vector<4x3x2xf32>) func.func @transpose_3D_identity(%arg : vector<4x3x2xf32>) -> vector<4x3x2xf32> { diff --git a/mlir/test/Dialect/Vector/canonicalize/vector-transpose.mlir b/mlir/test/Dialect/Vector/canonicalize/vector-transpose.mlir index 91ee0d335ecca..c84aea6609665 100644 --- a/mlir/test/Dialect/Vector/canonicalize/vector-transpose.mlir +++ b/mlir/test/Dialect/Vector/canonicalize/vector-transpose.mlir @@ -1,6 +1,10 @@ // RUN: mlir-opt %s -canonicalize="test-convergence" -split-input-file -allow-unregistered-dialect | FileCheck %s -// This file contains some canonicalizations tests involving vector.transpose. +// This file contains some tests of canonicalizations and foldings involving vector.transpose. + +// +--------------------------------------------------------------------------- +// Tests of FoldTransposeBroadcast: transpose(broadcast) -> broadcast +// +--------------------------------------------------------------------------- // CHECK-LABEL: func @transpose_scalar_broadcast1 // CHECK-SAME: (%[[ARG:.+]]: vector<1xf32>) @@ -137,20 +141,22 @@ func.func @negative_broadcast_transpose_021(%arg0 : vector<3x1x3xi8>) -> vector< return %1 : vector<3x3x3xi8> } - // ----- -// Test of FoldTransposeShapeCast +/// +-------------------------------------------------------------------------- +/// Tests of ShapeCastOp::fold: shape_cast(transpose) -> shape_cast +/// +-------------------------------------------------------------------------- + // In this test, the permutation maps the non-unit dimensions (1 and 2) as follows: // 1 -> 0 // 2 -> 4 // Because 0 < 4, this permutation is order preserving and effectively a shape_cast. 
-// CHECK-LABEL: @transpose_shape_cast +// CHECK-LABEL: @shape_cast_of_transpose // CHECK-SAME: %[[ARG:.*]]: vector<1x4x4x1x1xi8>) -> vector<4x4xi8> { // CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[ARG]] : // CHECK-SAME: vector<1x4x4x1x1xi8> to vector<4x4xi8> // CHECK: return %[[SHAPE_CAST]] : vector<4x4xi8> -func.func @transpose_shape_cast(%arg : vector<1x4x4x1x1xi8>) -> vector<4x4xi8> { +func.func @shape_cast_of_transpose(%arg : vector<1x4x4x1x1xi8>) -> vector<4x4xi8> { %0 = vector.transpose %arg, [1, 0, 3, 4, 2] : vector<1x4x4x1x1xi8> to vector<4x1x1x1x4xi8> %1 = vector.shape_cast %0 : vector<4x1x1x1x4xi8> to vector<4x4xi8> @@ -159,18 +165,17 @@ func.func @transpose_shape_cast(%arg : vector<1x4x4x1x1xi8>) -> vector<4x4xi8> { // ----- -// Test of FoldTransposeShapeCast // In this test, the mapping of non-unit dimensions (1 and 2) is as follows: // 1 -> 2 // 2 -> 1 // As this is not increasing (2 > 1), this transpose is not order // preserving and cannot be treated as a shape_cast. 
-// CHECK-LABEL: @negative_transpose_shape_cast +// CHECK-LABEL: @negative_shape_cast_of_transpose // CHECK-SAME: %[[ARG:.*]]: vector<1x4x4x1xi8>) -> vector<4x4xi8> { // CHECK: %[[TRANSPOSE:.*]] = vector.transpose %[[ARG]] // CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[TRANSPOSE]] // CHECK: return %[[SHAPE_CAST]] : vector<4x4xi8> -func.func @negative_transpose_shape_cast(%arg : vector<1x4x4x1xi8>) -> vector<4x4xi8> { +func.func @negative_shape_cast_of_transpose(%arg : vector<1x4x4x1xi8>) -> vector<4x4xi8> { %0 = vector.transpose %arg, [0, 2, 1, 3] : vector<1x4x4x1xi8> to vector<1x4x4x1xi8> %1 = vector.shape_cast %0 : vector<1x4x4x1xi8> to vector<4x4xi8> @@ -179,13 +184,12 @@ func.func @negative_transpose_shape_cast(%arg : vector<1x4x4x1xi8>) -> vector<4x // ----- -// Test of FoldTransposeShapeCast // Currently the conversion shape_cast(transpose) -> shape_cast is disabled for // scalable vectors because of bad interaction with ConvertIllegalShapeCastOpsToTransposes -// CHECK-LABEL: @negative_transpose_shape_cast_scalable +// CHECK-LABEL: @negative_shape_cast_of_transpose_scalable // CHECK: vector.transpose // CHECK: vector.shape_cast -func.func @negative_transpose_shape_cast_scalable(%arg : vector<[4]x1xi8>) -> vector<[4]xi8> { +func.func @negative_shape_cast_of_transpose_scalable(%arg : vector<[4]x1xi8>) -> vector<[4]xi8> { %0 = vector.transpose %arg, [1, 0] : vector<[4]x1xi8> to vector<1x[4]xi8> %1 = vector.shape_cast %0 : vector<1x[4]xi8> to vector<[4]xi8> return %1 : vector<[4]xi8> @@ -193,13 +197,16 @@ func.func @negative_transpose_shape_cast_scalable(%arg : vector<[4]x1xi8>) -> ve // ----- -// Test of shape_cast folding. +/// +-------------------------------------------------------------------------- +/// Tests of FoldTransposeShapeCast: transpose(shape_cast) -> shape_cast +/// +-------------------------------------------------------------------------- + // The conversion transpose(shape_cast) -> shape_cast is not disabled for scalable // vectors. 
-// CHECK-LABEL: @shape_cast_transpose_scalable +// CHECK-LABEL: @transpose_of_shape_cast_scalable // CHECK: vector.shape_cast // CHECK-SAME: vector<[4]xi8> to vector<[4]x1xi8> -func.func @shape_cast_transpose_scalable(%arg : vector<[4]xi8>) -> vector<[4]x1xi8> { +func.func @transpose_of_shape_cast_scalable(%arg : vector<[4]xi8>) -> vector<[4]x1xi8> { %0 = vector.shape_cast %arg : vector<[4]xi8> to vector<1x[4]xi8> %1 = vector.transpose %0, [1, 0] : vector<1x[4]xi8> to vector<[4]x1xi8> return %1 : vector<[4]x1xi8> @@ -207,14 +214,13 @@ func.func @shape_cast_transpose_scalable(%arg : vector<[4]xi8>) -> vector<[4]x1x // ----- -// Test of shape_cast folding. // A transpose that is 'order preserving' can be treated like a shape_cast. -// CHECK-LABEL: @shape_cast_transpose +// CHECK-LABEL: @transpose_of_shape_cast // CHECK-SAME: %[[ARG:.*]]: vector<2x3x1x1xi8>) -> vector<6x1x1xi8> { // CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[ARG]] : // CHECK-SAME: vector<2x3x1x1xi8> to vector<6x1x1xi8> // CHECK: return %[[SHAPE_CAST]] : vector<6x1x1xi8> -func.func @shape_cast_transpose(%arg : vector<2x3x1x1xi8>) -> vector<6x1x1xi8> { +func.func @transpose_of_shape_cast(%arg : vector<2x3x1x1xi8>) -> vector<6x1x1xi8> { %0 = vector.shape_cast %arg : vector<2x3x1x1xi8> to vector<6x1x1xi8> %1 = vector.transpose %0, [0, 2, 1] : vector<6x1x1xi8> to vector<6x1x1xi8> @@ -223,12 +229,11 @@ func.func @shape_cast_transpose(%arg : vector<2x3x1x1xi8>) -> vector<6x1x1xi8> // ----- -// Test of shape_cast folding. // Scalable dimensions should be treated as non-unit dimensions. 
-// CHECK-LABEL: @shape_cast_transpose_scalable +// CHECK-LABEL: @transpose_of_shape_cast_scalable // CHECK: vector.shape_cast // CHECK: vector.transpose -func.func @shape_cast_transpose_scalable_unit(%arg : vector<[1]x4x1xi8>) -> vector<4x[1]xi8> { +func.func @transpose_of_shape_cast_scalable_unit(%arg : vector<[1]x4x1xi8>) -> vector<4x[1]xi8> { %0 = vector.shape_cast %arg : vector<[1]x4x1xi8> to vector<[1]x4xi8> %1 = vector.transpose %0, [1, 0] : vector<[1]x4xi8> to vector<4x[1]xi8> return %1 : vector<4x[1]xi8> @@ -237,13 +242,57 @@ func.func @shape_cast_transpose_scalable_unit(%arg : vector<[1]x4x1xi8>) -> vect // ----- // Test of shape_cast (not) folding. -// CHECK-LABEL: @negative_shape_cast_transpose +// CHECK-LABEL: @negative_transpose_of_shape_cast // CHECK-SAME: %[[ARG:.*]]: vector<6xi8>) -> vector<2x3xi8> { // CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[ARG]] : // CHECK: %[[TRANSPOSE:.*]] = vector.transpose %[[SHAPE_CAST]] // CHECK: return %[[TRANSPOSE]] : vector<2x3xi8> -func.func @negative_shape_cast_transpose(%arg : vector<6xi8>) -> vector<2x3xi8> { +func.func @negative_transpose_of_shape_cast(%arg : vector<6xi8>) -> vector<2x3xi8> { %0 = vector.shape_cast %arg : vector<6xi8> to vector<3x2xi8> %1 = vector.transpose %0, [1, 0] : vector<3x2xi8> to vector<2x3xi8> return %1 : vector<2x3xi8> } + +// ----- + +// +----------------------------------- +// Tests of TransposeOp::fold +// +----------------------------------- + +// CHECK-LABEL: transpose_1D_identity +// CHECK-SAME: [[ARG:%.*]]: vector<4xf32> +// CHECK-NEXT: return [[ARG]] +func.func @transpose_1D_identity(%arg : vector<4xf32>) -> vector<4xf32> { + %0 = vector.transpose %arg, [0] : vector<4xf32> to vector<4xf32> + return %0 : vector<4xf32> +} + +// ----- + +// CHECK-LABEL: transpose_2D_identity +// CHECK-SAME: [[ARG:%.*]]: vector<4x3xf32> +// CHECK-NEXT: return [[ARG]] +func.func @transpose_2D_identity(%arg : vector<4x3xf32>) -> vector<4x3xf32> { + %0 = vector.transpose %arg, [0, 1] : 
vector<4x3xf32> to vector<4x3xf32> + return %0 : vector<4x3xf32> +} + +// ----- + +// CHECK-LABEL: transpose_shape_and_order_preserving +// CHECK-SAME: [[ARG:%.*]]: vector<6x1x1x4xi8> +// CHECK-NEXT: return [[ARG]] +func.func @transpose_shape_and_order_preserving(%arg : vector<6x1x1x4xi8>) -> vector<6x1x1x4xi8> { + %0 = vector.transpose %arg, [0, 2, 1, 3] : vector<6x1x1x4xi8> to vector<6x1x1x4xi8> + return %0 : vector<6x1x1x4xi8> +} + +// ----- + +// CHECK-LABEL: negative_transpose_fold +// CHECK: [[TRANSP:%.*]] = vector.transpose +// CHECK: return [[TRANSP]] +func.func @negative_transpose_fold(%arg : vector<2x2xi8>) -> vector<2x2xi8> { + %0 = vector.transpose %arg, [1, 0] : vector<2x2xi8> to vector<2x2xi8> + return %0 : vector<2x2xi8> +} diff --git a/mlir/test/Dialect/Vector/linearize.mlir b/mlir/test/Dialect/Vector/linearize.mlir index 01ad1ac48b012..9cbf319ffddb2 100644 --- a/mlir/test/Dialect/Vector/linearize.mlir +++ b/mlir/test/Dialect/Vector/linearize.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -split-input-file -test-vector-linearize -verify-diagnostics | FileCheck %s +// RUN: mlir-opt %s -split-input-file -test-vector-linearize -verify-diagnostics | FileCheck %s // CHECK-LABEL: test_linearize // CHECK-SAME: (%[[ORIG_ARG:.*]]: vector<2x2xf32>) @@ -131,9 +131,9 @@ func.func @test_0d_vector() -> vector { // ----- -// CHECK-LABEL: test_extract_strided_slice_1 +// CHECK-LABEL: test_extract_strided_slice_2D // CHECK-SAME: (%[[ORIG_ARG:.*]]: vector<4x8xf32>) -> vector<2x2xf32> { -func.func @test_extract_strided_slice_1(%arg0 : vector<4x8xf32>) -> vector<2x2xf32> { +func.func @test_extract_strided_slice_2D(%arg0 : vector<4x8xf32>) -> vector<2x2xf32> { // CHECK: %[[ARG:.*]] = vector.shape_cast %[[ORIG_ARG]] : vector<4x8xf32> to vector<32xf32> // CHECK: %[[SHUFFLE:.*]] = vector.shuffle %[[ARG]], %[[ARG]] @@ -147,13 +147,13 @@ func.func @test_extract_strided_slice_1(%arg0 : vector<4x8xf32>) -> vector<2x2xf // ----- -// CHECK-LABEL: func.func 
@test_extract_strided_slice_1_scalable( +// CHECK-LABEL: func.func @test_extract_strided_slice_2D_scalable( // CHECK-SAME: %[[VAL_0:.*]]: vector<4x[8]xf32>) -> vector<2x[8]xf32> { -func.func @test_extract_strided_slice_1_scalable(%arg0: vector<4x[8]xf32>) -> vector<2x[8]xf32> { +func.func @test_extract_strided_slice_2D_scalable(%arg0: vector<4x[8]xf32>) -> vector<2x[8]xf32> { // CHECK-NOT: vector.shuffle // CHECK-NOT: vector.shape_cast - // CHECK: %[[RES:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [1, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x[8]xf32> to vector<2x[8]xf32> + // CHECK: %[[RES:.*]] = vector.extract_strided_slice %[[VAL_0]] %0 = vector.extract_strided_slice %arg0 { sizes = [2, 8], strides = [1, 1], offsets = [1, 0] } : vector<4x[8]xf32> to vector<2x[8]xf32> // CHECK: return %[[RES]] : vector<2x[8]xf32> @@ -162,9 +162,9 @@ func.func @test_extract_strided_slice_1_scalable(%arg0: vector<4x[8]xf32>) -> ve // ----- -// CHECK-LABEL: test_extract_strided_slice_2 +// CHECK-LABEL: test_extract_strided_slice_3D // CHECK-SAME: (%[[ORIG_ARG:.*]]: vector<2x8x2xf32>) -> vector<1x4x2xf32> { -func.func @test_extract_strided_slice_2(%arg0 : vector<2x8x2xf32>) -> vector<1x4x2xf32> { +func.func @test_extract_strided_slice_3D(%arg0 : vector<2x8x2xf32>) -> vector<1x4x2xf32> { // CHECK: %[[ARG:.*]] = vector.shape_cast %[[ORIG_ARG]] : vector<2x8x2xf32> to vector<32xf32> // CHECK: %[[SHUFFLE:.*]] = vector.shuffle %[[ARG]], %[[ARG]] @@ -178,6 +178,76 @@ func.func @test_extract_strided_slice_2(%arg0 : vector<2x8x2xf32>) -> vector<1x4 // ----- +// Test of insert_strided_slice -> shuffle. +// This is a contiguous insertion of 4 elements at offset 6 into a vector of 12 elements. 
+// CHECK-LABEL: insert_strided_slice_2D_into_4D +func.func @insert_strided_slice_2D_into_4D(%arg0 : vector<2x2xi8>, %arg1 : vector<2x1x3x2xi8>) -> vector<2x1x3x2xi8> { + +// CHECK-DAG: %[[ARG0:.*]] = vector.shape_cast {{.*}} to vector<4xi8> +// CHECK-DAG: %[[ARG1:.*]] = vector.shape_cast {{.*}} to vector<12xi8> +// CHECK: vector.shuffle %[[ARG1]], %[[ARG0]] +// CHECK-SAME: [0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 10, 11] : vector<12xi8>, vector<4xi8> + %0 = vector.insert_strided_slice %arg0, %arg1 {offsets = [1, 0, 0, 0], strides = [1, 1]} : vector<2x2xi8> into vector<2x1x3x2xi8> + +// CHECK: %[[RES:.*]] = vector.shape_cast {{.*}} to vector<2x1x3x2xi8> +// CHECK: return %[[RES]] : vector<2x1x3x2xi8> + return %0 : vector<2x1x3x2xi8> +} + +// ----- + +// Test of insert_strided_slice -> shuffle. +// [[[0, 1], [2, 3], [4, 5]], [[6, 7], [8, 9], [10, 11]], [[12, 13], [14, 15]], [[16, 17]]] +// ^ ^ +// | | +// where the 2 elements are inserted into the 3x3x2 vector +// CHECK-LABEL: insert_strided_slice_3D +func.func @insert_strided_slice_3D(%arg0 : vector<1x2x1xi8>, %arg1 : vector<3x3x2xi8>) -> vector<3x3x2xi8> { + +// CHECK-DAG: %[[ARG0:.*]] = vector.shape_cast {{.*}} to vector<2xi8> +// CHECK-DAG: %[[ARG1:.*]] = vector.shape_cast {{.*}} to vector<18xi8> +// CHECK: vector.shuffle %[[ARG1]], %[[ARG0]] +// CHECK-SAME: [0, 1, 2, 3, 4, 5, 6, 7, 8, 18, 10, 19, 12, 13, 14, 15, 16, 17] : vector<18xi8>, vector<2xi8> + %0 = vector.insert_strided_slice %arg0, %arg1 {offsets = [1, 1, 1], sizes = [1, 2, 1], strides = [1, 1, 1]} : vector<1x2x1xi8> into vector<3x3x2xi8> + +// CHECK: %[[RES:.*]] = vector.shape_cast {{.*}} to vector<3x3x2xi8> +// CHECK: return %[[RES]] : vector<3x3x2xi8> + return %0 : vector<3x3x2xi8> +} + +// ----- + +// CHECK-LABEL: insert_strided_slice_2D_higher_offsets +func.func @insert_strided_slice_2D_higher_offsets(%arg0 : vector<2x1xi8>, %arg1 : vector<2x2xi8>, %arg2 : vector<5x2xi8>) -> vector<5x2xi8> { + + // CHECK: [0, 1, 2, 3, 10, 11, 12, 13, 8, 9] + // ^^^ ^^^ 
^^^ ^^^ + // insertion indices + %0 = vector.insert_strided_slice %arg1, %arg2 {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<2x2xi8> into vector<5x2xi8> + + // CHECK: [0, 1, 2, 3, 10, 5, 11, 7, 8, 9] + // ^^^ ^^^ + %1 = vector.insert_strided_slice %arg0, %0 {offsets = [2, 0], sizes = [2, 1], strides = [1, 1]} : vector<2x1xi8> into vector<5x2xi8> + + // CHECK: [0, 1, 2, 3, 4, 5, 6, 10, 8, 11] + // ^^^ ^^^ + %2 = vector.insert_strided_slice %arg0, %1 {offsets = [3, 1], sizes = [2, 1], strides = [1, 1]} : vector<2x1xi8> into vector<5x2xi8> + + return %2 : vector<5x2xi8> +} + +// ----- + +// CHECK-LABEL: negative_insert_strided_slice_scalable +// CHECK-NOT: vector.shuffle +// CHECK: return +func.func @negative_insert_strided_slice_scalable(%arg0 : vector<1x[2]xi8>, %arg1 : vector<2x[2]xi8>) -> vector<2x[2]xi8> { + %0 = vector.insert_strided_slice %arg0, %arg1 {offsets = [0, 0], strides = [1,1]} : vector<1x[2]xi8> into vector<2x[2]xi8> + return %0 : vector<2x[2]xi8> +} + +// ----- + // CHECK-LABEL: test_vector_shuffle // CHECK-SAME: (%[[ORIG_ARG0:.*]]: vector<4x2xf32>, %[[ORIG_ARG1:.*]]: vector<4x2xf32>) -> vector<8x2xf32> { func.func @test_vector_shuffle(%arg0: vector<4x2xf32>, %arg1: vector<4x2xf32>) -> vector<8x2xf32> { @@ -322,6 +392,28 @@ func.func @test_vector_bitcast(%arg0: vector<[4]x2xf32>) -> vector<[4]x4xf16> { // ----- +// CHECK-LABEL: test_linearize_across_for +func.func @test_linearize_across_for(%arg0 : vector<4xi8>) -> vector<4xi8> { + %0 = vector.shape_cast %arg0 : vector<4xi8> to vector<2x2xi8> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + + // CHECK: scf.for {{.*}} -> (vector<4xi8>) + %1 = scf.for %i = %c0 to %c4 step %c1 iter_args(%arg1 = %0) -> (vector<2x2xi8>) { + + // CHECK: arith.addi {{.*}} : vector<4xi8> + %2 = arith.addi %arg1, %0 : vector<2x2xi8> + + // CHECK: scf.yield {{.*}} : vector<4xi8> + scf.yield %2 : vector<2x2xi8> + } + %3 = vector.shape_cast %1 : vector<2x2xi8> 
to vector<4xi8> + return %3 : vector<4xi8> +} + +// ----- + // CHECK-LABEL: linearize_vector_splat // CHECK-SAME: (%[[ARG:.*]]: i32) -> vector<4x2xi32> func.func @linearize_vector_splat(%arg0: i32) -> vector<4x2xi32> { @@ -344,4 +436,31 @@ func.func @linearize_scalable_vector_splat(%arg0: i32) -> vector<4x[2]xi32> { // CHECK: return %[[CAST]] : vector<4x[2]xi32> %0 = vector.splat %arg0 : vector<4x[2]xi32> return %0 : vector<4x[2]xi32> + +} + +// ----- + +// CHECK-LABEL: linearize_create_mask +// CHECK-SAME: (%[[ARG0:.*]]: index, %[[ARG1:.*]]: index) -> vector<1x16xi1> +func.func @linearize_create_mask(%arg0 : index, %arg1 : index) -> vector<1x16xi1> { + + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[ARG0]], %[[C0]] : index + // CHECK: %[[INDEXCAST:.*]] = arith.index_cast %[[CMP]] : i1 to index + // CHECK: %[[MULI:.*]] = arith.andi %[[INDEXCAST]], %[[ARG1]] : index + // CHECK: %[[MASK_1D:.*]] = vector.create_mask %[[MULI]] : vector<16xi1> + // CHECK: %[[CAST:.*]] = vector.shape_cast %[[MASK_1D]] : vector<16xi1> to vector<1x16xi1> + // CHECK: return %[[CAST]] : vector<1x16xi1> + %0 = vector.create_mask %arg0, %arg1 : vector<1x16xi1> + return %0 : vector<1x16xi1> +} + +// ----- +// CHECK-LABEL: linearize_scalable_create_mask +func.func @linearize_scalable_create_mask(%arg0 : index, %arg1 : index) -> vector<1x[16]xi1> { + + // CHECK: %[[MASK_1D:.*]] = vector.create_mask {{%.*}} : vector<[16]xi1> + %0 = vector.create_mask %arg0, %arg1 : vector<1x[16]xi1> + return %0 : vector<1x[16]xi1> } diff --git a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir index 83395504e8c74..a730f217f027d 100644 --- a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir @@ -65,13 +65,15 @@ func.func @transpose102_8x1x8xf32(%arg0: vector<8x1x8xf32>) -> vector<1x8x8xf32> return %0 : vector<1x8x8xf32> } -// CHECK-LABEL: 
func @transpose1023_1x1x8x8xf32( -func.func @transpose1023_1x1x8x8xf32(%arg0: vector<1x1x8x8xf32>) -> vector<1x1x8x8xf32> { - // Note the single 2-D extract/insert pair since 2 and 3 are not transposed! - // CHECK: vector.extract {{.*}}[0, 0] : vector<8x8xf32> from vector<1x1x8x8xf32> - // CHECK-NEXT: vector.insert {{.*}} [0, 0] : vector<8x8xf32> into vector<1x1x8x8xf32> - %0 = vector.transpose %arg0, [1, 0, 2, 3] : vector<1x1x8x8xf32> to vector<1x1x8x8xf32> - return %0 : vector<1x1x8x8xf32> +// CHECK-LABEL: func @transpose1023_2x1x8x4xf32( +func.func @transpose1023_2x1x8x4xf32(%arg0: vector<2x1x8x4xf32>) -> vector<1x2x8x4xf32> { + // Note the 2-D extract/insert pair since dimensions 2 and 3 are not transposed! + // CHECK: vector.extract {{.*}}[0, 0] : vector<8x4xf32> from vector<2x1x8x4xf32> + // CHECK-NEXT: vector.insert {{.*}} [0, 0] : vector<8x4xf32> into vector<1x2x8x4xf32> + // CHECK-NEXT: vector.extract {{.*}}[1, 0] : vector<8x4xf32> from vector<2x1x8x4xf32> + // CHECK-NEXT: vector.insert {{.*}} [0, 1] : vector<8x4xf32> into vector<1x2x8x4xf32> + %0 = vector.transpose %arg0, [1, 0, 2, 3] : vector<2x1x8x4xf32> to vector<1x2x8x4xf32> + return %0 : vector<1x2x8x4xf32> } /// Scalable dim should not be unrolled. 
diff --git a/mlir/test/IR/parser.mlir b/mlir/test/IR/parser.mlir index bb19cb63b65a5..3bb6e38b4d613 100644 --- a/mlir/test/IR/parser.mlir +++ b/mlir/test/IR/parser.mlir @@ -1232,6 +1232,13 @@ func.func @parse_base64_test() { return } +// CHECK-LABEL: func @parse_slash_test +func.func @parse_slash_test() { + // CHECK: "test.slash_attr"() <{attr = #test.slash_attr<1 / 2>}> : () -> () + "test.slash_attr"() { attr = #test.slash_attr<1 / 2> } : () -> () + return +} + // CHECK-LABEL: func @"\22_string_symbol_reference\22" func.func @"\"_string_symbol_reference\""() { // CHECK: ref = @"\22_string_symbol_reference\22" diff --git a/mlir/test/IR/parser_dialect_loading.mlir b/mlir/test/IR/parser_dialect_loading.mlir deleted file mode 100644 index b9c2d30cf3c98..0000000000000 --- a/mlir/test/IR/parser_dialect_loading.mlir +++ /dev/null @@ -1,19 +0,0 @@ -// RUN: mlir-opt -allow-unregistered-dialect --split-input-file %s | FileCheck %s - -// This is a testing that a non-qualified attribute in a custom format -// correctly preload the dialect before creating the attribute. 
-#attr = #test.nested_polynomial> -// CHECK-LABEL: @parse_correctly -llvm.func @parse_correctly() { - test.containing_int_polynomial_attr #attr - llvm.return -} - -// ----- - -#attr2 = #test.nested_polynomial2> -// CHECK-LABEL: @parse_correctly_2 -llvm.func @parse_correctly_2() { - test.containing_int_polynomial_attr2 #attr2 - llvm.return -} diff --git a/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir b/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir index 3c508fbb67a11..06bc0e7ef44ec 100644 --- a/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir +++ b/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir @@ -54,7 +54,7 @@ func.func @main() { %result_static = func.call @max_pool_static(%A) : (!tensor_type) -> !tensor_type %result_dynamic = func.call @max_pool_dynamic(%A_dynamic) : (tensor) -> tensor - %static_buffer = bufferization.to_memref %result_static : !tensor_type to !memref_type + %static_buffer = bufferization.to_buffer %result_static : !tensor_type to !memref_type %unranked_static_buffer = memref.cast %static_buffer : !memref_type to memref<*xf32> // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 4, 4, 1] strides = [16, 4, 1, 1] data = @@ -81,7 +81,7 @@ func.func @main() { func.call @printMemrefF32(%unranked_static_buffer) : (memref<*xf32>) -> () - %dynamic_buffer = bufferization.to_memref %result_dynamic : tensor to memref + %dynamic_buffer = bufferization.to_buffer %result_dynamic : tensor to memref %unranked_dynamic_buffer = memref.cast %dynamic_buffer : memref to memref<*xf32> // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 4, 4, 1] strides = [16, 4, 1, 1] data = diff --git a/mlir/test/Integration/Dialect/Vector/CPU/AMX/mulf-full.mlir b/mlir/test/Integration/Dialect/Vector/CPU/AMX/mulf-full.mlir index 8cf15cd697868..8014bb7d2dcce 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/AMX/mulf-full.mlir +++ 
b/mlir/test/Integration/Dialect/Vector/CPU/AMX/mulf-full.mlir @@ -100,8 +100,8 @@ func.func @entry() -> i32 { ]> : tensor<16x32xbf16> // Set up memory. - %a = bufferization.to_memref %0 : tensor<16x32xbf16> to memref<16x32xbf16> - %b = bufferization.to_memref %1 : tensor<16x32xbf16> to memref<16x32xbf16> + %a = bufferization.to_buffer %0 : tensor<16x32xbf16> to memref<16x32xbf16> + %b = bufferization.to_buffer %1 : tensor<16x32xbf16> to memref<16x32xbf16> %c = memref.alloc() : memref<16x16xf32> // Call kernel. diff --git a/mlir/test/Integration/Dialect/Vector/CPU/AMX/muli-full.mlir b/mlir/test/Integration/Dialect/Vector/CPU/AMX/muli-full.mlir index 652ba0698c4c9..a0076db6660d7 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/AMX/muli-full.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/AMX/muli-full.mlir @@ -100,8 +100,8 @@ func.func @entry() -> i32 { ]> : tensor<16x64xi8> // Set up memory. - %a = bufferization.to_memref %0 : tensor<16x64xi8> to memref<16x64xi8> - %b = bufferization.to_memref %1 : tensor<16x64xi8> to memref<16x64xi8> + %a = bufferization.to_buffer %0 : tensor<16x64xi8> to memref<16x64xi8> + %b = bufferization.to_buffer %1 : tensor<16x64xi8> to memref<16x64xi8> %c = memref.alloc() : memref<16x16xi32> // Call kernel. 
diff --git a/mlir/test/Target/LLVMIR/Import/import-structs-as-literals.ll b/mlir/test/Target/LLVMIR/Import/import-structs-as-literals.ll new file mode 100644 index 0000000000000..40fd834817d04 --- /dev/null +++ b/mlir/test/Target/LLVMIR/Import/import-structs-as-literals.ll @@ -0,0 +1,13 @@ +; RUN: mlir-translate -import-llvm -import-structs-as-literals -split-input-file %s | FileCheck %s + +%named = type {i32, i8, i16, i32} + +; CHECK: @named +; CHECK-SAME: !llvm.struct<(i32, i8, i16, i32)> +@named = external global %named + +%opaque = type opaque + +; CHECK: @opaque +; CHECK-SAME: !llvm.struct<()> +@opaque = external global %opaque diff --git a/mlir/test/Target/LLVMIR/Import/instructions.ll b/mlir/test/Target/LLVMIR/Import/instructions.ll index 9dacd35c26833..68ef47c3f42f1 100644 --- a/mlir/test/Target/LLVMIR/Import/instructions.ll +++ b/mlir/test/Target/LLVMIR/Import/instructions.ll @@ -542,14 +542,55 @@ define void @indirect_vararg_call(ptr addrspace(42) %fn) { ; CHECK-LABEL: @inlineasm ; CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] define i32 @inlineasm(i32 %arg1) { - ; CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects "bswap $0", "=r,r" %[[ARG1]] : (i32) -> i32 - %1 = call i32 asm "bswap $0", "=r,r"(i32 %arg1) + ; CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects is_align_stack asm_dialect = intel "bswap $0", "=r,r" %[[ARG1]] : (i32) -> i32 + %1 = call i32 asm sideeffect alignstack inteldialect "bswap $0", "=r,r"(i32 %arg1) ; CHECK: return %[[RES]] ret i32 %1 } ; // ----- +; CHECK-LABEL: @inlineasm2 +define void @inlineasm2() { + %p = alloca ptr, align 8 + ; CHECK: {{.*}} = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr + ; CHECK-NEXT: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [{elementtype = !llvm.ptr}] "", "*m,~{memory}" {{.*}} : (!llvm.ptr) -> !llvm.void + call void asm sideeffect "", "*m,~{memory}"(ptr elementtype(ptr) %p) + ret void +} + +; // ----- + +; CHECK: llvm.func @inlineasm3 +; CHECK-SAME:(%[[A0:.*]]: 
!llvm.ptr, %[[A1:.*]]: i64, %[[A2:.*]]: !llvm.ptr, %[[A3:.*]]: i64) { +define void @inlineasm3( + ptr %ptr0, + i64 %b, + ptr %ptr1, + i64 %c + ) { + ; CHECK: llvm.inline_asm asm_dialect = att operand_attrs = + ; CHECK-SAME: [{elementtype = !llvm.array<16 x i64>}, {}, + ; CHECK-SAME: {elementtype = !llvm.array<16 x i64>}, {}, {}, {}, + ; CHECK-SAME: {elementtype = !llvm.array<16 x i64>}] + ; CHECK-SAME: "ldr x4, [$2], #8 \0A\09ldr x5, [$1] \0A\09mul x6, x4, $4 \0A\09", + ; CHECK-SAME: "=r,=r,=r,=*m,r,*m,0,1,2,*m,~{x4},~{x5},~{x6},~{x7},~{cc}" + ; CHECK-SAME: %[[A0]], %[[A1]], %[[A2]], %[[A3]], %[[A0]], %[[A2]], %[[A0]] : + ; CHECK-SAME: (!llvm.ptr, i64, !llvm.ptr, i64, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> !llvm.struct<(i64, ptr, ptr)> + %r = call { i64, ptr, ptr } asm "ldr x4, [$2], #8 \0A\09ldr x5, [$1] \0A\09mul x6, x4, $4 \0A\09", + "=r,=r,=r,=*m,r,*m,0,1,2,*m,~{x4},~{x5},~{x6},~{x7},~{cc}"( + ptr elementtype([16 x i64]) %ptr0, + i64 %b, + ptr elementtype([16 x i64]) %ptr1, + i64 %c, + ptr %ptr0, + ptr %ptr1, + ptr elementtype([16 x i64]) %ptr0) + ret void +} + +; // ----- + ; CHECK-LABEL: @gep_static_idx ; CHECK-SAME: %[[PTR:[a-zA-Z0-9]+]] define void @gep_static_idx(ptr %ptr) { diff --git a/mlir/test/Target/LLVMIR/Import/struct.ll b/mlir/test/Target/LLVMIR/Import/struct.ll new file mode 100644 index 0000000000000..dd94035e585f4 --- /dev/null +++ b/mlir/test/Target/LLVMIR/Import/struct.ll @@ -0,0 +1,10 @@ +; RUN: mlir-translate -import-llvm -split-input-file %s | FileCheck %s + +%"bucket::Iterator" = type { ptr, i64, i64 } + +; CHECK-LABEL: llvm.func @g +define void @g() { + %item.i = alloca %"bucket::Iterator", align 8 + ; CHECK: llvm.alloca %0 x !llvm.struct<"bucket::Iterator", (ptr, i64, i64)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + ret void +} diff --git a/mlir/test/Target/LLVMIR/arm-sve.mlir b/mlir/test/Target/LLVMIR/arm-sve.mlir index ed5a1fc7ba2e4..da71cb5a63bd2 100644 --- a/mlir/test/Target/LLVMIR/arm-sve.mlir +++ 
b/mlir/test/Target/LLVMIR/arm-sve.mlir @@ -48,6 +48,18 @@ llvm.func @arm_sve_ummla(%arg0: vector<[16]xi8>, llvm.return %0 : vector<[4]xi32> } +// CHECK-LABEL: define @arm_sve_usmmla +llvm.func @arm_sve_usmmla(%arg0: vector<[16]xi8>, + %arg1: vector<[16]xi8>, + %arg2: vector<[4]xi32>) + -> vector<[4]xi32> { + // CHECK: call @llvm.aarch64.sve.usmmla.nxv4i32(, vector<[16]xi8>, vector<[16]xi8>) + -> vector<[4]xi32> + llvm.return %0 : vector<[4]xi32> +} + // CHECK-LABEL: define @arm_sve_arithi llvm.func @arm_sve_arithi(%arg0: vector<[4]xi32>, %arg1: vector<[4]xi32>, @@ -390,3 +402,35 @@ llvm.func @arm_sve_psel(%pn: vector<[16]xi1>, %p1: vector<[2]xi1>, %p2: vector<[ "arm_sve.intr.psel"(%pn, %p4, %index) : (vector<[16]xi1>, vector<[16]xi1>, i32) -> vector<[16]xi1> llvm.return } + +// CHECK-LABEL: @arm_sve_dupq_lane +// CHECK-SAME: %[[V0:[0-9]+]] +// CHECK-SAME: %[[V1:[0-9]+]] +// CHECK-SAME: %[[V2:[0-9]+]] +// CHECK-SAME: %[[V3:[0-9]+]] +// CHECK-SAME: %[[V4:[0-9]+]] +// CHECK-SAME: %[[V5:[0-9]+]] +// CHECK-SAME: %[[V6:[0-9]+]] +// CHECK-SAME: %[[V7:[0-9]+]] +llvm.func @arm_sve_dupq_lane(%nxv16i8: vector<[16]xi8>, %nxv8i16: vector<[8]xi16>, + %nxv8f16: vector<[8]xf16>, %nxv8bf16: vector<[8]xbf16>, + %nxv4i32: vector<[4]xi32>, %nxv4f32: vector<[4]xf32>, + %nxv2i64: vector<[2]xi64>, %nxv2f64: vector<[2]xf64>) { + // CHECK: call @llvm.aarch64.sve.dupq.lane.nxv16i8( %[[V0]], i64 0) + %0 = "arm_sve.intr.dupq_lane"(%nxv16i8) <{lane = 0 : i64}> : (vector<[16]xi8>) -> vector<[16]xi8> + // CHECK: call @llvm.aarch64.sve.dupq.lane.nxv8i16( %[[V1]], i64 1) + %1 = "arm_sve.intr.dupq_lane"(%nxv8i16) <{lane = 1 : i64}> : (vector<[8]xi16>) -> vector<[8]xi16> + // CHECK: call @llvm.aarch64.sve.dupq.lane.nxv8f16( %[[V2]], i64 2) + %2 = "arm_sve.intr.dupq_lane"(%nxv8f16) <{lane = 2 : i64}> : (vector<[8]xf16>) -> vector<[8]xf16> + // CHECK: call @llvm.aarch64.sve.dupq.lane.nxv8bf16( %[[V3]], i64 3) + %3 = "arm_sve.intr.dupq_lane"(%nxv8bf16) <{lane = 3 : i64}> : (vector<[8]xbf16>) -> 
vector<[8]xbf16> + // CHECK: call @llvm.aarch64.sve.dupq.lane.nxv4i32( %[[V4]], i64 4) + %4 = "arm_sve.intr.dupq_lane"(%nxv4i32) <{lane = 4 : i64}> : (vector<[4]xi32>) -> vector<[4]xi32> + // CHECK: call @llvm.aarch64.sve.dupq.lane.nxv4f32( %[[V5]], i64 5) + %5 = "arm_sve.intr.dupq_lane"(%nxv4f32) <{lane = 5 : i64}> : (vector<[4]xf32>) -> vector<[4]xf32> + // CHECK: call @llvm.aarch64.sve.dupq.lane.nxv2i64( %[[V6]], i64 6) + %6 = "arm_sve.intr.dupq_lane"(%nxv2i64) <{lane = 6 : i64}> : (vector<[2]xi64>) -> vector<[2]xi64> + // CHECK: call @llvm.aarch64.sve.dupq.lane.nxv2f64( %[[V7]], i64 7) + %7 = "arm_sve.intr.dupq_lane"(%nxv2f64) <{lane = 7 : i64}> : (vector<[2]xf64>) -> vector<[2]xf64> + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/blockaddress.mlir b/mlir/test/Target/LLVMIR/blockaddress.mlir index fb3d853531122..4473f91c4bdb5 100644 --- a/mlir/test/Target/LLVMIR/blockaddress.mlir +++ b/mlir/test/Target/LLVMIR/blockaddress.mlir @@ -34,3 +34,32 @@ llvm.func @blockaddr0() -> !llvm.ptr { // CHECK: [[RET]]: // CHECK: ret ptr blockaddress(@blockaddr0, %1) // CHECK: } + +// ----- + +llvm.mlir.global private @h() {addr_space = 0 : i32, dso_local} : !llvm.ptr { + %0 = llvm.blockaddress > : !llvm.ptr + llvm.return %0 : !llvm.ptr +} + +// CHECK: @h = private global ptr blockaddress(@h3, %[[BB_ADDR:.*]]) + +// CHECK: define void @h3() { +// CHECK: br label %[[BB_ADDR]] + +// CHECK: [[BB_ADDR]]: +// CHECK: ret void +// CHECK: } + +// CHECK: define void @h0() + +llvm.func @h3() { + llvm.br ^bb1 +^bb1: + llvm.blocktag + llvm.return +} + +llvm.func @h0() { + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index 4ef68fa83a70d..3c8de1cf63b94 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -2056,7 +2056,7 @@ module attributes {} {} // ----- // CHECK-LABEL: @useInlineAsm -llvm.func @useInlineAsm(%arg0: i32) { +llvm.func @useInlineAsm(%arg0: i32, %arg1 : !llvm.ptr) { // 
Constraints string is checked at LLVM InlineAsm instruction construction time. // So we can't just use "bar" everywhere, number of in/out arguments has to match. @@ -2081,6 +2081,28 @@ llvm.func @useInlineAsm(%arg0: i32) { // CHECK-NEXT: call { i8, i8 } asm "foo", "=r,=r,r"(i32 {{.*}}) %5 = llvm.inline_asm "foo", "=r,=r,r" %arg0 : (i32) -> !llvm.struct<(i8, i8)> + // CHECK-NEXT: call void asm sideeffect "", "*m,~{memory}"(ptr elementtype(ptr) %1) + %6 = llvm.inline_asm has_side_effects operand_attrs = [{elementtype = !llvm.ptr}] "", "*m,~{memory}" %arg1 : (!llvm.ptr) -> !llvm.void + + llvm.return +} + +// ----- + +// CHECK: @useInlineAsm2(ptr %[[A0:.*]], i64 %[[A1:.*]], ptr %[[A2:.*]], i64 %[[A3:.*]]) { +llvm.func @useInlineAsm2(%arg0: !llvm.ptr, %arg1: i64, %arg2: !llvm.ptr, %arg3: i64) { + // CHECK: call { i64, ptr, ptr } asm sideeffect + // CHECK-SAME: "ldr x4, [$2], #8 \0A\09ldr x5, [$1] \0A\09mul x6, x4, $4 \0A\09", + // CHECK-SAME: "=r,=r,=r,=*m,r,*m,0,1,2,*m,~{x4},~{x5},~{x6},~{x7},~{cc}" + // CHECK-SAME:(ptr elementtype([16 x i64]) %[[A0]], i64 %[[A1]], ptr elementtype([16 x i64]) %[[A2]], + // CHECK-SAME: i64 %[[A3]], ptr %[[A0]], ptr %[[A2]], ptr elementtype([16 x i64]) %[[A0]]) + %0 = llvm.inline_asm has_side_effects operand_attrs = [ + {elementtype = !llvm.array<16 x i64>}, {}, {elementtype = !llvm.array<16 x i64>}, + {}, {}, {}, {elementtype = !llvm.array<16 x i64>}] + "ldr x4, [$2], #8 \0A\09ldr x5, [$1] \0A\09mul x6, x4, $4 \0A\09", + "=r,=r,=r,=*m,r,*m,0,1,2,*m,~{x4},~{x5},~{x6},~{x7},~{cc}" + %arg0, %arg1, %arg2, %arg3, %arg0, %arg2, %arg0 : + (!llvm.ptr, i64, !llvm.ptr, i64, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> !llvm.struct<(i64, ptr, ptr)> llvm.return } diff --git a/mlir/test/Transforms/composite-pass.mlir b/mlir/test/Transforms/composite-pass.mlir index 829470c2c9aa6..75587edd5b96d 100644 --- a/mlir/test/Transforms/composite-pass.mlir +++ b/mlir/test/Transforms/composite-pass.mlir @@ -1,6 +1,10 @@ -// RUN: mlir-opt %s --log-actions-to=- 
--test-composite-fixed-point-pass -split-input-file | FileCheck %s +// RUN: mlir-opt %s --log-actions-to=- --test-composite-fixed-point-pass -split-input-file --dump-pass-pipeline 2>&1 | FileCheck %s --check-prefixes=CHECK,PIPELINE // RUN: mlir-opt %s --log-actions-to=- --composite-fixed-point-pass='name=TestCompositePass pipeline=any(canonicalize,cse)' -split-input-file | FileCheck %s +// Ensure the composite pass correctly prints its options. +// PIPELINE: builtin.module(composite-fixed-point-pass{max-iterations=10 name=TestCompositePass +// PIPELINE-SAME: pipeline=canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse}) + // CHECK-LABEL: running `TestCompositePass` // CHECK: running `Canonicalizer` // CHECK: running `CSE` diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgFusionTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgFusionTransforms.cpp index 81e7eedabd5d1..aab7f30127d96 100644 --- a/mlir/test/lib/Dialect/Linalg/TestLinalgFusionTransforms.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestLinalgFusionTransforms.cpp @@ -47,8 +47,7 @@ static LogicalResult fuseLinalgOpsGreedily(func::FuncOp f) { if (failed(info)) continue; auto *originalOp = info->originalProducer.getOperation(); - auto *originalOpInLinalgOpsVector = - std::find(linalgOps.begin(), linalgOps.end(), originalOp); + auto *originalOpInLinalgOpsVector = llvm::find(linalgOps, originalOp); *originalOpInLinalgOpsVector = info->fusedProducer; // Don't mark for erasure in the tensor case, let DCE handle this. 
changed = true; diff --git a/mlir/test/lib/Dialect/Test/CMakeLists.txt b/mlir/test/lib/Dialect/Test/CMakeLists.txt index 6e608e4772391..d2181cea0ecf9 100644 --- a/mlir/test/lib/Dialect/Test/CMakeLists.txt +++ b/mlir/test/lib/Dialect/Test/CMakeLists.txt @@ -87,7 +87,6 @@ mlir_target_link_libraries(MLIRTestDialect PUBLIC MLIRPtrDialect MLIRLLVMDialect MLIRPass - MLIRPolynomialDialect MLIRReduce MLIRTensorDialect MLIRTransformUtils diff --git a/mlir/test/lib/Dialect/Test/TestAttrDefs.td b/mlir/test/lib/Dialect/Test/TestAttrDefs.td index f470406eac86f..4d825e2f0a8cc 100644 --- a/mlir/test/lib/Dialect/Test/TestAttrDefs.td +++ b/mlir/test/lib/Dialect/Test/TestAttrDefs.td @@ -16,7 +16,6 @@ // To get the test dialect definition. include "TestDialect.td" include "TestEnumDefs.td" -include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.td" include "mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.td" include "mlir/Dialect/Utils/StructuredOpsUtils.td" include "mlir/IR/AttrTypeBase.td" @@ -379,22 +378,6 @@ def TestCustomStructAttr : Test_Attr<"TestCustomStruct"> { }]; } -def NestedPolynomialAttr : Test_Attr<"NestedPolynomialAttr"> { - let mnemonic = "nested_polynomial"; - let parameters = (ins Polynomial_IntPolynomialAttr:$poly); - let assemblyFormat = [{ - `<` struct(params) `>` - }]; -} - -def NestedPolynomialAttr2 : Test_Attr<"NestedPolynomialAttr2"> { - let mnemonic = "nested_polynomial2"; - let parameters = (ins OptionalParameter<"::mlir::polynomial::IntPolynomialAttr">:$poly); - let assemblyFormat = [{ - `<` struct(params) `>` - }]; -} - // Test a ptr constant memory space. 
def TestConstMemorySpaceAttr : Test_Attr<"TestConstMemorySpace", [ DeclareAttrInterfaceMethods @@ -435,4 +418,17 @@ def TestOpAsmAttrInterfaceTablegenDefaultAttr : Test_Attr<"TestOpAsmAttrInterfac let genMnemonicAlias = 1; } +// Test attribute containing a slash token +def SlashAttr: Test_Attr<"Slash">{ + let mnemonic = "slash_attr"; + + let parameters = ( + ins + "int":$lhs, + "int":$rhs + ); + + let hasCustomAssemblyFormat = 1; +} + #endif // TEST_ATTRDEFS diff --git a/mlir/test/lib/Dialect/Test/TestAttributes.cpp b/mlir/test/lib/Dialect/Test/TestAttributes.cpp index b36f246b83d76..80661e68754ce 100644 --- a/mlir/test/lib/Dialect/Test/TestAttributes.cpp +++ b/mlir/test/lib/Dialect/Test/TestAttributes.cpp @@ -497,6 +497,24 @@ getDynamicCustomAssemblyFormatAttr(TestDialect *testDialect) { std::move(parser), std::move(printer)); } +//===----------------------------------------------------------------------===// +// SlashAttr +//===----------------------------------------------------------------------===// + +Attribute SlashAttr::parse(AsmParser &parser, Type type) { + int lhs, rhs; + + if (parser.parseLess() || parser.parseInteger(lhs) || parser.parseSlash() || + parser.parseInteger(rhs) || parser.parseGreater()) + return Attribute(); + + return SlashAttr::get(parser.getContext(), lhs, rhs); +} + +void SlashAttr::print(AsmPrinter &printer) const { + printer << "<" << getLhs() << " / " << getRhs() << ">"; +} + //===----------------------------------------------------------------------===// // TestDialect //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/Dialect/Test/TestAttributes.h b/mlir/test/lib/Dialect/Test/TestAttributes.h index bcbc360758eec..778d84fae7365 100644 --- a/mlir/test/lib/Dialect/Test/TestAttributes.h +++ b/mlir/test/lib/Dialect/Test/TestAttributes.h @@ -17,17 +17,17 @@ #include #include "TestTraits.h" -#include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.h" #include 
"mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Diagnostics.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/DialectImplementation.h" +#include "mlir/IR/DialectResourceBlobManager.h" +// generated files require above includes to come first #include "TestAttrInterfaces.h.inc" #include "TestOpEnums.h.inc" -#include "mlir/IR/DialectResourceBlobManager.h" namespace test { class TestDialect; diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 3e461999e2730..43a0bdaf86cf3 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -243,16 +243,6 @@ def FloatElementsAttrOp : TEST_Op<"float_elements_attr"> { ); } -def ContainingIntPolynomialAttrOp : TEST_Op<"containing_int_polynomial_attr"> { - let arguments = (ins NestedPolynomialAttr:$attr); - let assemblyFormat = "$attr attr-dict"; -} - -def ContainingIntPolynomialAttr2Op : TEST_Op<"containing_int_polynomial_attr2"> { - let arguments = (ins NestedPolynomialAttr2:$attr); - let assemblyFormat = "$attr attr-dict"; -} - // A pattern that updates dense<[3.0, 4.0]> to dense<[5.0, 6.0]>. // This tests both matching and generating float elements attributes. 
def UpdateFloatElementsAttr : Pat< @@ -334,6 +324,10 @@ def DenseArrayAttrOp : TEST_Op<"dense_array_attr"> { }]; } +def SlashAttrOp : TEST_Op<"slash_attr"> { + let arguments = (ins SlashAttr:$attr); +} + //===----------------------------------------------------------------------===// // Test Attributes Constraints //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp index eda2594fbc7c7..ccba2e2806862 100644 --- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp +++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" #include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Patterns.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" @@ -836,9 +837,6 @@ struct TestVectorEmulateMaskedLoadStore final } }; -// TODO: move this code into the user project. -namespace vendor { - /// Get the set of operand/result types to check for sufficiently /// small inner-most dimension size. 
static SmallVector> @@ -960,8 +958,6 @@ struct TestVectorBitWidthLinearize final } }; -} // namespace vendor - struct TestVectorLinearize final : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestVectorLinearize) @@ -973,7 +969,7 @@ struct TestVectorLinearize final return "Linearizes ND vectors for N >= 2 into 1D vectors"; } void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry.insert(); } void runOnOperation() override { @@ -987,6 +983,8 @@ struct TestVectorLinearize final vector::populateVectorLinearizeBasePatterns(converter, target, patterns); vector::populateVectorLinearizeShuffleLikeOpsPatterns(converter, target, patterns); + mlir::scf::populateSCFStructuralTypeConversionsAndLegality( + converter, patterns, target); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) @@ -1067,7 +1065,7 @@ void registerTestVectorLowerings() { PassRegistration(); - PassRegistration(); + PassRegistration(); PassRegistration(); } diff --git a/mlir/test/mlir-runner/verify-entry-point-result.mlir b/mlir/test/mlir-runner/verify-entry-point-result.mlir deleted file mode 100644 index ad46e0b5fe1bf..0000000000000 --- a/mlir/test/mlir-runner/verify-entry-point-result.mlir +++ /dev/null @@ -1,7 +0,0 @@ -// RUN: not mlir-runner %s -e entry -entry-point-result=void 2>&1 | FileCheck %s - -// CHECK: Error: expected void function -llvm.func @entry() -> (i32) { - %0 = llvm.mlir.constant(0 : index) : i32 - llvm.return %0 : i32 -} diff --git a/mlir/test/mlir-runner/verify-entry-point.mlir b/mlir/test/mlir-runner/verify-entry-point.mlir new file mode 100644 index 0000000000000..c7165bd46302f --- /dev/null +++ b/mlir/test/mlir-runner/verify-entry-point.mlir @@ -0,0 +1,48 @@ +// RUN: not mlir-runner %s -e entry_point_void -entry-point-result=void 2>&1 | FileCheck %s --check-prefix=CHECK-ENTRY-POINT-VOID +// RUN: not mlir-runner %s -e entry_inputs_void -entry-point-result=void 2>&1 | FileCheck %s 
--check-prefix=CHECK-ENTRY-INPUTS-VOID +// RUN: not mlir-runner %s -e entry_result_void -entry-point-result=void 2>&1 | FileCheck %s --check-prefix=CHECK-ENTRY-RESULT-VOID +// RUN: not mlir-runner %s -e entry_point_i32 -entry-point-result=i32 2>&1 | FileCheck %s --check-prefix=CHECK-ENTRY-POINT-I32 +// RUN: not mlir-runner %s -e entry_inputs_i32 -entry-point-result=i32 2>&1 | FileCheck %s --check-prefix=CHECK-ENTRY-INPUTS-I32 +// RUN: not mlir-runner %s -e entry_result_i32 -entry-point-result=i32 2>&1 | FileCheck %s --check-prefix=CHECK-ENTRY-RESULT-I32 +// RUN: not mlir-runner %s -e entry_result_i64 -entry-point-result=i64 2>&1 | FileCheck %s --check-prefix=CHECK-ENTRY-RESULT-I64 +// RUN: not mlir-runner %s -e entry_result_f32 -entry-point-result=f32 2>&1 | FileCheck %s --check-prefix=CHECK-ENTRY-RESULT-F32 + +// CHECK-ENTRY-POINT-VOID: Error: entry point not found +llvm.func @entry_point_void() -> () + +// CHECK-ENTRY-INPUTS-VOID: Error: JIT can't invoke a main function expecting arguments +llvm.func @entry_inputs_void(%arg0: i32) { + llvm.return +} + +// CHECK-ENTRY-RESULT-VOID: Error: expected void function +llvm.func @entry_result_void() -> (i32) { + %0 = llvm.mlir.constant(0 : index) : i32 + llvm.return %0 : i32 +} + +// CHECK-ENTRY-POINT-I32: Error: entry point not found +llvm.func @entry_point_i32() -> (i32) + +// CHECK-ENTRY-INPUTS-I32: Error: JIT can't invoke a main function expecting arguments +llvm.func @entry_inputs_i32(%arg0: i32) { + llvm.return +} + +// CHECK-ENTRY-RESULT-I32: Error: only single i32 function result supported +llvm.func @entry_result_i32() -> (i64) { + %0 = llvm.mlir.constant(0 : index) : i64 + llvm.return %0 : i64 +} + +// CHECK-ENTRY-RESULT-I64: Error: only single i64 function result supported +llvm.func @entry_result_i64() -> (i32) { + %0 = llvm.mlir.constant(0 : index) : i32 + llvm.return %0 : i32 +} + +// CHECK-ENTRY-RESULT-F32: Error: only single f32 function result supported +llvm.func @entry_result_f32() -> (i32) { + %0 = 
llvm.mlir.constant(0 : index) : i32 + llvm.return %0 : i32 +} diff --git a/mlir/test/mlir-tblgen/cpp-class-comments.td b/mlir/test/mlir-tblgen/cpp-class-comments.td new file mode 100644 index 0000000000000..a896888d944b2 --- /dev/null +++ b/mlir/test/mlir-tblgen/cpp-class-comments.td @@ -0,0 +1,139 @@ +// RUN: mlir-tblgen -gen-dialect-decls -I %S/../../include %s | FileCheck %s --check-prefix=DIALECT +// RUN: mlir-tblgen -gen-op-decls -I %S/../../include %s | FileCheck %s --check-prefix=OP +// RUN: mlir-tblgen -gen-typedef-decls -I %S/../../include %s | FileCheck %s --check-prefix=TYPE +// RUN: mlir-tblgen -gen-attrdef-decls -I %S/../../include %s | FileCheck %s --check-prefix=ATTR +// RUN: mlir-tblgen -gen-attr-interface-decls -I %S/../../include %s | FileCheck %s --check-prefix=ATTR-INTERFACE +// RUN: mlir-tblgen -gen-op-interface-decls -I %S/../../include %s | FileCheck %s --check-prefix=OP-INTERFACE +// RUN: mlir-tblgen -gen-type-interface-decls -I %S/../../include %s | FileCheck %s --check-prefix=TYPE-INTERFACE +// RUN: mlir-tblgen -gen-enum-decls -I %S/../../include %s | FileCheck %s --check-prefix=ENUM + +include "mlir/IR/AttrTypeBase.td" +include "mlir/IR/EnumAttr.td" +include "mlir/IR/OpBase.td" + +// check dialect with summary and description +def A_Dialect : Dialect { + let name = "a"; + let cppNamespace = ""; + + let summary = "This is a summary"; + let description = [{ + + This is a description, needs trimming + + }]; +// DIALECT: /// This is a summary +// DIALECT-NEXT: /// This is a description, needs trimming +// DIALECT-NEXT: class ADialect : public ::mlir::Dialect { +} + +def A_SomeOp1 : Op{ + let summary = "Some Op1 summary line1 \nsummary line2"; + + let description = [{ + Some Op1 description + }]; + + let cppNamespace = "OP1"; +// OP: namespace OP1 +// OP-NEXT: /// Some Op1 summary line1 +// OP-NEXT: /// summary line2 +// OP-NEXT: /// Some Op1 description +// OP-NEXT: class SomeOp1; +} + +// test weird characters in description +def A_SomeOp2 : 
Op{ + let summary = ""; + + let description = [{ + $ptr (`,` $mask^)? (`,` $other^)? + oilist( + `a` `=` $1 | `b` `=` $2 + ) + }]; +// OP: /// $ptr (`,` $mask^)? (`,` $other^)? +// OP-NEXT: /// oilist( +// OP-NEXT: /// `a` `=` $1 | `b` `=` $2 +// OP-NEXT: /// ) +// OP-NEXT: class SomeOp2; +} + +def A_TensorType : TypeDef { + let typeName = "a.simple_a_tensor"; + + let summary = "Tensor Type A summary"; + + let description = [{ + Tensor Type A description + }]; + + let extraClassDeclaration = [{ + void getSignlessBlockType() const { + } + }]; +// TYPE: /// Tensor Type A summary +// TYPE-NEXT: /// Tensor Type A description +// TYPE-NEXT: class TensorType; +} + +def A_SimpleAttr : AttrDef { + let attrName = "a.simple_attr"; + let summary = "Simple Attr A summary"; + + let description = [{ + Simple Attr A description + }]; +// ATTR: /// Simple Attr A summary +// ATTR-NEXT: /// Simple Attr A description +// ATTR-NEXT: class SimpleAAttr; +} + +def EncodingTrait : AttrInterface<"EncodingTrait"> { + let cppNamespace = "mlir::a::traits"; + let description = [{ + Common trait for all layouts. + }]; + let methods = [ + ]; +// ATTR-INTERFACE: namespace mlir +// ATTR-INTERFACE-NEXT: namespace a +// ATTR-INTERFACE-NEXT: namespace traits +// ATTR-INTERFACE-NEXT: /// Common trait for all layouts. 
+// ATTR-INTERFACE-NEXT: class EncodingTrait; +} + +def SimpleEncodingTrait : AttrInterface<"SimpleEncodingTrait"> { + let cppNamespace = "a::traits"; +// ATTR-INTERFACE: namespace a { +// ATTR-INTERFACE-NEXT: namespace traits { +// ATTR-INTERFACE-NEXT: class SimpleEncodingTrait; +} + +def SimpleOpInterface : OpInterface<"SimpleOpInterface"> { + let cppNamespace = "a::traits"; + let description = [{ + + Simple Op Interface description + }]; +// OP-INTERFACE: namespace a { +// OP-INTERFACE-NEXT: namespace traits { +// OP-INTERFACE-NEXT: /// Simple Op Interface description +// OP-INTERFACE-NEXT: class SimpleOpInterface; +} + +def SimpleTypeInterface : TypeInterface<"SimpleTypeInterface"> { + let description = [{ + Simple Type Interface description + }]; +// TYPE-INTERFACE: /// Simple Type Interface description +// TYPE-INTERFACE-NEXT: class SimpleTypeInterface; +} + +def MyBitEnum: I32BitEnumAttr<"MyBitEnum", "An example bit enum", + [I32BitEnumCaseBit<"Bit0", 0, "tagged">, + I32BitEnumCaseBit<"Bit1", 1>]> { + let genSpecializedAttr = 0; +// ENUM: // An example bit enum +// ENUM-NEXT: enum class MyBitEnum +} diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp index 05686c0539754..2a6071602fa49 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "AttrOrTypeFormatGen.h" +#include "CppGenUtilities.h" #include "mlir/TableGen/AttrOrTypeDef.h" #include "mlir/TableGen/Class.h" #include "mlir/TableGen/CodeGenHelpers.h" @@ -813,8 +814,14 @@ bool DefGenerator::emitDecls(StringRef selectedDialect) { NamespaceEmitter nsEmitter(os, defs.front().getDialect()); // Declare all the def classes first (in case they reference each other). 
- for (const AttrOrTypeDef &def : defs) + for (const AttrOrTypeDef &def : defs) { + std::string comments = tblgen::emitSummaryAndDescComments( + def.getSummary(), def.getDescription()); + if (!comments.empty()) { + os << comments << "\n"; + } os << "class " << def.getCppClassName() << ";\n"; + } // Emit the declarations. for (const AttrOrTypeDef &def : defs) diff --git a/mlir/tools/mlir-tblgen/CMakeLists.txt b/mlir/tools/mlir-tblgen/CMakeLists.txt index 9431c59860522..2a7ef7e0576c8 100644 --- a/mlir/tools/mlir-tblgen/CMakeLists.txt +++ b/mlir/tools/mlir-tblgen/CMakeLists.txt @@ -33,6 +33,7 @@ add_tablegen(mlir-tblgen MLIR RewriterGen.cpp SPIRVUtilsGen.cpp TosaUtilsGen.cpp + CppGenUtilities.cpp ) target_link_libraries(mlir-tblgen diff --git a/mlir/tools/mlir-tblgen/CppGenUtilities.cpp b/mlir/tools/mlir-tblgen/CppGenUtilities.cpp new file mode 100644 index 0000000000000..ebca20cc685f4 --- /dev/null +++ b/mlir/tools/mlir-tblgen/CppGenUtilities.cpp @@ -0,0 +1,39 @@ +//===- CppGenUtilities.cpp - MLIR cpp gen utilities --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines common utilities for generating cpp files from tablegen +// structures. 
+// +//===----------------------------------------------------------------------===// + +#include "CppGenUtilities.h" +#include "mlir/Support/IndentedOstream.h" + +std::string +mlir::tblgen::emitSummaryAndDescComments(llvm::StringRef summary, +                                         llvm::StringRef description) { + + std::string comments = ""; + StringRef trimmedSummary = summary.trim(); + StringRef trimmedDesc = description.trim(); + llvm::raw_string_ostream os(comments); + raw_indented_ostream ros(os); + + if (!trimmedSummary.empty()) { + ros.printReindented(trimmedSummary, "/// "); + } + + if (!trimmedDesc.empty()) { + if (!trimmedSummary.empty()) { + // If there is a summary, add a newline after it. + ros << "\n"; + } + ros.printReindented(trimmedDesc, "/// "); + } + return comments; +} diff --git a/mlir/tools/mlir-tblgen/CppGenUtilities.h b/mlir/tools/mlir-tblgen/CppGenUtilities.h new file mode 100644 index 0000000000000..231c59a9e148f --- /dev/null +++ b/mlir/tools/mlir-tblgen/CppGenUtilities.h @@ -0,0 +1,29 @@ +//===- CppGenUtilities.h - MLIR cpp gen utilities ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines common utilities for generating cpp files from tablegen +// structures. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TOOLS_MLIRTBLGEN_CPPGENUTILITIES_H_ +#define MLIR_TOOLS_MLIRTBLGEN_CPPGENUTILITIES_H_ + +#include "llvm/ADT/StringRef.h" + +namespace mlir { +namespace tblgen { + +// Emit the summary and description as a C++ comment, properly aligned and placed +// adjacent to the class declaration of generated classes.
+std::string emitSummaryAndDescComments(llvm::StringRef summary, + llvm::StringRef description); +} // namespace tblgen +} // namespace mlir + +#endif // MLIR_TOOLS_MLIRTBLGEN_CPPGENUTILITIES_H_ diff --git a/mlir/tools/mlir-tblgen/DialectGen.cpp b/mlir/tools/mlir-tblgen/DialectGen.cpp index 6cf71d2bb0174..02941ec1268cb 100644 --- a/mlir/tools/mlir-tblgen/DialectGen.cpp +++ b/mlir/tools/mlir-tblgen/DialectGen.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "CppGenUtilities.h" #include "DialectGenUtilities.h" #include "mlir/TableGen/Class.h" #include "mlir/TableGen/CodeGenHelpers.h" @@ -108,7 +109,9 @@ tblgen::findDialectToGenerate(ArrayRef dialects) { /// {0}: The name of the dialect class. /// {1}: The dialect namespace. /// {2}: The dialect parent class. +/// {3}: The summary and description comments. static const char *const dialectDeclBeginStr = R"( +{3} class {0} : public ::mlir::{2} { explicit {0}(::mlir::MLIRContext *context); @@ -245,8 +248,11 @@ static void emitDialectDecl(Dialect &dialect, raw_ostream &os) { std::string cppName = dialect.getCppClassName(); StringRef superClassName = dialect.isExtensible() ? "ExtensibleDialect" : "Dialect"; + + std::string comments = tblgen::emitSummaryAndDescComments( + dialect.getSummary(), dialect.getDescription()); os << llvm::formatv(dialectDeclBeginStr, cppName, dialect.getName(), - superClassName); + superClassName, comments); // If the dialect requested the default attribute printer and parser, emit // the declarations for the hooks. 
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index 3f397f3a8e6fd..373d3762cbb1a 100644 --- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "CppGenUtilities.h" #include "OpClass.h" #include "OpFormatGen.h" #include "OpGenHelpers.h" @@ -2640,8 +2641,7 @@ void OpEmitter::genSeparateArgParamBuilder() { // Avoid emitting "resultTypes.size() >= 0u" which is always true. if (!hasVariadicResult || numNonVariadicResults != 0) - body << " " - << "assert(resultTypes.size() " + body << " " << "assert(resultTypes.size() " << (hasVariadicResult ? ">=" : "==") << " " << numNonVariadicResults << "u && \"mismatched number of results\");\n"; @@ -4749,6 +4749,11 @@ static void emitOpClassDecls(const RecordKeeper &records, for (auto *def : defs) { Operator op(*def); NamespaceEmitter emitter(os, op.getCppNamespace()); + std::string comments = tblgen::emitSummaryAndDescComments( + op.getSummary(), op.getDescription()); + if (!comments.empty()) { + os << comments << "\n"; + } os << "class " << op.getCppClassName() << ";\n"; } diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp index 077f9d1ea2b13..f2b269e3a4542 100644 --- a/mlir/tools/mlir-tblgen/OpDocGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp @@ -583,15 +583,14 @@ static bool emitDialectDoc(const RecordKeeper &records, raw_ostream &os) { // sections. // TODO: The sorting order could be revised, currently attempting to sort of // keep in alphabetical order. 
- std::sort(dialectOps.begin(), dialectOps.end(), - [](const OpDocGroup &lhs, const OpDocGroup &rhs) { - auto getDesc = [](const OpDocGroup &arg) -> StringRef { - if (!arg.summary.empty()) - return arg.summary; - return arg.ops.front().getDef().getValueAsString("opName"); - }; - return getDesc(lhs).compare_insensitive(getDesc(rhs)) < 0; - }); + llvm::sort(dialectOps, [](const OpDocGroup &lhs, const OpDocGroup &rhs) { + auto getDesc = [](const OpDocGroup &arg) -> StringRef { + if (!arg.summary.empty()) + return arg.summary; + return arg.ops.front().getDef().getValueAsString("opName"); + }; + return getDesc(lhs).compare_insensitive(getDesc(rhs)) < 0; + }); os << "\n"; emitDialectDoc(*dialect, records.getInputFilename(), dialectAttrs, diff --git a/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp b/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp index dcd68e6c2d636..4dfa1908b3267 100644 --- a/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp +++ b/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "CppGenUtilities.h" #include "DocGenUtilities.h" #include "mlir/TableGen/Format.h" #include "mlir/TableGen/GenInfo.h" @@ -527,6 +528,11 @@ void InterfaceGenerator::emitInterfaceDecl(const Interface &interface) { // Emit a forward declaration of the interface class so that it becomes usable // in the signature of its methods. + std::string comments = tblgen::emitSummaryAndDescComments( + "", interface.getDescription().value_or("")); + if (!comments.empty()) { + os << comments << "\n"; + } os << "class " << interfaceName << ";\n"; // Emit the traits struct containing the concept and model declarations. 
@@ -589,7 +595,8 @@ void InterfaceGenerator::emitInterfaceDecl(const Interface &interface) { << " auto* interface = getInterfaceFor(base);\n" << " if (!interface)\n" " return false;\n" - " " << interfaceName << " odsInterfaceInstance(base, interface);\n" + " " + << interfaceName << " odsInterfaceInstance(base, interface);\n" << " " << tblgen::tgfmt(extraClassOf->trim(), &extraClassOfFmt) << "\n }\n"; } diff --git a/mlir/unittests/Dialect/CMakeLists.txt b/mlir/unittests/Dialect/CMakeLists.txt index a88dc98c034e4..aea247547473d 100644 --- a/mlir/unittests/Dialect/CMakeLists.txt +++ b/mlir/unittests/Dialect/CMakeLists.txt @@ -12,7 +12,6 @@ add_subdirectory(Index) add_subdirectory(LLVMIR) add_subdirectory(MemRef) add_subdirectory(OpenACC) -add_subdirectory(Polynomial) add_subdirectory(SCF) add_subdirectory(SparseTensor) add_subdirectory(SPIRV) diff --git a/mlir/unittests/Dialect/Polynomial/CMakeLists.txt b/mlir/unittests/Dialect/Polynomial/CMakeLists.txt deleted file mode 100644 index 97f5b890ab4fb..0000000000000 --- a/mlir/unittests/Dialect/Polynomial/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -add_mlir_unittest(MLIRPolynomialTests - PolynomialMathTest.cpp -) -mlir_target_link_libraries(MLIRPolynomialTests - PRIVATE - MLIRIR - MLIRPolynomialDialect -) diff --git a/mlir/unittests/Dialect/Polynomial/PolynomialMathTest.cpp b/mlir/unittests/Dialect/Polynomial/PolynomialMathTest.cpp deleted file mode 100644 index 95906ad42588e..0000000000000 --- a/mlir/unittests/Dialect/Polynomial/PolynomialMathTest.cpp +++ /dev/null @@ -1,44 +0,0 @@ -//===- PolynomialMathTest.cpp - Polynomial math Tests ---------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Polynomial/IR/Polynomial.h" -#include "gtest/gtest.h" - -using namespace mlir; -using namespace mlir::polynomial; - -TEST(AddTest, checkSameDegreeAdditionOfIntPolynomial) { - IntPolynomial x = IntPolynomial::fromCoefficients({1, 2, 3}); - IntPolynomial y = IntPolynomial::fromCoefficients({2, 3, 4}); - IntPolynomial expected = IntPolynomial::fromCoefficients({3, 5, 7}); - EXPECT_EQ(expected, x.add(y)); -} - -TEST(AddTest, checkDifferentDegreeAdditionOfIntPolynomial) { - IntMonomial term2t = IntMonomial(2, 1); - IntPolynomial x = IntPolynomial::fromMonomials({term2t}).value(); - IntPolynomial y = IntPolynomial::fromCoefficients({2, 3, 4}); - IntPolynomial expected = IntPolynomial::fromCoefficients({2, 5, 4}); - EXPECT_EQ(expected, x.add(y)); - EXPECT_EQ(expected, y.add(x)); -} - -TEST(AddTest, checkSameDegreeAdditionOfFloatPolynomial) { - FloatPolynomial x = FloatPolynomial::fromCoefficients({1.5, 2.5, 3.5}); - FloatPolynomial y = FloatPolynomial::fromCoefficients({2.5, 3.5, 4.5}); - FloatPolynomial expected = FloatPolynomial::fromCoefficients({4, 6, 8}); - EXPECT_EQ(expected, x.add(y)); -} - -TEST(AddTest, checkDifferentDegreeAdditionOfFloatPolynomial) { - FloatPolynomial x = FloatPolynomial::fromCoefficients({1.5, 2.5}); - FloatPolynomial y = FloatPolynomial::fromCoefficients({2.5, 3.5, 4.5}); - FloatPolynomial expected = FloatPolynomial::fromCoefficients({4, 6, 4.5}); - EXPECT_EQ(expected, x.add(y)); - EXPECT_EQ(expected, y.add(x)); -} diff --git a/mlir/utils/tree-sitter-mlir/dialect/bufferization.js b/mlir/utils/tree-sitter-mlir/dialect/bufferization.js index 8d9fdb1fcfc39..d5c99263f8ec4 100644 --- a/mlir/utils/tree-sitter-mlir/dialect/bufferization.js +++ b/mlir/utils/tree-sitter-mlir/dialect/bufferization.js @@ -2,31 +2,26 @@ module.exports = { bufferization_dialect : $ => choice( - 
seq('bufferization.alloc_tensor', - field('in', $._value_use_list_parens), - field('copy', optional(seq(token('copy'), '(', - $.value_use, ')'))), - field('size_hint', - optional(seq(token('size_hint'), '=', - $.value_use))), - field('attributes', optional($.attribute)), - field('return', $._type_annotation)), + seq('bufferization.alloc_tensor', field('in', $._value_use_list_parens), + field('copy', optional(seq(token('copy'), '(', $.value_use, ')'))), + field('size_hint', + optional(seq(token('size_hint'), '=', $.value_use))), + field('attributes', optional($.attribute)), + field('return', $._type_annotation)), - // operation ::= `bufferization.to_memref` $tensor - // attr-dict `:` type($memref) - seq('bufferization.to_memref', - field('tensor', $.value_use), - field('attributes', optional($.attribute)), - field('return', $._type_annotation)), + // operation ::= `bufferization.to_buffer` $tensor + // attr-dict `:` type($memref) + seq('bufferization.to_buffer', field('tensor', $.value_use), + field('attributes', optional($.attribute)), + field('return', $._type_annotation)), - // operation ::= `bufferization.to_tensor` $memref - // (`restrict` $restrict^)? - // (`writable` $writable^)? attr-dict - // `:` type($memref) - seq('bufferization.to_tensor', - field('memref', $.value_use), - field('restrict', optional($.restrict_attr)), - field('writable', optional($.writable_attr)), - field('attributes', optional($.attribute)), - field('return', $._type_annotation))) + // operation ::= `bufferization.to_tensor` $memref + // (`restrict` $restrict^)? + // (`writable` $writable^)? 
attr-dict + // `:` type($memref) + seq('bufferization.to_tensor', field('memref', $.value_use), + field('restrict', optional($.restrict_attr)), + field('writable', optional($.writable_attr)), + field('attributes', optional($.attribute)), + field('return', $._type_annotation))) } diff --git a/mlir/utils/tree-sitter-mlir/queries/highlights.scm b/mlir/utils/tree-sitter-mlir/queries/highlights.scm index 97aba2b266eca..4cbea7bbca031 100644 --- a/mlir/utils/tree-sitter-mlir/queries/highlights.scm +++ b/mlir/utils/tree-sitter-mlir/queries/highlights.scm @@ -209,7 +209,7 @@ "tensor.yield" "bufferization.alloc_tensor" - "bufferization.to_memref" + "bufferization.to_buffer" "bufferization.to_tensor" "linalg.batch_matmul" diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index e54a8afdd3f4f..5a25ad104f736 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -389,9 +389,6 @@ struct GenericKernelTy { /// The prototype kernel launch environment. KernelLaunchEnvironmentTy KernelLaunchEnvironment; - - /// If the kernel is a bare kernel. 
- bool IsBareKernel = false; }; /// Information about an allocation, when it has been allocated, and when/if it diff --git a/offload/test/offloading/gpupgo/pgo_atomic_teams.c b/offload/test/offloading/gpupgo/pgo_atomic_teams.c new file mode 100644 index 0000000000000..7bf3b1c11f28b --- /dev/null +++ b/offload/test/offloading/gpupgo/pgo_atomic_teams.c @@ -0,0 +1,102 @@ +// RUN: %libomptarget-compile-generic -fcreate-profile \ +// RUN: -Xarch_device -fprofile-generate \ +// RUN: -Xarch_device -fprofile-update=atomic +// RUN: env LLVM_PROFILE_FILE=%basename_t.llvm.profraw \ +// RUN: %libomptarget-run-generic 2>&1 +// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %target_triple.%basename_t.llvm.profraw | \ +// RUN: %fcheck-generic --check-prefix="LLVM-PGO" + +// RUN: %libomptarget-compile-generic -fcreate-profile \ +// RUN: -Xarch_device -fprofile-instr-generate \ +// RUN: -Xarch_device -fprofile-update=atomic +// RUN: env LLVM_PROFILE_FILE=%basename_t.clang.profraw \ +// RUN: %libomptarget-run-generic 2>&1 +// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %target_triple.%basename_t.clang.profraw | \ +// RUN: %fcheck-generic --check-prefix="CLANG-PGO" + +// REQUIRES: gpu +// REQUIRES: pgo + +int test1(int a) { return a / 2; } +int test2(int a) { return a * 2; } + +int main() { + int device_var = 1; + +#pragma omp target teams distribute parallel for num_teams(3) \ + map(tofrom : device_var) + for (int i = 1; i <= 30; i++) { + device_var *= i; + if (i % 2 == 0) { + device_var += test1(device_var); + } + if (i % 3 == 0) { + device_var += test2(device_var); + } + } +} + +// clang-format off +// LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 2 +// LLVM-PGO: Block counts: [0, {{.*}}] + +// LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}_omp_outlined: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 4 +// 
LLVM-PGO: Block counts: [{{.*}}, 0, {{.*}}, 0] + +// LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}_omp_outlined_omp_outlined: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 4 +// LLVM-PGO: Block counts: [30, 15, 10, {{.*}}] + +// LLVM-PGO-LABEL: test1: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 1 +// LLVM-PGO: Block counts: [15] + +// LLVM-PGO-LABEL: test2: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 1 +// LLVM-PGO: Block counts: [10] + +// LLVM-PGO-LABEL: Instrumentation level: +// LLVM-PGO-SAME: IR + +// CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 1 +// CLANG-PGO: Function count: {{.*}} +// CLANG-PGO: Block counts: [] + +// CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}_omp_outlined: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 1 +// CLANG-PGO: Function count: {{.*}} +// CLANG-PGO: Block counts: [] + +// CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}_omp_outlined_omp_outlined: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 4 +// CLANG-PGO: Function count: 30 +// CLANG-PGO: Block counts: [{{.*}}, 15, 10] + +// CLANG-PGO-LABEL: test1: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 1 +// CLANG-PGO: Function count: 15 +// CLANG-PGO: Block counts: [] + +// CLANG-PGO-LABEL: test2: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 1 +// CLANG-PGO: Function count: 10 +// CLANG-PGO: Block counts: [] + +// CLANG-PGO-LABEL: Instrumentation level: +// CLANG-PGO-SAME: Front-end +// clang-format on diff --git a/offload/test/offloading/gpupgo/pgo_atomic_threads.c b/offload/test/offloading/gpupgo/pgo_atomic_threads.c new file mode 100644 index 0000000000000..f0e7111f7a64b --- /dev/null +++ b/offload/test/offloading/gpupgo/pgo_atomic_threads.c @@ -0,0 
+1,84 @@ +// RUN: %libomptarget-compile-generic -fcreate-profile \ +// RUN: -Xarch_device -fprofile-generate \ +// RUN: -Xarch_device -fprofile-update=atomic +// RUN: env LLVM_PROFILE_FILE=%basename_t.llvm.profraw \ +// RUN: %libomptarget-run-generic 2>&1 +// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %target_triple.%basename_t.llvm.profraw | \ +// RUN: %fcheck-generic --check-prefix="LLVM-PGO" + +// RUN: %libomptarget-compile-generic -fcreate-profile \ +// RUN: -Xarch_device -fprofile-instr-generate \ +// RUN: -Xarch_device -fprofile-update=atomic +// RUN: env LLVM_PROFILE_FILE=%basename_t.clang.profraw \ +// RUN: %libomptarget-run-generic 2>&1 +// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %target_triple.%basename_t.clang.profraw | \ +// RUN: %fcheck-generic --check-prefix="CLANG-PGO" + +// REQUIRES: gpu +// REQUIRES: pgo + +int test1(int a) { return a / 2; } + +int main() { + int device_var = 1; +#pragma omp target map(tofrom : device_var) + { +#pragma omp parallel for + for (int i = 1; i <= 10; i++) { + device_var *= i; + if (i % 2 == 0) { + device_var += test1(device_var); + } + } + } +} + +// clang-format off +// LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 2 +// LLVM-PGO: Block counts: [0, {{.*}}] + +// LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}_omp_outlined: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 5 +// LLVM-PGO: Block counts: [10, 5, {{.*}}, 10, {{.*}}] + +// LLVM-PGO-LABEL: test1: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 1 +// LLVM-PGO: Block counts: [5] + +// LLVM-PGO-LABEL: Instrumentation level: +// LLVM-PGO-SAME: IR +// LLVM-PGO-SAME: entry_first = 0 +// LLVM-PGO-LABEL: Functions shown: +// LLVM-PGO-SAME: 3 +// LLVM-PGO-LABEL: Maximum function count: +// LLVM-PGO-SAME: 10 + +// CLANG-PGO-LABEL: 
__omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 1 +// CLANG-PGO: Function count: {{.*}} +// CLANG-PGO: Block counts: [] + +// CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}_omp_outlined: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 3 +// CLANG-PGO: Function count: {{.*}} +// CLANG-PGO: Block counts: [{{.*}}, 5] + +// CLANG-PGO-LABEL: test1: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 1 +// CLANG-PGO: Function count: 5 +// CLANG-PGO: Block counts: [] + +// CLANG-PGO-LABEL: Instrumentation level: +// CLANG-PGO-SAME: Front-end +// CLANG-PGO-LABEL: Functions shown: +// CLANG-PGO-SAME: 3 +// clang-format on diff --git a/offload/test/offloading/gpupgo/pgo2.c b/offload/test/offloading/gpupgo/pgo_device_and_host.c similarity index 95% rename from offload/test/offloading/gpupgo/pgo2.c rename to offload/test/offloading/gpupgo/pgo_device_and_host.c index af3ad9e4a6c19..3e95791ce9a50 100644 --- a/offload/test/offloading/gpupgo/pgo2.c +++ b/offload/test/offloading/gpupgo/pgo_device_and_host.c @@ -59,8 +59,10 @@ int main() { int device_var = 1; #pragma omp target - for (int i = 0; i < 10; i++) { - device_var *= i; + { + for (int i = 0; i < 10; i++) { + device_var *= i; + } } } @@ -78,7 +80,7 @@ int main() { // LLVM-DEVICE-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: // LLVM-DEVICE: Hash: {{0[xX][0-9a-fA-F]+}} // LLVM-DEVICE: Counters: 3 -// LLVM-DEVICE: Block counts: [10, 2, 1] +// LLVM-DEVICE: Block counts: [10, {{.*}}, 1] // LLVM-DEVICE: Instrumentation level: IR // CLANG-HOST-LABEL: main: @@ -97,6 +99,5 @@ int main() { // CLANG-DEV-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: // CLANG-DEV: Hash: {{0[xX][0-9a-fA-F]+}} // CLANG-DEV: Counters: 2 -// CLANG-DEV: Function count: 0 -// CLANG-DEV: Block counts: [11] +// CLANG-DEV: Block counts: [10] // CLANG-DEV: Instrumentation level: 
Front-end diff --git a/offload/test/offloading/gpupgo/pgo1.c b/offload/test/offloading/gpupgo/pgo_device_only.c similarity index 85% rename from offload/test/offloading/gpupgo/pgo1.c rename to offload/test/offloading/gpupgo/pgo_device_only.c index 1159858c51218..2939af613b6dd 100644 --- a/offload/test/offloading/gpupgo/pgo1.c +++ b/offload/test/offloading/gpupgo/pgo_device_only.c @@ -23,10 +23,12 @@ int test2(int a) { return a * 2; } int main() { int m = 2; #pragma omp target - for (int i = 0; i < 10; i++) { - m = test1(m); - for (int j = 0; j < 2; j++) { - m = test2(m); + { + for (int i = 0; i < 10; i++) { + m = test1(m); + for (int j = 0; j < 2; j++) { + m = test2(m); + } } } } @@ -34,7 +36,7 @@ int main() { // LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} // LLVM-PGO: Counters: 4 -// LLVM-PGO: Block counts: [20, 10, 2, 1] +// LLVM-PGO: Block counts: [20, 10, {{.*}}, 1] // LLVM-PGO-LABEL: test1: // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} @@ -53,14 +55,10 @@ int main() { // LLVM-PGO-SAME: 3 // LLVM-PGO-LABEL: Maximum function count: // LLVM-PGO-SAME: 20 -// LLVM-PGO-LABEL: Maximum internal block count: -// LLVM-PGO-SAME: 10 // CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: // CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} -// CLANG-PGO: Counters: 3 -// CLANG-PGO: Function count: 0 -// CLANG-PGO: Block counts: [11, 20] +// CLANG-PGO: Block counts: [10, 20] // CLANG-PGO-LABEL: test1: // CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} @@ -78,7 +76,5 @@ int main() { // CLANG-PGO-SAME: Front-end // CLANG-PGO-LABEL: Functions shown: // CLANG-PGO-SAME: 3 -// CLANG-PGO-LABEL: Maximum function count: -// CLANG-PGO-SAME: 20 // CLANG-PGO-LABEL: Maximum internal block count: // CLANG-PGO-SAME: 20 diff --git a/openmp/runtime/src/ompt-internal.h b/openmp/runtime/src/ompt-internal.h index 0cfab8bfaa190..36b45f7a91ea2 100644 --- a/openmp/runtime/src/ompt-internal.h +++ 
b/openmp/runtime/src/ompt-internal.h @@ -109,7 +109,8 @@ void ompt_pre_init(void); void ompt_post_init(void); void ompt_fini(void); -#define OMPT_GET_RETURN_ADDRESS(level) __builtin_return_address(level) +#define OMPT_GET_RETURN_ADDRESS(level) \ + __builtin_extract_return_addr(__builtin_return_address(level)) #define OMPT_GET_FRAME_ADDRESS(level) __builtin_frame_address(level) #define OMPT_FRAME_FLAGS_APP (ompt_frame_application | ompt_frame_cfa) #define OMPT_FRAME_FLAGS_RUNTIME (ompt_frame_runtime | ompt_frame_cfa) diff --git a/openmp/runtime/src/ompt-specific.h b/openmp/runtime/src/ompt-specific.h index e9e40d43429ea..b7eb140458b40 100644 --- a/openmp/runtime/src/ompt-specific.h +++ b/openmp/runtime/src/ompt-specific.h @@ -102,15 +102,16 @@ inline void *__ompt_load_return_address(int gtid) { if (ompt_enabled.enabled && gtid >= 0 && __kmp_threads[gtid] && \ !__kmp_threads[gtid]->th.ompt_thread_info.return_address) \ __kmp_threads[gtid]->th.ompt_thread_info.return_address = \ - __builtin_return_address(0)*/ + __builtin_extract_return_addr(__builtin_return_address(0))*/ #define OMPT_STORE_RETURN_ADDRESS(gtid) \ - OmptReturnAddressGuard ReturnAddressGuard{gtid, __builtin_return_address(0)}; + OmptReturnAddressGuard ReturnAddressGuard{ \ + gtid, __builtin_extract_return_addr(__builtin_return_address(0))}; #define OMPT_LOAD_RETURN_ADDRESS(gtid) __ompt_load_return_address(gtid) #define OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid) \ ((ompt_enabled.enabled && gtid >= 0 && __kmp_threads[gtid] && \ __kmp_threads[gtid]->th.ompt_thread_info.return_address) \ ? 
__ompt_load_return_address(gtid) \ - : __builtin_return_address(0)) + : __builtin_extract_return_addr(__builtin_return_address(0))) #define OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, incr) \ do { \ diff --git a/openmp/runtime/test/ompt/callback.h b/openmp/runtime/test/ompt/callback.h index 4dd1db4c4225b..0837cc5dfefcd 100644 --- a/openmp/runtime/test/ompt/callback.h +++ b/openmp/runtime/test/ompt/callback.h @@ -311,6 +311,14 @@ ompt_label_##id: printf("%" PRIu64 ": current_address=%p or %p or %p\n", \ ompt_get_thread_data()->value, ((char *)addr) - 2, \ ((char *)addr) - 8, ((char *)addr) - 12) +#elif KMP_ARCH_SPARC +// FIXME: Need to distinguish between 32 and 64-bit SPARC? +// On SPARC the NOP instruction is 4 bytes long. +// FIXME: Explain. Can use __builtin_frob_return_addr? +#define print_possible_return_addresses(addr) \ + printf("%" PRIu64 ": current_address=%p or %p\n", \ + ompt_get_thread_data()->value, ((char *)addr) - 12, \ + (char *)addr - 20) #else #error Unsupported target architecture, cannot determine address offset! #endif diff --git a/third-party/unittest/googletest/README.LLVM b/third-party/unittest/googletest/README.LLVM index b574c7f98be41..56715cff9a73d 100644 --- a/third-party/unittest/googletest/README.LLVM +++ b/third-party/unittest/googletest/README.LLVM @@ -19,3 +19,6 @@ Modified as follows: * Added StringRef support to include/gtest/internal/custom/gtest-printers.h. * Added LLVM printable value support to include/gtest/gtest-message.h and include/gtest/gtest-printers.h. +* Modified `PrintTo(char16_t c, ::std::ostream* os)` and + `PrintTo(char16_t c, ::std::ostream* os)` in include/gtest/gtest-printers.h. 
+ to work around https://github.com/google/googletest/issues/4762 diff --git a/third-party/unittest/googletest/include/gtest/gtest-printers.h b/third-party/unittest/googletest/include/gtest/gtest-printers.h index d0da9bc1843ce..409b135fc2141 100644 --- a/third-party/unittest/googletest/include/gtest/gtest-printers.h +++ b/third-party/unittest/googletest/include/gtest/gtest-printers.h @@ -510,11 +510,15 @@ GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); GTEST_API_ void PrintTo(char32_t c, ::std::ostream* os); inline void PrintTo(char16_t c, ::std::ostream* os) { - PrintTo(ImplicitCast_(c), os); + // FIXME: the cast from char16_t to char32_t may be incorrect + // for a lone surrogate + PrintTo(static_cast(c), os); } #ifdef __cpp_lib_char8_t inline void PrintTo(char8_t c, ::std::ostream* os) { - PrintTo(ImplicitCast_(c), os); + // FIXME: the cast from char8_t to char32_t may be incorrect + // for c > 0x7F + PrintTo(static_cast(c), os); } #endif diff --git a/utils/bazel/configure.bzl b/utils/bazel/configure.bzl index fcc9fc7ecc483..b976f3955febf 100644 --- a/utils/bazel/configure.bzl +++ b/utils/bazel/configure.bzl @@ -180,7 +180,7 @@ def _llvm_configure_impl(repository_ctx): ) # Create a starlark file with the requested BOLT targets. - bolt_targets = ["AArch64","X86","RISCV"] # Supported targets. + bolt_targets = ["AArch64", "X86", "RISCV"] # Supported targets. 
bolt_targets = [t for t in llvm_targets if t in bolt_targets] repository_ctx.file( "bolt/targets.bzl", diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index 34b00f993a44d..6618e2cd61e77 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -838,7 +838,6 @@ gentbl_cc_library( deps = ["//lldb:CoreTdFiles"], ) - cc_library( name = "PluginCPlusPlusLanguageHeaders", hdrs = glob(["Language/CPlusPlus/*.h"]), @@ -866,8 +865,8 @@ cc_library( "//lldb:Core", "//lldb:DataFormatters", "//lldb:Headers", - "//lldb:InterpreterHeaders", "//lldb:Host", + "//lldb:InterpreterHeaders", "//lldb:Symbol", "//lldb:SymbolHeaders", "//lldb:Target", diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 05c2fb4819807..5f7ed5724e3f2 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6159,96 +6159,6 @@ gentbl_cc_library( deps = [":PDLInterpOpsTdFiles"], ) -cc_library( - name = "PolynomialDialect", - srcs = glob([ - "lib/Dialect/Polynomial/IR/*.cpp", - ]), - hdrs = glob([ - "include/mlir/Dialect/Polynomial/IR/*.h", - ]), - includes = ["include"], - deps = [ - ":ArithDialect", - ":IR", - ":InferTypeOpInterface", - ":PolynomialAttributesIncGen", - ":PolynomialCanonicalizationIncGen", - ":PolynomialIncGen", - ":Support", - "//llvm:Support", - ], -) - -td_library( - name = "PolynomialTdFiles", - srcs = glob(["include/mlir/Dialect/Polynomial/IR/*.td"]), - includes = ["include"], - deps = [ - ":BuiltinDialectTdFiles", - ":InferTypeOpInterfaceTdFiles", - ":OpBaseTdFiles", - ":SideEffectInterfacesTdFiles", - ], -) - -gentbl_cc_library( - name = "PolynomialIncGen", - tbl_outs = { - "include/mlir/Dialect/Polynomial/IR/Polynomial.h.inc": ["-gen-op-decls"], - 
"include/mlir/Dialect/Polynomial/IR/Polynomial.cpp.inc": ["-gen-op-defs"], - "include/mlir/Dialect/Polynomial/IR/PolynomialDialect.h.inc": [ - "-gen-dialect-decls", - "-dialect=polynomial", - ], - "include/mlir/Dialect/Polynomial/IR/PolynomialDialect.cpp.inc": [ - "-gen-dialect-defs", - "-dialect=polynomial", - ], - "include/mlir/Dialect/Polynomial/IR/PolynomialTypes.h.inc": [ - "--gen-typedef-decls", - "-typedefs-dialect=polynomial", - ], - "include/mlir/Dialect/Polynomial/IR/PolynomialTypes.cpp.inc": [ - "--gen-typedef-defs", - "-typedefs-dialect=polynomial", - ], - "g3doc/Dialects/Polynomial/Polynomial.md": ["-gen-op-doc"], - }, - tblgen = ":mlir-tblgen", - td_file = "include/mlir/Dialect/Polynomial/IR/Polynomial.td", - deps = [":PolynomialTdFiles"], -) - -gentbl_cc_library( - name = "PolynomialAttributesIncGen", - tbl_outs = { - "include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.h.inc": [ - "-gen-attrdef-decls", - "-attrdefs-dialect=polynomial", - ], - "include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.cpp.inc": [ - "-gen-attrdef-defs", - "-attrdefs-dialect=polynomial", - ], - }, - tblgen = ":mlir-tblgen", - td_file = "include/mlir/Dialect/Polynomial/IR/Polynomial.td", - deps = [":PolynomialTdFiles"], -) - -gentbl_cc_library( - name = "PolynomialCanonicalizationIncGen", - strip_include_prefix = "include/mlir/Dialect/Polynomial/IR", - tbl_outs = {"include/mlir/Dialect/Polynomial/IR/PolynomialCanonicalization.inc": ["-gen-rewriters"]}, - tblgen = ":mlir-tblgen", - td_file = "lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td", - deps = [ - ":ArithOpsTdFiles", - ":PolynomialTdFiles", - ], -) - td_library( name = "PtrTdFiles", srcs = [ @@ -8867,7 +8777,6 @@ cc_library( ":PDLDialect", ":PDLInterpDialect", ":PDLToPDLInterp", - ":PolynomialDialect", ":PtrDialect", ":QuantOps", ":QuantTransforms", diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel index 
c2ddf03ef28df..be5bbb0434125 100644 --- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel @@ -983,8 +983,8 @@ td_library( srcs = [], includes = ["../include"], deps = [ - "//mlir:SMTTdFiles", "//mlir:OpBaseTdFiles", + "//mlir:SMTTdFiles", ], ) diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 0ffa8ed25303c..221c1c308062d 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -124,7 +124,6 @@ td_library( "//mlir:LinalgStructuredOpsTdFiles", "//mlir:MemorySlotInterfacesTdFiles", "//mlir:OpBaseTdFiles", - "//mlir:PolynomialTdFiles", "//mlir:PtrTdFiles", "//mlir:SideEffectInterfacesTdFiles", ], @@ -370,7 +369,6 @@ cc_library( "//mlir:LoopLikeInterface", "//mlir:MemorySlotInterfaces", "//mlir:Pass", - "//mlir:PolynomialDialect", "//mlir:PtrDialect", "//mlir:Reducer", "//mlir:SideEffectInterfaces", @@ -1087,6 +1085,7 @@ cc_library( "//mlir:NVGPUDialect", "//mlir:Pass", "//mlir:SCFDialect", + "//mlir:SCFTransforms", "//mlir:Support", "//mlir:TensorDialect", "//mlir:TransformUtils", From 1399ec4fdf8fe08000b590844f4e24c31a310a01 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 21 May 2025 16:20:57 -0700 Subject: [PATCH 2/2] Add test with variable count Created using spr 1.3.6-beta.1 --- .../CodeGenCXX/cxx2c-trivially-relocatable.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/clang/test/CodeGenCXX/cxx2c-trivially-relocatable.cpp b/clang/test/CodeGenCXX/cxx2c-trivially-relocatable.cpp index 63f3ba8e74ed5..465e539d363e8 100644 --- a/clang/test/CodeGenCXX/cxx2c-trivially-relocatable.cpp +++ b/clang/test/CodeGenCXX/cxx2c-trivially-relocatable.cpp @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -std=c++26 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s +typedef __SIZE_TYPE__ size_t; + struct S 
trivially_relocatable_if_eligible { S(const S&); ~S(); @@ -8,9 +10,13 @@ struct S trivially_relocatable_if_eligible { }; // CHECK: @_Z4testP1SS0_ -// CHECK: call void @llvm.memmove.p0.p0.i64({{.*}}, i64 8 -// CHECK-NOT: __builtin -// CHECK: ret -void test(S* source, S* dest) { +void test(S* source, S* dest, size_t count) { + // CHECK: call void @llvm.memmove.p0.p0.i64({{.*}}, i64 8 + // CHECK-NOT: __builtin __builtin_trivially_relocate(dest, source, 1); + // CHECK: [[A:%.*]] = load i64, ptr %count.addr + // CHECK: [[M:%.*]] = mul i64 [[A]], 8 + // CHECK: call void @llvm.memmove.p0.p0.i64({{.*}}, i64 [[M]] + __builtin_trivially_relocate(dest, source, count); + // CHECK: ret };